• Scraping computer books from a website


    Page URL:

    https://www.ptpress.com.cn/shopping/search?tag=search&orderstr=hot&leve11-75424c57-6dd7-4d1f-b6b9-8e95773c0593

    一、Steps required to scrape the data

    1. Open the page in a browser and select the "计算机" (Computer) category.

    2. A large number of computer-related books appear; right-click the page and choose "Inspect".

    3. Refresh the page, click through to the next page, and examine the URL of the request.

    4. Click "Response" and inspect the JSON; the information in it matches the book details we want to scrape.

    5. Scroll to the bottom of the page to find the total page count for computer books (570 pages).

    6. Check the format of the date field (the API returns publishDate as a %Y%m string).

    7. Scrape the relevant information from the book details; a minimal probe of the endpoint is sketched below.
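
    Before writing the full scraper, it is worth confirming that the endpoint found in DevTools really returns the expected JSON. Here is a minimal probe, assuming the bookTagId captured from the "计算机" category and the nested data paths used in the code below:

    import requests

    # Probe the endpoint discovered in DevTools with a single request.
    # Assumption: this bookTagId is the value captured for the computer category.
    url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    data = {
        'bookTagId': 'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
        'page': '1',
        'rows': '18',
        'orderStr': 'publish',
    }
    payload = requests.post(url, data=data).json()
    print(payload["data"]["data"][0])  # one book record, to check the field names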

    二、The code

    1. Print the scraped data and save it to an Excel spreadsheet
    import datetime
    from time import sleep

    import pandas as pd
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    }
    url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    book_info = []
    for page in range(1, 571):
        data = {
            'bookTagId': 'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
            'page': f'{page}',
            'rows': '18',
            'orderStr': 'publish',
        }
        response = requests.post(url, data=data, headers=headers)
        payload = response.json()
        # Each page holds up to 18 books; iterate over all of them
        # instead of reading only the first entry.
        for book in payload["data"]["data"]:
            book_info.append({
                "author": book["author"],
                "isbn": book["isbn"],
                "publish": datetime.datetime.strptime(book["publishDate"], "%Y%m"),
                "discountPrice": book["discountPrice"],
                "bookDiscount": book["bookDiscount"],
                "price": book["price"],
                "bookId": book["bookId"],
                "executiveEditor": book["executiveEditor"],
                "bookName": book["bookName"],
                "picPath": book["picPath"],
                "bookLink": "https://www.ptpress.com.cn/shopping/buy?bookId=" + book["bookId"],
            })
        print(f"Page {page} scraped successfully!")
        sleep(1)  # pause between requests to avoid hammering the server
    print(book_info)
    # Save the data to an Excel file
    df = pd.DataFrame(book_info)
    df.to_excel("book_info.xlsx", index=False)
    Scraping result:
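
    To sanity-check the export, the spreadsheet can be read back with pandas (pandas uses the openpyxl engine for .xlsx files, which must be installed):

    import pandas as pd

    # Read the exported workbook back and verify its shape and contents
    df = pd.read_excel("book_info.xlsx")
    print(df.shape)   # up to 570 pages x 18 books per page
    print(df.head())  # first few records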

    2. Print the scraped data and save it to a CSV file
    import datetime
    from time import sleep

    import pandas as pd
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    }
    url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    book_info = []
    for page in range(1, 571):
        data = {
            'bookTagId': 'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
            'page': f'{page}',
            'rows': '18',
            'orderStr': 'publish',
        }
        response = requests.post(url, data=data, headers=headers)
        payload = response.json()
        # Collect every book on the page, not just the first entry
        for book in payload["data"]["data"]:
            book_info.append({
                "author": book["author"],
                "isbn": book["isbn"],
                "publish": datetime.datetime.strptime(book["publishDate"], "%Y%m"),
                "discountPrice": book["discountPrice"],
                "bookDiscount": book["bookDiscount"],
                "price": book["price"],
                "bookId": book["bookId"],
                "executiveEditor": book["executiveEditor"],
                "bookName": book["bookName"],
                "picPath": book["picPath"],
                "bookLink": "https://www.ptpress.com.cn/shopping/buy?bookId=" + book["bookId"],
            })
        print(f"Page {page} scraped successfully!")
        sleep(1)  # pause between requests to avoid hammering the server
    print(book_info)
    # Save the data to a CSV file
    df = pd.DataFrame(book_info)
    df.to_csv("人民邮电计算机书本信息.csv", index=False)
    Scraping result:
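
    One practical note on the CSV variant: to_csv writes plain UTF-8 by default, so the Chinese text may look garbled if the file is opened directly in Excel on Windows. Re-saving with the utf-8-sig encoding (UTF-8 with a BOM) avoids that:

    import pandas as pd

    # Rewrite the CSV with a BOM so Excel detects the UTF-8 encoding
    df = pd.read_csv("人民邮电计算机书本信息.csv")
    df.to_csv("人民邮电计算机书本信息.csv", index=False, encoding="utf-8-sig")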

    3. Print the scraped data and save it to a MySQL database
    import datetime
    from time import sleep

    import pymysql
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.44',
    }
    url = 'https://www.ptpress.com.cn/bookinfo/getBookListForEBTag'
    book_info = []
    for page in range(1, 571):
        data = {
            'bookTagId': 'a15a734f-0ae9-41d7-9012-6ef9de2e71c8',
            'page': f'{page}',
            'rows': '18',
            'orderStr': 'publish',
        }
        response = requests.post(url, data=data, headers=headers)
        payload = response.json()
        # Collect every book on the page, not just the first entry
        for book in payload["data"]["data"]:
            book_info.append({
                "author": book["author"],
                "isbn": book["isbn"],
                "publish": datetime.datetime.strptime(book["publishDate"], "%Y%m"),
                "discountPrice": book["discountPrice"],
                "bookDiscount": book["bookDiscount"],
                "price": book["price"],
                "bookId": book["bookId"],
                "executiveEditor": book["executiveEditor"],
                "bookName": book["bookName"],
                "picPath": book["picPath"],
                "bookLink": "https://www.ptpress.com.cn/shopping/buy?bookId=" + book["bookId"],
            })
        print(f"Page {page} scraped successfully!")
        sleep(1)  # pause between requests to avoid hammering the server
    print(book_info)
    # Save the data to a MySQL database
    conn = pymysql.connect(host='localhost', user='root', password='your_password',
                           db='your_database', charset='utf8')
    cursor = conn.cursor()
    # Create the booklist table
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS booklist ('
        'author VARCHAR(255), isbn VARCHAR(255), publish DATE, '
        'discountPrice FLOAT, bookDiscount FLOAT, price FLOAT, '
        'bookId VARCHAR(255), executiveEditor VARCHAR(255), '
        'bookName VARCHAR(255), picPath VARCHAR(255), bookLink VARCHAR(255))'
    )
    # Insert the rows with a parameterized query so that quotes in
    # titles or author names cannot break (or inject into) the SQL
    sql = ('INSERT INTO booklist (author, isbn, publish, discountPrice, bookDiscount, '
           'price, bookId, executiveEditor, bookName, picPath, bookLink) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
    for book in book_info:
        cursor.execute(sql, (
            book['author'], book['isbn'], book['publish'].strftime('%Y-%m-%d'),
            book['discountPrice'], book['bookDiscount'], book['price'],
            book['bookId'], book['executiveEditor'], book['bookName'],
            book['picPath'], book['bookLink'],
        ))
    # Commit the transaction
    conn.commit()
    # Close the connection
    cursor.close()
    conn.close()
    Scraping result:
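
    After the inserts commit, a quick query confirms the rows landed (assuming the same placeholder connection parameters as above):

    import pymysql

    # Connect and count the inserted rows (placeholder credentials)
    conn = pymysql.connect(host='localhost', user='root', password='your_password',
                           db='your_database', charset='utf8')
    cursor = conn.cursor()
    cursor.execute('SELECT COUNT(*) FROM booklist')
    print(cursor.fetchone()[0], 'rows in booklist')  # expect up to 570 * 18
    cursor.execute('SELECT bookName, price FROM booklist LIMIT 3')
    for row in cursor.fetchall():
        print(row)
    cursor.close()
    conn.close()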

  • Original post: https://blog.csdn.net/m0_74972727/article/details/133892237