• Python 爬取知乎搜索结果并保存为 JSON 文件


    1. import time
    2. import json
    3. from selenium import webdriver
    4. from selenium.webdriver.chrome.options import Options
    5. from selenium.webdriver.common.by import By
    6. from pywinauto.application import Application
    7. from selenium.webdriver.common.action_chains import ActionChains
    8. from selenium.webdriver.common.keys import Keys
    9. class ZhihuCrawler:
    10. def __init__(self):
    11. self.updated_data = [] # 存储更新的数据
    12. self.title_list = [] # 存储已处理的文章标题,避免重复处理
    13. self.link_list = [] # 存储文章链接
    14. self.file_path = 'link_list.txt'
    15. self.driver = None
    16. def run_chrome(self):
    17. """
    18. 启动 Chrome 浏览器并通过命令行启用远程调试端口。
    19. """
    20. app = Application().start(
    21. r'c:\WINDOWS\System32\cmd.exe /c cd C:\\Program Files\\Google\\Chrome\\Application && start chrome.exe --remote-debugging-port=9999',
    22. create_new_console=True, wait_for_idle=False)
    23. window = app.top_window()
    24. window.wait('ready')
    25. window.close()
    26. def get_user_browser(self):
    27. """
    28. 从已启动的 Chrome 实例中获取 WebDriver 对象。
    29. """
    30. options = Options()
    31. options.add_experimental_option("debuggerAddress", "127.0.0.1:9999")
    32. driver = webdriver.Chrome(options=options)
    33. driver.get('https://www.zhihu.com/')
    34. driver.implicitly_wait(5)
    35. time.sleep(2)
    36. return driver
    37. def read_link_list(self):
    38. """
    39. 从文件中读取链接列表,返回一个包含链接的列表。
    40. """
    41. links = []
    42. with open(self.file_path, 'r', encoding='utf-8') as file:
    43. for line in file.readlines():
    44. link = line.strip() # 去除行尾的换行符和空白
    45. links.append(link)
    46. return links
    47. def crawl_zhihu(self, search_list, link_list_path):
    48. # 启动 Chrome 并设置远程调试端口
    49. self.run_chrome()
    50. # 初始化 WebDriver
    51. self.driver = self.get_user_browser()
    52. # 初始化链接列表和文件路径
    53. self.link_list = []
    54. self.file_path = link_list_path
    55. # 从搜索关键词列表中遍历搜索关键词
    56. for search in search_list:
    57. print(search)
    58. try:
    59. # 构建搜索链接
    60. link = f"https://www.zhihu.com/search?q={search}&type=content"
    61. self.driver.get(link)
    62. self.driver.implicitly_wait(2)
    63. time.sleep(2)
    64. # 在搜索结果页面查找文章链接
    65. for _ in range(5):
    66. print(_)
    67. videourl = self.driver.find_elements(By.CSS_SELECTOR, "h2[class='ContentItem-title']")
    68. # 遍历每个搜索结果
    69. for v in videourl:
    70. urlv = v.find_element(By.TAG_NAME, "a").get_attribute('href')
    71. # 检查链接是否已经存在
    72. if urlv not in self.link_list:
    73. self.link_list.append(urlv)
    74. print(urlv)
    75. # 检查链接是否已经存在于文件中
    76. existing_urls = set()
    77. try:
    78. with open(self.file_path, 'r', encoding='utf-8') as f:
    79. existing_urls = set(line.strip() for line in f)
    80. # existing_urls = set() # 创建一个空集合
    81. # for line in f: # 遍历文件的每一行
    82. # stripped_line = line.strip() # 去除首尾空白字符
    83. # existing_urls.add(stripped_line) # 将处理后的字符串添加到集合中
    84. except FileNotFoundError:
    85. pass
    86. # 如果链接不在文件中,则添加到文件中
    87. if urlv not in existing_urls:
    88. with open(self.file_path, 'a', encoding='utf-8') as f:
    89. f.write(f'{urlv}\n')
    90. print(f'Successfully added {urlv} to the file.')
    91. else:
    92. print(f'The URL {urlv} already exists in the file.')
    93. # 模拟滚动浏览器窗口
    94. self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    95. time.sleep(1)
    96. except Exception as e:
    97. print(f"Error: {e}")
    98. print(self.link_list)
    99. # 从文件中读取链接列表
    100. links = self.read_link_list()
    101. # 遍历链接列表
    102. for url in links:
    103. print(url)
    104. try:
    105. # 访问网页
    106. self.driver.get(url)
    107. # 等待页面加载
    108. self.driver.implicitly_wait(1)
    109. # 模拟键盘操作,关闭可能出现的弹窗
    110. actions = ActionChains(self.driver)
    111. actions.send_keys(Keys.ESCAPE).perform()
    112. # 初始化标题和内容
    113. title = ""
    114. content = ""
    115. # 根据不同类型的知乎页面抓取信息
    116. if "zhuanlan.zhihu.com/p" in url:
    117. title = self.driver.find_element(By.XPATH, '//*[@id="root"]/div/main/div/article/header/h1').text
    118. content = self.driver.find_element(By.XPATH,
    119. '//*[@id="root"]/div/main/div/article/div[1]/div/div/div').text
    120. elif "www.zhihu.com/question" in url:
    121. title = self.driver.find_element(By.XPATH,
    122. '//*[@id="root"]/div/main/div/div/div[1]/div[2]/div/div[1]/div[1]/h1').text
    123. content = self.driver.find_element(By.XPATH,
    124. '//*[@id="root"]/div/main/div/div/div[3]/div[1]/div/div[2]/div/div/div/div[2]/span[1]/div/div/span').text
    125. # 检查内容长度
    126. if len(content.encode('utf-8')) < 50:
    127. print(f"Content too short, skipping: {url}")
    128. continue
    129. # 检查标题是否已经处理过,且不包含特定关键词
    130. if url not in self.title_list and "皮肤" not in title:
    131. if "无效" in title:
    132. print(content)
    133. self.title_list.append(url)
    134. # 创建字典存储数据
    135. entry = {
    136. 'url': url,
    137. 'title': title,
    138. 'content': content,
    139. 'time': int(time.time())
    140. }
    141. # 将数据添加到列表中
    142. print(entry)
    143. self.updated_data.append(entry)
    144. except SpecificException as e:
    145. # 处理特定异常类型
    146. print(f"Specific Exception: {e}")
    147. except AnotherSpecificException as e:
    148. # 处理另一种特定异常类型
    149. print(f"Another Specific Exception: {e}")
    150. except Exception as e:
    151. print(f"Error: {e}")
    152. # 将数据写入 JSON 文件
    153. with open('test.json', 'w', encoding='utf-8') as updated_file:
    154. json.dump(self.updated_data, updated_file, ensure_ascii=False, indent=4)
    155. if __name__ == '__main__':
    156. crawler = ZhihuCrawler()
    157. crawler.crawl_zhihu(['关键词1', '关键词2'], 'todo.txt')

  • 相关阅读:
    计算机导论真题(二)
    如何开通并快速入门腾讯云对象存储COS服务?
    什么是自动化测试?
    windows工具:推荐一款可以截长图(滚动截图)的工具FSCapture
    谈谈Spring Cloud OpenFeign远程调用性能优化
    BUUCTF学习(8): 随便注,SQL
    Android—ATMS启动
    pytorch笔记:自动混合精度(AMP)
    小程序云开发笔记三
    使用QEMU调试ARM64 Linux内核v6.0.9
  • 原文地址:https://blog.csdn.net/weixin_44740756/article/details/134556670