import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from pywinauto.application import Application
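
# Crawl Zhihu search results for a list of keywords: collect article links
# into a text file, then scrape each article's title and content into
# test.json. Assumes Chrome is installed at the Windows path used below,
# with pywinauto and selenium available.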


class ZhihuCrawler:
    def __init__(self):
        self.updated_data = []  # Scraped entries, dumped to JSON at the end
        self.title_list = []    # URLs already processed, to avoid duplicates
        self.link_list = []     # Article links collected from search results
        self.file_path = 'link_list.txt'
        self.driver = None

    def run_chrome(self):
        """
        Launch Chrome from the command line with a remote-debugging port enabled.
        """
        app = Application().start(
            r'c:\WINDOWS\System32\cmd.exe /c cd /d "C:\Program Files\Google\Chrome\Application" && start chrome.exe --remote-debugging-port=9999',
            create_new_console=True, wait_for_idle=False)
        window = app.top_window()
        window.wait('ready')
        window.close()
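
    # Attaching via the DevTools "debuggerAddress" option below reuses the
    # Chrome instance (and its cookies / login state) started above, so the
    # crawler can browse Zhihu as a logged-in user without scripting a login.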
    def get_user_browser(self):
        """
        Attach a WebDriver to the already-running Chrome instance.
        """
        options = Options()
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9999")
        driver = webdriver.Chrome(options=options)
        driver.get('https://www.zhihu.com/')
        driver.implicitly_wait(5)
        time.sleep(2)
        return driver

    def read_link_list(self):
        """
        Read the link list from file and return it as a list of URLs.
        """
        links = []
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                link = line.strip()  # Drop the trailing newline and whitespace
                links.append(link)
        return links

    def crawl_zhihu(self, search_list, link_list_path):
        """
        For each search keyword, collect article links from Zhihu search
        results into link_list_path, then visit every collected link and
        scrape its title and content into test.json.
        """
        # Start Chrome with the remote-debugging port open
        self.run_chrome()

        # Attach the WebDriver to it
        self.driver = self.get_user_browser()

        # Reset the link list and point at the output file
        self.link_list = []
        self.file_path = link_list_path

        # Iterate over the search keywords
        for search in search_list:
            print(search)
            try:
                # Build the search URL
                link = f"https://www.zhihu.com/search?q={search}&type=content"
                self.driver.get(link)
                self.driver.implicitly_wait(2)
                time.sleep(2)
                # Collect article links from the result page, scrolling to load more
                for i in range(5):
                    print(i)
                    result_titles = self.driver.find_elements(By.CSS_SELECTOR, "h2[class='ContentItem-title']")

                    # Walk through each search result
                    for v in result_titles:
                        urlv = v.find_element(By.TAG_NAME, "a").get_attribute('href')

                        # Skip links already collected during this run
                        if urlv not in self.link_list:
                            self.link_list.append(urlv)
                            print(urlv)

                            # Check whether the link is already recorded in the file
                            # (note: the file is re-read for every new link; loading
                            # it once per search would save I/O on long lists)
                            existing_urls = set()
                            try:
                                with open(self.file_path, 'r', encoding='utf-8') as f:
                                    existing_urls = set(line.strip() for line in f)
                            except FileNotFoundError:
                                pass

                            # Append the link to the file if it is not there yet
                            if urlv not in existing_urls:
                                with open(self.file_path, 'a', encoding='utf-8') as f:
                                    f.write(f'{urlv}\n')
                                print(f'Successfully added {urlv} to the file.')
                            else:
                                print(f'The URL {urlv} already exists in the file.')

                    # Scroll to the bottom to trigger lazy loading
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(1)

            except Exception as e:
                print(f"Error: {e}")

        print(self.link_list)
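
        # Phase 2: read the accumulated links back from disk; because the
        # file is appended to rather than overwritten, links recorded by
        # earlier runs are scraped as well.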

        links = self.read_link_list()

        # Visit each collected link
        for url in links:
            print(url)

            try:
                # Load the page
                self.driver.get(url)
                # Wait for it to render
                self.driver.implicitly_wait(1)

                # Press ESC to dismiss the login pop-up that may appear
                actions = ActionChains(self.driver)
                actions.send_keys(Keys.ESCAPE).perform()

                # Initialise title and content
                title = ""
                content = ""
                # Pick XPaths depending on the kind of Zhihu page
                if "zhuanlan.zhihu.com/p" in url:
                    # Column (zhuanlan) article: header title and article body
                    title = self.driver.find_element(By.XPATH, '//*[@id="root"]/div/main/div/article/header/h1').text
                    content = self.driver.find_element(By.XPATH,
                                                       '//*[@id="root"]/div/main/div/article/div[1]/div/div/div').text

                elif "www.zhihu.com/question" in url:
                    # Question page: question title and one answer body
                    title = self.driver.find_element(By.XPATH,
                                                     '//*[@id="root"]/div/main/div/div/div[1]/div[2]/div/div[1]/div[1]/h1').text
                    content = self.driver.find_element(By.XPATH,
                                                       '//*[@id="root"]/div/main/div/div/div[3]/div[1]/div/div[2]/div/div/div/div[2]/span[1]/div/div/span').text

                # Skip pages whose content is shorter than 50 bytes
                if len(content.encode('utf-8')) < 50:
                    print(f"Content too short, skipping: {url}")
                    continue

                # Process each URL once; skip titles containing "皮肤" ("skin")
                if url not in self.title_list and "皮肤" not in title:
                    # Print the content of titles containing "无效" ("invalid") for inspection
                    if "无效" in title:
                        print(content)
                    self.title_list.append(url)
                    # Build the record for this page
                    entry = {
                        'url': url,
                        'title': title,
                        'content': content,
                        'time': int(time.time())
                    }
                    # Queue it for the JSON export
                    print(entry)
                    self.updated_data.append(entry)

            except NoSuchElementException as e:
                # The expected title/content node is missing on this page layout
                print(f"Element not found: {e}")

            except TimeoutException as e:
                # The page did not load in time
                print(f"Timeout: {e}")

            except Exception as e:
                print(f"Error: {e}")

        # Dump the collected data to a JSON file
        with open('test.json', 'w', encoding='utf-8') as updated_file:
            json.dump(self.updated_data, updated_file, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    crawler = ZhihuCrawler()
    crawler.crawl_zhihu(['keyword1', 'keyword2'], 'todo.txt')
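
# Each entry in test.json has the shape assembled in crawl_zhihu, e.g.
# (values hypothetical):
# {
#     "url": "https://zhuanlan.zhihu.com/p/...",
#     "title": "...",
#     "content": "...",
#     "time": 1700000000
# }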