• Python_scrapy(知乎问答爬取)


    ***本文章为个人记录***

    目录

    一、模拟登录知乎

    二、提取知乎question页面url

    三、提取question页面具体数据

    四、提取answer页面具体数据

    五、items.py的编写

    六、pipelines的编写

    七、Mysql数据库存储结果


    一、模拟登录知乎

    (第一次运行程序)先模拟登录->保存cookie 

    (其次运行程序)->读取已保存的cookie

            模拟登录时没有做验证码处理,所以延时10秒,手动通过验证码完成登录。登录后将知乎账号的cookie数据保存起来,供下次爬取时直接使用(下次运行时不需要再进行模拟登录)。

    第一次运行程序:

    def start_requests(self):
        """Sign in to Zhihu through Selenium, save the cookies, and start crawling.

        Overrides the spider's ``start_requests``. A Chrome session is driven
        to the sign-in page, the account and password are typed in, and the
        method then sleeps 10 seconds so the captcha can be solved by hand.
        The browser cookies are pickled to disk for later runs and also
        attached to the first request.

        Returns:
            A one-element list holding the request for ``self.start_urls[0]``
            with the logged-in cookies.
        """
        from selenium.webdriver.chrome.options import Options

        url = "https://www.zhihu.com/signin?next=%2F"
        chrome_options = Options()
        chrome_options.add_argument("--disable-extensions")
        # debuggerAddress points the driver at a Chrome instance that is
        # expected to be running already with remote debugging on port 9222.
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        browser = webdriver.Chrome(
            executable_path='C:/Users/86135/MySpider/chromedriver.exe',
            chrome_options=chrome_options,
        )

        # Simulated login.
        browser.get(url)
        # NOTE(review): presumably switches the form to password login -- confirm
        # against the current page layout, the XPath is position-based.
        browser.find_element(
            By.XPATH,
            '//*[@id="root"]/div/main/div/div/div/div/div[1]/div/div[1]/form/div[1]/div[2]',
        ).click()
        browser.find_element(
            By.CSS_SELECTOR, ".SignFlow-account input[name='username']"
        ).send_keys("你的账号")
        browser.find_element(
            By.CSS_SELECTOR, ".SignFlow-password input[name='password']"
        ).send_keys("你的密码")
        time.sleep(1)
        # move()/click() are defined outside this snippet -- presumably an
        # OS-level mouse helper; verify where they are imported from.
        move(700, 500)
        click()
        browser.find_element(
            By.XPATH,
            '//*[@id="root"]/div/main/div/div/div/div/div[1]/div/div[1]/form/button',
        ).click()
        time.sleep(10)  # time window for passing the captcha manually

        # Persist the session cookies so later runs can skip the login.
        browser.get("https://www.zhihu.com/")
        cookies = browser.get_cookies()
        # Use a context manager so the cookie file is flushed and closed.
        with open("C:/Users/86135/MySpider/cookies/zhihu.cookie", "wb") as cookie_file:
            pickle.dump(cookies, cookie_file)

        cookie_dict = {cookie["name"]: cookie["value"] for cookie in cookies}
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

    其次运行程序:

    def start_requests(self):
        """Start crawling with the cookies saved by an earlier login run.

        Overrides the spider's ``start_requests``. No browser is started here:
        the pickled cookie list is read back from disk and attached to the
        first request.

        Returns:
            A one-element list holding the request for ``self.start_urls[0]``
            with the stored cookies.
        """
        # NOTE: pickle.load can execute code embedded in the file; only load a
        # cookie file that this project wrote itself.
        with open("C:/Users/86135/MySpider/cookies/zhihu.cookie", "rb") as cookie_file:
            cookies = pickle.load(cookie_file)

        cookie_dict = {cookie["name"]: cookie["value"] for cookie in cookies}
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

    保存的cookies路径 

    二、提取知乎question页面url

            如果提取到question相关的url则下载后交由parse_question函数进行提取

    def parse(self, response):
        """Collect every link on the page and keep following them.

        Links are made absolute against ``response.url`` and only those that
        start with ``https`` are followed. A link of the form
        ``.../question/<digits>`` is requested with ``parse_question`` as the
        callback; any other link is requested with this method again.

        Yields:
            ``scrapy.Request`` objects for the links found on the page.
        """
        hrefs = response.css("a::attr(href)").extract()
        # `parse` below is the module-level name (presumably urllib.parse),
        # not this method.
        absolute_urls = [parse.urljoin(response.url, href) for href in hrefs]

        for url in absolute_urls:
            if not url.startswith("https"):
                continue

            # Group 1 is the question URL up to the id, cut at '/' or end of string.
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # Question page: hand the canonical question URL to parse_question.
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                # Not a question page: keep crawling from it.
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    三、提取question页面具体数据

            通过items.py编写ZhihuQuestionItem(),定义item_loader对象加载想要提取的question页面各个具体数据,然后提交到pipelines进行数据保存。同时构造该问题answer接口(返回json数据)的请求url,交由parse_answer函数提取相关回答数据。

    # URL template for the first page of a question's answers.
    # Placeholders, in order: question id, offset, limit.
    start_answer_url = (
        'https://www.zhihu.com/api/v4/questions/{}/answers'
        '?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed'
        '%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by'
        '%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment'
        '%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time'
        '%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info'
        '%2Cpaid_info_content%2Creaction_instruction%2Crelationship.is_authorized%2Cis_author'
        '%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized'
        '%3Bdata%5B*%5D.mark_infos%5B*%5D.url'
        '%3Bdata%5B*%5D.author.follower_count%2Cvip_info%2Cbadge%5B*%5D.topics'
        '%3Bdata%5B*%5D.settings.table_of_content.enabled'
        '&offset={}&limit={}&sort_by=default&platform=desktop'
    )
    def parse_question(self, response):
        """Build a question item from a question page and request its answers.

        The question id is taken from ``response.url``. The item fields are
        loaded through an ``ItemLoader`` using CSS selectors on the page.

        Yields:
            First the request for the first page of answers (offset 0,
            20 per page) with ``parse_answer`` as callback, then the loaded
            question item.
        """
        # Group 2 of the pattern is the numeric question id.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if not match_obj:
            # Without a question id neither the item nor the answer request
            # can be built, so there is nothing to yield.
            return
        question_id = match_obj.group(2)

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionRichText")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        # Both counters are loaded from the same selector; the item picks them
        # apart by position.
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("click_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics", '.QuestionHeader-topics .Popover div::text')
        item_loader.add_value("crawl_time", datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))
        question_item = item_loader.load_item()

        # Answers: start at offset 0, 20 entries per page.
        yield scrapy.Request(
            self.start_answer_url.format(question_id, 0, 20),
            headers=self.headers,
            callback=self.parse_answer,
        )
        # Hand the question item over for storage.
        yield question_item

    四、提取answer页面具体数据

            解析由parse_question函数发起的请求所返回的json数据,提取出具体字段后提交到pipelines进行数据保存;若还有下一页,则继续请求下一页answer。

    def parse_answer(self, response):
        """Turn one page of the answers JSON into answer items.

        Reads the ``paging`` and ``data`` entries of the JSON body. One
        ``ZhihuAnswerItem`` is yielded per entry in ``data``; when
        ``paging.is_end`` is false, a request for ``paging.next`` is yielded
        with this method as callback.
        """
        payload = json.loads(response.text)
        paging = payload["paging"]
        is_end = paging["is_end"]
        next_url = paging["next"]

        # Copy the fields of each answer into an item.
        for answer in payload["data"]:
            # crawl_time is not populated here.
            yield ZhihuAnswerItem(
                zhihu_id=answer["id"],
                url=answer["url"],
                question_id=answer["question"]["id"],
                # author id and content may be absent; store None then
                author_id=answer["author"].get("id"),
                content=answer.get("content"),
                parise_num=answer["voteup_count"],
                comments_num=answer["comment_count"],
                create_time=answer["created_time"],
                update_time=answer["updated_time"],
            )

        if is_end:
            return
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

    五、items.py的编写

            编写question_item和answer_item类,并定义插入数据库函数,将Mysql插入语句及提取的数据params返回到pipelines进行数据库保存。

    class ZhihuQuestionItem(scrapy.Item):
        """Item holding the fields scraped from one Zhihu question page."""

        zhihu_id = scrapy.Field()
        topics = scrapy.Field()
        url = scrapy.Field()
        title = scrapy.Field()
        content = scrapy.Field()
        answer_num = scrapy.Field()
        comments_num = scrapy.Field()
        watch_user_num = scrapy.Field()
        click_num = scrapy.Field()
        crawl_time = scrapy.Field()

        def get_insert_sql(self):
            """Return the MySQL upsert statement and its parameters for this item.

            Field values are indexed and joined as sequences of strings.
            ``content`` and ``topics`` are treated as optional and fall back
            to an empty string when nothing was collected for them.

            Returns:
                A ``(insert_sql, params)`` tuple; ``params`` follows the column
                order of the statement.
            """
            insert_sql = """
                insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,
                comments_num,watch_user_num,crawl_time,click_num)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)ON DUPLICATE KEY UPDATE title=VALUES(title)
            """
            zhihu_id = self["zhihu_id"][0]
            # An unset field would raise KeyError on self[...]; default to empty.
            topics = ",".join(self.get("topics", []))
            url = self["url"][0]
            title = "".join(self["title"])
            content = "".join(self.get("content", []))
            # extract_num is defined outside this snippet.
            answer_num = extract_num("".join(self["answer_num"]))
            comments_num = extract_num("".join(self["comments_num"]))
            # The first collected value is the watcher count, the second the click count.
            watch_user_num = "".join(self["watch_user_num"][0])
            click_num = "".join(self["click_num"][1])
            crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

            params = (
                zhihu_id, topics, url, title, content, answer_num,
                comments_num, watch_user_num, crawl_time, click_num,
            )
            return insert_sql, params
    class ZhihuAnswerItem(scrapy.Item):
        """Item holding the fields of one Zhihu answer."""

        zhihu_id = scrapy.Field()
        url = scrapy.Field()
        question_id = scrapy.Field()
        author_id = scrapy.Field()
        content = scrapy.Field()
        parise_num = scrapy.Field()
        comments_num = scrapy.Field()
        create_time = scrapy.Field()
        update_time = scrapy.Field()

        def get_insert_sql(self):
            """Return the upsert statement for the zhihu_answer table and its parameters.

            ``create_time`` and ``update_time`` are converted with
            ``datetime.datetime.fromtimestamp`` before being passed on.

            Returns:
                A ``(insert_sql, params)`` tuple; ``params`` follows the column
                order of the statement.
            """
            insert_sql = """
                insert into zhihu_answer(zhihu_id,url,question_id,author_id,content,praise_num,
                comments_num,create_time,update_time)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s)ON DUPLICATE KEY UPDATE zhihu_id=VALUES(zhihu_id)
            """
            to_datetime = datetime.datetime.fromtimestamp
            created_at = to_datetime(self['create_time'])
            updated_at = to_datetime(self['update_time'])

            # The item field is spelled 'parise_num'; it feeds the praise_num column.
            plain_fields = (
                "zhihu_id", "url", "question_id",
                "author_id", "content", "parise_num",
                "comments_num",
            )
            params = tuple(self[name] for name in plain_fields) + (created_at, updated_at)
            return insert_sql, params

    六、pipelines的编写

            由items中的get_insert_sql函数返回的两个参数insert_sql、params,在Mysql异步入库的do_insert函数中取出这两个参数,执行SQL语句,将数据存储到数据库。

    class MysqlTwistedPipline(object):
        """Item pipeline that stores items in MySQL through an adbapi connection pool.

        Each item is expected to provide ``get_insert_sql()`` returning the
        statement and its parameters.
        """

        def __init__(self, dbpool):
            """Keep the connection pool used for all inserts."""
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            """Build the pipeline from the MYSQL_* connection values in ``settings``."""
            dbparms = dict(
                host=settings['MYSQL_HOST'],
                db=settings['MYSQL_DBNAME'],
                user=settings['MYSQL_USER'],
                passwd=settings['MYSQL_PASSWORD'],
                charset='utf8',
                cursorclass=DictCursor,
                use_unicode=True,
            )
            dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            """Schedule the insert for ``item`` on the pool and pass the item on.

            Returns:
                The same ``item``, so that any later pipeline stage still
                receives it.
            """
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error, item, spider)
            return item

        def handle_error(self, failure, item, spider):
            """Print the failure of an insert that was scheduled by process_item."""
            print(failure)

        def do_insert(self, cursor, item):
            """Execute the item's own insert statement on ``cursor``."""
            insert_sql, params = item.get_insert_sql()
            cursor.execute(insert_sql, params)

    七、Mysql数据库存储结果

    question表

     answer表

  • 相关阅读:
    matplotlib 设置手动设置图例的位置大小
    “2022数据智能夏令营”开营,大咖授课+丰富活动已就位
    win10怎么设置不睡眠熄屏?win10设置永不睡眠的方法
    阿里饶子昊:Spring Cloud Alibaba发展和近期规划
    网课题库接口
    bug记录——设置了feign的fallback,但是没有生效
    【Linux基础】工作中常用的linux命令,经常会被面试官问到
    linux服务器升级tomcat步骤
    使用Reflect封装Excel导出工具类
    基于Ray的分布式版本的决策树与随机森林
  • 原文地址:https://blog.csdn.net/m0_65592409/article/details/125562783