• Python 爬取单个网页所需要加载的URL地址和CSS、JS文件地址


    直接上代码:

    以下是脱敏后自用的 Python 采集代码。

    1. #!/usr/bin/env python
    2. # -*- coding:utf-8 -*-
    3. """
    4. @author:Andy
    5. @file:xxx.py
    6. @time:下午05:50
    7. @desc:采集的文章数据进博客
    8. """
    9. import os
    10. import re
    11. import time
    12. import requests
    13. from bs4 import BeautifulSoup, SoupStrainer
    14. from requests.exceptions import RequestException
    15. from hashlib import md5
    16. from urllib.parse import urlparse
    17. import urllib
    18. headers = {
    19. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    20. }
    21. def get_content():
    22. url = 'http://ask.xxxx.com/question/xxxx' # url
    23. response = requests.get(url, headers=headers).text.replace('', '')
    24. soup = BeautifulSoup(response, 'lxml')
    25. # div = soup.select('#aw-mod-body ueditor-p-reset')
    26. pattern = re.compile('(.*?)', re.S)
    27. p = soup.find_all('a')
    28. for item in p:
    29. # print(str(item))
    30. result = re.findall(pattern, str(item))
    31. if result:
    32. # print(result)
    33. for i in result:
    34. url, name = i
    35. # print(i)
    36. yield {
    37. 'url': url,
    38. 'name': name
    39. }
    40. def mkdir(path):
    41. # 去除首位空格
    42. path=path.strip()
    43. # 去除尾部 \ 符号
    44. path=path.rstrip("\\")
    45. # 判断路径是否存在
    46. # 存在 True
    47. # 不存在 False
    48. isExists=os.path.exists(path)
    49. # 判断结果
    50. if not isExists:
    51. # 如果不存在则创建目录
    52. # 创建目录操作函数
    53. os.makedirs(path)
    54. print(path+' 创建成功')
    55. return True
    56. else:
    57. # 如果目录存在则不创建,并提示目录已存在
    58. print(path+' 目录已存在')
    59. return False
    60. def getUrl(html):
    61. #patterncss = '
    62. patternjs = '