Straight to the code. This is the Python scraping script I use myself, with sensitive details (domains, paths) masked before posting:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Andy
@file: xxx.py
@time: 05:50 PM
@desc: scrape article data into the blog
"""
import os
import re
import urllib.request  # `import urllib` alone does not expose urllib.request
import urllib.error
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

def get_content():
    url = 'http://ask.xxxx.com/question/xxxx'  # source page URL
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'lxml')
    # div = soup.select('#aw-mod-body ueditor-p-reset')
    # The original regex literal was swallowed when the post was published;
    # this reconstruction captures the two groups (href and link text) that
    # the `url, name` unpacking below expects.
    pattern = re.compile(r'<a[^>]*?href="(.*?)"[^>]*?>(.*?)</a>', re.S)
    p = soup.find_all('a')
    for item in p:
        result = re.findall(pattern, str(item))
        if result:
            for i in result:
                url, name = i
                yield {
                    'url': url,
                    'name': name
                }

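# Usage sketch (my addition, not in the original): the generator can be
# consumed like this once the placeholder URL above points at a real page:
# for link in get_content():
#     print(link['url'], '->', link['name'])
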
def mkdir(path):
    # Strip leading/trailing whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    # Create the directory only if it does not already exist
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        # Directory already exists; report it and skip creation
        print(path + ' already exists')
        return False

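# Note (my addition): on Python 3.2+ the whole existence check above can be
# collapsed into a single call that tolerates pre-existing directories:
# os.makedirs(path, exist_ok=True)
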
def getUrl(html):
    # The original regex literals were stripped by the blog's HTML filter;
    # these reconstructions capture the src attributes of <script> and <img>
    # tags (CSS <link> tags are handled separately in getCssUrl).
    # patterncss = r'<link[^>]*?href="(.*?)"'
    patternjs = r'<script[^>]*?src="(.*?)"'
    patternimg = r'<img[^>]*?src="(.*?)"'
    # href = re.compile(patterncss, re.S).findall(html)
    href = re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    return href


def getCssUrl(html):
    # Reconstructed for the same reason: capture <link> href attributes
    patterncss = r'<link[^>]*?href="(.*?)"'
    href = re.compile(patterncss, re.S).findall(html)
    return href

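# Alternative (my addition): BeautifulSoup is already imported, so the same
# extraction works without regexes and is less brittle to markup changes:
# def get_asset_urls(html):
#     soup = BeautifulSoup(html, 'lxml')
#     urls = [t.get('src') for t in soup.find_all(['img', 'script']) if t.get('src')]
#     urls += [t.get('href') for t in soup.find_all('link') if t.get('href')]
#     return urls
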
# Download a page, fetch the assets it references, and save it as .html
def download_html(root_path, url):
    a = urlparse(url)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # The original bound this response to a variable named `re`, shadowing
    # the re module; renamed to resp to avoid the collision.
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"

    itemurl = getUrl(resp.text)
    for item1 in itemurl:
        download_commonimgjs(root_path, item1)

    itemcssurl = getCssUrl(resp.text)
    for item2 in itemcssurl:
        download_css(root_path, item2)

    # Rewrite absolute links so the saved copy points at the mirror domain
    new_text = resp.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)

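# Portability note (my addition): the manual '/' -> '\\' replacement ties the
# script to Windows; os.path.join builds the same local path on any platform:
# local_path = os.path.join(root_path, *file_path.lstrip('/').split('/'))
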
def download_commonimgjs(root_path, url):
    # Root-relative asset paths need the site prefix restored
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return  # already downloaded
    # Make sure the target directory exists
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error')


def download_img(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Make sure the target directory exists
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_js(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Make sure the target directory exists
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)

def download_css(root_path, url):
    if str(url[:1]) == r"/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    a = urlparse(imgurl)
    file_path = a.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if file_suffix != '.css':
        return  # getCssUrl matches every <link>; keep stylesheets only
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # Make sure the target directory exists
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error')

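# The four download_* functions above differ only in their suffix filter and
# error handling; a consolidated helper (my addition, not in the original
# script) could replace all of them:
def download_asset(root_path, url, suffix=None):
    # Restore the site prefix on root-relative paths
    asset_url = "https://www.xxxxxx.com" + url if url.startswith("/") else url
    file_path = urlparse(asset_url).path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    if suffix and file_suffix != suffix:
        return  # e.g. suffix='.css' keeps only stylesheets
    newmkpath = root_path + file_path.replace(file_name, '').replace('/', "\\")
    if os.path.isfile(newmkpath + file_name):
        return  # already downloaded
    mkdir(newmkpath)
    try:
        urllib.request.urlretrieve(asset_url, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('error downloading ' + asset_url)
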
def get_xml():
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Write the regex to match your own article URL format; it may differ
    # from mine. Note the original lazy pattern r'https://www.xxxxxx.com/\S*?'
    # could only ever match the bare prefix, so it is anchored on the
    # sitemap's <loc> tags here instead.
    r = re.compile(r'<loc>(https://www\.xxxxxx\.com/\S*?)</loc>')
    big = re.findall(r, res.text)
    for i in big:
        print(i)

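# Sketch (my addition): rather than printing, the sitemap URLs could drive a
# full mirror by feeding each one into download_html:
# def mirror_site(root_path, page_urls):
#     for page_url in page_urls:  # e.g. the list matched in get_xml
#         download_html(root_path, page_url)
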
def main():
    # get_content()
    # url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # Root directory the mirror is written into
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    # download_img(root_path, url)

    # htmlurl = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    # download_html(root_path, htmlurl)

    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # download_css(root_path, cssurl)

    # demourl = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    # demo(demourl)

    get_xml()

def demo(url):
    a = urlparse(url)
    file_path = a.path
    print(a.scheme)
    print(a.hostname)
    print('a.file_path=' + file_path)
    file_name = os.path.basename(file_path)
    print('file_name=' + file_name)
    _, file_suffix = os.path.splitext(file_name)
    print('a.file_suffix=' + file_suffix)


if __name__ == '__main__':
    main()
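
To try a single step, uncomment the matching call in main() and run the script directly; every xxxxxx.com placeholder has to be swapped for the real domain first. Mirroring one page end to end, for example, comes down to (same placeholders, my sketch):

    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    download_html(root_path, r'https://www.xxxxxx.com/3d-clear-led-dmx-ball')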
Source: http://www.shanhubei.com/archives/2491.html