Python抓取指定产品图片及简介代码实现
import requests
from bs4 import BeautifulSoup
import re
import lxml
import os
def download_image(url, image_name=None, timeout=30):
    """Download the image at *url* and save it to disk.

    Args:
        url: Absolute URL of the image to fetch.
        image_name: Path of the file to write. When ``None``, a random
            ``<uuid4>.jpg`` name in the current directory is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` when the image was written, ``False`` when the request
        failed or the server answered with a status other than 200.
    """
    import uuid

    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"请求失败,HTTP状态码为:{response.status_code}")
                return False
            if image_name is None:
                image_name = str(uuid.uuid4()) + '.jpg'
            with open(image_name, 'wb') as f:
                # Write in chunks so large images are never held fully in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except requests.RequestException as exc:
        # Report network problems the same way as HTTP errors instead of
        # aborting the caller's whole download run.
        print(f"请求失败:{exc}")
        return False

    print(f"图片已保存为:{image_name}")
    return True
# CSS selector for the gallery thumbnails. The section id embedded in it is
# hard-coded, so pages whose thumbnail list uses another id match nothing.
GALLERY_SELECTOR = 'ul[id="Slider-Thumbnails-template--14450556207192__main"] li img'
# Exact class attribute of the <div> that holds the product description.
DESCRIPTION_CLASS = 'product__description rte quick-add-hidden'


def _absolute_url(page_url, src):
    """Return *src* as an absolute URL, resolved against *page_url*."""
    from urllib.parse import urljoin

    if src.startswith('//'):
        # Protocol-relative URL: force https, as the gallery code always did.
        return 'https:' + src
    # Already-absolute URLs pass through unchanged; relative ones are resolved.
    return urljoin(page_url, src)


def _safe_dirname(title):
    """Turn a product title into a name that can be used as a directory."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title.strip())


def scrape_product(url, timeout=30):
    """Save the description text and images of the product page at *url*.

    A directory named after the page's first ``<h1>`` is created in the
    current working directory. The description text is appended to
    ``content.txt`` inside it, description images are saved as
    ``con_pic<N>.jpg`` and gallery images as ``pic<N>.jpg``.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the page request.

    Returns:
        ``True`` when the page was processed, ``False`` when it has no
        usable ``<h1>`` title to name the output directory after.

    Raises:
        requests.RequestException: The page could not be fetched or the
            server answered with an error status.
    """
    html = requests.get(url, timeout=timeout)
    html.raise_for_status()
    html.encoding = "utf-8"
    soup = BeautifulSoup(html.text, 'lxml')

    heading = soup.find('h1')
    # get_text() also copes with an <h1> that wraps nested tags, where
    # .string would be None.
    mulu = _safe_dirname(heading.get_text()) if heading is not None else ''
    if not mulu:
        print('未找到产品标题,已跳过该页面')
        return False
    os.makedirs(mulu, exist_ok=True)
    print(mulu+'产品文档建立完成')

    description = soup.find('div', attrs={'class': DESCRIPTION_CLASS})
    if description is None:
        print('未找到产品简介')
    else:
        # Append mode: scraping the same product again adds another section.
        with open(os.path.join(mulu, 'content.txt'), "a", encoding='utf-8') as f:
            f.write('\n'+'==========pro detail======='+'\n')
            f.write(description.get_text())

    # Images embedded in the description keep their query string.
    con_pic = soup.select(f'div[class="{DESCRIPTION_CLASS}"] img')
    for num, p in enumerate(con_pic, start=1):
        src = p.get('src')
        if not src:
            continue
        savename1 = os.path.join(mulu, 'con_pic'+str(num)+'.jpg')
        download_image(_absolute_url(url, src), savename1)

    # Gallery thumbnails are fetched with the query string removed.
    for num, img in enumerate(soup.select(GALLERY_SELECTOR), start=1):
        src = img.get('src')
        if not src:
            continue
        imgurl = _absolute_url(url, src.split('?')[0])
        savename = os.path.join(mulu, 'pic'+str(num)+'.jpg')
        download_image(imgurl, savename)
    return True


# Interactive loop: keep asking for product URLs until a blank line or
# end-of-input is received.
while True:
    try:
        url = input("请输入url:").strip()
    except EOFError:
        break
    if not url:
        break
    try:
        finished = scrape_product(url)
    except requests.RequestException as exc:
        # One bad URL should not end the whole session.
        print(f"请求失败:{exc}")
        continue
    if finished:
        print('采集完成')
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81