- 设置 timeout=(20, 20)（连接超时和读取超时各 20 秒）和 verify=False（跳过 SSL 证书校验），避免请求长时间挂起以及证书校验失败的问题
- jpeg以jpg格式保存
- 计算图片内容的 md5 并用作文件名，以避免重复保存同一张图片
import pandas as pd
import requests
import os
import hashlib
from tqdm import tqdm
# HTTP headers sent with every image request (a desktop-browser User-Agent).
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

# Input spreadsheet and the directory the downloaded images are written to.
file_path = 'xiaofang.xlsx'
save_dir = 'xiaofang'

# Load the spreadsheet; its third column (positional index 2) is iterated
# as the list of URLs handed to save_image().
df = pd.read_excel(file_path)
url_lists = df.iloc[:, 2]
def save_image(file_url):
    """Download the image at ``file_url`` and store it inside ``save_dir``.

    The file is named ``<md5 of the downloaded bytes>.<ext>``, so identical
    image content always maps to the same file name. ``jpeg`` responses are
    saved with the ``jpg`` extension.

    Args:
        file_url: URL of the image to download.

    Raises:
        Exception: If the response status code is not 200, if the response
            Content-Type is not a JPEG or PNG type, or if a file with the
            same content digest already exists in ``save_dir``.

    Any exception raised by ``requests.get`` itself propagates unchanged.
    """
    r = requests.get(file_url, headers=headers, timeout=(20, 20), verify=False)
    if r.status_code != 200:
        raise Exception("{}的状态码为{}".format(file_url, str(r.status_code)))

    # Content-Type may be absent, may carry parameters (for example
    # "image/jpeg; charset=UTF-8") and is case-insensitive, so keep only the
    # normalized media type before extracting the subtype.
    media_type = r.headers.get('Content-Type', '').split(';')[0].strip().lower()
    ext = media_type.split('/')[-1]
    if ext == 'jpeg':
        ext = 'jpg'
    if ext not in ('jpg', 'png'):
        raise Exception("{}未包含指定格式的图片".format(file_url))

    # Only the raw bytes are used, so no text-encoding detection is needed.
    content = r.content
    file_name = hashlib.md5(content).hexdigest() + '.' + ext

    # Make sure the output directory exists before checking or writing.
    os.makedirs(save_dir, exist_ok=True)
    target_path = os.path.join(save_dir, file_name)
    if os.path.exists(target_path):
        raise Exception("{}图片重复".format(file_url))
    with open(target_path, "wb") as f:
        f.write(content)
def main():
    """Attempt to download every URL in ``url_lists``, reporting failures."""
    for url in tqdm(url_lists):
        try:
            save_image(url)
        except Exception as e:
            # Best-effort batch: one failed URL must not stop the run.
            # tqdm.write emits the message without breaking the progress bar,
            # which a plain print() inside the loop would do.
            tqdm.write(str(e))


if __name__ == '__main__':
    main()

- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38