I. Approach:
1. Analyze the URL: the image URLs are embedded in the HTML that base_url returns (see the sketch below).
2. Download the images.
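
Step 1 in isolation, as a minimal sketch: fetch the listing page and pull the embedded image paths out with an XPath query. The placeholder URL and the `div.slist` structure are the same ones used by the full script below.

```python
# Minimal sketch of step 1: the listing page embeds relative image
# paths, which this XPath extracts (placeholder URL as in the script).
import requests
from lxml import etree

html = requests.get('http://xxxxxx/4kmeinv/').text
tree = etree.HTML(html)
srcs = tree.xpath('//div[@class="slist"]/ul/li/a/img/@src')
print(srcs)  # relative paths; prepend base_url to download them
```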
II. Code
```python
import os
import time

import requests
from lxml import etree


class DownloadImg:
    """Crawler that downloads gallery images."""

    def __init__(self):
        self.url = 'http://xxxxxx/4kmeinv/'
        self.base_url = 'xxxxxxxxxx'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }

    # Create the directory that stores the downloaded images
    def create_img_dir(self):
        current_dir = os.path.dirname(__file__)
        img_dir = os.path.join(current_dir, 'img')
        if not os.path.exists(img_dir):
            os.makedirs(img_dir)
        return img_dir

    # Download every image found on the listing pages
    def download_img(self, url_list):
        img_dir = self.create_img_dir()
        for url in url_list:
            res = requests.get(url=url, headers=self.headers).text
            tree = etree.HTML(res)
            # Parse the image URLs embedded in the listing page
            for li in tree.xpath('//div[@class="slist"]/ul/li'):
                img_url = li.xpath('./a/img/@src')[0]
                full_img_url = self.base_url + img_url
                print(full_img_url)
                img_name = full_img_url.split('/')[-1]
                full_img_name = os.path.join(img_dir, img_name)
                # Fetch the image and write it to disk
                with open(full_img_name, 'wb') as fs:
                    content = requests.get(url=full_img_url, headers=self.headers).content
                    fs.write(content)
                print("{} downloaded".format(img_name))
                time.sleep(1)

    # Build the listing-page URLs, one per page
    def get_img_url(self, page):
        url_list = [self.url]
        # Page 1 is the bare listing URL; pages 2 and up follow the
        # pattern https://xxxxxxx/index_N.html
        for i in range(2, page + 1):
            url_list.append(self.url + "index_{}.html".format(i))
        return url_list


if __name__ == '__main__':
    # Number of pages to download: 2
    page = 2
    down_img = DownloadImg()
    url_list = down_img.get_img_url(page)
    print(url_list)
    down_img.download_img(url_list)
    print("All images downloaded, program exiting")
```
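
As a quick check of the pagination logic, `get_img_url(3)` returns `['http://xxxxxx/4kmeinv/', 'http://xxxxxx/4kmeinv/index_2.html', 'http://xxxxxx/4kmeinv/index_3.html']`.

The script makes a fresh connection per request and aborts on the first network error. One optional hardening, sketched below and not part of the original script: reuse a single `requests.Session` and skip images whose download fails instead of crashing.

```python
# Optional hardening sketch (an assumption, not in the original script):
# reuse one HTTP session and skip failed downloads instead of aborting.
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch_image(url, timeout=10):
    """Return the image bytes, or None when the request fails."""
    try:
        resp = session.get(url, timeout=timeout)
        resp.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        return resp.content
    except requests.RequestException as exc:
        print("skipping {}: {}".format(url, exc))
        return None
```

A session keeps the TCP connection alive across the roughly twenty downloads per page, and the `timeout` keeps one stalled image from hanging the whole run.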