• 爬虫爬取百度图片、搜狗图片


    通过以下代码可以爬取两大图片网站(百度和搜狗)的图片,对于人工智能、深度学习中图片数据的搜集很有帮助!

    一、爬取百度图片

            该代码可以爬取任意百度图片中自定义的图片:

    1. import requests
    2. import re
    3. import time
    4. import os
    5. def saveImg(imgurlList, imgOs):
    6. for i in imgurlList:
    7. try:
    8. response = requests.get(url=i).content
    9. except:
    10. print("error!")
    11. else:
    12. imgName = i[28: 36]
    13. with open(imgOs + imgName + ".jpg", "wb") as file:
    14. file.write(response)
    15. print(i + " 下载完成!!")
    16. def get_asjson(page, gsm, word):
    17. url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&logid=9123806616981181340&ipn=rj&ct=201326592&is=&fp=result&fr=&word={word}&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={str(30 * int(page))}&rn=30&gsm={gsm}&{str(int(time.time() * 1000))}="
    18. headers = {
    19. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    20. 'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1637758492843_R&pv=&ic=&nc=1&z=&hd=&latest=©right=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDYsMiw0LDEsNSw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=hello',
    21. 'Cookie': 'BDqhfp=hello%26%26-10-1undefined%26%2628989%26%2635; BAIDUID=0C2336F5F3D356371C46DF079632E0C8:FG=1; BAIDUID_BFESS=0C2336F5F3D356371C46DF079632E0C8:FG=1; BIDUPSID=0C2336F5F3D356371C46DF079632E0C8; __yjs_duid=1_32693704d239fea9266064fc8a3d25631637737833661; PSTM=1637737880; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=null; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; delPer=0; PSINO=6; __yjs_st=2_ZGU4ODA5ZTdmNzczMzgxNzRiZWZhNTdkODVkY2E5MzQ3NzM3Nzc2MzZlNjYzZmRiMWVjOTlmNWQzZDA3NWY1MzM2M2NkNjNmMjMzZWVlYzQxNGQ2ODIzYjlkNTdhYTUyZjdhNWQwNjQxZWE1YTI0MWZiNzQ1NTE0N2NlNTgwNjZjODlkNWVlZWI2ZDBkNjUzNmNiZDE3NzUyYTA4ZjkxYjI1NzNhODBjOGZhZTBmMzZkY2IwOWJmNjMxNjEzNmUxYjQxZmZhM2M1ODUzYTFkNTM4NTE5MzZjZjRkODliMTE1MmRmMDY1MjI4OGJiM2I3ZGMzMDdiNjI4MWE3NDgxZV83XzQyODU3N2M0; H_PS_PSSID=35295_34446_35104_31254_35237_35049_34584_34505_35245_34578_34872_26350_35210_35145_22160; indexPageSugList=%5B%22hello%22%2C%22bello%22%2C%22hello%20%22%5D; cleanHistoryStatus=0; ab_sr=1.0.1_MTJmNTIwNGNlNmI5NDg2YmZiZTI1OTM1MGZhNTJhZTZlMzVmODE2NmEwZjg5MjNlZWZjZWY1YTY3ZjQ2Yzc2MWZiNGRlODY2ZDJjOGE3N2RhMzg2NjcxZjEzY2ZiMDQ4ODNjYzgyZTZlNWM2NGQ4YjlhMzBlMWE1ZjU0ZTY2NzAxYmM0ZGRkOTM0MGI3NzUwOWZjODY2ODE5NmU1N2E1Yw=='
    22. }
    23. response = requests.get(url=url, headers=headers).text + "1111"
    24. gsm = re.findall('"gsm":"(.*?)",', response)[0]
    25. data = re.findall('"hoverURL":"(.*?)",', response)
    26. return gsm, data
    27. if __name__ == "__main__":
    28. a = "1e"
    29. key_word = "阳台" # 修改你要爬取的关键字
    30. img = key_word + "_img\\"
    31. os.mkdir(img)
    32. for i in range(1, 2): #通过改变第二个数,修改要爬取的页数
    33. asjson1 = get_asjson(i, a, key_word)
    34. saveImg(asjson1[1], img)
    35. a = asjson1[0]
    36. while True:
    37. asjson2 = get_asjson(int(i) + 1, a, key_word)
    38. saveImg(asjson2[1], img)
    39. a = asjson2[0]
    40. break

    二、爬取搜狗图片

            该代码可以爬取任意搜狗图片中自定义的图片:

    1. from urllib.parse import quote
    2. import requests
    3. # 填入需要搜索的内容
    4. key_word = quote('阳台')
    5. # 通过定义page决定爬取多少页,每一页有48张图片
    6. page=50
    7. for page in range(1, page):
    8. startN=(page-1)*48
    9. url = 'https://pic.sogou.com/napi/pc/searchList?mode=1&start={}&xml_len=48&query={}'.format(startN,key_word)
    10. response = requests.get(url)
    11. json_data = response.json()
    12. allData = json_data['data']['items']
    13. img_urls=[]
    14. i = 0
    15. for data in allData:
    16. url = data['thumbUrl']
    17. img_urls.append(url)
    18. i=i+1
    19. for num in range(i):
    20. data=requests.get(img_urls[num],timeout=5).content
    21. # 此处需要修改保存路径
    22. with open('C:/Users/wbl/Desktop/AI/pc/L/'+'page'+str(page)+'-'+str(num)+'.jpg','wb')as file:
    23. file.write(data)
    24. print(num,'下载完成!!')

     其他参考:

    参考链接(百度百家号):https://baijiahao.baidu.com/s?id=1764344909652245807&wfr=spider&for=pc

    1. import requests
    2. import re
    3. def download_images(keyword, num):
    4. url =';word='+ keyword
    5. html = requests.get(url).text
    6. img_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    7. count = 0
    8. for img_url in img_urls:
    9. if count < num:
    10. print('Downloading:', img_url)
    11. try:
    12. response = requests.get(img_url, timeout=10)
    13. content = response.content
    14. file_name ='./images/'+ str(count)+'.jpg'
    15. with open(file_name,'wb') as f:
    16. f.write(content)
    17. count += 1
    18. except Exception as e:
    19. print('Error:',e)
    20. if __name__=='__main__':
    21. download_images('美食', 10)

  • 相关阅读:
    Java 入门练习(16 - 20)
    C++运算符重载+,*在QT中的实现演示
    第二章 进程与线程 十八、(生产者-消费者问题),(多生产者-多消费者问题),(抽烟者问题),(读者-写者问题),(哲学家就餐问题)
    Vue-3.1缓存组件keep-alive
    被开除的ChatGPT之父,又回来了?
    Serialiable接口和serialVersionUID的作用及其在redisson中的应用
    yolov7训练数据集详细流程bike-car-person
    计算属性的学习
    [附源码]Python计算机毕业设计SSM健身俱乐部管理系统(程序+LW)
    java计算机毕业设计医院门诊挂号系统源程序+mysql+系统+lw文档+远程调试
  • 原文地址:https://blog.csdn.net/m0_37870649/article/details/134657114