• Web scraping: fetching the monthly WASDE supply and demand reports (a worked example)


    Note: this example needs a proxy to reach the site.

    The monthly World Agricultural Supply and Demand Estimates (WASDE) published by the USDA are indispensable material for studying agricultural commodities. This chapter is a complete worked example that scrapes all of the WASDE reports released over the years. The USDA site does offer a historical database for querying and downloading, but a comparison shows that some categories present in the WASDE reports are missing from the database downloads, so to stay closer to the research needs we fetch every WASDE report directly.

    Contents

    Preliminary notes

    Step 1: fetch the catalog pages

    Step 2: collect the xls download links

    Step 3: download the xls files

    Step 4: collect the txt download links

    Step 5: download the txt files

    Step 6: collect the pdf download links

    Step 7: download the pdf files

    Step 8: convert the pdf files to txt


    Preliminary notes

    1. WASDE reports are offered as xls, txt, xml and pdf downloads, but not every release provides all four formats: the earliest releases only come as pdf, and some releases offer only a subset of the four. In this example we prefer xls, then txt, and fall back to pdf last (a small sketch of this priority rule follows this list).
    2. After obtaining an xls, txt or pdf file, the figures you need still have to be extracted from it. Extraction from xls and txt is straightforward; pdf is not: the pdfs obtained from the site have to be run through OCR and converted to txt, which is fairly laborious.
    3. Going from scraping to the final data takes quite a few steps and is somewhat involved, so be patient.
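
    The priority rule in point 1 can be captured in a tiny helper. This is only an illustrative sketch (the pick_preferred_format name and the sample label strings are made up for the example, they do not appear in the scripts below); it works off the data-label strings attached to a release's download links:

    def pick_preferred_format(data_labels):
        # Prefer xls, then txt, then pdf, mirroring the order used in Steps 2, 4 and 6
        for fmt in ('xls', 'txt', 'pdf'):
            if any(fmt in label for label in data_labels):
                return fmt
        return None

    # A release that offers only txt and pdf resolves to txt
    print(pick_preferred_format(['wasde-0894.txt', 'wasde-0894.pdf']))  # -> txt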

    Step 1: fetch the catalog pages

    Download the HTML of the catalog pages:

    Publication | World Agricultural Supply and Demand Estimates | ID: 3t945q76s | USDA Economics, Statistics and Market Information System

    The catalog spans 66 pages in total; change the page number in the URL and download the HTML of all 66 catalog pages.

    import os, json, requests, urllib3
    from selenium import webdriver
    from time import sleep
    from bs4 import BeautifulSoup
    from tools import mongodb_utils, date_tools, string_tools
    import xlrd

    def wasde_step_one():
        pre_dir = r'D:/temp006/'
        driver = webdriver.Chrome('../driver/chromedriver.exe')
        # Set the page-load timeout to 10 s
        driver.set_page_load_timeout(10)
        # The catalog has 66 pages; save each page's HTML to disk
        for i in range(1, 67):
            print(i)
            url_str = f"https://usda.library.cornell.edu/concern/publications/3t945q76s?locale=en&page={i}#release-items"
            driver.get(url_str)
            page_source = driver.page_source
            file_path = pre_dir + str(i) + '.html'
            with open(file_path, 'w', encoding='utf-8') as fw:
                fw.write(page_source)
            sleep(2)
        driver.close()
        driver.quit()

    Step 2: collect the xls download links

    Collect the download addresses for every release that provides an xls link.

    Parse the table rows, pull out the releases that provide an xls file, store their date/link pairs in a JSON file, and save that JSON file into the designated directory.
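
    A minimal sketch of this extraction step, modeled on the Step 4 listing further below with the 'txt' check swapped for 'xls'; the wasde_step_two name is an assumption, while the directories and the date_tools.wasde_trans_date_str helper are the same ones used in the other steps:

    def wasde_step_two():
        pic_dir = r'D:/temp005/'
        pre_dir = r'D:/temp006/html/'
        for file_no in range(1, 67):
            final_list = []
            file_path = pre_dir + f"{file_no}.html"
            with open(file_path, 'r', encoding='utf-8') as fr:
                html_content = fr.read()
            soup = BeautifulSoup(html_content, 'lxml')
            tbody_node = soup.find('tbody', {'id': 'release-items'})
            for tr_node in tbody_node.find_all('tr'):
                td_list = tr_node.find_all('td')
                date_str = td_list[0].string
                res_date_str = date_tools.wasde_trans_date_str(date_str)
                xls_url = None
                # Keep the row only if one of its download links is labelled as xls
                for a_node in td_list[1].find_all('a'):
                    data_label = a_node.get('data-label')
                    if data_label is not None and 'xls' in data_label:
                        xls_url = a_node['href']
                        break
                if xls_url is None:
                    continue
                final_list.append({'date': res_date_str, 'url': xls_url})
            # One JSON file per catalog page, consumed by Step 3
            save_file_path = pic_dir + f"{file_no}.json"
            with open(save_file_path, 'w', encoding='utf-8') as fw:
                json.dump(final_list, fw, ensure_ascii=False)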

    Step 3: download the xls files

    Step 2 produced the xls download addresses; now download the xls files from those addresses.

    def wasde_step_three():
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        urllib3.disable_warnings()
        pre_dir = r'D:/temp005/'
        pic_save_dir = r'D:/temp003/'
        for i in range(1, 67):
            print(i)
            # Load the date/url pairs collected in Step 2 for this catalog page
            json_file_name = f"{i}.json"
            json_file_path = pre_dir + json_file_name
            with open(json_file_path, 'r', encoding='utf-8') as fr:
                node_list = json.load(fr)
            for node in node_list:
                date_str = node['date']
                url_str = node['url']
                xls_save_name = f"{date_str}.xls"
                # One sub-directory per catalog page
                pre_save_dir_00 = pic_save_dir + str(i)
                if not os.path.exists(pre_save_dir_00):
                    os.mkdir(pre_save_dir_00)
                r = requests.get(url_str, headers=headers, proxies=proxies, verify=False)
                xls_save_path = pre_save_dir_00 + os.path.sep + xls_save_name
                with open(xls_save_path, 'wb') as fw:
                    fw.write(r.content)

    Step 4: collect the txt download links

    For the releases that do not provide an xls file, collect the txt link addresses.

    def wasde_step_four():
        pic_dir = r'D:/temp005/'
        pre_dir = r'D:/temp006/html/'
        for file_no in range(1, 67):
            final_list = []
            file_name = f"{file_no}.html"
            file_path = pre_dir + file_name
            with open(file_path, 'r', encoding='utf-8') as fr:
                html_content = fr.read()
            soup = BeautifulSoup(html_content, 'lxml')
            tbody_node = soup.find('tbody', {'id': 'release-items'})
            tr_list = tbody_node.find_all('tr')
            for tr_node in tr_list:
                td_list = tr_node.find_all('td')
                td_one = td_list[0]
                date_str = td_one.string
                res_date_str = date_tools.wasde_trans_date_str(date_str)
                td_two = td_list[1]
                a_list = td_two.find_all('a')
                txt_url = None
                has_xls_yeah = False
                for a_node in a_list:
                    if a_node.get('data-label') is None:
                        continue
                    data_label = a_node['data-label']
                    if 'xls' in data_label:
                        # The release provides xls, so it was already handled in Steps 2 and 3
                        has_xls_yeah = True
                        break
                    if 'txt' in data_label:
                        txt_url = a_node['href']
                if has_xls_yeah:
                    continue
                if txt_url is None:
                    # Neither xls nor txt: log it and leave it for the pdf steps
                    print(f"{file_no}::{date_str}")
                    continue
                final_list.append({
                    'date': res_date_str,
                    'url': txt_url
                })
            # One JSON file per catalog page, consumed by Step 5
            save_file_name = f"{file_no}.json"
            save_file_path = pic_dir + save_file_name
            with open(save_file_path, 'w', encoding='utf-8') as fw:
                json.dump(final_list, fw, ensure_ascii=False)

    Following on from Step 2: for the releases without an xls file, check whether a txt file is provided; if so, record its link in a JSON file and save that JSON file into the designated directory.

    Step 5: download the txt files

    Step 4 produced the txt download addresses; now download the txt files from those addresses.

    def wasde_step_five():
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        urllib3.disable_warnings()
        pre_dir = r'D:/temp005/'
        pic_save_dir = r'D:/temp003/'
        for i in range(1, 67):
            print(i)
            json_file_name = f"{i}.json"
            json_file_path = pre_dir + json_file_name
            with open(json_file_path, 'r', encoding='utf-8') as fr:
                node_list = json.load(fr)
            for node in node_list:
                date_str = node['date']
                url_str = node['url']
                txt_save_name = f"{date_str}.txt"
                pre_save_dir_00 = pic_save_dir + str(i)
                if not os.path.exists(pre_save_dir_00):
                    os.mkdir(pre_save_dir_00)
                r = requests.get(url_str, headers=headers, proxies=proxies, verify=False)
                txt_save_path = pre_save_dir_00 + os.path.sep + txt_save_name
                # txt responses are written as text rather than raw bytes
                with open(txt_save_path, 'w', encoding='utf-8') as fw:
                    fw.write(r.text)

    Step 6: collect the pdf download links

    For the releases that provide neither an xls file nor a txt file, collect the pdf link addresses.

    def wasde_step_six():
        pic_dir = r'D:/temp005/'
        pre_dir = r'D:/temp006/html/'
        for file_no in range(1, 67):
            final_list = []
            file_name = f"{file_no}.html"
            file_path = pre_dir + file_name
            with open(file_path, 'r', encoding='utf-8') as fr:
                html_content = fr.read()
            soup = BeautifulSoup(html_content, 'lxml')
            tbody_node = soup.find('tbody', {'id': 'release-items'})
            tr_list = tbody_node.find_all('tr')
            for tr_node in tr_list:
                td_list = tr_node.find_all('td')
                td_one = td_list[0]
                date_str = td_one.string
                res_date_str = date_tools.wasde_trans_date_str(date_str)
                td_two = td_list[1]
                a_list = td_two.find_all('a')
                pdf_url = None
                has_xls_yeah = False
                has_txt_yeah = False
                for a_node in a_list:
                    if a_node.get('data-label') is None:
                        # A link without a data-label is treated as the pdf link
                        pdf_url = a_node['href']
                    else:
                        data_label = a_node['data-label']
                        if 'xls' in data_label:
                            has_xls_yeah = True
                            break
                        if 'txt' in data_label:
                            has_txt_yeah = True
                            break
                        if 'pdf' in data_label:
                            pdf_url = a_node['href']
                # Releases with xls or txt were already handled in the earlier steps
                if has_xls_yeah:
                    continue
                if has_txt_yeah:
                    continue
                if pdf_url is None:
                    print(f"{file_no}::{date_str}")
                    continue
                final_list.append({
                    'date': res_date_str,
                    'url': pdf_url
                })
            save_file_name = f"{file_no}.json"
            save_file_path = pic_dir + save_file_name
            with open(save_file_path, 'w', encoding='utf-8') as fw:
                json.dump(final_list, fw, ensure_ascii=False)

    Following on from Step 2: for the releases with neither an xls nor a txt file, record the pdf link in a JSON file and save that JSON file into the designated directory.

    Step 7: download the pdf files

    def wasde_step_seven():
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
        urllib3.disable_warnings()
        pre_dir = r'D:/temp005/'
        pic_save_dir = r'D:/temp003/'
        for i in range(1, 67):
            print(i)
            json_file_name = f"{i}.json"
            json_file_path = pre_dir + json_file_name
            with open(json_file_path, 'r', encoding='utf-8') as fr:
                node_list = json.load(fr)
            for node in node_list:
                date_str = node['date']
                url_str = node['url']
                # Skip entries whose link does not point at a pdf
                if 'pdf' not in url_str:
                    continue
                pdf_save_name = f"{date_str}.pdf"
                pre_save_dir_00 = pic_save_dir + str(i)
                if not os.path.exists(pre_save_dir_00):
                    os.mkdir(pre_save_dir_00)
                r = requests.get(url_str, headers=headers, proxies=proxies, verify=False)
                pdf_save_path = pre_save_dir_00 + os.path.sep + pdf_save_name
                with open(pdf_save_path, 'wb') as fw:
                    fw.write(r.content)

    Step 8: convert the pdf files to txt

    The pdf files downloaded from this site have to be run through OCR to recover their text. The content to recognise is only English text and digits, and the recognition quality is very good. Note that txt files produced from pdfs and directly downloaded txt files must be treated separately when extracting data; the two cannot be parsed with the same routine.

    8.1 Download and install tesseract-ocr-w64-setup-v5.0.1.20220118.exe

    1). Download link: https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.0.1.20220118.exe

    2). During installation, remember to tick the option to install the language packs.

    3). Choose an installation path and remember it; it will be needed in the code later.

    4). Installation takes a while, roughly half an hour, and several dialog boxes will pop up along the way; just click OK on each one.

     

    8.2 Install Poppler. Poppler is what lets pdf2image work with pdf files on Windows.

    1). Download link: https://blog.alivate.com.au/wp-content/uploads/2018/10/poppler-0.68.0_x86.7z

    2). After downloading, simply extract the archive and remember the extracted path; it will be needed in the code shortly.

    8.3 Install the required Python packages

    pip install Pillow
    pip install pdf2image
    pip install pytesseract
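
    Before writing the full demo in 8.4, a quick sanity check can confirm that both tools are reachable from Python. The paths and the sample pdf below are the ones assumed in the later listings; adjust them to your own machine:

    import pytesseract
    from pdf2image import convert_from_path

    pytesseract.pytesseract.tesseract_cmd = r'D:\soft\ocr\tesseract.exe'
    poppler_path = r'D:/python_package/poppler-0.68.0_x86/poppler-0.68.0/bin/'

    # If Tesseract is wired up correctly this prints its version, e.g. 5.0.1
    print(pytesseract.get_tesseract_version())
    # If Poppler is wired up correctly this renders the sample pdf and prints its page count
    pages = convert_from_path(pdf_path=r'E:/temp000/1994_9_12.pdf', poppler_path=poppler_path)
    print(len(pages))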

    8.4 Write a small demo to verify that tesseract recognition works

    from PIL import Image
    import pytesseract
    from pdf2image import convert_from_path

    def temp_tesseract_demo():
        poppler_path = r'D:/python_package/poppler-0.68.0_x86/poppler-0.68.0/bin/'
        pytesseract.pytesseract.tesseract_cmd = r'D:\soft\ocr\tesseract.exe'
        pdf_file_path = r'E:/temp000/1994_9_12.pdf'
        # Render each pdf page as an image
        images = convert_from_path(pdf_path=pdf_file_path, poppler_path=poppler_path)
        print('Saving pages as images')
        pic_pre_dir = r'E:/temp000/pic/'
        pic_file_list = []
        for count, img in enumerate(images):
            img_path = f"{pic_pre_dir}page_{count}.png"
            img.save(img_path, 'PNG')
            pic_file_list.append(img_path)
        print('Converting images to text')
        txt_pre_dir = r'E:/temp000/txt/'
        for file_count, file_item in enumerate(pic_file_list):
            print(file_count, file_item)
            # OCR the page image; the reports contain only English text and digits
            extracted_text = pytesseract.image_to_string(Image.open(file_item), lang='eng')
            txt_file_path = f"{txt_pre_dir}txt_{file_count}.txt"
            with open(txt_file_path, 'w', encoding='utf-8') as fw:
                fw.write(extracted_text)

    Screenshots (omitted here): the folder of page images, the folder of extracted text files, and a side-by-side comparison of a page image with its extracted text.

    8.5 Convert the pdf files to png

    def wasde_step_eight_02_00():
        poppler_path = r'D:/python_package/poppler-0.68.0_x86/poppler-0.68.0/bin/'
        pre_dir = r'E:/temp003/'
        save_dir = r'E:/temp002/'
        # Mirror the per-page sub-directory layout of the downloaded pdfs
        dir_two_list = os.listdir(pre_dir)
        for two_dir in dir_two_list:
            save_dir_two = save_dir + two_dir
            if not os.path.exists(save_dir_two):
                os.mkdir(save_dir_two)
            pre_dir_two = pre_dir + two_dir
            pdf_file_list = os.listdir(pre_dir_two)
            for pdf_item in pdf_file_list:
                print(two_dir, pdf_item)
                pdf_name = pdf_item.split('.')[0]
                pdf_item_path = pre_dir_two + os.path.sep + pdf_item
                # One directory of page images per pdf
                pdf_pic_dir = save_dir_two + os.path.sep + pdf_name
                if not os.path.exists(pdf_pic_dir):
                    os.mkdir(pdf_pic_dir)
                images = convert_from_path(pdf_path=pdf_item_path, poppler_path=poppler_path)
                for count, img in enumerate(images):
                    img_path = f"{pdf_pic_dir}{os.path.sep}page_{count}.png"
                    img.save(img_path, 'PNG')

    8.6 Convert the png files to txt

    def wasde_step_eight_02_01():
        pytesseract.pytesseract.tesseract_cmd = r'D:\soft\ocr\tesseract.exe'
        png_one_dir = r'E:/temp002/'
        txt_one_dir = r'E:/temp001/'
        png_two_dir_list = os.listdir(png_one_dir)
        for two_dir in png_two_dir_list:
            txt_two_dir = txt_one_dir + two_dir
            if not os.path.exists(txt_two_dir):
                os.mkdir(txt_two_dir)
            png_two_dir = png_one_dir + two_dir
            png_three_dir_list = os.listdir(png_two_dir)
            for three_dir in png_three_dir_list:
                print(two_dir, three_dir)
                txt_three_dir = txt_two_dir + os.path.sep + three_dir + os.path.sep
                if not os.path.exists(txt_three_dir):
                    os.mkdir(txt_three_dir)
                png_three_dir = png_two_dir + os.path.sep + three_dir + os.path.sep
                png_file_list = os.listdir(png_three_dir)
                png_count = len(png_file_list)
                # OCR the page images in order: page_0.png, page_1.png, ...
                for i in range(0, png_count):
                    png_file_path = f"{png_three_dir}page_{i}.png"
                    extracted_text = pytesseract.image_to_string(Image.open(png_file_path), lang='eng')
                    txt_file_path = f"{txt_three_dir}txt_{i}.txt"
                    with open(txt_file_path, 'w', encoding='utf-8') as fw:
                        fw.write(extracted_text)

    With that, the whole project is done; the specific figures you need can then be extracted from the corresponding xls, txt, or (pdf-derived) txt files.
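
    For the xls files, extraction can start from something as small as the sketch below. It is only a sketch: xlrd is already imported at the top of the script, but the file path, sheet index and cell positions are placeholders, and the real positions depend on the layout of each report:

    def read_one_cell(xls_path, sheet_index=0, row=0, col=0):
        # Open a downloaded WASDE workbook and return a single cell value
        book = xlrd.open_workbook(xls_path)
        sheet = book.sheet_by_index(sheet_index)
        return sheet.cell_value(row, col)

    # Placeholder example: first cell of the first sheet of one downloaded report
    print(read_one_cell(r'D:/temp003/1/some_report.xls'))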

• Original article: https://blog.csdn.net/m0_37967652/article/details/126102898