• Python实现查询一个文件夹中PDF文件里的关键字


    要求:查询一个文件夹(含子文件夹)中所有PDF文件里的关键字,输出关键字所在PDF文件的文件名及对应的页数。

    1. import os
    2. import PyPDF2
    def search_pdf_files(folder_path, keywords):
        """Find the pages of every PDF under ``folder_path`` that contain each keyword.

        The folder is walked recursively with ``os.walk``. Matching is a plain,
        case-sensitive substring test against the text extracted from each page.

        Args:
            folder_path: Root directory to search.
            keywords: Iterable of keyword strings to look for.

        Returns:
            A dict mapping each keyword to a list of
            ``{"file_name": <name>, "page_number": <1-based page>}`` dicts,
            one per page on which the keyword occurs. A keyword that is never
            found maps to an empty list.
        """
        results = {keyword: [] for keyword in keywords}

        for root, _dirs, files in os.walk(folder_path):
            for filename in files:
                # Compare case-insensitively so files named "X.PDF" are not skipped.
                if not filename.lower().endswith(".pdf"):
                    continue

                pdf_path = os.path.join(root, filename)
                with open(pdf_path, "rb") as pdf_file:
                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                    # Page numbers reported to the user start at 1.
                    for page_number, page in enumerate(pdf_reader.pages, start=1):
                        # Guard against a None/empty extraction result so the
                        # substring test below cannot raise TypeError.
                        page_text = page.extract_text() or ""
                        for keyword in keywords:
                            if keyword in page_text:
                                results[keyword].append({
                                    "file_name": filename,
                                    "page_number": page_number,
                                })

        return results
    # Example usage: search a folder and report, per keyword, every file and
    # page on which it was found.
    folder_to_search = r"C:\Users\Administrator\Desktop\2"
    # Several keywords, including one that is expected not to be found.
    search_keywords = ["SVD", "线性回归", "XGBoost", "不存在的关键字"]
    results = search_pdf_files(folder_to_search, search_keywords)

    # Print the results, one section per keyword.
    for keyword, keyword_results in results.items():
        if keyword_results:
            print(f"关键字 '{keyword}' 所在的文件及页数:")
            for result in keyword_results:
                print(f"文件 '{result['file_name']}' 的第 {result['page_number']} 页")
        else:
            print(f"没有找到关键字 '{keyword}'。")
        print()  # Blank line to separate the results of different keywords

    这样做是为了方便且高效地看论文。

    用了上面的脚本之后发现没必要输出页数:找到文件后,直接在文件里用 Ctrl+F 搜索即可。所以改为每个关键字在每个文件中只记录一次,打印时不再输出页数。代码如下:

    1. import os
    2. import PyPDF2
    def search_pdf_files(folder_path, keywords):
        """Find which PDFs under ``folder_path`` contain each keyword.

        The folder is walked recursively with ``os.walk``. Matching is a plain,
        case-sensitive substring test against the text extracted from each page.
        Each keyword is recorded at most once per PDF file, on the first page
        where it occurs.

        Args:
            folder_path: Root directory to search.
            keywords: Iterable of keyword strings to look for.

        Returns:
            A dict mapping each keyword to a list of dicts with the keys
            ``"file_name"`` (bare file name), ``"file_path"`` (full path, so
            same-named files in different subfolders can be told apart) and
            ``"page_number"`` (1-based page of the first occurrence). A keyword
            that is never found maps to an empty list.
        """
        results = {keyword: [] for keyword in keywords}

        for root, _dirs, files in os.walk(folder_path):
            for filename in files:
                # Compare case-insensitively so files named "X.PDF" are not skipped.
                if not filename.lower().endswith(".pdf"):
                    continue

                pdf_path = os.path.join(root, filename)
                # Keywords not yet seen in THIS file. Tracking per file (rather
                # than per bare file name) keeps a same-named PDF in another
                # subfolder from being dropped.
                pending = set(keywords)

                with open(pdf_path, "rb") as pdf_file:
                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                    for page_number, page in enumerate(pdf_reader.pages, start=1):
                        if not pending:
                            # Every keyword already found here; skip the
                            # remaining pages of this file.
                            break
                        # Guard against a None/empty extraction result so the
                        # substring test below cannot raise TypeError.
                        page_text = page.extract_text() or ""
                        for keyword in keywords:
                            if keyword in pending and keyword in page_text:
                                results[keyword].append({
                                    "file_name": filename,
                                    "file_path": pdf_path,
                                    "page_number": page_number,
                                })
                                pending.discard(keyword)

        return results
    # Example usage: search a folder of papers and report, per keyword, the
    # files that mention it (no page numbers; open the file and use Ctrl+F).
    folder_to_search = r"C:\Users\Administrator\Desktop\优秀论文"
    # Method / model names to look for in the papers.
    search_keywords = [
        "ARIMA", "XGBoost", "SVM", "支持向量机", "线性回归", "决策树",
        "随机森林", "模拟退火", "粒子群", "遗传算法", "LSTM", "BP神经网络",
        "t-SNE", "LightGBM", "GMM", "距离相关系数", "灰色关联分析", "互信息",
        "信息熵", "递归特征消除", "综合评价", "熵权法",
    ]
    results = search_pdf_files(folder_to_search, search_keywords)

    # Print the results, one section per keyword.
    for keyword, keyword_results in results.items():
        if keyword_results:
            # Only file names are listed below, so the header no longer
            # promises page numbers.
            print(f"关键字 '{keyword}' 所在的文件:")
            for result in keyword_results:
                print(f"文件 '{result['file_name']}'")
        else:
            print(f"没有找到关键字 '{keyword}'。")
        print()  # Blank line to separate the results of different keywords

  • 相关阅读:
    PDF能编辑修改吗?教你必备的几种编辑方法
    goroutine 调度
    Java面试题之并发
    BurpSuit官方实验室之SQL注入
    Web AP—PC端网页特效
    Linux下怎么修改普通用户的权限?
    bff层解决了什么痛点
    rabbitmq配置windows authentication(windows account)登录
    SpringMVC
    XILINX XC7A200T-1FBG676C FPGA - 现场可编程门阵列
  • 原文地址:https://blog.csdn.net/Catherinemin/article/details/133033211