import PyPDF2
# Single-file conversion: extract the text of one PDF and save it as UTF-8.
# NOTE(review): both file names look like truncated placeholders — confirm
# the intended paths before running.
pdffile = ").pdf"
txtfile = "(1).txt"

# Extraction has to happen while the PDF stream is still open, so the
# per-page texts are collected inside the ``with`` block.
with open(pdffile, mode="rb") as pdf_stream:
    pdf_reader = PyPDF2.PdfReader(pdf_stream)
    page_texts = [page.extract_text() for page in pdf_reader.pages]

# Pages are concatenated with no separator between them.
text = "".join(page_texts)

with open(txtfile, mode="w", encoding="utf-8") as out_stream:
    out_stream.write(text)
# Batch conversion
import os
import PyPDF2
import re
# Batch conversion: extract the text of every PDF found in ``pdf_path`` and
# write it to a same-named ``.txt`` file (UTF-8) in ``txt_path``.
#
# The directory names are built with os.path.join instead of a literal such
# as '.\<name>': a backslash followed by a non-escape character is an invalid
# escape sequence in a normal string literal. On Windows the joined value is
# identical to what the old literal evaluated to.
pdf_path = os.path.join(os.curdir, '数据PDF')
txt_path = os.path.join(os.curdir, '数据TXT')

pdflists = os.listdir(pdf_path)

# Create the output directory (only once the input directory is known to
# exist) so the first write cannot fail on a missing folder.
os.makedirs(txt_path, exist_ok=True)

for pdflist in pdflists:
    # Split off the extension instead of using a regex: the old pattern
    # '(.+).pdf' had an unescaped dot, was not anchored, and raised
    # IndexError for any directory entry that was not a PDF.
    stem, extension = os.path.splitext(pdflist)
    if extension.lower() != '.pdf':
        # Not a PDF (e.g. a stray system file) — skip it instead of crashing.
        continue

    pdffile = os.path.join(pdf_path, pdflist)
    txtfile = os.path.join(txt_path, stem + '.txt')
    print(txtfile)

    # Text must be extracted while the PDF stream is still open.
    with open(pdffile, "rb") as pdf:
        reader = PyPDF2.PdfReader(pdf)
        text = "".join(page.extract_text() for page in reader.pages)

    with open(txtfile, 'w', encoding='utf-8') as txt:
        txt.write(text)