思路:
第一步:使用 Python 把 PDF 文件中的英文单词导出到 txt 文件
第二步:把导出的 txt 文件导入到词典软件(例如金山词霸)的单词库中
第三步:在单词库中删除已经熟练掌握的单词,集中记忆剩余的生词,从而更方便地理解英文专业资料。
以下代码演示如何提取当前工作目录下 workspace 子目录中各 PDF 文件里的英语单词;每个 PDF 会在同一目录下生成一个对应的 “_dict.txt” 单词文件。
-
- import pdfplumber
- import glob,os
-
# Words collected so far, shared by every PDF processed in this run.
# Keys are the capitalised spelling (so the same word in different letter
# cases is recorded once); values are the spelling as first encountered.
WordDict = {}
-
def isWord(word, min_length=5):
    """Return True when *word* looks like a plain English word worth keeping.

    A token qualifies when it has at least ``min_length`` characters and is
    made up solely of ASCII letters: no digits, underscores, punctuation or
    non-ASCII characters.

    Args:
        word: Candidate token (a ``str``) taken from the extracted text.
        min_length: Minimum number of characters required. Defaults to 5,
            which filters out short tokens.

    Returns:
        bool: True if the token passes every check, otherwise False.
    """
    if len(word) < min_length:
        return False
    # An ASCII identifier containing no digits and no underscores is exactly
    # a run of ASCII letters, so two built-in checks cover all the rules.
    return word.isascii() and word.isalpha()
-
# Folder that holds the PDFs to scan: the "workspace" sub-directory of the
# current working directory. The trailing empty component keeps a trailing
# path separator on DIR.
# DIR = r"E:\GetEnglishDictionary"
DIR = os.path.join(os.getcwd(), "workspace", "")

# Punctuation removed from both ends of a token so that words next to
# sentence punctuation or brackets ("word.", "(word)") are still recognised.
# The hyphen is deliberately excluded: a trailing "-" usually marks a word
# broken across lines, and stripping it would record a word fragment.
_EDGE_PUNCTUATION = ".;:!?\"'()[]{}<>"

# Names of the PDF files directly inside DIR. Matching on the extension
# (case-insensitively) avoids picking up names that merely contain ".pdf";
# sorting makes the processing order reproducible.
Dirlist = sorted(
    name for name in os.listdir(DIR) if name.lower().endswith(".pdf")
)

try:
    for pdf_name in Dirlist:
        print("Analyse {} file".format(pdf_name))
        pdffile = os.path.join(DIR, pdf_name)

        # "<name>.pdf" -> "<name>_dict.txt", touching only the extension.
        wordDictFile = os.path.splitext(pdffile)[0] + "_dict.txt"

        # Context managers close both files even if a page fails to parse.
        with open(wordDictFile, "w", encoding="utf-8") as dictFile:
            with pdfplumber.open(pdffile) as pdf:
                pageNum = len(pdf.pages)
                now = 0  # last percentage that was reported
                for pageIndex, page in enumerate(pdf.pages, start=1):
                    # Report progress each time the whole-number percentage
                    # advances.
                    progress = pageIndex * 100 // pageNum
                    if progress >= now + 1:
                        print(pdffile + " : " + str(progress) + " %")
                        now = progress

                    # extract_text() reads the text content of the page; it
                    # may yield None for a page without text, so fall back
                    # to an empty string instead of failing.
                    txt = page.extract_text() or ""

                    # Commas become separators, then split on any whitespace
                    # (spaces, newlines, tabs).
                    for vol in txt.replace(",", " ").split():
                        vol = vol.strip(_EDGE_PUNCTUATION)
                        if not isWord(vol):
                            continue
                        # Record each word once, regardless of letter case.
                        key = vol.capitalize()
                        if key not in WordDict:
                            WordDict[key] = vol
                            dictFile.write(vol + "\n")

        # WordDict is shared across files, so this count is cumulative.
        print("共 " + str(pageNum) + " 页,提取单词:" + str(len(WordDict)) + " 个")

except Exception as e:
    # Top-level boundary of the script: report the failure and stop.
    print(repr(e))

finally:
    print("finish write")
-
-