import re
# Copy every line of ge.txt that contains "children" or "Children" into
# ge_children.txt.  The output file is opened in append mode ('a'), so running
# the script again adds to whatever the file already holds.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in, \
        open(r'D:\works\文本分析\ge_children.txt', 'a') as file_out:
    for line in file_in:
        if re.search(r'[Cc]hildren', line):
            # The line keeps its own trailing newline, so it is written as-is.
            file_out.write(line)
import re
# Copy every line of ge.txt that contains a word ending in "tional"
# (one or more word characters, then "tional", then a word boundary) into
# ge_tional.txt.  The output file is opened in append mode ('a'), so running
# the script again adds to whatever the file already holds.
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in, \
        open(r'D:\works\文本分析\ge_tional.txt', 'a') as file_out:
    for line in file_in:
        if re.search(r'\w+tional\b', line):
            # An extra '\n' is appended on top of the newline the line
            # already carries, which separates the matches in the output.
            file_out.write(line + '\n')
1
2
3
4
5
6
7
8
7.9 实现Range软件功能
学术英语词汇研究是应用语言学和学术用途英语领域的重要研究问题。Nation(2001)将词汇分成常用词汇、学术词汇、科技词汇、罕见词等几类。Nation(2001)认为,学习者往往先学习常用词汇,然后再学习科技词汇,最后学习科技术语和罕见词。常用词汇一般指West(1953)开发的通用词表(General Service List,GSL)包含的英语中最常用的2000个词汇(或词族)。学者们也开发了诸多学术英语词表,以帮助研究者学习和使用学术英语词汇,其中,Coxhead(2000)开发的学术英语词表(Academic Word List,AWL)是近年来影响最大的学术英语词表,该词表包括570个高频学术词汇(或词族)。为了更好地帮助研究者将AWL词表应用于研究和教学资料开发,Paul Nation还开发了Range软件。该软件内置GSL中最常见1000个词汇(以下简称GSL1000)、GSL中最常见2000个词汇(以下简称GSL2000)和AWL三个词表。Range软件可以分析语料库文本中的哪些词汇属于GSL1000、GSL2000和AWL词表,并汇报三个词表词汇的数量及其占文本所有词汇数量的百分比(覆盖率)。
# Recognizing_gsl_awl_words.py,part 1
# the following is to make the wordlist with freq
# and store the info in a dictionary (wordlist_freq_dict)
import re

# Collect every token of ge.txt, lower-cased, in reading order.
all_words = []
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\ge.txt', 'r') as file_in:
    for line in file_in:
        # Replace each non-word character (\W) with a space so that
        # punctuation does not stay glued to a word, then split on whitespace.
        all_words.extend(re.sub(r'\W', r' ', line.lower()).split())

# Map each distinct token to the number of times it occurs in all_words.
wordlist_freq_dict = {}
for word in all_words:
    wordlist_freq_dict[word] = wordlist_freq_dict.get(word, 0) + 1
# Recognizing_gsl_awl_words.py,Part 2
# the following is to read the GSL and the AWL words
# and save them in a dictionary
#
# gsl_awl_dict maps each list entry (one per line, surrounding whitespace
# stripped) to a list code: 1 = GSL1000, 2 = GSL2000, 3 = AWL.
# NOTE(review): entries are not lower-cased here, while the text tokens they
# are compared against are -- confirm the list files are all lower case.
gsl_awl_dict = {}
with open(r'D:\works\文本分析\leopythonbookdata-master\texts\GSL1000.txt', 'r') as gsl1000_in, \
        open(r'D:\works\文本分析\leopythonbookdata-master\texts\GSL2000.txt', 'r') as gsl2000_in, \
        open(r'D:\works\文本分析\leopythonbookdata-master\texts\AWL.txt', 'r') as awl_in:
    # The files are read in this fixed order; an entry that appears in more
    # than one list ends up with the code of the last list that contains it.
    for list_code, list_file in ((1, gsl1000_in), (2, gsl2000_in), (3, awl_in)):
        for word in list_file:
            gsl_awl_dict[word.strip()] = list_code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
(3)分析ge.txt中的词汇属于哪个词表
下面是代码的第三部分,分析读入文本的哪些词汇分属于上述三个词表,并统计它们的数量和所占百分比。我们首先分别定义四个空的字典变量,以分别储存上述三个词表的单词、频次(gsl1000_words,gsl2000_words,awl_words)和上述三个词表以外的单词(other_words)。然后,循环遍历wordlist_freq_dict的键,也就是ge.txt文本的单词(wordlist_freq_dict.keys()),如果单词不在GSL和AWL词表中(if word not in gsl_awl_dict.keys()),则other_words字典的键为该单词,值为4(other_words[word] = 4);如果单词在GSL1000中,则gsl1000_words字典的键为该单词,值为该单词在ge.txt文本中的频次;以此类推。
# Recognizing_gsl_awl_words.py,part 3-1
# the following is to categorize the words in wordlist_freq_dict
# into dictionaries of GSL1000 words,GSL2000 words,AWL words or others
gsl1000_words = {}
gsl2000_words = {}
awl_words = {}
other_words = {}
for word, freq in wordlist_freq_dict.items():
    if word not in gsl_awl_dict:
        # Not on any of the three lists.  The value 4 is only a category
        # marker, not a count; the word's frequency stays in
        # wordlist_freq_dict.
        other_words[word] = 4
    elif gsl_awl_dict[word] == 1:
        # Frequency in ge.txt of a word that is on the GSL1000 list.
        gsl1000_words[word] = freq
    elif gsl_awl_dict[word] == 2:
        gsl2000_words[word] = freq
    elif gsl_awl_dict[word] == 3:
        awl_words[word] = freq
# Recognizing_gsl_awl_words.py,part 3-2
# compute freq total
#
# For each category, add up the text frequency of every word in it.  The
# counts are always read from wordlist_freq_dict because other_words stores a
# category marker, not a frequency, as its value.
gsl1000_freq_total = sum(wordlist_freq_dict[word] for word in gsl1000_words)
gsl2000_freq_total = sum(wordlist_freq_dict[word] for word in gsl2000_words)
awl_freq_total = sum(wordlist_freq_dict[word] for word in awl_words)
other_freq_total = sum(wordlist_freq_dict[word] for word in other_words)
# Recognizing_gsl_awl_words.py,part 3-3
# Count the distinct words (types) that fell into each category.
(gsl1000_num_of_words,
 gsl2000_num_of_words,
 awl_num_of_words,
 other_num_of_words) = map(
    len, (gsl1000_words, gsl2000_words, awl_words, other_words))

# Total number of running words (tokens) in ge.txt.
freq_total = sum((
    gsl1000_freq_total,
    gsl2000_freq_total,
    awl_freq_total,
    other_freq_total,
))

# Total number of distinct word forms (types) in ge.txt.
num_of_words_total = sum((
    gsl1000_num_of_words,
    gsl2000_num_of_words,
    awl_num_of_words,
    other_num_of_words,
))
1
2
3
4
5
6
7
8
9
10
11
下面是代码的第四部分,将结果写出到range_wordlist_results.txt文本中。首先打开一个写出range_wordlist_results.txt文本的文件句柄。然后,在文本中写出文本的标题'RESULTS OF WORD ANALYSIS',标题后面加两个换行符'\n\n'。接着,分别写出文本的总词形数量和各部分的词形数量。请注意,写出到文本的内容必须是字符串,因此,当写出内容为数值变量时,必须先用str()函数将其转换为字符串,如str(num_of_words_total)。
# Recognizing_gsl_awl_words.py,part 4-1
# the following is to write out the results


def _frequency_share(part, total):
    """Return part / total as a float, or 0.0 when total is zero.

    The zero case happens when the analysed text contains no words; without
    this guard the report would stop with a ZeroDivisionError.
    """
    if not total:
        return 0.0
    return part / float(total)


# first,define the file to save the results
# The file is opened in append mode ('a'), so each run adds a new report after
# any existing content.  It is deliberately left open here: the per-list word
# tables are written to the same handle later, and it is closed after them.
file_out = open(r'D:\works\文本分析\range_wordlist_results.txt', 'a')

# then,write out the results
# Word-type (distinct word) counts.
file_out.write('RESULTS OF WORD ANALYSIS\n\n')
file_out.write('Total No. of word types in Great Expectations: ' + str(num_of_words_total) + '\n\n')
file_out.write('Total No. of GSL1000 word types : ' + str(gsl1000_num_of_words) + '\n\n')
file_out.write('Total No. of GSL2000 word types : ' + str(gsl2000_num_of_words) + '\n')
file_out.write('Total No. of AWL word types : ' + str(awl_num_of_words) + '\n')
file_out.write('Total No. of other word types : ' + str(other_num_of_words) + '\n')

# Recognizing_gsl_words.py,part 4-2
# Token frequencies.  The value labelled "percentage" is written as a plain
# ratio (category total divided by freq_total), not multiplied by 100.
file_out.write('\n\n')
file_out.write('Total word frequency of Great Expectations: ' + str(freq_total) + '\n\n')
for label, category_total, separator in (
        ('GSL1000', gsl1000_freq_total, '\n\n'),
        ('GSL2000', gsl2000_freq_total, '\n\n'),
        ('AWL', awl_freq_total, '\n\n'),
        ('other', other_freq_total, '\n'),  # last entry: single newline only
):
    file_out.write('Total frequency of ' + label + ' words: ' + str(category_total) + '\n')
    file_out.write('Frequency percentage of ' + label + ' words: '
                   + str(_frequency_share(category_total, freq_total)) + separator)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
(5)统计ge.txt文本单词总频次以及各部分单词频次
最后,我们分别将这三个词表的单词及频次和三个词表以外的单词及频次写出到结果文件。
# Recognizing_gsl_awl_words.py, Part 4-3
# Write one section per category to the already-open results file: a blank
# gap, a '##########' rule, the section heading, then every word of that
# category in sorted order as "word<TAB>frequency".
try:
    for heading, category_words in (
            ('Words in GSL1000', gsl1000_words),   # write out the GSL1000 words
            ('Words in GSL2000', gsl2000_words),   # write out the GSL2000 words
            ('Words in AWL', awl_words),           # write out the AWL words
            ('Other words', other_words),          # write out other words
    ):
        file_out.write('\n\n')
        file_out.write('##########\n')
        file_out.write(heading + '\n\n')
        for word in sorted(category_words):
            # Frequencies are read from wordlist_freq_dict for every section:
            # the three list dictionaries hold the same counts, while
            # other_words holds only a category marker as its value.
            file_out.write(word + '\t' + str(wordlist_freq_dict[word]) + '\n')
finally:
    # Close the results file even if one of the writes above fails.
    file_out.close()