import nltk

# Sample text (opening of "Great Expectations"; "Pririp" is as in the original source).
# NOTE: renamed from `string` to avoid shadowing the stdlib `string` module.
text = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."

# Split the text into word/punctuation tokens, then attach a part-of-speech
# tag to each token, yielding a list of (token, tag) pairs.
string_tokenized = nltk.word_tokenize(text)
string_postagged = nltk.pos_tag(string_tokenized)
# Bare expression: displays the tagged list when run in a notebook/REPL cell.
string_postagged
import nltk

text = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."

# Split the text into sentences with the pre-trained English Punkt model.
sent_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sents_splitted = sent_splitter.tokenize(text)

# Raw string prevents backslashes in the Windows path from being treated as
# escape sequences; `with` guarantees the file is closed even if tagging fails.
# Mode 'a' appends, so repeated runs accumulate output in the same file.
with open(r'D:\works\文本分析\sent_postagged.txt', 'a', encoding='utf-8') as file_out:
    # POS-tag each sentence and write it out as space-separated word_TAG pairs,
    # one sentence per line.
    for sent in sents_splitted:
        sent_tokenized = nltk.word_tokenize(sent)
        sent_postag = nltk.pos_tag(sent_tokenized)
        for word, tag in sent_postag:
            file_out.write(word + '_' + tag + ' ')
        file_out.write('\n')
语料库语言学研究的一个热点问题是对词块(Ngrams 或 chunks)的研究。根据抽取词块的长度,可以将词块分为一词词块(单词)、二词词块、三词词块、四词词块等。比如从字符串"To be or not to be"中可以抽取出五个二词词块:"To be"、"be or"、"or not"、"not to"、"to be"。
#%%
import nltk
from nltk.util import ngrams

text = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."

# Lower-case before tokenizing so that e.g. "So" and "so" are the same type.
text_tokenized = nltk.word_tokenize(text.lower())

# n-gram length: 4 extracts four-word chunks; change to 2/3 for bi-/tri-grams.
n = 4
# ngrams() returns a lazy generator of n-length token tuples.
n_grams = ngrams(text_tokenized, n)
for gram in n_grams:
    print(gram)
import re
import nltk
from nltk.util import ngrams

text = "My father's name being Pririp,and my Christian name Philip,my infant tongue could make of both names nothing longer or more explicit than Pip. So,I called myself Pip,and came to be called Pip."

text_tokenized = nltk.word_tokenize(text.lower())

n = 4
n_grams = ngrams(text_tokenized, n)

# Keep only the n-grams whose every token contains at least one word character.
# \W matches any non-word character, equivalent to [^A-Za-z0-9_], so
# r'^\W+$' matches tokens that are pure punctuation (e.g. ',' or '.').
# Iterating over `gram` itself (rather than a hard-coded range(4)) keeps the
# filter correct for any value of n.
n_grams_alphanum = []
for gram in n_grams:
    for token in gram:
        if re.search(r'^\W+$', token):
            # Token is entirely non-word characters — discard this gram.
            break
    else:
        # for...else: reached only when the loop finished without `break`,
        # i.e. no purely non-word token was found in this gram.
        n_grams_alphanum.append(gram)

for j in n_grams_alphanum:
    print(j)