首先,需要一台syntaxnet/Parsey McParseface服务器运行在指定的url地址。
详细可参照以下官网:parsey-universal-server
它是一个简单的Python Flask应用,通过HTTP以API的形式提供Parsey McParseface服务。
我们需要先在矩池云租用一台支持docker的机器,如下图所示:
然后用VsCode连接这台机器,连接成功后打开终端,如下图所示:
在构建时,我们需要在终端命令行中输入以下命令:
git clone https://github.com/andersrye/parsey-universal-server.git
执行完后,资源管理器中会出现如下文件夹:
我们切到这个目录下:
cd parsey-universal-server
接着输入以下命令来使用 Dockerfile 创建镜像:
docker build -t parseyserver .
可能会出现timeout的情况,但不要慌,再运行一遍试试,出现下图这样的输出信息后,等它创建完成就可以啦:
创建一个新的容器并运行一个命令(-d: 后台运行容器,并返回容器ID)
docker run -d -it -p 7777:80 -e PARSEY_MODELS=English -e PARSEY_BATCH_SIZE=100 --name=syntaxnet --restart unless-stopped andersrye/parsey-universal-server
默认模型是English,可以通过设置PARSEY_MODELS环境变量来选择模型:
docker run -it --rm -p 7777:80 -e PARSEY_MODELS=Latin,English,Greek andersrye/parsey-universal-server
我们需要先把文本处理成SyntaxNet能够处理的形式:
def prepareTextForSyntaxnet(text=""):
    """Convert raw text into the one-sentence-per-line form sent to SyntaxNet.

    Line breaks are stripped from ``text``, the remainder is split into
    sentences, and every sentence is rebuilt from its tokens joined by single
    spaces. Only sentences with more than 8 non-space characters are kept.

    Relies on a module-level ``tokenizer`` object that provides
    ``split_sentences(str)`` and ``tokenize(str)``.

    Args:
        text: Raw input text.

    Returns:
        A string holding one space-separated tokenized sentence per line,
        each line terminated by a newline; empty when no sentence qualifies.
    """
    # Fix: the original read ``tmp`` before assigning it, which raises
    # UnboundLocalError on every call; the ``text`` argument is the input.
    # NOTE(review): newlines are removed, not replaced by a space, so words
    # separated only by a line break get joined -- confirm this is intended.
    flattened = text.replace("\n", "")

    lines = []
    for sentence in tokenizer.split_sentences(flattened):
        tokenized_sentence = " ".join(tokenizer.tokenize(sentence))
        # Keep only sentences with more than 8 non-space characters.
        if len(tokenized_sentence.replace(" ", "")) > 8:
            lines.append(tokenized_sentence)

    # Join once instead of growing a string inside the loop.
    return "".join(line + "\n" for line in lines)
这里我们可以写一个Tokenizer的类来做分词与分句:
class Tokenizer:
    """Thin wrapper around a spaCy pipeline for tokenizing and sentence splitting."""

    def __init__(self, lang="en"):
        # NOTE(review): ``lang`` is accepted but never used; the English
        # model 'en_core_web_sm' is always the one loaded.
        self.spacy_nlp = spacy.load('en_core_web_sm')

    def tokenize(self, input):
        """Return the token texts of ``input``, skipping single-space tokens."""
        words = []
        for token in self.spacy_nlp.tokenizer(input):
            if token.text == " ":
                continue
            words.append(token.text)
        return words

    def split_sentences(self, input):
        """Return the sentence texts of ``input``, skipping single-space sentences."""
        sentences = []
        for span in self.spacy_nlp(input).sents:
            if span.text == " ":
                continue
            sentences.append(span.text)
        return sentences
不知道为什么目前矩池云公测的这台机器只能安装spacy>=3.0.0:
pip install spacy==3.0.0
在安装spacy==3.0.0后需要下载模型en_core_web_sm,我采用的是离线的方式,直接下载会timeout:
先下载压缩包,并保存到指定的路径下:/mnt/package/en_core_web_sm-3.0.0.tar.gz
然后安装就可以啦:
pip install /mnt/package/en_core_web_sm-3.0.0.tar.gz
def processText(text="", chunkSize=50, url='http://0.0.0.0:7777'):
    """Send newline-separated sentences to the parser server in chunks.

    ``text`` is split on newlines, grouped into chunks of ``chunkSize``
    lines, and each non-blank chunk is POSTed to ``url`` as ``text/plain``.
    The JSON list returned for each chunk is appended to one combined list.
    Progress (``<offset> / <total>``) is printed before every chunk.

    Args:
        text: Pre-tokenized text, one sentence per line.
        chunkSize: Number of lines sent per HTTP request.
        url: Address of the parser server; defaults to the previously
            hard-coded local address.

    Returns:
        A list with the concatenated items of every chunk's JSON response.

    Raises:
        TypeError: If a response decodes to something other than a list.
        urllib.error.URLError: If the server cannot be reached.
    """
    result = []
    sentences = text.split("\n")
    total = len(sentences)
    for i in range(0, total, chunkSize):
        print(i, "/", total)
        chunk = "\n".join(sentences[i:i + chunkSize])
        # Chunks made only of spaces and newlines are not sent to the server.
        if not chunk.replace("\n", "").replace(" ", ""):
            continue
        request = Request(url, chunk.encode(), headers={"Content-Type": "text/plain"})
        # The context manager closes the HTTP response even if reading or
        # decoding fails; the original left it open.
        with urlopen(request, timeout=240) as response:
            parsed = json.loads(response.read().decode())
        # The original ``result + tmp`` raised TypeError for non-list
        # payloads; keep that failure explicit instead of letting
        # ``extend`` silently iterate over e.g. a dict's keys.
        if not isinstance(parsed, list):
            raise TypeError(
                "expected a JSON list from the parser server, got "
                + type(parsed).__name__
            )
        # Extend in place rather than rebuilding the list for every chunk.
        result.extend(parsed)
    return result
将请求的结果保存:
# Parse one document and store the result on disk as 4-space-indented JSON.
jsonResult = processText(docs[key])
with open(outFilename, 'w') as outfile:
    outfile.write(json.dumps(jsonResult, indent=4))
# Report which output file was written.
print("CREATED:" + outFilename)
处理结果示例:
[
[
{
"id": 1,
"form": "A",
"upostag": "DET",
"xpostag": "DT",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 3,
"deprel": "det"
},
{
"id": 2,
"form": "Christmas",
"upostag": "PROPN",
"xpostag": "NNP",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 3,
"deprel": "compound"
},
{
"id": 3,
"form": "Carol",
"upostag": "PROPN",
"xpostag": "NNP",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 7,
"deprel": "nsubj"
},
{
"id": 4,
"form": "is",
"upostag": "VERB",
"xpostag": "VBZ",
"feats": {
"Mood": "Ind",
"fPOS": "VERB++VBZ",
"Number": "Sing",
"Person": "3",
"Tense": "Pres",
"VerbForm": "Fin"
},
"head": 7,
"deprel": "cop"
},
{
"id": 5,
"form": "a",
"upostag": "DET",
"xpostag": "DT",
"feats": {
"Definite": "Ind",
"fPOS": "DET++DT",
"PronType": "Art"
},
"head": 7,
"deprel": "det"
},
{
"id": 6,
"form": "short",
"upostag": "ADJ",
"xpostag": "JJ",
"feats": {
"fPOS": "ADJ++JJ",
"Degree": "Pos"
},
"head": 7,
"deprel": "amod"
},
{
"id": 7,
"form": "novel",
"upostag": "NOUN",
"xpostag": "NN",
"feats": {
"fPOS": "NOUN++NN",
"Number": "Sing"
},
"head": 0,
"deprel": "ROOT"
},
{
"id": 8,
"form": "by",
"upostag": "ADP",
"xpostag": "IN",
"feats": {
"fPOS": "ADP++IN"
},
"head": 10,
"deprel": "case"
},
{
"id": 9,
"form": "Charles",
"upostag": "PROPN",
"xpostag": "NNP",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 10,
"deprel": "name"
},
{
"id": 10,
"form": "Dickens",
"upostag": "PROPN",
"xpostag": "NNP",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 7,
"deprel": "nmod"
},
{
"id": 11,
"form": ".",
"upostag": "PUNCT",
"xpostag": ".",
"feats": {
"fPOS": "PUNCT++."
},
"head": 7,
"deprel": "punct"
}
],
[
{
"id": 1,
"form": "It",
"upostag": "PRON",
"xpostag": "PRP",
"feats": {
"Case": "Nom",
"PronType": "Prs",
"Gender": "Neut",
"fPOS": "PRON++PRP",
"Number": "Sing",
"Person": "3"
},
"head": 4,
"deprel": "nsubjpass"
},
{
"id": 2,
"form": "was",
"upostag": "AUX",
"xpostag": "VBD",
"feats": {
"Mood": "Ind",
"fPOS": "AUX++VBD",
"Number": "Sing",
"Person": "3",
"Tense": "Past",
"VerbForm": "Fin"
},
"head": 4,
"deprel": "auxpass"
},
{
"id": 3,
"form": "first",
"upostag": "ADV",
"xpostag": "RB",
"feats": {
"fPOS": "ADV++RB"
},
"head": 4,
"deprel": "advmod"
},
{
"id": 4,
"form": "published",
"upostag": "VERB",
"xpostag": "VBN",
"feats": {
"Tense": "Past",
"Voice": "Pass",
"fPOS": "VERB++VBN",
"VerbForm": "Part"
},
"head": 0,
"deprel": "ROOT"
},
{
"id": 5,
"form": "on",
"upostag": "ADP",
"xpostag": "IN",
"feats": {
"fPOS": "ADP++IN"
},
"head": 7,
"deprel": "case"
},
{
"id": 6,
"form": "19",
"upostag": "NUM",
"xpostag": "CD",
"feats": {
"fPOS": "NUM++CD",
"NumType": "Card"
},
"head": 7,
"deprel": "nummod"
},
{
"id": 7,
"form": "December",
"upostag": "PROPN",
"xpostag": "NNP",
"feats": {
"fPOS": "PROPN++NNP",
"Number": "Sing"
},
"head": 4,
"deprel": "nmod"
},
{
"id": 8,
"form": "1843",
"upostag": "NUM",
"xpostag": "CD",
"feats": {
"fPOS": "NUM++CD",
"NumType": "Card"
},
"head": 7,
"deprel": "nummod"
},
{
"id": 9,
"form": ".",
"upostag": "PUNCT",
"xpostag": ".",
"feats": {
"fPOS": "PUNCT++."
},
"head": 4,
"deprel": "punct"
}
]
]