https://github.com/HIT-SCIR/ltp
安装

从项目的"详细说明"(quickstart 文档)中拷贝测试代码:
https://github.com/HIT-SCIR/ltp/blob/master/docs/quickstart.rst
# Quickstart example: construct LTP with default arguments and segment one sentence.
from ltp import LTP
ltp = LTP()
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
# Expected value of `segment`: [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]

报错了

稍微修改下代码,添加前面两行
import os
import sys

# Make the parent of this script's directory importable (presumably the LTP
# source checkout) so that `from ltp import LTP` resolves.
# os.path.join is used instead of '+' concatenation: when __file__ has no
# directory component, dirname() returns '' and '' + '/' + '..' would become
# '/..', i.e. the filesystem root instead of the parent of the current directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))

from ltp import LTP

ltp = LTP()
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
# Expected output: [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
print(segment)
执行过程中,提示缺少什么包就安装什么包即可,我的环境缺少的是下面这些:
# Install the missing packages, using the Tsinghua PyPI mirror as the index (-i).
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple packaging
# --user installs into the per-user site-packages directory.
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers --user
# numpy is pinned to version 1.17.3 here.
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple numpy==1.17.3
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pygtrie
执行成功,等待下载训练包,然后测试结果成功
Ignored unknown kwarg option direction
[['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
仅仅为了测试,增加的自定义字典是随便写的
import os
import sys

# Make the parent of this script's directory importable (presumably the LTP
# source checkout). os.path.join avoids the '' + '/' + '..' == '/..' pitfall
# that would point at the filesystem root when dirname(__file__) is empty.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))

from ltp import LTP

ltp = LTP()
# Custom words can also be added directly in code.
ltp.add_words(words=["叫汤姆去"], max_window=4)
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
print(segment)
其中 'user_dict.txt' 文件我放在了项目根目录下的 dict 目录中(即 dict/user_dict.txt):


文本记得用utf-8编码保存

import os
import sys

# Parent of this script's directory; used both as an import root and as the
# base for the dictionary path. os.path.join (instead of '+' concatenation)
# keeps this correct when dirname(__file__) is '' — concatenation would then
# produce '/..', i.e. the filesystem root.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append(root_path)

from ltp import LTP

ltp = LTP()
# user_dict.txt is the dictionary file; max_window is the maximum
# forward-matching segmentation window.
ltp.init_dict(path=os.path.join(root_path, "dict", "user_dict.txt"), max_window=4)
segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
print(segment)
读取 utf-8 编码的文本文件时,文件开头的第一段文本会出现乱码问题(可能是文件开头的 BOM 标记导致的)

代码有个地方(“.\ltp-master\ltp\algorithms\maximum_forward_matching.py”)稍微修改了下,修改之后就正常了

测试结果成功
Ignored unknown kwarg option direction
[['他', '叫汤姆去', '拿', '外衣', '。']]
主要是修改下面这一行(通过 path 参数指定要使用的模型):
ltp = LTP(path = "base")
模型下载后的位置大概在
"C:\Users\LYF\.cache\torch\ltp\8909177e47aa4daf900c569b86053ac68838d09da28c7bbeb42b8efcb08f56aa-edb9303f86310d4bcfd1ac0fa20a744c9a7e13ee515fe3cf88ad31921ed616b2-extracted\ltp.model"
import os
import sys
import time

# Parent of this script's directory, added to sys.path so `ltp` can be imported.
# os.path.join (instead of '+' concatenation) keeps this correct when
# dirname(__file__) is '' — concatenation would then produce '/..', the
# filesystem root.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append(root_path)

from ltp import LTP

ltp = LTP(path="base")

# Optional: load a user dictionary; user_dict.txt is the dictionary file and
# max_window is the maximum forward-matching segmentation window.
# ltp.init_dict(path=os.path.join(root_path, "dict", "user_dict.txt"), max_window=4)
# Optional: custom words can also be added in code.
# ltp.add_words(words=["\n"], max_window=4)
# ltp.add_words(words=["叫汤姆去"], max_window=4)

url = "tests/zrbzdz.txt"
t1 = time.time()
# 'utf-8-sig' drops a leading BOM if the file has one; the with-block
# guarantees the input file is closed.
with open(url, "r", encoding="utf-8-sig") as src:
    contents = src.read()
# The whole file is segmented as a single input text.
segment, _ = ltp.seg([contents])
output = "/ ".join(segment[0])
# print(segment)
# Elapsed time covers reading and segmentation, not writing the result.
t2 = time.time() - t1

# Path of the file that receives the segmented text (UTF-8, written as bytes).
with open("tests/output/1_LTP.txt", "wb") as LTP_f:
    LTP_f.write(output.encode("utf-8"))
print('time ' + str(t2))
我的数据经过测试,模型"base1"比"base"的效果要好,模型"base2"比"base1"的效果要好
import os
import sys
import time

# Parent of this script's directory, added to sys.path so `ltp` can be imported.
# os.path.join (instead of '+' concatenation) keeps this correct when
# dirname(__file__) is '' — concatenation would then produce '/..', the
# filesystem root.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append(root_path)

from ltp import LTP

ltp = LTP(path="base")

url = "tests/zrbzdz.txt"
batch_size = 2000  # number of input lines passed to ltp.seg() per call
t1 = time.time()
lines = []
output = []
# 'utf-8-sig' drops a leading BOM if the file has one.
with open(url, "r", encoding="utf-8-sig") as f:
    for line in f:
        lines.append(line.strip())
        if len(lines) == batch_size:
            output.extend(ltp.seg(lines)[0])
            lines = []
# Segment the final partial batch. Without this step the last
# (line count % batch_size) lines would be silently missing from the output.
if lines:
    output.extend(ltp.seg(lines)[0])

# Path of the file that receives the segmented text: one input line per
# output line, tokens joined by '/ '.
str1 = '/ '
with open("tests/output/1_LTP.txt", "w", encoding="utf-8-sig") as LTP_f:
    LTP_f.writelines(str1.join(out) + '\n' for out in output)
# Elapsed time covers reading, segmentation and writing.
tt = time.time() - t1
print('time ' + str(tt))
将model从默认路径拷贝到自定义路径
从

拷贝到自定义目录下,还要修改文件夹名称,跟脚本中的名称一样,将.json文件重命名为config.json

import os
import sys
import time

# Parent of this script's directory, added to sys.path so `ltp` can be imported.
# os.path.join (instead of '+' concatenation) keeps this correct when
# dirname(__file__) is '' — concatenation would then produce '/..', the
# filesystem root.
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append(root_path)

from ltp import LTP

# Load the model from a custom path instead of a model name.
ltp = LTP(path="tests/model/base2.model")

url = "tests/zrbzdz.txt"
batch_size = 2000  # number of input lines passed to ltp.seg() per call
t1 = time.time()
lines = []
output = []
# 'utf-8-sig' drops a leading BOM if the file has one.
with open(url, "r", encoding="utf-8-sig") as f:
    for line in f:
        lines.append(line.strip())
        if len(lines) == batch_size:
            output.extend(ltp.seg(lines)[0])
            lines = []
# Segment the final partial batch. Without this step the last
# (line count % batch_size) lines would be silently missing from the output.
if lines:
    output.extend(ltp.seg(lines)[0])

# Path of the file that receives the segmented text: one input line per
# output line, tokens joined by '/ '.
str1 = '/ '
with open("tests/output/base22_LTP.txt", "w", encoding="utf-8-sig") as LTP_f:
    LTP_f.writelines(str1.join(out) + '\n' for out in output)
# Elapsed time covers reading, segmentation and writing.
tt = time.time() - t1
print('time ' + str(tt))
# with open(url,"r",encoding='utf-8-sig') as f:
# lines=f.readlines()
# for line in lines:
# segment, _ = ltp.seg([line])
# output+="/ ".join(segment[0])+'\n'
# def split_l(l, n=8):
# return [l[i:i + n] for i in range(0, len(l), n)]
# for batch_data in split_l(lines, 32):