Tokenize the text. The standard analyzer splits on whitespace and punctuation and lowercases each token:
POST _analyze
{
"analyzer": "standard",
"text":"The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
The tokenization result is returned:
{
"tokens" : [
{
"token" : "the",
"start_offset" : 0,
"end_offset" : 3,
"type" : "" ,
"position" : 0
},
{
"token" : "2",
"start_offset" : 4,
"end_offset" : 5,
"type" : "" ,
"position" : 1
},
{
"token" : "quick",
"start_offset" : 6,
"end_offset" : 11,
"type" : "" ,
"position" : 2
},
{
"token" : "brown",
"start_offset" : 12,
"end_offset" : 17,
"type" : "" ,
"position" : 3
},
{
"token" : "foxes",
"start_offset" : 18,
"end_offset" : 23,
"type" : "" ,
"position" : 4
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "" ,
"position" : 5
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "" ,
"position" : 6
},
{
"token" : "the",
"start_offset" : 36,
"end_offset" : 39,
"type" : "" ,
"position" : 7
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "" ,
"position" : 8
},
{
"token" : "dog's",
"start_offset" : 45,
"end_offset" : 50,
"type" : "" ,
"position" : 9
},
{
"token" : "bone",
"start_offset" : 51,
"end_offset" : 55,
"type" : "" ,
"position" : 10
}
]
}
standard is the default analyzer; it cannot segment Chinese words.
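To see this, run the same Chinese sentence used later in this section through the standard analyzer (a minimal check, no plugin needed yet):
POST _analyze
{
  "analyzer": "standard",
  "text": "我是中国人"
}
Every Chinese character comes back as its own single-character token (我 / 是 / 中 / 国 / 人), so a dedicated Chinese analyzer such as IK is required for meaningful word segmentation.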
Download page: https://github.com/medcl/elasticsearch-analysis-ik/releases
Download elasticsearch-analysis-ik-7.15.0.zip, choosing the release whose version matches your elasticsearch version.
Create an ik folder under /usr/local/elasticsearch-7.15.0/plugins and copy the ik zip file into it.
# install the zip/unzip tools
yum install -y unzip zip
# unzip the archive
unzip elasticsearch-analysis-ik-7.15.0.zip
# rename the file
# mv elasticsearch-analysis-ik-7.15.0.zip ik
# delete the original archive
rm elasticsearch-analysis-ik-7.15.0.zip
Change to the bin folder; you can check the installation with the plugin list command (restart elasticsearch afterwards so the analyzers become available):
./elasticsearch-plugin list
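After restarting elasticsearch, the installation can also be verified from the REST API (a quick check using the standard _cat endpoint; the ik plugin should appear in the list):
GET _cat/plugins?v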
IK ships two analyzers: ik_smart and ik_max_word.
POST _analyze
{
"analyzer": "ik_smart",
"text":"我是中国人."
}
Returns the coarse-grained segmentation:
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "是",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中国人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2
}
]
}
POST _analyze
{
"analyzer": "ik_max_word",
"text":"我是中国人."
}
Returns the finest-grained segmentation (every possible word is emitted):
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "是",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "中国人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "中国",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "国人",
"start_offset" : 3,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 4
}
]
}
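In practice the two analyzers are usually combined in an index mapping: ik_max_word at index time so that every possible word is indexed, and ik_smart at search time for more precise queries. A minimal sketch; the index name my_index and the field content are made-up examples:
PUT my_index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      }
    }
  }
}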
To add custom words, IK supports remote dictionaries. Create an es folder under the nginx html directory (/usr/share/nginx/html/), add a file named fenci.txt inside it, and enter the custom Chinese terms one per line, e.g. 王小二 and 图灵云.
Edit the ik configuration file IKAnalyzer.cfg.xml in the /usr/local/elasticsearch-7.15.0/plugins/ik/config folder:
cd /usr/local/elasticsearch-7.15.0/plugins/ik/config
vi IKAnalyzer.cfg.xml
Set remote_ext_dict to the URL of the remote dictionary:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
  <comment>IK Analyzer 扩展配置</comment>
  <entry key="ext_dict"></entry>
  <entry key="ext_stopwords"></entry>
  <entry key="remote_ext_dict">http://192.168.3.220/es/fenci.txt</entry>
</properties>
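IK re-fetches the remote dictionary over HTTP (the plugin periodically checks the file's Last-Modified/ETag headers), so new terms in fenci.txt should take effect without restarting elasticsearch; only changes to IKAnalyzer.cfg.xml itself require a restart. A quick way to confirm a remote term was loaded:
POST _analyze
{
  "analyzer": "ik_smart",
  "text": "图灵云"
}
Once the dictionary is loaded, 图灵云 is returned as a single token instead of being split into individual characters.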
Alternatively, the config folder under the ik directory stores local dictionaries. Create a file there, e.g. mydict.dic, add 王小二 and 图灵云 to it (one term per line), and reference it via ext_dict; restart elasticsearch for a local dictionary to take effect.
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
  <comment>IK Analyzer 扩展配置</comment>
  <entry key="ext_dict">mydict.dic</entry>
  <entry key="ext_stopwords"></entry>
</properties>
POST _analyze
{
"analyzer": "ik_max_word",
"text":"王小二小时候在图灵云学习."
}
Segmentation result:
{
"tokens" : [
{
"token" : "王小二",
"start_offset" : 0,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "小时候",
"start_offset" : 3,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "在",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 2
},
{
"token" : "图灵云",
"start_offset" : 7,
"end_offset" : 10,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "学习",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 4
}
]
}