
目录结构

<properties>
    <java.version>1.8</java.version>
    <elasticsearch.version>7.6.1</elasticsearch.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.80</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
</dependencies>
- package com.wzh.config;
-
- import org.apache.http.HttpHost;
- import org.elasticsearch.client.RestClient;
- import org.elasticsearch.client.RestHighLevelClient;
- import org.springframework.context.annotation.Bean;
- import org.springframework.context.annotation.Configuration;
-
- /**
- * @ProjectName: Elasticsearch
- * @Package: com.wzh.config
- * @ClassName: ElasticSearchConfig
- * @Author: 王振华
- * @Description:
- * @Date: 2022/8/16 18:21
- * @Version: 1.0
- */
- @Configuration
- public class ElasticSearchConfig {
- //该对象可以对我们的ES进行相关的操作
- @Bean
- public RestHighLevelClient restHighLevelClient(){
- RestHighLevelClient client = new RestHighLevelClient(
- RestClient.builder(new HttpHost("127.0.0.1",9200,"http")));
- return client;
- }
- }
package com.wzh.entity;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Plain data holder stored as a JSON document in the Elasticsearch test indices.
 * Lombok generates getters/setters/equals/hashCode/toString (@Data), a no-args
 * constructor, and an all-args constructor in field order (name, address, age).
 *
 * @author 王振华
 * @since 2022/8/16
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class User {
    // used as the search/highlight field in the query examples below
    private String name;
    private String address;
    private Integer age;

}
//@Autowired
// Inject with @Resource instead of @Autowired: @Resource is a JDK annotation
// and (per the original note) does not resolve the bean the way @Autowired's
// Spring container lookup does. Either way, the RestHighLevelClient bean
// declared in ElasticSearchConfig ends up injected here.
@Resource
public RestHighLevelClient client;
- //创建索引 PUT /索引名称
- @Test
- public void testCreateIndex() throws IOException {
- //该类把创建索引的信息都封装到该类中
- CreateIndexRequest createIndexRequest = new CreateIndexRequest("test-index");
-
- CreateIndexResponse createIndexResponse = client.indices().create(createIndexRequest, RequestOptions.DEFAULT);
- System.out.println(createIndexResponse.isAcknowledged());//查看是否创建成功
-
- client.close();
- }
- //判断索引是否存在
- @Test
- public void testIndexExists() throws IOException {
- GetIndexRequest getIndexRequest = new GetIndexRequest("test-index");
-
- boolean exists = client.indices().exists(getIndexRequest, RequestOptions.DEFAULT);
- System.out.println(exists);
- client.close();
- }
- //删除索引
- @Test
- public void testDeleteIndex() throws IOException {
- DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest("test-index");
-
- AcknowledgedResponse delete = client.indices().delete(deleteIndexRequest, RequestOptions.DEFAULT);
- System.out.println(delete.isAcknowledged());
- client.close();
- }
- //添加文档 PUT /索引/1 {name:"",age:"",address:""}
- @Test
- public void testInsertDoc() throws IOException {
- IndexRequest indexRequest = new IndexRequest("test-index");
- indexRequest.id("2"); //指定文档的id,不写则随机生成
- //指定文档的内容:String文档的json内容,XContentType xContentType:以什么格式
- indexRequest.source(JSON.toJSONString(new User("张三","郑州",22)), XContentType.JSON);
-
- IndexResponse indexResponse = client.index(indexRequest, RequestOptions.DEFAULT);
- System.out.println(indexResponse.getResult());
- client.close();
- }
- //判断文档是否存在
- @Test
- public void testDocExist() throws IOException {
- GetRequest getRequest = new GetRequest("test-index");
- getRequest.id("2");
- boolean exists = client.exists(getRequest, RequestOptions.DEFAULT);
- System.out.println(exists);
- }
- //根据id查询文档
- @Test
- public void testGetDoc() throws IOException {
- GetRequest getRequest = new GetRequest("test-index");
- getRequest.id("2");
- GetResponse response = client.get(getRequest, RequestOptions.DEFAULT);
-
- String source = response.getSourceAsString();//转为字符串
- User user = JSON.parseObject(source, User.class); //转为Java对象
- System.out.println(user.getAddress());
-
- Map
map = response.getSourceAsMap(); //转为map - System.out.println(map.get("address"));
- client.close();
- }
- //删除文档
- @Test
- public void testDeleteDoc() throws IOException {
- DeleteRequest deleteRequest = new DeleteRequest("test-index");
- deleteRequest.id("1");
- DeleteResponse delete = client.delete(deleteRequest, RequestOptions.DEFAULT);
- System.out.println(delete.getResult());
- client.close();
- }
参考:https://blog.csdn.net/beidaol/article/details/104491950
- //修改文档
- // POST /test01/_doc/2/_update
- //{
- // "doc":{
- // "name":"周星驰"
- // }
- //}
- @Test
- public void testUpdateDoc() throws IOException {
- UpdateRequest updateRequest = new UpdateRequest("test-index","2");
- User user = new User();
- user.setName("李四");
- //updateRequest.doc(JSON.toJSONString(user),XContentType.JSON);
- updateRequest.doc((JSONObject)JSONObject.toJSON(user));
- UpdateResponse update = client.update(updateRequest, RequestOptions.DEFAULT);
- System.out.println(update.getResult());
- client.close();
- }
- //批量添加文档
- @Test
- public void testBulkDoc() throws IOException {
- BulkRequest bulkRequest = new BulkRequest("test-index");
- List
list = new ArrayList<>(); - list.add(new User("张三","上海",21));
- list.add(new User("李四","浙江",22));
- list.add(new User("王五","深圳",23));
- list.add(new User("赵六","北京",24));
- list.add(new User("孙七","南京",25));
- /* for(User user:list){
- IndexRequest indexRequest=new IndexRequest();
- indexRequest.source(JSON.toJSONString(user),XContentType.JSON);
- bulkRequest.add(indexRequest);
- }*/
- //(JSONObject)JSONObject.toJSON(item) java对象转为json对象
- list.stream().forEach(item->bulkRequest.add(new IndexRequest().source((JSONObject)JSONObject.toJSON(item))));
- BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT);
- System.out.println(bulkResponse.hasFailures());
- client.close();
- }
- //复杂查询 -GET /索引/_search
- // {
- // "query":{
- // "":{}
- // },
- // "from":
- // "size":
- // "_source":["",""],
- // "sort":{}
-
- // }
- //1. 搜索请求对象SearchRequest
- //2. 构建一个条件对象SearchSourceBuilder
- //3. 把条件对象放入搜索请求对象中
- //4. 执行搜索功能
- @Test
- public void testSearch() throws IOException {
- // 1.创建查询请求对象
- SearchRequest searchRequest = new SearchRequest("test-index");
- // 2.创建一个条件对象
- SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
- // (1)查询条件 使用QueryBuilders工具类创建
- //匹配查询
- //MatchQueryBuilder matchQuery = QueryBuilders.matchQuery("name", "张 ");
- //精准查询
- TermQueryBuilder queryBuilder = QueryBuilders.termQuery("name", "张");
-
- // 条件投入
- sourceBuilder.query(queryBuilder);
- // (2)其他<可有可无>:(可以参考 SearchSourceBuilder 的 字段部分)
- //分页
- sourceBuilder.from(0);
- sourceBuilder.size(1);
-
- //排序
- sourceBuilder.sort("age", SortOrder.ASC);
-
- //高亮显示
- HighlightBuilder highlightBuilder = new HighlightBuilder();
- highlightBuilder.field("name");
- highlightBuilder.preTags("");
- highlightBuilder.postTags("");
- sourceBuilder.highlighter(highlightBuilder);
-
-
- //3. 把条件对象放入搜索请求对象中
- searchRequest.source(sourceBuilder);
- //4. 执行搜索功能
- SearchResponse search = client.search(searchRequest, RequestOptions.DEFAULT);
- System.out.println("总条数:"+search.getHits().getTotalHits().value);
- SearchHit[] hits = search.getHits().getHits();
- Arrays.stream(hits).forEach(item-> System.out.println(item.getSourceAsString()));
- Arrays.stream(hits).forEach(item-> System.out.println(item.getHighlightFields()));
- client.close();
- }
多条件查找
- //多条件查询
- @Test
- public void testSearch02() throws IOException {
- SearchRequest searchRequest = new SearchRequest("qy151-index");
- //创建一个条件对象
- SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
- BoolQueryBuilder queryBuilder = QueryBuilders.boolQuery().must(QueryBuilders.matchQuery("name", "李 "))
- .should(QueryBuilders.termQuery("name", "三"));
- sourceBuilder.query(queryBuilder);
- //把条件对象放入搜索请求对象中
- searchRequest.source(sourceBuilder);
- SearchResponse search = client.search(searchRequest, RequestOptions.DEFAULT);
- SearchHit[] hits = search.getHits().getHits();
- Arrays.stream(hits).forEach(item-> System.out.println(item.getSourceAsMap()));
-
- client.close();
- }
ElasticSearch 几种常用分词器如下:
| 分词器 | 分词方式 |
| StandardAnalyzer | 单字分词 |
| CJKAnalyzer | 二分法 |
| IKAnalyzer | 词库分词 |
分词∶即把一段中文或者别的划分成一个个的关键字,我们在搜索时候会把自己的信息进行分词,会把数据库中或者索引库中的数据进行分词,然后进行一个匹配操作,默认的中文分词是将每个字看成一个词,比如“我爱中国"会被分为"我""爱""中""国”,这显然是不符合要求的,所以我们需要安装中文分词器ik来解决这个问题。
IK提供了两个分词算法:ik_smart和ik_max_word,其中ik_smart为最少切分,ik_max_word为最细粒度划分!
ik_max_word: 会将文本做最细粒度的拆分,比如会将"中华人民共和国国歌"拆分为"中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌",会穷尽各种可能的组合;
ik_smart: 会做最粗粒度的拆分,比如会将"中华人民共和国国歌"拆分为"中华人民共和国,国歌"。
注意:IK分词器插件的版本要和ElasticSearch的版本一致

下载完后,解压安装包到 ElasticSearch 所在文件夹中的plugins目录中:

再启动ElasticSearch,查看IK分词器插件是否安装成功:

安装成功!
1、启动Kibana

2、访问请求:http://localhost:5601/

3、选择开发工具Dev Tools,点击控制台

4、在控制台编写分词请求,进行测试
IK提供了两个分词算法:ik_smart和ik_max_word,其中ik_smart为最少切分,ik_max_word为最细粒度划分!
测试 ik_smart 分词算法,最少切分:

测试 ik_max_word 分词算法,最细粒度划分:
分词请求:
- GET _analyze
- {
- "analyzer": "ik_max_word",
- "text": "我爱中国共产党"
- }
分词结果:
- {
- "tokens" : [
- {
- "token" : "我",
- "start_offset" : 0,
- "end_offset" : 1,
- "type" : "CN_CHAR",
- "position" : 0
- },
- {
- "token" : "爱",
- "start_offset" : 1,
- "end_offset" : 2,
- "type" : "CN_CHAR",
- "position" : 1
- },
- {
- "token" : "中国共产党",
- "start_offset" : 2,
- "end_offset" : 7,
- "type" : "CN_WORD",
- "position" : 2
- },
- {
- "token" : "中国",
- "start_offset" : 2,
- "end_offset" : 4,
- "type" : "CN_WORD",
- "position" : 3
- },
- {
- "token" : "国共",
- "start_offset" : 3,
- "end_offset" : 5,
- "type" : "CN_WORD",
- "position" : 4
- },
- {
- "token" : "共产党",
- "start_offset" : 4,
- "end_offset" : 7,
- "type" : "CN_WORD",
- "position" : 5
- },
- {
- "token" : "共产",
- "start_offset" : 4,
- "end_offset" : 6,
- "type" : "CN_WORD",
- "position" : 6
- },
- {
- "token" : "党",
- "start_offset" : 6,
- "end_offset" : 7,
- "type" : "CN_CHAR",
- "position" : 7
- }
- ]
- }
比较两个分词算法对同一句中文的分词结果,ik_max_word比ik_smart得到的中文词更多(从两者的英文名含义就可看出来),但这样也带来一个问题,使用ik_max_word会占用更多的存储空间。
我们用分词器对 “万里顾一程” 进行分词:先使用 ik_smart 分词算法

再使用 ik_max_word 分词算法,进行最细粒度的划分:
- GET _analyze
- {
- "analyzer": "ik_max_word",
- "text": "万里顾一程"
- }
分词结果:
- {
- "tokens" : [
- {
- "token" : "万里",
- "start_offset" : 0,
- "end_offset" : 2,
- "type" : "CN_WORD",
- "position" : 0
- },
- {
- "token" : "万",
- "start_offset" : 0,
- "end_offset" : 1,
- "type" : "TYPE_CNUM",
- "position" : 1
- },
- {
- "token" : "里",
- "start_offset" : 1,
- "end_offset" : 2,
- "type" : "COUNT",
- "position" : 2
- },
- {
- "token" : "顾",
- "start_offset" : 2,
- "end_offset" : 3,
- "type" : "CN_CHAR",
- "position" : 3
- },
- {
- "token" : "一程",
- "start_offset" : 3,
- "end_offset" : 5,
- "type" : "CN_WORD",
- "position" : 4
- },
- {
- "token" : "一",
- "start_offset" : 3,
- "end_offset" : 4,
- "type" : "TYPE_CNUM",
- "position" : 5
- },
- {
- "token" : "程",
- "start_offset" : 4,
- "end_offset" : 5,
- "type" : "CN_CHAR",
- "position" : 6
- }
- ]
- }
使用上面两种分词算法后,发现 “万里顾一程”被分成了“万里”、“顾”、“一程”,这是因为在IK自带的字典中没有“顾一程”这个词,如果想得到“顾一程”这个词,怎么办呢?
这就要配置自己的扩展字典了,就是在IK分词器字典中加入我们自定义的字典,在词典中加入想要的词。
在ik分词器文件的config目录中新建自定义的字典文件,以.dic为后缀,并在文件中加入“顾一程”:

然后打开 IKAnalyzer.cfg.xml 文件,把自定义的字典添加到IK的字典中:
重启ES和Kibana,再用分词器对 “万里顾一程” 进行分词:此时“顾一程”就是一个词了。


jsoup是一款Java的html解析工具,主要是对html和xml文件进行解析
在写爬虫的时候,当我们用HttpClient之类的框架,得到目标网页的源码后,需要从网页源码中取得我们想要的内容。就可以使用jsoup轻松获取想要的内容。
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
package com.wzh.entity;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Product scraped from a shopping page (via jsoup) and indexed into
 * Elasticsearch. Lombok generates getters/setters/equals/hashCode/toString
 * (@Data), a no-args constructor, and an all-args constructor in field
 * order (title, price, imgUrl).
 *
 * @author 王振华
 * @since 2022/8/16
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Product {
    // product title text
    private String title;

    // price kept as the raw page string — not parsed to a number here
    private String price;

    // URL of the product image
    private String imgUrl;
}


这里没有ES配置文件,因为有默认值
