参考资料
https://www.elastic.co/guide/cn/elasticsearch/guide/current/configuring-analyzers.html
https://www.elastic.co/guide/cn/elasticsearch/guide/current/analysis-intro.html
https://www.elastic.co/guide/cn/elasticsearch/guide/current/custom-analyzers.html
https://github.com/medcl/elasticsearch-analysis-ik
https://www.elastic.co/guide/cn/elasticsearch/guide/current/multi-word-synonyms.html
简单的索引与默认分词研究
创建索引
PUT {host}/test
{
"settings":{
"number_of_shards" : 5,
"number_of_replicas" : 1
},
"mappings": {
"test": {
"properties": {
"name": {"type":"string"},
"age": {"type":"integer"}
}
}
}
}
分词研究
POST {host}/test/_analyze
输入内容
i love you 我爱你中国
输出结果
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "love",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "you",
"start_offset": 7,
"end_offset": 10,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "我",
"start_offset": 11,
"end_offset": 12,
"type": "<IDEOGRAPHIC>",
"position": 3
},
{
"token": "爱",
"start_offset": 12,
"end_offset": 13,
"type": "<IDEOGRAPHIC>",
"position": 4
},
{
"token": "你",
"start_offset": 13,
"end_offset": 14,
"type": "<IDEOGRAPHIC>",
"position": 5
},
{
"token": "中",
"start_offset": 14,
"end_offset": 15,
"type": "<IDEOGRAPHIC>",
"position": 6
},
{
"token": "国",
"start_offset": 15,
"end_offset": 16,
"type": "<IDEOGRAPHIC>",
"position": 7
}
]
}
高级方式与中文分词器第三方插件
创建索引
PUT {host}/test
注意:如果前面创建的 test 索引仍然存在,需要先执行 DELETE {host}/test 将其删除,否则创建同名索引会报错。
{
"settings":{
"number_of_shards" : 1,
"number_of_replicas" : 0,
"analysis": {
"analyzer": {
"default": {
"type": "ik"
}
}
}
},
"mappings": {
"test": {
"properties": {
"name": {"type":"string"},
"age": {"type":"integer"}
}
}
}
}
分词研究
POST {host}/test/_analyze
输入内容
i love you 我爱你中国
输出结果
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "ENGLISH",
"position": 0
},
{
"token": "love",
"start_offset": 2,
"end_offset": 6,
"type": "ENGLISH",
"position": 1
},
{
"token": "you",
"start_offset": 7,
"end_offset": 10,
"type": "ENGLISH",
"position": 2
},
{
"token": "我爱你",
"start_offset": 11,
"end_offset": 14,
"type": "CN_WORD",
"position": 3
},
{
"token": "爱你",
"start_offset": 12,
"end_offset": 14,
"type": "CN_WORD",
"position": 4
},
{
"token": "中国",
"start_offset": 14,
"end_offset": 16,
"type": "CN_WORD",
"position": 5
}
]
}
同义词设置
创建索引
PUT {host}/test
注意:如果前面创建的 test 索引仍然存在,需要先执行 DELETE {host}/test 将其删除,否则创建同名索引会报错。
{
"settings":{
"number_of_shards" : 1,
"number_of_replicas" : 0,
"analysis": {
"analyzer": {
"default": {
"tokenizer": "ik",
"filter": ["synonym_filter"]
}
},
"filter": {
"synonym_filter":{
"type": "synonym",
"synonyms": ["西红柿,番茄,红果"]
}
}
}
},
"mappings": {
"test": {
"properties": {
"name": {"type":"string"},
"age": {"type":"integer"}
}
}
}
}
分词方式测试
POST {host}/test/_analyze
输入内容
西红柿电影
输出词条
{
"tokens": [
{
"token": "西红柿",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "番茄",
"start_offset": 0,
"end_offset": 3,
"type": "SYNONYM",
"position": 0
},
{
"token": "红果",
"start_offset": 0,
"end_offset": 3,
"type": "SYNONYM",
"position": 0
},
{
"token": "电影",
"start_offset": 3,
"end_offset": 5,
"type": "CN_WORD",
"position": 1
}
]
}
添加文档真实测试
添加文档
POST {host}/test/test
番茄
{
"name": "番茄",
"age": 1
}
西红柿
{
"name": "西红柿",
"age": 1
}
查询
GET {host}/test/test/_search
{
"query":{
"match":{"name":{"query": "红果"}}
}
}