be/src/storage/index/inverted/analyzer/analyzer.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/index/inverted/analyzer/analyzer.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | |
22 | | #include "CLucene.h" |
23 | | #include "CLucene/analysis/LanguageBasedAnalyzer.h" |
24 | | |
25 | | #ifdef __clang__ |
26 | | #pragma clang diagnostic push |
27 | | #pragma clang diagnostic ignored "-Wshadow-field" |
28 | | #endif |
29 | | // clang-format off |
30 | | #include "common/compile_check_avoid_begin.h" |
31 | | #include "CLucene/analysis/standard95/StandardAnalyzer.h" |
32 | | #include "common/compile_check_avoid_end.h" |
33 | | // clang-format on |
34 | | #ifdef __clang__ |
35 | | #pragma clang diagnostic pop |
36 | | #endif |
37 | | #include "runtime/exec_env.h" |
38 | | #include "runtime/index_policy/index_policy_mgr.h" |
39 | | #include "storage/index/inverted/analyzer/basic/basic_analyzer.h" |
40 | | #include "storage/index/inverted/analyzer/icu/icu_analyzer.h" |
41 | | #include "storage/index/inverted/analyzer/ik/IKAnalyzer.h" |
42 | | #include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h" |
43 | | |
44 | | namespace doris::segment_v2::inverted_index { |
45 | | #include "common/compile_check_begin.h" |
46 | | |
47 | 2.00k | ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) { |
48 | 2.00k | ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>(); |
49 | 2.00k | if (!char_filter_map.empty()) { |
50 | 660 | auto it_type = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE); |
51 | 660 | if (it_type != char_filter_map.end() && |
52 | 660 | it_type->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { |
53 | 660 | auto it_pattern = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN); |
54 | 660 | auto it_replacement = |
55 | 660 | char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT); |
56 | 660 | if (it_pattern != char_filter_map.end() && it_replacement != char_filter_map.end()) { |
57 | 660 | reader = std::make_shared<CharReplaceCharFilter>(reader, it_pattern->second, |
58 | 660 | it_replacement->second); |
59 | 660 | } |
60 | 660 | } |
61 | 660 | } |
62 | 2.00k | return reader; |
63 | 2.00k | } |
64 | | |
65 | 20 | bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) { |
66 | 20 | return analyzer_name == INVERTED_INDEX_PARSER_NONE || |
67 | 20 | analyzer_name == INVERTED_INDEX_PARSER_STANDARD || |
68 | 20 | analyzer_name == INVERTED_INDEX_PARSER_UNICODE || |
69 | 20 | analyzer_name == INVERTED_INDEX_PARSER_ENGLISH || |
70 | 20 | analyzer_name == INVERTED_INDEX_PARSER_CHINESE || |
71 | 20 | analyzer_name == INVERTED_INDEX_PARSER_ICU || |
72 | 20 | analyzer_name == INVERTED_INDEX_PARSER_BASIC || |
73 | 20 | analyzer_name == INVERTED_INDEX_PARSER_IK; |
74 | 20 | } |
75 | | |
76 | | AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type, |
77 | | const std::string& parser_mode, |
78 | | const std::string& lower_case, |
79 | 1.57k | const std::string& stop_words) { |
80 | 1.57k | std::shared_ptr<lucene::analysis::Analyzer> analyzer; |
81 | | |
82 | 1.57k | if (parser_type == InvertedIndexParserType::PARSER_STANDARD || |
83 | 1.57k | parser_type == InvertedIndexParserType::PARSER_UNICODE) { |
84 | 479 | analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>(); |
85 | 1.09k | } else if (parser_type == InvertedIndexParserType::PARSER_ENGLISH) { |
86 | 425 | analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); |
87 | 673 | } else if (parser_type == InvertedIndexParserType::PARSER_CHINESE) { |
88 | 661 | auto chinese_analyzer = |
89 | 661 | std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); |
90 | 661 | chinese_analyzer->initDict(config::inverted_index_dict_path); |
91 | 661 | if (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) { |
92 | 329 | chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default); |
93 | 332 | } else { |
94 | 332 | chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); |
95 | 332 | } |
96 | 661 | analyzer = std::move(chinese_analyzer); |
97 | 661 | } else if (parser_type == InvertedIndexParserType::PARSER_ICU) { |
98 | 3 | analyzer = std::make_shared<ICUAnalyzer>(); |
99 | 3 | analyzer->initDict(config::inverted_index_dict_path + "/icu"); |
100 | 9 | } else if (parser_type == InvertedIndexParserType::PARSER_BASIC) { |
101 | 3 | analyzer = std::make_shared<BasicAnalyzer>(); |
102 | 6 | } else if (parser_type == InvertedIndexParserType::PARSER_IK) { |
103 | 4 | auto ik_analyzer = std::make_shared<IKAnalyzer>(); |
104 | 4 | ik_analyzer->initDict(config::inverted_index_dict_path + "/ik"); |
105 | 4 | if (parser_mode == INVERTED_INDEX_PARSER_SMART) { |
106 | 1 | ik_analyzer->setMode(true); |
107 | 3 | } else { |
108 | 3 | ik_analyzer->setMode(false); |
109 | 3 | } |
110 | 4 | analyzer = std::move(ik_analyzer); |
111 | 4 | } else { |
112 | | // default |
113 | 2 | analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); |
114 | 2 | } |
115 | | |
116 | | // set lowercase |
117 | 1.57k | if (lower_case == INVERTED_INDEX_PARSER_TRUE) { |
118 | 852 | analyzer->set_lowercase(true); |
119 | 852 | } else if (lower_case == INVERTED_INDEX_PARSER_FALSE) { |
120 | 658 | analyzer->set_lowercase(false); |
121 | 658 | } |
122 | | |
123 | | // set stop words |
124 | 1.57k | if (stop_words == "none") { |
125 | 3 | analyzer->set_stopwords(nullptr); |
126 | 1.57k | } else { |
127 | 1.57k | analyzer->set_stopwords(&lucene::analysis::standard95::stop_words); |
128 | 1.57k | } |
129 | | |
130 | 1.57k | return analyzer; |
131 | 1.57k | } |
132 | | |
133 | 1.56k | AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) { |
134 | 1.56k | DCHECK(config != nullptr); |
135 | 1.56k | const std::string& analyzer_name = config->analyzer_name; |
136 | | |
137 | | // Handle empty analyzer name - use builtin analyzer based on parser_type. |
138 | | // This is the common case when user does not specify USING ANALYZER. |
139 | 1.56k | if (analyzer_name.empty()) { |
140 | 1.55k | return create_builtin_analyzer(config->parser_type, config->parser_mode, config->lower_case, |
141 | 1.55k | config->stop_words); |
142 | 1.55k | } |
143 | | |
144 | | // Check if it's a builtin analyzer name (english, chinese, standard, etc.) |
145 | 9 | if (is_builtin_analyzer(analyzer_name)) { |
146 | 8 | InvertedIndexParserType parser_type = |
147 | 8 | get_inverted_index_parser_type_from_string(analyzer_name); |
148 | 8 | return create_builtin_analyzer(parser_type, config->parser_mode, config->lower_case, |
149 | 8 | config->stop_words); |
150 | 8 | } |
151 | | |
152 | | // Custom analyzer - look up in policy manager |
153 | 1 | auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr(); |
154 | 1 | if (!index_policy_mgr) { |
155 | 1 | throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR, |
156 | 1 | "Index policy manager is not initialized"); |
157 | 1 | } |
158 | | |
159 | 0 | return index_policy_mgr->get_policy_by_name(analyzer_name); |
160 | 1 | } |
161 | | |
162 | | std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result( |
163 | 169 | ReaderPtr reader, lucene::analysis::Analyzer* analyzer) { |
164 | 169 | std::vector<TermInfo> analyse_result; |
165 | | |
166 | 169 | std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader)); |
167 | | |
168 | 169 | lucene::analysis::Token token; |
169 | 169 | int32_t position = 0; |
170 | 808 | while (token_stream->next(&token)) { |
171 | 639 | if (token.termLength<char>() != 0) { |
172 | 639 | TermInfo t; |
173 | 639 | t.term = std::string(token.termBuffer<char>(), token.termLength<char>()); |
174 | 639 | position += token.getPositionIncrement(); |
175 | 639 | t.position = position; |
176 | 639 | analyse_result.emplace_back(std::move(t)); |
177 | 639 | } |
178 | 639 | } |
179 | | |
180 | 169 | if (token_stream != nullptr) { |
181 | 169 | token_stream->close(); |
182 | 169 | } |
183 | | |
184 | 169 | return analyse_result; |
185 | 169 | } |
186 | | |
187 | | std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result( |
188 | 36 | const std::string& search_str, const std::map<std::string, std::string>& properties) { |
189 | 36 | if (!should_analyzer(properties)) { |
190 | | // Keyword index: all strings (including empty) are valid tokens for exact match. |
191 | | // Empty string is a valid value in keyword index and should be matchable. |
192 | 0 | std::vector<TermInfo> result; |
193 | 0 | result.emplace_back(search_str); |
194 | 0 | return result; |
195 | 0 | } |
196 | 36 | InvertedIndexAnalyzerConfig config; |
197 | 36 | config.analyzer_name = get_analyzer_name_from_properties(properties); |
198 | 36 | config.parser_type = get_inverted_index_parser_type_from_string( |
199 | 36 | get_parser_string_from_properties(properties)); |
200 | 36 | config.parser_mode = get_parser_mode_string_from_properties(properties); |
201 | 36 | config.lower_case = get_parser_lowercase_from_properties(properties); |
202 | 36 | config.stop_words = get_parser_stopwords_from_properties(properties); |
203 | 36 | config.char_filter_map = get_parser_char_filter_map_from_properties(properties); |
204 | 36 | auto analyzer = create_analyzer(&config); |
205 | 36 | auto reader = create_reader(config.char_filter_map); |
206 | 36 | reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true); |
207 | 36 | return get_analyse_result(reader, analyzer.get()); |
208 | 36 | } |
209 | | |
210 | 4.88k | bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) { |
211 | 4.88k | auto parser_type = get_inverted_index_parser_type_from_string( |
212 | 4.88k | get_parser_string_from_properties(properties)); |
213 | 4.88k | auto analyzer_name = get_analyzer_name_from_properties(properties); |
214 | 4.88k | if (!analyzer_name.empty()) { |
215 | 0 | return true; |
216 | 0 | } |
217 | 4.88k | if (parser_type != InvertedIndexParserType::PARSER_UNKNOWN && |
218 | 4.88k | parser_type != InvertedIndexParserType::PARSER_NONE) { |
219 | 3.19k | return true; |
220 | 3.19k | } |
221 | 1.69k | return false; |
222 | 4.88k | } |
223 | | |
224 | | } // namespace doris::segment_v2::inverted_index |
225 | | #include "common/compile_check_end.h" |