Coverage Report

Created: 2026-03-16 23:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/index/inverted/analyzer/analyzer.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/index/inverted/analyzer/analyzer.h"
19
20
#include <glog/logging.h>
21
22
#include "CLucene.h"
23
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
24
25
#ifdef __clang__
26
#pragma clang diagnostic push
27
#pragma clang diagnostic ignored "-Wshadow-field"
28
#endif
29
// clang-format off
30
#include "common/compile_check_avoid_begin.h"
31
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
32
#include "common/compile_check_avoid_end.h"
33
// clang-format on
34
#ifdef __clang__
35
#pragma clang diagnostic pop
36
#endif
37
#include "runtime/exec_env.h"
38
#include "runtime/index_policy/index_policy_mgr.h"
39
#include "storage/index/inverted/analyzer/basic/basic_analyzer.h"
40
#include "storage/index/inverted/analyzer/icu/icu_analyzer.h"
41
#include "storage/index/inverted/analyzer/ik/IKAnalyzer.h"
42
#include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h"
43
44
namespace doris::segment_v2::inverted_index {
45
#include "common/compile_check_begin.h"
46
47
2.00k
// Builds the reader that supplies text to an analyzer.
//
// A plain SStringReader<char> is always created first. It is wrapped in a
// CharReplaceCharFilter only when `char_filter_map` names the char_replace
// filter type AND carries both a pattern and a replacement entry; in every
// other case the unwrapped reader is returned.
ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) {
    ReaderPtr base_reader = std::make_shared<lucene::util::SStringReader<char>>();

    // No char filter configured at all.
    if (char_filter_map.empty()) {
        return base_reader;
    }

    const auto not_found = char_filter_map.end();

    // Only the char_replace filter type is handled here.
    const auto type_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
    const bool wants_char_replace = type_entry != not_found &&
                                    type_entry->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE;
    if (!wants_char_replace) {
        return base_reader;
    }

    // Both the pattern and the replacement must be present to build the filter.
    const auto pattern_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
    const auto replacement_entry =
            char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
    if (pattern_entry == not_found || replacement_entry == not_found) {
        return base_reader;
    }

    ReaderPtr filtered_reader = std::make_shared<CharReplaceCharFilter>(
            base_reader, pattern_entry->second, replacement_entry->second);
    return filtered_reader;
}
64
65
20
bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
66
20
    return analyzer_name == INVERTED_INDEX_PARSER_NONE ||
67
20
           analyzer_name == INVERTED_INDEX_PARSER_STANDARD ||
68
20
           analyzer_name == INVERTED_INDEX_PARSER_UNICODE ||
69
20
           analyzer_name == INVERTED_INDEX_PARSER_ENGLISH ||
70
20
           analyzer_name == INVERTED_INDEX_PARSER_CHINESE ||
71
20
           analyzer_name == INVERTED_INDEX_PARSER_ICU ||
72
20
           analyzer_name == INVERTED_INDEX_PARSER_BASIC ||
73
20
           analyzer_name == INVERTED_INDEX_PARSER_IK;
74
20
}
75
76
// Creates one of the builtin analyzers and applies the common options.
//
// parser_type  selects the analyzer implementation; any type not listed in
//              the switch falls back to SimpleAnalyzer<char>.
// parser_mode  only consulted for the chinese analyzer (coarse granularity vs.
//              everything else) and the IK analyzer (smart vs. everything else).
// lower_case   lowercase is set explicitly only when the value equals the
//              "true" or "false" parser constants; any other value leaves the
//              analyzer's own setting untouched.
// stop_words   the literal "none" clears the stop-word set; any other value
//              installs the standard95 stop-word set.
AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
                                                           const std::string& parser_mode,
                                                           const std::string& lower_case,
                                                           const std::string& stop_words) {
    std::shared_ptr<lucene::analysis::Analyzer> analyzer;

    switch (parser_type) {
    case InvertedIndexParserType::PARSER_STANDARD:
    case InvertedIndexParserType::PARSER_UNICODE:
        analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
        break;
    case InvertedIndexParserType::PARSER_ENGLISH:
        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
        break;
    case InvertedIndexParserType::PARSER_CHINESE: {
        auto chinese_analyzer =
                std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
        chinese_analyzer->initDict(config::inverted_index_dict_path);
        const bool coarse = (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
        chinese_analyzer->setMode(coarse ? lucene::analysis::AnalyzerMode::Default
                                         : lucene::analysis::AnalyzerMode::All);
        analyzer = std::move(chinese_analyzer);
        break;
    }
    case InvertedIndexParserType::PARSER_ICU:
        analyzer = std::make_shared<ICUAnalyzer>();
        // The dictionary is initialised through the base Analyzer pointer.
        analyzer->initDict(config::inverted_index_dict_path + "/icu");
        break;
    case InvertedIndexParserType::PARSER_BASIC:
        analyzer = std::make_shared<BasicAnalyzer>();
        break;
    case InvertedIndexParserType::PARSER_IK: {
        auto ik_analyzer = std::make_shared<IKAnalyzer>();
        ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
        const bool smart_mode = (parser_mode == INVERTED_INDEX_PARSER_SMART);
        ik_analyzer->setMode(smart_mode);
        analyzer = std::move(ik_analyzer);
        break;
    }
    default:
        // Unrecognised parser types use the simple analyzer.
        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
        break;
    }

    // Lowercase: touch the setting only for an explicit "true"/"false" value.
    if (lower_case == INVERTED_INDEX_PARSER_TRUE) {
        analyzer->set_lowercase(true);
    } else if (lower_case == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(false);
    }

    // Stop words: everything except the literal "none" gets the standard95 set.
    if (stop_words != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
    } else {
        analyzer->set_stopwords(nullptr);
    }

    return analyzer;
}
132
133
1.56k
// Resolves the analyzer described by `config`.
//
// Resolution order:
//   1. Empty analyzer_name  -> builtin analyzer chosen by config->parser_type
//      (the common case when no explicit analyzer is named).
//   2. Builtin analyzer name -> builtin analyzer whose parser type is derived
//      from the name itself.
//   3. Anything else        -> custom analyzer looked up by name in the index
//      policy manager.
//
// Throws Exception(INVERTED_INDEX_ANALYZER_ERROR) when a custom analyzer is
// requested but the index policy manager is not available.
// `config` must not be null (checked with DCHECK only).
AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) {
    DCHECK(config != nullptr);
    const std::string& requested_name = config->analyzer_name;

    const bool unnamed = requested_name.empty();
    if (unnamed || is_builtin_analyzer(requested_name)) {
        // Both builtin paths share every argument except the parser type.
        const InvertedIndexParserType builtin_type =
                unnamed ? config->parser_type
                        : get_inverted_index_parser_type_from_string(requested_name);
        return create_builtin_analyzer(builtin_type, config->parser_mode, config->lower_case,
                                       config->stop_words);
    }

    // Custom analyzer: resolved by name through the policy manager.
    auto* policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
    if (policy_mgr == nullptr) {
        throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
                        "Index policy manager is not initialized");
    }
    return policy_mgr->get_policy_by_name(requested_name);
}
161
162
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
163
169
        ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
164
169
    std::vector<TermInfo> analyse_result;
165
166
169
    std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
167
168
169
    lucene::analysis::Token token;
169
169
    int32_t position = 0;
170
808
    while (token_stream->next(&token)) {
171
639
        if (token.termLength<char>() != 0) {
172
639
            TermInfo t;
173
639
            t.term = std::string(token.termBuffer<char>(), token.termLength<char>());
174
639
            position += token.getPositionIncrement();
175
639
            t.position = position;
176
639
            analyse_result.emplace_back(std::move(t));
177
639
        }
178
639
    }
179
180
169
    if (token_stream != nullptr) {
181
169
        token_stream->close();
182
169
    }
183
184
169
    return analyse_result;
185
169
}
186
187
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
188
36
        const std::string& search_str, const std::map<std::string, std::string>& properties) {
189
36
    if (!should_analyzer(properties)) {
190
        // Keyword index: all strings (including empty) are valid tokens for exact match.
191
        // Empty string is a valid value in keyword index and should be matchable.
192
0
        std::vector<TermInfo> result;
193
0
        result.emplace_back(search_str);
194
0
        return result;
195
0
    }
196
36
    InvertedIndexAnalyzerConfig config;
197
36
    config.analyzer_name = get_analyzer_name_from_properties(properties);
198
36
    config.parser_type = get_inverted_index_parser_type_from_string(
199
36
            get_parser_string_from_properties(properties));
200
36
    config.parser_mode = get_parser_mode_string_from_properties(properties);
201
36
    config.lower_case = get_parser_lowercase_from_properties(properties);
202
36
    config.stop_words = get_parser_stopwords_from_properties(properties);
203
36
    config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
204
36
    auto analyzer = create_analyzer(&config);
205
36
    auto reader = create_reader(config.char_filter_map);
206
36
    reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
207
36
    return get_analyse_result(reader, analyzer.get());
208
36
}
209
210
4.88k
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
211
4.88k
    auto parser_type = get_inverted_index_parser_type_from_string(
212
4.88k
            get_parser_string_from_properties(properties));
213
4.88k
    auto analyzer_name = get_analyzer_name_from_properties(properties);
214
4.88k
    if (!analyzer_name.empty()) {
215
0
        return true;
216
0
    }
217
4.88k
    if (parser_type != InvertedIndexParserType::PARSER_UNKNOWN &&
218
4.88k
        parser_type != InvertedIndexParserType::PARSER_NONE) {
219
3.19k
        return true;
220
3.19k
    }
221
1.69k
    return false;
222
4.88k
}
223
224
} // namespace doris::segment_v2::inverted_index
225
#include "common/compile_check_end.h"