Coverage Report

Created: 2026-04-14 20:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/index/inverted/analyzer/analyzer.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/index/inverted/analyzer/analyzer.h"
19
20
#include <glog/logging.h>
21
22
#include "CLucene.h"
23
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
24
25
#ifdef __clang__
26
#pragma clang diagnostic push
27
#pragma clang diagnostic ignored "-Wshadow-field"
28
#endif
29
// clang-format off
30
#include "common/compile_check_avoid_begin.h"
31
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
32
#include "common/compile_check_avoid_end.h"
33
// clang-format on
34
#ifdef __clang__
35
#pragma clang diagnostic pop
36
#endif
37
#include "runtime/exec_env.h"
38
#include "runtime/index_policy/index_policy_mgr.h"
39
#include "storage/index/inverted/analyzer/basic/basic_analyzer.h"
40
#include "storage/index/inverted/analyzer/icu/icu_analyzer.h"
41
#include "storage/index/inverted/analyzer/ik/IKAnalyzer.h"
42
#include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h"
43
44
namespace doris::segment_v2::inverted_index {
45
46
2.00k
// Creates the reader that supplies text to an analyzer.
//
// A plain SStringReader<char> is always constructed. It is wrapped in a
// CharReplaceCharFilter only when `char_filter_map` declares the char_replace
// filter type and also carries both a pattern entry and a replacement entry;
// in every other case the plain reader is returned unchanged.
ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) {
    ReaderPtr base_reader = std::make_shared<lucene::util::SStringReader<char>>();
    if (char_filter_map.empty()) {
        return base_reader;
    }

    const auto not_found = char_filter_map.end();

    // Only the char_replace filter type is handled here.
    const auto type_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
    if (type_entry == not_found ||
        !(type_entry->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE)) {
        return base_reader;
    }

    // Both the pattern and the replacement must be configured to build the filter.
    const auto pattern_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
    const auto replacement_entry =
            char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
    if (pattern_entry == not_found || replacement_entry == not_found) {
        return base_reader;
    }

    return std::make_shared<CharReplaceCharFilter>(base_reader, pattern_entry->second,
                                                   replacement_entry->second);
}
63
64
20
bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
65
20
    return analyzer_name == INVERTED_INDEX_PARSER_NONE ||
66
20
           analyzer_name == INVERTED_INDEX_PARSER_STANDARD ||
67
20
           analyzer_name == INVERTED_INDEX_PARSER_UNICODE ||
68
20
           analyzer_name == INVERTED_INDEX_PARSER_ENGLISH ||
69
20
           analyzer_name == INVERTED_INDEX_PARSER_CHINESE ||
70
20
           analyzer_name == INVERTED_INDEX_PARSER_ICU ||
71
20
           analyzer_name == INVERTED_INDEX_PARSER_BASIC ||
72
20
           analyzer_name == INVERTED_INDEX_PARSER_IK;
73
20
}
74
75
// Constructs one of the builtin analyzers and applies the common options.
//
// parser_type  selects the analyzer implementation; any type without a
//              dedicated branch falls back to SimpleAnalyzer<char>.
// parser_mode  only consulted for the chinese analyzer (coarse granularity vs.
//              everything else) and the IK analyzer (smart vs. everything else).
// lower_case   the "true"/"false" parser constants set lowercase explicitly;
//              any other value leaves the analyzer's lowercase setting untouched.
// stop_words   the literal "none" clears the stop word set; every other value
//              installs the standard95 stop word set.
AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
                                                           const std::string& parser_mode,
                                                           const std::string& lower_case,
                                                           const std::string& stop_words) {
    using BaseAnalyzerPtr = std::shared_ptr<lucene::analysis::Analyzer>;

    // Picks and configures the concrete analyzer for the requested parser type.
    const auto build_for_type = [&]() -> BaseAnalyzerPtr {
        if (parser_type == InvertedIndexParserType::PARSER_STANDARD ||
            parser_type == InvertedIndexParserType::PARSER_UNICODE) {
            return std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
        }
        if (parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
            return std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
        }
        if (parser_type == InvertedIndexParserType::PARSER_CHINESE) {
            auto chinese =
                    std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
            chinese->initDict(config::inverted_index_dict_path);
            chinese->setMode(parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY
                                     ? lucene::analysis::AnalyzerMode::Default
                                     : lucene::analysis::AnalyzerMode::All);
            return chinese;
        }
        if (parser_type == InvertedIndexParserType::PARSER_ICU) {
            // initDict is invoked through the base Analyzer pointer.
            BaseAnalyzerPtr icu = std::make_shared<ICUAnalyzer>();
            icu->initDict(config::inverted_index_dict_path + "/icu");
            return icu;
        }
        if (parser_type == InvertedIndexParserType::PARSER_BASIC) {
            return std::make_shared<BasicAnalyzer>();
        }
        if (parser_type == InvertedIndexParserType::PARSER_IK) {
            auto ik = std::make_shared<IKAnalyzer>();
            ik->initDict(config::inverted_index_dict_path + "/ik");
            if (parser_mode == INVERTED_INDEX_PARSER_SMART) {
                ik->setMode(true);
            } else {
                ik->setMode(false);
            }
            return ik;
        }
        // default
        return std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
    };

    BaseAnalyzerPtr analyzer = build_for_type();

    // Lowercase is only touched when the option is explicitly "true" or "false".
    const bool wants_lowercase = (lower_case == INVERTED_INDEX_PARSER_TRUE);
    if (wants_lowercase || lower_case == INVERTED_INDEX_PARSER_FALSE) {
        analyzer->set_lowercase(wants_lowercase);
    }

    // Stop words: standard95 set unless explicitly disabled with "none".
    if (stop_words != "none") {
        analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
    } else {
        analyzer->set_stopwords(nullptr);
    }

    return analyzer;
}
131
132
1.56k
// Resolves the analyzer described by `config`.
//
// - Empty analyzer name: a builtin analyzer chosen by config->parser_type
//   (the common case when the user does not specify USING ANALYZER).
// - Builtin analyzer name (english, chinese, standard, ...): a builtin analyzer
//   whose parser type is derived from that name.
// - Anything else: treated as a custom analyzer and looked up by name in the
//   index policy manager.
//
// Throws Exception(INVERTED_INDEX_ANALYZER_ERROR) when a custom analyzer is
// requested but the index policy manager is not available.
AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) {
    DCHECK(config != nullptr);
    const std::string& requested_name = config->analyzer_name;

    const bool use_configured_type = requested_name.empty();
    if (use_configured_type || is_builtin_analyzer(requested_name)) {
        const InvertedIndexParserType builtin_type =
                use_configured_type ? config->parser_type
                                    : get_inverted_index_parser_type_from_string(requested_name);
        return create_builtin_analyzer(builtin_type, config->parser_mode, config->lower_case,
                                       config->stop_words);
    }

    // Custom analyzer - look up in policy manager
    auto* policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
    if (policy_mgr == nullptr) {
        throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
                        "Index policy manager is not initialized");
    }
    return policy_mgr->get_policy_by_name(requested_name);
}
160
161
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
162
169
        ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
163
169
    std::vector<TermInfo> analyse_result;
164
165
169
    std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
166
167
169
    lucene::analysis::Token token;
168
169
    int32_t position = 0;
169
808
    while (token_stream->next(&token)) {
170
639
        if (token.termLength<char>() != 0) {
171
639
            TermInfo t;
172
639
            t.term = std::string(token.termBuffer<char>(), token.termLength<char>());
173
639
            position += token.getPositionIncrement();
174
639
            t.position = position;
175
639
            analyse_result.emplace_back(std::move(t));
176
639
        }
177
639
    }
178
179
169
    if (token_stream != nullptr) {
180
169
        token_stream->close();
181
169
    }
182
183
169
    return analyse_result;
184
169
}
185
186
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
187
36
        const std::string& search_str, const std::map<std::string, std::string>& properties) {
188
36
    if (!should_analyzer(properties)) {
189
        // Keyword index: all strings (including empty) are valid tokens for exact match.
190
        // Empty string is a valid value in keyword index and should be matchable.
191
0
        std::vector<TermInfo> result;
192
0
        result.emplace_back(search_str);
193
0
        return result;
194
0
    }
195
36
    InvertedIndexAnalyzerConfig config;
196
36
    config.analyzer_name = get_analyzer_name_from_properties(properties);
197
36
    config.parser_type = get_inverted_index_parser_type_from_string(
198
36
            get_parser_string_from_properties(properties));
199
36
    config.parser_mode = get_parser_mode_string_from_properties(properties);
200
36
    config.lower_case = get_parser_lowercase_from_properties(properties);
201
36
    config.stop_words = get_parser_stopwords_from_properties(properties);
202
36
    config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
203
36
    auto analyzer = create_analyzer(&config);
204
36
    auto reader = create_reader(config.char_filter_map);
205
36
    reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
206
36
    return get_analyse_result(reader, analyzer.get());
207
36
}
208
209
4.88k
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
210
4.88k
    auto parser_type = get_inverted_index_parser_type_from_string(
211
4.88k
            get_parser_string_from_properties(properties));
212
4.88k
    auto analyzer_name = get_analyzer_name_from_properties(properties);
213
4.88k
    if (!analyzer_name.empty()) {
214
0
        return true;
215
0
    }
216
4.88k
    if (parser_type != InvertedIndexParserType::PARSER_UNKNOWN &&
217
4.88k
        parser_type != InvertedIndexParserType::PARSER_NONE) {
218
3.19k
        return true;
219
3.19k
    }
220
1.69k
    return false;
221
4.88k
}
222
223
} // namespace doris::segment_v2::inverted_index