be/src/storage/index/inverted/analyzer/analyzer.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "storage/index/inverted/analyzer/analyzer.h"

#include <glog/logging.h>

#include "CLucene.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow-field"
#endif
// clang-format off
#include "common/compile_check_avoid_begin.h"
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/compile_check_avoid_end.h"
// clang-format on
#ifdef __clang__
#pragma clang diagnostic pop
#endif
#include "runtime/exec_env.h"
#include "runtime/index_policy/index_policy_mgr.h"
#include "storage/index/inverted/analyzer/basic/basic_analyzer.h"
#include "storage/index/inverted/analyzer/icu/icu_analyzer.h"
#include "storage/index/inverted/analyzer/ik/IKAnalyzer.h"
#include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h"

namespace doris::segment_v2::inverted_index {
#include "common/compile_check_begin.h"

// Creates the reader that feeds text into an analyzer.
//
// A plain SStringReader<char> is always created. It is wrapped in a
// CharReplaceCharFilter only when `char_filter_map` declares the char_replace
// filter type and also carries both a pattern and a replacement entry; in
// every other case the plain reader is returned unchanged.
ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) {
    ReaderPtr base_reader = std::make_shared<lucene::util::SStringReader<char>>();

    // No filter type, or a type other than char_replace: nothing to wrap.
    // (A lookup on an empty map simply yields end(), so no separate emptiness test is needed.)
    const auto type_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
    if (type_entry == char_filter_map.end() ||
        type_entry->second != INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
        return base_reader;
    }

    // The filter needs both settings; if either one is missing fall back to the plain reader.
    const auto pattern_entry = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
    const auto replacement_entry =
            char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
    if (pattern_entry == char_filter_map.end() || replacement_entry == char_filter_map.end()) {
        return base_reader;
    }

    return std::make_shared<CharReplaceCharFilter>(base_reader, pattern_entry->second,
                                                   replacement_entry->second);
}

// Returns true when `analyzer_name` equals one of the builtin parser names
// (none, standard, unicode, english, chinese, icu, basic, ik).
bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
    // The fold over || compares against the candidates from left to right and
    // stops at the first match, just like a hand-written chain of comparisons.
    const auto equals_any_of = [&analyzer_name](const auto&... candidates) {
        return ((analyzer_name == candidates) || ...);
    };

    return equals_any_of(INVERTED_INDEX_PARSER_NONE, INVERTED_INDEX_PARSER_STANDARD,
                         INVERTED_INDEX_PARSER_UNICODE, INVERTED_INDEX_PARSER_ENGLISH,
                         INVERTED_INDEX_PARSER_CHINESE, INVERTED_INDEX_PARSER_ICU,
                         INVERTED_INDEX_PARSER_BASIC, INVERTED_INDEX_PARSER_IK);
}

// Builds one of the builtin analyzers and applies the lowercase and stop-word
// settings to it.
//
//   parser_type  selects the analyzer implementation; any type without a
//                dedicated case falls back to SimpleAnalyzer<char>.
//   parser_mode  only consulted for the chinese and ik analyzers.
//   lower_case   INVERTED_INDEX_PARSER_TRUE / INVERTED_INDEX_PARSER_FALSE set the
//                lowercase flag explicitly; any other value leaves the analyzer's
//                own setting untouched.
//   stop_words   "none" clears the stop-word set; any other value installs the
//                standard95 stop-word set.
AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
                                                           const std::string& parser_mode,
                                                           const std::string& lower_case,
                                                           const std::string& stop_words) {
    std::shared_ptr<lucene::analysis::Analyzer> built;

    switch (parser_type) {
    case InvertedIndexParserType::PARSER_STANDARD:
    case InvertedIndexParserType::PARSER_UNICODE:
        built = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
        break;
    case InvertedIndexParserType::PARSER_ENGLISH:
        built = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
        break;
    case InvertedIndexParserType::PARSER_CHINESE: {
        auto chinese =
                std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
        chinese->initDict(config::inverted_index_dict_path);
        // Coarse granularity maps to the Default mode, everything else to All.
        const bool coarse_grained = (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY);
        chinese->setMode(coarse_grained ? lucene::analysis::AnalyzerMode::Default
                                        : lucene::analysis::AnalyzerMode::All);
        built = std::move(chinese);
        break;
    }
    case InvertedIndexParserType::PARSER_ICU:
        built = std::make_shared<ICUAnalyzer>();
        // The dictionary is loaded through the base-class pointer from the "/icu" sub-directory.
        built->initDict(config::inverted_index_dict_path + "/icu");
        break;
    case InvertedIndexParserType::PARSER_BASIC:
        built = std::make_shared<BasicAnalyzer>();
        break;
    case InvertedIndexParserType::PARSER_IK: {
        auto ik = std::make_shared<IKAnalyzer>();
        ik->initDict(config::inverted_index_dict_path + "/ik");
        // setMode(true) is used only when the smart parser mode was requested.
        ik->setMode(parser_mode == INVERTED_INDEX_PARSER_SMART);
        built = std::move(ik);
        break;
    }
    default:
        // Any other parser type gets the simple analyzer.
        built = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
        break;
    }

    // Lowercase: only an explicit true/false value changes the analyzer.
    const bool wants_lowercase = (lower_case == INVERTED_INDEX_PARSER_TRUE);
    if (wants_lowercase || lower_case == INVERTED_INDEX_PARSER_FALSE) {
        built->set_lowercase(wants_lowercase);
    }

    // Stop words: "none" removes them, anything else selects the standard95 set.
    const bool stop_words_disabled = (stop_words == "none");
    built->set_stopwords(stop_words_disabled ? nullptr
                                             : &lucene::analysis::standard95::stop_words);

    return built;
}

// Resolves the analyzer described by `config`.
//
// Resolution order:
//   1. empty analyzer name   -> builtin analyzer chosen by config->parser_type
//                               (the common case when no USING ANALYZER is given);
//   2. builtin analyzer name -> builtin analyzer whose parser type is derived from the name;
//   3. any other name        -> custom analyzer looked up in the index policy manager.
//
// Throws Exception(INVERTED_INDEX_ANALYZER_ERROR) when a custom analyzer is
// requested but the index policy manager is not available.
// `config` must not be null (checked with DCHECK).
AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) {
    DCHECK(config != nullptr);
    const std::string& requested_name = config->analyzer_name;

    // Cases 1 and 2 differ only in where the parser type comes from.
    const bool unnamed = requested_name.empty();
    if (unnamed || is_builtin_analyzer(requested_name)) {
        const InvertedIndexParserType resolved_type =
                unnamed ? config->parser_type
                        : get_inverted_index_parser_type_from_string(requested_name);
        return create_builtin_analyzer(resolved_type, config->parser_mode, config->lower_case,
                                       config->stop_words);
    }

    // Case 3: custom analyzer, resolved by name through the policy manager.
    if (auto* policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
        policy_mgr != nullptr) {
        return policy_mgr->get_policy_by_name(requested_name);
    }

    throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
                    "Index policy manager is not initialized");
}

// Tokenizes the content of `reader` with `analyzer` and returns the produced
// terms together with their positions.
//
//   reader    source of the text to analyze; handed to analyzer->tokenStream().
//   analyzer  analyzer to run; dereferenced unconditionally, so it must not be null.
//
// Returns one TermInfo per non-empty token, in stream order. A term's position
// is the running sum of the position increments of the non-empty tokens seen
// so far; zero-length tokens are skipped and do not advance the position.
// An empty vector is returned when the analyzer yields no token stream.
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
        ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
    std::vector<TermInfo> analyse_result;

    // The stream is held by unique_ptr, so it is released on every exit path.
    std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));

    // Validate the stream before its first use. Previously the null check sat
    // after the loop that already dereferenced the pointer, so it could never
    // prevent a null dereference.
    if (token_stream == nullptr) {
        return analyse_result;
    }

    lucene::analysis::Token token;
    int32_t position = 0;
    while (token_stream->next(&token)) {
        const auto term_length = token.termLength<char>();
        if (term_length == 0) {
            continue;
        }
        TermInfo t;
        t.term = std::string(token.termBuffer<char>(), term_length);
        position += token.getPositionIncrement();
        t.position = position;
        analyse_result.emplace_back(std::move(t));
    }

    token_stream->close();

    return analyse_result;
}

std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
        const std::string& search_str, const std::map<std::string, std::string>& properties) {
    if (!should_analyzer(properties)) {
        // Keyword index: all strings (including empty) are valid tokens for exact match.
        // Empty string is a valid value in keyword index and should be matchable.
        std::vector<TermInfo> result;
        result.emplace_back(search_str);
        return result;
    }
    InvertedIndexAnalyzerConfig config;
    config.analyzer_name = get_analyzer_name_from_properties(properties);
    config.parser_type = get_inverted_index_parser_type_from_string(
            get_parser_string_from_properties(properties));
    config.parser_mode = get_parser_mode_string_from_properties(properties);
    config.lower_case = get_parser_lowercase_from_properties(properties);
    config.stop_words = get_parser_stopwords_from_properties(properties);
    config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
    auto analyzer = create_analyzer(&config);
    auto reader = create_reader(config.char_filter_map);
    reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
    return get_analyse_result(reader, analyzer.get());
}

// Tells whether text indexed with `properties` has to go through an analyzer.
//
// Returns true when an analyzer name is present in the properties, or when the
// configured parser type is neither PARSER_UNKNOWN nor PARSER_NONE; returns
// false otherwise.
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
    const auto declared_parser = get_inverted_index_parser_type_from_string(
            get_parser_string_from_properties(properties));
    const auto custom_analyzer_name = get_analyzer_name_from_properties(properties);

    const bool parser_disabled = declared_parser == InvertedIndexParserType::PARSER_UNKNOWN ||
                                 declared_parser == InvertedIndexParserType::PARSER_NONE;

    // An explicit analyzer name wins regardless of the parser type.
    return !custom_analyzer_name.empty() || !parser_disabled;
}

} // namespace doris::segment_v2::inverted_index
#include "common/compile_check_end.h"

Coverage Report

Created: 2026-03-16 17:53

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "storage/index/inverted/analyzer/analyzer.h"
19
20		#include <glog/logging.h>
21
22		#include "CLucene.h"
23		#include "CLucene/analysis/LanguageBasedAnalyzer.h"
24
25		#ifdef __clang__
26		#pragma clang diagnostic push
27		#pragma clang diagnostic ignored "-Wshadow-field"
28		#endif
29		// clang-format off
30		#include "common/compile_check_avoid_begin.h"
31		#include "CLucene/analysis/standard95/StandardAnalyzer.h"
32		#include "common/compile_check_avoid_end.h"
33		// clang-format on
34		#ifdef __clang__
35		#pragma clang diagnostic pop
36		#endif
37		#include "runtime/exec_env.h"
38		#include "runtime/index_policy/index_policy_mgr.h"
39		#include "storage/index/inverted/analyzer/basic/basic_analyzer.h"
40		#include "storage/index/inverted/analyzer/icu/icu_analyzer.h"
41		#include "storage/index/inverted/analyzer/ik/IKAnalyzer.h"
42		#include "storage/index/inverted/char_filter/char_replace_char_filter_factory.h"
43
44		namespace doris::segment_v2::inverted_index {
45		#include "common/compile_check_begin.h"
46
47	2.00k	ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) {
48	2.00k	ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
49	2.00k	if (!char_filter_map.empty()) {
50	660	auto it_type = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
51	660	if (it_type != char_filter_map.end() &&
52	660	it_type->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
53	660	auto it_pattern = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
54	660	auto it_replacement =
55	660	char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
56	660	if (it_pattern != char_filter_map.end() && it_replacement != char_filter_map.end()) {
57	660	reader = std::make_shared<CharReplaceCharFilter>(reader, it_pattern->second,
58	660	it_replacement->second);
59	660	}
60	660	}
61	660	}
62	2.00k	return reader;
63	2.00k	}
64
65	20	bool InvertedIndexAnalyzer::is_builtin_analyzer(const std::string& analyzer_name) {
66	20	return analyzer_name == INVERTED_INDEX_PARSER_NONE \|\|
67	20	analyzer_name == INVERTED_INDEX_PARSER_STANDARD \|\|
68	20	analyzer_name == INVERTED_INDEX_PARSER_UNICODE \|\|
69	20	analyzer_name == INVERTED_INDEX_PARSER_ENGLISH \|\|
70	20	analyzer_name == INVERTED_INDEX_PARSER_CHINESE \|\|
71	20	analyzer_name == INVERTED_INDEX_PARSER_ICU \|\|
72	20	analyzer_name == INVERTED_INDEX_PARSER_BASIC \|\|
73	20	analyzer_name == INVERTED_INDEX_PARSER_IK;
74	20	}
75
76		AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserType parser_type,
77		const std::string& parser_mode,
78		const std::string& lower_case,
79	1.57k	const std::string& stop_words) {
80	1.57k	std::shared_ptr<lucene::analysis::Analyzer> analyzer;
81
82	1.57k	if (parser_type == InvertedIndexParserType::PARSER_STANDARD \|\|
83	1.57k	parser_type == InvertedIndexParserType::PARSER_UNICODE) {
84	479	analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
85	1.09k	} else if (parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
86	425	analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
87	673	} else if (parser_type == InvertedIndexParserType::PARSER_CHINESE) {
88	661	auto chinese_analyzer =
89	661	std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
90	661	chinese_analyzer->initDict(config::inverted_index_dict_path);
91	661	if (parser_mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
92	329	chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
93	332	} else {
94	332	chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
95	332	}
96	661	analyzer = std::move(chinese_analyzer);
97	661	} else if (parser_type == InvertedIndexParserType::PARSER_ICU) {
98	3	analyzer = std::make_shared<ICUAnalyzer>();
99	3	analyzer->initDict(config::inverted_index_dict_path + "/icu");
100	9	} else if (parser_type == InvertedIndexParserType::PARSER_BASIC) {
101	3	analyzer = std::make_shared<BasicAnalyzer>();
102	6	} else if (parser_type == InvertedIndexParserType::PARSER_IK) {
103	4	auto ik_analyzer = std::make_shared<IKAnalyzer>();
104	4	ik_analyzer->initDict(config::inverted_index_dict_path + "/ik");
105	4	if (parser_mode == INVERTED_INDEX_PARSER_SMART) {
106	1	ik_analyzer->setMode(true);
107	3	} else {
108	3	ik_analyzer->setMode(false);
109	3	}
110	4	analyzer = std::move(ik_analyzer);
111	4	} else {
112		// default
113	2	analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
114	2	}
115
116		// set lowercase
117	1.57k	if (lower_case == INVERTED_INDEX_PARSER_TRUE) {
118	852	analyzer->set_lowercase(true);
119	852	} else if (lower_case == INVERTED_INDEX_PARSER_FALSE) {
120	658	analyzer->set_lowercase(false);
121	658	}
122
123		// set stop words
124	1.57k	if (stop_words == "none") {
125	3	analyzer->set_stopwords(nullptr);
126	1.57k	} else {
127	1.57k	analyzer->set_stopwords(&lucene::analysis::standard95::stop_words);
128	1.57k	}
129
130	1.57k	return analyzer;
131	1.57k	}
132
133	1.56k	AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) {
134	1.56k	DCHECK(config != nullptr);
135	1.56k	const std::string& analyzer_name = config->analyzer_name;
136
137		// Handle empty analyzer name - use builtin analyzer based on parser_type.
138		// This is the common case when user does not specify USING ANALYZER.
139	1.56k	if (analyzer_name.empty()) {
140	1.55k	return create_builtin_analyzer(config->parser_type, config->parser_mode, config->lower_case,
141	1.55k	config->stop_words);
142	1.55k	}
143
144		// Check if it's a builtin analyzer name (english, chinese, standard, etc.)
145	9	if (is_builtin_analyzer(analyzer_name)) {
146	8	InvertedIndexParserType parser_type =
147	8	get_inverted_index_parser_type_from_string(analyzer_name);
148	8	return create_builtin_analyzer(parser_type, config->parser_mode, config->lower_case,
149	8	config->stop_words);
150	8	}
151
152		// Custom analyzer - look up in policy manager
153	1	auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
154	1	if (!index_policy_mgr) {
155	1	throw Exception(ErrorCode::INVERTED_INDEX_ANALYZER_ERROR,
156	1	"Index policy manager is not initialized");
157	1	}
158
159	0	return index_policy_mgr->get_policy_by_name(analyzer_name);
160	1	}
161
162		std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
163	169	ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
164	169	std::vector<TermInfo> analyse_result;
165
166	169	std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
167
168	169	lucene::analysis::Token token;
169	169	int32_t position = 0;
170	808	while (token_stream->next(&token)) {
171	639	if (token.termLength<char>() != 0) {
172	639	TermInfo t;
173	639	t.term = std::string(token.termBuffer<char>(), token.termLength<char>());
174	639	position += token.getPositionIncrement();
175	639	t.position = position;
176	639	analyse_result.emplace_back(std::move(t));
177	639	}
178	639	}
179
180	169	if (token_stream != nullptr) {
181	169	token_stream->close();
182	169	}
183
184	169	return analyse_result;
185	169	}
186
187		std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
188	36	const std::string& search_str, const std::map<std::string, std::string>& properties) {
189	36	if (!should_analyzer(properties)) {
190		// Keyword index: all strings (including empty) are valid tokens for exact match.
191		// Empty string is a valid value in keyword index and should be matchable.
192	0	std::vector<TermInfo> result;
193	0	result.emplace_back(search_str);
194	0	return result;
195	0	}
196	36	InvertedIndexAnalyzerConfig config;
197	36	config.analyzer_name = get_analyzer_name_from_properties(properties);
198	36	config.parser_type = get_inverted_index_parser_type_from_string(
199	36	get_parser_string_from_properties(properties));
200	36	config.parser_mode = get_parser_mode_string_from_properties(properties);
201	36	config.lower_case = get_parser_lowercase_from_properties(properties);
202	36	config.stop_words = get_parser_stopwords_from_properties(properties);
203	36	config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
204	36	auto analyzer = create_analyzer(&config);
205	36	auto reader = create_reader(config.char_filter_map);
206	36	reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
207	36	return get_analyse_result(reader, analyzer.get());
208	36	}
209
210	4.88k	bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
211	4.88k	auto parser_type = get_inverted_index_parser_type_from_string(
212	4.88k	get_parser_string_from_properties(properties));
213	4.88k	auto analyzer_name = get_analyzer_name_from_properties(properties);
214	4.88k	if (!analyzer_name.empty()) {
215	0	return true;
216	0	}
217	4.88k	if (parser_type != InvertedIndexParserType::PARSER_UNKNOWN &&
218	4.88k	parser_type != InvertedIndexParserType::PARSER_NONE) {
219	3.19k	return true;
220	3.19k	}
221	1.69k	return false;
222	4.88k	}
223
224		} // namespace doris::segment_v2::inverted_index
225		#include "common/compile_check_end.h"