be/src/runtime/index_policy/index_policy_mgr.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "runtime/index_policy/index_policy_mgr.h"

#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <unordered_set>
#include <utility>

namespace doris {

// Names of the normalizers that can be resolved without a registered policy.
// Entries are stored in lowercase; get_policy_by_name() tests membership with an
// already-normalized (trimmed, lowercased) name via is_builtin_normalizer().
const std::unordered_set<std::string> IndexPolicyMgr::BUILTIN_NORMALIZERS = {"lowercase"};

// Produces the canonical lookup key for a policy name: leading/trailing whitespace
// is removed and the remaining characters are lowercased. The argument itself is
// not modified; a new string is returned.
std::string IndexPolicyMgr::normalize_name(const std::string& name) {
    return boost::algorithm::to_lower_copy(boost::algorithm::trim_copy(name));
}

void IndexPolicyMgr::apply_policy_changes(const std::vector<TIndexPolicy>& policys_to_update,
                                          const std::vector<int64_t>& policys_to_delete) {
    LOG(INFO) << "Starting policy changes - "
              << "Updates: " << policys_to_update.size() << " policies, "
              << "Deletions: " << policys_to_delete.size() << " policies";

    std::unique_lock lock(_mutex);
    int32_t success_deletes = 0;
    int32_t success_updates = 0;

    for (auto id : policys_to_delete) {
        if (auto it = _policys.find(id); it != _policys.end()) {
            LOG(INFO) << "Deleting policy - "
                      << "ID: " << id << ", "
                      << "Name: " << it->second.name;

            // Use normalized name for deletion
            _name_to_id.erase(normalize_name(it->second.name));
            _policys.erase(it);
            success_deletes++;
        } else {
            LOG(WARNING) << "Delete failed - Policy ID not found: " << id;
        }
    }

    for (const auto& policy : policys_to_update) {
        if (_policys.contains(policy.id)) {
            LOG(ERROR) << "Reject update - Duplicate policy ID: " << policy.id
                       << " | Existing name: " << _policys[policy.id].name
                       << " | New name: " << policy.name;
            continue;
        }

        // Use normalized name for case-insensitive lookup
        std::string normalized_name = normalize_name(policy.name);
        if (_name_to_id.contains(normalized_name)) {
            LOG(ERROR) << "Reject update - Duplicate policy name: " << policy.name
                       << " | Existing ID: " << _name_to_id[normalized_name]
                       << " | New ID: " << policy.id;
            continue;
        }

        _policys.emplace(policy.id, policy);
        // Store with normalized key for case-insensitive lookup
        _name_to_id.emplace(normalized_name, policy.id);
        success_updates++;

        LOG(INFO) << "Successfully applied policy - "
                  << "ID: " << policy.id << ", "
                  << "Name: " << policy.name << ", "
                  << "Type: " << policy.type;
    }

    LOG(INFO) << "Policy changes completed - "
              << "Deleted: " << success_deletes << "/" << policys_to_delete.size() << ", "
              << "Updated: " << success_updates << "/" << policys_to_update.size() << ", "
              << "Total policies: " << _policys.size();
}

// Returns a snapshot of every registered policy. The copy is made while the shared
// lock is held, so the caller can use the result after the lock has been released.
Policys IndexPolicyMgr::get_index_policys() {
    std::shared_lock read_guard(_mutex);
    Policys snapshot = _policys;
    return snapshot;
}

// Resolves `name` to an analyzer. Matching is case-insensitive and ignores surrounding
// whitespace (see normalize_name()).
//
// Resolution order:
//   1. a policy registered through apply_policy_changes();
//   2. a built-in normalizer listed in BUILTIN_NORMALIZERS.
//
// Locking: one shared_lock on _mutex is taken here and held for the whole call, so
// nested policy references (e.g. tokenizer policies) are resolved against a consistent
// view. build_analyzer_from_policy()/build_normalizer_from_policy() read _name_to_id
// and _policys but do NOT lock _mutex themselves; they rely on this lock. Do not add a
// lock acquisition inside those helpers: std::shared_mutex is not recursive, and
// locking it again from a thread that already owns it is undefined behavior.
//
// Throws Exception(ErrorCode::INVALID_ARGUMENT) when no policy or built-in normalizer
// matches the name, when the name index points at an ID missing from _policys, or when
// the policy type is neither ANALYZER nor NORMALIZER.
AnalyzerPtr IndexPolicyMgr::get_policy_by_name(const std::string& name) {
    std::shared_lock lock(_mutex);

    // Use normalized name for case-insensitive lookup
    const std::string normalized_name = normalize_name(name);
    auto name_it = _name_to_id.find(normalized_name);
    if (name_it == _name_to_id.end()) {
        if (is_builtin_normalizer(normalized_name)) {
            // Pass the normalized name: build_builtin_normalizer() compares against the
            // exact lowercase literal, so the raw input ("LowerCase", " lowercase ")
            // would pass the check above and then be rejected as unknown.
            return build_builtin_normalizer(normalized_name);
        }
        throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with name: " + name);
    }

    auto policy_it = _policys.find(name_it->second);
    if (policy_it == _policys.end()) {
        throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with id: " + name);
    }

    const auto& index_policy = policy_it->second;
    if (index_policy.type == TIndexPolicyType::ANALYZER) {
        return build_analyzer_from_policy(index_policy);
    } else if (index_policy.type == TIndexPolicyType::NORMALIZER) {
        return build_normalizer_from_policy(index_policy);
    }

    throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with type: " + name);
}

// Builds a custom analyzer from an ANALYZER policy.
//
// The policy must carry a non-empty PROP_TOKENIZER property. If that value names a
// registered policy (case-insensitive), the referenced policy's PROP_TYPE selects the
// tokenizer and its remaining properties become the tokenizer settings; otherwise the
// value is used directly as the tokenizer name with empty settings. Char filters and
// token filters are then added via process_filter_configs().
//
// Reads _name_to_id and _policys without taking _mutex; get_policy_by_name() calls
// this with the shared lock held. Because only a shared lock may be held, the maps are
// accessed through find() only: operator[] is a non-const, potentially inserting
// operation and must not be used concurrently by readers.
//
// Throws Exception(ErrorCode::INVALID_ARGUMENT) when the tokenizer property is missing
// or empty, or when the referenced tokenizer policy is unavailable or has no PROP_TYPE.
AnalyzerPtr IndexPolicyMgr::build_analyzer_from_policy(const TIndexPolicy& index_policy_analyzer) {
    segment_v2::inverted_index::CustomAnalyzerConfig::Builder builder;

    auto tokenizer_it = index_policy_analyzer.properties.find(PROP_TOKENIZER);
    if (tokenizer_it == index_policy_analyzer.properties.end() || tokenizer_it->second.empty()) {
        throw Exception(
                ErrorCode::INVALID_ARGUMENT,
                "Invalid tokenizer configuration in policy: analyzer must have a tokenizer");
    }

    const auto& tokenizer_name = tokenizer_it->second;
    // Use normalized name for case-insensitive lookup
    const std::string normalized_tokenizer_name = normalize_name(tokenizer_name);
    if (auto id_it = _name_to_id.find(normalized_tokenizer_name); id_it != _name_to_id.end()) {
        // Nested tokenizer policy. A name-index entry without a matching policy is
        // reported with the same error as a policy lacking PROP_TYPE, instead of
        // default-inserting an empty policy into _policys.
        auto policy_it = _policys.find(id_it->second);
        if (policy_it == _policys.end()) {
            throw Exception(ErrorCode::INVALID_ARGUMENT,
                            "Invalid tokenizer configuration in policy: " + tokenizer_name);
        }

        const auto& tokenizer_policy = policy_it->second;
        auto type_it = tokenizer_policy.properties.find(PROP_TYPE);
        if (type_it == tokenizer_policy.properties.end()) {
            throw Exception(ErrorCode::INVALID_ARGUMENT,
                            "Invalid tokenizer configuration in policy: " + tokenizer_name);
        }

        // Every property except the type itself is forwarded as a tokenizer setting.
        segment_v2::inverted_index::Settings settings;
        for (const auto& prop : tokenizer_policy.properties) {
            if (prop.first != PROP_TYPE) {
                settings.set(prop.first, prop.second);
            }
        }
        builder.with_tokenizer_config(type_it->second, settings);
    } else {
        // Not a registered policy: use the value as the tokenizer name with no settings.
        builder.with_tokenizer_config(tokenizer_name, {});
    }

    process_filter_configs(index_policy_analyzer, PROP_CHAR_FILTER, "char filter",
                           [&builder](const std::string& name,
                                      const segment_v2::inverted_index::Settings& settings) {
                               builder.add_char_filter_config(name, settings);
                           });

    process_filter_configs(index_policy_analyzer, PROP_TOKEN_FILTER, "token filter",
                           [&builder](const std::string& name,
                                      const segment_v2::inverted_index::Settings& settings) {
                               builder.add_token_filter_config(name, settings);
                           });

    auto custom_analyzer_config = builder.build();
    return segment_v2::inverted_index::CustomAnalyzer::build_custom_analyzer(
            custom_analyzer_config);
}

// Builds a custom normalizer from a NORMALIZER policy. Char filters listed under
// PROP_CHAR_FILTER are added first, then token filters listed under PROP_TOKEN_FILTER;
// both lists are expanded by process_filter_configs().
AnalyzerPtr IndexPolicyMgr::build_normalizer_from_policy(
        const TIndexPolicy& index_policy_normalizer) {
    using FilterSettings = segment_v2::inverted_index::Settings;

    segment_v2::inverted_index::CustomNormalizerConfig::Builder builder;

    // Callbacks that route each resolved filter into the builder.
    auto add_char_filter = [&builder](const std::string& name, const FilterSettings& settings) {
        builder.add_char_filter_config(name, settings);
    };
    auto add_token_filter = [&builder](const std::string& name, const FilterSettings& settings) {
        builder.add_token_filter_config(name, settings);
    };

    process_filter_configs(index_policy_normalizer, PROP_CHAR_FILTER, "char filter",
                           add_char_filter);
    process_filter_configs(index_policy_normalizer, PROP_TOKEN_FILTER, "token filter",
                           add_token_filter);

    auto normalizer_config = builder.build();
    return segment_v2::inverted_index::CustomNormalizer::build_custom_normalizer(
            normalizer_config);
}

// Expands the comma-separated filter list stored under `prop_name` in the given policy
// and reports each filter through `add_config_func(type_or_name, settings)`.
//
// Each list entry is trimmed; empty entries are skipped. An entry that names a
// registered policy (case-insensitive) is reported with that policy's PROP_TYPE as the
// name and its remaining properties as settings. Any other entry is reported as-is
// with empty settings. If the property is absent, nothing is reported.
//
// `error_prefix` is only used to build the exception message (e.g. "char filter").
//
// Reads _name_to_id and _policys without taking _mutex, so the caller is expected to
// hold it. The maps are accessed through find() only: operator[] is a non-const,
// potentially inserting operation and must not be used by concurrent readers.
//
// Throws Exception(ErrorCode::INVALID_ARGUMENT) when a referenced filter policy is
// unavailable or has no PROP_TYPE.
void IndexPolicyMgr::process_filter_configs(
        const TIndexPolicy& index_policy_analyzer, const std::string& prop_name,
        const std::string& error_prefix,
        std::function<void(const std::string&, const segment_v2::inverted_index::Settings&)>
                add_config_func) {
    auto filter_it = index_policy_analyzer.properties.find(prop_name);
    if (filter_it == index_policy_analyzer.properties.end()) {
        return;
    }

    std::vector<std::string> filter_strs;
    boost::split(filter_strs, filter_it->second, boost::is_any_of(","));

    for (auto& filter_name : filter_strs) {
        boost::trim(filter_name);
        if (filter_name.empty()) {
            continue;
        }

        // Use normalized name for case-insensitive lookup
        const std::string normalized_filter_name = normalize_name(filter_name);
        auto id_it = _name_to_id.find(normalized_filter_name);
        if (id_it == _name_to_id.end()) {
            // Simple filter
            add_config_func(filter_name, {});
            continue;
        }

        // Nested filter policy. A name-index entry without a matching policy is
        // reported with the same error as a policy lacking PROP_TYPE, instead of
        // default-inserting an empty policy into _policys.
        auto policy_it = _policys.find(id_it->second);
        if (policy_it == _policys.end()) {
            throw Exception(ErrorCode::INVALID_ARGUMENT,
                            "Invalid " + error_prefix + " configuration in policy: " + filter_name);
        }

        const auto& filter_policy = policy_it->second;
        auto type_it = filter_policy.properties.find(PROP_TYPE);
        if (type_it == filter_policy.properties.end()) {
            throw Exception(ErrorCode::INVALID_ARGUMENT,
                            "Invalid " + error_prefix + " configuration in policy: " + filter_name);
        }

        // Every property except the type itself is forwarded as a filter setting.
        segment_v2::inverted_index::Settings settings;
        for (const auto& prop : filter_policy.properties) {
            if (prop.first != PROP_TYPE) {
                settings.set(prop.first, prop.second);
            }
        }
        add_config_func(type_it->second, settings);
    }
}

// Reports whether `name` is one of the entries in BUILTIN_NORMALIZERS. The comparison
// is an exact string match; no trimming or case folding is done here.
bool IndexPolicyMgr::is_builtin_normalizer(const std::string& name) {
    return BUILTIN_NORMALIZERS.count(name) != 0;
}

// Builds the built-in normalizer identified by `name`. Only the exact name
// "lowercase" is recognized; it yields a normalizer with a single "lowercase" token
// filter. Any other name throws Exception(ErrorCode::INVALID_ARGUMENT).
AnalyzerPtr IndexPolicyMgr::build_builtin_normalizer(const std::string& name) {
    using namespace segment_v2::inverted_index;

    // Reject anything that is not a known built-in before touching the builder.
    if (name != "lowercase") {
        throw Exception(ErrorCode::INVALID_ARGUMENT, "Unknown builtin normalizer: " + name);
    }

    CustomNormalizerConfig::Builder lowercase_builder;
    lowercase_builder.add_token_filter_config("lowercase", Settings {});
    auto lowercase_config = lowercase_builder.build();
    return CustomNormalizer::build_custom_normalizer(lowercase_config);
}

} // namespace doris

Coverage Report

Created: 2026-03-14 20:54

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "runtime/index_policy/index_policy_mgr.h"
19
20		#include <boost/algorithm/string.hpp>
21		#include <boost/algorithm/string/split.hpp>
22		#include <boost/algorithm/string/trim.hpp>
23		#include <unordered_set>
24		#include <utility>
25
26		namespace doris {
27
28		const std::unordered_set<std::string> IndexPolicyMgr::BUILTIN_NORMALIZERS = {"lowercase"};
29
30	41	std::string IndexPolicyMgr::normalize_name(const std::string& name) {
31	41	std::string result = name;
32	41	boost::algorithm::trim(result);
33	41	boost::algorithm::to_lower(result);
34	41	return result;
35	41	}
36
37		void IndexPolicyMgr::apply_policy_changes(const std::vector<TIndexPolicy>& policys_to_update,
38	11	const std::vector<int64_t>& policys_to_delete) {
39	11	LOG(INFO) << "Starting policy changes - "
40	11	<< "Updates: " << policys_to_update.size() << " policies, "
41	11	<< "Deletions: " << policys_to_delete.size() << " policies";
42
43	11	std::unique_lock lock(_mutex);
44	11	int32_t success_deletes = 0;
45	11	int32_t success_updates = 0;
46
47	11	for (auto id : policys_to_delete) {
48	2	if (auto it = _policys.find(id); it != _policys.end()) {
49	2	LOG(INFO) << "Deleting policy - "
50	2	<< "ID: " << id << ", "
51	2	<< "Name: " << it->second.name;
52
53		// Use normalized name for deletion
54	2	_name_to_id.erase(normalize_name(it->second.name));
55	2	_policys.erase(it);
56	2	success_deletes++;
57	2	} else {
58	0	LOG(WARNING) << "Delete failed - Policy ID not found: " << id;
59	0	}
60	2	}
61
62	23	for (const auto& policy : policys_to_update) {
63	23	if (_policys.contains(policy.id)) {
64	1	LOG(ERROR) << "Reject update - Duplicate policy ID: " << policy.id
65	1	<< " \| Existing name: " << _policys[policy.id].name
66	1	<< " \| New name: " << policy.name;
67	1	continue;
68	1	}
69
70		// Use normalized name for case-insensitive lookup
71	22	std::string normalized_name = normalize_name(policy.name);
72	22	if (_name_to_id.contains(normalized_name)) {
73	1	LOG(ERROR) << "Reject update - Duplicate policy name: " << policy.name
74	1	<< " \| Existing ID: " << _name_to_id[normalized_name]
75	1	<< " \| New ID: " << policy.id;
76	1	continue;
77	1	}
78
79	21	_policys.emplace(policy.id, policy);
80		// Store with normalized key for case-insensitive lookup
81	21	_name_to_id.emplace(normalized_name, policy.id);
82	21	success_updates++;
83
84	21	LOG(INFO) << "Successfully applied policy - "
85	21	<< "ID: " << policy.id << ", "
86	21	<< "Name: " << policy.name << ", "
87	21	<< "Type: " << policy.type;
88	21	}
89
90	11	LOG(INFO) << "Policy changes completed - "
91	11	<< "Deleted: " << success_deletes << "/" << policys_to_delete.size() << ", "
92	11	<< "Updated: " << success_updates << "/" << policys_to_update.size() << ", "
93	11	<< "Total policies: " << _policys.size();
94	11	}
95
96	5	Policys IndexPolicyMgr::get_index_policys() {
97	5	std::shared_lock<std::shared_mutex> r_lock(_mutex);
98	5	return _policys; // Return copy to ensure thread safety after lock release
99	5	}
100
101		// NOTE: This function holds a shared_lock while calling build_analyzer_from_policy/
102		// build_normalizer_from_policy, which also access _name_to_id and _policys.
103		// This is safe because std::shared_mutex allows the same thread to hold multiple
104		// shared_locks (read locks are reentrant). The lock is held throughout to ensure
105		// consistency when resolving nested policy references (e.g., tokenizer policies).
106	7	AnalyzerPtr IndexPolicyMgr::get_policy_by_name(const std::string& name) {
107	7	std::shared_lock lock(_mutex);
108
109		// Use normalized name for case-insensitive lookup
110	7	std::string normalized_name = normalize_name(name);
111	7	auto name_it = _name_to_id.find(normalized_name);
112	7	if (name_it == _name_to_id.end()) {
113	1	if (is_builtin_normalizer(normalized_name)) {
114	0	return build_builtin_normalizer(name);
115	0	}
116	1	throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with name: " + name);
117	1	}
118
119	6	auto policy_it = _policys.find(name_it->second);
120	6	if (policy_it == _policys.end()) {
121	0	throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with id: " + name);
122	0	}
123
124	6	const auto& index_policy = policy_it->second;
125	6	if (index_policy.type == TIndexPolicyType::ANALYZER) {
126	6	return build_analyzer_from_policy(index_policy);
127	6	} else if (index_policy.type == TIndexPolicyType::NORMALIZER) {
128	0	return build_normalizer_from_policy(index_policy);
129	0	}
130
131	0	throw Exception(ErrorCode::INVALID_ARGUMENT, "Policy not found with type: " + name);
132	6	}
133
134	6	AnalyzerPtr IndexPolicyMgr::build_analyzer_from_policy(const TIndexPolicy& index_policy_analyzer) {
135	6	segment_v2::inverted_index::CustomAnalyzerConfig::Builder builder;
136
137	6	auto tokenizer_it = index_policy_analyzer.properties.find(PROP_TOKENIZER);
138	6	if (tokenizer_it == index_policy_analyzer.properties.end() \|\| tokenizer_it->second.empty()) {
139	1	throw Exception(
140	1	ErrorCode::INVALID_ARGUMENT,
141	1	"Invalid tokenizer configuration in policy: analyzer must have a tokenizer");
142	1	}
143
144	5	const auto& tokenizer_name = tokenizer_it->second;
145		// Use normalized name for case-insensitive lookup
146	5	std::string normalized_tokenizer_name = normalize_name(tokenizer_name);
147	5	if (_name_to_id.contains(normalized_tokenizer_name)) {
148	4	const auto& tokenizer_policy = _policys[_name_to_id[normalized_tokenizer_name]];
149	4	auto type_it = tokenizer_policy.properties.find(PROP_TYPE);
150	4	if (type_it == tokenizer_policy.properties.end()) {
151	0	throw Exception(ErrorCode::INVALID_ARGUMENT,
152	0	"Invalid tokenizer configuration in policy: " + tokenizer_name);
153	0	}
154
155	4	segment_v2::inverted_index::Settings settings;
156	9	for (const auto& prop : tokenizer_policy.properties) {
157	9	if (prop.first != PROP_TYPE) {
158	5	settings.set(prop.first, prop.second);
159	5	}
160	9	}
161	4	builder.with_tokenizer_config(type_it->second, settings);
162	4	} else {
163	1	builder.with_tokenizer_config(tokenizer_name, {});
164	1	}
165
166	5	process_filter_configs(index_policy_analyzer, PROP_CHAR_FILTER, "char filter",
167	5	[&builder](const std::string& name,
168	5	const segment_v2::inverted_index::Settings& settings) {
169	0	builder.add_char_filter_config(name, settings);
170	0	});
171
172	5	process_filter_configs(index_policy_analyzer, PROP_TOKEN_FILTER, "token filter",
173	5	[&builder](const std::string& name,
174	5	const segment_v2::inverted_index::Settings& settings) {
175	5	builder.add_token_filter_config(name, settings);
176	5	});
177
178	5	auto custom_analyzer_config = builder.build();
179	5	return segment_v2::inverted_index::CustomAnalyzer::build_custom_analyzer(
180	5	custom_analyzer_config);
181	5	}
182
183		AnalyzerPtr IndexPolicyMgr::build_normalizer_from_policy(
184	0	const TIndexPolicy& index_policy_normalizer) {
185	0	segment_v2::inverted_index::CustomNormalizerConfig::Builder builder;
186
187	0	process_filter_configs(index_policy_normalizer, PROP_CHAR_FILTER, "char filter",
188	0	[&builder](const std::string& name,
189	0	const segment_v2::inverted_index::Settings& settings) {
190	0	builder.add_char_filter_config(name, settings);
191	0	});
192
193	0	process_filter_configs(index_policy_normalizer, PROP_TOKEN_FILTER, "token filter",
194	0	[&builder](const std::string& name,
195	0	const segment_v2::inverted_index::Settings& settings) {
196	0	builder.add_token_filter_config(name, settings);
197	0	});
198
199	0	auto custom_normalizer_config = builder.build();
200	0	return segment_v2::inverted_index::CustomNormalizer::build_custom_normalizer(
201	0	custom_normalizer_config);
202	0	}
203
204		void IndexPolicyMgr::process_filter_configs(
205		const TIndexPolicy& index_policy_analyzer, const std::string& prop_name,
206		const std::string& error_prefix,
207		std::function<void(const std::string&, const segment_v2::inverted_index::Settings&)>
208	10	add_config_func) {
209	10	auto filter_it = index_policy_analyzer.properties.find(prop_name);
210	10	if (filter_it == index_policy_analyzer.properties.end()) {
211	6	return;
212	6	}
213
214	4	std::vector<std::string> filter_strs;
215	4	boost::split(filter_strs, filter_it->second, boost::is_any_of(","));
216
217	6	for (auto& filter_name : filter_strs) {
218	6	boost::trim(filter_name);
219	6	if (filter_name.empty()) {
220	1	continue;
221	1	}
222
223		// Use normalized name for case-insensitive lookup
224	5	std::string normalized_filter_name = normalize_name(filter_name);
225	5	if (_name_to_id.contains(normalized_filter_name)) {
226		// Nested filter policy
227	4	const auto& filter_policy = _policys[_name_to_id[normalized_filter_name]];
228	4	auto type_it = filter_policy.properties.find(PROP_TYPE);
229	4	if (type_it == filter_policy.properties.end()) {
230	0	throw Exception(
231	0	ErrorCode::INVALID_ARGUMENT,
232	0	"Invalid " + error_prefix + " configuration in policy: " + filter_name);
233	0	}
234
235	4	segment_v2::inverted_index::Settings settings;
236	4	for (const auto& prop : filter_policy.properties) {
237	4	if (prop.first != PROP_TYPE) {
238	0	settings.set(prop.first, prop.second);
239	0	}
240	4	}
241	4	add_config_func(type_it->second, settings);
242	4	} else {
243		// Simple filter
244	1	add_config_func(filter_name, {});
245	1	}
246	5	}
247	4	}
248
249	1	bool IndexPolicyMgr::is_builtin_normalizer(const std::string& name) {
250	1	return BUILTIN_NORMALIZERS.contains(name);
251	1	}
252
253	0	AnalyzerPtr IndexPolicyMgr::build_builtin_normalizer(const std::string& name) {
254	0	using namespace segment_v2::inverted_index;
255
256	0	if (name == "lowercase") {
257	0	CustomNormalizerConfig::Builder builder;
258	0	builder.add_token_filter_config("lowercase", Settings {});
259	0	auto config = builder.build();
260	0	return CustomNormalizer::build_custom_normalizer(config);
261	0	}
262
263	0	throw Exception(ErrorCode::INVALID_ARGUMENT, "Unknown builtin normalizer: " + name);
264	0	}
265
266		} // namespace doris