Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/function/function_search.h" |
19 | | |
20 | | #include <CLucene/config/repl_wchar.h> |
21 | | #include <CLucene/search/Scorer.h> |
22 | | #include <fmt/format.h> |
23 | | #include <gen_cpp/Exprs_types.h> |
24 | | #include <glog/logging.h> |
25 | | |
26 | | #include <limits> |
27 | | #include <memory> |
28 | | #include <roaring/roaring.hh> |
29 | | #include <set> |
30 | | #include <string> |
31 | | #include <unordered_map> |
32 | | #include <unordered_set> |
33 | | #include <vector> |
34 | | |
35 | | #include "common/status.h" |
36 | | #include "core/block/columns_with_type_and_name.h" |
37 | | #include "core/column/column_const.h" |
38 | | #include "core/data_type/data_type_array.h" |
39 | | #include "core/data_type/data_type_nullable.h" |
40 | | #include "core/data_type/data_type_string.h" |
41 | | #include "exprs/function/simple_function_factory.h" |
42 | | #include "exprs/function/variant_inverted_index_search.h" |
43 | | #include "exprs/vexpr_context.h" |
44 | | #include "runtime/runtime_profile.h" |
45 | | #include "storage/index/index_file_reader.h" |
46 | | #include "storage/index/index_query_context.h" |
47 | | #include "storage/index/inverted/analyzer/analyzer.h" |
48 | | #include "storage/index/inverted/inverted_index_compound_reader.h" |
49 | | #include "storage/index/inverted/inverted_index_iterator.h" |
50 | | #include "storage/index/inverted/inverted_index_parser.h" |
51 | | #include "storage/index/inverted/inverted_index_reader.h" |
52 | | #include "storage/index/inverted/inverted_index_searcher.h" |
53 | | #include "storage/index/inverted/query/query_helper.h" |
54 | | #include "storage/index/inverted/query_v2/all_query/all_query.h" |
55 | | #include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h" |
56 | | #include "storage/index/inverted/query_v2/boolean_query/boolean_query_builder.h" |
57 | | #include "storage/index/inverted/query_v2/boolean_query/operator.h" |
58 | | #include "storage/index/inverted/query_v2/collect/doc_set_collector.h" |
59 | | #include "storage/index/inverted/query_v2/collect/top_k_collector.h" |
60 | | #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h" |
61 | | #include "storage/index/inverted/query_v2/phrase_query/phrase_query.h" |
62 | | #include "storage/index/inverted/query_v2/regexp_query/regexp_query.h" |
63 | | #include "storage/index/inverted/query_v2/term_query/term_query.h" |
64 | | #include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h" |
65 | | #include "storage/index/inverted/util/string_helper.h" |
66 | | #include "storage/olap_common.h" |
67 | | #include "storage/segment/variant/nested_group_provider.h" |
68 | | #include "storage/types.h" |
69 | | #include "util/debug_points.h" |
70 | | #include "util/string_parser.hpp" |
71 | | #include "util/string_util.h" |
72 | | #include "util/thrift_util.h" |
73 | | |
74 | | namespace doris { |
75 | | |
76 | | // Build canonical DSL signature for cache key. |
77 | | // Serializes the entire TSearchParam via Thrift binary protocol so that |
78 | | // every field (DSL, AST root, field bindings, default_operator, |
79 | | // minimum_should_match, etc.) is included automatically. |
80 | 1.30k | static std::string build_dsl_signature(const TSearchParam& param) { |
81 | 1.30k | ThriftSerializer ser(false, 1024); |
82 | 1.30k | TSearchParam copy = param; |
83 | 1.30k | std::string sig; |
84 | 1.30k | auto st = ser.serialize(©, &sig); |
85 | 1.30k | if (UNLIKELY(!st.ok())) { |
86 | 0 | LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string() |
87 | 0 | << ", caching disabled for this query"; |
88 | 0 | return ""; |
89 | 0 | } |
90 | 1.30k | return sig; |
91 | 1.30k | } |
92 | | |
93 | | // Extract segment path prefix from the first available inverted index iterator. |
94 | | // All fields in the same segment share the same path prefix. |
95 | | static std::string extract_segment_prefix( |
96 | 1.29k | const std::unordered_map<std::string, IndexIterator*>& iterators) { |
97 | 1.29k | for (const auto& [field_name, iter] : iterators) { |
98 | 1.29k | auto* inv_iter = dynamic_cast<InvertedIndexIterator*>(iter); |
99 | 1.29k | if (!inv_iter) continue; |
100 | | // Try fulltext reader first, then string type |
101 | 1.26k | for (auto type : |
102 | 1.58k | {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) { |
103 | 1.58k | IndexReaderType reader_type = type; |
104 | 1.58k | auto reader = inv_iter->get_reader(reader_type); |
105 | 1.58k | if (!reader) continue; |
106 | 1.27k | auto inv_reader = std::dynamic_pointer_cast<InvertedIndexReader>(reader); |
107 | 1.27k | if (!inv_reader) continue; |
108 | 1.27k | auto file_reader = inv_reader->get_index_file_reader(); |
109 | 1.27k | if (!file_reader) continue; |
110 | 1.27k | return file_reader->get_index_path_prefix(); |
111 | 1.27k | } |
112 | 1.26k | } |
113 | 18.4E | VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across " |
114 | 18.4E | << iterators.size() << " iterators, caching disabled for this query"; |
115 | 23 | return ""; |
116 | 1.29k | } |
117 | | |
118 | | namespace { |
119 | | |
120 | 3 | bool is_nested_group_search_supported() { |
121 | 3 | auto provider = segment_v2::create_nested_group_read_provider(); |
122 | 3 | return provider != nullptr && provider->should_enable_nested_group_read_path(); |
123 | 3 | } |
124 | | |
125 | 5 | query_v2::QueryPtr make_unknown_query(uint32_t num_rows) { |
126 | 5 | auto null_bitmap = std::make_shared<roaring::Roaring>(); |
127 | 5 | if (num_rows > 0) { |
128 | 5 | null_bitmap->addRange(0, num_rows); |
129 | 5 | } |
130 | 5 | return std::make_shared<query_v2::BitSetQuery>(std::make_shared<roaring::Roaring>(), |
131 | 5 | std::move(null_bitmap)); |
132 | 5 | } |
133 | | |
134 | 2 | DataTypePtr unwrap_direct_index_value_type(DataTypePtr column_type) { |
135 | 2 | DataTypePtr value_type = remove_nullable(std::move(column_type)); |
136 | 5 | while (value_type != nullptr && |
137 | 5 | value_type->get_storage_field_type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { |
138 | 3 | const auto* array_type = dynamic_cast<const DataTypeArray*>(value_type.get()); |
139 | 3 | if (array_type == nullptr) { |
140 | 0 | return value_type; |
141 | 0 | } |
142 | 3 | value_type = remove_nullable(array_type->get_nested_type()); |
143 | 3 | } |
144 | 2 | return value_type; |
145 | 2 | } |
146 | | |
147 | | template <PrimitiveType primitive_type, typename CppType> |
148 | 1 | Status parse_integral_search_value(const std::string& value, Field* field) { |
149 | 1 | StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; |
150 | 1 | CppType parsed = |
151 | 1 | StringParser::string_to_int<CppType>(value.data(), value.size(), &parse_result); |
152 | 1 | if (parse_result != StringParser::PARSE_SUCCESS) { |
153 | 0 | return Status::InvalidArgument("failed to parse '{}' as {}", value, |
154 | 0 | type_to_string(primitive_type)); |
155 | 0 | } |
156 | 1 | *field = Field::create_field<primitive_type>(parsed); |
157 | 1 | return Status::OK(); |
158 | 1 | } Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE3EaEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE4EsEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE5EiEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE Line | Count | Source | 148 | 1 | Status parse_integral_search_value(const std::string& value, Field* field) { | 149 | 1 | StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; | 150 | 1 | CppType parsed = | 151 | 1 | StringParser::string_to_int<CppType>(value.data(), value.size(), &parse_result); | 152 | 1 | if (parse_result != StringParser::PARSE_SUCCESS) { | 153 | 0 | return Status::InvalidArgument("failed to parse '{}' as {}", value, | 154 | 0 | type_to_string(primitive_type)); | 155 | 0 | } | 156 | 1 | *field = Field::create_field<primitive_type>(parsed); | 157 | 1 | return Status::OK(); | 158 | 1 | } |
Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE6ElEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE7EnEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE |
159 | | |
160 | | Status parse_scalar_search_value(const DataTypePtr& column_type, const std::string& value, |
161 | 2 | Field* field) { |
162 | 2 | if (column_type == nullptr || field == nullptr) { |
163 | 0 | return Status::InvalidArgument("missing column type for scalar search value"); |
164 | 0 | } |
165 | | |
166 | 2 | switch (column_type->get_storage_field_type()) { |
167 | 1 | case FieldType::OLAP_FIELD_TYPE_BOOL: { |
168 | 1 | StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; |
169 | 1 | bool parsed = StringParser::string_to_bool(value.data(), value.size(), &parse_result); |
170 | 1 | if (parse_result != StringParser::PARSE_SUCCESS) { |
171 | 0 | return Status::InvalidArgument("failed to parse '{}' as bool", value); |
172 | 0 | } |
173 | 1 | *field = Field::create_field<TYPE_BOOLEAN>(parsed); |
174 | 1 | return Status::OK(); |
175 | 1 | } |
176 | 0 | case FieldType::OLAP_FIELD_TYPE_TINYINT: |
177 | 0 | return parse_integral_search_value<TYPE_TINYINT, Int8>(value, field); |
178 | 0 | case FieldType::OLAP_FIELD_TYPE_SMALLINT: |
179 | 0 | return parse_integral_search_value<TYPE_SMALLINT, Int16>(value, field); |
180 | 1 | case FieldType::OLAP_FIELD_TYPE_INT: |
181 | 1 | return parse_integral_search_value<TYPE_INT, Int32>(value, field); |
182 | 0 | case FieldType::OLAP_FIELD_TYPE_BIGINT: |
183 | 0 | return parse_integral_search_value<TYPE_BIGINT, Int64>(value, field); |
184 | 0 | case FieldType::OLAP_FIELD_TYPE_LARGEINT: |
185 | 0 | return parse_integral_search_value<TYPE_LARGEINT, Int128>(value, field); |
186 | 0 | case FieldType::OLAP_FIELD_TYPE_FLOAT: { |
187 | 0 | StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; |
188 | 0 | Float32 parsed = |
189 | 0 | StringParser::string_to_float<Float32>(value.data(), value.size(), &parse_result); |
190 | 0 | if (parse_result != StringParser::PARSE_SUCCESS) { |
191 | 0 | return Status::InvalidArgument("failed to parse '{}' as float", value); |
192 | 0 | } |
193 | 0 | *field = Field::create_field<TYPE_FLOAT>(parsed); |
194 | 0 | return Status::OK(); |
195 | 0 | } |
196 | 0 | case FieldType::OLAP_FIELD_TYPE_DOUBLE: { |
197 | 0 | StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; |
198 | 0 | Float64 parsed = |
199 | 0 | StringParser::string_to_float<Float64>(value.data(), value.size(), &parse_result); |
200 | 0 | if (parse_result != StringParser::PARSE_SUCCESS) { |
201 | 0 | return Status::InvalidArgument("failed to parse '{}' as double", value); |
202 | 0 | } |
203 | 0 | *field = Field::create_field<TYPE_DOUBLE>(parsed); |
204 | 0 | return Status::OK(); |
205 | 0 | } |
206 | 0 | default: |
207 | 0 | return Status::NotSupported("scalar search does not support storage field type {}", |
208 | 0 | static_cast<int>(column_type->get_storage_field_type())); |
209 | 2 | } |
210 | 2 | } |
211 | | |
212 | 3 | InvertedIndexQueryType direct_index_query_type_for_clause(const std::string& clause_type) { |
213 | 3 | if (clause_type == "TERM" || clause_type == "EXACT") { |
214 | 2 | return InvertedIndexQueryType::EQUAL_QUERY; |
215 | 2 | } |
216 | 1 | return InvertedIndexQueryType::UNKNOWN_QUERY; |
217 | 3 | } |
218 | | |
219 | | } // namespace |
220 | | |
221 | | Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/, |
222 | | const ColumnNumbers& /*arguments*/, uint32_t /*result*/, |
223 | 4 | size_t /*input_rows_count*/) const { |
224 | 4 | return Status::RuntimeError("only inverted index queries are supported"); |
225 | 4 | } |
226 | | |
227 | | // Enhanced implementation: Handle new parameter structure (DSL + SlotReferences) |
228 | | Status FunctionSearch::evaluate_inverted_index( |
229 | | const ColumnsWithTypeAndName& arguments, |
230 | | const std::vector<IndexFieldNameAndTypePair>& data_type_with_names, |
231 | | std::vector<IndexIterator*> iterators, uint32_t num_rows, |
232 | | const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/, |
233 | 1 | InvertedIndexResultBitmap& bitmap_result) const { |
234 | 1 | return Status::OK(); |
235 | 1 | } |
236 | | |
237 | | Status FunctionSearch::evaluate_inverted_index_with_search_param( |
238 | | const TSearchParam& search_param, |
239 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names, |
240 | | std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows, |
241 | 31 | InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const { |
242 | 31 | static const std::unordered_map<std::string, int> empty_field_to_column_id; |
243 | 31 | return evaluate_inverted_index_with_search_param( |
244 | 31 | search_param, data_type_with_names, std::move(iterators), num_rows, bitmap_result, |
245 | 31 | enable_cache, nullptr, empty_field_to_column_id); |
246 | 31 | } |
247 | | |
248 | | Status FunctionSearch::evaluate_inverted_index_with_search_param( |
249 | | const TSearchParam& search_param, |
250 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names, |
251 | | std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows, |
252 | | InvertedIndexResultBitmap& bitmap_result, bool enable_cache, |
253 | | const IndexExecContext* index_exec_ctx, |
254 | | const std::unordered_map<std::string, int>& field_name_to_column_id, |
255 | 1.31k | const std::shared_ptr<IndexQueryContext>& index_query_context) const { |
256 | 1.31k | const bool is_nested_query = search_param.root.clause_type == "NESTED"; |
257 | 1.31k | if (is_nested_query && !is_nested_group_search_supported()) { |
258 | 3 | return Status::NotSupported( |
259 | 3 | "NESTED query requires NestedGroup support, which is unavailable in this build"); |
260 | 3 | } |
261 | | |
262 | 1.31k | if (!is_nested_query && (iterators.empty() || data_type_with_names.empty())) { |
263 | 5 | LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:" |
264 | 5 | << search_param.original_dsl; |
265 | 5 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
266 | 5 | std::make_shared<roaring::Roaring>()); |
267 | 5 | return Status::OK(); |
268 | 5 | } |
269 | | |
270 | | // Track overall query time (equivalent to inverted_index_query_timer in MATCH path). |
271 | | // Must be declared before the DSL cache lookup so that cache-hit fast paths are |
272 | | // also covered by the timer. |
273 | 1.30k | int64_t query_timer_dummy = 0; |
274 | 1.30k | OlapReaderStatistics* outer_stats = index_query_context ? index_query_context->stats : nullptr; |
275 | 1.30k | SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_query_timer : &query_timer_dummy); |
276 | | |
277 | | // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type |
278 | 1.30k | auto* dsl_cache = enable_cache ? InvertedIndexQueryCache::instance() : nullptr; |
279 | 1.30k | std::string seg_prefix; |
280 | 1.30k | std::string dsl_sig; |
281 | 1.30k | InvertedIndexQueryCache::CacheKey dsl_cache_key; |
282 | 1.30k | bool cache_usable = false; |
283 | 1.30k | if (dsl_cache) { |
284 | 1.29k | seg_prefix = extract_segment_prefix(iterators); |
285 | 1.29k | dsl_sig = build_dsl_signature(search_param); |
286 | 1.29k | if (!seg_prefix.empty() && !dsl_sig.empty()) { |
287 | 1.26k | dsl_cache_key = InvertedIndexQueryCache::CacheKey { |
288 | 1.26k | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
289 | 1.26k | dsl_sig}; |
290 | 1.26k | cache_usable = true; |
291 | 1.26k | InvertedIndexQueryCacheHandle dsl_cache_handle; |
292 | 1.26k | bool dsl_hit = false; |
293 | 1.26k | { |
294 | 1.26k | int64_t lookup_dummy = 0; |
295 | 1.26k | SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_lookup_timer |
296 | 1.26k | : &lookup_dummy); |
297 | 1.26k | dsl_hit = dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle); |
298 | 1.26k | } |
299 | 1.26k | if (dsl_hit) { |
300 | 175 | auto cached_bitmap = dsl_cache_handle.get_bitmap(); |
301 | 175 | if (cached_bitmap) { |
302 | 174 | if (outer_stats) { |
303 | 174 | outer_stats->inverted_index_query_cache_hit++; |
304 | 174 | } |
305 | | // Also retrieve cached null bitmap for three-valued SQL logic |
306 | | // (needed by compound operators NOT, OR, AND in VCompoundPred) |
307 | 174 | auto null_cache_key = InvertedIndexQueryCache::CacheKey { |
308 | 174 | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
309 | 174 | dsl_sig + "__null"}; |
310 | 174 | InvertedIndexQueryCacheHandle null_cache_handle; |
311 | 174 | std::shared_ptr<roaring::Roaring> null_bitmap; |
312 | 175 | if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) { |
313 | 175 | null_bitmap = null_cache_handle.get_bitmap(); |
314 | 175 | } |
315 | 174 | if (!null_bitmap) { |
316 | 0 | null_bitmap = std::make_shared<roaring::Roaring>(); |
317 | 0 | } |
318 | 174 | bitmap_result = |
319 | 174 | InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap)); |
320 | 174 | return Status::OK(); |
321 | 174 | } |
322 | 175 | } |
323 | 1.10k | if (outer_stats) { |
324 | 1.10k | outer_stats->inverted_index_query_cache_miss++; |
325 | 1.10k | } |
326 | 1.09k | } |
327 | 1.29k | } |
328 | | |
329 | 1.13k | std::shared_ptr<IndexQueryContext> context; |
330 | 1.13k | if (index_query_context) { |
331 | 1.10k | context = index_query_context; |
332 | 1.10k | } else { |
333 | 31 | context = std::make_shared<IndexQueryContext>(); |
334 | 31 | context->collection_statistics = std::make_shared<CollectionStatistics>(); |
335 | 31 | context->collection_similarity = std::make_shared<CollectionSimilarity>(); |
336 | 31 | } |
337 | | |
338 | 1.13k | const auto* effective_data_type_with_names = &data_type_with_names; |
339 | | |
340 | | // Pass field_bindings to resolver for variant subcolumn detection |
341 | 1.13k | FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context, |
342 | 1.13k | search_param.field_bindings); |
343 | | |
344 | 1.13k | if (is_nested_query) { |
345 | 0 | std::shared_ptr<roaring::Roaring> row_bitmap; |
346 | 0 | VariantNestedSearchEvaluator nested_evaluator(*this); |
347 | 0 | RETURN_IF_ERROR(nested_evaluator.evaluate(search_param, search_param.root, context, |
348 | 0 | resolver, num_rows, index_exec_ctx, |
349 | 0 | field_name_to_column_id, row_bitmap)); |
350 | 0 | bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap), |
351 | 0 | std::make_shared<roaring::Roaring>()); |
352 | 0 | bitmap_result.mask_out_null(); |
353 | 0 | return Status::OK(); |
354 | 0 | } |
355 | | |
356 | | // Extract default_operator from TSearchParam (default: "or") |
357 | 1.13k | std::string default_operator = "or"; |
358 | 1.13k | if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { |
359 | 1.10k | default_operator = search_param.default_operator; |
360 | 1.10k | } |
361 | | // Extract minimum_should_match from TSearchParam (-1 means not set) |
362 | 1.13k | int32_t minimum_should_match = -1; |
363 | 1.13k | if (search_param.__isset.minimum_should_match) { |
364 | 48 | minimum_should_match = search_param.minimum_should_match; |
365 | 48 | } |
366 | | |
367 | 1.13k | auto* stats = context->stats; |
368 | 1.13k | int64_t dummy_timer = 0; |
369 | 1.13k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_timer : &dummy_timer); |
370 | | |
371 | 1.13k | query_v2::QueryPtr root_query; |
372 | 1.13k | std::string root_binding_key; |
373 | 1.13k | { |
374 | 1.13k | int64_t init_dummy = 0; |
375 | 1.13k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy); |
376 | 1.13k | RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query, |
377 | 1.13k | &root_binding_key, default_operator, |
378 | 1.13k | minimum_should_match, num_rows)); |
379 | 1.13k | } |
380 | 1.11k | if (root_query == nullptr) { |
381 | 0 | LOG(INFO) << "search: Query tree resolved to empty query, dsl:" |
382 | 0 | << search_param.original_dsl; |
383 | 0 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
384 | 0 | std::make_shared<roaring::Roaring>()); |
385 | 0 | return Status::OK(); |
386 | 0 | } |
387 | | |
388 | 1.11k | VariantSearchNullBitmapAdapter null_resolver(resolver); |
389 | 1.11k | query_v2::QueryExecutionContext exec_ctx = |
390 | 1.11k | build_variant_search_query_execution_context(num_rows, resolver, &null_resolver); |
391 | | |
392 | 1.11k | bool enable_scoring = false; |
393 | 1.11k | bool is_asc = false; |
394 | 1.11k | size_t top_k = 0; |
395 | 1.11k | if (index_query_context) { |
396 | 1.10k | enable_scoring = index_query_context->collection_similarity != nullptr; |
397 | 1.10k | is_asc = index_query_context->is_asc; |
398 | 1.10k | top_k = index_query_context->query_limit; |
399 | 1.10k | } |
400 | | |
401 | 1.11k | auto weight = root_query->weight(enable_scoring); |
402 | 1.11k | if (!weight) { |
403 | 0 | LOG(WARNING) << "search: Failed to build query weight"; |
404 | 0 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
405 | 0 | std::make_shared<roaring::Roaring>()); |
406 | 0 | return Status::OK(); |
407 | 0 | } |
408 | | |
409 | 1.11k | std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>(); |
410 | 1.11k | { |
411 | 1.11k | int64_t exec_dummy = 0; |
412 | 1.11k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_exec_timer : &exec_dummy); |
413 | 1.11k | if (enable_scoring && !is_asc && top_k > 0) { |
414 | 0 | bool use_wand = index_query_context->runtime_state != nullptr && |
415 | 0 | index_query_context->runtime_state->query_options() |
416 | 0 | .enable_inverted_index_wand_query; |
417 | 0 | query_v2::collect_multi_segment_top_k( |
418 | 0 | weight, exec_ctx, root_binding_key, top_k, roaring, |
419 | 0 | index_query_context->collection_similarity, use_wand); |
420 | 1.11k | } else { |
421 | 1.11k | query_v2::collect_multi_segment_doc_set( |
422 | 1.11k | weight, exec_ctx, root_binding_key, roaring, |
423 | 1.11k | index_query_context ? index_query_context->collection_similarity : nullptr, |
424 | 1.11k | enable_scoring); |
425 | 1.11k | } |
426 | 1.11k | } |
427 | | |
428 | 1.11k | VLOG_DEBUG << "search: Query completed, matched " << roaring->cardinality() << " documents"; |
429 | | |
430 | | // Extract NULL bitmap from three-valued logic scorer |
431 | | // The scorer correctly computes which documents evaluate to NULL based on query logic |
432 | | // For example: TRUE OR NULL = TRUE (not NULL), FALSE OR NULL = NULL |
433 | 1.11k | std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); |
434 | 1.11k | if (exec_ctx.null_resolver) { |
435 | 1.11k | auto scorer = weight->scorer(exec_ctx, root_binding_key); |
436 | 1.11k | if (scorer && scorer->has_null_bitmap(exec_ctx.null_resolver)) { |
437 | 222 | const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); |
438 | 222 | if (bitmap != nullptr) { |
439 | 222 | *null_bitmap = *bitmap; |
440 | 18.4E | VLOG_TRACE << "search: Extracted NULL bitmap with " << null_bitmap->cardinality() |
441 | 18.4E | << " NULL documents"; |
442 | 222 | } |
443 | 222 | } |
444 | 1.11k | } |
445 | | |
446 | 1.11k | VLOG_TRACE << "search: Before mask - true_bitmap=" << roaring->cardinality() |
447 | 1 | << ", null_bitmap=" << null_bitmap->cardinality(); |
448 | | |
449 | | // Create result and mask out NULLs (SQL WHERE clause semantics: only TRUE rows) |
450 | 1.11k | bitmap_result = InvertedIndexResultBitmap(std::move(roaring), std::move(null_bitmap)); |
451 | 1.11k | bitmap_result.mask_out_null(); |
452 | | |
453 | 1.11k | VLOG_TRACE << "search: After mask - result_bitmap=" |
454 | 3 | << bitmap_result.get_data_bitmap()->cardinality(); |
455 | | |
456 | | // Insert post-mask_out_null result into DSL cache for future reuse |
457 | | // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND) |
458 | | // can apply correct three-valued SQL logic on cache hit |
459 | 1.11k | if (dsl_cache && cache_usable) { |
460 | 1.10k | InvertedIndexQueryCacheHandle insert_handle; |
461 | 1.10k | dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle); |
462 | 1.10k | if (bitmap_result.get_null_bitmap()) { |
463 | 1.10k | auto null_cache_key = InvertedIndexQueryCache::CacheKey { |
464 | 1.10k | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
465 | 1.10k | dsl_sig + "__null"}; |
466 | 1.10k | InvertedIndexQueryCacheHandle null_insert_handle; |
467 | 1.10k | dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle); |
468 | 1.10k | } |
469 | 1.10k | } |
470 | | |
471 | 1.11k | return Status::OK(); |
472 | 1.11k | } |
473 | | |
474 | | // Aligned with FE QsClauseType enum - uses enum.name() as clause_type |
475 | | FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category( |
476 | 7.26k | const std::string& clause_type) const { |
477 | 7.26k | if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" || |
478 | 7.26k | clause_type == "OCCUR_BOOLEAN" || clause_type == "NESTED") { |
479 | 146 | return ClauseTypeCategory::COMPOUND; |
480 | 7.11k | } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" || |
481 | 7.11k | clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" || |
482 | 7.11k | clause_type == "EXACT") { |
483 | | // Non-tokenized queries: exact matching, pattern matching, range, list operations |
484 | 6.68k | return ClauseTypeCategory::NON_TOKENIZED; |
485 | 6.68k | } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" || |
486 | 437 | clause_type == "ALL") { |
487 | | // Tokenized queries: phrase search, full-text search, multi-value matching |
488 | | // Note: ANY and ALL require tokenization of their input values |
489 | 432 | return ClauseTypeCategory::TOKENIZED; |
490 | 432 | } else { |
491 | | // Default to NON_TOKENIZED for unknown types |
492 | 5 | LOG(WARNING) << "Unknown clause type '" << clause_type |
493 | 5 | << "', defaulting to NON_TOKENIZED category"; |
494 | 5 | return ClauseTypeCategory::NON_TOKENIZED; |
495 | 5 | } |
496 | 7.26k | } |
497 | | |
498 | | // Analyze query type for a specific field in the search clause |
499 | | InvertedIndexQueryType FunctionSearch::analyze_field_query_type(const std::string& field_name, |
500 | 5.28k | const TSearchClause& clause) const { |
501 | 5.28k | const std::string& clause_type = clause.clause_type; |
502 | 5.28k | ClauseTypeCategory category = get_clause_type_category(clause_type); |
503 | | |
504 | | // Handle leaf queries - use direct mapping |
505 | 5.28k | if (category != ClauseTypeCategory::COMPOUND) { |
506 | | // Check if this clause targets the specific field |
507 | 5.14k | if (clause.field_name == field_name) { |
508 | | // Use direct mapping from clause_type to InvertedIndexQueryType |
509 | 163 | return clause_type_to_query_type(clause_type); |
510 | 163 | } |
511 | 5.14k | } |
512 | | |
513 | | // Handle boolean queries - recursively analyze children |
514 | 5.11k | if (!clause.children.empty()) { |
515 | 5.09k | for (const auto& child_clause : clause.children) { |
516 | | // Recursively analyze each child |
517 | 5.09k | InvertedIndexQueryType child_type = analyze_field_query_type(field_name, child_clause); |
518 | | // If this child targets the field (not default EQUAL_QUERY), return its query type |
519 | 5.09k | if (child_type != InvertedIndexQueryType::UNKNOWN_QUERY) { |
520 | 124 | return child_type; |
521 | 124 | } |
522 | 5.09k | } |
523 | 132 | } |
524 | | |
525 | | // If no children target this field, return UNKNOWN_QUERY as default |
526 | 4.99k | return InvertedIndexQueryType::UNKNOWN_QUERY; |
527 | 5.11k | } |
528 | | |
529 | | // Map clause_type string to InvertedIndexQueryType |
530 | | InvertedIndexQueryType FunctionSearch::clause_type_to_query_type( |
531 | 2.17k | const std::string& clause_type) const { |
532 | | // Use static map for better performance and maintainability |
533 | 2.17k | static const std::unordered_map<std::string, InvertedIndexQueryType> clause_type_map = { |
534 | | // Boolean operations |
535 | 2.17k | {"AND", InvertedIndexQueryType::BOOLEAN_QUERY}, |
536 | 2.17k | {"OR", InvertedIndexQueryType::BOOLEAN_QUERY}, |
537 | 2.17k | {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY}, |
538 | 2.17k | {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY}, |
539 | 2.17k | {"NESTED", InvertedIndexQueryType::BOOLEAN_QUERY}, |
540 | | |
541 | | // Non-tokenized queries (exact matching, pattern matching) |
542 | 2.17k | {"TERM", InvertedIndexQueryType::EQUAL_QUERY}, |
543 | 2.17k | {"PREFIX", InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY}, |
544 | 2.17k | {"WILDCARD", InvertedIndexQueryType::WILDCARD_QUERY}, |
545 | 2.17k | {"REGEXP", InvertedIndexQueryType::MATCH_REGEXP_QUERY}, |
546 | 2.17k | {"RANGE", InvertedIndexQueryType::RANGE_QUERY}, |
547 | 2.17k | {"LIST", InvertedIndexQueryType::LIST_QUERY}, |
548 | | |
549 | | // Tokenized queries (full-text search, phrase search) |
550 | 2.17k | {"PHRASE", InvertedIndexQueryType::MATCH_PHRASE_QUERY}, |
551 | 2.17k | {"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY}, |
552 | 2.17k | {"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY}, |
553 | 2.17k | {"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY}, |
554 | | |
555 | | // Exact match without tokenization |
556 | 2.17k | {"EXACT", InvertedIndexQueryType::EQUAL_QUERY}, |
557 | 2.17k | }; |
558 | | |
559 | 2.17k | auto it = clause_type_map.find(clause_type); |
560 | 2.17k | if (it != clause_type_map.end()) { |
561 | 2.16k | return it->second; |
562 | 2.16k | } |
563 | | |
564 | | // Unknown clause type |
565 | 2.17k | LOG(WARNING) << "Unknown clause type '" << clause_type << "', defaulting to EQUAL_QUERY"; |
566 | 7 | return InvertedIndexQueryType::EQUAL_QUERY; |
567 | 2.17k | } |
568 | | |
569 | | // Map Thrift TSearchOccur to query_v2::Occur |
570 | 903 | static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) { |
571 | 903 | switch (thrift_occur) { |
572 | 303 | case TSearchOccur::MUST: |
573 | 303 | return query_v2::Occur::MUST; |
574 | 534 | case TSearchOccur::SHOULD: |
575 | 534 | return query_v2::Occur::SHOULD; |
576 | 67 | case TSearchOccur::MUST_NOT: |
577 | 67 | return query_v2::Occur::MUST_NOT; |
578 | 0 | default: |
579 | 0 | return query_v2::Occur::MUST; |
580 | 903 | } |
581 | 903 | } |
582 | | |
583 | | Status FunctionSearch::build_query_recursive( |
584 | | const TSearchClause& clause, const std::shared_ptr<IndexQueryContext>& context, |
585 | | FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, |
586 | | std::string* binding_key, const std::string& default_operator, int32_t minimum_should_match, |
587 | 2.81k | uint32_t num_rows) const { |
588 | 2.81k | DCHECK(out != nullptr); |
589 | 2.81k | *out = nullptr; |
590 | 2.83k | if (binding_key) { |
591 | 2.83k | binding_key->clear(); |
592 | 2.83k | } |
593 | | |
594 | 2.81k | const std::string& clause_type = clause.clause_type; |
595 | | |
596 | | // Handle MATCH_ALL_DOCS - matches all documents in the segment |
597 | 2.81k | if (clause_type == "MATCH_ALL_DOCS") { |
598 | 44 | *out = std::make_shared<query_v2::AllQuery>(); |
599 | 44 | return Status::OK(); |
600 | 44 | } |
601 | | |
602 | | // Handle OCCUR_BOOLEAN - Lucene-style boolean query with MUST/SHOULD/MUST_NOT |
603 | 2.77k | if (clause_type == "OCCUR_BOOLEAN") { |
604 | 439 | auto builder = segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); |
605 | | |
606 | | // Set minimum_should_match if specified |
607 | 439 | if (clause.__isset.minimum_should_match) { |
608 | 423 | builder->set_minimum_number_should_match(clause.minimum_should_match); |
609 | 423 | } |
610 | | |
611 | 439 | if (clause.__isset.children) { |
612 | 899 | for (const auto& child_clause : clause.children) { |
613 | 899 | query_v2::QueryPtr child_query; |
614 | 899 | std::string child_binding_key; |
615 | 899 | RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, |
616 | 899 | &child_binding_key, default_operator, |
617 | 899 | minimum_should_match, num_rows)); |
618 | | |
619 | | // Determine occur type from child clause |
620 | 898 | query_v2::Occur occur = query_v2::Occur::MUST; // default |
621 | 903 | if (child_clause.__isset.occur) { |
622 | 903 | occur = map_thrift_occur(child_clause.occur); |
623 | 903 | } |
624 | | |
625 | 898 | builder->add(child_query, occur, std::move(child_binding_key)); |
626 | 898 | } |
627 | 435 | } |
628 | | |
629 | 438 | *out = builder->build(); |
630 | 438 | return Status::OK(); |
631 | 439 | } |
632 | | |
633 | 2.33k | if (clause_type == "NESTED") { |
634 | 1 | return Status::InvalidArgument("NESTED clause must be evaluated at top level"); |
635 | 1 | } |
636 | | |
637 | | // Handle standard boolean operators (AND/OR/NOT) |
638 | 2.33k | if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") { |
639 | 377 | query_v2::OperatorType op = query_v2::OperatorType::OP_AND; |
640 | 377 | if (clause_type == "OR") { |
641 | 199 | op = query_v2::OperatorType::OP_OR; |
642 | 199 | } else if (clause_type == "NOT") { |
643 | 88 | op = query_v2::OperatorType::OP_NOT; |
644 | 88 | } |
645 | | |
646 | 377 | auto builder = create_operator_boolean_query_builder(op); |
647 | 379 | if (clause.__isset.children) { |
648 | 803 | for (const auto& child_clause : clause.children) { |
649 | 803 | query_v2::QueryPtr child_query; |
650 | 803 | std::string child_binding_key; |
651 | 803 | RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, |
652 | 803 | &child_binding_key, default_operator, |
653 | 803 | minimum_should_match, num_rows)); |
654 | | // Add all children including empty BitSetQuery |
655 | | // BooleanQuery will handle the logic: |
656 | | // - AND with empty bitmap → result is empty |
657 | | // - OR with empty bitmap → empty bitmap is ignored by OR logic |
658 | | // - NOT with empty bitmap → NOT(empty) = all rows (handled by BooleanQuery) |
659 | 802 | builder->add(child_query, std::move(child_binding_key)); |
660 | 802 | } |
661 | 379 | } |
662 | | |
663 | 376 | *out = builder->build(); |
664 | 376 | return Status::OK(); |
665 | 377 | } |
666 | | |
667 | 1.95k | return build_leaf_query(clause, context, resolver, out, binding_key, default_operator, |
668 | 1.95k | minimum_should_match, num_rows); |
669 | 2.33k | } |
670 | | |
671 | | Status FunctionSearch::build_leaf_query(const TSearchClause& clause, |
672 | | const std::shared_ptr<IndexQueryContext>& context, |
673 | | FieldReaderResolver& resolver, |
674 | | inverted_index::query_v2::QueryPtr* out, |
675 | | std::string* binding_key, |
676 | | const std::string& default_operator, |
677 | 1.97k | int32_t minimum_should_match, uint32_t num_rows) const { |
678 | 1.97k | DCHECK(out != nullptr); |
679 | 1.97k | *out = nullptr; |
680 | 1.97k | if (binding_key) { |
681 | 1.97k | binding_key->clear(); |
682 | 1.97k | } |
683 | | |
684 | 1.97k | if (!clause.__isset.field_name || !clause.__isset.value) { |
685 | 0 | return Status::InvalidArgument("search clause missing field_name or value"); |
686 | 0 | } |
687 | | |
688 | 1.97k | const std::string& field_name = clause.field_name; |
689 | 1.97k | const std::string& value = clause.value; |
690 | 1.97k | const std::string& clause_type = clause.clause_type; |
691 | | |
692 | 1.97k | auto query_type = clause_type_to_query_type(clause_type); |
693 | | // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual index terms |
694 | | // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery). |
695 | | // Override to MATCH_ANY_QUERY so select_best_reader() prefers the FULLTEXT reader |
696 | | // when multiple indexes exist on the same column (one tokenized, one untokenized). |
697 | | // Without this, these queries would select the untokenized index and try to match |
698 | | // patterns like "h*llo" against full strings ("hello world") instead of individual |
699 | | // tokens ("hello"), returning empty results. |
700 | | // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE reader. |
701 | | // |
702 | | // Safe for single-index columns: select_best_reader() has a single-reader fast path |
703 | | // that returns the only reader directly, bypassing the query_type preference logic. |
704 | 1.97k | if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type == "PREFIX" || |
705 | 1.97k | clause_type == "REGEXP") { |
706 | 1.47k | query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; |
707 | 1.47k | } |
708 | | |
709 | 1.97k | auto finish_leaf_query = [&](query_v2::QueryPtr query) -> Status { |
710 | 1.94k | *out = std::move(query); |
711 | 1.94k | return resolver.map_leaf_query(field_name, out); |
712 | 1.94k | }; |
713 | | |
714 | 1.97k | FieldReaderBinding binding; |
715 | 1.97k | RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding)); |
716 | | |
717 | 1.95k | if (!binding.is_bound()) { |
718 | 4 | LOG(INFO) << "search: No inverted index for field '" << field_name |
719 | 4 | << "' in this segment, clause_type='" << clause_type |
720 | 4 | << "', query_type=" << static_cast<int>(query_type) |
721 | 4 | << ", returning UNKNOWN bitmap"; |
722 | 4 | if (binding_key) { |
723 | 4 | binding_key->clear(); |
724 | 4 | } |
725 | 4 | return finish_leaf_query(make_unknown_query(num_rows)); |
726 | 4 | } |
727 | | |
728 | 1.94k | if (binding_key) { |
729 | 1.94k | *binding_key = binding.binding_key; |
730 | 1.94k | } |
731 | | |
732 | 1.94k | if (binding.use_direct_index_reader()) { |
733 | 3 | auto direct_query_type = direct_index_query_type_for_clause(clause_type); |
734 | 3 | if (direct_query_type == InvertedIndexQueryType::UNKNOWN_QUERY) { |
735 | 1 | return finish_leaf_query(make_unknown_query(num_rows)); |
736 | 1 | } |
737 | | |
738 | 2 | auto value_type = unwrap_direct_index_value_type(binding.column_type); |
739 | 2 | Field param_value; |
740 | 2 | auto parse_status = parse_scalar_search_value(value_type, value, ¶m_value); |
741 | 2 | if (!parse_status.ok()) { |
742 | 0 | LOG(INFO) << "search: scalar leaf value is unsupported, field=" << field_name |
743 | 0 | << ", value='" << value << "', reason=" << parse_status.to_string(); |
744 | 0 | return finish_leaf_query(make_unknown_query(num_rows)); |
745 | 0 | } |
746 | | |
747 | 2 | auto* iterator = resolver.get_iterator(field_name); |
748 | 2 | if (iterator == nullptr) { |
749 | 0 | return finish_leaf_query(make_unknown_query(num_rows)); |
750 | 0 | } |
751 | | |
752 | 2 | segment_v2::InvertedIndexParam param; |
753 | 2 | param.column_name = binding.stored_field_name; |
754 | 2 | param.column_type = value_type; |
755 | 2 | param.query_value = param_value; |
756 | 2 | param.query_type = direct_query_type; |
757 | 2 | param.num_rows = num_rows; |
758 | 2 | param.roaring = std::make_shared<roaring::Roaring>(); |
759 | 2 | RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam {¶m})); |
760 | | |
761 | 2 | std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); |
762 | 2 | auto has_null = iterator->has_null(); |
763 | 2 | if (has_null.has_value() && has_null.value()) { |
764 | 0 | segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle; |
765 | 0 | RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap_cache_handle)); |
766 | 0 | if (auto bitmap = null_bitmap_cache_handle.get_bitmap(); bitmap != nullptr) { |
767 | 0 | null_bitmap = bitmap; |
768 | 0 | } |
769 | 0 | } |
770 | 2 | return finish_leaf_query(std::make_shared<query_v2::BitSetQuery>(std::move(param.roaring), |
771 | 2 | std::move(null_bitmap))); |
772 | 2 | } |
773 | | |
774 | 1.94k | if (binding.lucene_reader == nullptr) { |
775 | 0 | return finish_leaf_query(make_unknown_query(num_rows)); |
776 | 0 | } |
777 | | |
778 | 1.94k | FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type); |
779 | 1.94k | std::wstring field_wstr = binding.stored_field_wstr; |
780 | 1.94k | std::wstring value_wstr = StringHelper::to_wstring(value); |
781 | | |
782 | 1.96k | auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr { |
783 | 1.96k | return std::make_shared<query_v2::TermQuery>(context, field_wstr, term); |
784 | 1.96k | }; |
785 | | |
786 | 1.94k | if (clause_type == "TERM") { |
787 | 1.37k | bool should_analyze = |
788 | 1.37k | inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties); |
789 | 1.37k | if (should_analyze) { |
790 | 1.10k | if (binding.index_properties.empty()) { |
791 | 0 | LOG(WARNING) << "search: analyzer required but index properties empty for field '" |
792 | 0 | << field_name << "'"; |
793 | 0 | return finish_leaf_query(make_term_query(value_wstr)); |
794 | 0 | } |
795 | | |
796 | 1.10k | std::vector<TermInfo> term_infos = |
797 | 1.10k | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
798 | 1.10k | value, binding.index_properties); |
799 | 1.10k | if (term_infos.empty()) { |
800 | 0 | LOG(WARNING) << "search: No terms found after tokenization for TERM query, field=" |
801 | 0 | << field_name << ", value='" << value |
802 | 0 | << "', returning empty BitSetQuery"; |
803 | 0 | return finish_leaf_query( |
804 | 0 | std::make_shared<query_v2::BitSetQuery>(roaring::Roaring())); |
805 | 0 | } |
806 | | |
807 | 1.10k | if (term_infos.size() == 1) { |
808 | 1.09k | std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); |
809 | 1.09k | return finish_leaf_query(make_term_query(term_wstr)); |
810 | 1.09k | } |
811 | | |
812 | | // When minimum_should_match is specified, use OccurBooleanQuery |
813 | | // ES behavior: msm only applies to SHOULD clauses |
814 | 16 | if (minimum_should_match > 0) { |
815 | 0 | auto builder = |
816 | 0 | segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); |
817 | 0 | builder->set_minimum_number_should_match(minimum_should_match); |
818 | 0 | query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST |
819 | 0 | : query_v2::Occur::SHOULD; |
820 | 0 | for (const auto& term_info : term_infos) { |
821 | 0 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
822 | 0 | builder->add(make_term_query(term_wstr), occur); |
823 | 0 | } |
824 | 0 | return finish_leaf_query(builder->build()); |
825 | 0 | } |
826 | | |
827 | | // Use default_operator to determine how to combine tokenized terms |
828 | 16 | query_v2::OperatorType op_type = (default_operator == "and") |
829 | 16 | ? query_v2::OperatorType::OP_AND |
830 | 16 | : query_v2::OperatorType::OP_OR; |
831 | 16 | auto builder = create_operator_boolean_query_builder(op_type); |
832 | 19 | for (const auto& term_info : term_infos) { |
833 | 19 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
834 | 19 | builder->add(make_term_query(term_wstr), binding.binding_key); |
835 | 19 | } |
836 | | |
837 | 16 | return finish_leaf_query(builder->build()); |
838 | 16 | } |
839 | | |
840 | 261 | return finish_leaf_query(make_term_query(value_wstr)); |
841 | 1.37k | } |
842 | | |
843 | 574 | if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) { |
844 | 410 | if (clause_type == "PHRASE") { |
845 | 123 | bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
846 | 123 | binding.index_properties); |
847 | 123 | if (!should_analyze) { |
848 | 11 | VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name |
849 | 0 | << "', falling back to TERM"; |
850 | 11 | return finish_leaf_query(make_term_query(value_wstr)); |
851 | 11 | } |
852 | | |
853 | 112 | if (binding.index_properties.empty()) { |
854 | 0 | LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE " |
855 | 0 | "query on field '" |
856 | 0 | << field_name << "'"; |
857 | 0 | return finish_leaf_query(make_term_query(value_wstr)); |
858 | 0 | } |
859 | | |
860 | 112 | std::vector<TermInfo> term_infos = |
861 | 112 | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
862 | 112 | value, binding.index_properties); |
863 | 112 | if (term_infos.empty()) { |
864 | 9 | LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field=" |
865 | 9 | << field_name << ", value='" << value |
866 | 9 | << "', returning empty BitSetQuery"; |
867 | 9 | return finish_leaf_query( |
868 | 9 | std::make_shared<query_v2::BitSetQuery>(roaring::Roaring())); |
869 | 9 | } |
870 | | |
871 | 103 | std::vector<TermInfo> phrase_term_infos = |
872 | 103 | QueryHelper::build_phrase_term_infos(term_infos); |
873 | 103 | if (phrase_term_infos.size() == 1) { |
874 | 63 | const auto& term_info = phrase_term_infos[0]; |
875 | 63 | if (term_info.is_single_term()) { |
876 | 63 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
877 | 63 | return finish_leaf_query( |
878 | 63 | std::make_shared<query_v2::TermQuery>(context, field_wstr, term_wstr)); |
879 | 63 | } else { |
880 | 0 | auto builder = |
881 | 0 | create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR); |
882 | 0 | for (const auto& term : term_info.get_multi_terms()) { |
883 | 0 | std::wstring term_wstr = StringHelper::to_wstring(term); |
884 | 0 | builder->add(make_term_query(term_wstr), binding.binding_key); |
885 | 0 | } |
886 | 0 | return finish_leaf_query(builder->build()); |
887 | 0 | } |
888 | 63 | } else { |
889 | 40 | if (QueryHelper::is_simple_phrase(phrase_term_infos)) { |
890 | 21 | return finish_leaf_query(std::make_shared<query_v2::PhraseQuery>( |
891 | 21 | context, field_wstr, phrase_term_infos)); |
892 | 21 | } else { |
893 | 19 | return finish_leaf_query(std::make_shared<query_v2::MultiPhraseQuery>( |
894 | 19 | context, field_wstr, phrase_term_infos)); |
895 | 19 | } |
896 | 40 | } |
897 | | |
898 | 0 | return Status::OK(); |
899 | 103 | } |
900 | 287 | if (clause_type == "MATCH") { |
901 | 0 | VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM"; |
902 | 0 | return finish_leaf_query(make_term_query(value_wstr)); |
903 | 0 | } |
904 | | |
905 | 287 | if (clause_type == "ANY" || clause_type == "ALL") { |
906 | 286 | bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
907 | 286 | binding.index_properties); |
908 | 286 | if (!should_analyze) { |
909 | 1 | return finish_leaf_query(make_term_query(value_wstr)); |
910 | 1 | } |
911 | | |
912 | 285 | if (binding.index_properties.empty()) { |
913 | 0 | LOG(WARNING) << "search: index properties empty for tokenized clause '" |
914 | 0 | << clause_type << "' field=" << field_name; |
915 | 0 | return finish_leaf_query(make_term_query(value_wstr)); |
916 | 0 | } |
917 | | |
918 | 285 | std::vector<TermInfo> term_infos = |
919 | 285 | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
920 | 285 | value, binding.index_properties); |
921 | 285 | if (term_infos.empty()) { |
922 | 0 | LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type |
923 | 0 | << "', field=" << field_name << ", returning empty BitSetQuery"; |
924 | 0 | return finish_leaf_query( |
925 | 0 | std::make_shared<query_v2::BitSetQuery>(roaring::Roaring())); |
926 | 0 | } |
927 | | |
928 | 285 | query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR; |
929 | 285 | if (clause_type == "ALL") { |
930 | 194 | bool_type = query_v2::OperatorType::OP_AND; |
931 | 194 | } |
932 | | |
933 | 285 | if (term_infos.size() == 1) { |
934 | 109 | std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); |
935 | 109 | return finish_leaf_query(make_term_query(term_wstr)); |
936 | 109 | } |
937 | | |
938 | 176 | auto builder = create_operator_boolean_query_builder(bool_type); |
939 | 402 | for (const auto& term_info : term_infos) { |
940 | 402 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
941 | 402 | builder->add(make_term_query(term_wstr), binding.binding_key); |
942 | 402 | } |
943 | 176 | return finish_leaf_query(builder->build()); |
944 | 285 | } |
945 | | |
946 | | // Default tokenized clause fallback |
947 | 1 | return finish_leaf_query(make_term_query(value_wstr)); |
948 | 287 | } |
949 | | |
950 | 173 | if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) { |
951 | 173 | if (clause_type == "EXACT") { |
952 | | // EXACT match: exact string matching without tokenization |
953 | | // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase |
954 | | // If only tokenized index exists, EXACT may return empty results because |
955 | | // tokenized indexes store individual tokens, not complete strings |
956 | 82 | VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='" |
957 | 0 | << value << "'"; |
958 | 82 | return finish_leaf_query(make_term_query(value_wstr)); |
959 | 82 | } |
960 | 91 | if (clause_type == "PREFIX") { |
961 | | // Apply lowercase only if: |
962 | | // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) |
963 | | // 2. lower_case is explicitly set to "true" |
964 | 36 | bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
965 | 36 | binding.index_properties); |
966 | 36 | std::string lowercase_setting = |
967 | 36 | get_parser_lowercase_from_properties(binding.index_properties); |
968 | 36 | bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); |
969 | 36 | std::string pattern = should_lowercase ? to_lower(value) : value; |
970 | 36 | VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" |
971 | 0 | << pattern << "' (original='" << value << "', has_parser=" << has_parser |
972 | 0 | << ", lower_case=" << lowercase_setting << ")"; |
973 | 36 | return finish_leaf_query( |
974 | 36 | std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern)); |
975 | 36 | } |
976 | | |
977 | 55 | if (clause_type == "WILDCARD") { |
978 | | // Standalone wildcard "*" matches all non-null values for this field |
979 | | // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery |
980 | 23 | if (value == "*") { |
981 | 0 | VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field=" |
982 | 0 | << field_name; |
983 | 0 | return finish_leaf_query(std::make_shared<query_v2::AllQuery>(field_wstr, true)); |
984 | 0 | } |
985 | | // Apply lowercase only if: |
986 | | // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) |
987 | | // 2. lower_case is explicitly set to "true" |
988 | 23 | bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
989 | 23 | binding.index_properties); |
990 | 23 | std::string lowercase_setting = |
991 | 23 | get_parser_lowercase_from_properties(binding.index_properties); |
992 | 23 | bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); |
993 | 23 | std::string pattern = should_lowercase ? to_lower(value) : value; |
994 | 23 | VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='" |
995 | 0 | << pattern << "' (original='" << value << "', has_parser=" << has_parser |
996 | 0 | << ", lower_case=" << lowercase_setting << ")"; |
997 | 23 | return finish_leaf_query( |
998 | 23 | std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern)); |
999 | 23 | } |
1000 | | |
1001 | 32 | if (clause_type == "REGEXP") { |
1002 | | // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching) |
1003 | | // This matches ES query_string behavior where regex patterns bypass analysis |
1004 | 29 | VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" |
1005 | 0 | << value << "'"; |
1006 | 29 | return finish_leaf_query( |
1007 | 29 | std::make_shared<query_v2::RegexpQuery>(context, field_wstr, value)); |
1008 | 29 | } |
1009 | | |
1010 | 3 | if (clause_type == "RANGE" || clause_type == "LIST") { |
1011 | 3 | VLOG_DEBUG << "search: clause type '" << clause_type |
1012 | 0 | << "' not implemented, fallback to TERM"; |
1013 | 3 | } |
1014 | 3 | return finish_leaf_query(make_term_query(value_wstr)); |
1015 | 32 | } |
1016 | | |
1017 | 18.4E | LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback"; |
1018 | 18.4E | return finish_leaf_query(make_term_query(value_wstr)); |
1019 | 164 | } |
1020 | | |
1021 | 8 | void register_function_search(SimpleFunctionFactory& factory) { |
1022 | 8 | factory.register_function<FunctionSearch>(); |
1023 | 8 | } |
1024 | | |
1025 | | } // namespace doris |