be/src/exprs/function/function_search.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/function/function_search.h" |
19 | | |
20 | | #include <CLucene/config/repl_wchar.h> |
21 | | #include <CLucene/search/Scorer.h> |
22 | | #include <gen_cpp/Exprs_types.h> |
23 | | #include <glog/logging.h> |
24 | | |
25 | | #include <limits> |
26 | | #include <memory> |
27 | | #include <roaring/roaring.hh> |
28 | | #include <set> |
29 | | #include <string> |
30 | | #include <unordered_map> |
31 | | #include <unordered_set> |
32 | | #include <vector> |
33 | | |
34 | | #include "common/status.h" |
35 | | #include "core/block/columns_with_type_and_name.h" |
36 | | #include "core/column/column_const.h" |
37 | | #include "core/data_type/data_type_string.h" |
38 | | #include "exprs/function/simple_function_factory.h" |
39 | | #include "exprs/vexpr_context.h" |
40 | | #include "runtime/runtime_profile.h" |
41 | | #include "storage/index/index_file_reader.h" |
42 | | #include "storage/index/index_query_context.h" |
43 | | #include "storage/index/inverted/analyzer/analyzer.h" |
44 | | #include "storage/index/inverted/inverted_index_compound_reader.h" |
45 | | #include "storage/index/inverted/inverted_index_iterator.h" |
46 | | #include "storage/index/inverted/inverted_index_parser.h" |
47 | | #include "storage/index/inverted/inverted_index_reader.h" |
48 | | #include "storage/index/inverted/inverted_index_searcher.h" |
49 | | #include "storage/index/inverted/query/query_helper.h" |
50 | | #include "storage/index/inverted/query_v2/all_query/all_query.h" |
51 | | #include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h" |
52 | | #include "storage/index/inverted/query_v2/boolean_query/boolean_query_builder.h" |
53 | | #include "storage/index/inverted/query_v2/boolean_query/operator.h" |
54 | | #include "storage/index/inverted/query_v2/collect/doc_set_collector.h" |
55 | | #include "storage/index/inverted/query_v2/collect/top_k_collector.h" |
56 | | #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h" |
57 | | #include "storage/index/inverted/query_v2/phrase_query/phrase_query.h" |
58 | | #include "storage/index/inverted/query_v2/regexp_query/regexp_query.h" |
59 | | #include "storage/index/inverted/query_v2/term_query/term_query.h" |
60 | | #include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h" |
61 | | #include "storage/index/inverted/util/string_helper.h" |
62 | | #include "storage/segment/segment.h" |
63 | | #include "storage/segment/variant/nested_group_path.h" |
64 | | #include "storage/segment/variant/nested_group_provider.h" |
65 | | #include "storage/segment/variant/variant_column_reader.h" |
66 | | #include "storage/types.h" |
67 | | #include "util/debug_points.h" |
68 | | #include "util/string_util.h" |
69 | | #include "util/thrift_util.h" |
70 | | |
71 | | namespace doris { |
72 | | |
73 | | // Build canonical DSL signature for cache key. |
74 | | // Serializes the entire TSearchParam via Thrift binary protocol so that |
75 | | // every field (DSL, AST root, field bindings, default_operator, |
76 | | // minimum_should_match, etc.) is included automatically. |
77 | 1.26k | static std::string build_dsl_signature(const TSearchParam& param) { |
78 | 1.26k | ThriftSerializer ser(false, 1024); |
79 | 1.26k | TSearchParam copy = param; |
80 | 1.26k | std::string sig; |
81 | 1.26k | auto st = ser.serialize(&copy, &sig); |
82 | 1.26k | if (UNLIKELY(!st.ok())) { |
83 | 0 | LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string() |
84 | 0 | << ", caching disabled for this query"; |
85 | 0 | return ""; |
86 | 0 | } |
87 | 1.26k | return sig; |
88 | 1.26k | } |
89 | | |
90 | | // Extract segment path prefix from the first available inverted index iterator. |
91 | | // All fields in the same segment share the same path prefix. |
92 | | static std::string extract_segment_prefix( |
93 | 1.23k | const std::unordered_map<std::string, IndexIterator*>& iterators) { |
94 | 1.24k | for (const auto& [field_name, iter] : iterators) { |
95 | 1.24k | auto* inv_iter = dynamic_cast<InvertedIndexIterator*>(iter); |
96 | 1.24k | if (!inv_iter) continue; |
97 | | // Try fulltext reader first, then string type |
98 | 1.21k | for (auto type : |
99 | 1.52k | {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) { |
100 | 1.52k | IndexReaderType reader_type = type; |
101 | 1.52k | auto reader = inv_iter->get_reader(reader_type); |
102 | 1.52k | if (!reader) continue; |
103 | 1.21k | auto inv_reader = std::dynamic_pointer_cast<InvertedIndexReader>(reader); |
104 | 1.21k | if (!inv_reader) continue; |
105 | 1.21k | auto file_reader = inv_reader->get_index_file_reader(); |
106 | 1.21k | if (!file_reader) continue; |
107 | 1.21k | return file_reader->get_index_path_prefix(); |
108 | 1.21k | } |
109 | 1.21k | } |
110 | 25 | VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across " |
111 | 0 | << iterators.size() << " iterators, caching disabled for this query"; |
112 | 25 | return ""; |
113 | 1.23k | } |
114 | | |
115 | | namespace { |
116 | | |
117 | 3 | bool is_nested_group_search_supported() { |
118 | 3 | auto provider = segment_v2::create_nested_group_read_provider(); |
119 | 3 | return provider != nullptr && provider->should_enable_nested_group_read_path(); |
120 | 3 | } |
121 | | |
122 | | class ResolverNullBitmapAdapter final : public query_v2::NullBitmapResolver { |
123 | | public: |
124 | 1.07k | explicit ResolverNullBitmapAdapter(const FieldReaderResolver& resolver) : _resolver(resolver) {} |
125 | | |
126 | | segment_v2::IndexIterator* iterator_for(const query_v2::Scorer& /*scorer*/, |
127 | 2.76k | const std::string& logical_field) const override { |
128 | 2.76k | if (logical_field.empty()) { |
129 | 0 | return nullptr; |
130 | 0 | } |
131 | 2.76k | return _resolver.get_iterator(logical_field); |
132 | 2.76k | } |
133 | | |
134 | | private: |
135 | | const FieldReaderResolver& _resolver; |
136 | | }; |
137 | | |
138 | | void populate_binding_context(const FieldReaderResolver& resolver, |
139 | 1.06k | query_v2::QueryExecutionContext* exec_ctx) { |
140 | 1.06k | DCHECK(exec_ctx != nullptr); |
141 | 1.06k | exec_ctx->readers = resolver.readers(); |
142 | 1.06k | exec_ctx->reader_bindings = resolver.reader_bindings(); |
143 | 1.06k | exec_ctx->field_reader_bindings = resolver.field_readers(); |
144 | 1.56k | for (const auto& [binding_key, binding] : resolver.binding_cache()) { |
145 | 1.56k | if (binding_key.empty()) { |
146 | 0 | continue; |
147 | 0 | } |
148 | 1.56k | query_v2::FieldBindingContext binding_ctx; |
149 | 1.56k | binding_ctx.logical_field_name = binding.logical_field_name; |
150 | 1.56k | binding_ctx.stored_field_name = binding.stored_field_name; |
151 | 1.56k | binding_ctx.stored_field_wstr = binding.stored_field_wstr; |
152 | 1.56k | exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx)); |
153 | 1.56k | } |
154 | 1.06k | } |
155 | | |
156 | | query_v2::QueryExecutionContext build_query_execution_context( |
157 | | uint32_t segment_num_rows, const FieldReaderResolver& resolver, |
158 | 1.05k | query_v2::NullBitmapResolver* null_resolver) { |
159 | 1.05k | query_v2::QueryExecutionContext exec_ctx; |
160 | 1.05k | exec_ctx.segment_num_rows = segment_num_rows; |
161 | 1.05k | populate_binding_context(resolver, &exec_ctx); |
162 | 1.05k | exec_ctx.null_resolver = null_resolver; |
163 | 1.05k | return exec_ctx; |
164 | 1.05k | } |
165 | | |
166 | | } // namespace |
167 | | |
168 | | Status FieldReaderResolver::resolve(const std::string& field_name, |
169 | | InvertedIndexQueryType query_type, |
170 | 1.92k | FieldReaderBinding* binding) { |
171 | 1.92k | DCHECK(binding != nullptr); |
172 | | |
173 | | // Check if this is a variant subcolumn |
174 | 1.92k | bool is_variant_sub = is_variant_subcolumn(field_name); |
175 | | |
176 | 1.92k | auto data_it = _data_type_with_names.find(field_name); |
177 | 1.92k | if (data_it == _data_type_with_names.end()) { |
178 | | // For variant subcolumns, not finding the index is normal (the subcolumn may not exist in this segment) |
179 | | // Return OK but with null binding to signal "no match" |
180 | 8 | if (is_variant_sub) { |
181 | 3 | VLOG_DEBUG << "Variant subcolumn '" << field_name |
182 | 0 | << "' not found in this segment, treating as no match"; |
183 | 3 | *binding = FieldReaderBinding(); |
184 | 3 | return Status::OK(); |
185 | 3 | } |
186 | | // For normal fields, this is an error |
187 | 5 | return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( |
188 | 5 | "field '{}' not found in inverted index metadata", field_name); |
189 | 8 | } |
190 | | |
191 | 1.91k | const auto& stored_field_name = data_it->second.first; |
192 | 1.91k | const auto binding_key = binding_key_for(stored_field_name, query_type); |
193 | | |
194 | 1.91k | auto cache_it = _cache.find(binding_key); |
195 | 1.91k | if (cache_it != _cache.end()) { |
196 | 303 | *binding = cache_it->second; |
197 | 303 | if (_context->stats) { |
198 | 302 | _context->stats->inverted_index_searcher_cache_hit++; |
199 | 302 | } |
200 | 303 | return Status::OK(); |
201 | 303 | } |
202 | | |
203 | 1.61k | auto iterator_it = _iterators.find(field_name); |
204 | 1.61k | if (iterator_it == _iterators.end() || iterator_it->second == nullptr) { |
205 | | // For variant subcolumns, not finding the iterator is normal |
206 | 17 | if (is_variant_sub) { |
207 | 0 | VLOG_DEBUG << "Variant subcolumn '" << field_name |
208 | 0 | << "' iterator not found in this segment, treating as no match"; |
209 | 0 | *binding = FieldReaderBinding(); |
210 | 0 | return Status::OK(); |
211 | 0 | } |
212 | 17 | return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( |
213 | 17 | "iterator not found for field '{}'", field_name); |
214 | 17 | } |
215 | | |
216 | 1.59k | auto* inverted_iterator = dynamic_cast<InvertedIndexIterator*>(iterator_it->second); |
217 | 1.59k | if (inverted_iterator == nullptr) { |
218 | 2 | return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( |
219 | 2 | "iterator for field '{}' is not InvertedIndexIterator", field_name); |
220 | 2 | } |
221 | | |
222 | | // For variant subcolumns, FE resolves the field pattern to a specific index and sends |
223 | | // its index_properties via TSearchFieldBinding. When FE picks an analyzer-based index, |
224 | | // upgrade EQUAL_QUERY/WILDCARD_QUERY to MATCH_ANY_QUERY so select_best_reader picks the |
225 | | // FULLTEXT reader instead of STRING_TYPE. Without this upgrade: |
226 | | // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory |
227 | | // - WILDCARD clauses would enumerate terms from the wrong index, returning empty results |
228 | | // |
229 | | // For regular (non-variant) columns with multiple indexes, the caller (build_leaf_query) |
230 | | // is responsible for passing the appropriate query_type: MATCH_ANY_QUERY for tokenized |
231 | | // queries (TERM) and EQUAL_QUERY for exact-match queries (EXACT). This ensures |
232 | | // select_best_reader picks FULLTEXT vs STRING_TYPE correctly without needing an explicit |
233 | | // analyzer key, since the query_type alone drives the reader type preference. |
234 | 1.59k | InvertedIndexQueryType effective_query_type = query_type; |
235 | 1.59k | auto fb_it = _field_binding_map.find(field_name); |
236 | 1.59k | std::string analyzer_key; |
237 | 1.59k | if (is_variant_sub && fb_it != _field_binding_map.end() && |
238 | 1.59k | fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { |
239 | 60 | analyzer_key = normalize_analyzer_key( |
240 | 60 | build_analyzer_key_from_properties(fb_it->second->index_properties)); |
241 | 60 | if (inverted_index::InvertedIndexAnalyzer::should_analyzer( |
242 | 60 | fb_it->second->index_properties) && |
243 | 60 | (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || |
244 | 60 | effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { |
245 | 0 | effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; |
246 | 0 | } |
247 | 60 | } |
248 | | |
249 | 1.59k | Result<InvertedIndexReaderPtr> reader_result; |
250 | 1.59k | const auto& column_type = data_it->second.second; |
251 | 1.59k | if (column_type) { |
252 | 1.57k | reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type, |
253 | 1.57k | analyzer_key); |
254 | 1.57k | } else { |
255 | 17 | reader_result = inverted_iterator->select_best_reader(analyzer_key); |
256 | 17 | } |
257 | | |
258 | 1.59k | if (!reader_result.has_value()) { |
259 | 0 | return reader_result.error(); |
260 | 0 | } |
261 | | |
262 | 1.59k | auto inverted_reader = reader_result.value(); |
263 | 1.59k | if (inverted_reader == nullptr) { |
264 | 0 | return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( |
265 | 0 | "selected reader is null for field '{}'", field_name); |
266 | 0 | } |
267 | | |
268 | 1.59k | auto index_file_reader = inverted_reader->get_index_file_reader(); |
269 | 1.59k | if (index_file_reader == nullptr) { |
270 | 0 | return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( |
271 | 0 | "index file reader is null for field '{}'", field_name); |
272 | 0 | } |
273 | | |
274 | | // Use InvertedIndexSearcherCache to avoid re-opening index files repeatedly |
275 | 1.59k | auto index_file_key = |
276 | 1.59k | index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); |
277 | 1.59k | InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); |
278 | 1.59k | InvertedIndexCacheHandle searcher_cache_handle; |
279 | 1.59k | bool cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, |
280 | 1.59k | &searcher_cache_handle); |
281 | | |
282 | 1.59k | std::shared_ptr<lucene::index::IndexReader> reader_holder; |
283 | 1.59k | if (cache_hit) { |
284 | 1.33k | if (_context->stats) { |
285 | 1.33k | _context->stats->inverted_index_searcher_cache_hit++; |
286 | 1.33k | } |
287 | 1.33k | auto searcher_variant = searcher_cache_handle.get_index_searcher(); |
288 | 1.33k | auto* searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant); |
289 | 1.33k | if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { |
290 | 1.33k | reader_holder = std::shared_ptr<lucene::index::IndexReader>( |
291 | 1.33k | (*searcher_ptr)->getReader(), |
292 | 1.34k | [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); |
293 | 1.33k | } |
294 | 1.33k | } |
295 | | |
296 | 1.59k | if (!reader_holder) { |
297 | 262 | if (_context->stats) { |
298 | 262 | _context->stats->inverted_index_searcher_cache_miss++; |
299 | 262 | } |
300 | | // Cache miss: open directory, build IndexSearcher, insert into cache |
301 | 261 | int64_t dummy_timer = 0; |
302 | 261 | SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer |
303 | 261 | : &dummy_timer); |
304 | 261 | RETURN_IF_ERROR( |
305 | 261 | index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); |
306 | 261 | auto directory = DORIS_TRY( |
307 | 261 | index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); |
308 | | |
309 | 261 | auto index_searcher_builder = DORIS_TRY( |
310 | 261 | IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); |
311 | 261 | auto searcher_result = |
312 | 261 | DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); |
313 | 261 | auto reader_size = index_searcher_builder->get_reader_size(); |
314 | | |
315 | | // Initialization reads are done. Clear io_ctx on the main stream so the |
316 | | // cached searcher does not carry a stale reference. Subsequent query-phase |
317 | | // reads receive the caller's io_ctx through the CLucene API parameters |
318 | | // (termDocs/termPositions/terms) — the same pattern used by the MATCH path |
319 | | // in InvertedIndexReader::create_index_searcher(). |
320 | 261 | auto* stream = static_cast<DorisCompoundReader*>(directory.get())->getDorisIndexInput(); |
321 | 261 | DBUG_EXECUTE_IF( |
322 | 261 | "FieldReaderResolver.resolve.io_ctx", ({ |
323 | 261 | const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext(); |
324 | 261 | if (cur_io_ctx->file_cache_stats) { |
325 | 261 | if (cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) { |
326 | 261 | LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: " |
327 | 261 | << cur_io_ctx->file_cache_stats << " vs " |
328 | 261 | << &_context->stats->file_cache_stats; |
329 | 261 | } |
330 | 261 | } |
331 | 261 | })); |
332 | 261 | stream->setIoContext(nullptr); |
333 | 261 | stream->setIndexFile(false); |
334 | | |
335 | 261 | auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), |
336 | 261 | reader_size, UnixMillis()); |
337 | 261 | InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, |
338 | 261 | &searcher_cache_handle); |
339 | | |
340 | 261 | auto new_variant = searcher_cache_handle.get_index_searcher(); |
341 | 261 | auto* new_ptr = std::get_if<FulltextIndexSearcherPtr>(&new_variant); |
342 | 261 | if (new_ptr != nullptr && *new_ptr != nullptr) { |
343 | 260 | reader_holder = std::shared_ptr<lucene::index::IndexReader>( |
344 | 260 | (*new_ptr)->getReader(), |
345 | 260 | [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); |
346 | 260 | } |
347 | | |
348 | 261 | if (!reader_holder) { |
349 | 0 | return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( |
350 | 0 | "failed to build IndexSearcher for field '{}'", field_name); |
351 | 0 | } |
352 | 261 | } |
353 | | |
354 | 1.59k | _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); |
355 | | |
356 | 1.59k | FieldReaderBinding resolved; |
357 | 1.59k | resolved.logical_field_name = field_name; |
358 | 1.59k | resolved.stored_field_name = stored_field_name; |
359 | 1.59k | resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); |
360 | 1.59k | resolved.column_type = column_type; |
361 | 1.59k | resolved.query_type = effective_query_type; |
362 | 1.59k | resolved.inverted_reader = inverted_reader; |
363 | 1.59k | resolved.lucene_reader = reader_holder; |
364 | | // Prefer FE-provided index_properties (needed for variant subcolumn field_pattern matching) |
365 | | // Reuse fb_it from earlier lookup above. |
366 | 1.59k | if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && |
367 | 1.59k | !fb_it->second->index_properties.empty()) { |
368 | 1.28k | resolved.index_properties = fb_it->second->index_properties; |
369 | 1.28k | } else { |
370 | 305 | resolved.index_properties = inverted_reader->get_index_properties(); |
371 | 305 | } |
372 | 1.59k | resolved.binding_key = binding_key; |
373 | 1.59k | resolved.analyzer_key = |
374 | 1.59k | normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); |
375 | | |
376 | 1.59k | _binding_readers[binding_key] = reader_holder; |
377 | 1.59k | _field_readers[resolved.stored_field_wstr] = reader_holder; |
378 | 1.59k | _readers.emplace_back(reader_holder); |
379 | 1.59k | _cache.emplace(binding_key, resolved); |
380 | 1.59k | *binding = resolved; |
381 | 1.59k | return Status::OK(); |
382 | 1.59k | } |
383 | | |
384 | | Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/, |
385 | | const ColumnNumbers& /*arguments*/, uint32_t /*result*/, |
386 | 4 | size_t /*input_rows_count*/) const { |
387 | 4 | return Status::RuntimeError("only inverted index queries are supported"); |
388 | 4 | } |
389 | | |
390 | | // Enhanced implementation: Handle new parameter structure (DSL + SlotReferences) |
391 | | Status FunctionSearch::evaluate_inverted_index( |
392 | | const ColumnsWithTypeAndName& arguments, |
393 | | const std::vector<IndexFieldNameAndTypePair>& data_type_with_names, |
394 | | std::vector<IndexIterator*> iterators, uint32_t num_rows, |
395 | | const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/, |
396 | 1 | InvertedIndexResultBitmap& bitmap_result) const { |
397 | 1 | return Status::OK(); |
398 | 1 | } |
399 | | |
400 | | Status FunctionSearch::evaluate_inverted_index_with_search_param( |
401 | | const TSearchParam& search_param, |
402 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names, |
403 | | std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows, |
404 | 31 | InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const { |
405 | 31 | static const std::unordered_map<std::string, int> empty_field_to_column_id; |
406 | 31 | return evaluate_inverted_index_with_search_param( |
407 | 31 | search_param, data_type_with_names, std::move(iterators), num_rows, bitmap_result, |
408 | 31 | enable_cache, nullptr, empty_field_to_column_id); |
409 | 31 | } |
410 | | |
411 | | Status FunctionSearch::evaluate_inverted_index_with_search_param( |
412 | | const TSearchParam& search_param, |
413 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names, |
414 | | std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows, |
415 | | InvertedIndexResultBitmap& bitmap_result, bool enable_cache, |
416 | | const IndexExecContext* index_exec_ctx, |
417 | | const std::unordered_map<std::string, int>& field_name_to_column_id, |
418 | 1.27k | const std::shared_ptr<IndexQueryContext>& index_query_context) const { |
419 | 1.27k | const bool is_nested_query = search_param.root.clause_type == "NESTED"; |
420 | 1.27k | if (is_nested_query && !is_nested_group_search_supported()) { |
421 | 3 | return Status::NotSupported( |
422 | 3 | "NESTED query requires NestedGroup support, which is unavailable in this build"); |
423 | 3 | } |
424 | | |
425 | 1.27k | if (!is_nested_query && (iterators.empty() || data_type_with_names.empty())) { |
426 | 5 | LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:" |
427 | 5 | << search_param.original_dsl; |
428 | 5 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
429 | 5 | std::make_shared<roaring::Roaring>()); |
430 | 5 | return Status::OK(); |
431 | 5 | } |
432 | | |
433 | | // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type |
434 | 1.26k | auto* dsl_cache = enable_cache ? InvertedIndexQueryCache::instance() : nullptr; |
435 | 1.26k | std::string seg_prefix; |
436 | 1.26k | std::string dsl_sig; |
437 | 1.26k | InvertedIndexQueryCache::CacheKey dsl_cache_key; |
438 | 1.26k | bool cache_usable = false; |
439 | 1.26k | if (dsl_cache) { |
440 | 1.24k | seg_prefix = extract_segment_prefix(iterators); |
441 | 1.24k | dsl_sig = build_dsl_signature(search_param); |
442 | 1.24k | if (!seg_prefix.empty() && !dsl_sig.empty()) { |
443 | 1.22k | dsl_cache_key = InvertedIndexQueryCache::CacheKey { |
444 | 1.22k | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
445 | 1.22k | dsl_sig}; |
446 | 1.22k | cache_usable = true; |
447 | 1.22k | InvertedIndexQueryCacheHandle dsl_cache_handle; |
448 | 1.22k | if (dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle)) { |
449 | 169 | auto cached_bitmap = dsl_cache_handle.get_bitmap(); |
450 | 169 | if (cached_bitmap) { |
451 | | // Also retrieve cached null bitmap for three-valued SQL logic |
452 | | // (needed by compound operators NOT, OR, AND in VCompoundPred) |
453 | 168 | auto null_cache_key = InvertedIndexQueryCache::CacheKey { |
454 | 168 | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
455 | 168 | dsl_sig + "__null"}; |
456 | 168 | InvertedIndexQueryCacheHandle null_cache_handle; |
457 | 168 | std::shared_ptr<roaring::Roaring> null_bitmap; |
458 | 169 | if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) { |
459 | 169 | null_bitmap = null_cache_handle.get_bitmap(); |
460 | 169 | } |
461 | 168 | if (!null_bitmap) { |
462 | 0 | null_bitmap = std::make_shared<roaring::Roaring>(); |
463 | 0 | } |
464 | 168 | bitmap_result = |
465 | 168 | InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap)); |
466 | 168 | return Status::OK(); |
467 | 168 | } |
468 | 169 | } |
469 | 1.22k | } |
470 | 1.24k | } |
471 | | |
472 | | // Track overall query time (equivalent to inverted_index_query_timer in MATCH path) |
473 | 1.10k | int64_t query_timer_dummy = 0; |
474 | 1.10k | OlapReaderStatistics* outer_stats = index_query_context ? index_query_context->stats : nullptr; |
475 | 1.10k | SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_query_timer : &query_timer_dummy); |
476 | | |
477 | 1.10k | std::shared_ptr<IndexQueryContext> context; |
478 | 1.10k | if (index_query_context) { |
479 | 1.07k | context = index_query_context; |
480 | 1.07k | } else { |
481 | 31 | context = std::make_shared<IndexQueryContext>(); |
482 | 31 | context->collection_statistics = std::make_shared<CollectionStatistics>(); |
483 | 31 | context->collection_similarity = std::make_shared<CollectionSimilarity>(); |
484 | 31 | } |
485 | | |
486 | | // NESTED() queries evaluate predicates on the flattened "element space" of a nested group. |
487 | | // For VARIANT nested groups, the indexed lucene field (stored_field_name) uses: |
488 | | // parent_unique_id + "." + <variant-relative nested path> |
489 | | // where the nested path is rooted at either: |
490 | | // - "__D0_root__" for top-level array<object> (NESTED(data, ...)) |
491 | | // - "<nested_path_after_variant_root>" for object fields (NESTED(data.items, ...)) |
492 | | // |
493 | | // FE field bindings are expressed using logical column paths (e.g. "data.items.msg"), so for |
494 | | // NESTED() we normalize stored_field_name suffix to be consistent with the nested group root. |
495 | 1.10k | std::unordered_map<std::string, IndexFieldNameAndTypePair> patched_data_type_with_names; |
496 | 1.10k | const auto* effective_data_type_with_names = &data_type_with_names; |
497 | 1.10k | if (is_nested_query && search_param.root.__isset.nested_path) { |
498 | 0 | const std::string& nested_path = search_param.root.nested_path; |
499 | 0 | const auto dot_pos = nested_path.find('.'); |
500 | 0 | const std::string root_field = |
501 | 0 | (dot_pos == std::string::npos) ? nested_path : nested_path.substr(0, dot_pos); |
502 | 0 | const std::string root_prefix = root_field + "."; |
503 | 0 | const std::string array_path = (dot_pos == std::string::npos) |
504 | 0 | ? std::string(segment_v2::kRootNestedGroupPath) |
505 | 0 | : nested_path.substr(dot_pos + 1); |
506 | | |
507 | 0 | bool copied = false; |
508 | 0 | for (const auto& fb : search_param.field_bindings) { |
509 | 0 | if (!fb.__isset.is_variant_subcolumn || !fb.is_variant_subcolumn) { |
510 | 0 | continue; |
511 | 0 | } |
512 | 0 | if (fb.field_name.empty()) { |
513 | 0 | continue; |
514 | 0 | } |
515 | 0 | const auto it_orig = data_type_with_names.find(fb.field_name); |
516 | 0 | if (it_orig == data_type_with_names.end()) { |
517 | 0 | continue; |
518 | 0 | } |
519 | 0 | const std::string& old_stored = it_orig->second.first; |
520 | 0 | const auto first_dot = old_stored.find('.'); |
521 | 0 | if (first_dot == std::string::npos) { |
522 | 0 | continue; |
523 | 0 | } |
524 | 0 | std::string sub_path; |
525 | 0 | if (fb.__isset.subcolumn_path && !fb.subcolumn_path.empty()) { |
526 | 0 | sub_path = fb.subcolumn_path; |
527 | 0 | } else if (fb.field_name.starts_with(nested_path + ".")) { |
528 | 0 | sub_path = fb.field_name.substr(nested_path.size() + 1); |
529 | 0 | } else if (fb.field_name.starts_with(root_prefix)) { |
530 | 0 | sub_path = fb.field_name.substr(root_prefix.size()); |
531 | 0 | } else { |
532 | 0 | sub_path = fb.field_name; |
533 | 0 | } |
534 | 0 | if (sub_path.empty()) { |
535 | 0 | continue; |
536 | 0 | } |
537 | 0 | const std::string array_prefix = array_path + "."; |
538 | 0 | const std::string suffix_path = |
539 | 0 | sub_path.starts_with(array_prefix) ? sub_path : (array_prefix + sub_path); |
540 | 0 | const std::string parent_uid = old_stored.substr(0, first_dot); |
541 | 0 | const std::string expected_stored = parent_uid + "." + suffix_path; |
542 | 0 | if (old_stored == expected_stored) { |
543 | 0 | continue; |
544 | 0 | } |
545 | | |
546 | 0 | if (!copied) { |
547 | 0 | patched_data_type_with_names = data_type_with_names; |
548 | 0 | effective_data_type_with_names = &patched_data_type_with_names; |
549 | 0 | copied = true; |
550 | 0 | } |
551 | 0 | auto it = patched_data_type_with_names.find(fb.field_name); |
552 | 0 | if (it == patched_data_type_with_names.end()) { |
553 | 0 | continue; |
554 | 0 | } |
555 | 0 | it->second.first = expected_stored; |
556 | 0 | } |
557 | 0 | } |
558 | | |
559 | | // Pass field_bindings to resolver for variant subcolumn detection |
560 | 1.10k | FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context, |
561 | 1.10k | search_param.field_bindings); |
562 | | |
563 | 1.10k | if (is_nested_query) { |
564 | 0 | std::shared_ptr<roaring::Roaring> row_bitmap; |
565 | 0 | RETURN_IF_ERROR(evaluate_nested_query(search_param, search_param.root, context, resolver, |
566 | 0 | num_rows, index_exec_ctx, field_name_to_column_id, |
567 | 0 | row_bitmap)); |
568 | 0 | bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap), |
569 | 0 | std::make_shared<roaring::Roaring>()); |
570 | 0 | bitmap_result.mask_out_null(); |
571 | 0 | return Status::OK(); |
572 | 0 | } |
573 | | |
574 | | // Extract default_operator from TSearchParam (default: "or") |
575 | 1.10k | std::string default_operator = "or"; |
576 | 1.10k | if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { |
577 | 1.07k | default_operator = search_param.default_operator; |
578 | 1.07k | } |
579 | | // Extract minimum_should_match from TSearchParam (-1 means not set) |
580 | 1.10k | int32_t minimum_should_match = -1; |
581 | 1.10k | if (search_param.__isset.minimum_should_match) { |
582 | 48 | minimum_should_match = search_param.minimum_should_match; |
583 | 48 | } |
584 | | |
585 | 1.10k | auto* stats = context->stats; |
586 | 1.10k | int64_t dummy_timer = 0; |
587 | 1.10k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_timer : &dummy_timer); |
588 | | |
589 | 1.10k | query_v2::QueryPtr root_query; |
590 | 1.10k | std::string root_binding_key; |
591 | 1.10k | { |
592 | 1.10k | int64_t init_dummy = 0; |
593 | 1.10k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy); |
594 | 1.10k | RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query, |
595 | 1.10k | &root_binding_key, default_operator, |
596 | 1.10k | minimum_should_match)); |
597 | 1.10k | } |
598 | 1.07k | if (root_query == nullptr) { |
599 | 0 | LOG(INFO) << "search: Query tree resolved to empty query, dsl:" |
600 | 0 | << search_param.original_dsl; |
601 | 0 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
602 | 0 | std::make_shared<roaring::Roaring>()); |
603 | 0 | return Status::OK(); |
604 | 0 | } |
605 | | |
606 | 1.07k | ResolverNullBitmapAdapter null_resolver(resolver); |
607 | 1.07k | query_v2::QueryExecutionContext exec_ctx = |
608 | 1.07k | build_query_execution_context(num_rows, resolver, &null_resolver); |
609 | | |
610 | 1.07k | bool enable_scoring = false; |
611 | 1.07k | bool is_asc = false; |
612 | 1.07k | size_t top_k = 0; |
613 | 1.07k | if (index_query_context) { |
614 | 1.07k | enable_scoring = index_query_context->collection_similarity != nullptr; |
615 | 1.07k | is_asc = index_query_context->is_asc; |
616 | 1.07k | top_k = index_query_context->query_limit; |
617 | 1.07k | } |
618 | | |
619 | 1.07k | auto weight = root_query->weight(enable_scoring); |
620 | 1.07k | if (!weight) { |
621 | 0 | LOG(WARNING) << "search: Failed to build query weight"; |
622 | 0 | bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(), |
623 | 0 | std::make_shared<roaring::Roaring>()); |
624 | 0 | return Status::OK(); |
625 | 0 | } |
626 | | |
627 | 1.07k | std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>(); |
628 | 1.07k | { |
629 | 1.07k | int64_t exec_dummy = 0; |
630 | 1.07k | SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_exec_timer : &exec_dummy); |
631 | 1.07k | if (enable_scoring && !is_asc && top_k > 0) { |
632 | 0 | bool use_wand = index_query_context->runtime_state != nullptr && |
633 | 0 | index_query_context->runtime_state->query_options() |
634 | 0 | .enable_inverted_index_wand_query; |
635 | 0 | query_v2::collect_multi_segment_top_k( |
636 | 0 | weight, exec_ctx, root_binding_key, top_k, roaring, |
637 | 0 | index_query_context->collection_similarity, use_wand); |
638 | 1.07k | } else { |
639 | 1.07k | query_v2::collect_multi_segment_doc_set( |
640 | 1.07k | weight, exec_ctx, root_binding_key, roaring, |
641 | 1.07k | index_query_context ? index_query_context->collection_similarity : nullptr, |
642 | 1.07k | enable_scoring); |
643 | 1.07k | } |
644 | 1.07k | } |
645 | | |
646 | 18.4E | VLOG_DEBUG << "search: Query completed, matched " << roaring->cardinality() << " documents"; |
647 | | |
648 | | // Extract NULL bitmap from three-valued logic scorer |
649 | | // The scorer correctly computes which documents evaluate to NULL based on query logic |
650 | | // For example: TRUE OR NULL = TRUE (not NULL), FALSE OR NULL = NULL |
651 | 1.07k | std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); |
652 | 1.08k | if (exec_ctx.null_resolver) { |
653 | 1.08k | auto scorer = weight->scorer(exec_ctx, root_binding_key); |
654 | 1.08k | if (scorer && scorer->has_null_bitmap(exec_ctx.null_resolver)) { |
655 | 224 | const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); |
656 | 224 | if (bitmap != nullptr) { |
657 | 224 | *null_bitmap = *bitmap; |
658 | 224 | VLOG_TRACE << "search: Extracted NULL bitmap with " << null_bitmap->cardinality() |
659 | 0 | << " NULL documents"; |
660 | 224 | } |
661 | 224 | } |
662 | 1.08k | } |
663 | | |
664 | 18.4E | VLOG_TRACE << "search: Before mask - true_bitmap=" << roaring->cardinality() |
665 | 18.4E | << ", null_bitmap=" << null_bitmap->cardinality(); |
666 | | |
667 | | // Create result and mask out NULLs (SQL WHERE clause semantics: only TRUE rows) |
668 | 1.07k | bitmap_result = InvertedIndexResultBitmap(std::move(roaring), std::move(null_bitmap)); |
669 | 1.07k | bitmap_result.mask_out_null(); |
670 | | |
671 | 18.4E | VLOG_TRACE << "search: After mask - result_bitmap=" |
672 | 18.4E | << bitmap_result.get_data_bitmap()->cardinality(); |
673 | | |
674 | | // Insert post-mask_out_null result into DSL cache for future reuse |
675 | | // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND) |
676 | | // can apply correct three-valued SQL logic on cache hit |
677 | 1.08k | if (dsl_cache && cache_usable) { |
678 | 1.08k | InvertedIndexQueryCacheHandle insert_handle; |
679 | 1.08k | dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle); |
680 | 1.08k | if (bitmap_result.get_null_bitmap()) { |
681 | 1.07k | auto null_cache_key = InvertedIndexQueryCache::CacheKey { |
682 | 1.07k | seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, |
683 | 1.07k | dsl_sig + "__null"}; |
684 | 1.07k | InvertedIndexQueryCacheHandle null_insert_handle; |
685 | 1.07k | dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle); |
686 | 1.07k | } |
687 | 1.08k | } |
688 | | |
689 | 1.07k | return Status::OK(); |
690 | 1.07k | } |
691 | | |
692 | | Status FunctionSearch::evaluate_nested_query( |
693 | | const TSearchParam& search_param, const TSearchClause& nested_clause, |
694 | | const std::shared_ptr<IndexQueryContext>& context, FieldReaderResolver& resolver, |
695 | | uint32_t num_rows, const IndexExecContext* index_exec_ctx, |
696 | | const std::unordered_map<std::string, int>& field_name_to_column_id, |
697 | 7 | std::shared_ptr<roaring::Roaring>& result_bitmap) const { |
698 | 7 | (void)field_name_to_column_id; |
699 | 7 | if (!(nested_clause.__isset.nested_path)) { |
700 | 2 | return Status::InvalidArgument("NESTED clause missing nested_path"); |
701 | 2 | } |
702 | 5 | if (!(nested_clause.__isset.children) || nested_clause.children.empty()) { |
703 | 2 | return Status::InvalidArgument("NESTED clause missing inner query"); |
704 | 2 | } |
705 | 3 | if (result_bitmap == nullptr) { |
706 | 2 | result_bitmap = std::make_shared<roaring::Roaring>(); |
707 | 2 | } else { |
708 | 1 | *result_bitmap = roaring::Roaring(); |
709 | 1 | } |
710 | | |
711 | | // 1. Get the nested group chain directly |
712 | 3 | std::string root_field = nested_clause.nested_path; |
713 | 3 | auto dot_pos = nested_clause.nested_path.find('.'); |
714 | 3 | if (dot_pos != std::string::npos) { |
715 | 1 | root_field = nested_clause.nested_path.substr(0, dot_pos); |
716 | 1 | } |
717 | 3 | if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) { |
718 | 3 | return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment"); |
719 | 3 | } |
720 | 0 | auto* segment = index_exec_ctx->segment(); |
721 | 0 | const int32_t ordinal = segment->tablet_schema()->field_index(root_field); |
722 | 0 | if (ordinal < 0) { |
723 | 0 | return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query", |
724 | 0 | root_field); |
725 | 0 | } |
726 | 0 | const ColumnId column_id = static_cast<ColumnId>(ordinal); |
727 | |
|
728 | 0 | std::shared_ptr<segment_v2::ColumnReader> column_reader; |
729 | 0 | RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id), |
730 | 0 | &column_reader, |
731 | 0 | index_exec_ctx->column_iter_opts().stats)); |
732 | 0 | auto* variant_reader = dynamic_cast<segment_v2::VariantColumnReader*>(column_reader.get()); |
733 | 0 | if (variant_reader == nullptr) { |
734 | 0 | return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field); |
735 | 0 | } |
736 | | |
737 | 0 | std::string array_path; |
738 | 0 | if (dot_pos == std::string::npos) { |
739 | 0 | array_path = std::string(segment_v2::kRootNestedGroupPath); |
740 | 0 | } else { |
741 | 0 | array_path = nested_clause.nested_path.substr(dot_pos + 1); |
742 | 0 | } |
743 | |
|
744 | 0 | auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path); |
745 | 0 | if (!found || group_chain.empty()) { |
746 | 0 | return Status::OK(); |
747 | 0 | } |
748 | | |
749 | | // Use the read provider for element counting and bitmap mapping. |
750 | 0 | auto read_provider = segment_v2::create_nested_group_read_provider(); |
751 | 0 | if (!read_provider || !read_provider->should_enable_nested_group_read_path()) { |
752 | 0 | return Status::NotSupported( |
753 | 0 | "NestedGroup search is an enterprise capability, not available in this build"); |
754 | 0 | } |
755 | | |
756 | 0 | auto& leaf_group = group_chain.back(); |
757 | 0 | uint64_t total_elements = 0; |
758 | 0 | RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(), |
759 | 0 | leaf_group, &total_elements)); |
760 | 0 | if (total_elements == 0) { |
761 | 0 | return Status::OK(); |
762 | 0 | } |
763 | | |
764 | | // 3. Evaluate inner query |
765 | 0 | std::string default_operator = "or"; |
766 | 0 | if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { |
767 | 0 | default_operator = search_param.default_operator; |
768 | 0 | } |
769 | 0 | int32_t minimum_should_match = -1; |
770 | 0 | if (search_param.__isset.minimum_should_match) { |
771 | 0 | minimum_should_match = search_param.minimum_should_match; |
772 | 0 | } |
773 | |
|
774 | 0 | query_v2::QueryPtr inner_query; |
775 | 0 | std::string inner_binding_key; |
776 | 0 | RETURN_IF_ERROR(build_query_recursive(nested_clause.children[0], context, resolver, |
777 | 0 | &inner_query, &inner_binding_key, default_operator, |
778 | 0 | minimum_should_match)); |
779 | 0 | if (inner_query == nullptr) { |
780 | 0 | return Status::OK(); |
781 | 0 | } |
782 | | |
783 | 0 | if (total_elements > std::numeric_limits<uint32_t>::max()) { |
784 | 0 | return Status::InvalidArgument("nested element_count exceeds uint32_t max"); |
785 | 0 | } |
786 | | |
787 | 0 | ResolverNullBitmapAdapter null_resolver(resolver); |
788 | 0 | query_v2::QueryExecutionContext exec_ctx = build_query_execution_context( |
789 | 0 | static_cast<uint32_t>(total_elements), resolver, &null_resolver); |
790 | |
|
791 | 0 | auto weight = inner_query->weight(false); |
792 | 0 | if (!weight) { |
793 | 0 | return Status::OK(); |
794 | 0 | } |
795 | 0 | auto scorer = weight->scorer(exec_ctx, inner_binding_key); |
796 | 0 | if (!scorer) { |
797 | 0 | return Status::OK(); |
798 | 0 | } |
799 | | |
800 | 0 | roaring::Roaring element_bitmap; |
801 | 0 | uint32_t doc = scorer->doc(); |
802 | 0 | while (doc != query_v2::TERMINATED) { |
803 | 0 | element_bitmap.add(doc); |
804 | 0 | doc = scorer->advance(); |
805 | 0 | } |
806 | |
|
807 | 0 | if (scorer->has_null_bitmap(exec_ctx.null_resolver)) { |
808 | 0 | const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); |
809 | 0 | if (bitmap != nullptr && !bitmap->isEmpty()) { |
810 | 0 | element_bitmap -= *bitmap; |
811 | 0 | } |
812 | 0 | } |
813 | | |
814 | | // 4. Map element-level hits back to row-level hits through NestedGroup chain. |
815 | 0 | if (result_bitmap == nullptr) { |
816 | 0 | result_bitmap = std::make_shared<roaring::Roaring>(); |
817 | 0 | } |
818 | 0 | roaring::Roaring parent_bitmap; |
819 | 0 | RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords( |
820 | 0 | group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap)); |
821 | 0 | *result_bitmap = std::move(parent_bitmap); |
822 | 0 | return Status::OK(); |
823 | 0 | } |
824 | | |
825 | | // Aligned with FE QsClauseType enum - uses enum.name() as clause_type |
826 | | FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category( |
827 | 7.18k | const std::string& clause_type) const { |
828 | 7.18k | if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" || |
829 | 7.18k | clause_type == "OCCUR_BOOLEAN" || clause_type == "NESTED") { |
830 | 146 | return ClauseTypeCategory::COMPOUND; |
831 | 7.04k | } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" || |
832 | 7.04k | clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" || |
833 | 7.04k | clause_type == "EXACT") { |
834 | | // Non-tokenized queries: exact matching, pattern matching, range, list operations |
835 | 6.62k | return ClauseTypeCategory::NON_TOKENIZED; |
836 | 6.62k | } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" || |
837 | 429 | clause_type == "ALL") { |
838 | | // Tokenized queries: phrase search, full-text search, multi-value matching |
839 | | // Note: ANY and ALL require tokenization of their input values |
840 | 429 | return ClauseTypeCategory::TOKENIZED; |
841 | 18.4E | } else { |
842 | | // Default to NON_TOKENIZED for unknown types |
843 | 18.4E | LOG(WARNING) << "Unknown clause type '" << clause_type |
844 | 18.4E | << "', defaulting to NON_TOKENIZED category"; |
845 | 18.4E | return ClauseTypeCategory::NON_TOKENIZED; |
846 | 18.4E | } |
847 | 7.18k | } |
848 | | |
849 | | // Analyze query type for a specific field in the search clause |
850 | | InvertedIndexQueryType FunctionSearch::analyze_field_query_type(const std::string& field_name, |
851 | 5.28k | const TSearchClause& clause) const { |
852 | 5.28k | const std::string& clause_type = clause.clause_type; |
853 | 5.28k | ClauseTypeCategory category = get_clause_type_category(clause_type); |
854 | | |
855 | | // Handle leaf queries - use direct mapping |
856 | 5.28k | if (category != ClauseTypeCategory::COMPOUND) { |
857 | | // Check if this clause targets the specific field |
858 | 5.14k | if (clause.field_name == field_name) { |
859 | | // Use direct mapping from clause_type to InvertedIndexQueryType |
860 | 163 | return clause_type_to_query_type(clause_type); |
861 | 163 | } |
862 | 5.14k | } |
863 | | |
864 | | // Handle boolean queries - recursively analyze children |
865 | 5.11k | if (!clause.children.empty()) { |
866 | 5.09k | for (const auto& child_clause : clause.children) { |
867 | | // Recursively analyze each child |
868 | 5.09k | InvertedIndexQueryType child_type = analyze_field_query_type(field_name, child_clause); |
869 | | // If this child targets the field (not default EQUAL_QUERY), return its query type |
870 | 5.09k | if (child_type != InvertedIndexQueryType::UNKNOWN_QUERY) { |
871 | 124 | return child_type; |
872 | 124 | } |
873 | 5.09k | } |
874 | 132 | } |
875 | | |
876 | | // If no children target this field, return UNKNOWN_QUERY as default |
877 | 4.99k | return InvertedIndexQueryType::UNKNOWN_QUERY; |
878 | 5.11k | } |
879 | | |
880 | | // Map clause_type string to InvertedIndexQueryType |
881 | | InvertedIndexQueryType FunctionSearch::clause_type_to_query_type( |
882 | 2.10k | const std::string& clause_type) const { |
883 | | // Use static map for better performance and maintainability |
884 | 2.10k | static const std::unordered_map<std::string, InvertedIndexQueryType> clause_type_map = { |
885 | | // Boolean operations |
886 | 2.10k | {"AND", InvertedIndexQueryType::BOOLEAN_QUERY}, |
887 | 2.10k | {"OR", InvertedIndexQueryType::BOOLEAN_QUERY}, |
888 | 2.10k | {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY}, |
889 | 2.10k | {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY}, |
890 | 2.10k | {"NESTED", InvertedIndexQueryType::BOOLEAN_QUERY}, |
891 | | |
892 | | // Non-tokenized queries (exact matching, pattern matching) |
893 | 2.10k | {"TERM", InvertedIndexQueryType::EQUAL_QUERY}, |
894 | 2.10k | {"PREFIX", InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY}, |
895 | 2.10k | {"WILDCARD", InvertedIndexQueryType::WILDCARD_QUERY}, |
896 | 2.10k | {"REGEXP", InvertedIndexQueryType::MATCH_REGEXP_QUERY}, |
897 | 2.10k | {"RANGE", InvertedIndexQueryType::RANGE_QUERY}, |
898 | 2.10k | {"LIST", InvertedIndexQueryType::LIST_QUERY}, |
899 | | |
900 | | // Tokenized queries (full-text search, phrase search) |
901 | 2.10k | {"PHRASE", InvertedIndexQueryType::MATCH_PHRASE_QUERY}, |
902 | 2.10k | {"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY}, |
903 | 2.10k | {"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY}, |
904 | 2.10k | {"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY}, |
905 | | |
906 | | // Exact match without tokenization |
907 | 2.10k | {"EXACT", InvertedIndexQueryType::EQUAL_QUERY}, |
908 | 2.10k | }; |
909 | | |
910 | 2.10k | auto it = clause_type_map.find(clause_type); |
911 | 2.10k | if (it != clause_type_map.end()) { |
912 | 2.09k | return it->second; |
913 | 2.09k | } |
914 | | |
915 | | // Unknown clause type |
916 | 2.10k | LOG(WARNING) << "Unknown clause type '" << clause_type << "', defaulting to EQUAL_QUERY"; |
917 | 11 | return InvertedIndexQueryType::EQUAL_QUERY; |
918 | 2.10k | } |
919 | | |
920 | | // Map Thrift TSearchOccur to query_v2::Occur |
921 | 863 | static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) { |
922 | 863 | switch (thrift_occur) { |
923 | 296 | case TSearchOccur::MUST: |
924 | 296 | return query_v2::Occur::MUST; |
925 | 501 | case TSearchOccur::SHOULD: |
926 | 501 | return query_v2::Occur::SHOULD; |
927 | 66 | case TSearchOccur::MUST_NOT: |
928 | 66 | return query_v2::Occur::MUST_NOT; |
929 | 0 | default: |
930 | 0 | return query_v2::Occur::MUST; |
931 | 863 | } |
932 | 863 | } |
933 | | |
934 | | Status FunctionSearch::build_query_recursive(const TSearchClause& clause, |
935 | | const std::shared_ptr<IndexQueryContext>& context, |
936 | | FieldReaderResolver& resolver, |
937 | | inverted_index::query_v2::QueryPtr* out, |
938 | | std::string* binding_key, |
939 | | const std::string& default_operator, |
940 | 2.72k | int32_t minimum_should_match) const { |
941 | 2.72k | DCHECK(out != nullptr); |
942 | 2.72k | *out = nullptr; |
943 | 2.75k | if (binding_key) { |
944 | 2.75k | binding_key->clear(); |
945 | 2.75k | } |
946 | | |
947 | 2.72k | const std::string& clause_type = clause.clause_type; |
948 | | |
949 | | // Handle MATCH_ALL_DOCS - matches all documents in the segment |
950 | 2.72k | if (clause_type == "MATCH_ALL_DOCS") { |
951 | 43 | *out = std::make_shared<query_v2::AllQuery>(); |
952 | 43 | return Status::OK(); |
953 | 43 | } |
954 | | |
955 | | // Handle OCCUR_BOOLEAN - Lucene-style boolean query with MUST/SHOULD/MUST_NOT |
956 | 2.67k | if (clause_type == "OCCUR_BOOLEAN") { |
957 | 420 | auto builder = segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); |
958 | | |
959 | | // Set minimum_should_match if specified |
960 | 420 | if (clause.__isset.minimum_should_match) { |
961 | 404 | builder->set_minimum_number_should_match(clause.minimum_should_match); |
962 | 404 | } |
963 | | |
964 | 420 | if (clause.__isset.children) { |
965 | 859 | for (const auto& child_clause : clause.children) { |
966 | 859 | query_v2::QueryPtr child_query; |
967 | 859 | std::string child_binding_key; |
968 | 859 | RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, |
969 | 859 | &child_binding_key, default_operator, |
970 | 859 | minimum_should_match)); |
971 | | |
972 | | // Determine occur type from child clause |
973 | 858 | query_v2::Occur occur = query_v2::Occur::MUST; // default |
974 | 863 | if (child_clause.__isset.occur) { |
975 | 863 | occur = map_thrift_occur(child_clause.occur); |
976 | 863 | } |
977 | | |
978 | 858 | builder->add(child_query, occur, std::move(child_binding_key)); |
979 | 858 | } |
980 | 418 | } |
981 | | |
982 | 419 | *out = builder->build(); |
983 | 419 | return Status::OK(); |
984 | 420 | } |
985 | | |
986 | 2.25k | if (clause_type == "NESTED") { |
987 | 1 | return Status::InvalidArgument("NESTED clause must be evaluated at top level"); |
988 | 1 | } |
989 | | |
990 | | // Handle standard boolean operators (AND/OR/NOT) |
991 | 2.25k | if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT") { |
992 | 376 | query_v2::OperatorType op = query_v2::OperatorType::OP_AND; |
993 | 376 | if (clause_type == "OR") { |
994 | 199 | op = query_v2::OperatorType::OP_OR; |
995 | 199 | } else if (clause_type == "NOT") { |
996 | 90 | op = query_v2::OperatorType::OP_NOT; |
997 | 90 | } |
998 | | |
999 | 376 | auto builder = create_operator_boolean_query_builder(op); |
1000 | 376 | if (clause.__isset.children) { |
1001 | 788 | for (const auto& child_clause : clause.children) { |
1002 | 788 | query_v2::QueryPtr child_query; |
1003 | 788 | std::string child_binding_key; |
1004 | 788 | RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, |
1005 | 788 | &child_binding_key, default_operator, |
1006 | 788 | minimum_should_match)); |
1007 | | // Add all children including empty BitSetQuery |
1008 | | // BooleanQuery will handle the logic: |
1009 | | // - AND with empty bitmap → result is empty |
1010 | | // - OR with empty bitmap → empty bitmap is ignored by OR logic |
1011 | | // - NOT with empty bitmap → NOT(empty) = all rows (handled by BooleanQuery) |
1012 | 787 | builder->add(child_query, std::move(child_binding_key)); |
1013 | 787 | } |
1014 | 375 | } |
1015 | | |
1016 | 375 | *out = builder->build(); |
1017 | 375 | return Status::OK(); |
1018 | 376 | } |
1019 | | |
1020 | 1.88k | return build_leaf_query(clause, context, resolver, out, binding_key, default_operator, |
1021 | 1.88k | minimum_should_match); |
1022 | 2.25k | } |
1023 | | |
1024 | | Status FunctionSearch::build_leaf_query(const TSearchClause& clause, |
1025 | | const std::shared_ptr<IndexQueryContext>& context, |
1026 | | FieldReaderResolver& resolver, |
1027 | | inverted_index::query_v2::QueryPtr* out, |
1028 | | std::string* binding_key, |
1029 | | const std::string& default_operator, |
1030 | 1.92k | int32_t minimum_should_match) const { |
1031 | 1.92k | DCHECK(out != nullptr); |
1032 | 1.92k | *out = nullptr; |
1033 | 1.93k | if (binding_key) { |
1034 | 1.93k | binding_key->clear(); |
1035 | 1.93k | } |
1036 | | |
1037 | 1.92k | if (!clause.__isset.field_name || !clause.__isset.value) { |
1038 | 0 | return Status::InvalidArgument("search clause missing field_name or value"); |
1039 | 0 | } |
1040 | | |
1041 | 1.92k | const std::string& field_name = clause.field_name; |
1042 | 1.92k | const std::string& value = clause.value; |
1043 | 1.92k | const std::string& clause_type = clause.clause_type; |
1044 | | |
1045 | 1.92k | auto query_type = clause_type_to_query_type(clause_type); |
1046 | | // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual index terms |
1047 | | // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery). |
1048 | | // Override to MATCH_ANY_QUERY so select_best_reader() prefers the FULLTEXT reader |
1049 | | // when multiple indexes exist on the same column (one tokenized, one untokenized). |
1050 | | // Without this, these queries would select the untokenized index and try to match |
1051 | | // patterns like "h*llo" against full strings ("hello world") instead of individual |
1052 | | // tokens ("hello"), returning empty results. |
1053 | | // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE reader. |
1054 | | // |
1055 | | // Safe for single-index columns: select_best_reader() has a single-reader fast path |
1056 | | // that returns the only reader directly, bypassing the query_type preference logic. |
1057 | 1.92k | if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type == "PREFIX" || |
1058 | 1.92k | clause_type == "REGEXP") { |
1059 | 1.41k | query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; |
1060 | 1.41k | } |
1061 | | |
1062 | 1.92k | FieldReaderBinding binding; |
1063 | 1.92k | RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding)); |
1064 | | |
1065 | | // Check if binding is empty (variant subcolumn not found in this segment) |
1066 | 1.90k | if (binding.lucene_reader == nullptr) { |
1067 | 3 | LOG(INFO) << "search: No inverted index for field '" << field_name |
1068 | 3 | << "' in this segment, clause_type='" << clause_type |
1069 | 3 | << "', query_type=" << static_cast<int>(query_type) << ", returning no matches"; |
1070 | | // Variant subcolumn doesn't exist - create empty BitSetQuery (no matches) |
1071 | 3 | *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()); |
1072 | 3 | if (binding_key) { |
1073 | 3 | binding_key->clear(); |
1074 | 3 | } |
1075 | 3 | return Status::OK(); |
1076 | 3 | } |
1077 | | |
1078 | 1.90k | if (binding_key) { |
1079 | 1.88k | *binding_key = binding.binding_key; |
1080 | 1.88k | } |
1081 | | |
1082 | 1.90k | FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type); |
1083 | 1.90k | std::wstring field_wstr = binding.stored_field_wstr; |
1084 | 1.90k | std::wstring value_wstr = StringHelper::to_wstring(value); |
1085 | | |
1086 | 1.91k | auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr { |
1087 | 1.91k | return std::make_shared<query_v2::TermQuery>(context, field_wstr, term); |
1088 | 1.91k | }; |
1089 | | |
1090 | 1.90k | if (clause_type == "TERM") { |
1091 | 1.30k | bool should_analyze = |
1092 | 1.30k | inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties); |
1093 | 1.30k | if (should_analyze) { |
1094 | 1.05k | if (binding.index_properties.empty()) { |
1095 | 0 | LOG(WARNING) << "search: analyzer required but index properties empty for field '" |
1096 | 0 | << field_name << "'"; |
1097 | 0 | *out = make_term_query(value_wstr); |
1098 | 0 | return Status::OK(); |
1099 | 0 | } |
1100 | | |
1101 | 1.05k | std::vector<TermInfo> term_infos = |
1102 | 1.05k | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
1103 | 1.05k | value, binding.index_properties); |
1104 | 1.05k | if (term_infos.empty()) { |
1105 | 0 | LOG(WARNING) << "search: No terms found after tokenization for TERM query, field=" |
1106 | 0 | << field_name << ", value='" << value |
1107 | 0 | << "', returning empty BitSetQuery"; |
1108 | 0 | *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()); |
1109 | 0 | return Status::OK(); |
1110 | 0 | } |
1111 | | |
1112 | 1.05k | if (term_infos.size() == 1) { |
1113 | 1.04k | std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); |
1114 | 1.04k | *out = make_term_query(term_wstr); |
1115 | 1.04k | return Status::OK(); |
1116 | 1.04k | } |
1117 | | |
1118 | | // When minimum_should_match is specified, use OccurBooleanQuery |
1119 | | // ES behavior: msm only applies to SHOULD clauses |
1120 | 7 | if (minimum_should_match > 0) { |
1121 | 0 | auto builder = |
1122 | 0 | segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); |
1123 | 0 | builder->set_minimum_number_should_match(minimum_should_match); |
1124 | 0 | query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST |
1125 | 0 | : query_v2::Occur::SHOULD; |
1126 | 0 | for (const auto& term_info : term_infos) { |
1127 | 0 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
1128 | 0 | builder->add(make_term_query(term_wstr), occur); |
1129 | 0 | } |
1130 | 0 | *out = builder->build(); |
1131 | 0 | return Status::OK(); |
1132 | 0 | } |
1133 | | |
1134 | | // Use default_operator to determine how to combine tokenized terms |
1135 | 7 | query_v2::OperatorType op_type = (default_operator == "and") |
1136 | 7 | ? query_v2::OperatorType::OP_AND |
1137 | 7 | : query_v2::OperatorType::OP_OR; |
1138 | 7 | auto builder = create_operator_boolean_query_builder(op_type); |
1139 | 18 | for (const auto& term_info : term_infos) { |
1140 | 18 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
1141 | 18 | builder->add(make_term_query(term_wstr), binding.binding_key); |
1142 | 18 | } |
1143 | | |
1144 | 7 | *out = builder->build(); |
1145 | 7 | return Status::OK(); |
1146 | 7 | } |
1147 | | |
1148 | 255 | *out = make_term_query(value_wstr); |
1149 | 255 | return Status::OK(); |
1150 | 1.30k | } |
1151 | | |
1152 | 593 | if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) { |
1153 | 400 | if (clause_type == "PHRASE") { |
1154 | 123 | bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
1155 | 123 | binding.index_properties); |
1156 | 123 | if (!should_analyze) { |
1157 | 10 | VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name |
1158 | 0 | << "', falling back to TERM"; |
1159 | 10 | *out = make_term_query(value_wstr); |
1160 | 10 | return Status::OK(); |
1161 | 10 | } |
1162 | | |
1163 | 113 | if (binding.index_properties.empty()) { |
1164 | 0 | LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE " |
1165 | 0 | "query on field '" |
1166 | 0 | << field_name << "'"; |
1167 | 0 | *out = make_term_query(value_wstr); |
1168 | 0 | return Status::OK(); |
1169 | 0 | } |
1170 | | |
1171 | 113 | std::vector<TermInfo> term_infos = |
1172 | 113 | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
1173 | 113 | value, binding.index_properties); |
1174 | 113 | if (term_infos.empty()) { |
1175 | 9 | LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field=" |
1176 | 9 | << field_name << ", value='" << value |
1177 | 9 | << "', returning empty BitSetQuery"; |
1178 | 9 | *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()); |
1179 | 9 | return Status::OK(); |
1180 | 9 | } |
1181 | | |
1182 | 104 | std::vector<TermInfo> phrase_term_infos = |
1183 | 104 | QueryHelper::build_phrase_term_infos(term_infos); |
1184 | 104 | if (phrase_term_infos.size() == 1) { |
1185 | 62 | const auto& term_info = phrase_term_infos[0]; |
1186 | 62 | if (term_info.is_single_term()) { |
1187 | 61 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
1188 | 61 | *out = std::make_shared<query_v2::TermQuery>(context, field_wstr, term_wstr); |
1189 | 61 | } else { |
1190 | 1 | auto builder = |
1191 | 1 | create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR); |
1192 | 1 | for (const auto& term : term_info.get_multi_terms()) { |
1193 | 0 | std::wstring term_wstr = StringHelper::to_wstring(term); |
1194 | 0 | builder->add(make_term_query(term_wstr), binding.binding_key); |
1195 | 0 | } |
1196 | 1 | *out = builder->build(); |
1197 | 1 | } |
1198 | 62 | } else { |
1199 | 42 | if (QueryHelper::is_simple_phrase(phrase_term_infos)) { |
1200 | 21 | *out = std::make_shared<query_v2::PhraseQuery>(context, field_wstr, |
1201 | 21 | phrase_term_infos); |
1202 | 21 | } else { |
1203 | 21 | *out = std::make_shared<query_v2::MultiPhraseQuery>(context, field_wstr, |
1204 | 21 | phrase_term_infos); |
1205 | 21 | } |
1206 | 42 | } |
1207 | | |
1208 | 104 | return Status::OK(); |
1209 | 113 | } |
1210 | 277 | if (clause_type == "MATCH") { |
1211 | 0 | VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM"; |
1212 | 0 | *out = make_term_query(value_wstr); |
1213 | 0 | return Status::OK(); |
1214 | 0 | } |
1215 | | |
1216 | 280 | if (clause_type == "ANY" || clause_type == "ALL") { |
1217 | 280 | bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
1218 | 280 | binding.index_properties); |
1219 | 280 | if (!should_analyze) { |
1220 | 1 | *out = make_term_query(value_wstr); |
1221 | 1 | return Status::OK(); |
1222 | 1 | } |
1223 | | |
1224 | 279 | if (binding.index_properties.empty()) { |
1225 | 0 | LOG(WARNING) << "search: index properties empty for tokenized clause '" |
1226 | 0 | << clause_type << "' field=" << field_name; |
1227 | 0 | *out = make_term_query(value_wstr); |
1228 | 0 | return Status::OK(); |
1229 | 0 | } |
1230 | | |
1231 | 279 | std::vector<TermInfo> term_infos = |
1232 | 279 | inverted_index::InvertedIndexAnalyzer::get_analyse_result( |
1233 | 279 | value, binding.index_properties); |
1234 | 279 | if (term_infos.empty()) { |
1235 | 0 | LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type |
1236 | 0 | << "', field=" << field_name << ", returning empty BitSetQuery"; |
1237 | 0 | *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()); |
1238 | 0 | return Status::OK(); |
1239 | 0 | } |
1240 | | |
1241 | 279 | query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR; |
1242 | 279 | if (clause_type == "ALL") { |
1243 | 191 | bool_type = query_v2::OperatorType::OP_AND; |
1244 | 191 | } |
1245 | | |
1246 | 279 | if (term_infos.size() == 1) { |
1247 | 108 | std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); |
1248 | 108 | *out = make_term_query(term_wstr); |
1249 | 108 | return Status::OK(); |
1250 | 108 | } |
1251 | | |
1252 | 171 | auto builder = create_operator_boolean_query_builder(bool_type); |
1253 | 392 | for (const auto& term_info : term_infos) { |
1254 | 392 | std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); |
1255 | 392 | builder->add(make_term_query(term_wstr), binding.binding_key); |
1256 | 392 | } |
1257 | 171 | *out = builder->build(); |
1258 | 171 | return Status::OK(); |
1259 | 279 | } |
1260 | | |
1261 | | // Default tokenized clause fallback |
1262 | 18.4E | *out = make_term_query(value_wstr); |
1263 | 18.4E | return Status::OK(); |
1264 | 277 | } |
1265 | | |
1266 | 193 | if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) { |
1267 | 173 | if (clause_type == "EXACT") { |
1268 | | // EXACT match: exact string matching without tokenization |
1269 | | // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase |
1270 | | // If only tokenized index exists, EXACT may return empty results because |
1271 | | // tokenized indexes store individual tokens, not complete strings |
1272 | 82 | *out = make_term_query(value_wstr); |
1273 | 82 | VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='" |
1274 | 0 | << value << "'"; |
1275 | 82 | return Status::OK(); |
1276 | 82 | } |
1277 | 91 | if (clause_type == "PREFIX") { |
1278 | | // Apply lowercase only if: |
1279 | | // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) |
1280 | | // 2. lower_case is explicitly set to "true" |
1281 | 36 | bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
1282 | 36 | binding.index_properties); |
1283 | 36 | std::string lowercase_setting = |
1284 | 36 | get_parser_lowercase_from_properties(binding.index_properties); |
1285 | 36 | bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); |
1286 | 36 | std::string pattern = should_lowercase ? to_lower(value) : value; |
1287 | 36 | *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern); |
1288 | 36 | VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" |
1289 | 0 | << pattern << "' (original='" << value << "', has_parser=" << has_parser |
1290 | 0 | << ", lower_case=" << lowercase_setting << ")"; |
1291 | 36 | return Status::OK(); |
1292 | 36 | } |
1293 | | |
1294 | 55 | if (clause_type == "WILDCARD") { |
1295 | | // Standalone wildcard "*" matches all non-null values for this field |
1296 | | // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery |
1297 | 23 | if (value == "*") { |
1298 | 0 | *out = std::make_shared<query_v2::AllQuery>(field_wstr, true); |
1299 | 0 | VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field=" |
1300 | 0 | << field_name; |
1301 | 0 | return Status::OK(); |
1302 | 0 | } |
1303 | | // Apply lowercase only if: |
1304 | | // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) |
1305 | | // 2. lower_case is explicitly set to "true" |
1306 | 23 | bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( |
1307 | 23 | binding.index_properties); |
1308 | 23 | std::string lowercase_setting = |
1309 | 23 | get_parser_lowercase_from_properties(binding.index_properties); |
1310 | 23 | bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); |
1311 | 23 | std::string pattern = should_lowercase ? to_lower(value) : value; |
1312 | 23 | *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern); |
1313 | 23 | VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='" |
1314 | 0 | << pattern << "' (original='" << value << "', has_parser=" << has_parser |
1315 | 0 | << ", lower_case=" << lowercase_setting << ")"; |
1316 | 23 | return Status::OK(); |
1317 | 23 | } |
1318 | | |
1319 | 32 | if (clause_type == "REGEXP") { |
1320 | | // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching) |
1321 | | // This matches ES query_string behavior where regex patterns bypass analysis |
1322 | 29 | *out = std::make_shared<query_v2::RegexpQuery>(context, field_wstr, value); |
1323 | 29 | VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" |
1324 | 0 | << value << "'"; |
1325 | 29 | return Status::OK(); |
1326 | 29 | } |
1327 | | |
1328 | 3 | if (clause_type == "RANGE" || clause_type == "LIST") { |
1329 | 3 | VLOG_DEBUG << "search: clause type '" << clause_type |
1330 | 0 | << "' not implemented, fallback to TERM"; |
1331 | 3 | } |
1332 | 3 | *out = make_term_query(value_wstr); |
1333 | 3 | return Status::OK(); |
1334 | 32 | } |
1335 | | |
1336 | 193 | LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback"; |
1337 | 20 | *out = make_term_query(value_wstr); |
1338 | 20 | return Status::OK(); |
1339 | 193 | } |
1340 | | |
// Registers the FunctionSearch function type with the supplied SimpleFunctionFactory.
1341 | 8 | void register_function_search(SimpleFunctionFactory& factory) { |
1342 | 8 | factory.register_function<FunctionSearch>(); |
1343 | 8 | } |
1344 | | |
1345 | | } // namespace doris |