Coverage Report

Created: 2026-03-17 11:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_search.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/function_search.h"
19
20
#include <CLucene/config/repl_wchar.h>
21
#include <CLucene/search/Scorer.h>
22
#include <gen_cpp/Exprs_types.h>
23
#include <glog/logging.h>
24
25
#include <limits>
26
#include <memory>
27
#include <roaring/roaring.hh>
28
#include <set>
29
#include <string>
30
#include <unordered_map>
31
#include <unordered_set>
32
#include <vector>
33
34
#include "common/status.h"
35
#include "core/block/columns_with_type_and_name.h"
36
#include "core/column/column_const.h"
37
#include "core/data_type/data_type_string.h"
38
#include "exprs/function/simple_function_factory.h"
39
#include "exprs/vexpr_context.h"
40
#include "storage/index/index_file_reader.h"
41
#include "storage/index/index_query_context.h"
42
#include "storage/index/inverted/analyzer/analyzer.h"
43
#include "storage/index/inverted/inverted_index_iterator.h"
44
#include "storage/index/inverted/inverted_index_parser.h"
45
#include "storage/index/inverted/inverted_index_reader.h"
46
#include "storage/index/inverted/inverted_index_searcher.h"
47
#include "storage/index/inverted/query/query_helper.h"
48
#include "storage/index/inverted/query_v2/all_query/all_query.h"
49
#include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h"
50
#include "storage/index/inverted/query_v2/boolean_query/boolean_query_builder.h"
51
#include "storage/index/inverted/query_v2/boolean_query/operator.h"
52
#include "storage/index/inverted/query_v2/collect/doc_set_collector.h"
53
#include "storage/index/inverted/query_v2/collect/top_k_collector.h"
54
#include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h"
55
#include "storage/index/inverted/query_v2/phrase_query/phrase_query.h"
56
#include "storage/index/inverted/query_v2/regexp_query/regexp_query.h"
57
#include "storage/index/inverted/query_v2/term_query/term_query.h"
58
#include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h"
59
#include "storage/index/inverted/util/string_helper.h"
60
#include "storage/segment/segment.h"
61
#include "storage/segment/variant/nested_group_path.h"
62
#include "storage/segment/variant/nested_group_provider.h"
63
#include "storage/segment/variant/variant_column_reader.h"
64
#include "storage/types.h"
65
#include "util/string_util.h"
66
#include "util/thrift_util.h"
67
68
namespace doris {
69
70
// Build canonical DSL signature for cache key.
71
// Serializes the entire TSearchParam via Thrift binary protocol so that
72
1.26k
// every field (DSL, AST root, field bindings, default_operator,
73
1.26k
// minimum_should_match, etc.) is included automatically.
74
1.26k
static std::string build_dsl_signature(const TSearchParam& param) {
75
1.26k
    ThriftSerializer ser(false, 1024);
76
1.26k
    TSearchParam copy = param;
77
1.26k
    std::string sig;
78
0
    auto st = ser.serialize(&copy, &sig);
79
0
    if (UNLIKELY(!st.ok())) {
80
0
        LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string()
81
0
                     << ", caching disabled for this query";
82
1.26k
        return "";
83
1.26k
    }
84
    return sig;
85
}
86
87
// Extract segment path prefix from the first available inverted index iterator.
88
1.24k
// All fields in the same segment share the same path prefix.
89
1.24k
static std::string extract_segment_prefix(
90
1.24k
        const std::unordered_map<std::string, IndexIterator*>& iterators) {
91
1.24k
    for (const auto& [field_name, iter] : iterators) {
92
        auto* inv_iter = dynamic_cast<InvertedIndexIterator*>(iter);
93
1.22k
        if (!inv_iter) continue;
94
1.53k
        // Try fulltext reader first, then string type
95
1.53k
        for (auto type :
96
1.53k
             {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) {
97
1.53k
            IndexReaderType reader_type = type;
98
1.21k
            auto reader = inv_iter->get_reader(reader_type);
99
1.21k
            if (!reader) continue;
100
1.21k
            auto inv_reader = std::dynamic_pointer_cast<InvertedIndexReader>(reader);
101
1.21k
            if (!inv_reader) continue;
102
1.21k
            auto file_reader = inv_reader->get_index_file_reader();
103
1.21k
            if (!file_reader) continue;
104
1.22k
            return file_reader->get_index_path_prefix();
105
18.4E
        }
106
18.4E
    }
107
23
    VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across "
108
1.24k
               << iterators.size() << " iterators, caching disabled for this query";
109
    return "";
110
}
111
112
3
namespace {
113
3
114
3
bool is_nested_group_search_supported() {
115
3
    auto provider = segment_v2::create_nested_group_read_provider();
116
    return provider != nullptr && provider->should_enable_nested_group_read_path();
117
}
118
119
1.06k
class ResolverNullBitmapAdapter final : public query_v2::NullBitmapResolver {
120
public:
121
    explicit ResolverNullBitmapAdapter(const FieldReaderResolver& resolver) : _resolver(resolver) {}
122
1.55k
123
1.55k
    segment_v2::IndexIterator* iterator_for(const query_v2::Scorer& /*scorer*/,
124
0
                                            const std::string& logical_field) const override {
125
0
        if (logical_field.empty()) {
126
1.55k
            return nullptr;
127
1.55k
        }
128
        return _resolver.get_iterator(logical_field);
129
    }
130
131
private:
132
    const FieldReaderResolver& _resolver;
133
};
134
1.06k
135
1.06k
void populate_binding_context(const FieldReaderResolver& resolver,
136
1.06k
                              query_v2::QueryExecutionContext* exec_ctx) {
137
1.06k
    DCHECK(exec_ctx != nullptr);
138
1.06k
    exec_ctx->readers = resolver.readers();
139
1.57k
    exec_ctx->reader_bindings = resolver.reader_bindings();
140
1.57k
    exec_ctx->field_reader_bindings = resolver.field_readers();
141
0
    for (const auto& [binding_key, binding] : resolver.binding_cache()) {
142
0
        if (binding_key.empty()) {
143
1.57k
            continue;
144
1.57k
        }
145
1.57k
        query_v2::FieldBindingContext binding_ctx;
146
1.57k
        binding_ctx.logical_field_name = binding.logical_field_name;
147
1.57k
        binding_ctx.stored_field_name = binding.stored_field_name;
148
1.57k
        binding_ctx.stored_field_wstr = binding.stored_field_wstr;
149
1.06k
        exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx));
150
    }
151
}
152
153
1.04k
// Assembles the execution context for one segment: row count, the resolver's
// reader/binding tables, and the (possibly null) NULL-bitmap resolver.
query_v2::QueryExecutionContext build_query_execution_context(
        uint32_t segment_num_rows, const FieldReaderResolver& resolver,
        query_v2::NullBitmapResolver* null_resolver) {
    query_v2::QueryExecutionContext result;
    result.segment_num_rows = segment_num_rows;
    result.null_resolver = null_resolver;
    populate_binding_context(resolver, &result);
    return result;
}
162
163
} // namespace
164
165
1.92k
// Resolves `field_name` to the inverted-index reader binding used to run a
// query of `query_type`, and stores the result in `*binding`.
//
// Outcomes:
//   - A binding for (stored field, query_type) is already cached: the cached
//     copy is returned.
//   - The field is a variant subcolumn and has no metadata entry or no
//     iterator in this segment: returns OK with a default-constructed binding,
//     which signals "no match".
//   - Any other lookup failure: returns an error Status.
//   - Otherwise: a reader is selected, its searcher is obtained through
//     InvertedIndexSearcherCache (built and inserted on a miss), and the new
//     binding is recorded in the resolver's tables before being returned.
//
// `binding` must not be null.
Status FieldReaderResolver::resolve(const std::string& field_name,
                                    InvertedIndexQueryType query_type,
                                    FieldReaderBinding* binding) {
    DCHECK(binding != nullptr);

    // A variant subcolumn may legitimately be absent from a segment, so failed
    // lookups for it are reported as "no match" rather than as errors.
    const bool variant_subcolumn = is_variant_subcolumn(field_name);

    const auto type_entry = _data_type_with_names.find(field_name);
    if (type_entry == _data_type_with_names.end()) {
        if (!variant_subcolumn) {
            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                    "field '{}' not found in inverted index metadata", field_name);
        }
        VLOG_DEBUG << "Variant subcolumn '" << field_name
                   << "' not found in this segment, treating as no match";
        *binding = FieldReaderBinding();
        return Status::OK();
    }

    const auto& stored_field_name = type_entry->second.first;
    const auto& column_type = type_entry->second.second;
    const auto binding_key = binding_key_for(stored_field_name, query_type);

    // Fast path: this (stored field, query type) pair was resolved before.
    if (const auto cached = _cache.find(binding_key); cached != _cache.end()) {
        *binding = cached->second;
        return Status::OK();
    }

    const auto iterator_entry = _iterators.find(field_name);
    const bool iterator_missing =
            iterator_entry == _iterators.end() || iterator_entry->second == nullptr;
    if (iterator_missing) {
        if (!variant_subcolumn) {
            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                    "iterator not found for field '{}'", field_name);
        }
        VLOG_DEBUG << "Variant subcolumn '" << field_name
                   << "' iterator not found in this segment, treating as no match";
        *binding = FieldReaderBinding();
        return Status::OK();
    }

    auto* inverted_iterator = dynamic_cast<InvertedIndexIterator*>(iterator_entry->second);
    if (inverted_iterator == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "iterator for field '{}' is not InvertedIndexIterator", field_name);
    }

    // Query-type adjustment for variant subcolumns.
    //
    // FE resolves the field pattern of a variant subcolumn to one specific index and ships
    // that index's properties in TSearchFieldBinding. If the chosen index is analyzer-based,
    // EQUAL_QUERY / WILDCARD_QUERY are promoted to MATCH_ANY_QUERY so that
    // select_best_reader picks the FULLTEXT reader instead of STRING_TYPE. Without the
    // promotion:
    //   - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory;
    //   - WILDCARD clauses would enumerate terms from the wrong index and return nothing.
    //
    // Regular (non-variant) columns with several indexes need no adjustment here: the
    // caller (build_leaf_query) already passes MATCH_ANY_QUERY for tokenized queries (TERM)
    // and EQUAL_QUERY for exact-match queries (EXACT), and that query type alone steers
    // select_best_reader between FULLTEXT and STRING_TYPE, so no analyzer key is required.
    const auto fe_binding = _field_binding_map.find(field_name);
    // Evaluated lazily so the binding is dereferenced only where a check is actually made.
    const auto has_fe_index_properties = [&]() {
        return fe_binding != _field_binding_map.end() &&
               fe_binding->second->__isset.index_properties &&
               !fe_binding->second->index_properties.empty();
    };

    InvertedIndexQueryType effective_query_type = query_type;
    std::string analyzer_key;
    if (variant_subcolumn && has_fe_index_properties()) {
        const auto& fe_properties = fe_binding->second->index_properties;
        analyzer_key = normalize_analyzer_key(build_analyzer_key_from_properties(fe_properties));
        const bool exact_or_wildcard = query_type == InvertedIndexQueryType::EQUAL_QUERY ||
                                       query_type == InvertedIndexQueryType::WILDCARD_QUERY;
        if (inverted_index::InvertedIndexAnalyzer::should_analyzer(fe_properties) &&
            exact_or_wildcard) {
            effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
        }
    }

    Result<InvertedIndexReaderPtr> reader_result;
    if (!column_type) {
        reader_result = inverted_iterator->select_best_reader(analyzer_key);
    } else {
        reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type,
                                                              analyzer_key);
    }
    if (!reader_result.has_value()) {
        return reader_result.error();
    }

    auto inverted_reader = reader_result.value();
    if (inverted_reader == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "selected reader is null for field '{}'", field_name);
    }

    auto index_file_reader = inverted_reader->get_index_file_reader();
    if (index_file_reader == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "index file reader is null for field '{}'", field_name);
    }

    // Borrows the lucene IndexReader of the fulltext searcher held by `handle`. The no-op
    // deleter leaves ownership with the searcher cache. Yields null when the handle does
    // not hold a non-null fulltext searcher.
    const auto borrow_lucene_reader =
            [](InvertedIndexCacheHandle& handle) -> std::shared_ptr<lucene::index::IndexReader> {
        auto searcher_variant = handle.get_index_searcher();
        auto* fulltext_searcher = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
        if (fulltext_searcher == nullptr || *fulltext_searcher == nullptr) {
            return nullptr;
        }
        return std::shared_ptr<lucene::index::IndexReader>(
                (*fulltext_searcher)->getReader(),
                [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ });
    };

    // Go through InvertedIndexSearcherCache so index files are not re-opened repeatedly.
    auto index_file_key =
            index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta());
    InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key);
    InvertedIndexCacheHandle searcher_cache_handle;
    auto* searcher_cache = InvertedIndexSearcherCache::instance();

    std::shared_ptr<lucene::index::IndexReader> reader_holder;
    if (searcher_cache->lookup(searcher_cache_key, &searcher_cache_handle)) {
        reader_holder = borrow_lucene_reader(searcher_cache_handle);
    }

    if (!reader_holder) {
        // No usable cached searcher: open the directory, build an IndexSearcher and
        // publish it to the cache.
        RETURN_IF_ERROR(
                index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx));
        auto directory = DORIS_TRY(
                index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx));

        auto searcher_builder = DORIS_TRY(
                IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type()));
        auto built_searcher = DORIS_TRY(searcher_builder->get_index_searcher(directory.get()));
        auto reader_size = searcher_builder->get_reader_size();

        auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(built_searcher),
                                                                       reader_size, UnixMillis());
        searcher_cache->insert(searcher_cache_key, cache_value, &searcher_cache_handle);

        reader_holder = borrow_lucene_reader(searcher_cache_handle);
        if (!reader_holder) {
            return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
                    "failed to build IndexSearcher for field '{}'", field_name);
        }
    }

    // Keep the handle so the cache entry backing `reader_holder` stays referenced.
    _searcher_cache_handles.push_back(std::move(searcher_cache_handle));

    FieldReaderBinding resolved;
    resolved.logical_field_name = field_name;
    resolved.stored_field_name = stored_field_name;
    resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name);
    resolved.column_type = column_type;
    resolved.query_type = effective_query_type;
    resolved.inverted_reader = inverted_reader;
    resolved.lucene_reader = reader_holder;
    // FE-provided index_properties win when present (needed for variant subcolumn
    // field_pattern matching); otherwise fall back to the reader's own properties.
    if (has_fe_index_properties()) {
        resolved.index_properties = fe_binding->second->index_properties;
    } else {
        resolved.index_properties = inverted_reader->get_index_properties();
    }
    resolved.binding_key = binding_key;
    resolved.analyzer_key =
            normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties));

    _binding_readers[binding_key] = reader_holder;
    _field_readers[resolved.stored_field_wstr] = reader_holder;
    _readers.emplace_back(reader_holder);
    _cache.emplace(binding_key, resolved);
    *binding = resolved;
    return Status::OK();
}
348
349
4
// Row-by-row evaluation entry point. It always fails with a RuntimeError and
// ignores every argument: per the error text, only inverted index queries are
// supported.
Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/,
                                    const ColumnNumbers& /*arguments*/, uint32_t /*result*/,
                                    size_t /*input_rows_count*/) const {
    return Status::RuntimeError("only inverted index queries are supported");
}
354
355
// Enhanced implementation: Handle new parameter structure (DSL + SlotReferences)
356
Status FunctionSearch::evaluate_inverted_index(
357
        const ColumnsWithTypeAndName& arguments,
358
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
359
1
        std::vector<IndexIterator*> iterators, uint32_t num_rows,
360
1
        const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
361
1
        InvertedIndexResultBitmap& bitmap_result) const {
362
    return Status::OK();
363
}
364
365
// Convenience overload: forwards to the extended overload, supplying a null
// index execution context and an empty field-name -> column-id map. Any
// further trailing parameters of the extended overload are left to the
// defaults given by its declaration (not visible here).
Status FunctionSearch::evaluate_inverted_index_with_search_param(
        const TSearchParam& search_param,
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const {
    // Shared empty map, passed by reference to the extended overload.
    static const std::unordered_map<std::string, int> no_field_column_ids;
    const IndexExecContext* const no_exec_ctx = nullptr;
    return evaluate_inverted_index_with_search_param(
            search_param, data_type_with_names, std::move(iterators), num_rows, bitmap_result,
            enable_cache, no_exec_ctx, no_field_column_ids);
}
375
376
Status FunctionSearch::evaluate_inverted_index_with_search_param(
377
        const TSearchParam& search_param,
378
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
379
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
380
1.27k
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache,
381
1.27k
        const IndexExecContext* index_exec_ctx,
382
1.27k
        const std::unordered_map<std::string, int>& field_name_to_column_id,
383
3
        const std::shared_ptr<IndexQueryContext>& index_query_context) const {
384
3
    const bool is_nested_query = search_param.root.clause_type == "NESTED";
385
3
    if (is_nested_query && !is_nested_group_search_supported()) {
386
        return Status::NotSupported(
387
1.27k
                "NESTED query requires NestedGroup support, which is unavailable in this build");
388
5
    }
389
5
390
5
    if (!is_nested_query && (iterators.empty() || data_type_with_names.empty())) {
391
5
        LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:"
392
5
                  << search_param.original_dsl;
393
5
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
394
                                                  std::make_shared<roaring::Roaring>());
395
        return Status::OK();
396
1.27k
    }
397
1.27k
398
1.27k
    // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type
399
1.27k
    auto* dsl_cache = enable_cache ? InvertedIndexQueryCache::instance() : nullptr;
400
1.27k
    std::string seg_prefix;
401
1.27k
    std::string dsl_sig;
402
1.24k
    InvertedIndexQueryCache::CacheKey dsl_cache_key;
403
1.24k
    bool cache_usable = false;
404
1.24k
    if (dsl_cache) {
405
1.22k
        seg_prefix = extract_segment_prefix(iterators);
406
1.22k
        dsl_sig = build_dsl_signature(search_param);
407
1.22k
        if (!seg_prefix.empty() && !dsl_sig.empty()) {
408
1.22k
            dsl_cache_key = InvertedIndexQueryCache::CacheKey {
409
1.22k
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
410
1.22k
                    dsl_sig};
411
171
            cache_usable = true;
412
171
            InvertedIndexQueryCacheHandle dsl_cache_handle;
413
            if (dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle)) {
414
                auto cached_bitmap = dsl_cache_handle.get_bitmap();
415
169
                if (cached_bitmap) {
416
169
                    // Also retrieve cached null bitmap for three-valued SQL logic
417
169
                    // (needed by compound operators NOT, OR, AND in VCompoundPred)
418
169
                    auto null_cache_key = InvertedIndexQueryCache::CacheKey {
419
169
                            seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
420
171
                            dsl_sig + "__null"};
421
171
                    InvertedIndexQueryCacheHandle null_cache_handle;
422
171
                    std::shared_ptr<roaring::Roaring> null_bitmap;
423
169
                    if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) {
424
0
                        null_bitmap = null_cache_handle.get_bitmap();
425
0
                    }
426
169
                    if (!null_bitmap) {
427
169
                        null_bitmap = std::make_shared<roaring::Roaring>();
428
169
                    }
429
169
                    bitmap_result =
430
171
                            InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap));
431
1.22k
                    return Status::OK();
432
1.24k
                }
433
            }
434
1.10k
        }
435
1.10k
    }
436
1.10k
437
    std::shared_ptr<IndexQueryContext> context;
438
    if (index_query_context) {
439
        context = index_query_context;
440
    } else {
441
        context = std::make_shared<IndexQueryContext>();
442
        context->collection_statistics = std::make_shared<CollectionStatistics>();
443
        context->collection_similarity = std::make_shared<CollectionSimilarity>();
444
    }
445
446
    // NESTED() queries evaluate predicates on the flattened "element space" of a nested group.
447
1.10k
    // For VARIANT nested groups, the indexed lucene field (stored_field_name) uses:
448
1.10k
    //   parent_unique_id + "." + <variant-relative nested path>
449
1.10k
    // where the nested path is rooted at either:
450
0
    //   - "__D0_root__" for top-level array<object> (NESTED(data, ...))
451
0
    //   - "<nested_path_after_variant_root>" for object fields (NESTED(data.items, ...))
452
0
    //
453
0
    // FE field bindings are expressed using logical column paths (e.g. "data.items.msg"), so for
454
0
    // NESTED() we normalize stored_field_name suffix to be consistent with the nested group root.
455
0
    std::unordered_map<std::string, IndexFieldNameAndTypePair> patched_data_type_with_names;
456
0
    const auto* effective_data_type_with_names = &data_type_with_names;
457
0
    if (is_nested_query && search_param.root.__isset.nested_path) {
458
        const std::string& nested_path = search_param.root.nested_path;
459
0
        const auto dot_pos = nested_path.find('.');
460
0
        const std::string root_field =
461
0
                (dot_pos == std::string::npos) ? nested_path : nested_path.substr(0, dot_pos);
462
0
        const std::string root_prefix = root_field + ".";
463
0
        const std::string array_path = (dot_pos == std::string::npos)
464
0
                                               ? std::string(segment_v2::kRootNestedGroupPath)
465
0
                                               : nested_path.substr(dot_pos + 1);
466
0
467
0
        bool copied = false;
468
0
        for (const auto& fb : search_param.field_bindings) {
469
0
            if (!fb.__isset.is_variant_subcolumn || !fb.is_variant_subcolumn) {
470
0
                continue;
471
0
            }
472
0
            if (fb.field_name.empty()) {
473
0
                continue;
474
0
            }
475
0
            const auto it_orig = data_type_with_names.find(fb.field_name);
476
0
            if (it_orig == data_type_with_names.end()) {
477
0
                continue;
478
0
            }
479
0
            const std::string& old_stored = it_orig->second.first;
480
0
            const auto first_dot = old_stored.find('.');
481
0
            if (first_dot == std::string::npos) {
482
0
                continue;
483
0
            }
484
0
            std::string sub_path;
485
0
            if (fb.__isset.subcolumn_path && !fb.subcolumn_path.empty()) {
486
0
                sub_path = fb.subcolumn_path;
487
0
            } else if (fb.field_name.starts_with(nested_path + ".")) {
488
0
                sub_path = fb.field_name.substr(nested_path.size() + 1);
489
0
            } else if (fb.field_name.starts_with(root_prefix)) {
490
0
                sub_path = fb.field_name.substr(root_prefix.size());
491
0
            } else {
492
0
                sub_path = fb.field_name;
493
0
            }
494
0
            if (sub_path.empty()) {
495
0
                continue;
496
0
            }
497
            const std::string array_prefix = array_path + ".";
498
0
            const std::string suffix_path =
499
0
                    sub_path.starts_with(array_prefix) ? sub_path : (array_prefix + sub_path);
500
0
            const std::string parent_uid = old_stored.substr(0, first_dot);
501
0
            const std::string expected_stored = parent_uid + "." + suffix_path;
502
0
            if (old_stored == expected_stored) {
503
0
                continue;
504
0
            }
505
0
506
0
            if (!copied) {
507
0
                patched_data_type_with_names = data_type_with_names;
508
0
                effective_data_type_with_names = &patched_data_type_with_names;
509
0
                copied = true;
510
            }
511
            auto it = patched_data_type_with_names.find(fb.field_name);
512
1.10k
            if (it == patched_data_type_with_names.end()) {
513
1.10k
                continue;
514
            }
515
1.10k
            it->second.first = expected_stored;
516
0
        }
517
0
    }
518
0
519
0
    // Pass field_bindings to resolver for variant subcolumn detection
520
0
    FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context,
521
0
                                 search_param.field_bindings);
522
0
523
0
    if (is_nested_query) {
524
0
        std::shared_ptr<roaring::Roaring> row_bitmap;
525
        RETURN_IF_ERROR(evaluate_nested_query(search_param, search_param.root, context, resolver,
526
                                              num_rows, index_exec_ctx, field_name_to_column_id,
527
1.10k
                                              row_bitmap));
528
1.10k
        bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap),
529
1.07k
                                                  std::make_shared<roaring::Roaring>());
530
1.07k
        bitmap_result.mask_out_null();
531
        return Status::OK();
532
1.10k
    }
533
1.10k
534
46
    // Extract default_operator from TSearchParam (default: "or")
535
46
    std::string default_operator = "or";
536
    if (search_param.__isset.default_operator && !search_param.default_operator.empty()) {
537
1.10k
        default_operator = search_param.default_operator;
538
1.10k
    }
539
1.10k
    // Extract minimum_should_match from TSearchParam (-1 means not set)
540
1.10k
    int32_t minimum_should_match = -1;
541
1.10k
    if (search_param.__isset.minimum_should_match) {
542
1.07k
        minimum_should_match = search_param.minimum_should_match;
543
0
    }
544
0
545
0
    query_v2::QueryPtr root_query;
546
0
    std::string root_binding_key;
547
0
    RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query,
548
0
                                          &root_binding_key, default_operator,
549
                                          minimum_should_match));
550
1.07k
    if (root_query == nullptr) {
551
1.07k
        LOG(INFO) << "search: Query tree resolved to empty query, dsl:"
552
1.07k
                  << search_param.original_dsl;
553
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
554
1.07k
                                                  std::make_shared<roaring::Roaring>());
555
1.07k
        return Status::OK();
556
0
    }
557
0
558
0
    ResolverNullBitmapAdapter null_resolver(resolver);
559
0
    query_v2::QueryExecutionContext exec_ctx =
560
0
            build_query_execution_context(num_rows, resolver, &null_resolver);
561
562
1.07k
    bool enable_scoring = false;
563
1.07k
    bool is_asc = false;
564
0
    size_t top_k = 0;
565
0
    if (index_query_context) {
566
0
        enable_scoring = index_query_context->collection_similarity != nullptr;
567
0
        is_asc = index_query_context->is_asc;
568
0
        top_k = index_query_context->query_limit;
569
    }
570
1.07k
571
1.07k
    auto weight = root_query->weight(enable_scoring);
572
1.07k
    if (!weight) {
573
2.94k
        LOG(WARNING) << "search: Failed to build query weight";
574
1.86k
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
575
1.86k
                                                  std::make_shared<roaring::Roaring>());
576
1.86k
        return Status::OK();
577
1.86k
    }
578
579
18.4E
    std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>();
580
    if (enable_scoring && !is_asc && top_k > 0) {
581
        bool use_wand = index_query_context->runtime_state != nullptr &&
582
                        index_query_context->runtime_state->query_options()
583
                                .enable_inverted_index_wand_query;
584
1.07k
        query_v2::collect_multi_segment_top_k(weight, exec_ctx, root_binding_key, top_k, roaring,
585
1.07k
                                              index_query_context->collection_similarity, use_wand);
586
225
    } else {
587
225
        query_v2::collect_multi_segment_doc_set(
588
225
                weight, exec_ctx, root_binding_key, roaring,
589
18.4E
                index_query_context ? index_query_context->collection_similarity : nullptr,
590
18.4E
                enable_scoring);
591
225
    }
592
225
593
    VLOG_DEBUG << "search: Query completed, matched " << roaring->cardinality() << " documents";
594
18.4E
595
18.4E
    // Extract NULL bitmap from three-valued logic scorer
596
    // The scorer correctly computes which documents evaluate to NULL based on query logic
597
    // For example: TRUE OR NULL = TRUE (not NULL), FALSE OR NULL = NULL
598
1.07k
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
599
1.07k
    if (exec_ctx.null_resolver) {
600
        auto scorer = weight->scorer(exec_ctx, root_binding_key);
601
18.4E
        if (scorer && scorer->has_null_bitmap(exec_ctx.null_resolver)) {
602
18.4E
            const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver);
603
            if (bitmap != nullptr) {
604
                *null_bitmap = *bitmap;
605
                VLOG_TRACE << "search: Extracted NULL bitmap with " << null_bitmap->cardinality()
606
                           << " NULL documents";
607
1.07k
            }
608
1.07k
        }
609
1.07k
    }
610
1.08k
611
1.08k
    VLOG_TRACE << "search: Before mask - true_bitmap=" << roaring->cardinality()
612
1.08k
               << ", null_bitmap=" << null_bitmap->cardinality();
613
1.08k
614
1.08k
    // Create result and mask out NULLs (SQL WHERE clause semantics: only TRUE rows)
615
1.08k
    bitmap_result = InvertedIndexResultBitmap(std::move(roaring), std::move(null_bitmap));
616
1.08k
    bitmap_result.mask_out_null();
617
1.07k
618
    VLOG_TRACE << "search: After mask - result_bitmap="
619
1.07k
               << bitmap_result.get_data_bitmap()->cardinality();
620
1.07k
621
    // Insert post-mask_out_null result into DSL cache for future reuse
622
    // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND)
623
    // can apply correct three-valued SQL logic on cache hit
624
    if (dsl_cache && cache_usable) {
625
        InvertedIndexQueryCacheHandle insert_handle;
626
        dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle);
627
7
        if (bitmap_result.get_null_bitmap()) {
628
7
            auto null_cache_key = InvertedIndexQueryCache::CacheKey {
629
7
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
630
2
                    dsl_sig + "__null"};
631
2
            InvertedIndexQueryCacheHandle null_insert_handle;
632
5
            dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle);
633
2
        }
634
2
    }
635
3
636
2
    return Status::OK();
637
2
}
638
1
639
1
/// Evaluates a NESTED search clause and writes the matching parent rows to `result_bitmap`.
///
/// The first child of `nested_clause` is built into a query and executed over the elements
/// of the nested group addressed by `nested_clause.nested_path` ("<column>" or
/// "<column>.<array path>"). Element hits, minus the elements the scorer reports as NULL,
/// are then mapped back to parent ordinals through the nested group chain.
///
/// @param search_param   Supplies the optional default_operator / minimum_should_match.
/// @param nested_clause  Clause carrying nested_path and the inner query in children[0].
/// @param context        Forwarded to build_query_recursive.
/// @param resolver       Resolves field readers and backs the NULL-bitmap adapter.
/// @param num_rows       Not referenced by this function.
/// @param index_exec_ctx Must be non-null and expose a non-null segment.
/// @param field_name_to_column_id Unused.
/// @param result_bitmap  Output. Allocated when null, otherwise reset to empty once the
///                       clause has passed validation; it stays empty on every early
///                       Status::OK() return and on errors raised after that point.
/// @return InvalidArgument for a malformed clause, a missing exec context/segment, an unknown
///         or non-VARIANT root column, or an element count above uint32_t max; NotSupported
///         when the nested group read path is disabled; any error propagated from the
///         helpers; Status::OK() otherwise.
Status FunctionSearch::evaluate_nested_query(
        const TSearchParam& search_param, const TSearchClause& nested_clause,
        const std::shared_ptr<IndexQueryContext>& context, FieldReaderResolver& resolver,
        uint32_t num_rows, const IndexExecContext* index_exec_ctx,
        const std::unordered_map<std::string, int>& field_name_to_column_id,
        std::shared_ptr<roaring::Roaring>& result_bitmap) const {
    (void)field_name_to_column_id;
    if (!(nested_clause.__isset.nested_path)) {
        return Status::InvalidArgument("NESTED clause missing nested_path");
    }
    if (!(nested_clause.__isset.children) || nested_clause.children.empty()) {
        return Status::InvalidArgument("NESTED clause missing inner query");
    }

    // From here on the output is always a valid, empty bitmap, so every early return below
    // reports "no matches" and the final assignment never needs another null check.
    if (result_bitmap == nullptr) {
        result_bitmap = std::make_shared<roaring::Roaring>();
    } else {
        *result_bitmap = roaring::Roaring();
    }

    // 1. Resolve the root column and the nested group chain.
    // substr(0, npos) yields the whole path, so a path without '.' is its own root field.
    const auto dot_pos = nested_clause.nested_path.find('.');
    const std::string root_field = nested_clause.nested_path.substr(0, dot_pos);

    if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) {
        return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment");
    }
    auto* segment = index_exec_ctx->segment();
    const int32_t ordinal = segment->tablet_schema()->field_index(root_field);
    if (ordinal < 0) {
        return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query",
                                       root_field);
    }
    const ColumnId column_id = static_cast<ColumnId>(ordinal);

    std::shared_ptr<segment_v2::ColumnReader> column_reader;
    RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id),
                                               &column_reader,
                                               index_exec_ctx->column_iter_opts().stats));
    auto* variant_reader = dynamic_cast<segment_v2::VariantColumnReader*>(column_reader.get());
    if (variant_reader == nullptr) {
        return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field);
    }

    // A path naming only the column addresses the root nested group.
    std::string array_path;
    if (dot_pos == std::string::npos) {
        array_path = std::string(segment_v2::kRootNestedGroupPath);
    } else {
        array_path = nested_clause.nested_path.substr(dot_pos + 1);
    }

    auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path);
    if (!found || group_chain.empty()) {
        return Status::OK();
    }

    // 2. Count the elements of the leaf group through the read provider.
    auto read_provider = segment_v2::create_nested_group_read_provider();
    if (!read_provider || !read_provider->should_enable_nested_group_read_path()) {
        return Status::NotSupported(
                "NestedGroup search is an enterprise capability, not available in this build");
    }

    auto& leaf_group = group_chain.back();
    uint64_t total_elements = 0;
    RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(),
                                                      leaf_group, &total_elements));
    if (total_elements == 0) {
        return Status::OK();
    }

    // 3. Build and evaluate the inner query over the element space.
    std::string default_operator = "or";
    if (search_param.__isset.default_operator && !search_param.default_operator.empty()) {
        default_operator = search_param.default_operator;
    }
    int32_t minimum_should_match = -1;
    if (search_param.__isset.minimum_should_match) {
        minimum_should_match = search_param.minimum_should_match;
    }

    query_v2::QueryPtr inner_query;
    std::string inner_binding_key;
    RETURN_IF_ERROR(build_query_recursive(nested_clause.children[0], context, resolver,
                                          &inner_query, &inner_binding_key, default_operator,
                                          minimum_should_match));
    if (inner_query == nullptr) {
        return Status::OK();
    }

    // The execution context takes a 32-bit count, so reject anything the cast would truncate.
    if (total_elements > std::numeric_limits<uint32_t>::max()) {
        return Status::InvalidArgument("nested element_count exceeds uint32_t max");
    }

    ResolverNullBitmapAdapter null_resolver(resolver);
    query_v2::QueryExecutionContext exec_ctx = build_query_execution_context(
            static_cast<uint32_t>(total_elements), resolver, &null_resolver);

    // Scoring is not requested for the inner query; only the matching ordinals are used.
    auto weight = inner_query->weight(false);
    if (!weight) {
        return Status::OK();
    }
    auto scorer = weight->scorer(exec_ctx, inner_binding_key);
    if (!scorer) {
        return Status::OK();
    }

    roaring::Roaring element_bitmap;
    for (uint32_t doc = scorer->doc(); doc != query_v2::TERMINATED; doc = scorer->advance()) {
        element_bitmap.add(doc);
    }

    // Elements the scorer reports as NULL are not counted as hits.
    if (scorer->has_null_bitmap(exec_ctx.null_resolver)) {
        const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver);
        if (bitmap != nullptr && !bitmap->isEmpty()) {
            element_bitmap -= *bitmap;
        }
    }

    // 4. Map element-level hits back to row-level hits through the NestedGroup chain.
    roaring::Roaring parent_bitmap;
    RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords(
            group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap));
    *result_bitmap = std::move(parent_bitmap);
    return Status::OK();
}
771
18.4E
772
// Aligned with FE QsClauseType enum - uses enum.name() as clause_type
773
18.4E
FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category(
774
18.4E
        const std::string& clause_type) const {
775
18.4E
    if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
776
18.4E
        clause_type == "OCCUR_BOOLEAN" || clause_type == "NESTED") {
777
7.19k
        return ClauseTypeCategory::COMPOUND;
778
    } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" ||
779
               clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" ||
780
               clause_type == "EXACT") {
781
5.28k
        // Non-tokenized queries: exact matching, pattern matching, range, list operations
782
5.28k
        return ClauseTypeCategory::NON_TOKENIZED;
783
5.28k
    } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" ||
784
               clause_type == "ALL") {
785
        // Tokenized queries: phrase search, full-text search, multi-value matching
786
5.28k
        // Note: ANY and ALL require tokenization of their input values
787
        return ClauseTypeCategory::TOKENIZED;
788
5.14k
    } else {
789
        // Default to NON_TOKENIZED for unknown types
790
163
        LOG(WARNING) << "Unknown clause type '" << clause_type
791
163
                     << "', defaulting to NON_TOKENIZED category";
792
5.14k
        return ClauseTypeCategory::NON_TOKENIZED;
793
    }
794
}
795
5.11k
796
5.09k
// Analyze query type for a specific field in the search clause
797
InvertedIndexQueryType FunctionSearch::analyze_field_query_type(const std::string& field_name,
798
5.09k
                                                                const TSearchClause& clause) const {
799
    const std::string& clause_type = clause.clause_type;
800
5.09k
    ClauseTypeCategory category = get_clause_type_category(clause_type);
801
124
802
124
    // Handle leaf queries - use direct mapping
803
5.09k
    if (category != ClauseTypeCategory::COMPOUND) {
804
132
        // Check if this clause targets the specific field
805
        if (clause.field_name == field_name) {
806
            // Use direct mapping from clause_type to InvertedIndexQueryType
807
4.99k
            return clause_type_to_query_type(clause_type);
808
5.11k
        }
809
    }
810
811
    // Handle boolean queries - recursively analyze children
812
2.10k
    if (!clause.children.empty()) {
813
        for (const auto& child_clause : clause.children) {
814
2.10k
            // Recursively analyze each child
815
            InvertedIndexQueryType child_type = analyze_field_query_type(field_name, child_clause);
816
2.10k
            // If this child targets the field (not default EQUAL_QUERY), return its query type
817
2.10k
            if (child_type != InvertedIndexQueryType::UNKNOWN_QUERY) {
818
2.10k
                return child_type;
819
2.10k
            }
820
2.10k
        }
821
    }
822
823
2.10k
    // If no children target this field, return UNKNOWN_QUERY as default
824
2.10k
    return InvertedIndexQueryType::UNKNOWN_QUERY;
825
2.10k
}
826
2.10k
827
2.10k
// Map clause_type string to InvertedIndexQueryType
828
2.10k
InvertedIndexQueryType FunctionSearch::clause_type_to_query_type(
829
        const std::string& clause_type) const {
830
    // Use static map for better performance and maintainability
831
2.10k
    static const std::unordered_map<std::string, InvertedIndexQueryType> clause_type_map = {
832
2.10k
            // Boolean operations
833
2.10k
            {"AND", InvertedIndexQueryType::BOOLEAN_QUERY},
834
2.10k
            {"OR", InvertedIndexQueryType::BOOLEAN_QUERY},
835
            {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY},
836
            {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY},
837
2.10k
            {"NESTED", InvertedIndexQueryType::BOOLEAN_QUERY},
838
2.10k
839
            // Non-tokenized queries (exact matching, pattern matching)
840
2.10k
            {"TERM", InvertedIndexQueryType::EQUAL_QUERY},
841
2.10k
            {"PREFIX", InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY},
842
2.10k
            {"WILDCARD", InvertedIndexQueryType::WILDCARD_QUERY},
843
2.10k
            {"REGEXP", InvertedIndexQueryType::MATCH_REGEXP_QUERY},
844
            {"RANGE", InvertedIndexQueryType::RANGE_QUERY},
845
            {"LIST", InvertedIndexQueryType::LIST_QUERY},
846
2.10k
847
2
            // Tokenized queries (full-text search, phrase search)
848
2.10k
            {"PHRASE", InvertedIndexQueryType::MATCH_PHRASE_QUERY},
849
            {"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY},
850
            {"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY},
851
876
            {"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY},
852
876
853
298
            // Exact match without tokenization
854
298
            {"EXACT", InvertedIndexQueryType::EQUAL_QUERY},
855
511
    };
856
511
857
63
    auto it = clause_type_map.find(clause_type);
858
63
    if (it != clause_type_map.end()) {
859
0
        return it->second;
860
0
    }
861
876
862
876
    // Unknown clause type
863
    LOG(WARNING) << "Unknown clause type '" << clause_type << "', defaulting to EQUAL_QUERY";
864
    return InvertedIndexQueryType::EQUAL_QUERY;
865
}
866
867
// Map Thrift TSearchOccur to query_v2::Occur
868
static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) {
869
    switch (thrift_occur) {
870
2.73k
    case TSearchOccur::MUST:
871
2.73k
        return query_v2::Occur::MUST;
872
2.73k
    case TSearchOccur::SHOULD:
873
2.76k
        return query_v2::Occur::SHOULD;
874
2.76k
    case TSearchOccur::MUST_NOT:
875
2.76k
        return query_v2::Occur::MUST_NOT;
876
    default:
877
2.73k
        return query_v2::Occur::MUST;
878
    }
879
}
880
2.73k
881
40
// Recursively converts a TSearchClause tree into a query_v2 query.
//
// - MATCH_ALL_DOCS yields an AllQuery.
// - OCCUR_BOOLEAN yields an occur-boolean query (MUST / SHOULD / MUST_NOT per child).
// - NESTED is rejected here with InvalidArgument.
// - AND / OR / NOT yield an operator-boolean query over the children.
// - Anything else is handed to build_leaf_query().
//
// `*out` is reset to nullptr and `*binding_key` (when provided) is cleared before any
// work is done. `default_operator` and `minimum_should_match` are passed unchanged to
// every child and to the leaf builder. The first child error aborts the build.
Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
                                             const std::shared_ptr<IndexQueryContext>& context,
                                             FieldReaderResolver& resolver,
                                             inverted_index::query_v2::QueryPtr* out,
                                             std::string* binding_key,
                                             const std::string& default_operator,
                                             int32_t minimum_should_match) const {
    DCHECK(out != nullptr);
    *out = nullptr;
    if (binding_key) {
        binding_key->clear();
    }

    const std::string& type = clause.clause_type;

    // Builds one child clause with the settings inherited from this call.
    auto build_child = [&](const TSearchClause& sub_clause, query_v2::QueryPtr* sub_query,
                           std::string* sub_key) -> Status {
        return build_query_recursive(sub_clause, context, resolver, sub_query, sub_key,
                                     default_operator, minimum_should_match);
    };

    // Matches all documents in the segment.
    if (type == "MATCH_ALL_DOCS") {
        *out = std::make_shared<query_v2::AllQuery>();
        return Status::OK();
    }

    if (type == "NESTED") {
        return Status::InvalidArgument("NESTED clause must be evaluated at top level");
    }

    // Lucene-style boolean query with MUST/SHOULD/MUST_NOT children.
    if (type == "OCCUR_BOOLEAN") {
        auto occur_builder =
                segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();

        if (clause.__isset.minimum_should_match) {
            occur_builder->set_minimum_number_should_match(clause.minimum_should_match);
        }

        if (clause.__isset.children) {
            for (const auto& sub_clause : clause.children) {
                query_v2::QueryPtr sub_query;
                std::string sub_key;
                RETURN_IF_ERROR(build_child(sub_clause, &sub_query, &sub_key));

                // A child without an explicit occur is treated as MUST.
                const query_v2::Occur occur = sub_clause.__isset.occur
                                                      ? map_thrift_occur(sub_clause.occur)
                                                      : query_v2::Occur::MUST;
                occur_builder->add(sub_query, occur, std::move(sub_key));
            }
        }

        *out = occur_builder->build();
        return Status::OK();
    }

    // Everything that is not a standard boolean operator is a leaf clause.
    const bool is_or = type == "OR";
    const bool is_not = type == "NOT";
    if (type != "AND" && !is_or && !is_not) {
        return build_leaf_query(clause, context, resolver, out, binding_key, default_operator,
                                minimum_should_match);
    }

    const query_v2::OperatorType op =
            is_or ? query_v2::OperatorType::OP_OR
                  : (is_not ? query_v2::OperatorType::OP_NOT : query_v2::OperatorType::OP_AND);

    auto op_builder = create_operator_boolean_query_builder(op);
    if (clause.__isset.children) {
        for (const auto& sub_clause : clause.children) {
            query_v2::QueryPtr sub_query;
            std::string sub_key;
            RETURN_IF_ERROR(build_child(sub_clause, &sub_query, &sub_key));
            // Every child is added, including an empty BitSetQuery; the boolean
            // query deals with empty operands:
            // - AND with empty bitmap → result is empty
            // - OR with empty bitmap → empty bitmap is ignored by OR logic
            // - NOT with empty bitmap → NOT(empty) = all rows
            op_builder->add(sub_query, std::move(sub_key));
        }
    }

    *out = op_builder->build();
    return Status::OK();
}
970
971
1.93k
Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
972
1.93k
                                        const std::shared_ptr<IndexQueryContext>& context,
973
1.93k
                                        FieldReaderResolver& resolver,
974
                                        inverted_index::query_v2::QueryPtr* out,
975
1.93k
                                        std::string* binding_key,
976
                                        const std::string& default_operator,
977
                                        int32_t minimum_should_match) const {
978
    DCHECK(out != nullptr);
979
    *out = nullptr;
980
    if (binding_key) {
981
        binding_key->clear();
982
    }
983
984
    if (!clause.__isset.field_name || !clause.__isset.value) {
985
        return Status::InvalidArgument("search clause missing field_name or value");
986
    }
987
1.93k
988
1.93k
    const std::string& field_name = clause.field_name;
989
1.42k
    const std::string& value = clause.value;
990
1.42k
    const std::string& clause_type = clause.clause_type;
991
992
1.93k
    auto query_type = clause_type_to_query_type(clause_type);
993
1.93k
    // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual index terms
994
    // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery).
995
    // Override to MATCH_ANY_QUERY so select_best_reader() prefers the FULLTEXT reader
996
1.90k
    // when multiple indexes exist on the same column (one tokenized, one untokenized).
997
3
    // Without this, these queries would select the untokenized index and try to match
998
3
    // patterns like "h*llo" against full strings ("hello world") instead of individual
999
3
    // tokens ("hello"), returning empty results.
1000
    // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE reader.
1001
3
    //
1002
3
    // Safe for single-index columns: select_best_reader() has a single-reader fast path
1003
3
    // that returns the only reader directly, bypassing the query_type preference logic.
1004
3
    if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type == "PREFIX" ||
1005
3
        clause_type == "REGEXP") {
1006
3
        query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
1007
    }
1008
1.90k
1009
1.88k
    FieldReaderBinding binding;
1010
1.88k
    RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding));
1011
1012
1.90k
    // Check if binding is empty (variant subcolumn not found in this segment)
1013
1.90k
    if (binding.lucene_reader == nullptr) {
1014
1.90k
        LOG(INFO) << "search: No inverted index for field '" << field_name
1015
                  << "' in this segment, clause_type='" << clause_type
1016
1.91k
                  << "', query_type=" << static_cast<int>(query_type) << ", returning no matches";
1017
1.91k
        // Variant subcolumn doesn't exist - create empty BitSetQuery (no matches)
1018
1.91k
        *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1019
        if (binding_key) {
1020
1.90k
            binding_key->clear();
1021
1.32k
        }
1022
1.32k
        return Status::OK();
1023
1.32k
    }
1024
1.06k
1025
0
    if (binding_key) {
1026
0
        *binding_key = binding.binding_key;
1027
0
    }
1028
0
1029
0
    FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type);
1030
    std::wstring field_wstr = binding.stored_field_wstr;
1031
1.06k
    std::wstring value_wstr = StringHelper::to_wstring(value);
1032
1.06k
1033
1.06k
    auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr {
1034
1.06k
        return std::make_shared<query_v2::TermQuery>(context, field_wstr, term);
1035
0
    };
1036
0
1037
0
    if (clause_type == "TERM") {
1038
0
        bool should_analyze =
1039
0
                inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
1040
0
        if (should_analyze) {
1041
            if (binding.index_properties.empty()) {
1042
1.07k
                LOG(WARNING) << "search: analyzer required but index properties empty for field '"
1043
1.07k
                             << field_name << "'";
1044
1.07k
                *out = make_term_query(value_wstr);
1045
1.07k
                return Status::OK();
1046
1.07k
            }
1047
1048
            std::vector<TermInfo> term_infos =
1049
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1050
18.4E
                            value, binding.index_properties);
1051
0
            if (term_infos.empty()) {
1052
0
                LOG(WARNING) << "search: No terms found after tokenization for TERM query, field="
1053
0
                             << field_name << ", value='" << value
1054
0
                             << "', returning empty BitSetQuery";
1055
0
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1056
0
                return Status::OK();
1057
0
            }
1058
0
1059
0
            if (term_infos.size() == 1) {
1060
0
                std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
1061
0
                *out = make_term_query(term_wstr);
1062
0
                return Status::OK();
1063
            }
1064
1065
18.4E
            // When minimum_should_match is specified, use OccurBooleanQuery
1066
18.4E
            // ES behavior: msm only applies to SHOULD clauses
1067
18.4E
            if (minimum_should_match > 0) {
1068
18.4E
                auto builder =
1069
18.4E
                        segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
1070
8
                builder->set_minimum_number_should_match(minimum_should_match);
1071
8
                query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST
1072
8
                                                                    : query_v2::Occur::SHOULD;
1073
                for (const auto& term_info : term_infos) {
1074
18.4E
                    std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1075
18.4E
                    builder->add(make_term_query(term_wstr), occur);
1076
18.4E
                }
1077
                *out = builder->build();
1078
262
                return Status::OK();
1079
262
            }
1080
1.32k
1081
            // Use default_operator to determine how to combine tokenized terms
1082
578
            query_v2::OperatorType op_type = (default_operator == "and")
1083
399
                                                     ? query_v2::OperatorType::OP_AND
1084
122
                                                     : query_v2::OperatorType::OP_OR;
1085
122
            auto builder = create_operator_boolean_query_builder(op_type);
1086
122
            for (const auto& term_info : term_infos) {
1087
11
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1088
0
                builder->add(make_term_query(term_wstr), binding.binding_key);
1089
11
            }
1090
11
1091
11
            *out = builder->build();
1092
            return Status::OK();
1093
111
        }
1094
0
1095
0
        *out = make_term_query(value_wstr);
1096
0
        return Status::OK();
1097
0
    }
1098
0
1099
0
    if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) {
1100
        if (clause_type == "PHRASE") {
1101
111
            bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1102
111
                    binding.index_properties);
1103
111
            if (!should_analyze) {
1104
111
                VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name
1105
9
                           << "', falling back to TERM";
1106
9
                *out = make_term_query(value_wstr);
1107
9
                return Status::OK();
1108
9
            }
1109
9
1110
9
            if (binding.index_properties.empty()) {
1111
                LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE "
1112
102
                                "query on field '"
1113
102
                             << field_name << "'";
1114
102
                *out = make_term_query(value_wstr);
1115
62
                return Status::OK();
1116
62
            }
1117
62
1118
62
            std::vector<TermInfo> term_infos =
1119
62
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1120
0
                            value, binding.index_properties);
1121
0
            if (term_infos.empty()) {
1122
0
                LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field="
1123
0
                             << field_name << ", value='" << value
1124
0
                             << "', returning empty BitSetQuery";
1125
0
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1126
0
                return Status::OK();
1127
0
            }
1128
62
1129
40
            std::vector<TermInfo> phrase_term_infos =
1130
21
                    QueryHelper::build_phrase_term_infos(term_infos);
1131
21
            if (phrase_term_infos.size() == 1) {
1132
21
                const auto& term_info = phrase_term_infos[0];
1133
19
                if (term_info.is_single_term()) {
1134
19
                    std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1135
19
                    *out = std::make_shared<query_v2::TermQuery>(context, field_wstr, term_wstr);
1136
40
                } else {
1137
                    auto builder =
1138
102
                            create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR);
1139
111
                    for (const auto& term : term_info.get_multi_terms()) {
1140
277
                        std::wstring term_wstr = StringHelper::to_wstring(term);
1141
0
                        builder->add(make_term_query(term_wstr), binding.binding_key);
1142
0
                    }
1143
0
                    *out = builder->build();
1144
0
                }
1145
            } else {
1146
278
                if (QueryHelper::is_simple_phrase(phrase_term_infos)) {
1147
278
                    *out = std::make_shared<query_v2::PhraseQuery>(context, field_wstr,
1148
278
                                                                   phrase_term_infos);
1149
278
                } else {
1150
1
                    *out = std::make_shared<query_v2::MultiPhraseQuery>(context, field_wstr,
1151
1
                                                                        phrase_term_infos);
1152
1
                }
1153
            }
1154
277
1155
0
            return Status::OK();
1156
0
        }
1157
0
        if (clause_type == "MATCH") {
1158
0
            VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM";
1159
0
            *out = make_term_query(value_wstr);
1160
            return Status::OK();
1161
277
        }
1162
277
1163
277
        if (clause_type == "ANY" || clause_type == "ALL") {
1164
277
            bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1165
0
                    binding.index_properties);
1166
0
            if (!should_analyze) {
1167
0
                *out = make_term_query(value_wstr);
1168
0
                return Status::OK();
1169
0
            }
1170
1171
277
            if (binding.index_properties.empty()) {
1172
277
                LOG(WARNING) << "search: index properties empty for tokenized clause '"
1173
185
                             << clause_type << "' field=" << field_name;
1174
185
                *out = make_term_query(value_wstr);
1175
                return Status::OK();
1176
277
            }
1177
105
1178
105
            std::vector<TermInfo> term_infos =
1179
105
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1180
105
                            value, binding.index_properties);
1181
            if (term_infos.empty()) {
1182
172
                LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type
1183
387
                             << "', field=" << field_name << ", returning empty BitSetQuery";
1184
387
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1185
387
                return Status::OK();
1186
387
            }
1187
172
1188
172
            query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR;
1189
277
            if (clause_type == "ALL") {
1190
                bool_type = query_v2::OperatorType::OP_AND;
1191
            }
1192
18.4E
1193
18.4E
            if (term_infos.size() == 1) {
1194
277
                std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
1195
                *out = make_term_query(term_wstr);
1196
179
                return Status::OK();
1197
168
            }
1198
1199
            auto builder = create_operator_boolean_query_builder(bool_type);
1200
            for (const auto& term_info : term_infos) {
1201
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1202
79
                builder->add(make_term_query(term_wstr), binding.binding_key);
1203
79
            }
1204
0
            *out = builder->build();
1205
79
            return Status::OK();
1206
79
        }
1207
89
1208
        // Default tokenized clause fallback
1209
        *out = make_term_query(value_wstr);
1210
        return Status::OK();
1211
36
    }
1212
36
1213
36
    if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) {
1214
36
        if (clause_type == "EXACT") {
1215
36
            // EXACT match: exact string matching without tokenization
1216
36
            // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase
1217
36
            // If only tokenized index exists, EXACT may return empty results because
1218
36
            // tokenized indexes store individual tokens, not complete strings
1219
0
            *out = make_term_query(value_wstr);
1220
0
            VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='"
1221
36
                       << value << "'";
1222
36
            return Status::OK();
1223
        }
1224
53
        if (clause_type == "PREFIX") {
1225
            // Apply lowercase only if:
1226
            // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing)
1227
23
            // 2. lower_case is explicitly set to "true"
1228
0
            bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1229
0
                    binding.index_properties);
1230
0
            std::string lowercase_setting =
1231
0
                    get_parser_lowercase_from_properties(binding.index_properties);
1232
0
            bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
1233
            std::string pattern = should_lowercase ? to_lower(value) : value;
1234
            *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern);
1235
            VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='"
1236
23
                       << pattern << "' (original='" << value << "', has_parser=" << has_parser
1237
23
                       << ", lower_case=" << lowercase_setting << ")";
1238
23
            return Status::OK();
1239
23
        }
1240
23
1241
23
        if (clause_type == "WILDCARD") {
1242
23
            // Standalone wildcard "*" matches all non-null values for this field
1243
23
            // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery
1244
0
            if (value == "*") {
1245
0
                *out = std::make_shared<query_v2::AllQuery>(field_wstr, true);
1246
23
                VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field="
1247
23
                           << field_name;
1248
                return Status::OK();
1249
30
            }
1250
            // Apply lowercase only if:
1251
            // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing)
1252
28
            // 2. lower_case is explicitly set to "true"
1253
28
            bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1254
0
                    binding.index_properties);
1255
28
            std::string lowercase_setting =
1256
28
                    get_parser_lowercase_from_properties(binding.index_properties);
1257
            bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
1258
2
            std::string pattern = should_lowercase ? to_lower(value) : value;
1259
18.4E
            *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern);
1260
18.4E
            VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='"
1261
2
                       << pattern << "' (original='" << value << "', has_parser=" << has_parser
1262
2
                       << ", lower_case=" << lowercase_setting << ")";
1263
2
            return Status::OK();
1264
30
        }
1265
1266
179
        if (clause_type == "REGEXP") {
1267
11
            // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching)
1268
11
            // This matches ES query_string behavior where regex patterns bypass analysis
1269
179
            *out = std::make_shared<query_v2::RegexpQuery>(context, field_wstr, value);
1270
            VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='"
1271
8
                       << value << "'";
1272
8
            return Status::OK();
1273
8
        }
1274
1275
        if (clause_type == "RANGE" || clause_type == "LIST") {
1276
            VLOG_DEBUG << "search: clause type '" << clause_type
1277
                       << "' not implemented, fallback to TERM";
1278
        }
1279
        *out = make_term_query(value_wstr);
1280
        return Status::OK();
1281
    }
1282
1283
    LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback";
1284
    *out = make_term_query(value_wstr);
1285
    return Status::OK();
1286
}
1287
1288
// Registers the FunctionSearch function type with the supplied function factory.
void register_function_search(SimpleFunctionFactory& factory) {
1289
    factory.register_function<FunctionSearch>();
1290
}
1291
1292
} // namespace doris