Coverage Report

Created: 2026-04-09 04:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_search.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/function_search.h"
19
20
#include <CLucene/config/repl_wchar.h>
21
#include <CLucene/search/Scorer.h>
22
#include <gen_cpp/Exprs_types.h>
23
#include <glog/logging.h>
24
25
#include <limits>
26
#include <memory>
27
#include <roaring/roaring.hh>
28
#include <set>
29
#include <string>
30
#include <unordered_map>
31
#include <unordered_set>
32
#include <vector>
33
34
#include "common/status.h"
35
#include "core/block/columns_with_type_and_name.h"
36
#include "core/column/column_const.h"
37
#include "core/data_type/data_type_string.h"
38
#include "exprs/function/simple_function_factory.h"
39
#include "exprs/vexpr_context.h"
40
#include "runtime/runtime_profile.h"
41
#include "storage/index/index_file_reader.h"
42
#include "storage/index/index_query_context.h"
43
#include "storage/index/inverted/analyzer/analyzer.h"
44
#include "storage/index/inverted/inverted_index_compound_reader.h"
45
#include "storage/index/inverted/inverted_index_iterator.h"
46
#include "storage/index/inverted/inverted_index_parser.h"
47
#include "storage/index/inverted/inverted_index_reader.h"
48
#include "storage/index/inverted/inverted_index_searcher.h"
49
#include "storage/index/inverted/query/query_helper.h"
50
#include "storage/index/inverted/query_v2/all_query/all_query.h"
51
#include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h"
52
#include "storage/index/inverted/query_v2/boolean_query/boolean_query_builder.h"
53
#include "storage/index/inverted/query_v2/boolean_query/operator.h"
54
#include "storage/index/inverted/query_v2/collect/doc_set_collector.h"
55
#include "storage/index/inverted/query_v2/collect/top_k_collector.h"
56
#include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h"
57
#include "storage/index/inverted/query_v2/phrase_query/phrase_query.h"
58
#include "storage/index/inverted/query_v2/regexp_query/regexp_query.h"
59
#include "storage/index/inverted/query_v2/term_query/term_query.h"
60
#include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h"
61
#include "storage/index/inverted/util/string_helper.h"
62
#include "storage/segment/segment.h"
63
#include "storage/segment/variant/nested_group_path.h"
64
#include "storage/segment/variant/nested_group_provider.h"
65
#include "storage/segment/variant/variant_column_reader.h"
66
#include "storage/types.h"
67
#include "util/debug_points.h"
68
#include "util/string_util.h"
69
#include "util/thrift_util.h"
70
71
namespace doris {
72
73
// Build canonical DSL signature for cache key.
74
// Serializes the entire TSearchParam via Thrift binary protocol so that
75
// every field (DSL, AST root, field bindings, default_operator,
76
// minimum_should_match, etc.) is included automatically.
77
1.30k
static std::string build_dsl_signature(const TSearchParam& param) {
78
1.30k
    ThriftSerializer ser(false, 1024);
79
1.30k
    TSearchParam copy = param;
80
1.30k
    std::string sig;
81
1.30k
    auto st = ser.serialize(&copy, &sig);
82
1.30k
    if (UNLIKELY(!st.ok())) {
83
0
        LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string()
84
0
                     << ", caching disabled for this query";
85
0
        return "";
86
0
    }
87
1.30k
    return sig;
88
1.30k
}
89
90
// Extract segment path prefix from the first available inverted index iterator.
91
// All fields in the same segment share the same path prefix.
92
static std::string extract_segment_prefix(
93
1.29k
        const std::unordered_map<std::string, IndexIterator*>& iterators) {
94
1.29k
    for (const auto& [field_name, iter] : iterators) {
95
1.29k
        auto* inv_iter = dynamic_cast<InvertedIndexIterator*>(iter);
96
1.29k
        if (!inv_iter) continue;
97
        // Try fulltext reader first, then string type
98
1.26k
        for (auto type :
99
1.58k
             {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) {
100
1.58k
            IndexReaderType reader_type = type;
101
1.58k
            auto reader = inv_iter->get_reader(reader_type);
102
1.58k
            if (!reader) continue;
103
1.26k
            auto inv_reader = std::dynamic_pointer_cast<InvertedIndexReader>(reader);
104
1.26k
            if (!inv_reader) continue;
105
1.26k
            auto file_reader = inv_reader->get_index_file_reader();
106
1.26k
            if (!file_reader) continue;
107
1.26k
            return file_reader->get_index_path_prefix();
108
1.26k
        }
109
1.26k
    }
110
29
    VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across "
111
4
               << iterators.size() << " iterators, caching disabled for this query";
112
29
    return "";
113
1.29k
}
114
115
namespace {
116
117
3
bool is_nested_group_search_supported() {
118
3
    auto provider = segment_v2::create_nested_group_read_provider();
119
3
    return provider != nullptr && provider->should_enable_nested_group_read_path();
120
3
}
121
122
class ResolverNullBitmapAdapter final : public query_v2::NullBitmapResolver {
123
public:
124
1.10k
    explicit ResolverNullBitmapAdapter(const FieldReaderResolver& resolver) : _resolver(resolver) {}
125
126
    segment_v2::IndexIterator* iterator_for(const query_v2::Scorer& /*scorer*/,
127
2.78k
                                            const std::string& logical_field) const override {
128
2.78k
        if (logical_field.empty()) {
129
0
            return nullptr;
130
0
        }
131
2.78k
        return _resolver.get_iterator(logical_field);
132
2.78k
    }
133
134
private:
135
    const FieldReaderResolver& _resolver;
136
};
137
138
void populate_binding_context(const FieldReaderResolver& resolver,
139
1.10k
                              query_v2::QueryExecutionContext* exec_ctx) {
140
1.10k
    DCHECK(exec_ctx != nullptr);
141
1.10k
    exec_ctx->readers = resolver.readers();
142
1.10k
    exec_ctx->reader_bindings = resolver.reader_bindings();
143
1.10k
    exec_ctx->field_reader_bindings = resolver.field_readers();
144
1.63k
    for (const auto& [binding_key, binding] : resolver.binding_cache()) {
145
1.63k
        if (binding_key.empty()) {
146
0
            continue;
147
0
        }
148
1.63k
        query_v2::FieldBindingContext binding_ctx;
149
1.63k
        binding_ctx.logical_field_name = binding.logical_field_name;
150
1.63k
        binding_ctx.stored_field_name = binding.stored_field_name;
151
1.63k
        binding_ctx.stored_field_wstr = binding.stored_field_wstr;
152
1.63k
        exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx));
153
1.63k
    }
154
1.10k
}
155
156
// Assemble the execution context for one segment: its row count, the bindings
// gathered by `resolver`, and the null-bitmap resolver to use during execution.
// `null_resolver` is stored as passed in; it is not dereferenced here.
query_v2::QueryExecutionContext build_query_execution_context(
        uint32_t segment_num_rows, const FieldReaderResolver& resolver,
        query_v2::NullBitmapResolver* null_resolver) {
    query_v2::QueryExecutionContext result;
    result.segment_num_rows = segment_num_rows;
    result.null_resolver = null_resolver;
    populate_binding_context(resolver, &result);
    return result;
}
165
166
} // namespace
167
168
// Resolve `field_name` to the inverted index reader that should serve a query of
// `query_type`, and describe the result in `*binding` (which must not be null).
//
// Outcomes:
//   - OK with a populated binding: a reader was selected and a lucene IndexReader
//     was obtained, either from InvertedIndexSearcherCache or by opening the index
//     and inserting the new searcher into that cache. The binding is remembered
//     in _cache under its binding key, so a later call with the same key returns
//     the stored copy.
//   - OK with a default-constructed binding: `field_name` is a variant subcolumn
//     that has no index metadata or no iterator in this segment ("no match").
//   - An error status: INVERTED_INDEX_FILE_NOT_FOUND, INVERTED_INDEX_CLUCENE_ERROR,
//     or whatever reader selection / index opening reports.
Status FieldReaderResolver::resolve(const std::string& field_name,
                                    InvertedIndexQueryType query_type,
                                    FieldReaderBinding* binding) {
    DCHECK(binding != nullptr);

    const bool variant_subcolumn = is_variant_subcolumn(field_name);

    const auto type_entry = _data_type_with_names.find(field_name);
    if (type_entry == _data_type_with_names.end()) {
        // Only a regular field is required to have index metadata. A variant
        // subcolumn may simply be absent from this segment, which is reported as
        // OK with an empty binding.
        if (!variant_subcolumn) {
            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                    "field '{}' not found in inverted index metadata", field_name);
        }
        VLOG_DEBUG << "Variant subcolumn '" << field_name
                   << "' not found in this segment, treating as no match";
        *binding = FieldReaderBinding();
        return Status::OK();
    }

    const auto& stored_name = type_entry->second.first;
    const auto key = binding_key_for(stored_name, query_type);

    // Already resolved for this stored field / query type: hand back the stored binding.
    if (const auto cached = _cache.find(key); cached != _cache.end()) {
        *binding = cached->second;
        return Status::OK();
    }

    const auto iterator_entry = _iterators.find(field_name);
    const bool iterator_missing =
            iterator_entry == _iterators.end() || iterator_entry->second == nullptr;
    if (iterator_missing) {
        // Same leniency as above: a variant subcolumn without an iterator is "no match".
        if (!variant_subcolumn) {
            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                    "iterator not found for field '{}'", field_name);
        }
        VLOG_DEBUG << "Variant subcolumn '" << field_name
                   << "' iterator not found in this segment, treating as no match";
        *binding = FieldReaderBinding();
        return Status::OK();
    }

    auto* inverted_iterator = dynamic_cast<InvertedIndexIterator*>(iterator_entry->second);
    if (inverted_iterator == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "iterator for field '{}' is not InvertedIndexIterator", field_name);
    }

    // FE may attach index_properties to a field through TSearchFieldBinding. The
    // answer is needed twice below (query type upgrade, binding properties), so
    // it is computed once here.
    const auto fe_binding = _field_binding_map.find(field_name);
    const bool has_fe_properties = fe_binding != _field_binding_map.end() &&
                                   fe_binding->second->__isset.index_properties &&
                                   !fe_binding->second->index_properties.empty();

    // Variant subcolumns: FE resolves the field pattern to one specific index and
    // ships that index's properties. If the chosen index is analyzer-based, an
    // EQUAL_QUERY / WILDCARD_QUERY is upgraded to MATCH_ANY_QUERY so that
    // select_best_reader picks the FULLTEXT reader instead of STRING_TYPE.
    // Without the upgrade:
    //   - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory
    //   - WILDCARD clauses would enumerate terms from the wrong index and return nothing
    //
    // Regular (non-variant) columns with several indexes rely on the caller
    // (build_leaf_query) to pass a fitting query_type: MATCH_ANY_QUERY for
    // tokenized queries (TERM), EQUAL_QUERY for exact-match queries (EXACT). The
    // query_type alone then drives the FULLTEXT vs STRING_TYPE preference, so no
    // analyzer key is needed for them.
    InvertedIndexQueryType effective_type = query_type;
    std::string analyzer_key;
    if (variant_subcolumn && has_fe_properties) {
        const auto& fe_properties = fe_binding->second->index_properties;
        analyzer_key = normalize_analyzer_key(build_analyzer_key_from_properties(fe_properties));
        const bool upgradable = effective_type == InvertedIndexQueryType::EQUAL_QUERY ||
                                effective_type == InvertedIndexQueryType::WILDCARD_QUERY;
        if (inverted_index::InvertedIndexAnalyzer::should_analyzer(fe_properties) && upgradable) {
            effective_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
        }
    }

    // Select the reader. Without a column type, only the analyzer key can guide the choice.
    const auto& column_type = type_entry->second.second;
    Result<InvertedIndexReaderPtr> selection;
    if (column_type) {
        selection = inverted_iterator->select_best_reader(column_type, effective_type,
                                                          analyzer_key);
    } else {
        selection = inverted_iterator->select_best_reader(analyzer_key);
    }
    if (!selection.has_value()) {
        return selection.error();
    }

    auto inverted_reader = selection.value();
    if (inverted_reader == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "selected reader is null for field '{}'", field_name);
    }

    auto index_file_reader = inverted_reader->get_index_file_reader();
    if (index_file_reader == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
                "index file reader is null for field '{}'", field_name);
    }

    // Go through InvertedIndexSearcherCache so index files are not re-opened for
    // every query. The lookup honours the enable_inverted_index_searcher_cache
    // session variable.
    auto index_file_key =
            index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta());
    InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key);
    InvertedIndexCacheHandle searcher_cache_handle;

    // Wraps the reader of the full-text searcher held by `handle` in a shared_ptr
    // with a no-op deleter (the reader's lifetime is managed by the searcher
    // cache). Yields an empty pointer when the handle holds no full-text searcher.
    auto borrow_reader =
            [](InvertedIndexCacheHandle& handle) -> std::shared_ptr<lucene::index::IndexReader> {
        auto searcher_variant = handle.get_index_searcher();
        auto* fulltext = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
        if (fulltext == nullptr || *fulltext == nullptr) {
            return nullptr;
        }
        return std::shared_ptr<lucene::index::IndexReader>(
                (*fulltext)->getReader(),
                [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ });
    };

    const bool searcher_cache_enabled =
            _context->runtime_state != nullptr &&
            _context->runtime_state->query_options().enable_inverted_index_searcher_cache;

    bool cache_hit = false;
    if (searcher_cache_enabled) {
        int64_t lookup_dummy = 0;
        SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_lookup_timer
                                         : &lookup_dummy);
        cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key,
                                                                   &searcher_cache_handle);
    }

    std::shared_ptr<lucene::index::IndexReader> lucene_reader;
    if (cache_hit) {
        if (_context->stats) {
            _context->stats->inverted_index_searcher_cache_hit++;
        }
        lucene_reader = borrow_reader(searcher_cache_handle);
    }

    if (!lucene_reader) {
        if (_context->stats) {
            _context->stats->inverted_index_searcher_cache_miss++;
        }
        // No usable cached searcher: open the directory, build an IndexSearcher
        // and insert it into the cache.
        int64_t open_dummy = 0;
        SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer
                                         : &open_dummy);
        RETURN_IF_ERROR(
                index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx));
        auto directory = DORIS_TRY(
                index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx));

        auto searcher_builder = DORIS_TRY(
                IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type()));
        auto built_searcher = DORIS_TRY(searcher_builder->get_index_searcher(directory.get()));
        auto reader_size = searcher_builder->get_reader_size();

        // Initialization reads are finished. Clear io_ctx on the main stream so
        // the cached searcher does not keep a stale reference. Query-phase reads
        // get the caller's io_ctx through the CLucene API parameters
        // (termDocs/termPositions/terms), the same pattern the MATCH path uses in
        // InvertedIndexReader::create_index_searcher().
        auto* stream = static_cast<DorisCompoundReader*>(directory.get())->getDorisIndexInput();
        DBUG_EXECUTE_IF(
                "FieldReaderResolver.resolve.io_ctx", ({
                    const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext();
                    if (cur_io_ctx->file_cache_stats &&
                        cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) {
                        LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: "
                                   << cur_io_ctx->file_cache_stats << " vs "
                                   << &_context->stats->file_cache_stats;
                    }
                }));
        stream->setIoContext(nullptr);
        stream->setIndexFile(false);

        // The cache value is handed to insert() as a raw pointer.
        auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(built_searcher),
                                                                       reader_size, UnixMillis());
        InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value,
                                                       &searcher_cache_handle);

        lucene_reader = borrow_reader(searcher_cache_handle);
        if (!lucene_reader) {
            return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
                    "failed to build IndexSearcher for field '{}'", field_name);
        }
    }

    // Keep the handle so the cache entry backing lucene_reader stays referenced.
    _searcher_cache_handles.push_back(std::move(searcher_cache_handle));

    FieldReaderBinding resolved;
    resolved.logical_field_name = field_name;
    resolved.stored_field_name = stored_name;
    resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name);
    resolved.column_type = column_type;
    resolved.query_type = effective_type;
    resolved.inverted_reader = inverted_reader;
    resolved.lucene_reader = lucene_reader;
    // FE-provided index_properties win when present (needed for variant subcolumn
    // field_pattern matching); otherwise use the reader's own properties.
    if (has_fe_properties) {
        resolved.index_properties = fe_binding->second->index_properties;
    } else {
        resolved.index_properties = inverted_reader->get_index_properties();
    }
    resolved.binding_key = key;
    resolved.analyzer_key =
            normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties));

    // Publish the reader under every lookup the resolver exposes, then memoize the binding.
    _binding_readers[key] = lucene_reader;
    _field_readers[resolved.stored_field_wstr] = lucene_reader;
    _readers.emplace_back(lucene_reader);
    _cache.emplace(key, resolved);
    *binding = resolved;
    return Status::OK();
}
392
393
// Row-by-row execution entry point. Every argument is ignored and the call
// always fails with a RuntimeError stating that only inverted index queries
// are supported.
Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/,
                                    const ColumnNumbers& /*arguments*/, uint32_t /*result*/,
                                    size_t /*input_rows_count*/) const {
    auto unsupported = Status::RuntimeError("only inverted index queries are supported");
    return unsupported;
}
398
399
// Enhanced implementation: Handle new parameter structure (DSL + SlotReferences)
400
Status FunctionSearch::evaluate_inverted_index(
401
        const ColumnsWithTypeAndName& arguments,
402
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
403
        std::vector<IndexIterator*> iterators, uint32_t num_rows,
404
        const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
405
1
        InvertedIndexResultBitmap& bitmap_result) const {
406
1
    return Status::OK();
407
1
}
408
409
// Convenience overload: evaluates `search_param` without an IndexExecContext
// and with an empty field-name -> column-id mapping by forwarding to the
// overload that accepts both.
Status FunctionSearch::evaluate_inverted_index_with_search_param(
        const TSearchParam& search_param,
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const {
    // One shared empty map; the forwarded-to overload takes it by const reference.
    static const std::unordered_map<std::string, int> no_column_ids;
    const IndexExecContext* const no_exec_ctx = nullptr;

    return evaluate_inverted_index_with_search_param(search_param, data_type_with_names,
                                                     std::move(iterators), num_rows,
                                                     bitmap_result, enable_cache, no_exec_ctx,
                                                     no_column_ids);
}
419
420
Status FunctionSearch::evaluate_inverted_index_with_search_param(
421
        const TSearchParam& search_param,
422
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
423
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
424
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache,
425
        const IndexExecContext* index_exec_ctx,
426
        const std::unordered_map<std::string, int>& field_name_to_column_id,
427
1.31k
        const std::shared_ptr<IndexQueryContext>& index_query_context) const {
428
1.31k
    const bool is_nested_query = search_param.root.clause_type == "NESTED";
429
1.31k
    if (is_nested_query && !is_nested_group_search_supported()) {
430
3
        return Status::NotSupported(
431
3
                "NESTED query requires NestedGroup support, which is unavailable in this build");
432
3
    }
433
434
1.31k
    if (!is_nested_query && (iterators.empty() || data_type_with_names.empty())) {
435
5
        LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:"
436
5
                  << search_param.original_dsl;
437
5
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
438
5
                                                  std::make_shared<roaring::Roaring>());
439
5
        return Status::OK();
440
5
    }
441
442
    // Track overall query time (equivalent to inverted_index_query_timer in MATCH path).
443
    // Must be declared before the DSL cache lookup so that cache-hit fast paths are
444
    // also covered by the timer.
445
1.30k
    int64_t query_timer_dummy = 0;
446
1.30k
    OlapReaderStatistics* outer_stats = index_query_context ? index_query_context->stats : nullptr;
447
1.30k
    SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_query_timer : &query_timer_dummy);
448
449
    // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type
450
1.30k
    auto* dsl_cache = enable_cache ? InvertedIndexQueryCache::instance() : nullptr;
451
1.30k
    std::string seg_prefix;
452
1.30k
    std::string dsl_sig;
453
1.30k
    InvertedIndexQueryCache::CacheKey dsl_cache_key;
454
1.30k
    bool cache_usable = false;
455
1.30k
    if (dsl_cache) {
456
1.29k
        seg_prefix = extract_segment_prefix(iterators);
457
1.29k
        dsl_sig = build_dsl_signature(search_param);
458
1.29k
        if (!seg_prefix.empty() && !dsl_sig.empty()) {
459
1.27k
            dsl_cache_key = InvertedIndexQueryCache::CacheKey {
460
1.27k
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
461
1.27k
                    dsl_sig};
462
1.27k
            cache_usable = true;
463
1.27k
            InvertedIndexQueryCacheHandle dsl_cache_handle;
464
1.27k
            bool dsl_hit = false;
465
1.27k
            {
466
1.27k
                int64_t lookup_dummy = 0;
467
1.27k
                SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_lookup_timer
468
1.27k
                                             : &lookup_dummy);
469
1.27k
                dsl_hit = dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle);
470
1.27k
            }
471
1.27k
            if (dsl_hit) {
472
174
                auto cached_bitmap = dsl_cache_handle.get_bitmap();
473
175
                if (cached_bitmap) {
474
175
                    if (outer_stats) {
475
175
                        outer_stats->inverted_index_query_cache_hit++;
476
175
                    }
477
                    // Also retrieve cached null bitmap for three-valued SQL logic
478
                    // (needed by compound operators NOT, OR, AND in VCompoundPred)
479
175
                    auto null_cache_key = InvertedIndexQueryCache::CacheKey {
480
175
                            seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
481
175
                            dsl_sig + "__null"};
482
175
                    InvertedIndexQueryCacheHandle null_cache_handle;
483
175
                    std::shared_ptr<roaring::Roaring> null_bitmap;
484
175
                    if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) {
485
175
                        null_bitmap = null_cache_handle.get_bitmap();
486
175
                    }
487
175
                    if (!null_bitmap) {
488
0
                        null_bitmap = std::make_shared<roaring::Roaring>();
489
0
                    }
490
175
                    bitmap_result =
491
175
                            InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap));
492
175
                    return Status::OK();
493
175
                }
494
174
            }
495
1.10k
            if (outer_stats) {
496
1.10k
                outer_stats->inverted_index_query_cache_miss++;
497
1.10k
            }
498
1.09k
        }
499
1.29k
    }
500
501
1.13k
    std::shared_ptr<IndexQueryContext> context;
502
1.13k
    if (index_query_context) {
503
1.10k
        context = index_query_context;
504
1.10k
    } else {
505
29
        context = std::make_shared<IndexQueryContext>();
506
29
        context->collection_statistics = std::make_shared<CollectionStatistics>();
507
29
        context->collection_similarity = std::make_shared<CollectionSimilarity>();
508
29
    }
509
510
    // NESTED() queries evaluate predicates on the flattened "element space" of a nested group.
511
    // For VARIANT nested groups, the indexed lucene field (stored_field_name) uses:
512
    //   parent_unique_id + "." + <variant-relative nested path>
513
    // where the nested path is rooted at either:
514
    //   - "__D0_root__" for top-level array<object> (NESTED(data, ...))
515
    //   - "<nested_path_after_variant_root>" for object fields (NESTED(data.items, ...))
516
    //
517
    // FE field bindings are expressed using logical column paths (e.g. "data.items.msg"), so for
518
    // NESTED() we normalize stored_field_name suffix to be consistent with the nested group root.
519
1.13k
    std::unordered_map<std::string, IndexFieldNameAndTypePair> patched_data_type_with_names;
520
1.13k
    const auto* effective_data_type_with_names = &data_type_with_names;
521
1.13k
    if (is_nested_query && search_param.root.__isset.nested_path) {
522
0
        const std::string& nested_path = search_param.root.nested_path;
523
0
        const auto dot_pos = nested_path.find('.');
524
0
        const std::string root_field =
525
0
                (dot_pos == std::string::npos) ? nested_path : nested_path.substr(0, dot_pos);
526
0
        const std::string root_prefix = root_field + ".";
527
0
        const std::string array_path = (dot_pos == std::string::npos)
528
0
                                               ? std::string(segment_v2::kRootNestedGroupPath)
529
0
                                               : nested_path.substr(dot_pos + 1);
530
531
0
        bool copied = false;
532
0
        for (const auto& fb : search_param.field_bindings) {
533
0
            if (!fb.__isset.is_variant_subcolumn || !fb.is_variant_subcolumn) {
534
0
                continue;
535
0
            }
536
0
            if (fb.field_name.empty()) {
537
0
                continue;
538
0
            }
539
0
            const auto it_orig = data_type_with_names.find(fb.field_name);
540
0
            if (it_orig == data_type_with_names.end()) {
541
0
                continue;
542
0
            }
543
0
            const std::string& old_stored = it_orig->second.first;
544
0
            const auto first_dot = old_stored.find('.');
545
0
            if (first_dot == std::string::npos) {
546
0
                continue;
547
0
            }
548
0
            std::string sub_path;
549
0
            if (fb.__isset.subcolumn_path && !fb.subcolumn_path.empty()) {
550
0
                sub_path = fb.subcolumn_path;
551
0
            } else if (fb.field_name.starts_with(nested_path + ".")) {
552
0
                sub_path = fb.field_name.substr(nested_path.size() + 1);
553
0
            } else if (fb.field_name.starts_with(root_prefix)) {
554
0
                sub_path = fb.field_name.substr(root_prefix.size());
555
0
            } else {
556
0
                sub_path = fb.field_name;
557
0
            }
558
0
            if (sub_path.empty()) {
559
0
                continue;
560
0
            }
561
0
            const std::string array_prefix = array_path + ".";
562
0
            const std::string suffix_path =
563
0
                    sub_path.starts_with(array_prefix) ? sub_path : (array_prefix + sub_path);
564
0
            const std::string parent_uid = old_stored.substr(0, first_dot);
565
0
            const std::string expected_stored = parent_uid + "." + suffix_path;
566
0
            if (old_stored == expected_stored) {
567
0
                continue;
568
0
            }
569
570
0
            if (!copied) {
571
0
                patched_data_type_with_names = data_type_with_names;
572
0
                effective_data_type_with_names = &patched_data_type_with_names;
573
0
                copied = true;
574
0
            }
575
0
            auto it = patched_data_type_with_names.find(fb.field_name);
576
0
            if (it == patched_data_type_with_names.end()) {
577
0
                continue;
578
0
            }
579
0
            it->second.first = expected_stored;
580
0
        }
581
0
    }
582
583
    // Pass field_bindings to resolver for variant subcolumn detection
584
1.13k
    FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context,
585
1.13k
                                 search_param.field_bindings);
586
587
1.13k
    if (is_nested_query) {
588
0
        std::shared_ptr<roaring::Roaring> row_bitmap;
589
0
        RETURN_IF_ERROR(evaluate_nested_query(search_param, search_param.root, context, resolver,
590
0
                                              num_rows, index_exec_ctx, field_name_to_column_id,
591
0
                                              row_bitmap));
592
0
        bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap),
593
0
                                                  std::make_shared<roaring::Roaring>());
594
0
        bitmap_result.mask_out_null();
595
0
        return Status::OK();
596
0
    }
597
598
    // Extract default_operator from TSearchParam (default: "or")
599
1.13k
    std::string default_operator = "or";
600
1.13k
    if (search_param.__isset.default_operator && !search_param.default_operator.empty()) {
601
1.10k
        default_operator = search_param.default_operator;
602
1.10k
    }
603
    // Extract minimum_should_match from TSearchParam (-1 means not set)
604
1.13k
    int32_t minimum_should_match = -1;
605
1.13k
    if (search_param.__isset.minimum_should_match) {
606
48
        minimum_should_match = search_param.minimum_should_match;
607
48
    }
608
609
1.13k
    auto* stats = context->stats;
610
1.13k
    int64_t dummy_timer = 0;
611
1.13k
    SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_timer : &dummy_timer);
612
613
1.13k
    query_v2::QueryPtr root_query;
614
1.13k
    std::string root_binding_key;
615
1.13k
    {
616
1.13k
        int64_t init_dummy = 0;
617
1.13k
        SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy);
618
1.13k
        RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query,
619
1.13k
                                              &root_binding_key, default_operator,
620
1.13k
                                              minimum_should_match));
621
1.13k
    }
622
1.10k
    if (root_query == nullptr) {
623
0
        LOG(INFO) << "search: Query tree resolved to empty query, dsl:"
624
0
                  << search_param.original_dsl;
625
0
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
626
0
                                                  std::make_shared<roaring::Roaring>());
627
0
        return Status::OK();
628
0
    }
629
630
1.10k
    ResolverNullBitmapAdapter null_resolver(resolver);
631
1.10k
    query_v2::QueryExecutionContext exec_ctx =
632
1.10k
            build_query_execution_context(num_rows, resolver, &null_resolver);
633
634
1.10k
    bool enable_scoring = false;
635
1.10k
    bool is_asc = false;
636
1.10k
    size_t top_k = 0;
637
1.10k
    if (index_query_context) {
638
1.10k
        enable_scoring = index_query_context->collection_similarity != nullptr;
639
1.10k
        is_asc = index_query_context->is_asc;
640
1.10k
        top_k = index_query_context->query_limit;
641
1.10k
    }
642
643
1.10k
    auto weight = root_query->weight(enable_scoring);
644
1.10k
    if (!weight) {
645
0
        LOG(WARNING) << "search: Failed to build query weight";
646
0
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
647
0
                                                  std::make_shared<roaring::Roaring>());
648
0
        return Status::OK();
649
0
    }
650
651
1.10k
    std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>();
652
1.10k
    {
653
1.10k
        int64_t exec_dummy = 0;
654
1.10k
        SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_exec_timer : &exec_dummy);
655
1.10k
        if (enable_scoring && !is_asc && top_k > 0) {
656
0
            bool use_wand = index_query_context->runtime_state != nullptr &&
657
0
                            index_query_context->runtime_state->query_options()
658
0
                                    .enable_inverted_index_wand_query;
659
0
            query_v2::collect_multi_segment_top_k(
660
0
                    weight, exec_ctx, root_binding_key, top_k, roaring,
661
0
                    index_query_context->collection_similarity, use_wand);
662
1.10k
        } else {
663
1.10k
            query_v2::collect_multi_segment_doc_set(
664
1.10k
                    weight, exec_ctx, root_binding_key, roaring,
665
1.10k
                    index_query_context ? index_query_context->collection_similarity : nullptr,
666
1.10k
                    enable_scoring);
667
1.10k
        }
668
1.10k
    }
669
670
18.4E
    VLOG_DEBUG << "search: Query completed, matched " << roaring->cardinality() << " documents";
671
672
    // Extract NULL bitmap from three-valued logic scorer
673
    // The scorer correctly computes which documents evaluate to NULL based on query logic
674
    // For example: TRUE OR NULL = TRUE (not NULL), FALSE OR NULL = NULL
675
1.10k
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
676
1.11k
    if (exec_ctx.null_resolver) {
677
1.11k
        auto scorer = weight->scorer(exec_ctx, root_binding_key);
678
1.11k
        if (scorer && scorer->has_null_bitmap(exec_ctx.null_resolver)) {
679
224
            const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver);
680
224
            if (bitmap != nullptr) {
681
224
                *null_bitmap = *bitmap;
682
224
                VLOG_TRACE << "search: Extracted NULL bitmap with " << null_bitmap->cardinality()
683
0
                           << " NULL documents";
684
224
            }
685
224
        }
686
1.11k
    }
687
688
18.4E
    VLOG_TRACE << "search: Before mask - true_bitmap=" << roaring->cardinality()
689
18.4E
               << ", null_bitmap=" << null_bitmap->cardinality();
690
691
    // Create result and mask out NULLs (SQL WHERE clause semantics: only TRUE rows)
692
1.10k
    bitmap_result = InvertedIndexResultBitmap(std::move(roaring), std::move(null_bitmap));
693
1.10k
    bitmap_result.mask_out_null();
694
695
1.10k
    VLOG_TRACE << "search: After mask - result_bitmap="
696
1
               << bitmap_result.get_data_bitmap()->cardinality();
697
698
    // Insert post-mask_out_null result into DSL cache for future reuse
699
    // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND)
700
    // can apply correct three-valued SQL logic on cache hit
701
1.10k
    if (dsl_cache && cache_usable) {
702
1.10k
        InvertedIndexQueryCacheHandle insert_handle;
703
1.10k
        dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle);
704
1.10k
        if (bitmap_result.get_null_bitmap()) {
705
1.10k
            auto null_cache_key = InvertedIndexQueryCache::CacheKey {
706
1.10k
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
707
1.10k
                    dsl_sig + "__null"};
708
1.10k
            InvertedIndexQueryCacheHandle null_insert_handle;
709
1.10k
            dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle);
710
1.10k
        }
711
1.10k
    }
712
713
1.10k
    return Status::OK();
714
1.10k
}
715
716
// Evaluates a NESTED clause and writes the matching parent ordinals into `result_bitmap`.
//
// Flow:
//   1. Validate the clause (nested_path and at least one child are required).
//   2. Reset `result_bitmap` to an empty bitmap (allocating it when null).
//   3. Locate the VARIANT column reader for the first path component and collect the
//      nested group chain for the remainder of the path.
//   4. Build the query from the first child clause and run it over the element space of
//      the leaf group (sized by the provider-reported total element count).
//   5. Remove elements flagged in the scorer's NULL bitmap, then map the remaining
//      element hits to parent ordinals through the read provider.
//
// Returns InvalidArgument for a malformed clause, a missing exec context/segment, an unknown
// or non-VARIANT column, or an element count above uint32_t max; NotSupported when the nested
// group read provider is missing or disabled. Returns OK with an empty bitmap when the group
// chain is not found, there are no elements, or no query/weight/scorer can be produced.
Status FunctionSearch::evaluate_nested_query(
        const TSearchParam& search_param, const TSearchClause& nested_clause,
        const std::shared_ptr<IndexQueryContext>& context, FieldReaderResolver& resolver,
        uint32_t num_rows, const IndexExecContext* index_exec_ctx,
        const std::unordered_map<std::string, int>& field_name_to_column_id,
        std::shared_ptr<roaring::Roaring>& result_bitmap) const {
    (void)field_name_to_column_id;

    // Clause validation happens before the output bitmap is touched.
    if (!nested_clause.__isset.nested_path) {
        return Status::InvalidArgument("NESTED clause missing nested_path");
    }
    const bool has_inner_clause =
            nested_clause.__isset.children && !nested_clause.children.empty();
    if (!has_inner_clause) {
        return Status::InvalidArgument("NESTED clause missing inner query");
    }

    // From here on the caller always observes an empty bitmap unless hits are mapped below.
    if (result_bitmap) {
        *result_bitmap = roaring::Roaring();
    } else {
        result_bitmap = std::make_shared<roaring::Roaring>();
    }

    // Split "root.sub.path" into the root column name and the path inside the column.
    const std::string& nested_path = nested_clause.nested_path;
    const auto first_dot = nested_path.find('.');
    const bool has_sub_path = first_dot != std::string::npos;
    const std::string root_field = has_sub_path ? nested_path.substr(0, first_dot) : nested_path;

    if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) {
        return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment");
    }
    auto* segment = index_exec_ctx->segment();

    const int32_t root_ordinal = segment->tablet_schema()->field_index(root_field);
    if (root_ordinal < 0) {
        return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query",
                                       root_field);
    }
    const auto column_id = static_cast<ColumnId>(root_ordinal);

    std::shared_ptr<segment_v2::ColumnReader> column_reader;
    RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id),
                                               &column_reader,
                                               index_exec_ctx->column_iter_opts().stats));
    auto* variant_reader = dynamic_cast<segment_v2::VariantColumnReader*>(column_reader.get());
    if (variant_reader == nullptr) {
        return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field);
    }

    // A path without a dot addresses the root nested group of the column.
    std::string array_path = has_sub_path ? nested_path.substr(first_dot + 1)
                                          : std::string(segment_v2::kRootNestedGroupPath);

    auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path);
    if (!found || group_chain.empty()) {
        return Status::OK();
    }

    // The read provider supplies element counting and element-to-parent mapping.
    auto read_provider = segment_v2::create_nested_group_read_provider();
    if (!read_provider || !read_provider->should_enable_nested_group_read_path()) {
        return Status::NotSupported(
                "NestedGroup search is an enterprise capability, not available in this build");
    }

    auto& leaf_group = group_chain.back();
    uint64_t total_elements = 0;
    RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(),
                                                      leaf_group, &total_elements));
    if (total_elements == 0) {
        return Status::OK();
    }

    // Query-wide options: default operator falls back to "or", minimum_should_match to -1.
    const bool has_default_operator =
            search_param.__isset.default_operator && !search_param.default_operator.empty();
    const std::string default_operator =
            has_default_operator ? search_param.default_operator : std::string("or");
    const int32_t minimum_should_match =
            search_param.__isset.minimum_should_match ? search_param.minimum_should_match : -1;

    // Only the first child clause is used as the inner query.
    query_v2::QueryPtr inner_query;
    std::string inner_binding_key;
    RETURN_IF_ERROR(build_query_recursive(nested_clause.children[0], context, resolver,
                                          &inner_query, &inner_binding_key, default_operator,
                                          minimum_should_match));
    if (inner_query == nullptr) {
        return Status::OK();
    }

    // The execution context takes a 32-bit size, so larger element spaces are rejected.
    if (total_elements > std::numeric_limits<uint32_t>::max()) {
        return Status::InvalidArgument("nested element_count exceeds uint32_t max");
    }

    ResolverNullBitmapAdapter null_resolver(resolver);
    query_v2::QueryExecutionContext exec_ctx = build_query_execution_context(
            static_cast<uint32_t>(total_elements), resolver, &null_resolver);

    auto weight = inner_query->weight(false);
    if (!weight) {
        return Status::OK();
    }
    auto scorer = weight->scorer(exec_ctx, inner_binding_key);
    if (!scorer) {
        return Status::OK();
    }

    // Drain the scorer into an element-level hit bitmap.
    roaring::Roaring element_bitmap;
    for (uint32_t doc = scorer->doc(); doc != query_v2::TERMINATED; doc = scorer->advance()) {
        element_bitmap.add(doc);
    }

    // Elements reported in the scorer's NULL bitmap are not counted as hits.
    if (scorer->has_null_bitmap(exec_ctx.null_resolver)) {
        const auto* null_elements = scorer->get_null_bitmap(exec_ctx.null_resolver);
        if (null_elements != nullptr && !null_elements->isEmpty()) {
            element_bitmap -= *null_elements;
        }
    }

    // Map element-level hits back to parent ordinals through the nested group chain.
    if (result_bitmap == nullptr) {
        result_bitmap = std::make_shared<roaring::Roaring>();
    }
    roaring::Roaring parent_bitmap;
    RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords(
            group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap));
    *result_bitmap = std::move(parent_bitmap);
    return Status::OK();
}
848
849
// Aligned with FE QsClauseType enum - uses enum.name() as clause_type
850
FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category(
851
7.24k
        const std::string& clause_type) const {
852
7.24k
    if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
853
7.24k
        clause_type == "OCCUR_BOOLEAN" || clause_type == "NESTED") {
854
146
        return ClauseTypeCategory::COMPOUND;
855
7.10k
    } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" ||
856
7.10k
               clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" ||
857
7.10k
               clause_type == "EXACT") {
858
        // Non-tokenized queries: exact matching, pattern matching, range, list operations
859
6.68k
        return ClauseTypeCategory::NON_TOKENIZED;
860
6.68k
    } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" ||
861
435
               clause_type == "ALL") {
862
        // Tokenized queries: phrase search, full-text search, multi-value matching
863
        // Note: ANY and ALL require tokenization of their input values
864
435
        return ClauseTypeCategory::TOKENIZED;
865
18.4E
    } else {
866
        // Default to NON_TOKENIZED for unknown types
867
18.4E
        LOG(WARNING) << "Unknown clause type '" << clause_type
868
18.4E
                     << "', defaulting to NON_TOKENIZED category";
869
18.4E
        return ClauseTypeCategory::NON_TOKENIZED;
870
18.4E
    }
871
7.24k
}
872
873
// Analyze query type for a specific field in the search clause
874
InvertedIndexQueryType FunctionSearch::analyze_field_query_type(const std::string& field_name,
875
5.28k
                                                                const TSearchClause& clause) const {
876
5.28k
    const std::string& clause_type = clause.clause_type;
877
5.28k
    ClauseTypeCategory category = get_clause_type_category(clause_type);
878
879
    // Handle leaf queries - use direct mapping
880
5.28k
    if (category != ClauseTypeCategory::COMPOUND) {
881
        // Check if this clause targets the specific field
882
5.14k
        if (clause.field_name == field_name) {
883
            // Use direct mapping from clause_type to InvertedIndexQueryType
884
163
            return clause_type_to_query_type(clause_type);
885
163
        }
886
5.14k
    }
887
888
    // Handle boolean queries - recursively analyze children
889
5.11k
    if (!clause.children.empty()) {
890
5.09k
        for (const auto& child_clause : clause.children) {
891
            // Recursively analyze each child
892
5.09k
            InvertedIndexQueryType child_type = analyze_field_query_type(field_name, child_clause);
893
            // If this child targets the field (not default EQUAL_QUERY), return its query type
894
5.09k
            if (child_type != InvertedIndexQueryType::UNKNOWN_QUERY) {
895
124
                return child_type;
896
124
            }
897
5.09k
        }
898
132
    }
899
900
    // If no children target this field, return UNKNOWN_QUERY as default
901
4.99k
    return InvertedIndexQueryType::UNKNOWN_QUERY;
902
5.11k
}
903
904
// Map clause_type string to InvertedIndexQueryType
905
InvertedIndexQueryType FunctionSearch::clause_type_to_query_type(
906
2.16k
        const std::string& clause_type) const {
907
    // Use static map for better performance and maintainability
908
2.16k
    static const std::unordered_map<std::string, InvertedIndexQueryType> clause_type_map = {
909
            // Boolean operations
910
2.16k
            {"AND", InvertedIndexQueryType::BOOLEAN_QUERY},
911
2.16k
            {"OR", InvertedIndexQueryType::BOOLEAN_QUERY},
912
2.16k
            {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY},
913
2.16k
            {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY},
914
2.16k
            {"NESTED", InvertedIndexQueryType::BOOLEAN_QUERY},
915
916
            // Non-tokenized queries (exact matching, pattern matching)
917
2.16k
            {"TERM", InvertedIndexQueryType::EQUAL_QUERY},
918
2.16k
            {"PREFIX", InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY},
919
2.16k
            {"WILDCARD", InvertedIndexQueryType::WILDCARD_QUERY},
920
2.16k
            {"REGEXP", InvertedIndexQueryType::MATCH_REGEXP_QUERY},
921
2.16k
            {"RANGE", InvertedIndexQueryType::RANGE_QUERY},
922
2.16k
            {"LIST", InvertedIndexQueryType::LIST_QUERY},
923
924
            // Tokenized queries (full-text search, phrase search)
925
2.16k
            {"PHRASE", InvertedIndexQueryType::MATCH_PHRASE_QUERY},
926
2.16k
            {"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY},
927
2.16k
            {"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY},
928
2.16k
            {"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY},
929
930
            // Exact match without tokenization
931
2.16k
            {"EXACT", InvertedIndexQueryType::EQUAL_QUERY},
932
2.16k
    };
933
934
2.16k
    auto it = clause_type_map.find(clause_type);
935
2.16k
    if (it != clause_type_map.end()) {
936
2.15k
        return it->second;
937
2.15k
    }
938
939
    // Unknown clause type
940
2.16k
    LOG(WARNING) << "Unknown clause type '" << clause_type << "', defaulting to EQUAL_QUERY";
941
8
    return InvertedIndexQueryType::EQUAL_QUERY;
942
2.16k
}
943
944
// Map Thrift TSearchOccur to query_v2::Occur
945
904
static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) {
946
904
    switch (thrift_occur) {
947
305
    case TSearchOccur::MUST:
948
305
        return query_v2::Occur::MUST;
949
531
    case TSearchOccur::SHOULD:
950
531
        return query_v2::Occur::SHOULD;
951
68
    case TSearchOccur::MUST_NOT:
952
68
        return query_v2::Occur::MUST_NOT;
953
0
    default:
954
0
        return query_v2::Occur::MUST;
955
904
    }
956
904
}
957
958
// Recursively converts a TSearchClause tree into a query_v2 query.
//
// `*out` is reset to nullptr and `*binding_key` (when provided) is cleared on entry.
// Dispatch by clause_type:
//   MATCH_ALL_DOCS - an AllQuery
//   OCCUR_BOOLEAN  - occur-boolean builder; each child is added with its own occur
//                    (MUST when the child does not set one)
//   NESTED         - rejected with InvalidArgument; it is only valid at the top level
//   AND / OR / NOT - operator-boolean builder over all children
//   anything else  - delegated to build_leaf_query()
// The first error from a child is propagated unchanged.
Status FunctionSearch::build_query_recursive(const TSearchClause& clause,
                                             const std::shared_ptr<IndexQueryContext>& context,
                                             FieldReaderResolver& resolver,
                                             inverted_index::query_v2::QueryPtr* out,
                                             std::string* binding_key,
                                             const std::string& default_operator,
                                             int32_t minimum_should_match) const {
    DCHECK(out != nullptr);
    *out = nullptr;
    if (binding_key != nullptr) {
        binding_key->clear();
    }

    const std::string& type = clause.clause_type;

    // MATCH_ALL_DOCS: matches all documents in the segment
    if (type == "MATCH_ALL_DOCS") {
        *out = std::make_shared<query_v2::AllQuery>();
        return Status::OK();
    }

    // OCCUR_BOOLEAN: Lucene-style boolean query with MUST/SHOULD/MUST_NOT children
    if (type == "OCCUR_BOOLEAN") {
        auto occur_builder =
                segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();

        // The clause-level minimum_should_match is applied only when explicitly set.
        if (clause.__isset.minimum_should_match) {
            occur_builder->set_minimum_number_should_match(clause.minimum_should_match);
        }

        if (clause.__isset.children) {
            for (const auto& child : clause.children) {
                query_v2::QueryPtr sub_query;
                std::string sub_binding_key;
                RETURN_IF_ERROR(build_query_recursive(child, context, resolver, &sub_query,
                                                      &sub_binding_key, default_operator,
                                                      minimum_should_match));

                // A child without an explicit occur is treated as MUST.
                const query_v2::Occur child_occur = child.__isset.occur
                                                            ? map_thrift_occur(child.occur)
                                                            : query_v2::Occur::MUST;

                occur_builder->add(sub_query, child_occur, std::move(sub_binding_key));
            }
        }

        *out = occur_builder->build();
        return Status::OK();
    }

    if (type == "NESTED") {
        return Status::InvalidArgument("NESTED clause must be evaluated at top level");
    }

    // Standard boolean operators (AND/OR/NOT)
    const bool is_or = (type == "OR");
    const bool is_not = (type == "NOT");
    if (type == "AND" || is_or || is_not) {
        const query_v2::OperatorType op = is_or    ? query_v2::OperatorType::OP_OR
                                          : is_not ? query_v2::OperatorType::OP_NOT
                                                   : query_v2::OperatorType::OP_AND;

        auto bool_builder = create_operator_boolean_query_builder(op);
        if (clause.__isset.children) {
            for (const auto& child : clause.children) {
                query_v2::QueryPtr sub_query;
                std::string sub_binding_key;
                RETURN_IF_ERROR(build_query_recursive(child, context, resolver, &sub_query,
                                                      &sub_binding_key, default_operator,
                                                      minimum_should_match));
                // Every child is added, including an empty BitSetQuery; the boolean
                // query handles the logic:
                // - AND with empty bitmap -> result is empty
                // - OR with empty bitmap -> empty bitmap is ignored by OR logic
                // - NOT with empty bitmap -> NOT(empty) = all rows
                bool_builder->add(sub_query, std::move(sub_binding_key));
            }
        }

        *out = bool_builder->build();
        return Status::OK();
    }

    // Everything else is a leaf clause.
    return build_leaf_query(clause, context, resolver, out, binding_key, default_operator,
                            minimum_should_match);
}
1047
1048
Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
1049
                                        const std::shared_ptr<IndexQueryContext>& context,
1050
                                        FieldReaderResolver& resolver,
1051
                                        inverted_index::query_v2::QueryPtr* out,
1052
                                        std::string* binding_key,
1053
                                        const std::string& default_operator,
1054
1.97k
                                        int32_t minimum_should_match) const {
1055
1.97k
    DCHECK(out != nullptr);
1056
1.97k
    *out = nullptr;
1057
1.97k
    if (binding_key) {
1058
1.97k
        binding_key->clear();
1059
1.97k
    }
1060
1061
1.97k
    if (!clause.__isset.field_name || !clause.__isset.value) {
1062
0
        return Status::InvalidArgument("search clause missing field_name or value");
1063
0
    }
1064
1065
1.97k
    const std::string& field_name = clause.field_name;
1066
1.97k
    const std::string& value = clause.value;
1067
1.97k
    const std::string& clause_type = clause.clause_type;
1068
1069
1.97k
    auto query_type = clause_type_to_query_type(clause_type);
1070
    // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual index terms
1071
    // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery).
1072
    // Override to MATCH_ANY_QUERY so select_best_reader() prefers the FULLTEXT reader
1073
    // when multiple indexes exist on the same column (one tokenized, one untokenized).
1074
    // Without this, these queries would select the untokenized index and try to match
1075
    // patterns like "h*llo" against full strings ("hello world") instead of individual
1076
    // tokens ("hello"), returning empty results.
1077
    // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE reader.
1078
    //
1079
    // Safe for single-index columns: select_best_reader() has a single-reader fast path
1080
    // that returns the only reader directly, bypassing the query_type preference logic.
1081
1.97k
    if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type == "PREFIX" ||
1082
1.97k
        clause_type == "REGEXP") {
1083
1.47k
        query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
1084
1.47k
    }
1085
1086
1.97k
    FieldReaderBinding binding;
1087
1.97k
    RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding));
1088
1089
    // Check if binding is empty (variant subcolumn not found in this segment)
1090
1.95k
    if (binding.lucene_reader == nullptr) {
1091
3
        LOG(INFO) << "search: No inverted index for field '" << field_name
1092
3
                  << "' in this segment, clause_type='" << clause_type
1093
3
                  << "', query_type=" << static_cast<int>(query_type) << ", returning no matches";
1094
        // Variant subcolumn doesn't exist - create empty BitSetQuery (no matches)
1095
3
        *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1096
3
        if (binding_key) {
1097
3
            binding_key->clear();
1098
3
        }
1099
3
        return Status::OK();
1100
3
    }
1101
1102
1.94k
    if (binding_key) {
1103
1.94k
        *binding_key = binding.binding_key;
1104
1.94k
    }
1105
1106
1.94k
    FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type);
1107
1.94k
    std::wstring field_wstr = binding.stored_field_wstr;
1108
1.94k
    std::wstring value_wstr = StringHelper::to_wstring(value);
1109
1110
1.96k
    auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr {
1111
1.96k
        return std::make_shared<query_v2::TermQuery>(context, field_wstr, term);
1112
1.96k
    };
1113
1114
1.94k
    if (clause_type == "TERM") {
1115
1.36k
        bool should_analyze =
1116
1.36k
                inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
1117
1.36k
        if (should_analyze) {
1118
1.10k
            if (binding.index_properties.empty()) {
1119
0
                LOG(WARNING) << "search: analyzer required but index properties empty for field '"
1120
0
                             << field_name << "'";
1121
0
                *out = make_term_query(value_wstr);
1122
0
                return Status::OK();
1123
0
            }
1124
1125
1.10k
            std::vector<TermInfo> term_infos =
1126
1.10k
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1127
1.10k
                            value, binding.index_properties);
1128
1.10k
            if (term_infos.empty()) {
1129
0
                LOG(WARNING) << "search: No terms found after tokenization for TERM query, field="
1130
0
                             << field_name << ", value='" << value
1131
0
                             << "', returning empty BitSetQuery";
1132
0
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1133
0
                return Status::OK();
1134
0
            }
1135
1136
1.10k
            if (term_infos.size() == 1) {
1137
1.08k
                std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
1138
1.08k
                *out = make_term_query(term_wstr);
1139
1.08k
                return Status::OK();
1140
1.08k
            }
1141
1142
            // When minimum_should_match is specified, use OccurBooleanQuery
1143
            // ES behavior: msm only applies to SHOULD clauses
1144
19
            if (minimum_should_match > 0) {
1145
0
                auto builder =
1146
0
                        segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
1147
0
                builder->set_minimum_number_should_match(minimum_should_match);
1148
0
                query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST
1149
0
                                                                    : query_v2::Occur::SHOULD;
1150
0
                for (const auto& term_info : term_infos) {
1151
0
                    std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1152
0
                    builder->add(make_term_query(term_wstr), occur);
1153
0
                }
1154
0
                *out = builder->build();
1155
0
                return Status::OK();
1156
0
            }
1157
1158
            // Use default_operator to determine how to combine tokenized terms
1159
19
            query_v2::OperatorType op_type = (default_operator == "and")
1160
19
                                                     ? query_v2::OperatorType::OP_AND
1161
19
                                                     : query_v2::OperatorType::OP_OR;
1162
19
            auto builder = create_operator_boolean_query_builder(op_type);
1163
20
            for (const auto& term_info : term_infos) {
1164
20
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1165
20
                builder->add(make_term_query(term_wstr), binding.binding_key);
1166
20
            }
1167
1168
19
            *out = builder->build();
1169
19
            return Status::OK();
1170
19
        }
1171
1172
264
        *out = make_term_query(value_wstr);
1173
264
        return Status::OK();
1174
1.36k
    }
1175
1176
583
    if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) {
1177
410
        if (clause_type == "PHRASE") {
1178
124
            bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1179
124
                    binding.index_properties);
1180
124
            if (!should_analyze) {
1181
11
                VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name
1182
0
                           << "', falling back to TERM";
1183
11
                *out = make_term_query(value_wstr);
1184
11
                return Status::OK();
1185
11
            }
1186
1187
113
            if (binding.index_properties.empty()) {
1188
0
                LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE "
1189
0
                                "query on field '"
1190
0
                             << field_name << "'";
1191
0
                *out = make_term_query(value_wstr);
1192
0
                return Status::OK();
1193
0
            }
1194
1195
113
            std::vector<TermInfo> term_infos =
1196
113
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1197
113
                            value, binding.index_properties);
1198
113
            if (term_infos.empty()) {
1199
9
                LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field="
1200
9
                             << field_name << ", value='" << value
1201
9
                             << "', returning empty BitSetQuery";
1202
9
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1203
9
                return Status::OK();
1204
9
            }
1205
1206
104
            std::vector<TermInfo> phrase_term_infos =
1207
104
                    QueryHelper::build_phrase_term_infos(term_infos);
1208
104
            if (phrase_term_infos.size() == 1) {
1209
63
                const auto& term_info = phrase_term_infos[0];
1210
63
                if (term_info.is_single_term()) {
1211
63
                    std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1212
63
                    *out = std::make_shared<query_v2::TermQuery>(context, field_wstr, term_wstr);
1213
63
                } else {
1214
0
                    auto builder =
1215
0
                            create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR);
1216
0
                    for (const auto& term : term_info.get_multi_terms()) {
1217
0
                        std::wstring term_wstr = StringHelper::to_wstring(term);
1218
0
                        builder->add(make_term_query(term_wstr), binding.binding_key);
1219
0
                    }
1220
0
                    *out = builder->build();
1221
0
                }
1222
63
            } else {
1223
41
                if (QueryHelper::is_simple_phrase(phrase_term_infos)) {
1224
21
                    *out = std::make_shared<query_v2::PhraseQuery>(context, field_wstr,
1225
21
                                                                   phrase_term_infos);
1226
21
                } else {
1227
20
                    *out = std::make_shared<query_v2::MultiPhraseQuery>(context, field_wstr,
1228
20
                                                                        phrase_term_infos);
1229
20
                }
1230
41
            }
1231
1232
104
            return Status::OK();
1233
113
        }
1234
286
        if (clause_type == "MATCH") {
1235
0
            VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM";
1236
0
            *out = make_term_query(value_wstr);
1237
0
            return Status::OK();
1238
0
        }
1239
1240
287
        if (clause_type == "ANY" || clause_type == "ALL") {
1241
287
            bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1242
287
                    binding.index_properties);
1243
287
            if (!should_analyze) {
1244
1
                *out = make_term_query(value_wstr);
1245
1
                return Status::OK();
1246
1
            }
1247
1248
286
            if (binding.index_properties.empty()) {
1249
0
                LOG(WARNING) << "search: index properties empty for tokenized clause '"
1250
0
                             << clause_type << "' field=" << field_name;
1251
0
                *out = make_term_query(value_wstr);
1252
0
                return Status::OK();
1253
0
            }
1254
1255
286
            std::vector<TermInfo> term_infos =
1256
286
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
1257
286
                            value, binding.index_properties);
1258
286
            if (term_infos.empty()) {
1259
0
                LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type
1260
0
                             << "', field=" << field_name << ", returning empty BitSetQuery";
1261
0
                *out = std::make_shared<query_v2::BitSetQuery>(roaring::Roaring());
1262
0
                return Status::OK();
1263
0
            }
1264
1265
286
            query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR;
1266
286
            if (clause_type == "ALL") {
1267
196
                bool_type = query_v2::OperatorType::OP_AND;
1268
196
            }
1269
1270
286
            if (term_infos.size() == 1) {
1271
110
                std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
1272
110
                *out = make_term_query(term_wstr);
1273
110
                return Status::OK();
1274
110
            }
1275
1276
176
            auto builder = create_operator_boolean_query_builder(bool_type);
1277
398
            for (const auto& term_info : term_infos) {
1278
398
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
1279
398
                builder->add(make_term_query(term_wstr), binding.binding_key);
1280
398
            }
1281
176
            *out = builder->build();
1282
176
            return Status::OK();
1283
286
        }
1284
1285
        // Default tokenized clause fallback
1286
18.4E
        *out = make_term_query(value_wstr);
1287
18.4E
        return Status::OK();
1288
286
    }
1289
1290
173
    if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) {
1291
173
        if (clause_type == "EXACT") {
1292
            // EXACT match: exact string matching without tokenization
1293
            // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase
1294
            // If only tokenized index exists, EXACT may return empty results because
1295
            // tokenized indexes store individual tokens, not complete strings
1296
81
            *out = make_term_query(value_wstr);
1297
18.4E
            VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='"
1298
18.4E
                       << value << "'";
1299
81
            return Status::OK();
1300
81
        }
1301
92
        if (clause_type == "PREFIX") {
1302
            // Apply lowercase only if:
1303
            // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing)
1304
            // 2. lower_case is explicitly set to "true"
1305
36
            bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1306
36
                    binding.index_properties);
1307
36
            std::string lowercase_setting =
1308
36
                    get_parser_lowercase_from_properties(binding.index_properties);
1309
36
            bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
1310
36
            std::string pattern = should_lowercase ? to_lower(value) : value;
1311
36
            *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern);
1312
36
            VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='"
1313
0
                       << pattern << "' (original='" << value << "', has_parser=" << has_parser
1314
0
                       << ", lower_case=" << lowercase_setting << ")";
1315
36
            return Status::OK();
1316
36
        }
1317
1318
56
        if (clause_type == "WILDCARD") {
1319
            // Standalone wildcard "*" matches all non-null values for this field
1320
            // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery
1321
23
            if (value == "*") {
1322
0
                *out = std::make_shared<query_v2::AllQuery>(field_wstr, true);
1323
0
                VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field="
1324
0
                           << field_name;
1325
0
                return Status::OK();
1326
0
            }
1327
            // Apply lowercase only if:
1328
            // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing)
1329
            // 2. lower_case is explicitly set to "true"
1330
23
            bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer(
1331
23
                    binding.index_properties);
1332
23
            std::string lowercase_setting =
1333
23
                    get_parser_lowercase_from_properties(binding.index_properties);
1334
23
            bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
1335
23
            std::string pattern = should_lowercase ? to_lower(value) : value;
1336
23
            *out = std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern);
1337
23
            VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='"
1338
0
                       << pattern << "' (original='" << value << "', has_parser=" << has_parser
1339
0
                       << ", lower_case=" << lowercase_setting << ")";
1340
23
            return Status::OK();
1341
23
        }
1342
1343
33
        if (clause_type == "REGEXP") {
1344
            // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching)
1345
            // This matches ES query_string behavior where regex patterns bypass analysis
1346
29
            *out = std::make_shared<query_v2::RegexpQuery>(context, field_wstr, value);
1347
29
            VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='"
1348
0
                       << value << "'";
1349
29
            return Status::OK();
1350
29
        }
1351
1352
4
        if (clause_type == "RANGE" || clause_type == "LIST") {
1353
3
            VLOG_DEBUG << "search: clause type '" << clause_type
1354
0
                       << "' not implemented, fallback to TERM";
1355
3
        }
1356
4
        *out = make_term_query(value_wstr);
1357
4
        return Status::OK();
1358
33
    }
1359
1360
173
    LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback";
1361
0
    *out = make_term_query(value_wstr);
1362
0
    return Status::OK();
1363
173
}
1364
1365
8
// Adds the FunctionSearch implementation to the supplied function factory.
//
// `factory` is taken by non-const reference and receives a single
// register_function<FunctionSearch>() call; nothing is returned.
void register_function_search(SimpleFunctionFactory& factory) {
    using SearchFunction = FunctionSearch;
    factory.register_function<SearchFunction>();
}
1368
1369
} // namespace doris