Coverage Report

Created: 2026-05-27 05:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_search.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/function_search.h"
19
20
#include <CLucene/config/repl_wchar.h>
21
#include <CLucene/search/Scorer.h>
22
#include <fmt/format.h>
23
#include <gen_cpp/Exprs_types.h>
24
#include <glog/logging.h>
25
26
#include <limits>
27
#include <memory>
28
#include <roaring/roaring.hh>
29
#include <set>
30
#include <string>
31
#include <unordered_map>
32
#include <unordered_set>
33
#include <vector>
34
35
#include "common/status.h"
36
#include "core/block/columns_with_type_and_name.h"
37
#include "core/column/column_const.h"
38
#include "core/data_type/data_type_array.h"
39
#include "core/data_type/data_type_nullable.h"
40
#include "core/data_type/data_type_string.h"
41
#include "exprs/function/simple_function_factory.h"
42
#include "exprs/function/variant_inverted_index_search.h"
43
#include "exprs/vexpr_context.h"
44
#include "runtime/runtime_profile.h"
45
#include "storage/index/index_file_reader.h"
46
#include "storage/index/index_query_context.h"
47
#include "storage/index/inverted/analyzer/analyzer.h"
48
#include "storage/index/inverted/inverted_index_compound_reader.h"
49
#include "storage/index/inverted/inverted_index_iterator.h"
50
#include "storage/index/inverted/inverted_index_parser.h"
51
#include "storage/index/inverted/inverted_index_reader.h"
52
#include "storage/index/inverted/inverted_index_searcher.h"
53
#include "storage/index/inverted/query/query_helper.h"
54
#include "storage/index/inverted/query_v2/all_query/all_query.h"
55
#include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h"
56
#include "storage/index/inverted/query_v2/boolean_query/boolean_query_builder.h"
57
#include "storage/index/inverted/query_v2/boolean_query/operator.h"
58
#include "storage/index/inverted/query_v2/collect/doc_set_collector.h"
59
#include "storage/index/inverted/query_v2/collect/top_k_collector.h"
60
#include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h"
61
#include "storage/index/inverted/query_v2/phrase_query/phrase_query.h"
62
#include "storage/index/inverted/query_v2/regexp_query/regexp_query.h"
63
#include "storage/index/inverted/query_v2/term_query/term_query.h"
64
#include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h"
65
#include "storage/index/inverted/util/string_helper.h"
66
#include "storage/olap_common.h"
67
#include "storage/segment/variant/nested_group_provider.h"
68
#include "storage/types.h"
69
#include "util/debug_points.h"
70
#include "util/string_parser.hpp"
71
#include "util/string_util.h"
72
#include "util/thrift_util.h"
73
74
namespace doris {
75
76
// Build canonical DSL signature for cache key.
77
// Serializes the entire TSearchParam via Thrift binary protocol so that
78
// every field (DSL, AST root, field bindings, default_operator,
79
// minimum_should_match, etc.) is included automatically.
80
1.30k
static std::string build_dsl_signature(const TSearchParam& param) {
81
1.30k
    ThriftSerializer ser(false, 1024);
82
1.30k
    TSearchParam copy = param;
83
1.30k
    std::string sig;
84
1.30k
    auto st = ser.serialize(&copy, &sig);
85
1.30k
    if (UNLIKELY(!st.ok())) {
86
0
        LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string()
87
0
                     << ", caching disabled for this query";
88
0
        return "";
89
0
    }
90
1.30k
    return sig;
91
1.30k
}
92
93
// Extract segment path prefix from the first available inverted index iterator.
94
// All fields in the same segment share the same path prefix.
95
static std::string extract_segment_prefix(
96
1.29k
        const std::unordered_map<std::string, IndexIterator*>& iterators) {
97
1.29k
    for (const auto& [field_name, iter] : iterators) {
98
1.29k
        auto* inv_iter = dynamic_cast<InvertedIndexIterator*>(iter);
99
1.29k
        if (!inv_iter) continue;
100
        // Try fulltext reader first, then string type
101
1.26k
        for (auto type :
102
1.58k
             {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) {
103
1.58k
            IndexReaderType reader_type = type;
104
1.58k
            auto reader = inv_iter->get_reader(reader_type);
105
1.58k
            if (!reader) continue;
106
1.27k
            auto inv_reader = std::dynamic_pointer_cast<InvertedIndexReader>(reader);
107
1.27k
            if (!inv_reader) continue;
108
1.27k
            auto file_reader = inv_reader->get_index_file_reader();
109
1.27k
            if (!file_reader) continue;
110
1.27k
            return file_reader->get_index_path_prefix();
111
1.27k
        }
112
1.26k
    }
113
18.4E
    VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across "
114
18.4E
               << iterators.size() << " iterators, caching disabled for this query";
115
23
    return "";
116
1.29k
}
117
118
namespace {
119
120
3
bool is_nested_group_search_supported() {
121
3
    auto provider = segment_v2::create_nested_group_read_provider();
122
3
    return provider != nullptr && provider->should_enable_nested_group_read_path();
123
3
}
124
125
5
query_v2::QueryPtr make_unknown_query(uint32_t num_rows) {
126
5
    auto null_bitmap = std::make_shared<roaring::Roaring>();
127
5
    if (num_rows > 0) {
128
5
        null_bitmap->addRange(0, num_rows);
129
5
    }
130
5
    return std::make_shared<query_v2::BitSetQuery>(std::make_shared<roaring::Roaring>(),
131
5
                                                   std::move(null_bitmap));
132
5
}
133
134
2
DataTypePtr unwrap_direct_index_value_type(DataTypePtr column_type) {
135
2
    DataTypePtr value_type = remove_nullable(std::move(column_type));
136
5
    while (value_type != nullptr &&
137
5
           value_type->get_storage_field_type() == FieldType::OLAP_FIELD_TYPE_ARRAY) {
138
3
        const auto* array_type = dynamic_cast<const DataTypeArray*>(value_type.get());
139
3
        if (array_type == nullptr) {
140
0
            return value_type;
141
0
        }
142
3
        value_type = remove_nullable(array_type->get_nested_type());
143
3
    }
144
2
    return value_type;
145
2
}
146
147
template <PrimitiveType primitive_type, typename CppType>
148
1
Status parse_integral_search_value(const std::string& value, Field* field) {
149
1
    StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE;
150
1
    CppType parsed =
151
1
            StringParser::string_to_int<CppType>(value.data(), value.size(), &parse_result);
152
1
    if (parse_result != StringParser::PARSE_SUCCESS) {
153
0
        return Status::InvalidArgument("failed to parse '{}' as {}", value,
154
0
                                       type_to_string(primitive_type));
155
0
    }
156
1
    *field = Field::create_field<primitive_type>(parsed);
157
1
    return Status::OK();
158
1
}
Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE3EaEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE
Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE4EsEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE
function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE5EiEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE
Line
Count
Source
148
1
Status parse_integral_search_value(const std::string& value, Field* field) {
149
1
    StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE;
150
1
    CppType parsed =
151
1
            StringParser::string_to_int<CppType>(value.data(), value.size(), &parse_result);
152
1
    if (parse_result != StringParser::PARSE_SUCCESS) {
153
0
        return Status::InvalidArgument("failed to parse '{}' as {}", value,
154
0
                                       type_to_string(primitive_type));
155
0
    }
156
1
    *field = Field::create_field<primitive_type>(parsed);
157
1
    return Status::OK();
158
1
}
Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE6ElEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE
Unexecuted instantiation: function_search.cpp:_ZN5doris12_GLOBAL__N_127parse_integral_search_valueILNS_13PrimitiveTypeE7EnEENS_6StatusERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_5FieldE
159
160
Status parse_scalar_search_value(const DataTypePtr& column_type, const std::string& value,
161
2
                                 Field* field) {
162
2
    if (column_type == nullptr || field == nullptr) {
163
0
        return Status::InvalidArgument("missing column type for scalar search value");
164
0
    }
165
166
2
    switch (column_type->get_storage_field_type()) {
167
1
    case FieldType::OLAP_FIELD_TYPE_BOOL: {
168
1
        StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE;
169
1
        bool parsed = StringParser::string_to_bool(value.data(), value.size(), &parse_result);
170
1
        if (parse_result != StringParser::PARSE_SUCCESS) {
171
0
            return Status::InvalidArgument("failed to parse '{}' as bool", value);
172
0
        }
173
1
        *field = Field::create_field<TYPE_BOOLEAN>(parsed);
174
1
        return Status::OK();
175
1
    }
176
0
    case FieldType::OLAP_FIELD_TYPE_TINYINT:
177
0
        return parse_integral_search_value<TYPE_TINYINT, Int8>(value, field);
178
0
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
179
0
        return parse_integral_search_value<TYPE_SMALLINT, Int16>(value, field);
180
1
    case FieldType::OLAP_FIELD_TYPE_INT:
181
1
        return parse_integral_search_value<TYPE_INT, Int32>(value, field);
182
0
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
183
0
        return parse_integral_search_value<TYPE_BIGINT, Int64>(value, field);
184
0
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
185
0
        return parse_integral_search_value<TYPE_LARGEINT, Int128>(value, field);
186
0
    case FieldType::OLAP_FIELD_TYPE_FLOAT: {
187
0
        StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE;
188
0
        Float32 parsed =
189
0
                StringParser::string_to_float<Float32>(value.data(), value.size(), &parse_result);
190
0
        if (parse_result != StringParser::PARSE_SUCCESS) {
191
0
            return Status::InvalidArgument("failed to parse '{}' as float", value);
192
0
        }
193
0
        *field = Field::create_field<TYPE_FLOAT>(parsed);
194
0
        return Status::OK();
195
0
    }
196
0
    case FieldType::OLAP_FIELD_TYPE_DOUBLE: {
197
0
        StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE;
198
0
        Float64 parsed =
199
0
                StringParser::string_to_float<Float64>(value.data(), value.size(), &parse_result);
200
0
        if (parse_result != StringParser::PARSE_SUCCESS) {
201
0
            return Status::InvalidArgument("failed to parse '{}' as double", value);
202
0
        }
203
0
        *field = Field::create_field<TYPE_DOUBLE>(parsed);
204
0
        return Status::OK();
205
0
    }
206
0
    default:
207
0
        return Status::NotSupported("scalar search does not support storage field type {}",
208
0
                                    static_cast<int>(column_type->get_storage_field_type()));
209
2
    }
210
2
}
211
212
3
InvertedIndexQueryType direct_index_query_type_for_clause(const std::string& clause_type) {
213
3
    if (clause_type == "TERM" || clause_type == "EXACT") {
214
2
        return InvertedIndexQueryType::EQUAL_QUERY;
215
2
    }
216
1
    return InvertedIndexQueryType::UNKNOWN_QUERY;
217
3
}
218
219
} // namespace
220
221
Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/,
222
                                    const ColumnNumbers& /*arguments*/, uint32_t /*result*/,
223
4
                                    size_t /*input_rows_count*/) const {
224
4
    return Status::RuntimeError("only inverted index queries are supported");
225
4
}
226
227
// Positional-argument overload: currently a no-op that returns OK without touching bitmap_result; the DSL-based evaluation is in evaluate_inverted_index_with_search_param.
228
Status FunctionSearch::evaluate_inverted_index(
229
        const ColumnsWithTypeAndName& arguments,
230
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
231
        std::vector<IndexIterator*> iterators, uint32_t num_rows,
232
        const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
233
1
        InvertedIndexResultBitmap& bitmap_result) const {
234
1
    return Status::OK();
235
1
}
236
237
Status FunctionSearch::evaluate_inverted_index_with_search_param(
238
        const TSearchParam& search_param,
239
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
240
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
241
31
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const {
242
31
    static const std::unordered_map<std::string, int> empty_field_to_column_id;
243
31
    return evaluate_inverted_index_with_search_param(
244
31
            search_param, data_type_with_names, std::move(iterators), num_rows, bitmap_result,
245
31
            enable_cache, nullptr, empty_field_to_column_id);
246
31
}
247
248
Status FunctionSearch::evaluate_inverted_index_with_search_param(
249
        const TSearchParam& search_param,
250
        const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
251
        std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
252
        InvertedIndexResultBitmap& bitmap_result, bool enable_cache,
253
        const IndexExecContext* index_exec_ctx,
254
        const std::unordered_map<std::string, int>& field_name_to_column_id,
255
1.31k
        const std::shared_ptr<IndexQueryContext>& index_query_context) const {
256
1.31k
    const bool is_nested_query = search_param.root.clause_type == "NESTED";
257
1.31k
    if (is_nested_query && !is_nested_group_search_supported()) {
258
3
        return Status::NotSupported(
259
3
                "NESTED query requires NestedGroup support, which is unavailable in this build");
260
3
    }
261
262
1.31k
    if (!is_nested_query && (iterators.empty() || data_type_with_names.empty())) {
263
5
        LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:"
264
5
                  << search_param.original_dsl;
265
5
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
266
5
                                                  std::make_shared<roaring::Roaring>());
267
5
        return Status::OK();
268
5
    }
269
270
    // Track overall query time (equivalent to inverted_index_query_timer in MATCH path).
271
    // Must be declared before the DSL cache lookup so that cache-hit fast paths are
272
    // also covered by the timer.
273
1.30k
    int64_t query_timer_dummy = 0;
274
1.30k
    OlapReaderStatistics* outer_stats = index_query_context ? index_query_context->stats : nullptr;
275
1.30k
    SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_query_timer : &query_timer_dummy);
276
277
    // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type
278
1.30k
    auto* dsl_cache = enable_cache ? InvertedIndexQueryCache::instance() : nullptr;
279
1.30k
    std::string seg_prefix;
280
1.30k
    std::string dsl_sig;
281
1.30k
    InvertedIndexQueryCache::CacheKey dsl_cache_key;
282
1.30k
    bool cache_usable = false;
283
1.30k
    if (dsl_cache) {
284
1.29k
        seg_prefix = extract_segment_prefix(iterators);
285
1.29k
        dsl_sig = build_dsl_signature(search_param);
286
1.29k
        if (!seg_prefix.empty() && !dsl_sig.empty()) {
287
1.26k
            dsl_cache_key = InvertedIndexQueryCache::CacheKey {
288
1.26k
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
289
1.26k
                    dsl_sig};
290
1.26k
            cache_usable = true;
291
1.26k
            InvertedIndexQueryCacheHandle dsl_cache_handle;
292
1.26k
            bool dsl_hit = false;
293
1.26k
            {
294
1.26k
                int64_t lookup_dummy = 0;
295
1.26k
                SCOPED_RAW_TIMER(outer_stats ? &outer_stats->inverted_index_lookup_timer
296
1.26k
                                             : &lookup_dummy);
297
1.26k
                dsl_hit = dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle);
298
1.26k
            }
299
1.26k
            if (dsl_hit) {
300
175
                auto cached_bitmap = dsl_cache_handle.get_bitmap();
301
175
                if (cached_bitmap) {
302
174
                    if (outer_stats) {
303
174
                        outer_stats->inverted_index_query_cache_hit++;
304
174
                    }
305
                    // Also retrieve cached null bitmap for three-valued SQL logic
306
                    // (needed by compound operators NOT, OR, AND in VCompoundPred)
307
174
                    auto null_cache_key = InvertedIndexQueryCache::CacheKey {
308
174
                            seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
309
174
                            dsl_sig + "__null"};
310
174
                    InvertedIndexQueryCacheHandle null_cache_handle;
311
174
                    std::shared_ptr<roaring::Roaring> null_bitmap;
312
175
                    if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) {
313
175
                        null_bitmap = null_cache_handle.get_bitmap();
314
175
                    }
315
174
                    if (!null_bitmap) {
316
0
                        null_bitmap = std::make_shared<roaring::Roaring>();
317
0
                    }
318
174
                    bitmap_result =
319
174
                            InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap));
320
174
                    return Status::OK();
321
174
                }
322
175
            }
323
1.10k
            if (outer_stats) {
324
1.10k
                outer_stats->inverted_index_query_cache_miss++;
325
1.10k
            }
326
1.09k
        }
327
1.29k
    }
328
329
1.13k
    std::shared_ptr<IndexQueryContext> context;
330
1.13k
    if (index_query_context) {
331
1.10k
        context = index_query_context;
332
1.10k
    } else {
333
31
        context = std::make_shared<IndexQueryContext>();
334
31
        context->collection_statistics = std::make_shared<CollectionStatistics>();
335
31
        context->collection_similarity = std::make_shared<CollectionSimilarity>();
336
31
    }
337
338
1.13k
    const auto* effective_data_type_with_names = &data_type_with_names;
339
340
    // Pass field_bindings to resolver for variant subcolumn detection
341
1.13k
    FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context,
342
1.13k
                                 search_param.field_bindings);
343
344
1.13k
    if (is_nested_query) {
345
0
        std::shared_ptr<roaring::Roaring> row_bitmap;
346
0
        VariantNestedSearchEvaluator nested_evaluator(*this);
347
0
        RETURN_IF_ERROR(nested_evaluator.evaluate(search_param, search_param.root, context,
348
0
                                                  resolver, num_rows, index_exec_ctx,
349
0
                                                  field_name_to_column_id, row_bitmap));
350
0
        bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap),
351
0
                                                  std::make_shared<roaring::Roaring>());
352
0
        bitmap_result.mask_out_null();
353
0
        return Status::OK();
354
0
    }
355
356
    // Extract default_operator from TSearchParam (default: "or")
357
1.13k
    std::string default_operator = "or";
358
1.13k
    if (search_param.__isset.default_operator && !search_param.default_operator.empty()) {
359
1.10k
        default_operator = search_param.default_operator;
360
1.10k
    }
361
    // Extract minimum_should_match from TSearchParam (-1 means not set)
362
1.13k
    int32_t minimum_should_match = -1;
363
1.13k
    if (search_param.__isset.minimum_should_match) {
364
48
        minimum_should_match = search_param.minimum_should_match;
365
48
    }
366
367
1.13k
    auto* stats = context->stats;
368
1.13k
    int64_t dummy_timer = 0;
369
1.13k
    SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_timer : &dummy_timer);
370
371
1.13k
    query_v2::QueryPtr root_query;
372
1.13k
    std::string root_binding_key;
373
1.13k
    {
374
1.13k
        int64_t init_dummy = 0;
375
1.13k
        SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy);
376
1.13k
        RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query,
377
1.13k
                                              &root_binding_key, default_operator,
378
1.13k
                                              minimum_should_match, num_rows));
379
1.13k
    }
380
1.11k
    if (root_query == nullptr) {
381
0
        LOG(INFO) << "search: Query tree resolved to empty query, dsl:"
382
0
                  << search_param.original_dsl;
383
0
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
384
0
                                                  std::make_shared<roaring::Roaring>());
385
0
        return Status::OK();
386
0
    }
387
388
1.11k
    VariantSearchNullBitmapAdapter null_resolver(resolver);
389
1.11k
    query_v2::QueryExecutionContext exec_ctx =
390
1.11k
            build_variant_search_query_execution_context(num_rows, resolver, &null_resolver);
391
392
1.11k
    bool enable_scoring = false;
393
1.11k
    bool is_asc = false;
394
1.11k
    size_t top_k = 0;
395
1.11k
    if (index_query_context) {
396
1.10k
        enable_scoring = index_query_context->collection_similarity != nullptr;
397
1.10k
        is_asc = index_query_context->is_asc;
398
1.10k
        top_k = index_query_context->query_limit;
399
1.10k
    }
400
401
1.11k
    auto weight = root_query->weight(enable_scoring);
402
1.11k
    if (!weight) {
403
0
        LOG(WARNING) << "search: Failed to build query weight";
404
0
        bitmap_result = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
405
0
                                                  std::make_shared<roaring::Roaring>());
406
0
        return Status::OK();
407
0
    }
408
409
1.11k
    std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>();
410
1.11k
    {
411
1.11k
        int64_t exec_dummy = 0;
412
1.11k
        SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_exec_timer : &exec_dummy);
413
1.11k
        if (enable_scoring && !is_asc && top_k > 0) {
414
0
            bool use_wand = index_query_context->runtime_state != nullptr &&
415
0
                            index_query_context->runtime_state->query_options()
416
0
                                    .enable_inverted_index_wand_query;
417
0
            query_v2::collect_multi_segment_top_k(
418
0
                    weight, exec_ctx, root_binding_key, top_k, roaring,
419
0
                    index_query_context->collection_similarity, use_wand);
420
1.11k
        } else {
421
1.11k
            query_v2::collect_multi_segment_doc_set(
422
1.11k
                    weight, exec_ctx, root_binding_key, roaring,
423
1.11k
                    index_query_context ? index_query_context->collection_similarity : nullptr,
424
1.11k
                    enable_scoring);
425
1.11k
        }
426
1.11k
    }
427
428
1.11k
    VLOG_DEBUG << "search: Query completed, matched " << roaring->cardinality() << " documents";
429
430
    // Extract NULL bitmap from three-valued logic scorer
431
    // The scorer correctly computes which documents evaluate to NULL based on query logic
432
    // For example: TRUE OR NULL = TRUE (not NULL), FALSE OR NULL = NULL
433
1.11k
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
434
1.11k
    if (exec_ctx.null_resolver) {
435
1.11k
        auto scorer = weight->scorer(exec_ctx, root_binding_key);
436
1.11k
        if (scorer && scorer->has_null_bitmap(exec_ctx.null_resolver)) {
437
222
            const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver);
438
222
            if (bitmap != nullptr) {
439
222
                *null_bitmap = *bitmap;
440
18.4E
                VLOG_TRACE << "search: Extracted NULL bitmap with " << null_bitmap->cardinality()
441
18.4E
                           << " NULL documents";
442
222
            }
443
222
        }
444
1.11k
    }
445
446
1.11k
    VLOG_TRACE << "search: Before mask - true_bitmap=" << roaring->cardinality()
447
1
               << ", null_bitmap=" << null_bitmap->cardinality();
448
449
    // Create result and mask out NULLs (SQL WHERE clause semantics: only TRUE rows)
450
1.11k
    bitmap_result = InvertedIndexResultBitmap(std::move(roaring), std::move(null_bitmap));
451
1.11k
    bitmap_result.mask_out_null();
452
453
1.11k
    VLOG_TRACE << "search: After mask - result_bitmap="
454
3
               << bitmap_result.get_data_bitmap()->cardinality();
455
456
    // Insert post-mask_out_null result into DSL cache for future reuse
457
    // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND)
458
    // can apply correct three-valued SQL logic on cache hit
459
1.11k
    if (dsl_cache && cache_usable) {
460
1.10k
        InvertedIndexQueryCacheHandle insert_handle;
461
1.10k
        dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle);
462
1.10k
        if (bitmap_result.get_null_bitmap()) {
463
1.10k
            auto null_cache_key = InvertedIndexQueryCache::CacheKey {
464
1.10k
                    seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY,
465
1.10k
                    dsl_sig + "__null"};
466
1.10k
            InvertedIndexQueryCacheHandle null_insert_handle;
467
1.10k
            dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle);
468
1.10k
        }
469
1.10k
    }
470
471
1.11k
    return Status::OK();
472
1.11k
}
473
474
// Aligned with FE QsClauseType enum - uses enum.name() as clause_type
475
FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category(
476
7.26k
        const std::string& clause_type) const {
477
7.26k
    if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
478
7.26k
        clause_type == "OCCUR_BOOLEAN" || clause_type == "NESTED") {
479
146
        return ClauseTypeCategory::COMPOUND;
480
7.11k
    } else if (clause_type == "TERM" || clause_type == "PREFIX" || clause_type == "WILDCARD" ||
481
7.11k
               clause_type == "REGEXP" || clause_type == "RANGE" || clause_type == "LIST" ||
482
7.11k
               clause_type == "EXACT") {
483
        // Non-tokenized queries: exact matching, pattern matching, range, list operations
484
6.68k
        return ClauseTypeCategory::NON_TOKENIZED;
485
6.68k
    } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" ||
486
437
               clause_type == "ALL") {
487
        // Tokenized queries: phrase search, full-text search, multi-value matching
488
        // Note: ANY and ALL require tokenization of their input values
489
432
        return ClauseTypeCategory::TOKENIZED;
490
432
    } else {
491
        // Default to NON_TOKENIZED for unknown types
492
5
        LOG(WARNING) << "Unknown clause type '" << clause_type
493
5
                     << "', defaulting to NON_TOKENIZED category";
494
5
        return ClauseTypeCategory::NON_TOKENIZED;
495
5
    }
496
7.26k
}
497
498
// Analyze query type for a specific field in the search clause
499
InvertedIndexQueryType FunctionSearch::analyze_field_query_type(const std::string& field_name,
500
5.28k
                                                                const TSearchClause& clause) const {
501
5.28k
    const std::string& clause_type = clause.clause_type;
502
5.28k
    ClauseTypeCategory category = get_clause_type_category(clause_type);
503
504
    // Handle leaf queries - use direct mapping
505
5.28k
    if (category != ClauseTypeCategory::COMPOUND) {
506
        // Check if this clause targets the specific field
507
5.14k
        if (clause.field_name == field_name) {
508
            // Use direct mapping from clause_type to InvertedIndexQueryType
509
163
            return clause_type_to_query_type(clause_type);
510
163
        }
511
5.14k
    }
512
513
    // Handle boolean queries - recursively analyze children
514
5.11k
    if (!clause.children.empty()) {
515
5.09k
        for (const auto& child_clause : clause.children) {
516
            // Recursively analyze each child
517
5.09k
            InvertedIndexQueryType child_type = analyze_field_query_type(field_name, child_clause);
518
            // If this child targets the field (i.e. it did not return UNKNOWN_QUERY), return its query type
519
5.09k
            if (child_type != InvertedIndexQueryType::UNKNOWN_QUERY) {
520
124
                return child_type;
521
124
            }
522
5.09k
        }
523
132
    }
524
525
    // If no children target this field, return UNKNOWN_QUERY as default
526
4.99k
    return InvertedIndexQueryType::UNKNOWN_QUERY;
527
5.11k
}
528
529
// Map clause_type string to InvertedIndexQueryType
530
InvertedIndexQueryType FunctionSearch::clause_type_to_query_type(
531
2.17k
        const std::string& clause_type) const {
532
    // Use static map for better performance and maintainability
533
2.17k
    static const std::unordered_map<std::string, InvertedIndexQueryType> clause_type_map = {
534
            // Boolean operations
535
2.17k
            {"AND", InvertedIndexQueryType::BOOLEAN_QUERY},
536
2.17k
            {"OR", InvertedIndexQueryType::BOOLEAN_QUERY},
537
2.17k
            {"NOT", InvertedIndexQueryType::BOOLEAN_QUERY},
538
2.17k
            {"OCCUR_BOOLEAN", InvertedIndexQueryType::BOOLEAN_QUERY},
539
2.17k
            {"NESTED", InvertedIndexQueryType::BOOLEAN_QUERY},
540
541
            // Non-tokenized queries (exact matching, pattern matching)
542
2.17k
            {"TERM", InvertedIndexQueryType::EQUAL_QUERY},
543
2.17k
            {"PREFIX", InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY},
544
2.17k
            {"WILDCARD", InvertedIndexQueryType::WILDCARD_QUERY},
545
2.17k
            {"REGEXP", InvertedIndexQueryType::MATCH_REGEXP_QUERY},
546
2.17k
            {"RANGE", InvertedIndexQueryType::RANGE_QUERY},
547
2.17k
            {"LIST", InvertedIndexQueryType::LIST_QUERY},
548
549
            // Tokenized queries (full-text search, phrase search)
550
2.17k
            {"PHRASE", InvertedIndexQueryType::MATCH_PHRASE_QUERY},
551
2.17k
            {"MATCH", InvertedIndexQueryType::MATCH_ANY_QUERY},
552
2.17k
            {"ANY", InvertedIndexQueryType::MATCH_ANY_QUERY},
553
2.17k
            {"ALL", InvertedIndexQueryType::MATCH_ALL_QUERY},
554
555
            // Exact match without tokenization
556
2.17k
            {"EXACT", InvertedIndexQueryType::EQUAL_QUERY},
557
2.17k
    };
558
559
2.17k
    auto it = clause_type_map.find(clause_type);
560
2.17k
    if (it != clause_type_map.end()) {
561
2.16k
        return it->second;
562
2.16k
    }
563
564
    // Unknown clause type
565
2.17k
    LOG(WARNING) << "Unknown clause type '" << clause_type << "', defaulting to EQUAL_QUERY";
566
7
    return InvertedIndexQueryType::EQUAL_QUERY;
567
2.17k
}
568
569
// Map Thrift TSearchOccur to query_v2::Occur
570
903
static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) {
571
903
    switch (thrift_occur) {
572
303
    case TSearchOccur::MUST:
573
303
        return query_v2::Occur::MUST;
574
534
    case TSearchOccur::SHOULD:
575
534
        return query_v2::Occur::SHOULD;
576
67
    case TSearchOccur::MUST_NOT:
577
67
        return query_v2::Occur::MUST_NOT;
578
0
    default:
579
0
        return query_v2::Occur::MUST;
580
903
    }
581
903
}
582
583
// Recursively converts a TSearchClause tree into a query_v2 query stored in *out.
//
// Dispatch on clause.clause_type:
//   - "MATCH_ALL_DOCS": an AllQuery.
//   - "OCCUR_BOOLEAN":  an occur-boolean query; each child contributes its own occur
//                       (MUST when the child does not set one) and binding key.
//   - "NESTED":         rejected with InvalidArgument.
//   - "AND"/"OR"/"NOT": an operator-boolean query over all children.
//   - anything else:    delegated to build_leaf_query().
//
// `out` must be non-null and is reset to nullptr on entry. `binding_key` is optional and is
// cleared on entry; only the leaf path may fill it. default_operator, minimum_should_match and
// num_rows are forwarded unchanged to children and leaves. The first failing child aborts the
// build and its status is returned.
Status FunctionSearch::build_query_recursive(
        const TSearchClause& clause, const std::shared_ptr<IndexQueryContext>& context,
        FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out,
        std::string* binding_key, const std::string& default_operator, int32_t minimum_should_match,
        uint32_t num_rows) const {
    DCHECK(out != nullptr);
    *out = nullptr;
    if (binding_key != nullptr) {
        binding_key->clear();
    }

    const std::string& type = clause.clause_type;

    // Builds one child clause with exactly the settings this call received.
    auto build_child = [&](const TSearchClause& child, query_v2::QueryPtr* sub_query,
                           std::string* sub_key) -> Status {
        return build_query_recursive(child, context, resolver, sub_query, sub_key,
                                     default_operator, minimum_should_match, num_rows);
    };

    // MATCH_ALL_DOCS - matches all documents in the segment
    if (type == "MATCH_ALL_DOCS") {
        *out = std::make_shared<query_v2::AllQuery>();
        return Status::OK();
    }

    // OCCUR_BOOLEAN - Lucene-style boolean query with MUST/SHOULD/MUST_NOT
    if (type == "OCCUR_BOOLEAN") {
        auto occur_builder =
                segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();

        // The clause-level minimum_should_match (when set) configures this builder; the
        // function argument of the same name is only forwarded to the children.
        if (clause.__isset.minimum_should_match) {
            occur_builder->set_minimum_number_should_match(clause.minimum_should_match);
        }

        if (clause.__isset.children) {
            for (const auto& child : clause.children) {
                query_v2::QueryPtr sub_query;
                std::string sub_key;
                RETURN_IF_ERROR(build_child(child, &sub_query, &sub_key));

                // A child without an explicit occur is treated as MUST.
                query_v2::Occur occur = child.__isset.occur ? map_thrift_occur(child.occur)
                                                            : query_v2::Occur::MUST;
                occur_builder->add(sub_query, occur, std::move(sub_key));
            }
        }

        *out = occur_builder->build();
        return Status::OK();
    }

    if (type == "NESTED") {
        return Status::InvalidArgument("NESTED clause must be evaluated at top level");
    }

    // Standard boolean operators (AND/OR/NOT)
    const bool is_or = (type == "OR");
    const bool is_not = (type == "NOT");
    if (is_or || is_not || type == "AND") {
        const query_v2::OperatorType op =
                is_or ? query_v2::OperatorType::OP_OR
                      : (is_not ? query_v2::OperatorType::OP_NOT : query_v2::OperatorType::OP_AND);

        auto op_builder = create_operator_boolean_query_builder(op);
        if (clause.__isset.children) {
            for (const auto& child : clause.children) {
                query_v2::QueryPtr sub_query;
                std::string sub_key;
                RETURN_IF_ERROR(build_child(child, &sub_query, &sub_key));
                // Every child is added, including empty BitSetQuery results; the boolean
                // query is expected to handle them:
                // - AND with empty bitmap → result is empty
                // - OR with empty bitmap → empty bitmap is ignored by OR logic
                // - NOT with empty bitmap → NOT(empty) = all rows (handled by BooleanQuery)
                op_builder->add(sub_query, std::move(sub_key));
            }
        }

        *out = op_builder->build();
        return Status::OK();
    }

    // Not a composite clause: treat it as a leaf.
    return build_leaf_query(clause, context, resolver, out, binding_key, default_operator,
                            minimum_should_match, num_rows);
}
670
671
// Builds the query for a single leaf clause (one field, one value) and stores it in *out.
//
// Flow:
//   1. The clause must carry both field_name and value, otherwise InvalidArgument is returned.
//   2. The resolver is asked for a reader binding of the field using the query type derived
//      from clause_type (TERM/WILDCARD/PREFIX/REGEXP are overridden to MATCH_ANY_QUERY).
//   3. An unbound field, an unsupported direct-index clause, an unparsable scalar value, a
//      missing iterator or a missing lucene reader all yield make_unknown_query(num_rows)
//      instead of an error.
//   4. Direct-index bindings are evaluated right away through the index iterator and wrapped,
//      together with the null bitmap, in a BitSetQuery.
//   5. Otherwise a query_v2 query is built according to clause_type.
//
// Every produced query is passed through resolver.map_leaf_query() before returning.
//
// Parameters:
//   out                  - must be non-null; reset to nullptr on entry.
//   binding_key          - optional; cleared on entry and set to the binding's key once the
//                          field is bound.
//   default_operator     - "and" combines analyzed TERM tokens with AND/MUST, any other value
//                          with OR/SHOULD.
//   minimum_should_match - when > 0, a multi-token TERM clause is built as an occur-boolean
//                          query with this threshold.
//   num_rows             - forwarded to make_unknown_query() and to the direct index read.
//
// Returns the failing status of resolver.resolve(), the index iterator reads, or
// resolver.map_leaf_query(); Status::OK() otherwise.
Status FunctionSearch::build_leaf_query(const TSearchClause& clause,
                                        const std::shared_ptr<IndexQueryContext>& context,
                                        FieldReaderResolver& resolver,
                                        inverted_index::query_v2::QueryPtr* out,
                                        std::string* binding_key,
                                        const std::string& default_operator,
                                        int32_t minimum_should_match, uint32_t num_rows) const {
    DCHECK(out != nullptr);
    *out = nullptr;
    if (binding_key) {
        binding_key->clear();
    }

    if (!clause.__isset.field_name || !clause.__isset.value) {
        return Status::InvalidArgument("search clause missing field_name or value");
    }

    const std::string& field_name = clause.field_name;
    const std::string& value = clause.value;
    const std::string& clause_type = clause.clause_type;

    auto query_type = clause_type_to_query_type(clause_type);
    // TERM, WILDCARD, PREFIX, and REGEXP in search DSL operate on individual index terms
    // (like Lucene TermQuery, WildcardQuery, PrefixQuery, RegexpQuery).
    // Override to MATCH_ANY_QUERY so select_best_reader() prefers the FULLTEXT reader
    // when multiple indexes exist on the same column (one tokenized, one untokenized).
    // Without this, these queries would select the untokenized index and try to match
    // patterns like "h*llo" against full strings ("hello world") instead of individual
    // tokens ("hello"), returning empty results.
    // EXACT must remain EQUAL_QUERY to prefer the untokenized STRING_TYPE reader.
    //
    // Safe for single-index columns: select_best_reader() has a single-reader fast path
    // that returns the only reader directly, bypassing the query_type preference logic.
    if (clause_type == "TERM" || clause_type == "WILDCARD" || clause_type == "PREFIX" ||
        clause_type == "REGEXP") {
        query_type = InvertedIndexQueryType::MATCH_ANY_QUERY;
    }

    // Single exit point for every successful path: publish the query into *out and hand it
    // to the resolver for this field.
    auto finish_leaf_query = [&](query_v2::QueryPtr query) -> Status {
        *out = std::move(query);
        return resolver.map_leaf_query(field_name, out);
    };

    FieldReaderBinding binding;
    RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding));

    if (!binding.is_bound()) {
        LOG(INFO) << "search: No inverted index for field '" << field_name
                  << "' in this segment, clause_type='" << clause_type
                  << "', query_type=" << static_cast<int>(query_type)
                  << ", returning UNKNOWN bitmap";
        // binding_key was already cleared on entry and has not been written since.
        return finish_leaf_query(make_unknown_query(num_rows));
    }

    if (binding_key) {
        *binding_key = binding.binding_key;
    }

    // ---- Direct index reader: evaluate now and wrap the resulting bitmap ----
    if (binding.use_direct_index_reader()) {
        auto direct_query_type = direct_index_query_type_for_clause(clause_type);
        if (direct_query_type == InvertedIndexQueryType::UNKNOWN_QUERY) {
            return finish_leaf_query(make_unknown_query(num_rows));
        }

        auto value_type = unwrap_direct_index_value_type(binding.column_type);
        Field param_value;
        auto parse_status = parse_scalar_search_value(value_type, value, &param_value);
        if (!parse_status.ok()) {
            // A value that cannot be parsed for this column type is not an error for the
            // whole query; the leaf just becomes UNKNOWN.
            LOG(INFO) << "search: scalar leaf value is unsupported, field=" << field_name
                      << ", value='" << value << "', reason=" << parse_status.to_string();
            return finish_leaf_query(make_unknown_query(num_rows));
        }

        auto* iterator = resolver.get_iterator(field_name);
        if (iterator == nullptr) {
            return finish_leaf_query(make_unknown_query(num_rows));
        }

        segment_v2::InvertedIndexParam param;
        param.column_name = binding.stored_field_name;
        param.column_type = value_type;
        param.query_value = param_value;
        param.query_type = direct_query_type;
        param.num_rows = num_rows;
        param.roaring = std::make_shared<roaring::Roaring>();
        RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam {&param}));

        // Start from an empty null bitmap and replace it only when the iterator reports
        // nulls and the cache handle actually holds a bitmap.
        std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
        auto has_null = iterator->has_null();
        if (has_null.has_value() && has_null.value()) {
            segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
            RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap_cache_handle));
            if (auto bitmap = null_bitmap_cache_handle.get_bitmap(); bitmap != nullptr) {
                null_bitmap = bitmap;
            }
        }
        return finish_leaf_query(std::make_shared<query_v2::BitSetQuery>(std::move(param.roaring),
                                                                         std::move(null_bitmap)));
    }

    if (binding.lucene_reader == nullptr) {
        return finish_leaf_query(make_unknown_query(num_rows));
    }

    FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type);
    // `binding` outlives every query built below, so a reference avoids copying the field name.
    const std::wstring& field_wstr = binding.stored_field_wstr;
    std::wstring value_wstr = StringHelper::to_wstring(value);

    auto make_term_query = [&](const std::wstring& term) -> query_v2::QueryPtr {
        return std::make_shared<query_v2::TermQuery>(context, field_wstr, term);
    };

    // Combines one TermQuery per analyzed term under the given boolean operator.
    // Shared by the TERM and ANY/ALL paths.
    auto make_operator_query = [&](query_v2::OperatorType op,
                                   const std::vector<TermInfo>& infos) -> query_v2::QueryPtr {
        auto builder = create_operator_boolean_query_builder(op);
        for (const auto& info : infos) {
            std::wstring term_wstr = StringHelper::to_wstring(info.get_single_term());
            builder->add(make_term_query(term_wstr), binding.binding_key);
        }
        return builder->build();
    };

    // Builds the WildcardQuery used by both PREFIX and WILDCARD clauses.
    // The pattern is lowercased only if:
    // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing)
    // 2. lower_case is explicitly set to "true"
    auto make_pattern_query = [&]() -> query_v2::QueryPtr {
        bool has_parser =
                inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
        std::string lowercase_setting =
                get_parser_lowercase_from_properties(binding.index_properties);
        bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE);
        std::string pattern = should_lowercase ? to_lower(value) : value;
        // clause_type is "PREFIX" or "WILDCARD" here, so the message text is unchanged.
        VLOG_DEBUG << "search: " << clause_type << " clause processed, field=" << field_name
                   << ", pattern='" << pattern << "' (original='" << value
                   << "', has_parser=" << has_parser << ", lower_case=" << lowercase_setting
                   << ")";
        return std::make_shared<query_v2::WildcardQuery>(context, field_wstr, pattern);
    };

    // ---- TERM ----
    if (clause_type == "TERM") {
        bool should_analyze =
                inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
        if (!should_analyze) {
            // No analyzer on the index: match the raw value as a single term.
            return finish_leaf_query(make_term_query(value_wstr));
        }

        if (binding.index_properties.empty()) {
            LOG(WARNING) << "search: analyzer required but index properties empty for field '"
                         << field_name << "'";
            return finish_leaf_query(make_term_query(value_wstr));
        }

        std::vector<TermInfo> term_infos =
                inverted_index::InvertedIndexAnalyzer::get_analyse_result(value,
                                                                          binding.index_properties);
        if (term_infos.empty()) {
            LOG(WARNING) << "search: No terms found after tokenization for TERM query, field="
                         << field_name << ", value='" << value
                         << "', returning empty BitSetQuery";
            return finish_leaf_query(std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()));
        }

        if (term_infos.size() == 1) {
            std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
            return finish_leaf_query(make_term_query(term_wstr));
        }

        // When minimum_should_match is specified, use OccurBooleanQuery
        // ES behavior: msm only applies to SHOULD clauses
        if (minimum_should_match > 0) {
            auto builder =
                    segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder();
            builder->set_minimum_number_should_match(minimum_should_match);
            query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST
                                                                : query_v2::Occur::SHOULD;
            for (const auto& term_info : term_infos) {
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
                // Pass the binding key like every other builder in this function does, so
                // these term queries stay associated with the field's binding.
                builder->add(make_term_query(term_wstr), occur, std::string(binding.binding_key));
            }
            return finish_leaf_query(builder->build());
        }

        // Use default_operator to determine how to combine tokenized terms
        query_v2::OperatorType op_type = (default_operator == "and")
                                                 ? query_v2::OperatorType::OP_AND
                                                 : query_v2::OperatorType::OP_OR;
        return finish_leaf_query(make_operator_query(op_type, term_infos));
    }

    // ---- Tokenized clause types ----
    if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) {
        if (clause_type == "PHRASE") {
            bool should_analyze =
                    inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
            if (!should_analyze) {
                VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name
                           << "', falling back to TERM";
                return finish_leaf_query(make_term_query(value_wstr));
            }

            if (binding.index_properties.empty()) {
                LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE "
                                "query on field '"
                             << field_name << "'";
                return finish_leaf_query(make_term_query(value_wstr));
            }

            std::vector<TermInfo> term_infos =
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
                            value, binding.index_properties);
            if (term_infos.empty()) {
                LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field="
                             << field_name << ", value='" << value
                             << "', returning empty BitSetQuery";
                return finish_leaf_query(
                        std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()));
            }

            std::vector<TermInfo> phrase_term_infos =
                    QueryHelper::build_phrase_term_infos(term_infos);

            if (phrase_term_infos.size() != 1) {
                // Multi-position phrase: simple phrases use PhraseQuery, the rest
                // MultiPhraseQuery.
                if (QueryHelper::is_simple_phrase(phrase_term_infos)) {
                    return finish_leaf_query(std::make_shared<query_v2::PhraseQuery>(
                            context, field_wstr, phrase_term_infos));
                }
                return finish_leaf_query(std::make_shared<query_v2::MultiPhraseQuery>(
                        context, field_wstr, phrase_term_infos));
            }

            // Exactly one position: a phrase query is unnecessary.
            const auto& term_info = phrase_term_infos[0];
            if (term_info.is_single_term()) {
                std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term());
                return finish_leaf_query(
                        std::make_shared<query_v2::TermQuery>(context, field_wstr, term_wstr));
            }

            // One position carrying several alternative terms: OR them together.
            auto builder = create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR);
            for (const auto& term : term_info.get_multi_terms()) {
                std::wstring term_wstr = StringHelper::to_wstring(term);
                builder->add(make_term_query(term_wstr), binding.binding_key);
            }
            return finish_leaf_query(builder->build());
        }

        if (clause_type == "MATCH") {
            VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM";
            return finish_leaf_query(make_term_query(value_wstr));
        }

        if (clause_type == "ANY" || clause_type == "ALL") {
            bool should_analyze =
                    inverted_index::InvertedIndexAnalyzer::should_analyzer(binding.index_properties);
            if (!should_analyze) {
                return finish_leaf_query(make_term_query(value_wstr));
            }

            if (binding.index_properties.empty()) {
                LOG(WARNING) << "search: index properties empty for tokenized clause '"
                             << clause_type << "' field=" << field_name;
                return finish_leaf_query(make_term_query(value_wstr));
            }

            std::vector<TermInfo> term_infos =
                    inverted_index::InvertedIndexAnalyzer::get_analyse_result(
                            value, binding.index_properties);
            if (term_infos.empty()) {
                LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type
                             << "', field=" << field_name << ", returning empty BitSetQuery";
                return finish_leaf_query(
                        std::make_shared<query_v2::BitSetQuery>(roaring::Roaring()));
            }

            if (term_infos.size() == 1) {
                std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term());
                return finish_leaf_query(make_term_query(term_wstr));
            }

            // ALL requires every token (AND); ANY accepts any of them (OR).
            query_v2::OperatorType bool_type = (clause_type == "ALL")
                                                       ? query_v2::OperatorType::OP_AND
                                                       : query_v2::OperatorType::OP_OR;
            return finish_leaf_query(make_operator_query(bool_type, term_infos));
        }

        // Default tokenized clause fallback
        return finish_leaf_query(make_term_query(value_wstr));
    }

    // ---- Non-tokenized clause types ----
    if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) {
        if (clause_type == "EXACT") {
            // EXACT match: exact string matching without tokenization
            // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase
            // If only tokenized index exists, EXACT may return empty results because
            // tokenized indexes store individual tokens, not complete strings
            VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='"
                       << value << "'";
            return finish_leaf_query(make_term_query(value_wstr));
        }

        if (clause_type == "PREFIX") {
            return finish_leaf_query(make_pattern_query());
        }

        if (clause_type == "WILDCARD") {
            // Standalone wildcard "*" matches all non-null values for this field
            // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery
            if (value == "*") {
                VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field="
                           << field_name;
                return finish_leaf_query(std::make_shared<query_v2::AllQuery>(field_wstr, true));
            }
            return finish_leaf_query(make_pattern_query());
        }

        if (clause_type == "REGEXP") {
            // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching)
            // This matches ES query_string behavior where regex patterns bypass analysis
            VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='"
                       << value << "'";
            return finish_leaf_query(
                    std::make_shared<query_v2::RegexpQuery>(context, field_wstr, value));
        }

        if (clause_type == "RANGE" || clause_type == "LIST") {
            VLOG_DEBUG << "search: clause type '" << clause_type
                       << "' not implemented, fallback to TERM";
        }
        // RANGE, LIST and any other non-tokenized type fall back to a plain term match.
        return finish_leaf_query(make_term_query(value_wstr));
    }

    LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback";
    return finish_leaf_query(make_term_query(value_wstr));
}
1020
1021
8
// Registers FunctionSearch with the given function factory.
void register_function_search(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionSearch>();
}
1024
1025
} // namespace doris