Coverage Report

Created: 2026-05-26 10:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/vsearch.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/vsearch.h"
19
20
#include <fmt/format.h>
21
22
#include <memory>
23
#include <roaring/roaring.hh>
24
25
#include "common/logging.h"
26
#include "common/status.h"
27
#include "core/column/column_const.h"
28
#include "exprs/function/function_search.h"
29
#include "exprs/vexpr_context.h"
30
#include "exprs/vliteral.h"
31
#include "exprs/vslot_ref.h"
32
#include "glog/logging.h"
33
#include "runtime/runtime_state.h"
34
#include "storage/index/inverted/inverted_index_reader.h"
35
#include "storage/olap_common.h"
36
#include "storage/segment/segment.h"
37
38
namespace doris {
39
using namespace segment_v2;
40
41
namespace {
42
43
struct SearchInputBundle {
44
    std::unordered_map<std::string, IndexIterator*> iterators;
45
    std::unordered_map<std::string, IndexFieldNameAndTypePair> field_types;
46
    std::unordered_map<std::string, int> field_name_to_column_id;
47
    std::vector<int> column_ids;
48
    ColumnsWithTypeAndName literal_args;
49
};
50
51
void add_search_binding_diagnostic(const IndexExecContext* index_context,
52
79
                                   const std::string& diagnostic) {
53
79
    VLOG_DEBUG << diagnostic;
54
79
    if (index_context == nullptr) {
55
0
        return;
56
0
    }
57
79
    const auto& index_query_context = index_context->get_index_query_context();
58
79
    if (index_query_context != nullptr && index_query_context->stats != nullptr) {
59
79
        index_query_context->stats->inverted_index_stats.add_binding_diagnostic(diagnostic);
60
79
    }
61
79
}
62
63
Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context,
64
1.27k
                             SearchInputBundle* bundle) {
65
1.27k
    DCHECK(bundle != nullptr);
66
67
1.27k
    auto index_context = context->get_index_context();
68
1.27k
    if (index_context == nullptr) {
69
0
        LOG(WARNING) << "collect_search_inputs: No inverted index context available";
70
0
        return Status::InternalError("No inverted index context available");
71
0
    }
72
73
    // Get field bindings for variant subcolumn support
74
1.27k
    const auto& search_param = expr.get_search_param();
75
1.27k
    const auto& field_bindings = search_param.field_bindings;
76
77
1.27k
    std::unordered_map<std::string, ColumnId> parent_to_base_column_id;
78
1.27k
    std::unordered_map<std::string, std::string> parent_to_storage_field_prefix;
79
80
    // Resolve and cache the base (parent) column id for a variant field binding.
81
    // This avoids repeated schema lookups when multiple subcolumns share the same parent column.
82
1.27k
    auto resolve_parent_column_id = [&](const std::string& parent_field, ColumnId* column_id) {
83
        // Guard against invalid inputs: variant bindings may miss parent_field, and callers must
84
        // provide a valid output pointer to receive the resolved id.
85
14
        if (parent_field.empty() || column_id == nullptr) {
86
0
            return false;
87
0
        }
88
14
        auto it = parent_to_base_column_id.find(parent_field);
89
14
        if (it != parent_to_base_column_id.end()) {
90
0
            *column_id = it->second;
91
0
            return true;
92
0
        }
93
14
        if (index_context == nullptr || index_context->segment() == nullptr) {
94
0
            return false;
95
0
        }
96
14
        const int32_t ordinal =
97
14
                index_context->segment()->tablet_schema()->field_index(parent_field);
98
14
        if (ordinal < 0) {
99
0
            return false;
100
0
        }
101
14
        ColumnId resolved_id = static_cast<ColumnId>(ordinal);
102
14
        parent_to_base_column_id.emplace(parent_field, resolved_id);
103
14
        if (auto* storage_name_type = index_context->get_storage_name_and_type_by_id(resolved_id);
104
14
            storage_name_type != nullptr) {
105
14
            parent_to_storage_field_prefix[parent_field] = storage_name_type->first;
106
14
        }
107
14
        *column_id = resolved_id;
108
14
        return true;
109
14
    };
110
111
1.27k
    int child_index = 0; // Index for iterating through children
112
1.80k
    for (const auto& child : expr.children()) {
113
1.80k
        if (child->is_slot_ref()) {
114
1.79k
            auto* column_slot_ref = assert_cast<VSlotRef*>(child.get());
115
1.79k
            int column_id = column_slot_ref->column_id();
116
117
            // Determine the field_name from field_bindings (for variant subcolumns)
118
            // field_bindings and children should have the same order
119
1.79k
            std::string field_name;
120
1.79k
            const TSearchFieldBinding* binding = nullptr;
121
1.79k
            if (child_index < field_bindings.size()) {
122
                // Use field_name from binding (may include "parent.subcolumn" for variant)
123
1.77k
                binding = &field_bindings[child_index];
124
1.77k
                field_name = binding->field_name;
125
1.77k
            } else {
126
                // Fallback to column_name if binding not found
127
15
                field_name = column_slot_ref->column_name();
128
15
            }
129
130
1.79k
            bundle->field_name_to_column_id[field_name] = column_id;
131
132
1.79k
            auto* iterator = index_context->get_inverted_index_iterator_by_column_id(column_id);
133
1.79k
            const auto* storage_name_type =
134
1.79k
                    index_context->get_storage_name_and_type_by_column_id(column_id);
135
1.79k
            bool field_added = false;
136
            // For variant subcolumns, slot_ref might not map to a real indexed column in the scan schema.
137
            // Fall back to the parent variant column's iterator and synthesize lucene field name.
138
1.79k
            if (iterator == nullptr && binding != nullptr &&
139
1.79k
                binding->__isset.is_variant_subcolumn && binding->is_variant_subcolumn &&
140
1.79k
                binding->__isset.parent_field_name && !binding->parent_field_name.empty()) {
141
14
                ColumnId base_column_id = 0;
142
14
                if (resolve_parent_column_id(binding->parent_field_name, &base_column_id)) {
143
13
                    iterator = index_context->get_inverted_index_iterator_by_id(base_column_id);
144
13
                    const auto* base_storage_name_type =
145
13
                            index_context->get_storage_name_and_type_by_id(base_column_id);
146
13
                    if (iterator != nullptr && base_storage_name_type != nullptr) {
147
0
                        std::string prefix = base_storage_name_type->first;
148
0
                        if (auto pit =
149
0
                                    parent_to_storage_field_prefix.find(binding->parent_field_name);
150
0
                            pit != parent_to_storage_field_prefix.end() && !pit->second.empty()) {
151
0
                            prefix = pit->second;
152
0
                        } else {
153
0
                            parent_to_storage_field_prefix[binding->parent_field_name] = prefix;
154
0
                        }
155
156
0
                        std::string sub_path;
157
0
                        if (binding->__isset.subcolumn_path) {
158
0
                            sub_path = binding->subcolumn_path;
159
0
                        }
160
0
                        if (sub_path.empty()) {
161
                            // Fallback: strip "parent." prefix from logical field name
162
0
                            std::string pfx = binding->parent_field_name + ".";
163
0
                            if (field_name.starts_with(pfx)) {
164
0
                                sub_path = field_name.substr(pfx.size());
165
0
                            }
166
0
                        }
167
0
                        if (!sub_path.empty()) {
168
0
                            bundle->iterators[field_name] = iterator;
169
0
                            bundle->field_types[field_name] =
170
0
                                    std::make_pair(prefix + "." + sub_path, nullptr);
171
0
                            int base_column_index =
172
0
                                    index_context->column_index_by_id(base_column_id);
173
0
                            if (base_column_index >= 0) {
174
0
                                bundle->column_ids.emplace_back(base_column_index);
175
0
                            }
176
0
                            add_search_binding_diagnostic(
177
0
                                    index_context.get(),
178
0
                                    fmt::format("[VariantSearchBinding] phase=collect_inputs "
179
0
                                                "result=parent_fallback logical_field={} "
180
0
                                                "parent_field={} sub_path={} base_column_id={} "
181
0
                                                "stored_field={} reason=slot_iterator_missing",
182
0
                                                field_name, binding->parent_field_name, sub_path,
183
0
                                                base_column_id, prefix + "." + sub_path));
184
0
                            field_added = true;
185
0
                        }
186
0
                    }
187
13
                } else {
188
1
                    add_search_binding_diagnostic(
189
1
                            index_context.get(),
190
1
                            fmt::format("[VariantSearchBinding] phase=collect_inputs "
191
1
                                        "result=reject logical_field={} parent_field={} "
192
1
                                        "reason=parent_column_not_found",
193
1
                                        field_name, binding->parent_field_name));
194
1
                }
195
14
            }
196
197
            // Only collect fields that have iterators (materialized columns with indexes)
198
1.80k
            if (!field_added && iterator != nullptr) {
199
1.77k
                if (storage_name_type == nullptr) {
200
1
                    return Status::InternalError("storage_name_type not found for column {} in {}",
201
1
                                                 column_id, expr.expr_name());
202
1
                }
203
204
1.77k
                bundle->iterators.emplace(field_name, iterator);
205
1.77k
                bundle->field_types.emplace(field_name, *storage_name_type);
206
1.77k
                bundle->column_ids.emplace_back(column_id);
207
1.79k
                if (binding != nullptr && binding->__isset.is_variant_subcolumn &&
208
1.79k
                    binding->is_variant_subcolumn) {
209
62
                    add_search_binding_diagnostic(
210
62
                            index_context.get(),
211
62
                            fmt::format("[VariantSearchBinding] phase=collect_inputs "
212
62
                                        "result=direct_iterator logical_field={} column_id={} "
213
62
                                        "stored_field={}",
214
62
                                        field_name, column_id, storage_name_type->first));
215
62
                }
216
1.77k
            }
217
218
1.79k
            child_index++;
219
1.79k
        } else if (child->is_literal()) {
220
0
            auto* literal = assert_cast<VLiteral*>(child.get());
221
0
            bundle->literal_args.emplace_back(literal->get_column_ptr(), literal->get_data_type(),
222
0
                                              literal->expr_name());
223
12
        } else {
224
            // Check if this is ElementAt expression (for variant subcolumn access)
225
12
            if (child->expr_name() == "element_at" && child_index < field_bindings.size() &&
226
12
                field_bindings[child_index].__isset.is_variant_subcolumn &&
227
12
                field_bindings[child_index].is_variant_subcolumn) {
228
                // Variant subcolumn not materialized - skip, will create empty BitSetQuery in function_search
229
0
                add_search_binding_diagnostic(
230
0
                        index_context.get(),
231
0
                        fmt::format("[VariantSearchBinding] phase=collect_inputs "
232
0
                                    "result=unmaterialized_element_at logical_field={} "
233
0
                                    "parent_field={} sub_path={} reason=no_slot_ref",
234
0
                                    field_bindings[child_index].field_name,
235
0
                                    field_bindings[child_index].__isset.parent_field_name
236
0
                                            ? field_bindings[child_index].parent_field_name
237
0
                                            : "",
238
0
                                    field_bindings[child_index].__isset.subcolumn_path
239
0
                                            ? field_bindings[child_index].subcolumn_path
240
0
                                            : ""));
241
0
                child_index++;
242
0
                continue;
243
0
            }
244
245
            // Not a supported child type
246
12
            return Status::InvalidArgument("Unsupported child node type: {}", child->expr_name());
247
12
        }
248
1.80k
    }
249
250
1.25k
    return Status::OK();
251
1.27k
}
252
253
} // namespace
254
255
541
VSearchExpr::VSearchExpr(const TExprNode& node) : VExpr(node) {
256
541
    if (node.__isset.search_param) {
257
537
        _search_param = node.search_param;
258
537
        _original_dsl = _search_param.original_dsl;
259
537
    }
260
541
}
261
262
Status VSearchExpr::prepare(RuntimeState* state, const RowDescriptor& row_desc,
263
465
                            VExprContext* context) {
264
465
    RETURN_IF_ERROR(VExpr::prepare(state, row_desc, context));
265
465
    const auto& query_options = state->query_options();
266
465
    if (query_options.__isset.enable_inverted_index_query_cache) {
267
465
        _enable_cache = query_options.enable_inverted_index_query_cache;
268
465
    }
269
465
    return Status::OK();
270
465
}
271
272
94
const std::string& VSearchExpr::expr_name() const {
273
94
    static const std::string name = "VSearchExpr";
274
94
    return name;
275
94
}
276
277
Status VSearchExpr::execute_column_impl(VExprContext* context, const Block* block,
278
                                        const Selector* selector, size_t count,
279
5
                                        ColumnPtr& result_column) const {
280
5
    if (fast_execute(context, selector, count, result_column)) {
281
4
        return Status::OK();
282
4
    }
283
284
1
    return Status::InternalError("SearchExpr should not be executed without inverted index");
285
5
}
286
287
1.28k
Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segment_num_rows) {
288
1.28k
    if (_search_param.original_dsl.empty()) {
289
3
        return Status::InvalidArgument("search DSL is empty");
290
3
    }
291
292
1.28k
    auto index_context = context->get_index_context();
293
1.28k
    if (!index_context) {
294
15
        LOG(WARNING) << "VSearchExpr: No inverted index context available";
295
15
        return Status::OK();
296
15
    }
297
298
1.26k
    SearchInputBundle bundle;
299
1.26k
    RETURN_IF_ERROR(collect_search_inputs(*this, context, &bundle));
300
301
18.4E
    VLOG_DEBUG << "VSearchExpr: bundle.iterators.size()=" << bundle.iterators.size();
302
303
1.26k
    const bool is_nested_query = _search_param.root.clause_type == "NESTED";
304
1.26k
    if (bundle.iterators.empty() && !is_nested_query) {
305
18
        LOG(WARNING) << "VSearchExpr: No indexed columns available for evaluation, DSL: "
306
18
                     << _original_dsl;
307
18
        add_search_binding_diagnostic(
308
18
                index_context.get(),
309
18
                fmt::format("[VariantSearchBinding] phase=evaluate_search result=no_iterator "
310
18
                            "dsl={} reason=no_indexed_columns",
311
18
                            _original_dsl));
312
18
        auto empty_bitmap = InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(),
313
18
                                                      std::make_shared<roaring::Roaring>());
314
18
        index_context->set_index_result_for_expr(this, std::move(empty_bitmap));
315
18
        return Status::OK();
316
18
    }
317
318
1.24k
    auto index_query_context = index_context->get_index_query_context();
319
320
1.24k
    auto function = std::make_shared<FunctionSearch>();
321
1.24k
    auto result_bitmap = InvertedIndexResultBitmap();
322
1.24k
    auto status = function->evaluate_inverted_index_with_search_param(
323
1.24k
            _search_param, bundle.field_types, bundle.iterators, segment_num_rows, result_bitmap,
324
1.24k
            _enable_cache, index_context.get(), bundle.field_name_to_column_id,
325
1.24k
            index_query_context);
326
327
1.24k
    if (!status.ok()) {
328
2
        LOG(WARNING) << "VSearchExpr: Function evaluation failed: " << status.to_string();
329
2
        return status;
330
2
    }
331
332
1.24k
    index_context->set_index_result_for_expr(this, result_bitmap);
333
1.83k
    for (int column_id : bundle.column_ids) {
334
1.83k
        index_context->set_true_for_index_status(this, column_id);
335
1.83k
    }
336
337
1.24k
    return Status::OK();
338
1.24k
}
339
340
} // namespace doris