Coverage Report

Created: 2026-05-09 19:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/predicate_collector.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/predicate_collector.h"
19
20
#include <glog/logging.h>
21
22
#include <vector>
23
24
#include "exec/common/variant_util.h"
25
#include "exprs/vexpr.h"
26
#include "exprs/vexpr_context.h"
27
#include "exprs/vliteral.h"
28
#include "exprs/vsearch.h"
29
#include "exprs/vslot_ref.h"
30
#include "gen_cpp/Exprs_types.h"
31
#include "storage/index/index_reader_helper.h"
32
#include "storage/index/inverted/analyzer/analyzer.h"
33
#include "storage/index/inverted/util/string_helper.h"
34
#include "storage/tablet/tablet_schema.h"
35
36
namespace doris {
37
38
using namespace segment_v2;
39
40
26
// Searches `expr` depth-first for the first node whose type is SLOT_REF.
//
// The expression is first passed through VExpr::expr_without_cast (presumably
// to look through wrapping casts -- confirm against VExpr), then its children
// are visited in order.
//
// Returns the matching node as a VSlotRef*, or nullptr when `expr` is null or
// no slot reference is reachable from it.
VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const {
    if (expr == nullptr) {
        return nullptr;
    }

    auto unwrapped = VExpr::expr_without_cast(expr);
    if (unwrapped->node_type() == TExprNodeType::SLOT_REF) {
        return static_cast<VSlotRef*>(unwrapped.get());
    }

    // Not a slot itself: the first child subtree that yields a slot wins.
    VSlotRef* found = nullptr;
    for (const auto& child : unwrapped->children()) {
        found = find_slot_ref(child);
        if (found != nullptr) {
            break;
        }
    }
    return found;
}
58
59
std::string PredicateCollector::build_field_name(int32_t col_unique_id,
60
18
                                                 const std::string& suffix_path) const {
61
18
    std::string field_name = std::to_string(col_unique_id);
62
18
    if (!suffix_path.empty()) {
63
5
        field_name += "." + suffix_path;
64
5
    }
65
18
    return field_name;
66
18
}
67
68
// Collects the analyzed query terms of a match predicate into `collect_infos`.
//
// `expr` is expected to have the column side as child 0 and the query literal
// as child 1. For every inverted index of the referenced column that uses an
// analyzer and needs a similarity score for `expr->op()`, the literal is
// analyzed with that index's properties and the resulting terms are merged into
// the entry keyed by "<col_unique_id>[.<suffix_path>]".
//
// Returns INVERTED_INDEX_NOT_SUPPORTED when the expression is malformed, when
// the slot reference, slot descriptor or schema column cannot be resolved, or
// (outside BE_TEST builds) when the column has no usable inverted index.
Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema,
                                        const VExprSPtr& expr, CollectInfoMap* collect_infos) {
    DCHECK(collect_infos != nullptr);

    // Both operands are indexed below; reject malformed expressions up front
    // instead of reading past the end of children().
    if (expr == nullptr || expr->children().size() < 2) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Match predicate requires a left and a right "
                "operand");
    }

    auto* left_slot_ref = find_slot_ref(expr->children()[0]);
    if (left_slot_ref == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find slot reference in match predicate "
                "left expression");
    }

    // A checked downcast: DCHECK is compiled out of release builds and a
    // static_cast cannot detect a non-literal (or null) right operand.
    auto* right_literal = dynamic_cast<VLiteral*>(expr->children()[1].get());
    if (right_literal == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Right operand of match predicate is not a "
                "literal");
    }

    const auto* sd = state->desc_tbl().get_slot_descriptor(left_slot_ref->slot_id());
    if (sd == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find slot descriptor for slot_id={}",
                left_slot_ref->slot_id());
    }

    int32_t col_idx = tablet_schema->field_index(left_slot_ref->column_name());
    if (col_idx == -1) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find column index for column={}",
                left_slot_ref->column_name());
    }

    const auto& column = tablet_schema->column(col_idx);
    auto index_metas = tablet_schema->inverted_indexs(column);
    // Keeps index metas created in this call alive; `index_metas` only holds raw pointers.
    std::vector<std::shared_ptr<const TabletIndex>> owned_index_metas;
    std::string index_suffix_path = column.suffix_path();

    // Schema-only fallback for variant sub-columns. Collector runs at tablet
    // level without segment context, so we cannot do nested-group inference
    // or inherit_index runtime-type dispatch. Two paths cover what is
    // resolvable from schema alone:
    //   1. field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB) via
    //      generate_sub_column_info.
    //   2. Plain parent inverted index when the schema column is the dynamic
    //      path's VARIANT placeholder produced by _init_variant_columns. In
    //      that state inverted_indexs(column) misses because
    //      _path_set_info_map.subcolumn_indexes is only populated for typed
    //      paths / field_pattern outputs, not for plain parent indexes added
    //      by ALTER. Clone the parent's non-field-pattern indexes with the
    //      variant path as suffix so segment-side BM25 statistics can be
    //      collected.
    if (index_metas.empty() && column.is_extracted_column()) {
        TabletSchema::SubColumnInfo sub_column_info;
        const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path();
        if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(),
                                                   relative_path, &sub_column_info) &&
            !sub_column_info.indexes.empty()) {
            index_suffix_path = sub_column_info.column.suffix_path();
            for (auto& idx : sub_column_info.indexes) {
                // Take the raw pointer before ownership moves into owned_index_metas.
                index_metas.push_back(idx.get());
                owned_index_metas.emplace_back(std::move(idx));
            }
        } else if (column.is_variant_type()) {
            const auto parent_indexes = tablet_schema->inverted_indexs(column.parent_unique_id());
            for (const auto* index : parent_indexes) {
                if (!index->field_pattern().empty()) {
                    continue;
                }
                auto index_ptr = std::make_shared<TabletIndex>(*index);
                index_ptr->set_escaped_escaped_index_suffix_path(
                        column.path_info_ptr()->get_path());
                index_metas.push_back(index_ptr.get());
                owned_index_metas.emplace_back(std::move(index_ptr));
            }
        }
    }

#ifndef BE_TEST
    if (index_metas.empty()) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Score query is not supported without inverted "
                "index for column={}",
                left_slot_ref->column_name());
    }
#endif

    // The format options do not depend on the index, so build them once.
    auto options = DataTypeSerDe::get_default_format_options();
    options.timezone = &state->timezone_obj();

    for (const auto* index_meta : index_metas) {
        if (!InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) {
            continue;
        }

        if (!IndexReaderHelper::is_need_similarity_score(expr->op(), index_meta)) {
            continue;
        }

        auto term_infos = InvertedIndexAnalyzer::get_analyse_result(right_literal->value(options),
                                                                    index_meta->properties());

        std::string field_name =
                build_field_name(index_meta->col_unique_ids()[0], index_suffix_path);
        std::wstring ws_field_name = StringHelper::to_wstring(field_name);

        auto iter = collect_infos->find(ws_field_name);
        if (iter == collect_infos->end()) {
            CollectInfo collect_info;
            collect_info.term_infos.insert(term_infos.begin(), term_infos.end());
            collect_info.index_meta = index_meta;
            // If this meta was created above, hand shared ownership to the
            // entry so the raw pointer stays valid after this call returns.
            for (const auto& owned_index_meta : owned_index_metas) {
                if (owned_index_meta.get() == index_meta) {
                    collect_info.owned_index_meta = owned_index_meta;
                    break;
                }
            }
            (*collect_infos)[ws_field_name] = std::move(collect_info);
        } else {
            // Field already seen: only merge the additional terms.
            iter->second.term_infos.insert(term_infos.begin(), term_infos.end());
        }
    }

    return Status::OK();
}
187
188
// Entry point for search expressions: walks the clause tree stored in the
// expression's search parameter and collects terms into `collect_infos`.
//
// Returns InternalError when `expr` is not a VSearchExpr; otherwise returns
// the status of collecting from the root clause.
Status SearchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema,
                                         const VExprSPtr& expr, CollectInfoMap* collect_infos) {
    DCHECK(collect_infos != nullptr);

    auto* as_search = dynamic_cast<VSearchExpr*>(expr.get());
    if (as_search != nullptr) {
        const TSearchParam& param = as_search->get_search_param();
        return collect_from_clause(param.root, state, tablet_schema, collect_infos);
    }

    return Status::InternalError("SearchPredicateCollector: expr is not VSearchExpr type");
}
203
204
// Recursively collects terms from one search clause.
//
// Compound clauses only forward to their children (if the `children` field is
// set) and stop at the first child that fails; every other clause is handled
// as a leaf by collect_from_leaf.
Status SearchPredicateCollector::collect_from_clause(const TSearchClause& clause,
                                                     RuntimeState* state,
                                                     const TabletSchemaSPtr& tablet_schema,
                                                     CollectInfoMap* collect_infos) {
    const bool is_compound =
            get_clause_type_category(clause.clause_type) == ClauseTypeCategory::COMPOUND;
    if (!is_compound) {
        return collect_from_leaf(clause, state, tablet_schema, collect_infos);
    }

    // A compound clause without children contributes nothing.
    if (!clause.__isset.children) {
        return Status::OK();
    }

    for (const auto& sub_clause : clause.children) {
        RETURN_IF_ERROR(collect_from_clause(sub_clause, state, tablet_schema, collect_infos));
    }
    return Status::OK();
}
223
224
// Collects the terms of a single leaf clause for every inverted index of the
// clause's field.
//
// Returns InvalidArgument when `field_name` or `value` is not set. Returns OK
// without collecting anything when the clause type is not a score query type,
// the field is not in the tablet schema, or the column has no inverted index.
//
// Term extraction per index:
//   - TOKENIZED clauses, and NON_TOKENIZED "TERM" clauses, are run through the
//     index's analyzer when the index has one;
//   - in all other TOKENIZED / NON_TOKENIZED cases the raw value is used as a
//     single term.
Status SearchPredicateCollector::collect_from_leaf(const TSearchClause& clause, RuntimeState* state,
                                                   const TabletSchemaSPtr& tablet_schema,
                                                   CollectInfoMap* collect_infos) {
    if (!(clause.__isset.field_name && clause.__isset.value)) {
        return Status::InvalidArgument("Search clause missing field_name or value");
    }

    const std::string& type_name = clause.clause_type;
    if (!is_score_query_type(type_name)) {
        return Status::OK();
    }

    const int32_t column_pos = tablet_schema->field_index(clause.field_name);
    if (column_pos == -1) {
        return Status::OK();
    }

    const auto& column = tablet_schema->column(column_pos);
    auto metas = tablet_schema->inverted_indexs(column.unique_id(), column.suffix_path());
    if (metas.empty()) {
        return Status::OK();
    }

    const std::string& raw_value = clause.value;
    const ClauseTypeCategory category = get_clause_type_category(type_name);
    const bool tokenized = category == ClauseTypeCategory::TOKENIZED;
    const bool non_tokenized = category == ClauseTypeCategory::NON_TOKENIZED;
    // Only these clause kinds are eligible for analysis; whether analysis
    // actually happens still depends on each index's properties.
    const bool analysis_eligible = tokenized || (non_tokenized && type_name == "TERM");

    for (const auto* meta : metas) {
        std::set<TermInfo, TermInfoComparer> terms;

        if (tokenized || non_tokenized) {
            if (analysis_eligible && InvertedIndexAnalyzer::should_analyzer(meta->properties())) {
                auto analyzed =
                        InvertedIndexAnalyzer::get_analyse_result(raw_value, meta->properties());
                terms.insert(analyzed.begin(), analyzed.end());
            } else {
                terms.insert(TermInfo(raw_value));
            }
        }

        const std::wstring key = StringHelper::to_wstring(
                build_field_name(meta->col_unique_ids()[0], column.suffix_path()));

        // Merge into an existing entry when the field was already collected.
        if (auto existing = collect_infos->find(key); existing != collect_infos->end()) {
            existing->second.term_infos.insert(terms.begin(), terms.end());
            continue;
        }

        CollectInfo fresh;
        fresh.term_infos = std::move(terms);
        fresh.index_meta = meta;
        (*collect_infos)[key] = std::move(fresh);
    }

    return Status::OK();
}
291
292
0
// Returns true when `clause_type` is one of the leaf clause types for which
// terms are collected: TERM, EXACT, PHRASE, MATCH, ANY or ALL.
bool SearchPredicateCollector::is_score_query_type(const std::string& clause_type) const {
    static constexpr const char* kScoreTypes[] = {"TERM",  "EXACT", "PHRASE",
                                                  "MATCH", "ANY",   "ALL"};
    for (const char* candidate : kScoreTypes) {
        if (clause_type == candidate) {
            return true;
        }
    }
    return false;
}
296
297
// Maps a clause type name to its category:
//   AND / OR / NOT / OCCUR_BOOLEAN   -> COMPOUND
//   TERM / EXACT                     -> NON_TOKENIZED
//   PHRASE / MATCH / ANY / ALL       -> TOKENIZED
// Any other name logs a warning and is treated as NON_TOKENIZED.
SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_clause_type_category(
        const std::string& clause_type) const {
    const bool compound = clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
                          clause_type == "OCCUR_BOOLEAN";
    if (compound) {
        return ClauseTypeCategory::COMPOUND;
    }

    if (clause_type == "TERM" || clause_type == "EXACT") {
        return ClauseTypeCategory::NON_TOKENIZED;
    }

    const bool tokenized = clause_type == "PHRASE" || clause_type == "MATCH" ||
                           clause_type == "ANY" || clause_type == "ALL";
    if (tokenized) {
        return ClauseTypeCategory::TOKENIZED;
    }

    LOG(WARNING) << "Unknown clause type '" << clause_type
                 << "', defaulting to NON_TOKENIZED category";
    return ClauseTypeCategory::NON_TOKENIZED;
}
313
314
} // namespace doris