Coverage Report

Created: 2026-03-18 19:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/predicate_collector.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/predicate_collector.h"
19
20
#include <glog/logging.h>
21
22
#include "exprs/vexpr.h"
23
#include "exprs/vexpr_context.h"
24
#include "exprs/vliteral.h"
25
#include "exprs/vsearch.h"
26
#include "exprs/vslot_ref.h"
27
#include "gen_cpp/Exprs_types.h"
28
#include "storage/index/index_reader_helper.h"
29
#include "storage/index/inverted/analyzer/analyzer.h"
30
#include "storage/index/inverted/util/string_helper.h"
31
#include "storage/tablet/tablet_schema.h"
32
33
namespace doris {
34
35
using namespace segment_v2;
36
37
9
// Searches `expr` depth-first (pre-order) for the first node whose type is
// SLOT_REF and returns it as a VSlotRef.
//
// Each visited node is first passed through VExpr::expr_without_cast before its
// node type is inspected. Returns nullptr when `expr` is null or when no slot
// reference exists anywhere in the subtree. The returned pointer is non-owning.
VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const {
    if (expr == nullptr) {
        return nullptr;
    }

    auto unwrapped = VExpr::expr_without_cast(expr);
    if (unwrapped->node_type() != TExprNodeType::SLOT_REF) {
        // Not a slot reference itself: the first child subtree that yields one wins.
        for (const auto& child : unwrapped->children()) {
            VSlotRef* found = find_slot_ref(child);
            if (found != nullptr) {
                return found;
            }
        }
        return nullptr;
    }

    return static_cast<VSlotRef*>(unwrapped.get());
}
55
56
std::string PredicateCollector::build_field_name(int32_t col_unique_id,
57
0
                                                 const std::string& suffix_path) const {
58
0
    std::string field_name = std::to_string(col_unique_id);
59
0
    if (!suffix_path.empty()) {
60
0
        field_name += "." + suffix_path;
61
0
    }
62
0
    return field_name;
63
0
}
64
65
// Collects the terms of a match predicate (`<slot> MATCH <literal>`) so that
// index statistics can be gathered for them.
//
// Params:
//   state          runtime state; provides the descriptor table and the timezone
//                  used when rendering the literal.
//   tablet_schema  schema used to resolve the column and its inverted indexes.
//   expr           the match predicate; children()[0] must contain a slot
//                  reference and children()[1] must be a VLiteral.
//   collect_infos  output map keyed by wide-string field name; terms are merged
//                  into an existing entry when the field was already seen.
//
// Returns INVERTED_INDEX_NOT_SUPPORTED when the predicate is malformed (missing
// operands, no slot reference, non-literal right operand), when the slot or
// column cannot be resolved, or (outside BE_TEST builds) when the column has no
// inverted index. Returns OK otherwise, including when no index qualifies.
Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema,
                                        const VExprSPtr& expr, CollectInfoMap* collect_infos) {
    DCHECK(collect_infos != nullptr);

    // Validate the arity before indexing into children(); indexing past the end
    // of the operand list is undefined behaviour.
    if (expr == nullptr || expr->children().size() < 2) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Match predicate requires a left and a right "
                "operand");
    }

    auto* left_slot_ref = find_slot_ref(expr->children()[0]);
    if (left_slot_ref == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find slot reference in match predicate "
                "left expression");
    }

    // Checked downcast: an unchecked static_cast on a non-literal operand would
    // be undefined behaviour as soon as value() is called below.
    auto* right_literal = dynamic_cast<VLiteral*>(expr->children()[1].get());
    if (right_literal == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Right operand of match predicate is not a "
                "literal");
    }

    const auto* sd = state->desc_tbl().get_slot_descriptor(left_slot_ref->slot_id());
    if (sd == nullptr) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find slot descriptor for slot_id={}",
                left_slot_ref->slot_id());
    }

    int32_t col_idx = tablet_schema->field_index(left_slot_ref->column_name());
    if (col_idx == -1) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Cannot find column index for column={}",
                left_slot_ref->column_name());
    }

    const auto& column = tablet_schema->column(col_idx);
    auto index_metas = tablet_schema->inverted_indexs(sd->col_unique_id(), column.suffix_path());

#ifndef BE_TEST
    if (index_metas.empty()) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "Index statistics collection failed: Score query is not supported without inverted "
                "index for column={}",
                left_slot_ref->column_name());
    }
#endif

    for (const auto* index_meta : index_metas) {
        // Only indexes that run an analyzer and need a similarity score for this
        // operator contribute terms.
        if (!InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) {
            continue;
        }

        if (!IndexReaderHelper::is_need_similarity_score(expr->op(), index_meta)) {
            continue;
        }

        auto options = DataTypeSerDe::get_default_format_options();
        options.timezone = &state->timezone_obj();
        auto term_infos = InvertedIndexAnalyzer::get_analyse_result(right_literal->value(options),
                                                                    index_meta->properties());

        std::string field_name =
                build_field_name(index_meta->col_unique_ids()[0], column.suffix_path());
        std::wstring ws_field_name = StringHelper::to_wstring(field_name);

        auto iter = collect_infos->find(ws_field_name);
        if (iter == collect_infos->end()) {
            // First time this field is seen: record its terms and index meta.
            CollectInfo collect_info;
            collect_info.term_infos.insert(term_infos.begin(), term_infos.end());
            collect_info.index_meta = index_meta;
            (*collect_infos)[ws_field_name] = std::move(collect_info);
        } else {
            // Field already present: merge terms, keep the recorded index meta.
            iter->second.term_infos.insert(term_infos.begin(), term_infos.end());
        }
    }

    return Status::OK();
}
136
137
// Entry point for search expressions: walks the clause tree held by the
// VSearchExpr's search parameter, starting at its root, and records terms into
// `collect_infos`.
//
// Returns InternalError when `expr` is not a VSearchExpr; otherwise returns the
// status produced by the clause traversal.
Status SearchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema,
                                         const VExprSPtr& expr, CollectInfoMap* collect_infos) {
    DCHECK(collect_infos != nullptr);

    if (auto* as_search = dynamic_cast<VSearchExpr*>(expr.get()); as_search != nullptr) {
        const TSearchParam& param = as_search->get_search_param();
        return collect_from_clause(param.root, state, tablet_schema, collect_infos);
    }

    return Status::InternalError("SearchPredicateCollector: expr is not VSearchExpr type");
}
152
153
// Recursively processes one search clause.
//
// Clauses whose type is categorised as COMPOUND are expanded into their
// children (when the `children` field is set), stopping at the first child
// that reports an error. Every other clause is handed to collect_from_leaf.
Status SearchPredicateCollector::collect_from_clause(const TSearchClause& clause,
                                                     RuntimeState* state,
                                                     const TabletSchemaSPtr& tablet_schema,
                                                     CollectInfoMap* collect_infos) {
    const bool is_compound =
            get_clause_type_category(clause.clause_type) == ClauseTypeCategory::COMPOUND;
    if (!is_compound) {
        return collect_from_leaf(clause, state, tablet_schema, collect_infos);
    }

    // A compound clause without children contributes nothing.
    if (!clause.__isset.children) {
        return Status::OK();
    }

    for (const auto& sub_clause : clause.children) {
        RETURN_IF_ERROR(collect_from_clause(sub_clause, state, tablet_schema, collect_infos));
    }
    return Status::OK();
}
172
173
// Collects terms for a single non-compound search clause.
//
// Returns InvalidArgument when the clause lacks `field_name` or `value`.
// Returns OK without recording anything when the clause type is not a score
// query type, when the field is not found in the tablet schema, or when the
// column has no inverted index.
//
// For every inverted index of the column the clause value is turned into terms:
//   * TOKENIZED clause types run the analyzer when the index properties ask for
//     one, otherwise the raw value is used as a single term;
//   * NON_TOKENIZED clause types use the raw value, except "TERM", which is
//     analyzed when the index properties ask for an analyzer.
// The terms are then merged into the `collect_infos` entry of the field.
Status SearchPredicateCollector::collect_from_leaf(const TSearchClause& clause, RuntimeState* state,
                                                   const TabletSchemaSPtr& tablet_schema,
                                                   CollectInfoMap* collect_infos) {
    if (!(clause.__isset.field_name && clause.__isset.value)) {
        return Status::InvalidArgument("Search clause missing field_name or value");
    }

    const std::string& type = clause.clause_type;
    const std::string& raw_value = clause.value;

    if (!is_score_query_type(type)) {
        return Status::OK();
    }

    int32_t column_pos = tablet_schema->field_index(clause.field_name);
    if (column_pos == -1) {
        return Status::OK();
    }

    const auto& column = tablet_schema->column(column_pos);
    auto metas = tablet_schema->inverted_indexs(column.unique_id(), column.suffix_path());
    if (metas.empty()) {
        return Status::OK();
    }

    const ClauseTypeCategory category = get_clause_type_category(type);
    const bool tokenized = (category == ClauseTypeCategory::TOKENIZED);
    const bool non_tokenized = (category == ClauseTypeCategory::NON_TOKENIZED);

    for (const auto* meta : metas) {
        std::set<TermInfo, TermInfoComparer> terms;

        if (tokenized || non_tokenized) {
            // Tokenized types may always be analyzed; among the non-tokenized
            // types only "TERM" may. The analyzer setting of the index is
            // consulted only when the clause type allows analysis at all.
            const bool analysis_allowed = tokenized || type == "TERM";
            if (analysis_allowed && InvertedIndexAnalyzer::should_analyzer(meta->properties())) {
                auto analyzed =
                        InvertedIndexAnalyzer::get_analyse_result(raw_value, meta->properties());
                terms.insert(analyzed.begin(), analyzed.end());
            } else {
                terms.insert(TermInfo(raw_value));
            }
        }

        std::string index_field = build_field_name(meta->col_unique_ids()[0], column.suffix_path());
        std::wstring key = StringHelper::to_wstring(index_field);

        auto existing = collect_infos->find(key);
        if (existing != collect_infos->end()) {
            // Field already recorded: only merge the new terms.
            existing->second.term_infos.insert(terms.begin(), terms.end());
            continue;
        }

        CollectInfo fresh;
        fresh.term_infos = std::move(terms);
        fresh.index_meta = meta;
        (*collect_infos)[key] = std::move(fresh);
    }

    return Status::OK();
}
240
241
0
// Returns true when `clause_type` is one of the clause types for which terms
// are collected: TERM, EXACT, PHRASE, MATCH, ANY or ALL. The comparison is
// exact (case-sensitive).
bool SearchPredicateCollector::is_score_query_type(const std::string& clause_type) const {
    static constexpr const char* kScoreTypes[] = {"TERM",  "EXACT", "PHRASE",
                                                  "MATCH", "ANY",   "ALL"};
    for (const char* candidate : kScoreTypes) {
        if (clause_type == candidate) {
            return true;
        }
    }
    return false;
}
245
246
// Maps a clause type string onto its category:
//   AND / OR / NOT / OCCUR_BOOLEAN  -> COMPOUND
//   TERM / EXACT                    -> NON_TOKENIZED
//   PHRASE / MATCH / ANY / ALL      -> TOKENIZED
// Any other string logs a warning and is treated as NON_TOKENIZED.
SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_clause_type_category(
        const std::string& clause_type) const {
    const bool compound = clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" ||
                          clause_type == "OCCUR_BOOLEAN";
    if (compound) {
        return ClauseTypeCategory::COMPOUND;
    }

    const bool non_tokenized = clause_type == "TERM" || clause_type == "EXACT";
    if (non_tokenized) {
        return ClauseTypeCategory::NON_TOKENIZED;
    }

    const bool tokenized = clause_type == "PHRASE" || clause_type == "MATCH" ||
                           clause_type == "ANY" || clause_type == "ALL";
    if (tokenized) {
        return ClauseTypeCategory::TOKENIZED;
    }

    LOG(WARNING) << "Unknown clause type '" << clause_type
                 << "', defaulting to NON_TOKENIZED category";
    return ClauseTypeCategory::NON_TOKENIZED;
}
262
263
} // namespace doris