be/src/storage/predicate_collector.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/predicate_collector.h" |
19 | | |
20 | | #include <glog/logging.h> |
21 | | |
22 | | #include <vector> |
23 | | |
24 | | #include "exec/common/variant_util.h" |
25 | | #include "exprs/vexpr.h" |
26 | | #include "exprs/vexpr_context.h" |
27 | | #include "exprs/vliteral.h" |
28 | | #include "exprs/vsearch.h" |
29 | | #include "exprs/vslot_ref.h" |
30 | | #include "gen_cpp/Exprs_types.h" |
31 | | #include "storage/index/index_reader_helper.h" |
32 | | #include "storage/index/inverted/analyzer/analyzer.h" |
33 | | #include "storage/index/inverted/util/string_helper.h" |
34 | | #include "storage/tablet/tablet_schema.h" |
35 | | |
36 | | namespace doris { |
37 | | |
38 | | using namespace segment_v2; |
39 | | |
40 | 26 | VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const { |
41 | 26 | if (!expr) { |
42 | 1 | return nullptr; |
43 | 1 | } |
44 | | |
45 | 25 | auto cur = VExpr::expr_without_cast(expr); |
46 | 25 | if (cur->node_type() == TExprNodeType::SLOT_REF) { |
47 | 22 | return static_cast<VSlotRef*>(cur.get()); |
48 | 22 | } |
49 | | |
50 | 3 | for (const auto& ch : cur->children()) { |
51 | 1 | if (auto* s = find_slot_ref(ch)) { |
52 | 1 | return s; |
53 | 1 | } |
54 | 1 | } |
55 | | |
56 | 2 | return nullptr; |
57 | 3 | } |
58 | | |
59 | | std::string PredicateCollector::build_field_name(int32_t col_unique_id, |
60 | 18 | const std::string& suffix_path) const { |
61 | 18 | std::string field_name = std::to_string(col_unique_id); |
62 | 18 | if (!suffix_path.empty()) { |
63 | 5 | field_name += "." + suffix_path; |
64 | 5 | } |
65 | 18 | return field_name; |
66 | 18 | } |
67 | | |
68 | | Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema, |
69 | 22 | const VExprSPtr& expr, CollectInfoMap* collect_infos) { |
70 | 22 | DCHECK(collect_infos != nullptr); |
71 | | |
72 | 22 | auto* left_slot_ref = find_slot_ref(expr->children()[0]); |
73 | 22 | if (left_slot_ref == nullptr) { |
74 | 1 | return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
75 | 1 | "Index statistics collection failed: Cannot find slot reference in match predicate " |
76 | 1 | "left expression"); |
77 | 1 | } |
78 | | |
79 | 21 | auto* right_literal = static_cast<VLiteral*>(expr->children()[1].get()); |
80 | 21 | DCHECK(right_literal != nullptr); |
81 | | |
82 | 21 | const auto* sd = state->desc_tbl().get_slot_descriptor(left_slot_ref->slot_id()); |
83 | 21 | if (sd == nullptr) { |
84 | 1 | return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
85 | 1 | "Index statistics collection failed: Cannot find slot descriptor for slot_id={}", |
86 | 1 | left_slot_ref->slot_id()); |
87 | 1 | } |
88 | | |
89 | 20 | int32_t col_idx = tablet_schema->field_index(left_slot_ref->column_name()); |
90 | 20 | if (col_idx == -1) { |
91 | 1 | return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
92 | 1 | "Index statistics collection failed: Cannot find column index for column={}", |
93 | 1 | left_slot_ref->column_name()); |
94 | 1 | } |
95 | | |
96 | 19 | const auto& column = tablet_schema->column(col_idx); |
97 | 19 | auto index_metas = tablet_schema->inverted_indexs(column); |
98 | 19 | std::vector<std::shared_ptr<const TabletIndex>> owned_index_metas; |
99 | 19 | std::string index_suffix_path = column.suffix_path(); |
100 | | |
101 | | // Schema-only fallback for variant sub-columns. Collector runs at tablet |
102 | | // level without segment context, so we cannot do nested-group inference |
103 | | // or inherit_index runtime-type dispatch. Two paths cover what is |
104 | | // resolvable from schema alone: |
105 | | // 1. field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB) via |
106 | | // generate_sub_column_info. |
107 | | // 2. Plain parent inverted index when the schema column is the dynamic |
108 | | // path's VARIANT placeholder produced by _init_variant_columns. In |
109 | | // that state inverted_indexs(column) misses because |
110 | | // _path_set_info_map.subcolumn_indexes is only populated for typed |
111 | | // paths / field_pattern outputs, not for plain parent indexes added |
112 | | // by ALTER. Clone the parent's non-field-pattern indexes with the |
113 | | // variant path as suffix so segment-side BM25 statistics can be |
114 | | // collected. |
115 | 19 | if (index_metas.empty() && column.is_extracted_column()) { |
116 | 3 | TabletSchema::SubColumnInfo sub_column_info; |
117 | 3 | const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); |
118 | 3 | if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), |
119 | 3 | relative_path, &sub_column_info) && |
120 | 3 | !sub_column_info.indexes.empty()) { |
121 | 2 | index_suffix_path = sub_column_info.column.suffix_path(); |
122 | 2 | for (auto& idx : sub_column_info.indexes) { |
123 | 2 | index_metas.push_back(idx.get()); |
124 | 2 | owned_index_metas.emplace_back(std::move(idx)); |
125 | 2 | } |
126 | 2 | } else if (column.is_variant_type()) { |
127 | 1 | const auto parent_indexes = tablet_schema->inverted_indexs(column.parent_unique_id()); |
128 | 1 | for (const auto* index : parent_indexes) { |
129 | 1 | if (!index->field_pattern().empty()) { |
130 | 0 | continue; |
131 | 0 | } |
132 | 1 | auto index_ptr = std::make_shared<TabletIndex>(*index); |
133 | 1 | index_ptr->set_escaped_escaped_index_suffix_path( |
134 | 1 | column.path_info_ptr()->get_path()); |
135 | 1 | index_metas.push_back(index_ptr.get()); |
136 | 1 | owned_index_metas.emplace_back(std::move(index_ptr)); |
137 | 1 | } |
138 | 1 | } |
139 | 3 | } |
140 | | |
141 | | #ifndef BE_TEST |
142 | | if (index_metas.empty()) { |
143 | | return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( |
144 | | "Index statistics collection failed: Score query is not supported without inverted " |
145 | | "index for column={}", |
146 | | left_slot_ref->column_name()); |
147 | | } |
148 | | #endif |
149 | | |
150 | 19 | for (const auto* index_meta : index_metas) { |
151 | 18 | if (!InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
152 | 1 | continue; |
153 | 1 | } |
154 | | |
155 | 17 | if (!IndexReaderHelper::is_need_similarity_score(expr->op(), index_meta)) { |
156 | 1 | continue; |
157 | 1 | } |
158 | | |
159 | 16 | auto options = DataTypeSerDe::get_default_format_options(); |
160 | 16 | options.timezone = &state->timezone_obj(); |
161 | 16 | auto term_infos = InvertedIndexAnalyzer::get_analyse_result(right_literal->value(options), |
162 | 16 | index_meta->properties()); |
163 | | |
164 | 16 | std::string field_name = |
165 | 16 | build_field_name(index_meta->col_unique_ids()[0], index_suffix_path); |
166 | 16 | std::wstring ws_field_name = StringHelper::to_wstring(field_name); |
167 | | |
168 | 16 | auto iter = collect_infos->find(ws_field_name); |
169 | 16 | if (iter == collect_infos->end()) { |
170 | 14 | CollectInfo collect_info; |
171 | 14 | collect_info.term_infos.insert(term_infos.begin(), term_infos.end()); |
172 | 14 | collect_info.index_meta = index_meta; |
173 | 14 | for (const auto& owned_index_meta : owned_index_metas) { |
174 | 3 | if (owned_index_meta.get() == index_meta) { |
175 | 3 | collect_info.owned_index_meta = owned_index_meta; |
176 | 3 | break; |
177 | 3 | } |
178 | 3 | } |
179 | 14 | (*collect_infos)[ws_field_name] = std::move(collect_info); |
180 | 14 | } else { |
181 | 2 | iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); |
182 | 2 | } |
183 | 16 | } |
184 | | |
185 | 19 | return Status::OK(); |
186 | 20 | } |
187 | | |
188 | | Status SearchPredicateCollector::collect(RuntimeState* state, const TabletSchemaSPtr& tablet_schema, |
189 | 0 | const VExprSPtr& expr, CollectInfoMap* collect_infos) { |
190 | 0 | DCHECK(collect_infos != nullptr); |
191 | |
|
192 | 0 | auto* search_expr = dynamic_cast<VSearchExpr*>(expr.get()); |
193 | 0 | if (search_expr == nullptr) { |
194 | 0 | return Status::InternalError("SearchPredicateCollector: expr is not VSearchExpr type"); |
195 | 0 | } |
196 | | |
197 | 0 | const TSearchParam& search_param = search_expr->get_search_param(); |
198 | |
|
199 | 0 | RETURN_IF_ERROR(collect_from_clause(search_param.root, state, tablet_schema, collect_infos)); |
200 | | |
201 | 0 | return Status::OK(); |
202 | 0 | } |
203 | | |
204 | | Status SearchPredicateCollector::collect_from_clause(const TSearchClause& clause, |
205 | | RuntimeState* state, |
206 | | const TabletSchemaSPtr& tablet_schema, |
207 | 0 | CollectInfoMap* collect_infos) { |
208 | 0 | const std::string& clause_type = clause.clause_type; |
209 | 0 | ClauseTypeCategory category = get_clause_type_category(clause_type); |
210 | |
|
211 | 0 | if (category == ClauseTypeCategory::COMPOUND) { |
212 | 0 | if (clause.__isset.children) { |
213 | 0 | for (const auto& child_clause : clause.children) { |
214 | 0 | RETURN_IF_ERROR( |
215 | 0 | collect_from_clause(child_clause, state, tablet_schema, collect_infos)); |
216 | 0 | } |
217 | 0 | } |
218 | 0 | return Status::OK(); |
219 | 0 | } |
220 | | |
221 | 0 | return collect_from_leaf(clause, state, tablet_schema, collect_infos); |
222 | 0 | } |
223 | | |
224 | | Status SearchPredicateCollector::collect_from_leaf(const TSearchClause& clause, RuntimeState* state, |
225 | | const TabletSchemaSPtr& tablet_schema, |
226 | 0 | CollectInfoMap* collect_infos) { |
227 | 0 | if (!clause.__isset.field_name || !clause.__isset.value) { |
228 | 0 | return Status::InvalidArgument("Search clause missing field_name or value"); |
229 | 0 | } |
230 | | |
231 | 0 | const std::string& field_name = clause.field_name; |
232 | 0 | const std::string& value = clause.value; |
233 | 0 | const std::string& clause_type = clause.clause_type; |
234 | |
|
235 | 0 | if (!is_score_query_type(clause_type)) { |
236 | 0 | return Status::OK(); |
237 | 0 | } |
238 | | |
239 | 0 | int32_t col_idx = tablet_schema->field_index(field_name); |
240 | 0 | if (col_idx == -1) { |
241 | 0 | return Status::OK(); |
242 | 0 | } |
243 | | |
244 | 0 | const auto& column = tablet_schema->column(col_idx); |
245 | |
|
246 | 0 | auto index_metas = tablet_schema->inverted_indexs(column.unique_id(), column.suffix_path()); |
247 | 0 | if (index_metas.empty()) { |
248 | 0 | return Status::OK(); |
249 | 0 | } |
250 | | |
251 | 0 | ClauseTypeCategory category = get_clause_type_category(clause_type); |
252 | 0 | for (const auto* index_meta : index_metas) { |
253 | 0 | std::set<TermInfo, TermInfoComparer> term_infos; |
254 | |
|
255 | 0 | if (category == ClauseTypeCategory::TOKENIZED) { |
256 | 0 | if (InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
257 | 0 | auto analyzed_terms = |
258 | 0 | InvertedIndexAnalyzer::get_analyse_result(value, index_meta->properties()); |
259 | 0 | term_infos.insert(analyzed_terms.begin(), analyzed_terms.end()); |
260 | 0 | } else { |
261 | 0 | term_infos.insert(TermInfo(value)); |
262 | 0 | } |
263 | 0 | } else if (category == ClauseTypeCategory::NON_TOKENIZED) { |
264 | 0 | if (clause_type == "TERM" && |
265 | 0 | InvertedIndexAnalyzer::should_analyzer(index_meta->properties())) { |
266 | 0 | auto analyzed_terms = |
267 | 0 | InvertedIndexAnalyzer::get_analyse_result(value, index_meta->properties()); |
268 | 0 | term_infos.insert(analyzed_terms.begin(), analyzed_terms.end()); |
269 | 0 | } else { |
270 | 0 | term_infos.insert(TermInfo(value)); |
271 | 0 | } |
272 | 0 | } |
273 | |
|
274 | 0 | std::string lucene_field_name = |
275 | 0 | build_field_name(index_meta->col_unique_ids()[0], column.suffix_path()); |
276 | 0 | std::wstring ws_field_name = StringHelper::to_wstring(lucene_field_name); |
277 | |
|
278 | 0 | auto iter = collect_infos->find(ws_field_name); |
279 | 0 | if (iter == collect_infos->end()) { |
280 | 0 | CollectInfo collect_info; |
281 | 0 | collect_info.term_infos = std::move(term_infos); |
282 | 0 | collect_info.index_meta = index_meta; |
283 | 0 | (*collect_infos)[ws_field_name] = std::move(collect_info); |
284 | 0 | } else { |
285 | 0 | iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); |
286 | 0 | } |
287 | 0 | } |
288 | |
|
289 | 0 | return Status::OK(); |
290 | 0 | } |
291 | | |
292 | 0 | bool SearchPredicateCollector::is_score_query_type(const std::string& clause_type) const { |
293 | 0 | return clause_type == "TERM" || clause_type == "EXACT" || clause_type == "PHRASE" || |
294 | 0 | clause_type == "MATCH" || clause_type == "ANY" || clause_type == "ALL"; |
295 | 0 | } |
296 | | |
297 | | SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_clause_type_category( |
298 | 0 | const std::string& clause_type) const { |
299 | 0 | if (clause_type == "AND" || clause_type == "OR" || clause_type == "NOT" || |
300 | 0 | clause_type == "OCCUR_BOOLEAN") { |
301 | 0 | return ClauseTypeCategory::COMPOUND; |
302 | 0 | } else if (clause_type == "TERM" || clause_type == "EXACT") { |
303 | 0 | return ClauseTypeCategory::NON_TOKENIZED; |
304 | 0 | } else if (clause_type == "PHRASE" || clause_type == "MATCH" || clause_type == "ANY" || |
305 | 0 | clause_type == "ALL") { |
306 | 0 | return ClauseTypeCategory::TOKENIZED; |
307 | 0 | } else { |
308 | | LOG(WARNING) << "Unknown clause type '" << clause_type |
309 | 0 | << "', defaulting to NON_TOKENIZED category"; |
310 | 0 | return ClauseTypeCategory::NON_TOKENIZED; |
311 | 0 | } |
312 | 0 | } |
313 | | |
314 | | } // namespace doris |