be/src/format_v2/column_mapper_nested.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format_v2/column_mapper_nested.h" |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstdint> |
22 | | #include <memory> |
23 | | #include <optional> |
24 | | #include <utility> |
25 | | |
26 | | #include "common/cast_set.h" |
27 | | #include "common/exception.h" |
28 | | #include "core/assert_cast.h" |
29 | | #include "core/data_type/convert_field_to_type.h" |
30 | | #include "core/data_type/data_type_nullable.h" |
31 | | #include "core/data_type/data_type_struct.h" |
32 | | #include "core/data_type/primitive_type.h" |
33 | | #include "exprs/create_predicate_function.h" |
34 | | #include "exprs/vexpr.h" |
35 | | #include "exprs/vin_predicate.h" |
36 | | #include "format_v2/expr/cast.h" |
37 | | #include "gen_cpp/Exprs_types.h" |
38 | | #include "storage/predicate/null_predicate.h" |
39 | | #include "storage/predicate/predicate_creator.h" |
40 | | |
41 | | namespace doris::format { |
42 | | |
43 | | namespace { |
44 | | |
45 | 306 | static bool is_cast_expr(const VExprSPtr& expr) { |
46 | 306 | return dynamic_cast<const Cast*>(expr.get()) != nullptr; |
47 | 306 | } |
48 | | |
49 | 63 | static bool is_binary_comparison_predicate(const VExprSPtr& expr) { |
50 | 63 | if (expr == nullptr || expr->get_num_children() != 2 || |
51 | 63 | (expr->node_type() != TExprNodeType::BINARY_PRED && |
52 | 44 | expr->node_type() != TExprNodeType::NULL_AWARE_BINARY_PRED)) { |
53 | 24 | return false; |
54 | 24 | } |
55 | 39 | switch (expr->op()) { |
56 | 1 | case TExprOpcode::EQ: |
57 | 1 | case TExprOpcode::EQ_FOR_NULL: |
58 | 1 | case TExprOpcode::NE: |
59 | 1 | case TExprOpcode::GE: |
60 | 36 | case TExprOpcode::GT: |
61 | 36 | case TExprOpcode::LE: |
62 | 39 | case TExprOpcode::LT: |
63 | 39 | return true; |
64 | 0 | default: |
65 | 0 | return false; |
66 | 39 | } |
67 | 39 | } |
68 | | |
69 | 49 | static bool is_null_predicate_function(const VExprSPtr& expr, bool* is_null) { |
70 | 49 | DORIS_CHECK(is_null != nullptr); |
71 | 49 | if (expr == nullptr || expr->node_type() != TExprNodeType::FUNCTION_CALL || |
72 | 49 | expr->get_num_children() != 1) { |
73 | 47 | return false; |
74 | 47 | } |
75 | 2 | if (expr->fn().name.function_name == "is_null_pred") { |
76 | 1 | *is_null = true; |
77 | 1 | return true; |
78 | 1 | } |
79 | 1 | if (expr->fn().name.function_name == "is_not_null_pred") { |
80 | 1 | *is_null = false; |
81 | 1 | return true; |
82 | 1 | } |
83 | 0 | return false; |
84 | 1 | } |
85 | | |
86 | 23 | static bool is_signed_integer_type(PrimitiveType type) { |
87 | 23 | switch (type) { |
88 | 0 | case TYPE_TINYINT: |
89 | 0 | case TYPE_SMALLINT: |
90 | 8 | case TYPE_INT: |
91 | 18 | case TYPE_BIGINT: |
92 | 18 | case TYPE_LARGEINT: |
93 | 18 | return true; |
94 | 5 | default: |
95 | 5 | return false; |
96 | 23 | } |
97 | 23 | } |
98 | | |
99 | 16 | static int primitive_integer_width(PrimitiveType type) { |
100 | 16 | switch (type) { |
101 | 0 | case TYPE_TINYINT: |
102 | 0 | return 1; |
103 | 0 | case TYPE_SMALLINT: |
104 | 0 | return 2; |
105 | 8 | case TYPE_INT: |
106 | 8 | return 4; |
107 | 8 | case TYPE_BIGINT: |
108 | 8 | return 8; |
109 | 0 | case TYPE_LARGEINT: |
110 | 0 | return 16; |
111 | 0 | default: |
112 | 0 | return 0; |
113 | 16 | } |
114 | 16 | } |
115 | | |
116 | 6 | static bool is_decimal_type(PrimitiveType type) { |
117 | 6 | switch (type) { |
118 | 4 | case TYPE_DECIMAL32: |
119 | 4 | case TYPE_DECIMAL64: |
120 | 4 | case TYPE_DECIMALV2: |
121 | 4 | case TYPE_DECIMAL128I: |
122 | 4 | case TYPE_DECIMAL256: |
123 | 4 | return true; |
124 | 2 | default: |
125 | 2 | return false; |
126 | 6 | } |
127 | 6 | } |
128 | | |
129 | | static bool is_order_preserving_safe_cast(const DataTypePtr& from_type, |
130 | 15 | const DataTypePtr& to_type) { |
131 | 15 | if (from_type == nullptr || to_type == nullptr) { |
132 | 0 | return false; |
133 | 0 | } |
134 | 15 | const auto from_nested_type = remove_nullable(from_type); |
135 | 15 | const auto to_nested_type = remove_nullable(to_type); |
136 | 15 | if (from_nested_type->equals(*to_nested_type)) { |
137 | 2 | return true; |
138 | 2 | } |
139 | | |
140 | 13 | const auto from_primitive_type = from_nested_type->get_primitive_type(); |
141 | 13 | const auto to_primitive_type = to_nested_type->get_primitive_type(); |
142 | 13 | if (is_signed_integer_type(from_primitive_type) && is_signed_integer_type(to_primitive_type)) { |
143 | 8 | return primitive_integer_width(to_primitive_type) >= |
144 | 8 | primitive_integer_width(from_primitive_type); |
145 | 8 | } |
146 | 5 | if (from_primitive_type == TYPE_FLOAT && to_primitive_type == TYPE_DOUBLE) { |
147 | 1 | return true; |
148 | 1 | } |
149 | 4 | if (is_decimal_type(from_primitive_type) && is_decimal_type(to_primitive_type)) { |
150 | 2 | return from_nested_type->get_scale() == to_nested_type->get_scale() && |
151 | 2 | to_nested_type->get_precision() >= from_nested_type->get_precision(); |
152 | 2 | } |
153 | 2 | return false; |
154 | 4 | } |
155 | | |
156 | 115 | static bool parse_struct_child_selector(const VExprSPtr& expr, StructChildSelector* selector) { |
157 | 115 | DORIS_CHECK(selector != nullptr); |
158 | 115 | if (expr == nullptr || !expr->is_literal()) { |
159 | 1 | return false; |
160 | 1 | } |
161 | 114 | const Field field = literal_field(expr); |
162 | 114 | switch (field.get_type()) { |
163 | 102 | case TYPE_STRING: |
164 | 102 | case TYPE_CHAR: |
165 | 102 | case TYPE_VARCHAR: |
166 | 102 | selector->by_name = true; |
167 | 102 | selector->name = std::string(field.as_string_view()); |
168 | 102 | return true; |
169 | 2 | case TYPE_BOOLEAN: |
170 | 2 | selector->by_name = false; |
171 | 2 | selector->ordinal = field.get<TYPE_BOOLEAN>() ? 1 : 0; |
172 | 2 | return selector->ordinal > 0; |
173 | 1 | case TYPE_TINYINT: |
174 | 1 | selector->by_name = false; |
175 | 1 | if (field.get<TYPE_TINYINT>() <= 0) { |
176 | 0 | return false; |
177 | 0 | } |
178 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_TINYINT>()); |
179 | 1 | return true; |
180 | 1 | case TYPE_SMALLINT: |
181 | 1 | selector->by_name = false; |
182 | 1 | if (field.get<TYPE_SMALLINT>() <= 0) { |
183 | 0 | return false; |
184 | 0 | } |
185 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_SMALLINT>()); |
186 | 1 | return true; |
187 | 5 | case TYPE_INT: |
188 | 5 | selector->by_name = false; |
189 | 5 | if (field.get<TYPE_INT>() <= 0) { |
190 | 2 | return false; |
191 | 2 | } |
192 | 3 | selector->ordinal = cast_set<size_t>(field.get<TYPE_INT>()); |
193 | 3 | return true; |
194 | 1 | case TYPE_BIGINT: |
195 | 1 | selector->by_name = false; |
196 | 1 | if (field.get<TYPE_BIGINT>() <= 0) { |
197 | 0 | return false; |
198 | 0 | } |
199 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_BIGINT>()); |
200 | 1 | return true; |
201 | 2 | default: |
202 | 2 | return false; |
203 | 114 | } |
204 | 114 | } |
205 | | |
206 | 416 | static bool extract_nested_struct_path(const VExprSPtr& expr, NestedStructPath* path) { |
207 | 416 | DORIS_CHECK(path != nullptr); |
208 | 416 | if (!is_struct_element_expr(expr)) { |
209 | 301 | return false; |
210 | 301 | } |
211 | | |
212 | | // Process for element_at(struct, 'field') or element_at(struct, 1) expression. |
213 | 115 | StructChildSelector selector; |
214 | 115 | if (!parse_struct_child_selector(expr->children()[1], &selector)) { |
215 | 6 | return false; |
216 | 6 | } |
217 | | |
218 | 109 | const auto& parent = expr->children()[0]; |
219 | 109 | if (parent->is_slot_ref()) { |
220 | 84 | const auto* slot_ref = assert_cast<const VSlotRef*>(parent.get()); |
221 | 84 | path->root_global_index = slot_ref_global_index(*slot_ref); |
222 | 84 | path->selectors.clear(); |
223 | 84 | path->selectors.push_back(std::move(selector)); |
224 | 84 | return true; |
225 | 84 | } |
226 | | |
227 | | // Process for element_at(element_at(struct<struct>, 'field'), 'field') or |
228 | | // element_at(element_at(struct<struct>, 1), 1) expression. |
229 | 25 | if (!extract_nested_struct_path(parent, path)) { |
230 | 3 | return false; |
231 | 3 | } |
232 | 22 | path->selectors.push_back(std::move(selector)); |
233 | 22 | return true; |
234 | 25 | } |
235 | | |
236 | 368 | static bool extract_nested_struct_path_for_pruning(const VExprSPtr& expr, NestedStructPath* path) { |
237 | 368 | DORIS_CHECK(path != nullptr); |
238 | | // Simple `ELEMENT_AT` |
239 | 368 | if (extract_nested_struct_path(expr, path)) { |
240 | 62 | return true; |
241 | 62 | } |
242 | | |
243 | | // `ELEMENT_AT` with `CAST` |
244 | 306 | if (!is_cast_expr(expr) || expr->get_num_children() != 1) { |
245 | 291 | return false; |
246 | 291 | } |
247 | 15 | const auto& child = expr->children()[0]; |
248 | 15 | if (!is_order_preserving_safe_cast(child->data_type(), expr->data_type())) { |
249 | 7 | return false; |
250 | 7 | } |
251 | | // A safe widening cast is null-preserving and keeps the comparison ordering of the nested |
252 | | // primitive leaf, so file-layer pruning can target the original leaf statistics. The row-level |
253 | | // filter still evaluates the original cast expression after read. |
254 | 8 | return extract_nested_struct_path_for_pruning(child, path); |
255 | 15 | } |
256 | | |
257 | | static const ColumnDefinition* resolve_file_child(const std::vector<ColumnDefinition>& children, |
258 | 53 | const StructChildSelector& selector) { |
259 | 53 | if (selector.by_name) { |
260 | 67 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
261 | 67 | return child.name == selector.name; |
262 | 67 | }); |
263 | 51 | return child_it == children.end() ? nullptr : &*child_it; |
264 | 51 | } |
265 | 2 | if (selector.ordinal == 0 || selector.ordinal > children.size()) { |
266 | 0 | return nullptr; |
267 | 0 | } |
268 | 2 | return &children[selector.ordinal - 1]; |
269 | 2 | } |
270 | | |
271 | 141 | static const DataTypeStruct* struct_type_or_null(const DataTypePtr& type) { |
272 | 141 | if (type == nullptr) { |
273 | 0 | return nullptr; |
274 | 0 | } |
275 | 141 | const auto nested_type = remove_nullable(type); |
276 | 141 | if (nested_type->get_primitive_type() != TYPE_STRUCT) { |
277 | 12 | return nullptr; |
278 | 12 | } |
279 | 129 | return assert_cast<const DataTypeStruct*>(nested_type.get()); |
280 | 141 | } |
281 | | |
282 | | static std::optional<int32_t> struct_child_index(const ColumnMapping& mapping, |
283 | 65 | const StructChildSelector& selector) { |
284 | 65 | const auto* struct_type = struct_type_or_null(mapping.table_type); |
285 | 65 | if (struct_type == nullptr) { |
286 | 6 | return std::nullopt; |
287 | 6 | } |
288 | 59 | if (selector.by_name) { |
289 | 59 | const auto position = struct_type->try_get_position_by_name(selector.name); |
290 | 59 | if (!position.has_value()) { |
291 | 39 | return std::nullopt; |
292 | 39 | } |
293 | 20 | return cast_set<int32_t>(*position); |
294 | 59 | } |
295 | 0 | if (selector.ordinal == 0 || selector.ordinal > struct_type->get_elements().size()) { |
296 | 0 | return std::nullopt; |
297 | 0 | } |
298 | 0 | return cast_set<int32_t>(selector.ordinal - 1); |
299 | 0 | } |
300 | | |
301 | | // Get the global child index for a child mapping. If the mapping's table type is struct, resolve |
302 | | // the child index by the child mapping's table column name; otherwise, use the fallback child index. |
303 | | static int32_t child_mapping_global_index(const ColumnMapping& mapping, |
304 | | const ColumnMapping& child_mapping, |
305 | 28 | size_t fallback_child_idx) { |
306 | 28 | const auto* struct_type = struct_type_or_null(mapping.table_type); |
307 | 28 | if (struct_type == nullptr) { |
308 | 0 | return cast_set<int32_t>(fallback_child_idx); |
309 | 0 | } |
310 | 28 | const auto position = struct_type->try_get_position_by_name(child_mapping.table_column_name); |
311 | 28 | DORIS_CHECK(position.has_value()) << "Cannot find child '" << child_mapping.table_column_name |
312 | 0 | << "' in table type " << mapping.table_type->get_name(); |
313 | 28 | return cast_set<int32_t>(*position); |
314 | 28 | } |
315 | | |
316 | | static const ColumnMapping* resolve_mapped_child(const ColumnMapping& mapping, |
317 | 20 | int32_t global_child_index) { |
318 | 31 | for (size_t child_idx = 0; child_idx < mapping.child_mappings.size(); ++child_idx) { |
319 | 28 | const auto& child_mapping = mapping.child_mappings[child_idx]; |
320 | 28 | if (child_mapping_global_index(mapping, child_mapping, child_idx) == global_child_index) { |
321 | 17 | return &child_mapping; |
322 | 17 | } |
323 | 28 | } |
324 | 3 | return nullptr; |
325 | 20 | } |
326 | | |
// Outcome of resolve_nested_projection_with_mapping().
enum class NestedProjectionResolveResult {
    // Every selector was followed through the mapping tree and a file projection was built.
    RESOLVED,
    // The path could not be followed through the mapping tree (empty path, unknown root,
    // root without a file id, selector not found, or no child mapping for the selected child).
    NOT_REPRESENTED,
    // A child mapping exists for the selected table child but carries no file-local id.
    MISSING_FILE_CHILD,
};
332 | | |
333 | | // Resolve a table-side nested struct path through the existing ColumnMapping tree and build the |
334 | | // corresponding file-local projection. For example, if table column `s` has children |
335 | | // `{a, renamed_b}` and file column `s` has children `{a, b}`, the filter path |
336 | | // `struct_element(s, 'renamed_b')` is resolved to the file projection `s -> b` by following the |
337 | | // child mapping instead of matching the table child name against the file schema. Return |
338 | | // MISSING_FILE_CHILD when ColumnMapping explicitly says a table child is absent from this file; in |
339 | | // that case callers must not fall back to schema-name lookup, because Iceberg can drop a field and |
340 | | // later add a different field with the same name. |
341 | | static NestedProjectionResolveResult resolve_nested_projection_with_mapping( |
342 | | const NestedStructPath& path, const std::vector<ColumnMapping>& mappings, |
343 | 62 | LocalColumnIndex* root_projection) { |
344 | 62 | DORIS_CHECK(root_projection != nullptr); |
345 | 62 | *root_projection = {}; |
346 | 62 | if (path.selectors.empty()) { |
347 | 0 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
348 | 0 | } |
349 | 65 | const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) { |
350 | 65 | return mapping.global_index == path.root_global_index; |
351 | 65 | }); |
352 | 62 | if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value()) { |
353 | 0 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
354 | 0 | } |
355 | | |
356 | 62 | *root_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id); |
357 | 62 | auto* current_projection = root_projection; |
358 | 62 | const auto* current_mapping = &*mapping_it; |
359 | | |
360 | | // Traverse the ColumnMapping tree according to the table-side struct selectors and emit the |
361 | | // corresponding file-local child ids. A missing child mapping means this predicate-only path |
362 | | // may need schema fallback; an existing child mapping without a file id means the table child |
363 | | // is genuinely absent from this file and must stay above the file reader. |
364 | 79 | for (size_t selector_idx = 0; selector_idx < path.selectors.size(); ++selector_idx) { |
365 | 65 | const auto global_child_index = |
366 | 65 | struct_child_index(*current_mapping, path.selectors[selector_idx]); |
367 | 65 | if (!global_child_index.has_value()) { |
368 | 45 | *root_projection = {}; |
369 | 45 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
370 | 45 | } |
371 | 20 | const auto* child_mapping = resolve_mapped_child(*current_mapping, *global_child_index); |
372 | 20 | if (child_mapping == nullptr) { |
373 | 3 | *root_projection = {}; |
374 | 3 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
375 | 3 | } |
376 | 17 | if (!child_mapping->file_local_id.has_value()) { |
377 | 0 | *root_projection = {}; |
378 | 0 | return NestedProjectionResolveResult::MISSING_FILE_CHILD; |
379 | 0 | } |
380 | | |
381 | 17 | auto child_projection = LocalColumnIndex::partial_local(*child_mapping->file_local_id); |
382 | 17 | child_projection.project_all_children = selector_idx + 1 == path.selectors.size(); |
383 | 17 | current_projection->children.push_back(std::move(child_projection)); |
384 | 17 | current_projection = ¤t_projection->children.back(); |
385 | 17 | current_mapping = child_mapping; |
386 | 17 | } |
387 | 14 | return NestedProjectionResolveResult::RESOLVED; |
388 | 62 | } |
389 | | |
390 | 48 | static bool table_root_is_struct(const ColumnMapping& mapping) { |
391 | 48 | return struct_type_or_null(mapping.table_type) != nullptr; |
392 | 48 | } |
393 | | |
394 | 20 | static const std::vector<ColumnDefinition>& scan_file_children(const ColumnMapping& mapping) { |
395 | 20 | return !mapping.projected_file_children.empty() ? mapping.projected_file_children |
396 | 20 | : mapping.original_file_children; |
397 | 20 | } |
398 | | |
399 | | static const ColumnDefinition* resolve_file_leaf_from_projection( |
400 | 18 | const std::vector<ColumnDefinition>& children, const LocalColumnIndex& projection) { |
401 | 25 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
402 | 25 | return child.file_local_id() == projection.local_id(); |
403 | 25 | }); |
404 | 18 | if (child_it == children.end()) { |
405 | 0 | return nullptr; |
406 | 0 | } |
407 | 18 | if (projection.children.empty()) { |
408 | 16 | return &*child_it; |
409 | 16 | } |
410 | 2 | if (projection.children.size() != 1) { |
411 | 0 | return nullptr; |
412 | 0 | } |
413 | 2 | return resolve_file_leaf_from_projection(child_it->children, projection.children[0]); |
414 | 2 | } |
415 | | |
416 | | static bool collect_file_child_names_from_projection(const std::vector<ColumnDefinition>& children, |
417 | | const LocalColumnIndex& projection, |
418 | | std::vector<std::string>* file_child_names, |
419 | 64 | std::vector<DataTypePtr>* file_child_types) { |
420 | 64 | DORIS_CHECK(file_child_names != nullptr); |
421 | 64 | DORIS_CHECK(file_child_types != nullptr); |
422 | 89 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
423 | 89 | return child.file_local_id() == projection.local_id(); |
424 | 89 | }); |
425 | 64 | if (child_it == children.end()) { |
426 | 0 | return false; |
427 | 0 | } |
428 | 64 | file_child_names->push_back(child_it->name); |
429 | 64 | file_child_types->push_back(child_it->type); |
430 | 64 | if (projection.children.empty()) { |
431 | 56 | return true; |
432 | 56 | } |
433 | 8 | if (projection.children.size() != 1) { |
434 | 0 | return false; |
435 | 0 | } |
436 | 8 | return collect_file_child_names_from_projection(child_it->children, projection.children[0], |
437 | 8 | file_child_names, file_child_types); |
438 | 8 | } |
439 | | |
440 | | struct NestedPredicateTarget { |
441 | | LocalColumnIndex file_projection; |
442 | | FileNestedPredicateTarget file_target; |
443 | | std::string leaf_name; |
444 | | DataTypePtr leaf_type; |
445 | | }; |
446 | | |
447 | | static std::unique_ptr<FileStructPredicateTarget> build_struct_predicate_target_from_projection( |
448 | 18 | const std::vector<ColumnDefinition>& children, const LocalColumnIndex& projection) { |
449 | 25 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
450 | 25 | return child.file_local_id() == projection.local_id(); |
451 | 25 | }); |
452 | 18 | if (child_it == children.end()) { |
453 | 0 | return nullptr; |
454 | 0 | } |
455 | 18 | std::unique_ptr<FileStructPredicateTarget> nested_child; |
456 | 18 | if (!projection.children.empty()) { |
457 | 2 | if (projection.children.size() != 1) { |
458 | 0 | return nullptr; |
459 | 0 | } |
460 | 2 | nested_child = build_struct_predicate_target_from_projection(child_it->children, |
461 | 2 | projection.children[0]); |
462 | 2 | if (nested_child == nullptr) { |
463 | 0 | return nullptr; |
464 | 0 | } |
465 | 2 | } |
466 | 18 | return std::make_unique<FileStructPredicateTarget>(child_it->file_local_id(), child_it->name, |
467 | 18 | std::move(nested_child)); |
468 | 18 | } |
469 | | |
470 | | static bool build_struct_predicate_target(const ColumnMapping& root_mapping, |
471 | | const LocalColumnIndex& root_projection, |
472 | 16 | FileNestedPredicateTarget* file_target) { |
473 | 16 | DORIS_CHECK(file_target != nullptr); |
474 | 16 | if (!root_projection.column_id().is_valid() || root_projection.children.size() != 1) { |
475 | 0 | return false; |
476 | 0 | } |
477 | 16 | auto struct_target = build_struct_predicate_target_from_projection( |
478 | 16 | root_mapping.original_file_children, root_projection.children[0]); |
479 | 16 | if (struct_target == nullptr) { |
480 | 0 | return false; |
481 | 0 | } |
482 | 16 | *file_target = FileNestedPredicateTarget(root_projection.column_id(), std::move(struct_target)); |
483 | 16 | return true; |
484 | 16 | } |
485 | | |
486 | | static bool resolve_nested_predicate_target(const NestedStructPath& path, |
487 | | const std::vector<ColumnMapping>& mappings, |
488 | 18 | NestedPredicateTarget* target) { |
489 | 18 | DORIS_CHECK(target != nullptr); |
490 | 18 | ResolvedNestedStructPath resolved; |
491 | 18 | if (!resolve_nested_struct_path_for_file(path, mappings, &resolved)) { |
492 | 2 | return false; |
493 | 2 | } |
494 | | |
495 | 17 | const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) { |
496 | 17 | return mapping.global_index == path.root_global_index; |
497 | 17 | }); |
498 | 16 | if (mapping_it == mappings.end() || resolved.file_projection.children.size() != 1) { |
499 | 0 | return false; |
500 | 0 | } |
501 | 16 | const auto* file_leaf = resolve_file_leaf_from_projection(mapping_it->original_file_children, |
502 | 16 | resolved.file_projection.children[0]); |
503 | 16 | if (file_leaf == nullptr || file_leaf->type == nullptr) { |
504 | 0 | return false; |
505 | 0 | } |
506 | 16 | target->leaf_type = remove_nullable(file_leaf->type); |
507 | 16 | if (is_complex_type(target->leaf_type->get_primitive_type())) { |
508 | 0 | return false; |
509 | 0 | } |
510 | 16 | target->leaf_name = file_leaf->name; |
511 | 16 | target->file_projection = std::move(resolved.file_projection); |
512 | 16 | if (!build_struct_predicate_target(*mapping_it, target->file_projection, |
513 | 16 | &target->file_target)) { |
514 | 0 | return false; |
515 | 0 | } |
516 | 16 | return true; |
517 | 16 | } |
518 | | |
519 | 16 | static VExprSPtr original_table_literal_for_nested_predicate(const VExprSPtr& literal_expr) { |
520 | 16 | DORIS_CHECK(literal_expr != nullptr); |
521 | 16 | DORIS_CHECK(literal_expr->is_literal()); |
522 | 16 | const auto* rewritten_literal = dynamic_cast<const SplitLocalFileLiteral*>(literal_expr.get()); |
523 | 16 | if (rewritten_literal == nullptr) { |
524 | 16 | return literal_expr; |
525 | 16 | } |
526 | 0 | return VLiteral::create_shared(rewritten_literal->original_type(), |
527 | 0 | rewritten_literal->original_field()); |
528 | 16 | } |
529 | | |
530 | 12 | static std::optional<PredicateType> to_column_predicate_type(TExprOpcode::type opcode) { |
531 | 12 | switch (opcode) { |
532 | 1 | case TExprOpcode::EQ: |
533 | 1 | return PredicateType::EQ; |
534 | 0 | case TExprOpcode::NE: |
535 | 0 | return PredicateType::NE; |
536 | 10 | case TExprOpcode::GT: |
537 | 10 | return PredicateType::GT; |
538 | 0 | case TExprOpcode::GE: |
539 | 0 | return PredicateType::GE; |
540 | 1 | case TExprOpcode::LT: |
541 | 1 | return PredicateType::LT; |
542 | 0 | case TExprOpcode::LE: |
543 | 0 | return PredicateType::LE; |
544 | 0 | default: |
545 | 0 | return std::nullopt; |
546 | 12 | } |
547 | 12 | } |
548 | | |
549 | 1 | static TExprOpcode::type reverse_comparison_opcode(TExprOpcode::type opcode) { |
550 | 1 | switch (opcode) { |
551 | 0 | case TExprOpcode::GT: |
552 | 0 | return TExprOpcode::LT; |
553 | 0 | case TExprOpcode::GE: |
554 | 0 | return TExprOpcode::LE; |
555 | 1 | case TExprOpcode::LT: |
556 | 1 | return TExprOpcode::GT; |
557 | 0 | case TExprOpcode::LE: |
558 | 0 | return TExprOpcode::GE; |
559 | 0 | default: |
560 | 0 | return opcode; |
561 | 1 | } |
562 | 1 | } |
563 | | |
564 | | static std::shared_ptr<ColumnPredicate> create_comparison_column_predicate( |
565 | | PredicateType predicate_type, uint32_t column_id, const std::string& column_name, |
566 | 12 | const DataTypePtr& data_type, const Field& value) { |
567 | 12 | switch (predicate_type) { |
568 | 1 | case PredicateType::EQ: |
569 | 1 | return create_comparison_predicate<PredicateType::EQ>(column_id, column_name, data_type, |
570 | 1 | value, false); |
571 | 0 | case PredicateType::NE: |
572 | 0 | return create_comparison_predicate<PredicateType::NE>(column_id, column_name, data_type, |
573 | 0 | value, false); |
574 | 10 | case PredicateType::GT: |
575 | 10 | return create_comparison_predicate<PredicateType::GT>(column_id, column_name, data_type, |
576 | 10 | value, false); |
577 | 0 | case PredicateType::GE: |
578 | 0 | return create_comparison_predicate<PredicateType::GE>(column_id, column_name, data_type, |
579 | 0 | value, false); |
580 | 1 | case PredicateType::LT: |
581 | 1 | return create_comparison_predicate<PredicateType::LT>(column_id, column_name, data_type, |
582 | 1 | value, false); |
583 | 0 | case PredicateType::LE: |
584 | 0 | return create_comparison_predicate<PredicateType::LE>(column_id, column_name, data_type, |
585 | 0 | value, false); |
586 | 0 | default: |
587 | 0 | return nullptr; |
588 | 12 | } |
589 | 12 | } |
590 | | |
591 | | static bool extract_child_id_path_from_projection(const LocalColumnIndex& root_projection, |
592 | 16 | std::vector<int32_t>* file_child_id_path) { |
593 | 16 | DORIS_CHECK(file_child_id_path != nullptr); |
594 | 16 | file_child_id_path->clear(); |
595 | 16 | const auto* current_projection = &root_projection; |
596 | 34 | while (!current_projection->children.empty()) { |
597 | 18 | if (current_projection->children.size() != 1) { |
598 | 0 | file_child_id_path->clear(); |
599 | 0 | return false; |
600 | 0 | } |
601 | 18 | current_projection = ¤t_projection->children[0]; |
602 | 18 | file_child_id_path->push_back(current_projection->local_id()); |
603 | 18 | } |
604 | 16 | return !file_child_id_path->empty(); |
605 | 16 | } |
606 | | |
607 | | static std::shared_ptr<ColumnPredicate> build_nested_comparison_predicate( |
608 | | const VExprSPtr& literal_expr, TExprOpcode::type opcode, LocalColumnId root_file_column_id, |
609 | 12 | const std::string& leaf_name, const DataTypePtr& file_leaf_type) { |
610 | 12 | if (literal_expr == nullptr || !literal_expr->is_literal() || file_leaf_type == nullptr) { |
611 | 0 | return nullptr; |
612 | 0 | } |
613 | 12 | const auto predicate_type = to_column_predicate_type(opcode); |
614 | 12 | if (!predicate_type.has_value()) { |
615 | 0 | return nullptr; |
616 | 0 | } |
617 | 12 | const auto original_literal = original_table_literal_for_nested_predicate(literal_expr); |
618 | 12 | const Field original_field = literal_field(original_literal); |
619 | 12 | Field file_field; |
620 | 12 | try { |
621 | 12 | convert_field_to_type(original_field, *file_leaf_type, &file_field, |
622 | 12 | original_literal->data_type().get()); |
623 | 12 | } catch (const Exception&) { |
624 | 0 | return nullptr; |
625 | 0 | } |
626 | 12 | if (file_field.is_null()) { |
627 | 0 | return nullptr; |
628 | 0 | } |
629 | 12 | try { |
630 | 12 | return create_comparison_column_predicate(*predicate_type, |
631 | 12 | cast_set<uint32_t>(root_file_column_id.value()), |
632 | 12 | leaf_name, file_leaf_type, file_field); |
633 | 12 | } catch (const Exception&) { |
634 | 0 | return nullptr; |
635 | 0 | } |
636 | 12 | } |
637 | | |
638 | | static std::shared_ptr<ColumnPredicate> build_nested_in_list_predicate( |
639 | | const VExprSPtrs& literal_exprs, LocalColumnId root_file_column_id, |
640 | 2 | const std::string& leaf_name, const DataTypePtr& file_leaf_type) { |
641 | 2 | if (literal_exprs.empty() || file_leaf_type == nullptr) { |
642 | 0 | return nullptr; |
643 | 0 | } |
644 | | |
645 | 2 | auto value_column = file_leaf_type->create_column(); |
646 | 4 | for (const auto& literal_expr : literal_exprs) { |
647 | 4 | if (literal_expr == nullptr || !literal_expr->is_literal()) { |
648 | 0 | return nullptr; |
649 | 0 | } |
650 | 4 | const auto original_literal = original_table_literal_for_nested_predicate(literal_expr); |
651 | 4 | const Field original_field = literal_field(original_literal); |
652 | 4 | Field file_field; |
653 | 4 | try { |
654 | 4 | convert_field_to_type(original_field, *file_leaf_type, &file_field, |
655 | 4 | original_literal->data_type().get()); |
656 | 4 | } catch (const Exception&) { |
657 | 0 | return nullptr; |
658 | 0 | } |
659 | 4 | if (file_field.is_null()) { |
660 | 0 | return nullptr; |
661 | 0 | } |
662 | 4 | value_column->insert(file_field); |
663 | 4 | } |
664 | | |
665 | 2 | std::shared_ptr<HybridSetBase> values; |
666 | 2 | try { |
667 | 2 | values.reset(create_set(file_leaf_type->get_primitive_type(), literal_exprs.size(), false)); |
668 | 2 | ColumnPtr value_column_ptr = std::move(value_column); |
669 | 2 | values->insert_range_from(value_column_ptr, 0, value_column_ptr->size()); |
670 | 2 | return create_in_list_predicate<PredicateType::IN_LIST>( |
671 | 2 | cast_set<uint32_t>(root_file_column_id.value()), leaf_name, file_leaf_type, values, |
672 | 2 | false); |
673 | 2 | } catch (const Exception&) { |
674 | 0 | return nullptr; |
675 | 0 | } |
676 | 2 | } |
677 | | |
678 | | static std::shared_ptr<ColumnPredicate> build_nested_null_predicate( |
679 | | bool is_null, LocalColumnId root_file_column_id, const std::string& leaf_name, |
680 | 2 | const DataTypePtr& file_leaf_type) { |
681 | 2 | if (file_leaf_type == nullptr) { |
682 | 0 | return nullptr; |
683 | 0 | } |
684 | 2 | const auto leaf_primitive_type = remove_nullable(file_leaf_type)->get_primitive_type(); |
685 | 2 | return NullPredicate::create_shared(cast_set<uint32_t>(root_file_column_id.value()), leaf_name, |
686 | 2 | is_null, leaf_primitive_type); |
687 | 2 | } |
688 | | |
689 | | static bool set_nested_column_filter_target(const NestedPredicateTarget& target, |
690 | 16 | FileColumnPredicateFilter* column_filter) { |
691 | 16 | DORIS_CHECK(column_filter != nullptr); |
692 | 16 | std::vector<int32_t> file_child_id_path; |
693 | 16 | if (!extract_child_id_path_from_projection(target.file_projection, &file_child_id_path)) { |
694 | 0 | return false; |
695 | 0 | } |
696 | 16 | column_filter->file_column_id = target.file_projection.column_id(); |
697 | 16 | column_filter->file_child_id_path = std::move(file_child_id_path); |
698 | 16 | column_filter->target = target.file_target; |
699 | 16 | return true; |
700 | 16 | } |
701 | | |
702 | | static bool extract_nested_binary_comparison_filter(const VExprSPtr& expr, |
703 | | const std::vector<ColumnMapping>& mappings, |
704 | 63 | FileColumnPredicateFilter* column_filter) { |
705 | 63 | DORIS_CHECK(column_filter != nullptr); |
706 | 63 | if (!is_binary_comparison_predicate(expr)) { |
707 | 24 | return false; |
708 | 24 | } |
709 | 39 | NestedStructPath path; |
710 | 39 | VExprSPtr literal_expr; |
711 | 39 | TExprOpcode::type opcode = expr->op(); |
712 | 39 | if (extract_nested_struct_path_for_pruning(expr->children()[0], &path) && |
713 | 39 | expr->children()[1]->is_literal()) { |
714 | 13 | literal_expr = expr->children()[1]; |
715 | 26 | } else if (extract_nested_struct_path_for_pruning(expr->children()[1], &path) && |
716 | 26 | expr->children()[0]->is_literal()) { |
717 | 1 | literal_expr = expr->children()[0]; |
718 | 1 | opcode = reverse_comparison_opcode(opcode); |
719 | 25 | } else { |
720 | 25 | return false; |
721 | 25 | } |
722 | | |
723 | 14 | NestedPredicateTarget target; |
724 | 14 | if (!resolve_nested_predicate_target(path, mappings, &target)) { |
725 | 2 | return false; |
726 | 2 | } |
727 | 12 | auto predicate = build_nested_comparison_predicate(literal_expr, opcode, |
728 | 12 | target.file_projection.column_id(), |
729 | 12 | target.leaf_name, target.leaf_type); |
730 | 12 | if (predicate == nullptr) { |
731 | 0 | return false; |
732 | 0 | } |
733 | 12 | if (!set_nested_column_filter_target(target, column_filter)) { |
734 | 0 | return false; |
735 | 0 | } |
736 | 12 | column_filter->predicates.push_back(std::move(predicate)); |
737 | 12 | return true; |
738 | 12 | } |
739 | | |
740 | | static bool extract_nested_in_list_filter(const VExprSPtr& expr, |
741 | | const std::vector<ColumnMapping>& mappings, |
742 | 51 | FileColumnPredicateFilter* column_filter) { |
743 | 51 | DORIS_CHECK(column_filter != nullptr); |
744 | 51 | if (expr == nullptr || expr->node_type() != TExprNodeType::IN_PRED || |
745 | 51 | expr->get_num_children() < 2) { |
746 | 45 | return false; |
747 | 45 | } |
748 | 6 | if (const auto* in_predicate = dynamic_cast<const VInPredicate*>(expr.get()); |
749 | 6 | in_predicate != nullptr && in_predicate->is_not_in()) { |
750 | 0 | return false; |
751 | 0 | } |
752 | | |
753 | 6 | NestedStructPath path; |
754 | 6 | if (!extract_nested_struct_path_for_pruning(expr->children()[0], &path)) { |
755 | 4 | return false; |
756 | 4 | } |
757 | | |
758 | 2 | VExprSPtrs literal_exprs; |
759 | 2 | literal_exprs.reserve(expr->get_num_children() - 1); |
760 | 6 | for (size_t child_idx = 1; child_idx < expr->children().size(); ++child_idx) { |
761 | 4 | if (!expr->children()[child_idx]->is_literal()) { |
762 | 0 | return false; |
763 | 0 | } |
764 | 4 | literal_exprs.push_back(expr->children()[child_idx]); |
765 | 4 | } |
766 | | |
767 | 2 | NestedPredicateTarget target; |
768 | 2 | if (!resolve_nested_predicate_target(path, mappings, &target)) { |
769 | 0 | return false; |
770 | 0 | } |
771 | 2 | auto predicate = build_nested_in_list_predicate( |
772 | 2 | literal_exprs, target.file_projection.column_id(), target.leaf_name, target.leaf_type); |
773 | 2 | if (predicate == nullptr) { |
774 | 0 | return false; |
775 | 0 | } |
776 | 2 | if (!set_nested_column_filter_target(target, column_filter)) { |
777 | 0 | return false; |
778 | 0 | } |
779 | 2 | column_filter->predicates.push_back(std::move(predicate)); |
780 | 2 | return true; |
781 | 2 | } |
782 | | |
783 | | static bool extract_nested_null_filter(const VExprSPtr& expr, |
784 | | const std::vector<ColumnMapping>& mappings, |
785 | 49 | FileColumnPredicateFilter* column_filter) { |
786 | 49 | DORIS_CHECK(column_filter != nullptr); |
787 | 49 | bool is_null = false; |
788 | 49 | if (!is_null_predicate_function(expr, &is_null)) { |
789 | 47 | return false; |
790 | 47 | } |
791 | | |
792 | 2 | NestedStructPath path; |
793 | 2 | if (!extract_nested_struct_path_for_pruning(expr->children()[0], &path)) { |
794 | 0 | return false; |
795 | 0 | } |
796 | | |
797 | 2 | NestedPredicateTarget target; |
798 | 2 | if (!resolve_nested_predicate_target(path, mappings, &target)) { |
799 | 0 | return false; |
800 | 0 | } |
801 | 2 | auto predicate = build_nested_null_predicate(is_null, target.file_projection.column_id(), |
802 | 2 | target.leaf_name, target.leaf_type); |
803 | 2 | if (predicate == nullptr) { |
804 | 0 | return false; |
805 | 0 | } |
806 | 2 | if (!set_nested_column_filter_target(target, column_filter)) { |
807 | 0 | return false; |
808 | 0 | } |
809 | 2 | column_filter->predicates.push_back(std::move(predicate)); |
810 | 2 | return true; |
811 | 2 | } |
812 | | |
813 | | } // namespace |
814 | | |
815 | | SplitLocalFileLiteral::SplitLocalFileLiteral(const DataTypePtr& file_type, const Field& file_field, |
816 | | DataTypePtr original_type, Field original_field) |
817 | 22 | : VLiteral(file_type, file_field), |
818 | 22 | _original_type(std::move(original_type)), |
819 | 22 | _original_field(std::move(original_field)) {} |
820 | | |
821 | 216 | GlobalIndex slot_ref_global_index(const VSlotRef& slot_ref) { |
822 | 216 | DORIS_CHECK(slot_ref.column_id() >= 0); |
823 | 216 | return GlobalIndex(cast_set<size_t>(slot_ref.column_id())); |
824 | 216 | } |
825 | | |
826 | 574 | bool is_struct_element_expr(const VExprSPtr& expr) { |
827 | 574 | if (expr == nullptr || expr->get_num_children() != 2) { |
828 | 314 | return false; |
829 | 314 | } |
830 | 260 | const auto& function_name = expr->fn().name.function_name; |
831 | 260 | if (function_name == "struct_element") { |
832 | 138 | return true; |
833 | 138 | } |
834 | 122 | if (function_name != "element_at") { |
835 | 93 | return false; |
836 | 93 | } |
837 | 29 | const auto& parent_type = expr->children()[0]->data_type(); |
838 | 29 | return parent_type != nullptr && |
839 | 29 | remove_nullable(parent_type)->get_primitive_type() == TYPE_STRUCT; |
840 | 122 | } |
841 | | |
842 | 159 | Field literal_field(const VExprSPtr& literal_expr) { |
843 | 159 | DORIS_CHECK(literal_expr != nullptr); |
844 | 159 | DORIS_CHECK(literal_expr->is_literal()); |
845 | 159 | const auto* literal = dynamic_cast<const VLiteral*>(literal_expr.get()); |
846 | 159 | DORIS_CHECK(literal != nullptr); |
847 | 159 | Field field; |
848 | 159 | literal->get_column_ptr()->get(0, field); |
849 | 159 | return field; |
850 | 159 | } |
851 | | |
852 | | bool resolve_nested_struct_path_for_file(const NestedStructPath& path, |
853 | | const std::vector<ColumnMapping>& mappings, |
854 | | ResolvedNestedStructPath* resolved, |
855 | 62 | bool require_scan_projection) { |
856 | 62 | DORIS_CHECK(resolved != nullptr); |
857 | 62 | *resolved = {}; |
858 | 65 | const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) { |
859 | 65 | return mapping.global_index == path.root_global_index; |
860 | 65 | }); |
861 | 62 | if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value() || |
862 | 62 | path.selectors.empty()) { |
863 | 0 | return false; |
864 | 0 | } |
865 | | |
866 | | // Prefer ColumnMapping over schema-name lookup. This is the only path that can correctly |
867 | | // localize renamed Iceberg fields: a table filter `element_at(s, 'renamed_b')` must become a |
868 | | // file filter on physical child `b`, even if the old file type is `STRUCT<b ...>`. |
869 | 62 | const auto mapping_result = |
870 | 62 | resolve_nested_projection_with_mapping(path, mappings, &resolved->file_projection); |
871 | 62 | if (mapping_result == NestedProjectionResolveResult::MISSING_FILE_CHILD) { |
872 | 0 | return false; |
873 | 0 | } |
874 | 62 | if (mapping_result == NestedProjectionResolveResult::NOT_REPRESENTED) { |
875 | 48 | if (!table_root_is_struct(*mapping_it)) { |
876 | 6 | return false; |
877 | 6 | } |
878 | 42 | LocalColumnIndex child_projection; |
879 | 42 | if (!build_file_child_projection_from_schema(mapping_it->original_file_children, |
880 | 42 | path.selectors, &child_projection) |
881 | 42 | .ok() || |
882 | 42 | child_projection.local_id() < 0) { |
883 | 0 | return false; |
884 | 0 | } |
885 | 42 | resolved->file_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id); |
886 | 42 | resolved->file_projection.children.push_back(std::move(child_projection)); |
887 | 42 | } |
888 | | |
889 | 56 | if (resolved->file_projection.children.size() != 1) { |
890 | 0 | *resolved = {}; |
891 | 0 | return false; |
892 | 0 | } |
893 | | // When rewriting the final localized element_at chain, it executes on the file column produced |
894 | | // by this scan, so the intermediate return types must match the projected file shape, not the |
895 | | // full historical file schema. Example: |
896 | | // SELECT s.c WHERE element_at(element_at(s, 'b'), 'cc') LIKE 'NestedC%' |
897 | | // reads only b.cc and c; the inner element_at(s, 'b') returns Struct(cc), not |
898 | | // Struct(cc, new_dd). |
899 | | // |
900 | | // Earlier projection collection also calls this resolver before filter-only children have been |
901 | | // merged into the scan projection. That phase only needs the file path, so it still resolves |
902 | | // names/types from the original file schema. |
903 | 56 | const auto& child_source = require_scan_projection ? scan_file_children(*mapping_it) |
904 | 56 | : mapping_it->original_file_children; |
905 | 56 | if (!collect_file_child_names_from_projection( |
906 | 56 | child_source, resolved->file_projection.children[0], &resolved->file_child_names, |
907 | 56 | &resolved->file_child_types) || |
908 | 56 | resolved->file_child_names.size() != path.selectors.size() || |
909 | 56 | resolved->file_child_types.size() != path.selectors.size()) { |
910 | 0 | *resolved = {}; |
911 | 0 | return false; |
912 | 0 | } |
913 | 56 | return true; |
914 | 56 | } |
915 | | |
916 | | bool resolve_nested_struct_expr_for_file(const VExprSPtr& expr, |
917 | | const std::vector<ColumnMapping>& mappings, |
918 | 23 | ResolvedNestedStructPath* resolved) { |
919 | 23 | DORIS_CHECK(resolved != nullptr); |
920 | 23 | NestedStructPath path; |
921 | 23 | if (!extract_nested_struct_path(expr, &path)) { |
922 | 1 | *resolved = {}; |
923 | 1 | return false; |
924 | 1 | } |
925 | 22 | return resolve_nested_struct_path_for_file(path, mappings, resolved, true); |
926 | 23 | } |
927 | | |
928 | | // Collect nested struct leaf references that can be turned into file-reader projections and |
929 | | // primitive pruning predicates. For example, from `s.a > 1 AND element_at(s, 'b') = 2`, this |
930 | | // records two paths rooted at `s`: `s -> a` and `s -> b`. Non-struct expressions are traversed |
931 | | // recursively, while a recognized struct path is emitted once so the caller can merge it into the |
932 | | // scan projection for that top-level file column. |
933 | 288 | void collect_nested_struct_paths(const VExprSPtr& expr, std::vector<NestedStructPath>* paths) { |
934 | 288 | DORIS_CHECK(paths != nullptr); |
935 | 288 | if (expr == nullptr) { |
936 | 1 | return; |
937 | 1 | } |
938 | 287 | NestedStructPath path; |
939 | 287 | if (extract_nested_struct_path_for_pruning(expr, &path)) { |
940 | 44 | paths->push_back(std::move(path)); |
941 | 44 | return; |
942 | 44 | } |
943 | 243 | for (const auto& child : expr->children()) { |
944 | 189 | collect_nested_struct_paths(child, paths); |
945 | 189 | } |
946 | 243 | } |
947 | | |
948 | | std::vector<const ColumnMapping*> present_child_mappings_in_file_order( |
949 | 223 | const std::vector<ColumnMapping>& child_mappings) { |
950 | 223 | std::vector<const ColumnMapping*> result; |
951 | 223 | result.reserve(child_mappings.size()); |
952 | 223 | for (const auto& child_mapping : child_mappings) { |
953 | 172 | if (child_mapping.file_local_id.has_value()) { |
954 | 141 | result.push_back(&child_mapping); |
955 | 141 | } |
956 | 172 | } |
957 | 223 | std::ranges::sort(result, [](const ColumnMapping* lhs, const ColumnMapping* rhs) { |
958 | 52 | DORIS_CHECK(lhs->file_local_id.has_value()); |
959 | 52 | DORIS_CHECK(rhs->file_local_id.has_value()); |
960 | 52 | return *lhs->file_local_id < *rhs->file_local_id; |
961 | 52 | }); |
962 | 223 | return result; |
963 | 223 | } |
964 | | |
965 | | // Build the nested child projection under a top-level file column by walking file schema children |
966 | | // directly. The returned projection does not include the root column id; callers attach it under a |
967 | | // `LocalColumnIndex::partial_local(root_id)` when merging into the scan request. |
968 | | Status build_file_child_projection_from_schema(const std::vector<ColumnDefinition>& children, |
969 | | std::span<const StructChildSelector> selectors, |
970 | 53 | LocalColumnIndex* projection) { |
971 | 53 | DORIS_CHECK(projection != nullptr); |
972 | 53 | if (selectors.empty()) { |
973 | 0 | return Status::InvalidArgument("Nested struct selector path is empty"); |
974 | 0 | } |
975 | 53 | const auto* child = resolve_file_child(children, selectors.front()); |
976 | 53 | if (child == nullptr) { |
977 | 0 | return Status::OK(); |
978 | 0 | } |
979 | 53 | *projection = LocalColumnIndex::local(child->file_local_id()); |
980 | 53 | projection->project_all_children = selectors.size() == 1; |
981 | 53 | projection->children.clear(); |
982 | 53 | if (selectors.size() == 1) { |
983 | 45 | return Status::OK(); |
984 | 45 | } |
985 | 8 | if (child->children.empty() || |
986 | 8 | remove_nullable(child->type)->get_primitive_type() != TYPE_STRUCT) { |
987 | 0 | *projection = LocalColumnIndex {}; |
988 | 0 | return Status::OK(); |
989 | 0 | } |
990 | 8 | LocalColumnIndex child_projection; |
991 | 8 | RETURN_IF_ERROR(build_file_child_projection_from_schema(child->children, selectors.subspan(1), |
992 | 8 | &child_projection)); |
993 | 8 | if (child_projection.local_id() < 0) { |
994 | 0 | *projection = LocalColumnIndex {}; |
995 | 0 | return Status::OK(); |
996 | 0 | } |
997 | 8 | projection->children.push_back(std::move(child_projection)); |
998 | 8 | return Status::OK(); |
999 | 8 | } |
1000 | | |
1001 | | // Merge predicates that target the same physical file column or nested leaf. For example, |
1002 | | // `WHERE s.b > 1 AND s.b < 10` produces two predicates for the same target `s -> b`; keeping them |
1003 | | // in one FileColumnPredicateFilter lets the file reader apply both pruning checks to the same leaf |
1004 | | // instead of carrying duplicate target entries. |
1005 | | void merge_column_predicate_filter(FileColumnPredicateFilter column_filter, |
1006 | 30 | std::vector<FileColumnPredicateFilter>* filters) { |
1007 | 30 | DORIS_CHECK(filters != nullptr); |
1008 | 30 | auto existing_filter_it = std::ranges::find_if(*filters, [&](const auto& existing_filter) { |
1009 | 5 | return existing_filter.same_target_as(column_filter); |
1010 | 5 | }); |
1011 | 30 | if (existing_filter_it == filters->end()) { |
1012 | 25 | filters->push_back(std::move(column_filter)); |
1013 | 25 | return; |
1014 | 25 | } |
1015 | 5 | existing_filter_it->predicates.insert(existing_filter_it->predicates.end(), |
1016 | 5 | column_filter.predicates.begin(), |
1017 | 5 | column_filter.predicates.end()); |
1018 | 5 | } |
1019 | | |
1020 | | // Extract file-column pruning predicates from localized row-level conjuncts that reference nested |
1021 | | // struct leaves. This is separate from file_request->conjuncts: conjuncts do row filtering, while |
1022 | | // FileColumnPredicateFilter carries primitive leaf predicates for file/page/statistics pruning. |
1023 | | // |
1024 | | // Example: for `WHERE s.b.c > 10 AND element_at(s, 'd') IS NOT NULL`, this function emits pruning |
1025 | | // filters for the nested targets `s -> b -> c` and `s -> d`. The caller only invokes it after |
1026 | | // table_filter_has_only_local_entries() succeeds, so each root slot already has a file-local scan |
1027 | | // source in _filter_entries. |
1028 | | void collect_nested_column_predicate_filters(const VExprSPtr& expr, |
1029 | | const std::vector<ColumnMapping>& mappings, |
1030 | 67 | std::vector<FileColumnPredicateFilter>* filters) { |
1031 | 67 | DORIS_CHECK(filters != nullptr); |
1032 | 67 | if (expr == nullptr) { |
1033 | 0 | return; |
1034 | 0 | } |
1035 | 67 | if (expr->node_type() == TExprNodeType::COMPOUND_PRED && |
1036 | 67 | expr->op() == TExprOpcode::COMPOUND_AND) { |
1037 | 8 | for (const auto& child : expr->children()) { |
1038 | 8 | collect_nested_column_predicate_filters(child, mappings, filters); |
1039 | 8 | } |
1040 | 4 | return; |
1041 | 4 | } |
1042 | 63 | FileColumnPredicateFilter column_filter; |
1043 | 63 | if (extract_nested_binary_comparison_filter(expr, mappings, &column_filter) || |
1044 | 63 | extract_nested_in_list_filter(expr, mappings, &column_filter) || |
1045 | 63 | extract_nested_null_filter(expr, mappings, &column_filter)) { |
1046 | 16 | merge_column_predicate_filter(std::move(column_filter), filters); |
1047 | 16 | } |
1048 | 63 | } |
1049 | | |
1050 | | } // namespace doris::format |