be/src/format_v2/column_mapper_nested.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format_v2/column_mapper_nested.h" |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstdint> |
22 | | #include <memory> |
23 | | #include <optional> |
24 | | #include <utility> |
25 | | |
26 | | #include "common/cast_set.h" |
27 | | #include "common/exception.h" |
28 | | #include "core/assert_cast.h" |
29 | | #include "core/data_type/data_type_nullable.h" |
30 | | #include "core/data_type/data_type_struct.h" |
31 | | #include "core/data_type/primitive_type.h" |
32 | | #include "exprs/vexpr.h" |
33 | | #include "format_v2/expr/cast.h" |
34 | | #include "gen_cpp/Exprs_types.h" |
35 | | |
36 | | namespace doris::format { |
37 | | |
38 | | namespace { |
39 | | |
40 | 253 | static bool is_cast_expr(const VExprSPtr& expr) { |
41 | 253 | return dynamic_cast<const Cast*>(expr.get()) != nullptr; |
42 | 253 | } |
43 | | |
44 | 15 | static bool is_signed_integer_type(PrimitiveType type) { |
45 | 15 | switch (type) { |
46 | 0 | case TYPE_TINYINT: |
47 | 0 | case TYPE_SMALLINT: |
48 | 5 | case TYPE_INT: |
49 | 11 | case TYPE_BIGINT: |
50 | 11 | case TYPE_LARGEINT: |
51 | 11 | return true; |
52 | 4 | default: |
53 | 4 | return false; |
54 | 15 | } |
55 | 15 | } |
56 | | |
57 | 10 | static int primitive_integer_width(PrimitiveType type) { |
58 | 10 | switch (type) { |
59 | 0 | case TYPE_TINYINT: |
60 | 0 | return 1; |
61 | 0 | case TYPE_SMALLINT: |
62 | 0 | return 2; |
63 | 5 | case TYPE_INT: |
64 | 5 | return 4; |
65 | 5 | case TYPE_BIGINT: |
66 | 5 | return 8; |
67 | 0 | case TYPE_LARGEINT: |
68 | 0 | return 16; |
69 | 0 | default: |
70 | 0 | return 0; |
71 | 10 | } |
72 | 10 | } |
73 | | |
74 | 5 | static bool is_decimal_type(PrimitiveType type) { |
75 | 5 | switch (type) { |
76 | 4 | case TYPE_DECIMAL32: |
77 | 4 | case TYPE_DECIMAL64: |
78 | 4 | case TYPE_DECIMALV2: |
79 | 4 | case TYPE_DECIMAL128I: |
80 | 4 | case TYPE_DECIMAL256: |
81 | 4 | return true; |
82 | 1 | default: |
83 | 1 | return false; |
84 | 5 | } |
85 | 5 | } |
86 | | |
87 | | static bool is_order_preserving_safe_cast(const DataTypePtr& from_type, |
88 | 11 | const DataTypePtr& to_type) { |
89 | 11 | if (from_type == nullptr || to_type == nullptr) { |
90 | 0 | return false; |
91 | 0 | } |
92 | 11 | const auto from_nested_type = remove_nullable(from_type); |
93 | 11 | const auto to_nested_type = remove_nullable(to_type); |
94 | 11 | if (from_nested_type->equals(*to_nested_type)) { |
95 | 2 | return true; |
96 | 2 | } |
97 | | |
98 | 9 | const auto from_primitive_type = from_nested_type->get_primitive_type(); |
99 | 9 | const auto to_primitive_type = to_nested_type->get_primitive_type(); |
100 | 9 | if (is_signed_integer_type(from_primitive_type) && is_signed_integer_type(to_primitive_type)) { |
101 | 5 | return primitive_integer_width(to_primitive_type) >= |
102 | 5 | primitive_integer_width(from_primitive_type); |
103 | 5 | } |
104 | 4 | if (from_primitive_type == TYPE_FLOAT && to_primitive_type == TYPE_DOUBLE) { |
105 | 1 | return true; |
106 | 1 | } |
107 | 3 | if (is_decimal_type(from_primitive_type) && is_decimal_type(to_primitive_type)) { |
108 | 2 | return from_nested_type->get_scale() == to_nested_type->get_scale() && |
109 | 2 | to_nested_type->get_precision() >= from_nested_type->get_precision(); |
110 | 2 | } |
111 | 1 | return false; |
112 | 3 | } |
113 | | |
114 | 93 | static bool parse_struct_child_selector(const VExprSPtr& expr, StructChildSelector* selector) { |
115 | 93 | DORIS_CHECK(selector != nullptr); |
116 | 93 | if (expr == nullptr || !expr->is_literal()) { |
117 | 1 | return false; |
118 | 1 | } |
119 | 92 | const Field field = literal_field(expr); |
120 | 92 | switch (field.get_type()) { |
121 | 80 | case TYPE_STRING: |
122 | 80 | case TYPE_CHAR: |
123 | 80 | case TYPE_VARCHAR: |
124 | 80 | selector->by_name = true; |
125 | 80 | selector->name = std::string(field.as_string_view()); |
126 | 80 | return true; |
127 | 2 | case TYPE_BOOLEAN: |
128 | 2 | selector->by_name = false; |
129 | 2 | selector->ordinal = field.get<TYPE_BOOLEAN>() ? 1 : 0; |
130 | 2 | return selector->ordinal > 0; |
131 | 1 | case TYPE_TINYINT: |
132 | 1 | selector->by_name = false; |
133 | 1 | if (field.get<TYPE_TINYINT>() <= 0) { |
134 | 0 | return false; |
135 | 0 | } |
136 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_TINYINT>()); |
137 | 1 | return true; |
138 | 1 | case TYPE_SMALLINT: |
139 | 1 | selector->by_name = false; |
140 | 1 | if (field.get<TYPE_SMALLINT>() <= 0) { |
141 | 0 | return false; |
142 | 0 | } |
143 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_SMALLINT>()); |
144 | 1 | return true; |
145 | 5 | case TYPE_INT: |
146 | 5 | selector->by_name = false; |
147 | 5 | if (field.get<TYPE_INT>() <= 0) { |
148 | 2 | return false; |
149 | 2 | } |
150 | 3 | selector->ordinal = cast_set<size_t>(field.get<TYPE_INT>()); |
151 | 3 | return true; |
152 | 1 | case TYPE_BIGINT: |
153 | 1 | selector->by_name = false; |
154 | 1 | if (field.get<TYPE_BIGINT>() <= 0) { |
155 | 0 | return false; |
156 | 0 | } |
157 | 1 | selector->ordinal = cast_set<size_t>(field.get<TYPE_BIGINT>()); |
158 | 1 | return true; |
159 | 2 | default: |
160 | 2 | return false; |
161 | 92 | } |
162 | 92 | } |
163 | | |
164 | 341 | static bool extract_nested_struct_path(const VExprSPtr& expr, NestedStructPath* path) { |
165 | 341 | DORIS_CHECK(path != nullptr); |
166 | 341 | if (!is_struct_element_expr(expr)) { |
167 | 248 | return false; |
168 | 248 | } |
169 | | |
170 | | // Process for element_at(struct, 'field') or element_at(struct, 1) expression. |
171 | 93 | StructChildSelector selector; |
172 | 93 | if (!parse_struct_child_selector(expr->children()[1], &selector)) { |
173 | 6 | return false; |
174 | 6 | } |
175 | | |
176 | 87 | const auto& parent = expr->children()[0]; |
177 | 87 | if (parent->is_slot_ref()) { |
178 | 66 | const auto* slot_ref = assert_cast<const VSlotRef*>(parent.get()); |
179 | 66 | path->root_global_index = slot_ref_global_index(*slot_ref); |
180 | 66 | path->selectors.clear(); |
181 | 66 | path->selectors.push_back(std::move(selector)); |
182 | 66 | return true; |
183 | 66 | } |
184 | | |
185 | | // Process for element_at(element_at(struct<struct>, 'field'), 'field') or |
186 | | // element_at(element_at(struct<struct>, 1), 1) expression. |
187 | 21 | if (!extract_nested_struct_path(parent, path)) { |
188 | 3 | return false; |
189 | 3 | } |
190 | 18 | path->selectors.push_back(std::move(selector)); |
191 | 18 | return true; |
192 | 21 | } |
193 | | |
194 | 297 | static bool extract_nested_struct_path_for_pruning(const VExprSPtr& expr, NestedStructPath* path) { |
195 | 297 | DORIS_CHECK(path != nullptr); |
196 | | // Simple `ELEMENT_AT` |
197 | 297 | if (extract_nested_struct_path(expr, path)) { |
198 | 44 | return true; |
199 | 44 | } |
200 | | |
201 | | // `ELEMENT_AT` with `CAST` |
202 | 253 | if (!is_cast_expr(expr) || expr->get_num_children() != 1) { |
203 | 242 | return false; |
204 | 242 | } |
205 | 11 | const auto& child = expr->children()[0]; |
206 | 11 | if (!is_order_preserving_safe_cast(child->data_type(), expr->data_type())) { |
207 | 4 | return false; |
208 | 4 | } |
209 | | // A safe widening cast is null-preserving and keeps the comparison ordering of the nested |
210 | | // primitive leaf, so file-layer pruning can target the original leaf statistics. The row-level |
211 | | // filter still evaluates the original cast expression after read. |
212 | 7 | return extract_nested_struct_path_for_pruning(child, path); |
213 | 11 | } |
214 | | |
215 | | static const ColumnDefinition* resolve_file_child(const std::vector<ColumnDefinition>& children, |
216 | 40 | const StructChildSelector& selector) { |
217 | 40 | if (selector.by_name) { |
218 | 50 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
219 | 50 | return child.name == selector.name; |
220 | 50 | }); |
221 | 38 | return child_it == children.end() ? nullptr : &*child_it; |
222 | 38 | } |
223 | 2 | if (selector.ordinal == 0 || selector.ordinal > children.size()) { |
224 | 0 | return nullptr; |
225 | 0 | } |
226 | 2 | return &children[selector.ordinal - 1]; |
227 | 2 | } |
228 | | |
229 | 100 | static const DataTypeStruct* struct_type_or_null(const DataTypePtr& type) { |
230 | 100 | if (type == nullptr) { |
231 | 0 | return nullptr; |
232 | 0 | } |
233 | 100 | const auto nested_type = remove_nullable(type); |
234 | 100 | if (nested_type->get_primitive_type() != TYPE_STRUCT) { |
235 | 8 | return nullptr; |
236 | 8 | } |
237 | 92 | return assert_cast<const DataTypeStruct*>(nested_type.get()); |
238 | 100 | } |
239 | | |
240 | | static std::optional<int32_t> struct_child_index(const ColumnMapping& mapping, |
241 | 46 | const StructChildSelector& selector) { |
242 | 46 | const auto* struct_type = struct_type_or_null(mapping.table_type); |
243 | 46 | if (struct_type == nullptr) { |
244 | 4 | return std::nullopt; |
245 | 4 | } |
246 | 42 | if (selector.by_name) { |
247 | 42 | const auto position = struct_type->try_get_position_by_name(selector.name); |
248 | 42 | if (!position.has_value()) { |
249 | 28 | return std::nullopt; |
250 | 28 | } |
251 | 14 | return cast_set<int32_t>(*position); |
252 | 42 | } |
253 | 0 | if (selector.ordinal == 0 || selector.ordinal > struct_type->get_elements().size()) { |
254 | 0 | return std::nullopt; |
255 | 0 | } |
256 | 0 | return cast_set<int32_t>(selector.ordinal - 1); |
257 | 0 | } |
258 | | |
259 | | // Get the global child index for a child mapping. If the mapping's table type is struct, resolve |
260 | | // the child index by the child mapping's table column name; otherwise, use the fallback child index. |
261 | | static int32_t child_mapping_global_index(const ColumnMapping& mapping, |
262 | | const ColumnMapping& child_mapping, |
263 | 20 | size_t fallback_child_idx) { |
264 | 20 | const auto* struct_type = struct_type_or_null(mapping.table_type); |
265 | 20 | if (struct_type == nullptr) { |
266 | 0 | return cast_set<int32_t>(fallback_child_idx); |
267 | 0 | } |
268 | 20 | const auto position = struct_type->try_get_position_by_name(child_mapping.table_column_name); |
269 | 20 | DORIS_CHECK(position.has_value()) << "Cannot find child '" << child_mapping.table_column_name |
270 | 0 | << "' in table type " << mapping.table_type->get_name(); |
271 | 20 | return cast_set<int32_t>(*position); |
272 | 20 | } |
273 | | |
274 | | static const ColumnMapping* resolve_mapped_child(const ColumnMapping& mapping, |
275 | 14 | int32_t global_child_index) { |
276 | 22 | for (size_t child_idx = 0; child_idx < mapping.child_mappings.size(); ++child_idx) { |
277 | 20 | const auto& child_mapping = mapping.child_mappings[child_idx]; |
278 | 20 | if (child_mapping_global_index(mapping, child_mapping, child_idx) == global_child_index) { |
279 | 12 | return &child_mapping; |
280 | 12 | } |
281 | 20 | } |
282 | 2 | return nullptr; |
283 | 14 | } |
284 | | |
// Outcome of resolving a nested struct path through the ColumnMapping tree.
enum class NestedProjectionResolveResult {
    // Every selector was followed through a child mapping that has a file-local id.
    RESOLVED,
    // The mapping tree cannot express the path (empty path, unknown root, selector that does not
    // resolve to a struct child, or no child mapping for it).
    NOT_REPRESENTED,
    // A child mapping exists for the selector but carries no file-local id.
    MISSING_FILE_CHILD,
};
290 | | |
291 | | // Resolve a table-side nested struct path through the existing ColumnMapping tree and build the |
292 | | // corresponding file-local projection. For example, if table column `s` has children |
293 | | // `{a, renamed_b}` and file column `s` has children `{a, b}`, the filter path |
294 | | // `struct_element(s, 'renamed_b')` is resolved to the file projection `s -> b` by following the |
295 | | // child mapping instead of matching the table child name against the file schema. Return |
296 | | // MISSING_FILE_CHILD when ColumnMapping explicitly says a table child is absent from this file; in |
297 | | // that case callers must not fall back to schema-name lookup, because Iceberg can drop a field and |
298 | | // later add a different field with the same name. |
299 | | static NestedProjectionResolveResult resolve_nested_projection_with_mapping( |
300 | | const NestedStructPath& path, const std::vector<ColumnMapping>& mappings, |
301 | 44 | LocalColumnIndex* root_projection) { |
302 | 44 | DORIS_CHECK(root_projection != nullptr); |
303 | 44 | *root_projection = {}; |
304 | 44 | if (path.selectors.empty()) { |
305 | 0 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
306 | 0 | } |
307 | 46 | const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) { |
308 | 46 | return mapping.global_index == path.root_global_index; |
309 | 46 | }); |
310 | 44 | if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value()) { |
311 | 0 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
312 | 0 | } |
313 | | |
314 | 44 | *root_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id); |
315 | 44 | auto* current_projection = root_projection; |
316 | 44 | const auto* current_mapping = &*mapping_it; |
317 | | |
318 | | // Traverse the ColumnMapping tree according to the table-side struct selectors and emit the |
319 | | // corresponding file-local child ids. A missing child mapping means this predicate-only path |
320 | | // may need schema fallback; an existing child mapping without a file id means the table child |
321 | | // is genuinely absent from this file and must stay above the file reader. |
322 | 56 | for (size_t selector_idx = 0; selector_idx < path.selectors.size(); ++selector_idx) { |
323 | 46 | const auto global_child_index = |
324 | 46 | struct_child_index(*current_mapping, path.selectors[selector_idx]); |
325 | 46 | if (!global_child_index.has_value()) { |
326 | 32 | *root_projection = {}; |
327 | 32 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
328 | 32 | } |
329 | 14 | const auto* child_mapping = resolve_mapped_child(*current_mapping, *global_child_index); |
330 | 14 | if (child_mapping == nullptr) { |
331 | 2 | *root_projection = {}; |
332 | 2 | return NestedProjectionResolveResult::NOT_REPRESENTED; |
333 | 2 | } |
334 | 12 | if (!child_mapping->file_local_id.has_value()) { |
335 | 0 | *root_projection = {}; |
336 | 0 | return NestedProjectionResolveResult::MISSING_FILE_CHILD; |
337 | 0 | } |
338 | | |
339 | 12 | auto child_projection = LocalColumnIndex::partial_local(*child_mapping->file_local_id); |
340 | 12 | child_projection.project_all_children = selector_idx + 1 == path.selectors.size(); |
341 | 12 | current_projection->children.push_back(std::move(child_projection)); |
342 | 12 | current_projection = ¤t_projection->children.back(); |
343 | 12 | current_mapping = child_mapping; |
344 | 12 | } |
345 | 10 | return NestedProjectionResolveResult::RESOLVED; |
346 | 44 | } |
347 | | |
348 | 34 | static bool table_root_is_struct(const ColumnMapping& mapping) { |
349 | 34 | return struct_type_or_null(mapping.table_type) != nullptr; |
350 | 34 | } |
351 | | |
352 | 20 | static const std::vector<ColumnDefinition>& scan_file_children(const ColumnMapping& mapping) { |
353 | 20 | return !mapping.projected_file_children.empty() ? mapping.projected_file_children |
354 | 20 | : mapping.original_file_children; |
355 | 20 | } |
356 | | |
357 | | static bool collect_file_child_names_from_projection(const std::vector<ColumnDefinition>& children, |
358 | | const LocalColumnIndex& projection, |
359 | | std::vector<std::string>* file_child_names, |
360 | 46 | std::vector<DataTypePtr>* file_child_types) { |
361 | 46 | DORIS_CHECK(file_child_names != nullptr); |
362 | 46 | DORIS_CHECK(file_child_types != nullptr); |
363 | 64 | const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) { |
364 | 64 | return child.file_local_id() == projection.local_id(); |
365 | 64 | }); |
366 | 46 | if (child_it == children.end()) { |
367 | 0 | return false; |
368 | 0 | } |
369 | 46 | file_child_names->push_back(child_it->name); |
370 | 46 | file_child_types->push_back(child_it->type); |
371 | 46 | if (projection.children.empty()) { |
372 | 40 | return true; |
373 | 40 | } |
374 | 6 | if (projection.children.size() != 1) { |
375 | 0 | return false; |
376 | 0 | } |
377 | 6 | return collect_file_child_names_from_projection(child_it->children, projection.children[0], |
378 | 6 | file_child_names, file_child_types); |
379 | 6 | } |
380 | | |
381 | | } // namespace |
382 | | |
383 | | SplitLocalFileLiteral::SplitLocalFileLiteral(const DataTypePtr& file_type, const Field& file_field, |
384 | | DataTypePtr original_type, Field original_field) |
385 | 23 | : VLiteral(file_type, file_field), |
386 | 23 | _original_type(std::move(original_type)), |
387 | 23 | _original_field(std::move(original_field)) {} |
388 | | |
389 | 200 | GlobalIndex slot_ref_global_index(const VSlotRef& slot_ref) { |
390 | 200 | DORIS_CHECK(slot_ref.column_id() >= 0); |
391 | 200 | return GlobalIndex(cast_set<size_t>(slot_ref.column_id())); |
392 | 200 | } |
393 | | |
394 | 499 | bool is_struct_element_expr(const VExprSPtr& expr) { |
395 | 499 | if (expr == nullptr || expr->get_num_children() != 2) { |
396 | 261 | return false; |
397 | 261 | } |
398 | 238 | const auto& function_name = expr->fn().name.function_name; |
399 | 238 | if (function_name == "struct_element") { |
400 | 119 | return true; |
401 | 119 | } |
402 | 119 | if (function_name != "element_at") { |
403 | 93 | return false; |
404 | 93 | } |
405 | 26 | const auto& parent_type = expr->children()[0]->data_type(); |
406 | 26 | return parent_type != nullptr && |
407 | 26 | remove_nullable(parent_type)->get_primitive_type() == TYPE_STRUCT; |
408 | 119 | } |
409 | | |
410 | 122 | Field literal_field(const VExprSPtr& literal_expr) { |
411 | 122 | DORIS_CHECK(literal_expr != nullptr); |
412 | 122 | DORIS_CHECK(literal_expr->is_literal()); |
413 | 122 | const auto* literal = dynamic_cast<const VLiteral*>(literal_expr.get()); |
414 | 122 | DORIS_CHECK(literal != nullptr); |
415 | 122 | Field field; |
416 | 122 | literal->get_column_ptr()->get(0, field); |
417 | 122 | return field; |
418 | 122 | } |
419 | | |
420 | | bool resolve_nested_struct_path_for_file(const NestedStructPath& path, |
421 | | const std::vector<ColumnMapping>& mappings, |
422 | | ResolvedNestedStructPath* resolved, |
423 | 44 | bool require_scan_projection) { |
424 | 44 | DORIS_CHECK(resolved != nullptr); |
425 | 44 | *resolved = {}; |
426 | 46 | const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) { |
427 | 46 | return mapping.global_index == path.root_global_index; |
428 | 46 | }); |
429 | 44 | if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value() || |
430 | 44 | path.selectors.empty()) { |
431 | 0 | return false; |
432 | 0 | } |
433 | | |
434 | | // Prefer ColumnMapping over schema-name lookup. This is the only path that can correctly |
435 | | // localize renamed Iceberg fields: a table filter `element_at(s, 'renamed_b')` must become a |
436 | | // file filter on physical child `b`, even if the old file type is `STRUCT<b ...>`. |
437 | 44 | const auto mapping_result = |
438 | 44 | resolve_nested_projection_with_mapping(path, mappings, &resolved->file_projection); |
439 | 44 | if (mapping_result == NestedProjectionResolveResult::MISSING_FILE_CHILD) { |
440 | 0 | return false; |
441 | 0 | } |
442 | 44 | if (mapping_result == NestedProjectionResolveResult::NOT_REPRESENTED) { |
443 | 34 | if (!table_root_is_struct(*mapping_it)) { |
444 | 4 | return false; |
445 | 4 | } |
446 | 30 | LocalColumnIndex child_projection; |
447 | 30 | if (!build_file_child_projection_from_schema(mapping_it->original_file_children, |
448 | 30 | path.selectors, &child_projection) |
449 | 30 | .ok() || |
450 | 30 | child_projection.local_id() < 0) { |
451 | 0 | return false; |
452 | 0 | } |
453 | 30 | resolved->file_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id); |
454 | 30 | resolved->file_projection.children.push_back(std::move(child_projection)); |
455 | 30 | } |
456 | | |
457 | 40 | if (resolved->file_projection.children.size() != 1) { |
458 | 0 | *resolved = {}; |
459 | 0 | return false; |
460 | 0 | } |
461 | | // When rewriting the final localized element_at chain, it executes on the file column produced |
462 | | // by this scan, so the intermediate return types must match the projected file shape, not the |
463 | | // full historical file schema. Example: |
464 | | // SELECT s.c WHERE element_at(element_at(s, 'b'), 'cc') LIKE 'NestedC%' |
465 | | // reads only b.cc and c; the inner element_at(s, 'b') returns Struct(cc), not |
466 | | // Struct(cc, new_dd). |
467 | | // |
468 | | // Earlier projection collection also calls this resolver before filter-only children have been |
469 | | // merged into the scan projection. That phase only needs the file path, so it still resolves |
470 | | // names/types from the original file schema. |
471 | 40 | const auto& child_source = require_scan_projection ? scan_file_children(*mapping_it) |
472 | 40 | : mapping_it->original_file_children; |
473 | 40 | if (!collect_file_child_names_from_projection( |
474 | 40 | child_source, resolved->file_projection.children[0], &resolved->file_child_names, |
475 | 40 | &resolved->file_child_types) || |
476 | 40 | resolved->file_child_names.size() != path.selectors.size() || |
477 | 40 | resolved->file_child_types.size() != path.selectors.size()) { |
478 | 0 | *resolved = {}; |
479 | 0 | return false; |
480 | 0 | } |
481 | 40 | return true; |
482 | 40 | } |
483 | | |
484 | | bool resolve_nested_struct_expr_for_file(const VExprSPtr& expr, |
485 | | const std::vector<ColumnMapping>& mappings, |
486 | 23 | ResolvedNestedStructPath* resolved) { |
487 | 23 | DORIS_CHECK(resolved != nullptr); |
488 | 23 | NestedStructPath path; |
489 | 23 | if (!extract_nested_struct_path(expr, &path)) { |
490 | 1 | *resolved = {}; |
491 | 1 | return false; |
492 | 1 | } |
493 | 22 | return resolve_nested_struct_path_for_file(path, mappings, resolved, true); |
494 | 23 | } |
495 | | |
496 | | // Collect nested struct leaf references that can be turned into file-reader projections. For |
497 | | // example, from `s.a > 1 AND element_at(s, 'b') = 2`, this records two paths rooted at `s`: |
498 | | // `s -> a` and `s -> b`. Non-struct expressions are traversed recursively, while a recognized |
499 | | // struct path is emitted once so the caller can merge it into the scan projection for that |
500 | | // top-level file column. |
501 | 291 | void collect_nested_struct_paths(const VExprSPtr& expr, std::vector<NestedStructPath>* paths) { |
502 | 291 | DORIS_CHECK(paths != nullptr); |
503 | 291 | if (expr == nullptr) { |
504 | 1 | return; |
505 | 1 | } |
506 | 290 | NestedStructPath path; |
507 | 290 | if (extract_nested_struct_path_for_pruning(expr, &path)) { |
508 | 44 | paths->push_back(std::move(path)); |
509 | 44 | return; |
510 | 44 | } |
511 | 246 | for (const auto& child : expr->children()) { |
512 | 191 | collect_nested_struct_paths(child, paths); |
513 | 191 | } |
514 | 246 | } |
515 | | |
516 | | std::vector<const ColumnMapping*> present_child_mappings_in_file_order( |
517 | 223 | const std::vector<ColumnMapping>& child_mappings) { |
518 | 223 | std::vector<const ColumnMapping*> result; |
519 | 223 | result.reserve(child_mappings.size()); |
520 | 223 | for (const auto& child_mapping : child_mappings) { |
521 | 172 | if (child_mapping.file_local_id.has_value()) { |
522 | 141 | result.push_back(&child_mapping); |
523 | 141 | } |
524 | 172 | } |
525 | 223 | std::ranges::sort(result, [](const ColumnMapping* lhs, const ColumnMapping* rhs) { |
526 | 52 | DORIS_CHECK(lhs->file_local_id.has_value()); |
527 | 52 | DORIS_CHECK(rhs->file_local_id.has_value()); |
528 | 52 | return *lhs->file_local_id < *rhs->file_local_id; |
529 | 52 | }); |
530 | 223 | return result; |
531 | 223 | } |
532 | | |
533 | | // Build the nested child projection under a top-level file column by walking file schema children |
534 | | // directly. The returned projection does not include the root column id; callers attach it under a |
535 | | // `LocalColumnIndex::partial_local(root_id)` when merging into the scan request. |
536 | | Status build_file_child_projection_from_schema(const std::vector<ColumnDefinition>& children, |
537 | | std::span<const StructChildSelector> selectors, |
538 | 40 | LocalColumnIndex* projection) { |
539 | 40 | DORIS_CHECK(projection != nullptr); |
540 | 40 | if (selectors.empty()) { |
541 | 0 | return Status::InvalidArgument("Nested struct selector path is empty"); |
542 | 0 | } |
543 | 40 | const auto* child = resolve_file_child(children, selectors.front()); |
544 | 40 | if (child == nullptr) { |
545 | 0 | return Status::OK(); |
546 | 0 | } |
547 | 40 | *projection = LocalColumnIndex::local(child->file_local_id()); |
548 | 40 | projection->project_all_children = selectors.size() == 1; |
549 | 40 | projection->children.clear(); |
550 | 40 | if (selectors.size() == 1) { |
551 | 33 | return Status::OK(); |
552 | 33 | } |
553 | 7 | if (child->children.empty() || |
554 | 7 | remove_nullable(child->type)->get_primitive_type() != TYPE_STRUCT) { |
555 | 0 | *projection = LocalColumnIndex {}; |
556 | 0 | return Status::OK(); |
557 | 0 | } |
558 | 7 | LocalColumnIndex child_projection; |
559 | 7 | RETURN_IF_ERROR(build_file_child_projection_from_schema(child->children, selectors.subspan(1), |
560 | 7 | &child_projection)); |
561 | 7 | if (child_projection.local_id() < 0) { |
562 | 0 | *projection = LocalColumnIndex {}; |
563 | 0 | return Status::OK(); |
564 | 0 | } |
565 | 7 | projection->children.push_back(std::move(child_projection)); |
566 | 7 | return Status::OK(); |
567 | 7 | } |
568 | | |
569 | | } // namespace doris::format |