Coverage Report

Created: 2026-06-29 16:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/column_mapper_nested.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format_v2/column_mapper_nested.h"
19
20
#include <algorithm>
21
#include <cstdint>
22
#include <memory>
23
#include <optional>
24
#include <utility>
25
26
#include "common/cast_set.h"
27
#include "common/exception.h"
28
#include "core/assert_cast.h"
29
#include "core/data_type/convert_field_to_type.h"
30
#include "core/data_type/data_type_nullable.h"
31
#include "core/data_type/data_type_struct.h"
32
#include "core/data_type/primitive_type.h"
33
#include "exprs/create_predicate_function.h"
34
#include "exprs/vexpr.h"
35
#include "exprs/vin_predicate.h"
36
#include "format_v2/expr/cast.h"
37
#include "gen_cpp/Exprs_types.h"
38
#include "storage/predicate/null_predicate.h"
39
#include "storage/predicate/predicate_creator.h"
40
41
namespace doris::format {
42
43
namespace {
44
45
306
static bool is_cast_expr(const VExprSPtr& expr) {
46
306
    return dynamic_cast<const Cast*>(expr.get()) != nullptr;
47
306
}
48
49
63
static bool is_binary_comparison_predicate(const VExprSPtr& expr) {
50
63
    if (expr == nullptr || expr->get_num_children() != 2 ||
51
63
        (expr->node_type() != TExprNodeType::BINARY_PRED &&
52
44
         expr->node_type() != TExprNodeType::NULL_AWARE_BINARY_PRED)) {
53
24
        return false;
54
24
    }
55
39
    switch (expr->op()) {
56
1
    case TExprOpcode::EQ:
57
1
    case TExprOpcode::EQ_FOR_NULL:
58
1
    case TExprOpcode::NE:
59
1
    case TExprOpcode::GE:
60
36
    case TExprOpcode::GT:
61
36
    case TExprOpcode::LE:
62
39
    case TExprOpcode::LT:
63
39
        return true;
64
0
    default:
65
0
        return false;
66
39
    }
67
39
}
68
69
49
static bool is_null_predicate_function(const VExprSPtr& expr, bool* is_null) {
70
49
    DORIS_CHECK(is_null != nullptr);
71
49
    if (expr == nullptr || expr->node_type() != TExprNodeType::FUNCTION_CALL ||
72
49
        expr->get_num_children() != 1) {
73
47
        return false;
74
47
    }
75
2
    if (expr->fn().name.function_name == "is_null_pred") {
76
1
        *is_null = true;
77
1
        return true;
78
1
    }
79
1
    if (expr->fn().name.function_name == "is_not_null_pred") {
80
1
        *is_null = false;
81
1
        return true;
82
1
    }
83
0
    return false;
84
1
}
85
86
23
static bool is_signed_integer_type(PrimitiveType type) {
87
23
    switch (type) {
88
0
    case TYPE_TINYINT:
89
0
    case TYPE_SMALLINT:
90
8
    case TYPE_INT:
91
18
    case TYPE_BIGINT:
92
18
    case TYPE_LARGEINT:
93
18
        return true;
94
5
    default:
95
5
        return false;
96
23
    }
97
23
}
98
99
16
static int primitive_integer_width(PrimitiveType type) {
100
16
    switch (type) {
101
0
    case TYPE_TINYINT:
102
0
        return 1;
103
0
    case TYPE_SMALLINT:
104
0
        return 2;
105
8
    case TYPE_INT:
106
8
        return 4;
107
8
    case TYPE_BIGINT:
108
8
        return 8;
109
0
    case TYPE_LARGEINT:
110
0
        return 16;
111
0
    default:
112
0
        return 0;
113
16
    }
114
16
}
115
116
6
static bool is_decimal_type(PrimitiveType type) {
117
6
    switch (type) {
118
4
    case TYPE_DECIMAL32:
119
4
    case TYPE_DECIMAL64:
120
4
    case TYPE_DECIMALV2:
121
4
    case TYPE_DECIMAL128I:
122
4
    case TYPE_DECIMAL256:
123
4
        return true;
124
2
    default:
125
2
        return false;
126
6
    }
127
6
}
128
129
static bool is_order_preserving_safe_cast(const DataTypePtr& from_type,
130
15
                                          const DataTypePtr& to_type) {
131
15
    if (from_type == nullptr || to_type == nullptr) {
132
0
        return false;
133
0
    }
134
15
    const auto from_nested_type = remove_nullable(from_type);
135
15
    const auto to_nested_type = remove_nullable(to_type);
136
15
    if (from_nested_type->equals(*to_nested_type)) {
137
2
        return true;
138
2
    }
139
140
13
    const auto from_primitive_type = from_nested_type->get_primitive_type();
141
13
    const auto to_primitive_type = to_nested_type->get_primitive_type();
142
13
    if (is_signed_integer_type(from_primitive_type) && is_signed_integer_type(to_primitive_type)) {
143
8
        return primitive_integer_width(to_primitive_type) >=
144
8
               primitive_integer_width(from_primitive_type);
145
8
    }
146
5
    if (from_primitive_type == TYPE_FLOAT && to_primitive_type == TYPE_DOUBLE) {
147
1
        return true;
148
1
    }
149
4
    if (is_decimal_type(from_primitive_type) && is_decimal_type(to_primitive_type)) {
150
2
        return from_nested_type->get_scale() == to_nested_type->get_scale() &&
151
2
               to_nested_type->get_precision() >= from_nested_type->get_precision();
152
2
    }
153
2
    return false;
154
4
}
155
156
115
static bool parse_struct_child_selector(const VExprSPtr& expr, StructChildSelector* selector) {
157
115
    DORIS_CHECK(selector != nullptr);
158
115
    if (expr == nullptr || !expr->is_literal()) {
159
1
        return false;
160
1
    }
161
114
    const Field field = literal_field(expr);
162
114
    switch (field.get_type()) {
163
102
    case TYPE_STRING:
164
102
    case TYPE_CHAR:
165
102
    case TYPE_VARCHAR:
166
102
        selector->by_name = true;
167
102
        selector->name = std::string(field.as_string_view());
168
102
        return true;
169
2
    case TYPE_BOOLEAN:
170
2
        selector->by_name = false;
171
2
        selector->ordinal = field.get<TYPE_BOOLEAN>() ? 1 : 0;
172
2
        return selector->ordinal > 0;
173
1
    case TYPE_TINYINT:
174
1
        selector->by_name = false;
175
1
        if (field.get<TYPE_TINYINT>() <= 0) {
176
0
            return false;
177
0
        }
178
1
        selector->ordinal = cast_set<size_t>(field.get<TYPE_TINYINT>());
179
1
        return true;
180
1
    case TYPE_SMALLINT:
181
1
        selector->by_name = false;
182
1
        if (field.get<TYPE_SMALLINT>() <= 0) {
183
0
            return false;
184
0
        }
185
1
        selector->ordinal = cast_set<size_t>(field.get<TYPE_SMALLINT>());
186
1
        return true;
187
5
    case TYPE_INT:
188
5
        selector->by_name = false;
189
5
        if (field.get<TYPE_INT>() <= 0) {
190
2
            return false;
191
2
        }
192
3
        selector->ordinal = cast_set<size_t>(field.get<TYPE_INT>());
193
3
        return true;
194
1
    case TYPE_BIGINT:
195
1
        selector->by_name = false;
196
1
        if (field.get<TYPE_BIGINT>() <= 0) {
197
0
            return false;
198
0
        }
199
1
        selector->ordinal = cast_set<size_t>(field.get<TYPE_BIGINT>());
200
1
        return true;
201
2
    default:
202
2
        return false;
203
114
    }
204
114
}
205
206
416
static bool extract_nested_struct_path(const VExprSPtr& expr, NestedStructPath* path) {
207
416
    DORIS_CHECK(path != nullptr);
208
416
    if (!is_struct_element_expr(expr)) {
209
301
        return false;
210
301
    }
211
212
    // Process for element_at(struct, 'field') or element_at(struct, 1) expression.
213
115
    StructChildSelector selector;
214
115
    if (!parse_struct_child_selector(expr->children()[1], &selector)) {
215
6
        return false;
216
6
    }
217
218
109
    const auto& parent = expr->children()[0];
219
109
    if (parent->is_slot_ref()) {
220
84
        const auto* slot_ref = assert_cast<const VSlotRef*>(parent.get());
221
84
        path->root_global_index = slot_ref_global_index(*slot_ref);
222
84
        path->selectors.clear();
223
84
        path->selectors.push_back(std::move(selector));
224
84
        return true;
225
84
    }
226
227
    // Process for element_at(element_at(struct<struct>, 'field'), 'field') or
228
    // element_at(element_at(struct<struct>, 1), 1) expression.
229
25
    if (!extract_nested_struct_path(parent, path)) {
230
3
        return false;
231
3
    }
232
22
    path->selectors.push_back(std::move(selector));
233
22
    return true;
234
25
}
235
236
368
static bool extract_nested_struct_path_for_pruning(const VExprSPtr& expr, NestedStructPath* path) {
237
368
    DORIS_CHECK(path != nullptr);
238
    // Simple `ELEMENT_AT`
239
368
    if (extract_nested_struct_path(expr, path)) {
240
62
        return true;
241
62
    }
242
243
    // `ELEMENT_AT` with `CAST`
244
306
    if (!is_cast_expr(expr) || expr->get_num_children() != 1) {
245
291
        return false;
246
291
    }
247
15
    const auto& child = expr->children()[0];
248
15
    if (!is_order_preserving_safe_cast(child->data_type(), expr->data_type())) {
249
7
        return false;
250
7
    }
251
    // A safe widening cast is null-preserving and keeps the comparison ordering of the nested
252
    // primitive leaf, so file-layer pruning can target the original leaf statistics. The row-level
253
    // filter still evaluates the original cast expression after read.
254
8
    return extract_nested_struct_path_for_pruning(child, path);
255
15
}
256
257
static const ColumnDefinition* resolve_file_child(const std::vector<ColumnDefinition>& children,
258
53
                                                  const StructChildSelector& selector) {
259
53
    if (selector.by_name) {
260
67
        const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) {
261
67
            return child.name == selector.name;
262
67
        });
263
51
        return child_it == children.end() ? nullptr : &*child_it;
264
51
    }
265
2
    if (selector.ordinal == 0 || selector.ordinal > children.size()) {
266
0
        return nullptr;
267
0
    }
268
2
    return &children[selector.ordinal - 1];
269
2
}
270
271
141
static const DataTypeStruct* struct_type_or_null(const DataTypePtr& type) {
272
141
    if (type == nullptr) {
273
0
        return nullptr;
274
0
    }
275
141
    const auto nested_type = remove_nullable(type);
276
141
    if (nested_type->get_primitive_type() != TYPE_STRUCT) {
277
12
        return nullptr;
278
12
    }
279
129
    return assert_cast<const DataTypeStruct*>(nested_type.get());
280
141
}
281
282
static std::optional<int32_t> struct_child_index(const ColumnMapping& mapping,
283
65
                                                 const StructChildSelector& selector) {
284
65
    const auto* struct_type = struct_type_or_null(mapping.table_type);
285
65
    if (struct_type == nullptr) {
286
6
        return std::nullopt;
287
6
    }
288
59
    if (selector.by_name) {
289
59
        const auto position = struct_type->try_get_position_by_name(selector.name);
290
59
        if (!position.has_value()) {
291
39
            return std::nullopt;
292
39
        }
293
20
        return cast_set<int32_t>(*position);
294
59
    }
295
0
    if (selector.ordinal == 0 || selector.ordinal > struct_type->get_elements().size()) {
296
0
        return std::nullopt;
297
0
    }
298
0
    return cast_set<int32_t>(selector.ordinal - 1);
299
0
}
300
301
// Get the global child index for a child mapping. If the mapping's table type is struct, resolve
302
// the child index by the child mapping's table column name; otherwise, use the fallback child index.
303
static int32_t child_mapping_global_index(const ColumnMapping& mapping,
304
                                          const ColumnMapping& child_mapping,
305
28
                                          size_t fallback_child_idx) {
306
28
    const auto* struct_type = struct_type_or_null(mapping.table_type);
307
28
    if (struct_type == nullptr) {
308
0
        return cast_set<int32_t>(fallback_child_idx);
309
0
    }
310
28
    const auto position = struct_type->try_get_position_by_name(child_mapping.table_column_name);
311
28
    DORIS_CHECK(position.has_value()) << "Cannot find child '" << child_mapping.table_column_name
312
0
                                      << "' in table type " << mapping.table_type->get_name();
313
28
    return cast_set<int32_t>(*position);
314
28
}
315
316
static const ColumnMapping* resolve_mapped_child(const ColumnMapping& mapping,
317
20
                                                 int32_t global_child_index) {
318
31
    for (size_t child_idx = 0; child_idx < mapping.child_mappings.size(); ++child_idx) {
319
28
        const auto& child_mapping = mapping.child_mappings[child_idx];
320
28
        if (child_mapping_global_index(mapping, child_mapping, child_idx) == global_child_index) {
321
17
            return &child_mapping;
322
17
        }
323
28
    }
324
3
    return nullptr;
325
20
}
326
327
enum class NestedProjectionResolveResult {
328
    RESOLVED,
329
    NOT_REPRESENTED,
330
    MISSING_FILE_CHILD,
331
};
332
333
// Resolve a table-side nested struct path through the existing ColumnMapping tree and build the
334
// corresponding file-local projection. For example, if table column `s` has children
335
// `{a, renamed_b}` and file column `s` has children `{a, b}`, the filter path
336
// `struct_element(s, 'renamed_b')` is resolved to the file projection `s -> b` by following the
337
// child mapping instead of matching the table child name against the file schema. Return
338
// MISSING_FILE_CHILD when ColumnMapping explicitly says a table child is absent from this file; in
339
// that case callers must not fall back to schema-name lookup, because Iceberg can drop a field and
340
// later add a different field with the same name.
341
static NestedProjectionResolveResult resolve_nested_projection_with_mapping(
342
        const NestedStructPath& path, const std::vector<ColumnMapping>& mappings,
343
62
        LocalColumnIndex* root_projection) {
344
62
    DORIS_CHECK(root_projection != nullptr);
345
62
    *root_projection = {};
346
62
    if (path.selectors.empty()) {
347
0
        return NestedProjectionResolveResult::NOT_REPRESENTED;
348
0
    }
349
65
    const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) {
350
65
        return mapping.global_index == path.root_global_index;
351
65
    });
352
62
    if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value()) {
353
0
        return NestedProjectionResolveResult::NOT_REPRESENTED;
354
0
    }
355
356
62
    *root_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id);
357
62
    auto* current_projection = root_projection;
358
62
    const auto* current_mapping = &*mapping_it;
359
360
    // Traverse the ColumnMapping tree according to the table-side struct selectors and emit the
361
    // corresponding file-local child ids. A missing child mapping means this predicate-only path
362
    // may need schema fallback; an existing child mapping without a file id means the table child
363
    // is genuinely absent from this file and must stay above the file reader.
364
79
    for (size_t selector_idx = 0; selector_idx < path.selectors.size(); ++selector_idx) {
365
65
        const auto global_child_index =
366
65
                struct_child_index(*current_mapping, path.selectors[selector_idx]);
367
65
        if (!global_child_index.has_value()) {
368
45
            *root_projection = {};
369
45
            return NestedProjectionResolveResult::NOT_REPRESENTED;
370
45
        }
371
20
        const auto* child_mapping = resolve_mapped_child(*current_mapping, *global_child_index);
372
20
        if (child_mapping == nullptr) {
373
3
            *root_projection = {};
374
3
            return NestedProjectionResolveResult::NOT_REPRESENTED;
375
3
        }
376
17
        if (!child_mapping->file_local_id.has_value()) {
377
0
            *root_projection = {};
378
0
            return NestedProjectionResolveResult::MISSING_FILE_CHILD;
379
0
        }
380
381
17
        auto child_projection = LocalColumnIndex::partial_local(*child_mapping->file_local_id);
382
17
        child_projection.project_all_children = selector_idx + 1 == path.selectors.size();
383
17
        current_projection->children.push_back(std::move(child_projection));
384
17
        current_projection = &current_projection->children.back();
385
17
        current_mapping = child_mapping;
386
17
    }
387
14
    return NestedProjectionResolveResult::RESOLVED;
388
62
}
389
390
48
static bool table_root_is_struct(const ColumnMapping& mapping) {
391
48
    return struct_type_or_null(mapping.table_type) != nullptr;
392
48
}
393
394
20
static const std::vector<ColumnDefinition>& scan_file_children(const ColumnMapping& mapping) {
395
20
    return !mapping.projected_file_children.empty() ? mapping.projected_file_children
396
20
                                                    : mapping.original_file_children;
397
20
}
398
399
static const ColumnDefinition* resolve_file_leaf_from_projection(
400
18
        const std::vector<ColumnDefinition>& children, const LocalColumnIndex& projection) {
401
25
    const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) {
402
25
        return child.file_local_id() == projection.local_id();
403
25
    });
404
18
    if (child_it == children.end()) {
405
0
        return nullptr;
406
0
    }
407
18
    if (projection.children.empty()) {
408
16
        return &*child_it;
409
16
    }
410
2
    if (projection.children.size() != 1) {
411
0
        return nullptr;
412
0
    }
413
2
    return resolve_file_leaf_from_projection(child_it->children, projection.children[0]);
414
2
}
415
416
static bool collect_file_child_names_from_projection(const std::vector<ColumnDefinition>& children,
417
                                                     const LocalColumnIndex& projection,
418
                                                     std::vector<std::string>* file_child_names,
419
64
                                                     std::vector<DataTypePtr>* file_child_types) {
420
64
    DORIS_CHECK(file_child_names != nullptr);
421
64
    DORIS_CHECK(file_child_types != nullptr);
422
89
    const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) {
423
89
        return child.file_local_id() == projection.local_id();
424
89
    });
425
64
    if (child_it == children.end()) {
426
0
        return false;
427
0
    }
428
64
    file_child_names->push_back(child_it->name);
429
64
    file_child_types->push_back(child_it->type);
430
64
    if (projection.children.empty()) {
431
56
        return true;
432
56
    }
433
8
    if (projection.children.size() != 1) {
434
0
        return false;
435
0
    }
436
8
    return collect_file_child_names_from_projection(child_it->children, projection.children[0],
437
8
                                                    file_child_names, file_child_types);
438
8
}
439
440
struct NestedPredicateTarget {
441
    LocalColumnIndex file_projection;
442
    FileNestedPredicateTarget file_target;
443
    std::string leaf_name;
444
    DataTypePtr leaf_type;
445
};
446
447
static std::unique_ptr<FileStructPredicateTarget> build_struct_predicate_target_from_projection(
448
18
        const std::vector<ColumnDefinition>& children, const LocalColumnIndex& projection) {
449
25
    const auto child_it = std::ranges::find_if(children, [&](const ColumnDefinition& child) {
450
25
        return child.file_local_id() == projection.local_id();
451
25
    });
452
18
    if (child_it == children.end()) {
453
0
        return nullptr;
454
0
    }
455
18
    std::unique_ptr<FileStructPredicateTarget> nested_child;
456
18
    if (!projection.children.empty()) {
457
2
        if (projection.children.size() != 1) {
458
0
            return nullptr;
459
0
        }
460
2
        nested_child = build_struct_predicate_target_from_projection(child_it->children,
461
2
                                                                     projection.children[0]);
462
2
        if (nested_child == nullptr) {
463
0
            return nullptr;
464
0
        }
465
2
    }
466
18
    return std::make_unique<FileStructPredicateTarget>(child_it->file_local_id(), child_it->name,
467
18
                                                       std::move(nested_child));
468
18
}
469
470
static bool build_struct_predicate_target(const ColumnMapping& root_mapping,
471
                                          const LocalColumnIndex& root_projection,
472
16
                                          FileNestedPredicateTarget* file_target) {
473
16
    DORIS_CHECK(file_target != nullptr);
474
16
    if (!root_projection.column_id().is_valid() || root_projection.children.size() != 1) {
475
0
        return false;
476
0
    }
477
16
    auto struct_target = build_struct_predicate_target_from_projection(
478
16
            root_mapping.original_file_children, root_projection.children[0]);
479
16
    if (struct_target == nullptr) {
480
0
        return false;
481
0
    }
482
16
    *file_target = FileNestedPredicateTarget(root_projection.column_id(), std::move(struct_target));
483
16
    return true;
484
16
}
485
486
static bool resolve_nested_predicate_target(const NestedStructPath& path,
487
                                            const std::vector<ColumnMapping>& mappings,
488
18
                                            NestedPredicateTarget* target) {
489
18
    DORIS_CHECK(target != nullptr);
490
18
    ResolvedNestedStructPath resolved;
491
18
    if (!resolve_nested_struct_path_for_file(path, mappings, &resolved)) {
492
2
        return false;
493
2
    }
494
495
17
    const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) {
496
17
        return mapping.global_index == path.root_global_index;
497
17
    });
498
16
    if (mapping_it == mappings.end() || resolved.file_projection.children.size() != 1) {
499
0
        return false;
500
0
    }
501
16
    const auto* file_leaf = resolve_file_leaf_from_projection(mapping_it->original_file_children,
502
16
                                                              resolved.file_projection.children[0]);
503
16
    if (file_leaf == nullptr || file_leaf->type == nullptr) {
504
0
        return false;
505
0
    }
506
16
    target->leaf_type = remove_nullable(file_leaf->type);
507
16
    if (is_complex_type(target->leaf_type->get_primitive_type())) {
508
0
        return false;
509
0
    }
510
16
    target->leaf_name = file_leaf->name;
511
16
    target->file_projection = std::move(resolved.file_projection);
512
16
    if (!build_struct_predicate_target(*mapping_it, target->file_projection,
513
16
                                       &target->file_target)) {
514
0
        return false;
515
0
    }
516
16
    return true;
517
16
}
518
519
16
static VExprSPtr original_table_literal_for_nested_predicate(const VExprSPtr& literal_expr) {
520
16
    DORIS_CHECK(literal_expr != nullptr);
521
16
    DORIS_CHECK(literal_expr->is_literal());
522
16
    const auto* rewritten_literal = dynamic_cast<const SplitLocalFileLiteral*>(literal_expr.get());
523
16
    if (rewritten_literal == nullptr) {
524
16
        return literal_expr;
525
16
    }
526
0
    return VLiteral::create_shared(rewritten_literal->original_type(),
527
0
                                   rewritten_literal->original_field());
528
16
}
529
530
12
static std::optional<PredicateType> to_column_predicate_type(TExprOpcode::type opcode) {
531
12
    switch (opcode) {
532
1
    case TExprOpcode::EQ:
533
1
        return PredicateType::EQ;
534
0
    case TExprOpcode::NE:
535
0
        return PredicateType::NE;
536
10
    case TExprOpcode::GT:
537
10
        return PredicateType::GT;
538
0
    case TExprOpcode::GE:
539
0
        return PredicateType::GE;
540
1
    case TExprOpcode::LT:
541
1
        return PredicateType::LT;
542
0
    case TExprOpcode::LE:
543
0
        return PredicateType::LE;
544
0
    default:
545
0
        return std::nullopt;
546
12
    }
547
12
}
548
549
1
static TExprOpcode::type reverse_comparison_opcode(TExprOpcode::type opcode) {
550
1
    switch (opcode) {
551
0
    case TExprOpcode::GT:
552
0
        return TExprOpcode::LT;
553
0
    case TExprOpcode::GE:
554
0
        return TExprOpcode::LE;
555
1
    case TExprOpcode::LT:
556
1
        return TExprOpcode::GT;
557
0
    case TExprOpcode::LE:
558
0
        return TExprOpcode::GE;
559
0
    default:
560
0
        return opcode;
561
1
    }
562
1
}
563
564
static std::shared_ptr<ColumnPredicate> create_comparison_column_predicate(
565
        PredicateType predicate_type, uint32_t column_id, const std::string& column_name,
566
12
        const DataTypePtr& data_type, const Field& value) {
567
12
    switch (predicate_type) {
568
1
    case PredicateType::EQ:
569
1
        return create_comparison_predicate<PredicateType::EQ>(column_id, column_name, data_type,
570
1
                                                              value, false);
571
0
    case PredicateType::NE:
572
0
        return create_comparison_predicate<PredicateType::NE>(column_id, column_name, data_type,
573
0
                                                              value, false);
574
10
    case PredicateType::GT:
575
10
        return create_comparison_predicate<PredicateType::GT>(column_id, column_name, data_type,
576
10
                                                              value, false);
577
0
    case PredicateType::GE:
578
0
        return create_comparison_predicate<PredicateType::GE>(column_id, column_name, data_type,
579
0
                                                              value, false);
580
1
    case PredicateType::LT:
581
1
        return create_comparison_predicate<PredicateType::LT>(column_id, column_name, data_type,
582
1
                                                              value, false);
583
0
    case PredicateType::LE:
584
0
        return create_comparison_predicate<PredicateType::LE>(column_id, column_name, data_type,
585
0
                                                              value, false);
586
0
    default:
587
0
        return nullptr;
588
12
    }
589
12
}
590
591
static bool extract_child_id_path_from_projection(const LocalColumnIndex& root_projection,
592
16
                                                  std::vector<int32_t>* file_child_id_path) {
593
16
    DORIS_CHECK(file_child_id_path != nullptr);
594
16
    file_child_id_path->clear();
595
16
    const auto* current_projection = &root_projection;
596
34
    while (!current_projection->children.empty()) {
597
18
        if (current_projection->children.size() != 1) {
598
0
            file_child_id_path->clear();
599
0
            return false;
600
0
        }
601
18
        current_projection = &current_projection->children[0];
602
18
        file_child_id_path->push_back(current_projection->local_id());
603
18
    }
604
16
    return !file_child_id_path->empty();
605
16
}
606
607
static std::shared_ptr<ColumnPredicate> build_nested_comparison_predicate(
608
        const VExprSPtr& literal_expr, TExprOpcode::type opcode, LocalColumnId root_file_column_id,
609
12
        const std::string& leaf_name, const DataTypePtr& file_leaf_type) {
610
12
    if (literal_expr == nullptr || !literal_expr->is_literal() || file_leaf_type == nullptr) {
611
0
        return nullptr;
612
0
    }
613
12
    const auto predicate_type = to_column_predicate_type(opcode);
614
12
    if (!predicate_type.has_value()) {
615
0
        return nullptr;
616
0
    }
617
12
    const auto original_literal = original_table_literal_for_nested_predicate(literal_expr);
618
12
    const Field original_field = literal_field(original_literal);
619
12
    Field file_field;
620
12
    try {
621
12
        convert_field_to_type(original_field, *file_leaf_type, &file_field,
622
12
                              original_literal->data_type().get());
623
12
    } catch (const Exception&) {
624
0
        return nullptr;
625
0
    }
626
12
    if (file_field.is_null()) {
627
0
        return nullptr;
628
0
    }
629
12
    try {
630
12
        return create_comparison_column_predicate(*predicate_type,
631
12
                                                  cast_set<uint32_t>(root_file_column_id.value()),
632
12
                                                  leaf_name, file_leaf_type, file_field);
633
12
    } catch (const Exception&) {
634
0
        return nullptr;
635
0
    }
636
12
}
637
638
static std::shared_ptr<ColumnPredicate> build_nested_in_list_predicate(
639
        const VExprSPtrs& literal_exprs, LocalColumnId root_file_column_id,
640
2
        const std::string& leaf_name, const DataTypePtr& file_leaf_type) {
641
2
    if (literal_exprs.empty() || file_leaf_type == nullptr) {
642
0
        return nullptr;
643
0
    }
644
645
2
    auto value_column = file_leaf_type->create_column();
646
4
    for (const auto& literal_expr : literal_exprs) {
647
4
        if (literal_expr == nullptr || !literal_expr->is_literal()) {
648
0
            return nullptr;
649
0
        }
650
4
        const auto original_literal = original_table_literal_for_nested_predicate(literal_expr);
651
4
        const Field original_field = literal_field(original_literal);
652
4
        Field file_field;
653
4
        try {
654
4
            convert_field_to_type(original_field, *file_leaf_type, &file_field,
655
4
                                  original_literal->data_type().get());
656
4
        } catch (const Exception&) {
657
0
            return nullptr;
658
0
        }
659
4
        if (file_field.is_null()) {
660
0
            return nullptr;
661
0
        }
662
4
        value_column->insert(file_field);
663
4
    }
664
665
2
    std::shared_ptr<HybridSetBase> values;
666
2
    try {
667
2
        values.reset(create_set(file_leaf_type->get_primitive_type(), literal_exprs.size(), false));
668
2
        ColumnPtr value_column_ptr = std::move(value_column);
669
2
        values->insert_range_from(value_column_ptr, 0, value_column_ptr->size());
670
2
        return create_in_list_predicate<PredicateType::IN_LIST>(
671
2
                cast_set<uint32_t>(root_file_column_id.value()), leaf_name, file_leaf_type, values,
672
2
                false);
673
2
    } catch (const Exception&) {
674
0
        return nullptr;
675
0
    }
676
2
}
677
678
static std::shared_ptr<ColumnPredicate> build_nested_null_predicate(
679
        bool is_null, LocalColumnId root_file_column_id, const std::string& leaf_name,
680
2
        const DataTypePtr& file_leaf_type) {
681
2
    if (file_leaf_type == nullptr) {
682
0
        return nullptr;
683
0
    }
684
2
    const auto leaf_primitive_type = remove_nullable(file_leaf_type)->get_primitive_type();
685
2
    return NullPredicate::create_shared(cast_set<uint32_t>(root_file_column_id.value()), leaf_name,
686
2
                                        is_null, leaf_primitive_type);
687
2
}
688
689
static bool set_nested_column_filter_target(const NestedPredicateTarget& target,
690
16
                                            FileColumnPredicateFilter* column_filter) {
691
16
    DORIS_CHECK(column_filter != nullptr);
692
16
    std::vector<int32_t> file_child_id_path;
693
16
    if (!extract_child_id_path_from_projection(target.file_projection, &file_child_id_path)) {
694
0
        return false;
695
0
    }
696
16
    column_filter->file_column_id = target.file_projection.column_id();
697
16
    column_filter->file_child_id_path = std::move(file_child_id_path);
698
16
    column_filter->target = target.file_target;
699
16
    return true;
700
16
}
701
702
static bool extract_nested_binary_comparison_filter(const VExprSPtr& expr,
703
                                                    const std::vector<ColumnMapping>& mappings,
704
63
                                                    FileColumnPredicateFilter* column_filter) {
705
63
    DORIS_CHECK(column_filter != nullptr);
706
63
    if (!is_binary_comparison_predicate(expr)) {
707
24
        return false;
708
24
    }
709
39
    NestedStructPath path;
710
39
    VExprSPtr literal_expr;
711
39
    TExprOpcode::type opcode = expr->op();
712
39
    if (extract_nested_struct_path_for_pruning(expr->children()[0], &path) &&
713
39
        expr->children()[1]->is_literal()) {
714
13
        literal_expr = expr->children()[1];
715
26
    } else if (extract_nested_struct_path_for_pruning(expr->children()[1], &path) &&
716
26
               expr->children()[0]->is_literal()) {
717
1
        literal_expr = expr->children()[0];
718
1
        opcode = reverse_comparison_opcode(opcode);
719
25
    } else {
720
25
        return false;
721
25
    }
722
723
14
    NestedPredicateTarget target;
724
14
    if (!resolve_nested_predicate_target(path, mappings, &target)) {
725
2
        return false;
726
2
    }
727
12
    auto predicate = build_nested_comparison_predicate(literal_expr, opcode,
728
12
                                                       target.file_projection.column_id(),
729
12
                                                       target.leaf_name, target.leaf_type);
730
12
    if (predicate == nullptr) {
731
0
        return false;
732
0
    }
733
12
    if (!set_nested_column_filter_target(target, column_filter)) {
734
0
        return false;
735
0
    }
736
12
    column_filter->predicates.push_back(std::move(predicate));
737
12
    return true;
738
12
}
739
740
static bool extract_nested_in_list_filter(const VExprSPtr& expr,
741
                                          const std::vector<ColumnMapping>& mappings,
742
51
                                          FileColumnPredicateFilter* column_filter) {
743
51
    DORIS_CHECK(column_filter != nullptr);
744
51
    if (expr == nullptr || expr->node_type() != TExprNodeType::IN_PRED ||
745
51
        expr->get_num_children() < 2) {
746
45
        return false;
747
45
    }
748
6
    if (const auto* in_predicate = dynamic_cast<const VInPredicate*>(expr.get());
749
6
        in_predicate != nullptr && in_predicate->is_not_in()) {
750
0
        return false;
751
0
    }
752
753
6
    NestedStructPath path;
754
6
    if (!extract_nested_struct_path_for_pruning(expr->children()[0], &path)) {
755
4
        return false;
756
4
    }
757
758
2
    VExprSPtrs literal_exprs;
759
2
    literal_exprs.reserve(expr->get_num_children() - 1);
760
6
    for (size_t child_idx = 1; child_idx < expr->children().size(); ++child_idx) {
761
4
        if (!expr->children()[child_idx]->is_literal()) {
762
0
            return false;
763
0
        }
764
4
        literal_exprs.push_back(expr->children()[child_idx]);
765
4
    }
766
767
2
    NestedPredicateTarget target;
768
2
    if (!resolve_nested_predicate_target(path, mappings, &target)) {
769
0
        return false;
770
0
    }
771
2
    auto predicate = build_nested_in_list_predicate(
772
2
            literal_exprs, target.file_projection.column_id(), target.leaf_name, target.leaf_type);
773
2
    if (predicate == nullptr) {
774
0
        return false;
775
0
    }
776
2
    if (!set_nested_column_filter_target(target, column_filter)) {
777
0
        return false;
778
0
    }
779
2
    column_filter->predicates.push_back(std::move(predicate));
780
2
    return true;
781
2
}
782
783
static bool extract_nested_null_filter(const VExprSPtr& expr,
784
                                       const std::vector<ColumnMapping>& mappings,
785
49
                                       FileColumnPredicateFilter* column_filter) {
786
49
    DORIS_CHECK(column_filter != nullptr);
787
49
    bool is_null = false;
788
49
    if (!is_null_predicate_function(expr, &is_null)) {
789
47
        return false;
790
47
    }
791
792
2
    NestedStructPath path;
793
2
    if (!extract_nested_struct_path_for_pruning(expr->children()[0], &path)) {
794
0
        return false;
795
0
    }
796
797
2
    NestedPredicateTarget target;
798
2
    if (!resolve_nested_predicate_target(path, mappings, &target)) {
799
0
        return false;
800
0
    }
801
2
    auto predicate = build_nested_null_predicate(is_null, target.file_projection.column_id(),
802
2
                                                 target.leaf_name, target.leaf_type);
803
2
    if (predicate == nullptr) {
804
0
        return false;
805
0
    }
806
2
    if (!set_nested_column_filter_target(target, column_filter)) {
807
0
        return false;
808
0
    }
809
2
    column_filter->predicates.push_back(std::move(predicate));
810
2
    return true;
811
2
}
812
813
} // namespace
814
815
// Initializes the VLiteral base with the file-side type and value, and keeps the original
// type/value pair alongside it in `_original_type` / `_original_field`.
SplitLocalFileLiteral::SplitLocalFileLiteral(const DataTypePtr& file_type, const Field& file_field,
                                             DataTypePtr original_type, Field original_field)
        : VLiteral(file_type, file_field),
          // Both originals are taken by value, so they are moved into their members.
          _original_type(std::move(original_type)),
          _original_field(std::move(original_field)) {
}
820
821
216
// Wraps the slot ref's column id in a GlobalIndex. The column id must be non-negative.
GlobalIndex slot_ref_global_index(const VSlotRef& slot_ref) {
    DORIS_CHECK(slot_ref.column_id() >= 0);
    const auto index = cast_set<size_t>(slot_ref.column_id());
    return GlobalIndex(index);
}
825
826
574
bool is_struct_element_expr(const VExprSPtr& expr) {
827
574
    if (expr == nullptr || expr->get_num_children() != 2) {
828
314
        return false;
829
314
    }
830
260
    const auto& function_name = expr->fn().name.function_name;
831
260
    if (function_name == "struct_element") {
832
138
        return true;
833
138
    }
834
122
    if (function_name != "element_at") {
835
93
        return false;
836
93
    }
837
29
    const auto& parent_type = expr->children()[0]->data_type();
838
29
    return parent_type != nullptr &&
839
29
           remove_nullable(parent_type)->get_primitive_type() == TYPE_STRUCT;
840
122
}
841
842
159
// Reads the value of a literal expression as a Field, taken from row 0 of the literal's column.
// `literal_expr` must be a non-null VLiteral.
Field literal_field(const VExprSPtr& literal_expr) {
    DORIS_CHECK(literal_expr != nullptr);
    DORIS_CHECK(literal_expr->is_literal());
    const auto* literal = dynamic_cast<const VLiteral*>(literal_expr.get());
    DORIS_CHECK(literal != nullptr);

    const auto literal_column = literal->get_column_ptr();
    Field value;
    literal_column->get(0, value);
    return value;
}
851
852
bool resolve_nested_struct_path_for_file(const NestedStructPath& path,
853
                                         const std::vector<ColumnMapping>& mappings,
854
                                         ResolvedNestedStructPath* resolved,
855
62
                                         bool require_scan_projection) {
856
62
    DORIS_CHECK(resolved != nullptr);
857
62
    *resolved = {};
858
65
    const auto mapping_it = std::ranges::find_if(mappings, [&](const ColumnMapping& mapping) {
859
65
        return mapping.global_index == path.root_global_index;
860
65
    });
861
62
    if (mapping_it == mappings.end() || !mapping_it->file_local_id.has_value() ||
862
62
        path.selectors.empty()) {
863
0
        return false;
864
0
    }
865
866
    // Prefer ColumnMapping over schema-name lookup. This is the only path that can correctly
867
    // localize renamed Iceberg fields: a table filter `element_at(s, 'renamed_b')` must become a
868
    // file filter on physical child `b`, even if the old file type is `STRUCT<b ...>`.
869
62
    const auto mapping_result =
870
62
            resolve_nested_projection_with_mapping(path, mappings, &resolved->file_projection);
871
62
    if (mapping_result == NestedProjectionResolveResult::MISSING_FILE_CHILD) {
872
0
        return false;
873
0
    }
874
62
    if (mapping_result == NestedProjectionResolveResult::NOT_REPRESENTED) {
875
48
        if (!table_root_is_struct(*mapping_it)) {
876
6
            return false;
877
6
        }
878
42
        LocalColumnIndex child_projection;
879
42
        if (!build_file_child_projection_from_schema(mapping_it->original_file_children,
880
42
                                                     path.selectors, &child_projection)
881
42
                     .ok() ||
882
42
            child_projection.local_id() < 0) {
883
0
            return false;
884
0
        }
885
42
        resolved->file_projection = LocalColumnIndex::partial_local(*mapping_it->file_local_id);
886
42
        resolved->file_projection.children.push_back(std::move(child_projection));
887
42
    }
888
889
56
    if (resolved->file_projection.children.size() != 1) {
890
0
        *resolved = {};
891
0
        return false;
892
0
    }
893
    // When rewriting the final localized element_at chain, it executes on the file column produced
894
    // by this scan, so the intermediate return types must match the projected file shape, not the
895
    // full historical file schema. Example:
896
    //   SELECT s.c WHERE element_at(element_at(s, 'b'), 'cc') LIKE 'NestedC%'
897
    // reads only b.cc and c; the inner element_at(s, 'b') returns Struct(cc), not
898
    // Struct(cc, new_dd).
899
    //
900
    // Earlier projection collection also calls this resolver before filter-only children have been
901
    // merged into the scan projection. That phase only needs the file path, so it still resolves
902
    // names/types from the original file schema.
903
56
    const auto& child_source = require_scan_projection ? scan_file_children(*mapping_it)
904
56
                                                       : mapping_it->original_file_children;
905
56
    if (!collect_file_child_names_from_projection(
906
56
                child_source, resolved->file_projection.children[0], &resolved->file_child_names,
907
56
                &resolved->file_child_types) ||
908
56
        resolved->file_child_names.size() != path.selectors.size() ||
909
56
        resolved->file_child_types.size() != path.selectors.size()) {
910
0
        *resolved = {};
911
0
        return false;
912
0
    }
913
56
    return true;
914
56
}
915
916
bool resolve_nested_struct_expr_for_file(const VExprSPtr& expr,
917
                                         const std::vector<ColumnMapping>& mappings,
918
23
                                         ResolvedNestedStructPath* resolved) {
919
23
    DORIS_CHECK(resolved != nullptr);
920
23
    NestedStructPath path;
921
23
    if (!extract_nested_struct_path(expr, &path)) {
922
1
        *resolved = {};
923
1
        return false;
924
1
    }
925
22
    return resolve_nested_struct_path_for_file(path, mappings, resolved, true);
926
23
}
927
928
// Collect nested struct leaf references that can be turned into file-reader projections and
929
// primitive pruning predicates. For example, from `s.a > 1 AND element_at(s, 'b') = 2`, this
930
// records two paths rooted at `s`: `s -> a` and `s -> b`. Non-struct expressions are traversed
931
// recursively, while a recognized struct path is emitted once so the caller can merge it into the
932
// scan projection for that top-level file column.
933
288
void collect_nested_struct_paths(const VExprSPtr& expr, std::vector<NestedStructPath>* paths) {
934
288
    DORIS_CHECK(paths != nullptr);
935
288
    if (expr == nullptr) {
936
1
        return;
937
1
    }
938
287
    NestedStructPath path;
939
287
    if (extract_nested_struct_path_for_pruning(expr, &path)) {
940
44
        paths->push_back(std::move(path));
941
44
        return;
942
44
    }
943
243
    for (const auto& child : expr->children()) {
944
189
        collect_nested_struct_paths(child, paths);
945
189
    }
946
243
}
947
948
std::vector<const ColumnMapping*> present_child_mappings_in_file_order(
949
223
        const std::vector<ColumnMapping>& child_mappings) {
950
223
    std::vector<const ColumnMapping*> result;
951
223
    result.reserve(child_mappings.size());
952
223
    for (const auto& child_mapping : child_mappings) {
953
172
        if (child_mapping.file_local_id.has_value()) {
954
141
            result.push_back(&child_mapping);
955
141
        }
956
172
    }
957
223
    std::ranges::sort(result, [](const ColumnMapping* lhs, const ColumnMapping* rhs) {
958
52
        DORIS_CHECK(lhs->file_local_id.has_value());
959
52
        DORIS_CHECK(rhs->file_local_id.has_value());
960
52
        return *lhs->file_local_id < *rhs->file_local_id;
961
52
    });
962
223
    return result;
963
223
}
964
965
// Build the nested child projection under a top-level file column by walking file schema children
966
// directly. The returned projection does not include the root column id; callers attach it under a
967
// `LocalColumnIndex::partial_local(root_id)` when merging into the scan request.
968
Status build_file_child_projection_from_schema(const std::vector<ColumnDefinition>& children,
969
                                               std::span<const StructChildSelector> selectors,
970
53
                                               LocalColumnIndex* projection) {
971
53
    DORIS_CHECK(projection != nullptr);
972
53
    if (selectors.empty()) {
973
0
        return Status::InvalidArgument("Nested struct selector path is empty");
974
0
    }
975
53
    const auto* child = resolve_file_child(children, selectors.front());
976
53
    if (child == nullptr) {
977
0
        return Status::OK();
978
0
    }
979
53
    *projection = LocalColumnIndex::local(child->file_local_id());
980
53
    projection->project_all_children = selectors.size() == 1;
981
53
    projection->children.clear();
982
53
    if (selectors.size() == 1) {
983
45
        return Status::OK();
984
45
    }
985
8
    if (child->children.empty() ||
986
8
        remove_nullable(child->type)->get_primitive_type() != TYPE_STRUCT) {
987
0
        *projection = LocalColumnIndex {};
988
0
        return Status::OK();
989
0
    }
990
8
    LocalColumnIndex child_projection;
991
8
    RETURN_IF_ERROR(build_file_child_projection_from_schema(child->children, selectors.subspan(1),
992
8
                                                            &child_projection));
993
8
    if (child_projection.local_id() < 0) {
994
0
        *projection = LocalColumnIndex {};
995
0
        return Status::OK();
996
0
    }
997
8
    projection->children.push_back(std::move(child_projection));
998
8
    return Status::OK();
999
8
}
1000
1001
// Merge predicates that target the same physical file column or nested leaf. For example,
1002
// `WHERE s.b > 1 AND s.b < 10` produces two predicates for the same target `s -> b`; keeping them
1003
// in one FileColumnPredicateFilter lets the file reader apply both pruning checks to the same leaf
1004
// instead of carrying duplicate target entries.
1005
void merge_column_predicate_filter(FileColumnPredicateFilter column_filter,
1006
30
                                   std::vector<FileColumnPredicateFilter>* filters) {
1007
30
    DORIS_CHECK(filters != nullptr);
1008
30
    auto existing_filter_it = std::ranges::find_if(*filters, [&](const auto& existing_filter) {
1009
5
        return existing_filter.same_target_as(column_filter);
1010
5
    });
1011
30
    if (existing_filter_it == filters->end()) {
1012
25
        filters->push_back(std::move(column_filter));
1013
25
        return;
1014
25
    }
1015
5
    existing_filter_it->predicates.insert(existing_filter_it->predicates.end(),
1016
5
                                          column_filter.predicates.begin(),
1017
5
                                          column_filter.predicates.end());
1018
5
}
1019
1020
// Extract file-column pruning predicates from localized row-level conjuncts that reference nested
1021
// struct leaves. This is separate from file_request->conjuncts: conjuncts do row filtering, while
1022
// FileColumnPredicateFilter carries primitive leaf predicates for file/page/statistics pruning.
1023
//
1024
// Example: for `WHERE s.b.c > 10 AND element_at(s, 'd') IS NOT NULL`, this function emits pruning
1025
// filters for the nested targets `s -> b -> c` and `s -> d`. The caller only invokes it after
1026
// table_filter_has_only_local_entries() succeeds, so each root slot already has a file-local scan
1027
// source in _filter_entries.
1028
void collect_nested_column_predicate_filters(const VExprSPtr& expr,
1029
                                             const std::vector<ColumnMapping>& mappings,
1030
67
                                             std::vector<FileColumnPredicateFilter>* filters) {
1031
67
    DORIS_CHECK(filters != nullptr);
1032
67
    if (expr == nullptr) {
1033
0
        return;
1034
0
    }
1035
67
    if (expr->node_type() == TExprNodeType::COMPOUND_PRED &&
1036
67
        expr->op() == TExprOpcode::COMPOUND_AND) {
1037
8
        for (const auto& child : expr->children()) {
1038
8
            collect_nested_column_predicate_filters(child, mappings, filters);
1039
8
        }
1040
4
        return;
1041
4
    }
1042
63
    FileColumnPredicateFilter column_filter;
1043
63
    if (extract_nested_binary_comparison_filter(expr, mappings, &column_filter) ||
1044
63
        extract_nested_in_list_filter(expr, mappings, &column_filter) ||
1045
63
        extract_nested_null_filter(expr, mappings, &column_filter)) {
1046
16
        merge_column_predicate_filter(std::move(column_filter), filters);
1047
16
    }
1048
63
}
1049
1050
} // namespace doris::format