Coverage Report

Created: 2026-06-25 12:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/scan/access_path_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/scan/access_path_parser.h"
19
20
#include <fmt/format.h>
21
22
#include <algorithm>
23
#include <charconv>
24
#include <map>
25
#include <string>
26
#include <string_view>
27
#include <utility>
28
29
#include "common/cast_set.h"
30
#include "common/consts.h"
31
#include "core/assert_cast.h"
32
#include "core/data_type/data_type.h"
33
#include "core/data_type/data_type_array.h"
34
#include "core/data_type/data_type_map.h"
35
#include "core/data_type/data_type_nullable.h"
36
#include "core/data_type/data_type_struct.h"
37
#include "runtime/descriptors.h"
38
#include "util/string_util.h"
39
40
namespace doris {
41
namespace {
42
43
27
bool is_scanner_materialized_virtual_column(const std::string& column_name) {
44
27
    return column_name == BeConsts::ICEBERG_ROWID_COL;
45
27
}
46
47
22
bool parse_non_negative_int(std::string_view value, int32_t* result) {
48
22
    DORIS_CHECK(result != nullptr);
49
22
    int32_t parsed = -1;
50
22
    const auto* begin = value.data();
51
22
    const auto* end = begin + value.size();
52
22
    const auto [ptr, ec] = std::from_chars(begin, end, parsed);
53
22
    if (ec != std::errc() || ptr != end || parsed < 0) {
54
19
        return false;
55
19
    }
56
3
    *result = parsed;
57
3
    return true;
58
22
}
59
60
2
std::string access_path_to_string(const std::vector<std::string>& path) {
61
2
    return fmt::format("{}", fmt::join(path, "."));
62
2
}
63
64
// Returns the child of `parent` that matches either `id` (when the child carries an identifier
// field id) or `name` (exact, case-sensitive). If no child matches, a new child is appended with
// the given id, name and type, no children, no default expression and is_partition_key == false.
//
// `parent` must not be null. The returned pointer refers to an element of parent->children.
// NOTE(review): presumably `children` is a std::vector, in which case the pointer is invalidated
// by any later append to the same parent - confirm against ColumnDefinition.
format::ColumnDefinition* find_or_add_child(format::ColumnDefinition* parent, int32_t id,
                                            std::string name, DataTypePtr type) {
    DORIS_CHECK(parent != nullptr);
    auto& siblings = parent->children;

    const auto is_same_child = [&](const format::ColumnDefinition& candidate) {
        const bool same_id =
                candidate.has_identifier_field_id() && candidate.get_identifier_field_id() == id;
        return same_id || candidate.name == name;
    };
    if (const auto existing = std::ranges::find_if(siblings, is_same_child);
        existing != siblings.end()) {
        return &*existing;
    }

    siblings.push_back({
            .identifier = Field::create_field<TYPE_INT>(id),
            .name = std::move(name),
            .type = std::move(type),
            .children = {},
            .default_expr = nullptr,
            .is_partition_key = false,
    });
    return &siblings.back();
}
83
84
void inherit_schema_metadata(format::ColumnDefinition* column,
85
30
                             const format::ColumnDefinition* schema_column) {
86
30
    if (column == nullptr || schema_column == nullptr) {
87
1
        return;
88
1
    }
89
29
    column->name_mapping = schema_column->name_mapping;
90
29
}
91
92
// Resolves one access-path token against the direct children of `schema_column`.
//
// - A token that parses as a non-negative integer is treated as a field id: the first child whose
//   identifier field id equals it is returned. No name lookup is attempted for such a token.
// - Any other token is matched case-insensitively against each child's name and then against
//   each entry of the child's name_mapping (its aliases).
//
// Returns nullptr when `schema_column` is null or no child matches. The returned pointer refers
// to an element of schema_column->children.
const format::ColumnDefinition* find_schema_child_by_path(
        const format::ColumnDefinition* schema_column, const std::string& child_path) {
    if (schema_column == nullptr) {
        return nullptr;
    }
    const auto& children = schema_column->children;

    int32_t parsed_field_id = -1;
    if (parse_non_negative_int(child_path, &parsed_field_id)) {
        const auto child_it =
                std::ranges::find_if(children, [&](const format::ColumnDefinition& child) {
                    return child.has_identifier_field_id() &&
                           child.get_identifier_field_id() == parsed_field_id;
                });
        return child_it == children.end() ? nullptr : &*child_it;
    }

    // Lower-case the requested token once instead of once per child and per alias comparison.
    const auto wanted = to_lower(child_path);
    const auto child_it =
            std::ranges::find_if(children, [&](const format::ColumnDefinition& child) {
                if (to_lower(child.name) == wanted) {
                    return true;
                }
                return std::ranges::any_of(child.name_mapping, [&](const std::string& alias) {
                    return to_lower(alias) == wanted;
                });
            });
    return child_it == children.end() ? nullptr : &*child_it;
}
116
117
33
// Returns the identifier field id of `schema_column`, or -1 when the column is null or carries
// no identifier field id.
int32_t schema_field_id(const format::ColumnDefinition* schema_column) {
    if (schema_column != nullptr && schema_column->has_identifier_field_id()) {
        return schema_column->get_identifier_field_id();
    }
    return -1;
}
123
124
23
// Like schema_field_id(), but substitutes `fallback` whenever the schema column yields no
// non-negative field id.
int32_t schema_field_id_or(const format::ColumnDefinition* schema_column, int32_t fallback) {
    const int32_t resolved = schema_field_id(schema_column);
    if (resolved < 0) {
        return fallback;
    }
    return resolved;
}
128
129
// Returns the name of `schema_column` when the column exists and its name is non-empty;
// otherwise returns `fallback`.
std::string schema_field_name_or(const format::ColumnDefinition* schema_column,
                                 std::string fallback) {
    if (schema_column != nullptr && !schema_column->name.empty()) {
        return schema_column->name;
    }
    return fallback;
}
134
135
// One node of the access-path tree built from a slot's access paths.
struct AccessPathNode {
    // True when an access path ends at this node, i.e. the whole subtree is requested. The
    // helpers in this file clear `children` whenever they set this flag.
    bool project_all = false;
    // Sub-paths keyed by the next path token (a field name or id, or one of the tokens "*",
    // "KEYS", "VALUES", "OFFSET"). std::map keeps iteration in sorted key order.
    std::map<std::string, AccessPathNode> children;
};
139
140
9
// Merges the projection described by `src` into `*dst`.
//
// - If `dst` already projects everything, nothing changes.
// - If `src` projects everything, `dst` becomes a project-all node and drops its children.
// - Otherwise every child of `src` is merged recursively into the same-named child of `dst`,
//   creating that child when it does not exist yet.
//
// `dst` must not be null.
void merge_access_path_node(AccessPathNode* dst, const AccessPathNode& src) {
    DORIS_CHECK(dst != nullptr);
    if (dst->project_all) {
        return;
    }
    if (!src.project_all) {
        for (const auto& entry : src.children) {
            merge_access_path_node(&dst->children[entry.first], entry.second);
        }
        return;
    }
    dst->project_all = true;
    dst->children.clear();
}
154
155
void insert_access_path(AccessPathNode* root, const std::vector<std::string>& path,
156
47
                        size_t path_idx) {
157
47
    DORIS_CHECK(root != nullptr);
158
47
    if (root->project_all) {
159
0
        return;
160
0
    }
161
47
    if (path_idx >= path.size()) {
162
22
        root->project_all = true;
163
22
        root->children.clear();
164
22
        return;
165
22
    }
166
25
    insert_access_path(&root->children[path[path_idx]], path, path_idx + 1);
167
25
}
168
169
// Forward declaration: the struct/array/map builders below and this dispatcher are mutually
// recursive. `path` is the textual access path of `column`, used in error messages;
// `schema_column` may be null when no table schema is available.
Status build_nested_children_from_access_node(
        format::ColumnDefinition* column, const DataTypePtr& type, const AccessPathNode& node,
        const std::string& path, const format::ColumnDefinition* schema_column);
173
174
// Expand a full complex-column projection into table-schema children when the table format provides
175
// an external/current schema. Without this, `SELECT complex_col` or `SELECT *` leaves
176
// ColumnDefinition::children empty, so ColumnMapper treats the root complex column as a scalar
177
// mapping and later tries to cast the old file shape to the current table shape directly.
178
//
179
// Examples:
180
//   - STRUCT country/city projected from an old file STRUCT country/population/location should
181
//     create children country and city, so city can be materialized as missing/default.
182
//   - ARRAY<STRUCT<item, quantity>> should create the array element wrapper and then the element
183
//     struct children item and quantity.
184
//   - MAP<STRING, STRUCT<full_name, age>> should create semantic children key/value directly, then
185
//     expand the value struct children full_name and age. Do not introduce a physical entries
186
//     wrapper here: ColumnMapper and TableReader treat MAP children as [key, value].
187
Status build_all_nested_children_from_schema(format::ColumnDefinition* column,
188
                                             const DataTypePtr& type, const std::string& path,
189
28
                                             const format::ColumnDefinition* schema_column) {
190
28
    DORIS_CHECK(column != nullptr);
191
192
28
    const auto nested_type = remove_nullable(type);
193
28
    AccessPathNode project_all;
194
28
    project_all.project_all = true;
195
28
    switch (nested_type->get_primitive_type()) {
196
4
    case TYPE_STRUCT: {
197
4
        const auto& struct_type = assert_cast<const DataTypeStruct&>(*nested_type);
198
14
        for (size_t field_idx = 0; field_idx < struct_type.get_elements().size(); ++field_idx) {
199
10
            const auto field_name = struct_type.get_element_name(field_idx);
200
10
            const auto* schema_child = find_schema_child_by_path(schema_column, field_name);
201
10
            auto* child = find_or_add_child(
202
10
                    column, schema_field_id_or(schema_child, cast_set<int32_t>(field_idx)),
203
10
                    schema_field_name_or(schema_child, field_name),
204
10
                    struct_type.get_element(field_idx));
205
10
            inherit_schema_metadata(child, schema_child);
206
10
            RETURN_IF_ERROR(build_nested_children_from_access_node(
207
10
                    child, child->type, project_all, path + "." + child->name, schema_child));
208
10
        }
209
4
        return Status::OK();
210
4
    }
211
1
    case TYPE_ARRAY: {
212
1
        const auto& array_type = assert_cast<const DataTypeArray&>(*nested_type);
213
1
        const auto* element_schema = schema_column != nullptr && !schema_column->children.empty()
214
1
                                             ? &schema_column->children[0]
215
1
                                             : nullptr;
216
1
        auto* child = find_or_add_child(column, schema_field_id_or(element_schema, 0), "element",
217
1
                                        array_type.get_nested_type());
218
1
        inherit_schema_metadata(child, element_schema);
219
1
        return build_nested_children_from_access_node(child, child->type, project_all, path + ".*",
220
1
                                                      element_schema);
221
4
    }
222
1
    case TYPE_MAP: {
223
1
        const auto& map_type = assert_cast<const DataTypeMap&>(*nested_type);
224
1
        const auto* key_schema = schema_column != nullptr && !schema_column->children.empty()
225
1
                                         ? &schema_column->children[0]
226
1
                                         : nullptr;
227
1
        const auto* value_schema = schema_column != nullptr && schema_column->children.size() > 1
228
1
                                           ? &schema_column->children[1]
229
1
                                           : nullptr;
230
1
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
231
1
                                            map_type.get_key_type());
232
1
        inherit_schema_metadata(key_child, key_schema);
233
1
        RETURN_IF_ERROR(build_nested_children_from_access_node(
234
1
                key_child, key_child->type, project_all, path + ".KEYS", key_schema));
235
1
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1), "value",
236
1
                                              map_type.get_value_type());
237
1
        inherit_schema_metadata(value_child, value_schema);
238
1
        RETURN_IF_ERROR(build_nested_children_from_access_node(
239
1
                value_child, value_child->type, project_all, path + ".VALUES", value_schema));
240
1
        return Status::OK();
241
1
    }
242
22
    default:
243
22
        return Status::OK();
244
28
    }
245
28
}
246
247
// Builds the children of a STRUCT column for a partial projection described by `node`.
//
// Each child token of `node` is resolved to one struct field, first through `schema_column`
// (field id, name or alias) and, when that gives no usable id or type, by a case-insensitive name
// match against `struct_type`, in which case the field ordinal is used as the child id.
// Returns NotSupported for the tokens "OFFSET", "*", "KEYS" and "VALUES" and for tokens that
// resolve to no field. `column` must not be null.
Status build_struct_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypeStruct& struct_type,
                                              const AccessPathNode& node, const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    for (const auto& entry : node.children) {
        const std::string& token = entry.first;
        const AccessPathNode& sub_node = entry.second;

        // Struct children are resolved by name or schema field id. We do not treat a numeric
        // child token as a struct ordinal, because `col.0` becomes ambiguous once the struct
        // evolves. Position-based access needs a separate design if it is required later.
        const bool is_container_token =
                token == "OFFSET" || token == "*" || token == "KEYS" || token == "VALUES";
        if (is_container_token) {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}",
                    path + "." + token, column->name);
        }

        // Prefer the table/schema ColumnDefinition because it carries field ids and aliases.
        // Fallback to the struct type name only for formats without external schema metadata.
        const auto* schema_child = find_schema_child_by_path(schema_column, token);
        int32_t resolved_id = schema_field_id(schema_child);
        std::string resolved_name = token;
        DataTypePtr resolved_type = nullptr;
        if (schema_child != nullptr) {
            resolved_name = schema_child->name;
            resolved_type = schema_child->type;
        }

        const bool needs_type_lookup = resolved_id < 0 || resolved_type == nullptr;
        if (needs_type_lookup) {
            const size_t field_count = struct_type.get_elements().size();
            size_t pos = 0;
            while (pos < field_count &&
                   to_lower(struct_type.get_element_name(pos)) != to_lower(resolved_name)) {
                ++pos;
            }
            if (pos < field_count) {
                // First field whose name matches case-insensitively wins.
                resolved_id = cast_set<int32_t>(pos);
                resolved_name = struct_type.get_element_name(pos);
                resolved_type = struct_type.get_element(pos);
            }
        }

        if (resolved_id < 0 || resolved_type == nullptr) {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}",
                    path + "." + token, column->name);
        }

        // TODO: For TVF Parquet files without field ids, this fallback uses the struct ordinal as
        // the table child identifier. BY_NAME mapping should instead keep a string identifier and
        // let TableColumnMapper resolve the file-local child id from the Parquet schema.
        auto* child = find_or_add_child(column, resolved_id, resolved_name, resolved_type);
        inherit_schema_metadata(child, schema_child);
        RETURN_IF_ERROR(build_nested_children_from_access_node(child, child->type, sub_node,
                                                               path + "." + token, schema_child));
    }
    return Status::OK();
}
295
296
// Builds the [key, value] children of a MAP column for a partial projection described by `node`.
//
// Recognized child tokens:
//   - "KEYS":   merge the sub-projection into the key side.
//   - "VALUES": project the key fully and merge the sub-projection into the value side.
//   - "*":      handled exactly like "VALUES".
// Every other token, including "OFFSET", yields NotSupported. `column` must not be null.
Status build_map_children_from_access_node(format::ColumnDefinition* column,
                                           const DataTypeMap& map_type, const AccessPathNode& node,
                                           const std::string& path,
                                           const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    AccessPathNode key_projection;
    AccessPathNode value_projection;
    bool wants_key = false;
    bool wants_value = false;

    for (const auto& entry : node.children) {
        const std::string& token = entry.first;
        const AccessPathNode& sub_node = entry.second;

        if (token == "KEYS") {
            wants_key = true;
            merge_access_path_node(&key_projection, sub_node);
        } else if (token == "VALUES" || token == "*") {
            // Values are only addressable together with the complete key column.
            wants_key = true;
            key_projection.project_all = true;
            key_projection.children.clear();
            wants_value = true;
            merge_access_path_node(&value_projection, sub_node);
        } else {
            // "OFFSET" and any unknown token are rejected with the same message.
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}",
                    path + "." + token, column->name);
        }
    }

    if (wants_key && !wants_value) {
        // A key-only MAP projection is not independently materializable yet. FileScannerV2 can
        // describe a projection such as `m.KEYS`, but the downstream file block -> table block
        // path still builds a ColumnMap from key column + value column + offsets. If the value
        // child is omitted here, TableReader/ColumnMapper cannot reconstruct a valid table MAP
        // column even though the query only needs keys.
        //
        // Example:
        //   SELECT map_keys(m) FROM t;
        // or
        //   SELECT * FROM t WHERE array_contains(map_keys(m), 'k1');
        //
        // The access path only asks for `m.KEYS`, but the scan still has to read `m.VALUES` as a
        // temporary full projection until map materialization supports constructing a table MAP
        // from keys only.
        wants_value = true;
        value_projection.project_all = true;
        value_projection.children.clear();
    }

    if (!(wants_key || wants_value)) {
        return Status::OK();
    }

    const format::ColumnDefinition* key_schema = nullptr;
    const format::ColumnDefinition* value_schema = nullptr;
    if (schema_column != nullptr) {
        const auto& schema_children = schema_column->children;
        if (!schema_children.empty()) {
            key_schema = &schema_children[0];
        }
        if (schema_children.size() > 1) {
            value_schema = &schema_children[1];
        }
    }

    if (wants_key) {
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
                                            map_type.get_key_type());
        inherit_schema_metadata(key_child, key_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                key_child, key_child->type, key_projection, path + ".KEYS", key_schema));
    }
    if (wants_value) {
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1), "value",
                                              map_type.get_value_type());
        inherit_schema_metadata(value_child, value_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                value_child, value_child->type, value_projection, path + ".VALUES",
                value_schema));
    }
    return Status::OK();
}
382
383
// Dispatches on the (nullable-stripped) type of `column` to build its nested children for the
// projection described by `node`.
//
// A project-all node or a node without children expands the complete subtree via
// build_all_nested_children_from_schema(). Otherwise STRUCT and MAP are delegated to their
// dedicated builders, ARRAY accepts exactly one child token "*", and any other type yields
// NotSupported. `column` must not be null.
Status build_nested_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypePtr& type, const AccessPathNode& node,
                                              const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    const bool wants_whole_subtree = node.project_all || node.children.empty();
    if (wants_whole_subtree) {
        return build_all_nested_children_from_schema(column, type, path, schema_column);
    }

    const auto unwrapped_type = remove_nullable(type);
    const auto primitive_type = unwrapped_type->get_primitive_type();

    if (primitive_type == TYPE_STRUCT) {
        const auto& struct_type = assert_cast<const DataTypeStruct&>(*unwrapped_type);
        return build_struct_children_from_access_node(column, struct_type, node, path,
                                                      schema_column);
    }

    if (primitive_type == TYPE_MAP) {
        const auto& map_type = assert_cast<const DataTypeMap&>(*unwrapped_type);
        return build_map_children_from_access_node(column, map_type, node, path, schema_column);
    }

    if (primitive_type == TYPE_ARRAY) {
        // The only valid way into an array is through its element wildcard.
        const auto element_node = node.children.find("*");
        if (node.children.size() != 1 || element_node == node.children.end()) {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}", path,
                    column->name);
        }
        const auto& array_type = assert_cast<const DataTypeArray&>(*unwrapped_type);
        const format::ColumnDefinition* element_schema = nullptr;
        if (schema_column != nullptr && !schema_column->children.empty()) {
            element_schema = &schema_column->children[0];
        }
        auto* element = find_or_add_child(column, schema_field_id_or(element_schema, 0), "element",
                                          array_type.get_nested_type());
        inherit_schema_metadata(element, element_schema);
        return build_nested_children_from_access_node(element, element->type,
                                                      element_node->second, path + ".*",
                                                      element_schema);
    }

    return Status::NotSupported("AccessPathParser does not support access path {} for slot {}",
                                path, column->name);
}
422
423
} // namespace
424
425
// Populates column->children from the slot's access paths.
//
// Returns OK without touching `column` when it is a scanner-materialized virtual column or when
// its type is not complex. Otherwise every access path must be a DATA path whose first token is
// either the column name (case-insensitive) or the column's identifier field id; the remaining
// tokens are merged into an AccessPathNode tree that then drives the recursive child
// construction. An empty access-path list, or an empty path, requests the whole column.
//
// Returns NotSupported for non-DATA paths, for paths whose first token does not match the
// column, and for path shapes the nested builders reject. `column` must not be null;
// `schema_column` may be null.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const std::vector<TColumnAccessPath>& access_paths,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    if (is_scanner_materialized_virtual_column(column->name)) {
        return Status::OK();
    }
    if (!is_complex_type(remove_nullable(column->type)->get_primitive_type())) {
        return Status::OK();
    }

    // The column name does not change while the paths are validated, so lower-case it once
    // instead of once per access path.
    const auto lowered_column_name = to_lower(column->name);

    AccessPathNode root;
    // Build tree for AccessPathNode.
    // For example, for access paths ["a.b", "a.c", "d"], the tree will be:
    // root
    // ├── a
    // │   ├── b
    // │   └── c
    // └── d
    for (const auto& access_path : access_paths) {
        // TODO: Support META access paths if needed. Currently AccessPathParser only supports
        // DATA access paths.
        if (access_path.type != TAccessPathType::DATA || !access_path.__isset.data_access_path) {
            return Status::NotSupported(
                    "AccessPathParser only supports DATA access paths for slot {}", column->name);
        }
        const auto& path = access_path.data_access_path.path;
        if (path.empty()) {
            // An empty path addresses the column itself, i.e. a full projection.
            insert_access_path(&root, path, 0);
            continue;
        }
        // The first token must name this column, either by name or by its field id.
        int32_t top_level_id = -1;
        if (to_lower(path.front()) != lowered_column_name &&
            (!parse_non_negative_int(path.front(), &top_level_id) ||
             !column->has_identifier_field_id() ||
             top_level_id != column->get_identifier_field_id())) {
            return Status::NotSupported("AccessPathParser access path {} does not match slot {}",
                                        access_path_to_string(path), column->name);
        }
        // Skip the leading column token; only the nested part goes into the tree.
        insert_access_path(&root, path, 1);
    }
    // Recursively build nested children for the column based on the AccessPathNode tree.
    return build_nested_children_from_access_node(column, column->type, root, column->name,
                                                  schema_column);
}
470
471
// Convenience overload: forwards to the access-path overload using
// slot_desc->all_access_paths(). Neither `column` nor `slot_desc` may be null.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const SlotDescriptor* slot_desc,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    DORIS_CHECK(slot_desc != nullptr);
    const auto& slot_access_paths = slot_desc->all_access_paths();
    return build_nested_children(column, slot_access_paths, schema_column);
}
478
479
} // namespace doris