Coverage Report

Created: 2026-07-02 14:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/scan/access_path_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/scan/access_path_parser.h"
19
20
#include <fmt/format.h>
21
22
#include <algorithm>
23
#include <charconv>
24
#include <map>
25
#include <string>
26
#include <string_view>
27
#include <utility>
28
29
#include "common/cast_set.h"
30
#include "common/consts.h"
31
#include "core/assert_cast.h"
32
#include "core/data_type/data_type.h"
33
#include "core/data_type/data_type_array.h"
34
#include "core/data_type/data_type_map.h"
35
#include "core/data_type/data_type_nullable.h"
36
#include "core/data_type/data_type_struct.h"
37
#include "runtime/descriptors.h"
38
#include "util/string_util.h"
39
40
namespace doris {
41
namespace {
42
43
54
bool is_scanner_materialized_virtual_column(const std::string& column_name) {
44
54
    return column_name == BeConsts::ICEBERG_ROWID_COL;
45
54
}
46
47
44
bool parse_non_negative_int(std::string_view value, int32_t* result) {
48
44
    DORIS_CHECK(result != nullptr);
49
44
    int32_t parsed = -1;
50
44
    const auto* begin = value.data();
51
44
    const auto* end = begin + value.size();
52
44
    const auto [ptr, ec] = std::from_chars(begin, end, parsed);
53
44
    if (ec != std::errc() || ptr != end || parsed < 0) {
54
38
        return false;
55
38
    }
56
6
    *result = parsed;
57
6
    return true;
58
44
}
59
60
4
std::string access_path_to_string(const std::vector<std::string>& path) {
61
4
    return fmt::format("{}", fmt::join(path, "."));
62
4
}
63
64
// Returns the first child of `parent` whose identifier field id equals `id` or whose name
// equals `name`. When no child matches, a new child is appended with the given id, name and
// type, no grandchildren, no default expression and `is_partition_key == false`.
//
// The returned pointer refers to an element of `parent->children`; later insertions into that
// container may invalidate it, so use it before adding further siblings.
format::ColumnDefinition* find_or_add_child(format::ColumnDefinition* parent, int32_t id,
                                            std::string name, DataTypePtr type) {
    DORIS_CHECK(parent != nullptr);
    auto& siblings = parent->children;
    const auto is_same_field = [&](const auto& candidate) {
        const bool same_id =
                candidate.has_identifier_field_id() && candidate.get_identifier_field_id() == id;
        return same_id || candidate.name == name;
    };
    const auto existing = std::ranges::find_if(siblings, is_same_field);
    if (existing != siblings.end()) {
        return &*existing;
    }
    siblings.push_back({
            .identifier = Field::create_field<TYPE_INT>(id),
            .name = std::move(name),
            .type = std::move(type),
            .children = {},
            .default_expr = nullptr,
            .is_partition_key = false,
    });
    return &siblings.back();
}
83
84
void inherit_schema_metadata(format::ColumnDefinition* column,
85
60
                             const format::ColumnDefinition* schema_column) {
86
60
    if (column == nullptr || schema_column == nullptr) {
87
2
        return;
88
2
    }
89
58
    column->name_mapping = schema_column->name_mapping;
90
58
}
91
92
// Looks up the direct child of `schema_column` addressed by one access-path token.
//
// - Returns nullptr when `schema_column` is null.
// - A token that parses as a non-negative integer is treated as a field id and is matched only
//   against the children's identifier field ids; there is no name fallback for such tokens.
// - Any other token is matched case-insensitively against each child's name and then against
//   each entry of the child's `name_mapping` (its aliases).
//
// Returns a pointer into `schema_column->children`, or nullptr when nothing matches.
const format::ColumnDefinition* find_schema_child_by_path(
        const format::ColumnDefinition* schema_column, const std::string& child_path) {
    if (schema_column == nullptr) {
        return nullptr;
    }
    const auto& candidates = schema_column->children;

    int32_t parsed_field_id = -1;
    if (parse_non_negative_int(child_path, &parsed_field_id)) {
        const auto by_id =
                std::ranges::find_if(candidates, [&](const format::ColumnDefinition& child) {
                    return child.has_identifier_field_id() &&
                           child.get_identifier_field_id() == parsed_field_id;
                });
        return by_id == candidates.end() ? nullptr : &*by_id;
    }

    // Lower-case the token once instead of once per child and once per alias.
    const auto wanted = to_lower(child_path);
    const auto by_name = std::ranges::find_if(candidates, [&](const auto& child) {
        if (to_lower(child.name) == wanted) {
            return true;
        }
        return std::ranges::any_of(child.name_mapping, [&](const std::string& alias) {
            return to_lower(alias) == wanted;
        });
    });
    return by_name == candidates.end() ? nullptr : &*by_name;
}
116
117
66
// Returns the identifier field id of `schema_column`, or -1 when the column is null or does
// not carry an identifier field id.
int32_t schema_field_id(const format::ColumnDefinition* schema_column) {
    if (schema_column != nullptr && schema_column->has_identifier_field_id()) {
        return schema_column->get_identifier_field_id();
    }
    return -1;
}
123
124
46
// Returns the field id of `schema_column` when schema_field_id() yields a non-negative value,
// otherwise `fallback`.
int32_t schema_field_id_or(const format::ColumnDefinition* schema_column, int32_t fallback) {
    if (const auto resolved = schema_field_id(schema_column); resolved >= 0) {
        return resolved;
    }
    return fallback;
}
128
129
// Returns the name of `schema_column`, or `fallback` when the column is null or its name is
// empty.
std::string schema_field_name_or(const format::ColumnDefinition* schema_column,
                                 std::string fallback) {
    if (schema_column != nullptr && !schema_column->name.empty()) {
        return schema_column->name;
    }
    return fallback;
}
134
135
// One node of the tree built from the requested access paths (see the diagram in
// AccessPathParser::build_nested_children).
struct AccessPathNode {
    // True when an access path ends at this node, i.e. the whole subtree is requested.
    // The helpers in this file clear `children` whenever they set this flag.
    bool project_all{false};
    // Sub-nodes keyed by the next access-path token.
    std::map<std::string, AccessPathNode> children;
};
139
140
18
// Merges the projection described by `src` into `*dst`.
//
// - If `*dst` already projects everything, nothing changes.
// - If `src` projects everything, `*dst` is switched to project-all and its children dropped.
// - Otherwise every child of `src` is merged recursively into the same-named child of `*dst`,
//   creating that child when it does not exist yet.
void merge_access_path_node(AccessPathNode* dst, const AccessPathNode& src) {
    DORIS_CHECK(dst != nullptr);
    if (dst->project_all) {
        return;
    }
    if (!src.project_all) {
        for (const auto& entry : src.children) {
            merge_access_path_node(&dst->children[entry.first], entry.second);
        }
        return;
    }
    dst->project_all = true;
    dst->children.clear();
}
154
155
void insert_access_path(AccessPathNode* root, const std::vector<std::string>& path,
156
94
                        size_t path_idx) {
157
94
    DORIS_CHECK(root != nullptr);
158
94
    if (root->project_all) {
159
0
        return;
160
0
    }
161
94
    if (path_idx >= path.size()) {
162
44
        root->project_all = true;
163
44
        root->children.clear();
164
44
        return;
165
44
    }
166
50
    insert_access_path(&root->children[path[path_idx]], path, path_idx + 1);
167
50
}
168
169
Status build_nested_children_from_access_node(format::ColumnDefinition* column,
170
                                              const DataTypePtr& type, const AccessPathNode& node,
171
                                              const std::string& path,
172
                                              const format::ColumnDefinition* schema_column);
173
174
// Expand a full complex-column projection into table-schema children when the table format provides
175
// an external/current schema. Without this, `SELECT complex_col` or `SELECT *` leaves
176
// ColumnDefinition::children empty, so ColumnMapper treats the root complex column as a scalar
177
// mapping and later tries to cast the old file shape to the current table shape directly.
178
//
179
// Examples:
180
//   - STRUCT country/city projected from an old file STRUCT country/population/location should
181
//     create children country and city, so city can be materialized as missing/default.
182
//   - ARRAY<STRUCT<item, quantity>> should create the array element wrapper and then the element
183
//     struct children item and quantity.
184
//   - MAP<STRING, STRUCT<full_name, age>> should create semantic children key/value directly, then
185
//     expand the value struct children full_name and age. Do not introduce a physical entries
186
//     wrapper here: ColumnMapper and TableReader treat MAP children as [key, value].
187
Status build_all_nested_children_from_schema(format::ColumnDefinition* column,
188
                                             const DataTypePtr& type, const std::string& path,
189
56
                                             const format::ColumnDefinition* schema_column) {
190
56
    DORIS_CHECK(column != nullptr);
191
192
56
    const auto nested_type = remove_nullable(type);
193
56
    AccessPathNode project_all;
194
56
    project_all.project_all = true;
195
56
    switch (nested_type->get_primitive_type()) {
196
8
    case TYPE_STRUCT: {
197
8
        const auto& struct_type = assert_cast<const DataTypeStruct&>(*nested_type);
198
28
        for (size_t field_idx = 0; field_idx < struct_type.get_elements().size(); ++field_idx) {
199
20
            const auto field_name = struct_type.get_element_name(field_idx);
200
20
            const auto* schema_child = find_schema_child_by_path(schema_column, field_name);
201
20
            auto* child = find_or_add_child(
202
20
                    column, schema_field_id_or(schema_child, cast_set<int32_t>(field_idx)),
203
20
                    schema_field_name_or(schema_child, field_name),
204
20
                    struct_type.get_element(field_idx));
205
20
            inherit_schema_metadata(child, schema_child);
206
20
            RETURN_IF_ERROR(build_nested_children_from_access_node(
207
20
                    child, child->type, project_all, path + "." + child->name, schema_child));
208
20
        }
209
8
        return Status::OK();
210
8
    }
211
2
    case TYPE_ARRAY: {
212
2
        const auto& array_type = assert_cast<const DataTypeArray&>(*nested_type);
213
2
        const auto* element_schema = schema_column != nullptr && !schema_column->children.empty()
214
2
                                             ? &schema_column->children[0]
215
2
                                             : nullptr;
216
2
        auto* child = find_or_add_child(column, schema_field_id_or(element_schema, 0), "element",
217
2
                                        array_type.get_nested_type());
218
2
        inherit_schema_metadata(child, element_schema);
219
2
        return build_nested_children_from_access_node(child, child->type, project_all, path + ".*",
220
2
                                                      element_schema);
221
8
    }
222
2
    case TYPE_MAP: {
223
2
        const auto& map_type = assert_cast<const DataTypeMap&>(*nested_type);
224
2
        const auto* key_schema = schema_column != nullptr && !schema_column->children.empty()
225
2
                                         ? &schema_column->children[0]
226
2
                                         : nullptr;
227
2
        const auto* value_schema = schema_column != nullptr && schema_column->children.size() > 1
228
2
                                           ? &schema_column->children[1]
229
2
                                           : nullptr;
230
2
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
231
2
                                            map_type.get_key_type());
232
2
        inherit_schema_metadata(key_child, key_schema);
233
2
        RETURN_IF_ERROR(build_nested_children_from_access_node(
234
2
                key_child, key_child->type, project_all, path + ".KEYS", key_schema));
235
2
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1), "value",
236
2
                                              map_type.get_value_type());
237
2
        inherit_schema_metadata(value_child, value_schema);
238
2
        RETURN_IF_ERROR(build_nested_children_from_access_node(
239
2
                value_child, value_child->type, project_all, path + ".VALUES", value_schema));
240
2
        return Status::OK();
241
2
    }
242
44
    default:
243
44
        return Status::OK();
244
56
    }
245
56
}
246
247
// Adds to `column` one child per entry of `node.children`, for a STRUCT-typed column.
//
// Each child token is resolved first through `schema_column` (by field id or by name/alias),
// then, if that gives no usable id or type, by case-insensitive name against `struct_type`.
// Returns NotSupported for the reserved tokens OFFSET, *, KEYS and VALUES, and for a token
// that cannot be resolved. Children added before a failing token stay in `column`.
Status build_struct_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypeStruct& struct_type,
                                              const AccessPathNode& node, const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);

    const auto unsupported = [&](const std::string& token) {
        return Status::NotSupported(
                "AccessPathParser does not support access path {} for slot {}",
                path + "." + token, column->name);
    };

    for (const auto& [child_path, child_node] : node.children) {
        // Struct children are resolved by name or schema field id. We do not treat a numeric
        // child token as a struct ordinal, because `col.0` becomes ambiguous once the struct
        // evolves. Position-based access needs a separate design if it is required later.
        const bool is_reserved_token = child_path == "OFFSET" || child_path == "*" ||
                                       child_path == "KEYS" || child_path == "VALUES";
        if (is_reserved_token) {
            return unsupported(child_path);
        }

        // Prefer the table/schema ColumnDefinition because it carries field ids and aliases.
        // Fallback to the struct type name only for formats without external schema metadata.
        const auto* schema_child = find_schema_child_by_path(schema_column, child_path);
        int32_t field_id = schema_field_id(schema_child);
        std::string field_name = child_path;
        DataTypePtr field_type = nullptr;
        if (schema_child != nullptr) {
            field_name = schema_child->name;
            field_type = schema_child->type;
        }

        if (field_id < 0 || field_type == nullptr) {
            const auto wanted = to_lower(field_name);
            const size_t field_count = struct_type.get_elements().size();
            for (size_t field_idx = 0; field_idx < field_count; ++field_idx) {
                const auto& element_name = struct_type.get_element_name(field_idx);
                if (to_lower(element_name) != wanted) {
                    continue;
                }
                field_id = cast_set<int32_t>(field_idx);
                field_name = element_name;
                field_type = struct_type.get_element(field_idx);
                break;
            }
        }

        if (field_id < 0 || field_type == nullptr) {
            return unsupported(child_path);
        }

        // TODO: For TVF Parquet files without field ids, this fallback uses the struct ordinal as
        // the table child identifier. BY_NAME mapping should instead keep a string identifier and
        // let TableColumnMapper resolve the file-local child id from the Parquet schema.
        auto* child = find_or_add_child(column, field_id, field_name, field_type);
        inherit_schema_metadata(child, schema_child);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                child, child->type, child_node, path + "." + child_path, schema_child));
    }
    return Status::OK();
}
295
296
// Adds the `key` and `value` children of a MAP-typed `column` according to `node`.
//
// Recognised child tokens:
//   - KEYS   : project the key side with the given sub-paths.
//   - VALUES : project the value side with the given sub-paths and the key side in full.
//   - *      : handled exactly like VALUES.
// OFFSET and any other token yield NotSupported. When `node` has no children at all, nothing
// is added and Status::OK() is returned.
Status build_map_children_from_access_node(format::ColumnDefinition* column,
                                           const DataTypeMap& map_type, const AccessPathNode& node,
                                           const std::string& path,
                                           const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    AccessPathNode key_node;
    AccessPathNode value_node;
    bool need_key = false;
    bool need_value = false;

    for (const auto& [child_path, child_node] : node.children) {
        if (child_path == "KEYS") {
            need_key = true;
            merge_access_path_node(&key_node, child_node);
        } else if (child_path == "VALUES" || child_path == "*") {
            // Any projection of values also reads the keys in full.
            need_key = true;
            key_node.project_all = true;
            key_node.children.clear();
            need_value = true;
            merge_access_path_node(&value_node, child_node);
        } else {
            // OFFSET and every unknown token are rejected with the same message.
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}",
                    path + "." + child_path, column->name);
        }
    }

    if (need_key && !need_value) {
        // A key-only MAP projection is not independently materializable yet. FileScannerV2 can
        // describe a projection such as `m.KEYS`, but the downstream file block -> table block path
        // still builds a ColumnMap from key column + value column + offsets. If the value child is
        // omitted here, TableReader/ColumnMapper cannot reconstruct a valid table MAP column even
        // though the query only needs keys.
        //
        // Example:
        //   SELECT map_keys(m) FROM t;
        // or
        //   SELECT * FROM t WHERE array_contains(map_keys(m), 'k1');
        //
        // The access path only asks for `m.KEYS`, but the scan still has to read `m.VALUES` as a
        // temporary full projection until map materialization supports constructing a table MAP
        // from keys only.
        need_value = true;
        value_node.project_all = true;
        value_node.children.clear();
    }

    if (!need_key && !need_value) {
        return Status::OK();
    }

    // By position, the schema's first child describes the key and the second the value.
    const format::ColumnDefinition* key_schema = nullptr;
    const format::ColumnDefinition* value_schema = nullptr;
    if (schema_column != nullptr) {
        const auto& schema_children = schema_column->children;
        if (!schema_children.empty()) {
            key_schema = &schema_children[0];
        }
        if (schema_children.size() > 1) {
            value_schema = &schema_children[1];
        }
    }

    if (need_key) {
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
                                            map_type.get_key_type());
        inherit_schema_metadata(key_child, key_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(key_child, key_child->type, key_node,
                                                               path + ".KEYS", key_schema));
    }
    if (need_value) {
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1), "value",
                                              map_type.get_value_type());
        inherit_schema_metadata(value_child, value_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                value_child, value_child->type, value_node, path + ".VALUES", value_schema));
    }
    return Status::OK();
}
382
383
// Populates `column->children` for the projection described by `node`.
//
// A project-all node, or a node without children, expands the full nested schema. Otherwise
// the work is dispatched on the non-nullable type of the column:
//   - STRUCT: one child per requested field.
//   - ARRAY : only the single child token `*` is accepted; it describes the element.
//   - MAP   : KEYS / VALUES / * tokens.
//   - other : NotSupported, because such a type cannot have sub-paths here.
Status build_nested_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypePtr& type, const AccessPathNode& node,
                                              const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    const bool wants_everything = node.project_all || node.children.empty();
    if (wants_everything) {
        return build_all_nested_children_from_schema(column, type, path, schema_column);
    }

    const auto nested_type = remove_nullable(type);
    switch (nested_type->get_primitive_type()) {
    case TYPE_STRUCT: {
        const auto& struct_type = assert_cast<const DataTypeStruct&>(*nested_type);
        return build_struct_children_from_access_node(column, struct_type, node, path,
                                                      schema_column);
    }
    case TYPE_ARRAY: {
        // `*` must be the one and only child token of an array node.
        const auto star = node.children.find("*");
        if (node.children.size() != 1 || star == node.children.end()) {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}", path,
                    column->name);
        }
        const auto& array_type = assert_cast<const DataTypeArray&>(*nested_type);
        const format::ColumnDefinition* element_schema = nullptr;
        if (schema_column != nullptr && !schema_column->children.empty()) {
            element_schema = &schema_column->children[0];
        }
        auto* element = find_or_add_child(column, schema_field_id_or(element_schema, 0),
                                          "element", array_type.get_nested_type());
        inherit_schema_metadata(element, element_schema);
        return build_nested_children_from_access_node(element, element->type, star->second,
                                                      path + ".*", element_schema);
    }
    case TYPE_MAP: {
        const auto& map_type = assert_cast<const DataTypeMap&>(*nested_type);
        return build_map_children_from_access_node(column, map_type, node, path, schema_column);
    }
    default:
        return Status::NotSupported("AccessPathParser does not support access path {} for slot {}",
                                    path, column->name);
    }
}
422
423
} // namespace
424
425
// Builds `column->children` from the access paths requested for the slot.
//
// Nothing is done for the scanner-materialized virtual column or for a column whose
// non-nullable type is not complex. Otherwise every access path must be a DATA path whose
// first token is either the column name (case-insensitive) or the column's identifier field
// id; the remaining tokens are collected into an AccessPathNode tree that drives the child
// construction. An empty path requests the whole column.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const std::vector<TColumnAccessPath>& access_paths,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    if (is_scanner_materialized_virtual_column(column->name)) {
        return Status::OK();
    }
    if (!is_complex_type(remove_nullable(column->type)->get_primitive_type())) {
        return Status::OK();
    }

    // True when the first token of a path addresses this column by name or by field id.
    const auto lowered_slot_name = to_lower(column->name);
    const auto refers_to_slot = [&](const std::string& head) {
        if (to_lower(head) == lowered_slot_name) {
            return true;
        }
        int32_t top_level_id = -1;
        return parse_non_negative_int(head, &top_level_id) &&
               column->has_identifier_field_id() &&
               top_level_id == column->get_identifier_field_id();
    };

    AccessPathNode root;
    // Build tree for AccessPathNode.
    // For example, for access paths ["a.b", "a.c", "d"], the tree will be:
    // root
    // ├── a
    // │   ├── b
    // │   └── c
    // └── d
    for (const auto& access_path : access_paths) {
        // TODO: Support META access paths if needed. Currently AccessPathParser only supports
        // DATA access paths.
        const bool is_data_path = access_path.type == TAccessPathType::DATA &&
                                  access_path.__isset.data_access_path;
        if (!is_data_path) {
            return Status::NotSupported(
                    "AccessPathParser only supports DATA access paths for slot {}", column->name);
        }
        const auto& path = access_path.data_access_path.path;
        if (!path.empty() && !refers_to_slot(path.front())) {
            return Status::NotSupported("AccessPathParser access path {} does not match slot {}",
                                        access_path_to_string(path), column->name);
        }
        // Skip the leading slot token when there is one; an empty path marks the root itself.
        const size_t first_nested_token = path.empty() ? 0 : 1;
        insert_access_path(&root, path, first_nested_token);
    }
    // Recursively build nested children for the column based on the AccessPathNode tree.
    return build_nested_children_from_access_node(column, column->type, root, column->name,
                                                  schema_column);
}
470
471
// Convenience overload: forwards to the access-path overload using the paths returned by
// `slot_desc->all_access_paths()`. Both `column` and `slot_desc` must be non-null.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const SlotDescriptor* slot_desc,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    DORIS_CHECK(slot_desc != nullptr);
    const auto& slot_access_paths = slot_desc->all_access_paths();
    return build_nested_children(column, slot_access_paths, schema_column);
}
478
479
} // namespace doris