Coverage Report

Created: 2026-07-03 16:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/scan/access_path_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/scan/access_path_parser.h"
19
20
#include <fmt/format.h>
21
22
#include <algorithm>
23
#include <charconv>
24
#include <map>
25
#include <string>
26
#include <string_view>
27
#include <utility>
28
29
#include "common/cast_set.h"
30
#include "common/consts.h"
31
#include "core/assert_cast.h"
32
#include "core/data_type/data_type.h"
33
#include "core/data_type/data_type_array.h"
34
#include "core/data_type/data_type_map.h"
35
#include "core/data_type/data_type_nullable.h"
36
#include "core/data_type/data_type_struct.h"
37
#include "runtime/descriptors.h"
38
#include "util/string_util.h"
39
40
namespace doris {
41
namespace {
42
43
208k
bool is_scanner_materialized_virtual_column(const std::string& column_name) {
44
208k
    return column_name == BeConsts::ICEBERG_ROWID_COL;
45
208k
}
46
47
26.0k
bool parse_non_negative_int(std::string_view value, int32_t* result) {
48
26.0k
    DORIS_CHECK(result != nullptr);
49
26.0k
    int32_t parsed = -1;
50
26.0k
    const auto* begin = value.data();
51
26.0k
    const auto* end = begin + value.size();
52
26.0k
    const auto [ptr, ec] = std::from_chars(begin, end, parsed);
53
26.0k
    if (ec != std::errc() || ptr != end || parsed < 0) {
54
14.1k
        return false;
55
14.1k
    }
56
11.9k
    *result = parsed;
57
11.9k
    return true;
58
26.0k
}
59
60
2
std::string access_path_to_string(const std::vector<std::string>& path) {
61
2
    return fmt::format("{}", fmt::join(path, "."));
62
2
}
63
64
// Returns the child of `parent` that matches either `id` (when the child carries an
// identifier field id) or `name` (exact comparison). If no child matches, a new one is
// appended with the given id, name and type, no children, no default expression, and
// is_partition_key == false.
//
// `parent` must not be null. The returned pointer refers to an element of
// `parent->children`; NOTE(review): presumably that container is a std::vector, in which
// case a later append to the same parent may invalidate the pointer - confirm.
format::ColumnDefinition* find_or_add_child(format::ColumnDefinition* parent, int32_t id,
                                            std::string name, DataTypePtr type) {
    DORIS_CHECK(parent != nullptr);

    auto& siblings = parent->children;
    const auto existing = std::ranges::find_if(siblings, [&](const auto& candidate) {
        const bool same_id =
                candidate.has_identifier_field_id() && candidate.get_identifier_field_id() == id;
        return same_id || candidate.name == name;
    });
    if (existing != siblings.end()) {
        return &*existing;
    }

    siblings.push_back({
            .identifier = Field::create_field<TYPE_INT>(id),
            .name = std::move(name),
            .type = std::move(type),
            .children = {},
            .default_expr = nullptr,
            .is_partition_key = false,
    });
    return &siblings.back();
}
83
84
void inherit_schema_metadata(format::ColumnDefinition* column,
85
141k
                             const format::ColumnDefinition* schema_column) {
86
141k
    if (column == nullptr || schema_column == nullptr) {
87
103k
        return;
88
103k
    }
89
38.5k
    column->name_mapping = schema_column->name_mapping;
90
38.5k
}
91
92
// Looks up the direct child of `schema_column` addressed by one access-path token.
//
// Resolution rules:
//   - A null `schema_column` yields nullptr.
//   - If `child_path` parses as a non-negative integer it is treated as a field id and
//     matched only against children that carry an identifier field id. A numeric token is
//     never matched against names or aliases.
//   - Otherwise the token is compared case-insensitively with each child's name and then
//     with every entry of the child's `name_mapping`.
//
// Returns a pointer to the first matching child, or nullptr when nothing matches.
const format::ColumnDefinition* find_schema_child_by_path(
        const format::ColumnDefinition* schema_column, const std::string& child_path) {
    if (schema_column == nullptr) {
        return nullptr;
    }
    const auto& children = schema_column->children;

    int32_t parsed_field_id = -1;
    if (parse_non_negative_int(child_path, &parsed_field_id)) {
        const auto by_id =
                std::ranges::find_if(children, [&](const format::ColumnDefinition& child) {
                    return child.has_identifier_field_id() &&
                           child.get_identifier_field_id() == parsed_field_id;
                });
        return by_id == children.end() ? nullptr : &*by_id;
    }

    // Lower-case the token once; it is invariant across all children and aliases.
    const auto lowered_path = to_lower(child_path);
    const auto by_name =
            std::ranges::find_if(children, [&](const format::ColumnDefinition& child) {
                if (to_lower(child.name) == lowered_path) {
                    return true;
                }
                return std::ranges::any_of(child.name_mapping, [&](const std::string& alias) {
                    return to_lower(alias) == lowered_path;
                });
            });
    return by_name == children.end() ? nullptr : &*by_name;
}
116
117
141k
// Returns the identifier field id of `schema_column`, or -1 when the pointer is null or
// the column has no identifier field id.
int32_t schema_field_id(const format::ColumnDefinition* schema_column) {
    const bool has_id = schema_column != nullptr && schema_column->has_identifier_field_id();
    if (has_id) {
        return schema_column->get_identifier_field_id();
    }
    return -1;
}
123
124
140k
// Returns the schema column's field id when it is available and non-negative, otherwise
// `fallback`.
int32_t schema_field_id_or(const format::ColumnDefinition* schema_column, int32_t fallback) {
    if (const auto resolved = schema_field_id(schema_column); resolved >= 0) {
        return resolved;
    }
    return fallback;
}
128
129
// Returns the schema column's name when a schema column is present and its name is not
// empty, otherwise `fallback`.
std::string schema_field_name_or(const format::ColumnDefinition* schema_column,
                                 std::string fallback) {
    const bool has_schema_name = schema_column != nullptr && !schema_column->name.empty();
    if (has_schema_name) {
        return schema_column->name;
    }
    return fallback;
}
134
135
// One node of the access-path tree built from a slot's access paths.
struct AccessPathNode {
    // True when everything below this node is requested; the code in this file clears
    // `children` whenever it sets this flag.
    bool project_all = false;
    // Requested sub-paths keyed by the next path token. Tokens compared against in this
    // file include field names/ids as well as "*", "KEYS", "VALUES" and "OFFSET".
    std::map<std::string, AccessPathNode> children;
};
139
140
515
// Merges the projection described by `src` into `*dst`.
//
//   - If `dst` already projects everything, nothing changes.
//   - If `src` projects everything, `dst` becomes a project-all node and drops its children.
//   - Otherwise every child of `src` is merged recursively into the child of `dst` with the
//     same key, creating that child when it does not exist yet.
//
// `dst` must not be null.
void merge_access_path_node(AccessPathNode* dst, const AccessPathNode& src) {
    DORIS_CHECK(dst != nullptr);
    if (dst->project_all) {
        return;
    }
    if (!src.project_all) {
        for (const auto& entry : src.children) {
            merge_access_path_node(&dst->children[entry.first], entry.second);
        }
        return;
    }
    dst->project_all = true;
    dst->children.clear();
}
154
155
void insert_access_path(AccessPathNode* root, const std::vector<std::string>& path,
156
28.3k
                        size_t path_idx) {
157
28.3k
    DORIS_CHECK(root != nullptr);
158
28.3k
    if (root->project_all) {
159
0
        return;
160
0
    }
161
28.3k
    if (path_idx >= path.size()) {
162
27.0k
        root->project_all = true;
163
27.0k
        root->children.clear();
164
27.0k
        return;
165
27.0k
    }
166
1.37k
    insert_access_path(&root->children[path[path_idx]], path, path_idx + 1);
167
1.37k
}
168
169
// Forward declaration: the helpers below recurse back into this function.
Status build_nested_children_from_access_node(
        format::ColumnDefinition* column, const DataTypePtr& type, const AccessPathNode& node,
        const std::string& path, const format::ColumnDefinition* schema_column);
173
174
// Expand a full complex-column projection into table-schema children when the table format provides
175
// an external/current schema. Without this, `SELECT complex_col` or `SELECT *` leaves
176
// ColumnDefinition::children empty, so ColumnMapper treats the root complex column as a scalar
177
// mapping and later tries to cast the old file shape to the current table shape directly.
178
//
179
// Examples:
180
//   - STRUCT country/city projected from an old file STRUCT country/population/location should
181
//     create children country and city, so city can be materialized as missing/default.
182
//   - ARRAY<STRUCT<item, quantity>> should create the array element wrapper and then the element
183
//     struct children item and quantity.
184
//   - MAP<STRING, STRUCT<full_name, age>> should create semantic children key/value directly, then
185
//     expand the value struct children full_name and age. Do not introduce a physical entries
186
//     wrapper here: ColumnMapper and TableReader treat MAP children as [key, value].
187
Status build_all_nested_children_from_schema(format::ColumnDefinition* column,
188
                                             const DataTypePtr& type, const std::string& path,
189
217k
                                             const format::ColumnDefinition* schema_column) {
190
217k
    DORIS_CHECK(column != nullptr);
191
192
217k
    const auto nested_type = remove_nullable(type);
193
217k
    AccessPathNode project_all;
194
217k
    project_all.project_all = true;
195
217k
    switch (nested_type->get_primitive_type()) {
196
17.4k
    case TYPE_STRUCT: {
197
17.4k
        const auto& struct_type = assert_cast<const DataTypeStruct&>(*nested_type);
198
53.7k
        for (size_t field_idx = 0; field_idx < struct_type.get_elements().size(); ++field_idx) {
199
36.2k
            const auto field_name = struct_type.get_element_name(field_idx);
200
36.2k
            const auto* schema_child = find_schema_child_by_path(schema_column, field_name);
201
36.2k
            auto* child = find_or_add_child(
202
36.2k
                    column, schema_field_id_or(schema_child, cast_set<int32_t>(field_idx)),
203
36.2k
                    schema_field_name_or(schema_child, field_name),
204
36.2k
                    struct_type.get_element(field_idx));
205
36.2k
            inherit_schema_metadata(child, schema_child);
206
36.2k
            RETURN_IF_ERROR(build_nested_children_from_access_node(
207
36.2k
                    child, child->type, project_all, path + "." + child->name, schema_child));
208
36.2k
        }
209
17.4k
        return Status::OK();
210
17.4k
    }
211
38.6k
    case TYPE_ARRAY: {
212
38.6k
        const auto& array_type = assert_cast<const DataTypeArray&>(*nested_type);
213
38.6k
        const auto* element_schema = schema_column != nullptr && !schema_column->children.empty()
214
38.6k
                                             ? &schema_column->children[0]
215
38.6k
                                             : nullptr;
216
38.6k
        auto* child = find_or_add_child(column, schema_field_id_or(element_schema, 0), "element",
217
38.6k
                                        array_type.get_nested_type());
218
38.6k
        inherit_schema_metadata(child, element_schema);
219
38.6k
        return build_nested_children_from_access_node(child, child->type, project_all, path + ".*",
220
38.6k
                                                      element_schema);
221
17.4k
    }
222
32.6k
    case TYPE_MAP: {
223
32.6k
        const auto& map_type = assert_cast<const DataTypeMap&>(*nested_type);
224
32.6k
        const auto* key_schema = schema_column != nullptr && !schema_column->children.empty()
225
32.6k
                                         ? &schema_column->children[0]
226
32.6k
                                         : nullptr;
227
32.6k
        const auto* value_schema = schema_column != nullptr && schema_column->children.size() > 1
228
32.6k
                                           ? &schema_column->children[1]
229
32.6k
                                           : nullptr;
230
32.6k
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
231
32.6k
                                            map_type.get_key_type());
232
32.6k
        inherit_schema_metadata(key_child, key_schema);
233
32.6k
        RETURN_IF_ERROR(build_nested_children_from_access_node(
234
32.6k
                key_child, key_child->type, project_all, path + ".KEYS", key_schema));
235
32.6k
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1), "value",
236
32.6k
                                              map_type.get_value_type());
237
32.6k
        inherit_schema_metadata(value_child, value_schema);
238
32.6k
        RETURN_IF_ERROR(build_nested_children_from_access_node(
239
32.6k
                value_child, value_child->type, project_all, path + ".VALUES", value_schema));
240
32.6k
        return Status::OK();
241
32.6k
    }
242
128k
    default:
243
128k
        return Status::OK();
244
217k
    }
245
217k
}
246
247
// Creates the children of a STRUCT column for every sub-path requested in `node`.
//
// Each child token is resolved to a field id, name and type, preferring the schema column
// and falling back to the struct type's own field list. Tokens that only make sense for
// arrays or maps ("OFFSET", "*", "KEYS", "VALUES") and tokens that cannot be resolved yield
// Status::NotSupported. `column` must not be null.
Status build_struct_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypeStruct& struct_type,
                                              const AccessPathNode& node, const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);

    const auto unsupported = [&](const std::string& token) {
        return Status::NotSupported(
                "AccessPathParser does not support access path {} for slot {}",
                path + "." + token, column->name);
    };
    const auto is_container_token = [](const std::string& token) {
        return token == "OFFSET" || token == "*" || token == "KEYS" || token == "VALUES";
    };

    for (const auto& [token, sub_node] : node.children) {
        // Struct children are resolved by name or schema field id. We do not treat a numeric
        // child token as a struct ordinal, because `col.0` becomes ambiguous once the struct
        // evolves. Position-based access needs a separate design if it is required later.
        if (is_container_token(token)) {
            return unsupported(token);
        }

        // Prefer the table/schema ColumnDefinition because it carries field ids and aliases.
        // Fallback to the struct type name only for formats without external schema metadata.
        const auto* schema_child = find_schema_child_by_path(schema_column, token);
        int32_t resolved_id = schema_field_id(schema_child);
        std::string resolved_name;
        DataTypePtr resolved_type;
        if (schema_child != nullptr) {
            resolved_name = schema_child->name;
            resolved_type = schema_child->type;
        } else {
            resolved_name = token;
            resolved_type = nullptr;
        }

        if (resolved_id < 0 || resolved_type == nullptr) {
            // Case-insensitive search of the struct type's own fields; first match wins.
            const auto wanted = to_lower(resolved_name);
            const size_t field_count = struct_type.get_elements().size();
            for (size_t idx = 0; idx < field_count; ++idx) {
                if (to_lower(struct_type.get_element_name(idx)) != wanted) {
                    continue;
                }
                resolved_id = cast_set<int32_t>(idx);
                resolved_name = struct_type.get_element_name(idx);
                resolved_type = struct_type.get_element(idx);
                break;
            }
        }

        if (resolved_id < 0 || resolved_type == nullptr) {
            return unsupported(token);
        }

        // TODO: For TVF Parquet files without field ids, this fallback uses the struct ordinal
        // as the table child identifier. BY_NAME mapping should instead keep a string
        // identifier and let TableColumnMapper resolve the file-local child id from the
        // Parquet schema.
        auto* child = find_or_add_child(column, resolved_id, resolved_name, resolved_type);
        inherit_schema_metadata(child, schema_child);
        RETURN_IF_ERROR(build_nested_children_from_access_node(child, child->type, sub_node,
                                                               path + "." + token, schema_child));
    }
    return Status::OK();
}
295
296
// Creates the "key" and "value" children of a MAP column from the sub-paths in `node`.
//
// Accepted child tokens:
//   - "KEYS":   request the key side with the given sub-projection.
//   - "VALUES": request the value side with the given sub-projection and the whole key.
//   - "*":      handled exactly like "VALUES".
// Any other token (including "OFFSET") yields Status::NotSupported.
// `column` must not be null.
Status build_map_children_from_access_node(format::ColumnDefinition* column,
                                           const DataTypeMap& map_type, const AccessPathNode& node,
                                           const std::string& path,
                                           const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);

    AccessPathNode key_projection;
    AccessPathNode value_projection;
    bool wants_key = false;
    bool wants_value = false;

    for (const auto& [token, sub_node] : node.children) {
        if (token == "KEYS") {
            wants_key = true;
            merge_access_path_node(&key_projection, sub_node);
        } else if (token == "VALUES" || token == "*") {
            // Reading values always pulls in the complete key.
            wants_key = true;
            key_projection.project_all = true;
            key_projection.children.clear();
            wants_value = true;
            merge_access_path_node(&value_projection, sub_node);
        } else {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}",
                    path + "." + token, column->name);
        }
    }

    if (wants_key && !wants_value) {
        // A key-only MAP projection is not independently materializable yet. FileScannerV2
        // can describe a projection such as `m.KEYS`, but the downstream file block -> table
        // block path still builds a ColumnMap from key column + value column + offsets. If the
        // value child is omitted here, TableReader/ColumnMapper cannot reconstruct a valid
        // table MAP column even though the query only needs keys.
        //
        // Example:
        //   SELECT map_keys(m) FROM t;
        // or
        //   SELECT * FROM t WHERE array_contains(map_keys(m), 'k1');
        //
        // The access path only asks for `m.KEYS`, but the scan still has to read `m.VALUES`
        // as a temporary full projection until map materialization supports constructing a
        // table MAP from keys only.
        wants_value = true;
        value_projection.project_all = true;
        value_projection.children.clear();
    }

    if (!wants_key && !wants_value) {
        return Status::OK();
    }

    const bool has_key_schema = schema_column != nullptr && !schema_column->children.empty();
    const bool has_value_schema = schema_column != nullptr && schema_column->children.size() > 1;
    const format::ColumnDefinition* key_schema =
            has_key_schema ? &schema_column->children[0] : nullptr;
    const format::ColumnDefinition* value_schema =
            has_value_schema ? &schema_column->children[1] : nullptr;

    if (wants_key) {
        auto* key_child = find_or_add_child(column, schema_field_id_or(key_schema, 0), "key",
                                            map_type.get_key_type());
        inherit_schema_metadata(key_child, key_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                key_child, key_child->type, key_projection, path + ".KEYS", key_schema));
    }
    if (wants_value) {
        auto* value_child = find_or_add_child(column, schema_field_id_or(value_schema, 1),
                                              "value", map_type.get_value_type());
        inherit_schema_metadata(value_child, value_schema);
        RETURN_IF_ERROR(build_nested_children_from_access_node(
                value_child, value_child->type, value_projection, path + ".VALUES",
                value_schema));
    }
    return Status::OK();
}
382
383
// Builds the children of `column` (whose type is `type`) according to the projection
// tree `node`.
//
// A node that projects everything, or that has no children at all, is expanded fully via
// build_all_nested_children_from_schema. Otherwise the work is dispatched on the
// nullable-stripped type:
//   - STRUCT: per-field resolution.
//   - ARRAY:  the node must contain exactly one child, "*", which describes the element.
//   - MAP:    KEYS / VALUES / "*" handling.
//   - other:  Status::NotSupported, since a non-complex type cannot have sub-paths.
// `column` must not be null.
Status build_nested_children_from_access_node(format::ColumnDefinition* column,
                                              const DataTypePtr& type, const AccessPathNode& node,
                                              const std::string& path,
                                              const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);

    const bool wants_everything = node.project_all || node.children.empty();
    if (wants_everything) {
        return build_all_nested_children_from_schema(column, type, path, schema_column);
    }

    const auto unwrapped = remove_nullable(type);
    const auto primitive = unwrapped->get_primitive_type();

    if (primitive == TYPE_STRUCT) {
        const auto& as_struct = assert_cast<const DataTypeStruct&>(*unwrapped);
        return build_struct_children_from_access_node(column, as_struct, node, path,
                                                      schema_column);
    }

    if (primitive == TYPE_ARRAY) {
        const auto element_node = node.children.find("*");
        if (node.children.size() != 1 || element_node == node.children.end()) {
            return Status::NotSupported(
                    "AccessPathParser does not support access path {} for slot {}", path,
                    column->name);
        }
        const auto& as_array = assert_cast<const DataTypeArray&>(*unwrapped);
        const format::ColumnDefinition* element_schema = nullptr;
        if (schema_column != nullptr && !schema_column->children.empty()) {
            element_schema = &schema_column->children[0];
        }
        auto* element = find_or_add_child(column, schema_field_id_or(element_schema, 0),
                                          "element", as_array.get_nested_type());
        inherit_schema_metadata(element, element_schema);
        return build_nested_children_from_access_node(element, element->type,
                                                      element_node->second, path + ".*",
                                                      element_schema);
    }

    if (primitive == TYPE_MAP) {
        const auto& as_map = assert_cast<const DataTypeMap&>(*unwrapped);
        return build_map_children_from_access_node(column, as_map, node, path, schema_column);
    }

    return Status::NotSupported("AccessPathParser does not support access path {} for slot {}",
                                path, column->name);
}
422
423
} // namespace
424
425
// Populates `column->children` from the given access paths.
//
// Returns OK without touching the column when it is a scanner-materialized virtual column
// or when its nullable-stripped type is not complex. Otherwise every access path must be a
// DATA path whose first token is either the column name (case-insensitive) or the column's
// identifier field id; any other path yields Status::NotSupported. An empty path requests
// the whole column. `column` must not be null.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const std::vector<TColumnAccessPath>& access_paths,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    if (is_scanner_materialized_virtual_column(column->name)) {
        return Status::OK();
    }
    const auto column_primitive = remove_nullable(column->type)->get_primitive_type();
    if (!is_complex_type(column_primitive)) {
        return Status::OK();
    }

    // Build tree for AccessPathNode.
    // For example, for access paths ["a.b", "a.c", "d"], the tree will be:
    // root
    // ├── a
    // │   ├── b
    // │   └── c
    // └── d
    AccessPathNode root;
    for (const auto& access_path : access_paths) {
        // TODO: Support META access paths if needed. Currently AccessPathParser only supports
        // DATA access paths.
        const bool is_data_path = access_path.type == TAccessPathType::DATA &&
                                  access_path.__isset.data_access_path;
        if (!is_data_path) {
            return Status::NotSupported(
                    "AccessPathParser only supports DATA access paths for slot {}", column->name);
        }

        const auto& tokens = access_path.data_access_path.path;
        if (tokens.empty()) {
            insert_access_path(&root, tokens, 0);
            continue;
        }

        // The leading token must name this slot, either by name or by field id.
        const auto& head = tokens.front();
        bool addresses_slot = to_lower(head) == to_lower(column->name);
        if (!addresses_slot) {
            int32_t top_level_id = -1;
            addresses_slot = parse_non_negative_int(head, &top_level_id) &&
                             column->has_identifier_field_id() &&
                             top_level_id == column->get_identifier_field_id();
        }
        if (!addresses_slot) {
            return Status::NotSupported("AccessPathParser access path {} does not match slot {}",
                                        access_path_to_string(tokens), column->name);
        }
        // Skip the slot token itself; only the nested part goes into the tree.
        insert_access_path(&root, tokens, 1);
    }

    // Recursively build nested children for the column based on the AccessPathNode tree.
    return build_nested_children_from_access_node(column, column->type, root, column->name,
                                                  schema_column);
}
470
471
// Convenience overload: takes the access paths from `slot_desc->all_access_paths()` and
// forwards to the access-path overload. Neither `column` nor `slot_desc` may be null.
Status AccessPathParser::build_nested_children(format::ColumnDefinition* column,
                                               const SlotDescriptor* slot_desc,
                                               const format::ColumnDefinition* schema_column) {
    DORIS_CHECK(column != nullptr);
    DORIS_CHECK(slot_desc != nullptr);
    const auto& slot_access_paths = slot_desc->all_access_paths();
    return build_nested_children(column, slot_access_paths, schema_column);
}
478
479
} // namespace doris