Coverage Report

Created: 2026-06-09 14:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/column_mapper.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <cstddef>
21
#include <cstdint>
22
#include <map>
23
#include <memory>
24
#include <optional>
25
#include <string>
26
#include <utility>
27
#include <vector>
28
29
#include "common/status.h"
30
#include "core/data_type/data_type.h"
31
#include "exprs/vexpr_fwd.h"
32
#include "format_v2/expr/literal.h"
33
#include "format_v2/file_reader.h"
34
35
namespace doris {
36
class ColumnPredicate;
37
class RuntimeState;
38
} // namespace doris
39
40
namespace doris::format {
41
42
struct ColumnDefinition;
43
struct TableFilter;
44
45
// Table-level simple predicates grouped by table/global output position. The key is not
46
// LocalColumnId: TableColumnMapper resolves it through ColumnMapping before creating file pruning
47
// hints. Each key maps to a list of shared ColumnPredicate pointers.
48
using TableColumnPredicates = std::map<GlobalIndex, std::vector<std::shared_ptr<ColumnPredicate>>>;
49
50
enum class TableColumnMappingMode {
51
    // Match by ColumnDefinition::identifier holding a TYPE_INT value that is used as the field id.
52
    BY_FIELD_ID,
53
    // Match by ColumnDefinition::identifier holding a TYPE_STRING value, or by the logical name when the identifier is null.
54
    BY_NAME,
55
    // Match top-level columns by file position. This mainly serves Hive1 ORC style files whose
56
    // column names are placeholder values such as `_col0` / `_col1`, where position is the only
57
    // reliable way to select the correct column.
58
    BY_INDEX,
59
};
60
61
enum TableVirtualColumnType { // NOTE: unscoped enum; enumerators convert implicitly to int.
62
    INVALID = 0, // not a virtual column
63
    ROW_ID = 1, // NOTE(review): presumably a row-id virtual column - confirm with the implementation
64
    LAST_UPDATED_SEQUENCE_NUMBER = 2, // NOTE(review): semantics are not visible in this header - confirm
65
};
66
67
enum class FilterConversionType {
68
    COPY_DIRECTLY, // filter can be copied directly from file layer without any change, e.g. column type and table type are the same and no complex nested projection is involved.
69
    CAST_FILTER, // filter can be converted from file layer by adding a cast, e.g. column type is nullable but table type is not, or file column has a trivial nested projection but table column has a complex nested projection.
70
    READER_EXPRESSION, // NOTE(review): undocumented here; presumably the filter is handled through a reader-side expression - confirm
71
    FINALIZE_ONLY, // filter cannot be converted to file layer and should be evaluated at table reader finalize phase, e.g. a child column of a nested column is null in file schema.
72
    CONSTANT, // NOTE(review): undocumented here; presumably the filter targets a constant-mapped column - confirm
73
};
74
75
// Nested global-to-local child mapping. The root index points either to a request-local slot or to
76
// a child id, depending on the owner. child_mapping keeps the recursive table-child to file-child
77
// relationship explicit instead of encoding it in ColumnMapping flags.
78
struct IndexMapping {
79
    int32_t index = -1; // Root index of this node; stays -1 until a value is assigned.
80
    std::map<int32_t, std::shared_ptr<IndexMapping>> child_mapping; // Recursive child nodes. NOTE(review): the int32_t key is presumably the table-child index - confirm.
81
};
82
83
// Recursive result produced after one table/global column is assigned to a file-local source.
84
struct ColumnMapResult {
85
    std::optional<LocalColumnId> local_column_id; // Empty by default. NOTE(review): presumably set for root columns - confirm.
86
    std::optional<LocalColumnIndex> column_index; // Empty by default. NOTE(review): presumably set for nested children - confirm.
87
    std::optional<IndexMapping> mapping; // Empty by default; holds the index tree when set.
88
};
89
90
// Final mapping entry from one global result column to one file-local source.
91
struct ColumnMapEntry {
92
    IndexMapping mapping; // Index tree for this column (see IndexMapping).
93
    DataTypePtr local_type; // NOTE(review): presumably the file-local type - confirm.
94
    DataTypePtr global_type; // NOTE(review): presumably the table/global type - confirm.
95
    FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY; // Defaults to FINALIZE_ONLY.
96
};
97
98
// Collection of final result-column mappings produced for one file/split.
99
struct ResultColumnMapping {
100
    std::map<GlobalIndex, ColumnMapEntry> global_to_local; // Keyed by GlobalIndex; each key maps to one ColumnMapEntry.
101
};
102
103
// Mapping result from a single table column to a file column.
104
// This is the core boundary object between the table layer and the file layer.
105
struct ColumnMapping {
106
    // Position of the top-level projected column in the table/global output block. Table-level
107
    // filters and column predicates refer to this index after FileScannerV2 translates FE ids at
108
    // the scanner boundary.
109
    GlobalIndex global_index;
110
    std::string table_column_name; // NOTE(review): presumably the table-side column name - confirm.
111
    // File-reader local id for the mapped node.
112
    //
113
    // For a root mapping it is convertible to LocalColumnId. For a nested mapping it is the
114
    // LocalColumnIndex child id under the parent projection. This is deliberately separated from
115
    // ColumnDefinition::identifier, which is the table-to-file matching key such as Parquet/Iceberg
116
    // field_id or column name.
117
    //
118
    // Empty means the table column is constant, missing, partition-only, or virtual.
119
    std::optional<int32_t> file_local_id;
120
    std::string file_column_name; // NOTE(review): presumably the matched file-side column name - confirm.
121
    // Full file type/children before nested projection pruning. Used to rebuild projected types
122
    // and to localize nested filters that reference children not present in the output projection.
123
    DataTypePtr original_file_type;
124
    std::vector<ColumnDefinition> original_file_children;
125
    // Split/file-local constant entry when this mapping is produced from partition/default/virtual
126
    // expression instead of physical file data.
127
    std::optional<ConstantIndex> constant_index;
128
    // Effective file type after applying casts/remaps/nested projection pruning.
129
    DataTypePtr file_type;
130
    // Target table/global type after final materialization.
131
    DataTypePtr table_type;
132
133
    // Final output expression. Converts a file-local value into a table/global value, e.g. cast,
134
    // default, partition, generated column, or complex-column remap.
135
    VExprContextSPtr projection;
136
137
    // Mapping tree for nested table children. The order follows table output children, while file
138
    // children can be pruned/reordered through each child mapping's file-reader local id.
139
    std::vector<ColumnMapping> child_mappings;
140
    // True when file value can be used directly as table value without cast or child remap.
141
    bool is_trivial = false;
142
    // True when the nested value read from file has a pruned/remapped child layout and must be
143
    // reconstructed before returning to table/global schema.
144
    bool has_complex_projection = false;
145
    // How filters referencing this table/global column can be converted below table-reader
146
    // finalize. This is metadata for localize_filters() and future constant-filter evaluation.
147
    FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY;
148
    TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID; // INVALID means not a virtual column.
149
    VExprContextSPtr default_expr; // NOTE(review): presumably the default-value expression for this column - confirm.
150
151
    std::string debug_string() const; // Declared only; the definition is not visible in this header.
152
};
153
154
struct TableColumnMapperOptions {
155
    TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; // Matching strategy; defaults to BY_FIELD_ID.
156
    bool allow_missing_columns = true; // Defaults to true. NOTE(review): presumably tolerates table columns absent from the file - confirm.
157
158
    std::string debug_string() const; // Declared only; the definition is not visible in this header.
159
};
160
161
Status clone_table_expr_tree(const VExprSPtr& expr, VExprSPtr* cloned_expr); // NOTE(review): presumably clones `expr` into `*cloned_expr`; the definition is not visible here - confirm.
162
163
// Generic mapping layer from table schema to file schema.
164
// Iceberg uses BY_FIELD_ID; plain by-name scenarios can reuse this component, so it should not be
165
// named as an Iceberg-only component.
166
class TableColumnMapper {
167
public:
168
    explicit TableColumnMapper(TableColumnMapperOptions options = {})
169
0
            : _options(std::move(options)) {}
170
0
    virtual ~TableColumnMapper() = default;
171
172
    // Build the column mapping from table schema to file schema.
173
    // The resulting ColumnMapping describes how a table column is derived from a file column, a constant column, or an expression;
174
    // later projection, filter localization and table block finalize should all reuse this mapping.
175
    virtual Status create_mapping(const std::vector<ColumnDefinition>& projected_columns,
176
                                  const std::map<std::string, Field>& partition_values,
177
                                  const std::vector<ColumnDefinition>& file_schema);
178
179
    // Convert a table-level scan request into a file-local scan request. table_filters keep row-level
180
    // filtering semantics and are converted into file-local conjuncts; table_column_predicates are only converted into file-layer
181
    // pruning hints and do not take part in batch row filtering.
182
    virtual Status create_scan_request(const std::vector<TableFilter>& table_filters,
183
                                       const TableColumnPredicates& table_column_predicates,
184
                                       const std::vector<ColumnDefinition>& projected_columns,
185
                                       FileScanRequest* file_request,
186
                                       RuntimeState* runtime_state = nullptr); // NOTE(review): a default argument on a virtual function is chosen by the static type at the call site.
187
188
    // Localize table-level filters onto the file schema.
189
    // A trivial mapping can copy structured predicates directly; when the type changes a safe cast can be attempted; expressions that
190
    // cannot be pushed down safely should be handled via reader_expression_map or the table-level finalize/filter fallback.
191
    virtual Status localize_filters(const std::vector<TableFilter>& table_filters,
192
                                    const TableColumnPredicates& table_column_predicates,
193
                                    FileScanRequest* file_request,
194
                                    RuntimeState* runtime_state = nullptr); // NOTE(review): a default argument on a virtual function is chosen by the static type at the call site.
195
0
    void clear() { // Empties the mappings, the constant map and the filter entries; _options is left untouched.
196
0
        _mappings.clear();
197
0
        _constant_map.clear();
198
0
        _filter_entries.clear();
199
0
    }
200
0
    const std::vector<ColumnMapping>& mappings() const { return _mappings; } // Read-only view of _mappings.
201
0
    const std::map<GlobalIndex, FilterEntry>& filter_entries() const { return _filter_entries; } // Read-only view of _filter_entries.
202
0
    const ConstantMap& constant_map() const { return _constant_map; } // Read-only view of _constant_map.
203
    std::string debug_string() const; // Declared only; the definition is not visible in this header.
204
205
private:
206
    const ColumnDefinition* _find_file_field( // NOTE(review): presumably returns nullptr when no file field matches - confirm.
207
            const ColumnDefinition& table_column,
208
            const std::vector<ColumnDefinition>& file_schema) const;
209
    Status _create_direct_mapping(const ColumnDefinition& table_column,
210
                                  const ColumnDefinition& file_field, ColumnMapping* mapping) const;
211
212
    Status _create_by_index_mapping(const ColumnDefinition& table_column,
213
                                    const std::vector<ColumnDefinition>& file_schema,
214
                                    ColumnMapping* mapping);
215
    Status _build_filter_entries(const FileScanRequest& file_request);
216
    Status _build_result_column_mapping(const FileScanRequest& file_request);
217
218
    void _set_constant_mapping(ColumnMapping* mapping, VExprContextSPtr expr);
219
220
    ColumnMapping* _find_mapping(GlobalIndex global_index); // NOTE(review): presumably returns nullptr when global_index has no mapping - confirm.
221
222
    TableColumnMapperOptions _options;
223
    // Column mapping for each projected column, in the same order as projected_columns. Each entry describes how to get one table/global column from file-local sources, and carries metadata for filter localization and result finalize.
224
    std::vector<ColumnMapping> _mappings;
225
    std::map<GlobalIndex, FilterEntry> _filter_entries; // Keyed by GlobalIndex; cleared by clear().
226
    ConstantMap _constant_map; // Cleared by clear().
227
};
228
229
} // namespace doris::format