Coverage Report

Created: 2026-07-03 16:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/column_mapper.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <cstddef>
21
#include <cstdint>
22
#include <map>
23
#include <memory>
24
#include <optional>
25
#include <string>
26
#include <utility>
27
#include <vector>
28
29
#include "common/status.h"
30
#include "core/data_type/data_type.h"
31
#include "core/field.h"
32
#include "exprs/vexpr_fwd.h"
33
#include "format_v2/file_reader.h"
34
35
namespace doris {
36
class ColumnPredicate;
37
class RuntimeState;
38
} // namespace doris
39
40
namespace doris::format {
41
42
struct ColumnDefinition;
43
struct TableFilter;
44
45
// Table-level simple predicates grouped by table/global output position. The key is not
46
// LocalColumnId: TableColumnMapper resolves it through ColumnMapping before creating file pruning
47
// hints.
48
using TableColumnPredicates = std::map<GlobalIndex, std::vector<std::shared_ptr<ColumnPredicate>>>;
49
50
// Strategy used to match a table column against the file schema. The mode is carried by
// TableColumnMapperOptions::mode.
enum class TableColumnMappingMode {
51
    // Match by ColumnDefinition::identifier TYPE_INT as field id.
52
    BY_FIELD_ID,
53
    // Match by ColumnDefinition::identifier TYPE_STRING, or logical name when identifier is null.
54
    BY_NAME,
55
    // Match top-level columns by file position. This mainly serves Hive1 ORC style files whose
56
    // column names are placeholder values such as `_col0` / `_col1`, where position is the only
57
    // reliable way to select the correct column.
58
    BY_INDEX,
59
};
60
61
// Kind of virtual column a table column represents. INVALID (0) marks a regular, non-virtual
// column and is the default used by ColumnMapping::virtual_column_type.
// NOTE(review): this is an unscoped enum, so INVALID / ROW_ID / ... are also visible unqualified
// in the enclosing namespace and convert implicitly to int.
enum TableVirtualColumnType {
62
    INVALID = 0, // not a virtual column
63
    // Iceberg v3 row lineage metadata column `_row_id`. Physical non-null values
64
    // are preserved; NULL or missing values inherit first_row_id + row_position.
65
    ROW_ID = 1,
66
    // Iceberg v3 row lineage metadata column `_last_updated_sequence_number`.
67
    // Physical non-null values are preserved; NULL or missing values inherit the
68
    // data file's last_updated_sequence_number.
69
    LAST_UPDATED_SEQUENCE_NUMBER = 2,
70
    // Doris internal Iceberg row locator column `__DORIS_ICEBERG_ROWID_COL__`.
71
    // It is a struct used by delete/update/merge, not the Iceberg `_row_id`.
72
    ICEBERG_ROWID = 3,
73
};
74
75
// Describes how a filter that references a table/global column can be handled below the
// table-reader finalize phase. ColumnMapping and ColumnMapEntry both default to FINALIZE_ONLY.
enum class FilterConversionType {
76
    COPY_DIRECTLY, // filter can be copied directly from file layer without any change, e.g. column type and table type are the same and no complex nested projection is involved.
77
    CAST_FILTER, // filter can be converted from file layer by adding a cast, e.g. column type is nullable but table type is not, or file column has a trivial nested projection but table column has a complex nested projection.
78
    READER_EXPRESSION, // NOTE(review): presumably the filter is localized through a reader-side expression (cf. reader_expression_map mentioned on localize_filters) - confirm.
79
    FINALIZE_ONLY, // filter cannot be converted to file layer and should be evaluated at table reader finalize phase, e.g. predicates on the ICEBERG_ROWID column which is generated by IcebergReader.
80
    CONSTANT, // NOTE(review): presumably the column maps to a constant, so the filter needs no file data - confirm.
81
};
82
83
// Nested global-to-local child mapping. The root index points either to a request-local slot or to
84
// a child id, depending on the owner. child_mapping keeps the recursive table-child to file-child
85
// relationship explicit instead of encoding it in ColumnMapping flags.
86
struct IndexMapping {
87
    // Root index of this node; defaults to -1.
    // NOTE(review): -1 presumably means "not assigned yet" - confirm against the producers.
    int32_t index = -1;
88
    // Recursive child nodes, shared via shared_ptr and ordered by their int32_t key.
    // NOTE(review): the key is presumably the table-side child position/id - confirm.
    std::map<int32_t, std::shared_ptr<IndexMapping>> child_mapping;
89
};
90
91
// Recursive result produced after one table/global column is assigned to a file-local source.
92
struct ColumnMapResult {
93
    // All three members are optional and start empty. Which combinations are populated for a
    // given column is decided by the producer and is not visible in this header.
    std::optional<LocalColumnId> local_column_id;
94
    std::optional<LocalColumnIndex> column_index;
95
    // Nested index mapping for this column, when one was produced.
    std::optional<IndexMapping> mapping;
96
};
97
98
// Final mapping entry from one global result column to one file-local source.
99
struct ColumnMapEntry {
100
    // Index tree locating the file-local source (and its nested children) for this column.
    IndexMapping mapping;
101
    // NOTE(review): presumably the file-local type and the table/global type of the column,
    // mirroring ColumnMapping::file_type / table_type - confirm.
    DataTypePtr local_type;
102
    DataTypePtr global_type;
103
    // Defaults to the most conservative choice: evaluate filters only at finalize.
    FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY;
104
};
105
106
// Collection of final result-column mappings produced for one file/split.
107
struct ResultColumnMapping {
108
    // One entry per global result column, ordered by GlobalIndex (std::map ordering).
    std::map<GlobalIndex, ColumnMapEntry> global_to_local;
109
};
110
111
// Mapping result from one table column to one file column.
112
// This is the main boundary object between table-level schema semantics and file-local schema
113
// semantics.
114
struct ColumnMapping {
115
    // Position of the top-level projected column in the table/global output block. Table-level
116
    // filters and column predicates refer to this index after FileScannerV2 translates FE ids at
117
    // the scanner boundary.
118
    // NOTE(review): no in-class initializer; whether GlobalIndex default-initializes to a defined
    // value depends on its definition, which is not visible here.
    GlobalIndex global_index;
119
    // Name of the column on the table side.
    std::string table_column_name;
120
    // File-reader local id for the mapped node.
121
    //
122
    // For a root mapping it is convertible to LocalColumnId. For a nested mapping it is the
123
    // LocalColumnIndex child id under the parent projection. This is deliberately separated from
124
    // ColumnDefinition::identifier, which is the table-to-file matching key such as Parquet/Iceberg
125
    // field_id or column name.
126
    //
127
    // Empty means the table column is constant, missing, partition-only, or virtual.
128
    std::optional<int32_t> file_local_id;
129
    // Name of the matched column on the file side.
    // NOTE(review): presumably left empty when file_local_id is empty - confirm.
    std::string file_column_name;
130
    // Full file type/children before nested projection pruning. Used to rebuild projected types
131
    // and to localize nested filters that reference children not present in the output projection.
132
    DataTypePtr original_file_type;
133
    std::vector<ColumnDefinition> original_file_children;
134
    // File children after applying the scan projection. The order follows the file-local semantic
135
    // schema, not table child order. TableReader uses this to map table-output children back to the
136
    // file-local block layout when projection, predicate-only children, and schema evolution mix.
137
    std::vector<ColumnDefinition> projected_file_children;
138
    // Split/file-local constant entry when this mapping is produced from partition/default/virtual
139
    // expression instead of physical file data.
140
    std::optional<ConstantIndex> constant_index;
141
    // Effective file type after applying casts/remaps/nested projection pruning.
142
    DataTypePtr file_type;
143
    // Target table/global type after final materialization.
144
    DataTypePtr table_type;
145
146
    // Final projection expression used to convert file-local values into table/global values, such
147
    // as casts, defaults, partition values, generated columns, or complex-column remaps.
148
    VExprContextSPtr projection;
149
150
    // Mapping tree for nested table children. The order follows table output children, while file
151
    // children can be pruned/reordered through each child mapping's file-reader local id.
152
    std::vector<ColumnMapping> child_mappings;
153
    // True when file value can be used directly as table value without cast or child remap.
154
    bool is_trivial = false;
155
    // How filters referencing this table/global column can be converted below table-reader
156
    // finalize. This is metadata for localize_filters() and future constant-filter evaluation.
157
    FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY;
158
    // Defaults to INVALID, i.e. the column is not a virtual column.
    TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID;
159
    // NOTE(review): presumably the default-value expression used when the column has no
    // physical file source - confirm against the code that fills it.
    VExprContextSPtr default_expr;
160
161
    // Returns a textual description of this mapping; defined outside this header.
    std::string debug_string() const;
162
};
163
164
// Construction-time options for TableColumnMapper.
struct TableColumnMapperOptions {
165
    // Table-to-file matching strategy; defaults to matching by field id.
    TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID;
166
167
    // Returns a textual description of the options; defined outside this header.
    std::string debug_string() const;
168
};
169
170
// Declaration only; the definition lives outside this header.
// NOTE(review): presumably copies the expression tree rooted at `expr` and stores the copy in
// `*cloned_expr`, reporting failures through the returned Status - confirm against the definition.
Status clone_table_expr_tree(const VExprSPtr& expr, VExprSPtr* cloned_expr);
171
// Declaration only; the definition lives outside this header.
// Returns a non-owning pointer to a Field.
// NOTE(review): presumably points into `partition_values` at the entry for `table_column`, and is
// nullptr when there is none - confirm, and keep `partition_values` alive while using the result.
const Field* find_partition_value(const ColumnDefinition& table_column,
172
                                  const std::map<std::string, Field>& partition_values);
173
174
// Generic mapping layer from table schema to file schema.
175
// Iceberg uses BY_FIELD_ID. Plain by-name formats can reuse this component as well, so keep this
176
// abstraction table-format neutral instead of making it Iceberg-only.
177
class TableColumnMapper {
178
public:
179
    // Takes the options by value and moves them into _options. `explicit` prevents implicit
    // conversion from TableColumnMapperOptions.
    explicit TableColumnMapper(TableColumnMapperOptions options = {})
180
38.5k
            : _options(std::move(options)) {}
181
38.6k
    // Virtual so derived mappers can be destroyed through a TableColumnMapper pointer.
    virtual ~TableColumnMapper() = default;
182
183
    // Build column mappings from table schema to file schema.
184
    // The resulting ColumnMapping describes how each table column is produced from a file column,
185
    // a constant, or an expression. Later projection, filter localization, and table-block
186
    // finalization should all reuse the same mapping.
187
    virtual Status create_mapping(const std::vector<ColumnDefinition>& projected_columns,
188
                                  const std::map<std::string, Field>& partition_values,
189
                                  const std::vector<ColumnDefinition>& file_schema);
190
191
    // Convert a table-level scan request into a file-local scan request. table_filters preserve
192
    // row-level filtering semantics and are rewritten as file-local conjuncts. table_column_predicates
193
    // are converted only into file-layer pruning hints and do not participate in batch row
194
    // filtering.
    // NOTE(review): the default argument on this virtual function is selected by the static type
    // of the call expression; any override should repeat `runtime_state = nullptr`.
195
    virtual Status create_scan_request(const std::vector<TableFilter>& table_filters,
196
                                       const TableColumnPredicates& table_column_predicates,
197
                                       const std::vector<ColumnDefinition>& projected_columns,
198
                                       FileScanRequest* file_request,
199
                                       RuntimeState* runtime_state = nullptr);
200
201
    // Localize table-level filters to the file schema.
202
    // Trivial mappings can copy structured predicates directly. Type changes may be localized with
203
    // a safe cast. Expressions that cannot be pushed down safely should be handled through
204
    // reader_expression_map or table-level finalize/filter fallback.
    // NOTE(review): same caveat as create_scan_request() about the default argument on a virtual.
205
    virtual Status localize_filters(const std::vector<TableFilter>& table_filters,
206
                                    const TableColumnPredicates& table_column_predicates,
207
                                    FileScanRequest* file_request,
208
                                    RuntimeState* runtime_state = nullptr);
209
77.0k
    // Drops all mapping state held by this mapper: visible and hidden mappings, the constant
    // map, filter entries, and the retained file schema and partition values. _options is kept.
    void clear() {
210
77.0k
        _mappings.clear();
211
77.0k
        _hidden_mappings.clear();
212
77.0k
        _constant_map.clear();
213
77.0k
        _filter_entries.clear();
214
77.0k
        _file_schema.clear();
215
77.0k
        _partition_values.clear();
216
77.0k
    }
217
953k
    // Read-only accessors. They return references to members of this mapper, so the referenced
    // containers are emptied by clear().
    const std::vector<ColumnMapping>& mappings() const { return _mappings; }
218
18.6k
    const std::map<GlobalIndex, FilterEntry>& filter_entries() const { return _filter_entries; }
219
2.75k
    const ConstantMap& constant_map() const { return _constant_map; }
220
    // Returns a textual description of the mapper; defined outside this header.
    std::string debug_string() const;
221
222
protected:
223
    // Columnar readers such as Parquet can read predicate columns first, evaluate row filters, and
224
    // lazily read the rest. Row-oriented readers such as CSV/Text materialize one row at a time and
225
    // should keep all required columns in one scan list.
226
29.9k
    virtual bool enable_lazy_materialization() const { return true; }
227
    // File-layer column predicate filters are reader-specific pruning hints. Parquet consumes them
228
    // for row-group/page-index/statistics pruning; simple delimited readers do not.
229
32.5k
    virtual bool enable_column_predicate_filters() const { return true; }
230
    // Row-oriented readers such as CSV/Text cannot physically read only a nested child from one
231
    // delimited text field. They must scan the whole complex top-level field and let TableReader
232
    // rematerialize the requested table child after row-level filters have run.
233
112k
    virtual bool force_full_complex_scan_projection() const { return false; }
234
235
    // The helpers below are only declared here; their definitions (and therefore their exact
    // behavior) live outside this header.
    const ColumnDefinition* _find_file_field(
236
            const ColumnDefinition& table_column,
237
            const std::vector<ColumnDefinition>& file_schema) const;
238
    Status _create_direct_mapping(const ColumnDefinition& table_column,
239
                                  const ColumnDefinition& file_field, ColumnMapping* mapping) const;
240
241
    Status _create_by_index_mapping(const ColumnDefinition& table_column,
242
                                    const std::vector<ColumnDefinition>& file_schema,
243
                                    ColumnMapping* mapping);
244
    Status _build_filter_entries(const FileScanRequest& file_request);
245
    Status _build_result_column_mapping(const FileScanRequest& file_request);
246
247
    void _set_constant_mapping(ColumnMapping* mapping, VExprContextSPtr expr);
248
    Status _create_mapping_for_column(const ColumnDefinition& table_column,
249
                                      GlobalIndex global_index, ColumnMapping* mapping);
250
    Status _create_hidden_filter_mapping(const ColumnDefinition& table_column,
251
                                         GlobalIndex global_index, ColumnMapping* mapping);
252
    Status _build_hidden_filter_mappings(const std::vector<TableFilter>& table_filters);
253
    std::vector<ColumnMapping> _filter_visible_mappings() const;
254
255
    // Lookups by global index returning a mutable, non-owning pointer.
    // NOTE(review): presumably nullptr when no mapping exists for global_index - confirm.
    ColumnMapping* _find_mapping(GlobalIndex global_index);
256
    ColumnMapping* _find_filter_mapping(GlobalIndex global_index);
257
258
    // Options supplied at construction; not touched by clear().
    TableColumnMapperOptions _options;
259
    // Column mapping for each projected column, in the same order as projected_columns. Each entry
260
    // describes how to get one table/global column from file-local sources, and carries metadata
261
    // for filter localization and result finalize.
262
    std::vector<ColumnMapping> _mappings;
263
    // Predicate-only top-level columns are not output projection columns, so keep their mappings
264
    // here. They are visible only to filter localization and file-reader predicate construction.
265
    std::vector<ColumnMapping> _hidden_mappings;
266
    // Exposed read-only through filter_entries() and constant_map() respectively.
    std::map<GlobalIndex, FilterEntry> _filter_entries;
267
    ConstantMap _constant_map;
268
    // Split-local schema state retained from create_mapping() so create_scan_request() can build
269
    // hidden mappings for top-level filter slots that are absent from projected_columns.
270
    std::vector<ColumnDefinition> _file_schema;
271
    std::map<std::string, Field> _partition_values;
272
};
273
274
// Parquet consumes the full FileScanRequest shape: predicate columns for lazy materialization and
275
// top-level column_predicate_filters for statistics/page-index pruning.
276
class ParquetColumnMapper final : public TableColumnMapper {
277
public:
278
    // Inherits the base-class constructors. No hook is overridden, so the base defaults apply:
    // lazy materialization and column predicate filters enabled, full complex scan not forced.
    using TableColumnMapper::TableColumnMapper;
279
};
280
281
// Mapper for readers that always materialize every required file column before filtering. The
282
// table-to-file schema mapping is still generic, but the FileScanRequest layout is simpler:
283
// predicate_columns and column_predicate_filters are not populated.
284
class MaterializedColumnMapper final : public TableColumnMapper {
285
public:
286
    // Inherits the base-class constructors unchanged.
    using TableColumnMapper::TableColumnMapper;
287
288
protected:
289
550
    // Base default is true; this mapper turns lazy materialization off.
    bool enable_lazy_materialization() const override { return false; }
290
6.01k
    // Base default is true; this mapper turns file-layer column predicate filters off.
    bool enable_column_predicate_filters() const override { return false; }
291
185k
    // Base default is false; this mapper always requests the full complex top-level field.
    bool force_full_complex_scan_projection() const override { return true; }
292
};
293
294
} // namespace doris::format