Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <cstddef> |
21 | | #include <cstdint> |
22 | | #include <map> |
23 | | #include <memory> |
24 | | #include <optional> |
25 | | #include <string> |
26 | | #include <utility> |
27 | | #include <vector> |
28 | | |
29 | | #include "common/status.h" |
30 | | #include "core/data_type/data_type.h" |
31 | | #include "exprs/vexpr_fwd.h" |
32 | | #include "format_v2/expr/literal.h" |
33 | | #include "format_v2/file_reader.h" |
34 | | |
35 | | namespace doris { |
36 | | class ColumnPredicate; |
37 | | class RuntimeState; |
38 | | } // namespace doris |
39 | | |
40 | | namespace doris::format { |
41 | | |
42 | | struct ColumnDefinition; |
43 | | struct TableFilter; |
44 | | |
45 | | // Table-level simple predicates keyed by GlobalIndex (table/global output position); each key |
46 | | // holds the predicates for that column. The key is not a LocalColumnId: TableColumnMapper |
47 | | // resolves it through ColumnMapping before creating file pruning hints. |
48 | | using TableColumnPredicates = std::map<GlobalIndex, std::vector<std::shared_ptr<ColumnPredicate>>>; |
49 | | |
50 | | enum class TableColumnMappingMode { /* Strategy used to match a table column to a file column. */ |
51 | | // Match by field id: ColumnDefinition::identifier is a TYPE_INT value. |
52 | | BY_FIELD_ID, |
53 | | // Match by name: ColumnDefinition::identifier is a TYPE_STRING value, or the logical name is used when the identifier is null. |
54 | | BY_NAME, |
55 | | // Match top-level columns by file position. This mainly serves Hive1 ORC style files whose |
56 | | // column names are placeholder values such as `_col0` / `_col1`, where position is the only |
57 | | // reliable way to select the correct column. |
58 | | BY_INDEX, |
59 | | }; |
60 | | |
61 | | enum TableVirtualColumnType { /* Kind of virtual column a ColumnMapping represents, if any. */ |
62 | | INVALID = 0, // Not a virtual column; this is the default value of ColumnMapping::virtual_column_type. |
63 | | ROW_ID = 1, |
64 | | LAST_UPDATED_SEQUENCE_NUMBER = 2, |
65 | | }; |
66 | | |
67 | | enum class FilterConversionType { /* How a filter on a table/global column can be converted below table-reader finalize. */ |
68 | | COPY_DIRECTLY, // filter can be copied directly from file layer without any change, e.g. column type and table type are the same and no complex nested projection is involved. |
69 | | CAST_FILTER, // filter can be converted from file layer by adding a cast, e.g. column type is nullable but table type is not, or file column has a trivial nested projection but table column has a complex nested projection. |
70 | | READER_EXPRESSION, /* NOTE(review): presumably the filter is rewritten through a reader expression rather than copied or cast - confirm against the implementation. */ |
71 | | FINALIZE_ONLY, // filter cannot be converted to file layer and should be evaluated at table reader finalize phase, e.g. a child column of a nested column is null in file schema. |
72 | | CONSTANT, /* NOTE(review): presumably the column maps to a constant entry, so the filter needs no file data - confirm against the implementation. */ |
73 | | }; |
74 | | |
75 | | // Nested global-to-local child mapping. The root index points either to a request-local slot or to |
76 | | // a child id, depending on the owner. child_mapping keeps the recursive table-child to file-child |
77 | | // relationship explicit instead of encoding it in ColumnMapping flags. index defaults to -1 (NOTE(review): presumably "not assigned" - confirm). |
78 | | struct IndexMapping { |
79 | | int32_t index = -1; |
80 | | std::map<int32_t, std::shared_ptr<IndexMapping>> child_mapping; |
81 | | }; |
82 | | |
83 | | // Recursive result produced after one table/global column is assigned to a file-local source. Every member is optional and starts out empty. |
84 | | struct ColumnMapResult { |
85 | | std::optional<LocalColumnId> local_column_id; |
86 | | std::optional<LocalColumnIndex> column_index; |
87 | | std::optional<IndexMapping> mapping; |
88 | | }; |
89 | | |
90 | | // Final mapping entry from one global result column to one file-local source. Carries the index mapping, the local and global data types, and the filter conversion kind (FINALIZE_ONLY by default). |
91 | | struct ColumnMapEntry { |
92 | | IndexMapping mapping; |
93 | | DataTypePtr local_type; |
94 | | DataTypePtr global_type; |
95 | | FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY; |
96 | | }; |
97 | | |
98 | | // Collection of final result-column mappings produced for one file/split, keyed by GlobalIndex. |
99 | | struct ResultColumnMapping { |
100 | | std::map<GlobalIndex, ColumnMapEntry> global_to_local; |
101 | | }; |
102 | | |
103 | | // Mapping result from a single table column to a file column. |
104 | | // This is the core boundary object between the table layer and the file layer. |
105 | | struct ColumnMapping { |
106 | | // Position of the top-level projected column in the table/global output block. Table-level |
107 | | // filters and column predicates refer to this index after FileScannerV2 translates FE ids at |
108 | | // the scanner boundary. |
109 | | GlobalIndex global_index; |
110 | | std::string table_column_name; |
111 | | // File-reader local id for the mapped node. |
112 | | // |
113 | | // For a root mapping it is convertible to LocalColumnId. For a nested mapping it is the |
114 | | // LocalColumnIndex child id under the parent projection. This is deliberately separated from |
115 | | // ColumnDefinition::identifier, which is the table-to-file matching key such as Parquet/Iceberg |
116 | | // field_id or column name. |
117 | | // |
118 | | // Empty means the table column is constant, missing, partition-only, or virtual. |
119 | | std::optional<int32_t> file_local_id; |
120 | | std::string file_column_name; |
121 | | // Full file type/children before nested projection pruning. Used to rebuild projected types |
122 | | // and to localize nested filters that reference children not present in the output projection. |
123 | | DataTypePtr original_file_type; |
124 | | std::vector<ColumnDefinition> original_file_children; |
125 | | // Split/file-local constant entry when this mapping is produced from partition/default/virtual |
126 | | // expression instead of physical file data. |
127 | | std::optional<ConstantIndex> constant_index; |
128 | | // Effective file type after applying casts/remaps/nested projection pruning. |
129 | | DataTypePtr file_type; |
130 | | // Target table/global type after final materialization. |
131 | | DataTypePtr table_type; |
132 | | |
133 | | // Final output expression. Converts a file-local value into a table/global value, e.g. cast, |
134 | | // default, partition, generated column, or complex-column remap. |
135 | | VExprContextSPtr projection; |
136 | | |
137 | | // Mapping tree for nested table children. The order follows table output children, while file |
138 | | // children can be pruned/reordered through each child mapping's file-reader local id. |
139 | | std::vector<ColumnMapping> child_mappings; |
140 | | // True when file value can be used directly as table value without cast or child remap. |
141 | | bool is_trivial = false; |
142 | | // True when the nested value read from file has a pruned/remapped child layout and must be |
143 | | // reconstructed before returning to table/global schema. |
144 | | bool has_complex_projection = false; |
145 | | // How filters referencing this table/global column can be converted below table-reader |
146 | | // finalize. This is metadata for localize_filters() and future constant-filter evaluation. |
147 | | FilterConversionType filter_conversion = FilterConversionType::FINALIZE_ONLY; |
148 | | TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID; |
149 | | VExprContextSPtr default_expr; |
150 | | |
151 | | std::string debug_string() const; |
152 | | }; |
153 | | |
154 | | struct TableColumnMapperOptions { /* Options passed to the TableColumnMapper constructor. */ |
155 | | TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; /* Column matching strategy; defaults to BY_FIELD_ID. */ |
156 | | bool allow_missing_columns = true; /* Defaults to true. NOTE(review): presumably lets table columns that are absent from the file schema be mapped instead of failing - confirm. */ |
157 | | |
158 | | std::string debug_string() const; |
159 | | }; |
160 | | |
161 | | Status clone_table_expr_tree(const VExprSPtr& expr, VExprSPtr* cloned_expr); /* NOTE(review): presumably clones the expression tree rooted at expr into *cloned_expr - confirm against the definition. */ |
162 | | |
163 | | // Generic mapping layer from table schema to file schema. |
164 | | // Iceberg uses BY_FIELD_ID; ordinary by-name scenarios can reuse this component, so it should not be named as an |
165 | | // Iceberg-only component. |
166 | | class TableColumnMapper { |
167 | | public: |
168 | | explicit TableColumnMapper(TableColumnMapperOptions options = {}) |
169 | 0 | : _options(std::move(options)) {} |
170 | 0 | virtual ~TableColumnMapper() = default; |
171 | | |
172 | | // Builds the column mapping from table schema to file schema. |
173 | | // The resulting ColumnMapping describes how a table column is obtained from a file column, a constant column, or an expression; |
174 | | // subsequent projection, filter localization, and table block finalize should all reuse this mapping. |
175 | | virtual Status create_mapping(const std::vector<ColumnDefinition>& projected_columns, |
176 | | const std::map<std::string, Field>& partition_values, |
177 | | const std::vector<ColumnDefinition>& file_schema); |
178 | | |
179 | | // Converts a table-level scan request into a file-local scan request. table_filters keep row-level |
180 | | // filtering semantics and are converted into file-local conjuncts; table_column_predicates are only converted into file-layer |
181 | | // pruning hints and do not take part in batch row filtering. |
182 | | virtual Status create_scan_request(const std::vector<TableFilter>& table_filters, |
183 | | const TableColumnPredicates& table_column_predicates, |
184 | | const std::vector<ColumnDefinition>& projected_columns, |
185 | | FileScanRequest* file_request, |
186 | | RuntimeState* runtime_state = nullptr); |
187 | | |
188 | | // Localizes table-level filters onto the file schema. |
189 | | // A trivial mapping can copy structured predicates directly; when the type changes a safe cast can be attempted; expressions that cannot be |
190 | | // safely pushed down should be handled through reader_expression_map or the table-level finalize/filter fallback. |
191 | | virtual Status localize_filters(const std::vector<TableFilter>& table_filters, |
192 | | const TableColumnPredicates& table_column_predicates, |
193 | | FileScanRequest* file_request, |
194 | | RuntimeState* runtime_state = nullptr); |
195 | 0 | void clear() { |
196 | 0 | _mappings.clear(); |
197 | 0 | _constant_map.clear(); |
198 | 0 | _filter_entries.clear(); |
199 | 0 | } |
200 | 0 | const std::vector<ColumnMapping>& mappings() const { return _mappings; } |
201 | 0 | const std::map<GlobalIndex, FilterEntry>& filter_entries() const { return _filter_entries; } |
202 | 0 | const ConstantMap& constant_map() const { return _constant_map; } |
203 | | std::string debug_string() const; |
204 | | |
205 | | private: |
206 | | const ColumnDefinition* _find_file_field( |
207 | | const ColumnDefinition& table_column, |
208 | | const std::vector<ColumnDefinition>& file_schema) const; |
209 | | Status _create_direct_mapping(const ColumnDefinition& table_column, |
210 | | const ColumnDefinition& file_field, ColumnMapping* mapping) const; |
211 | | |
212 | | Status _create_by_index_mapping(const ColumnDefinition& table_column, |
213 | | const std::vector<ColumnDefinition>& file_schema, |
214 | | ColumnMapping* mapping); |
215 | | Status _build_filter_entries(const FileScanRequest& file_request); |
216 | | Status _build_result_column_mapping(const FileScanRequest& file_request); |
217 | | |
218 | | void _set_constant_mapping(ColumnMapping* mapping, VExprContextSPtr expr); |
219 | | |
220 | | ColumnMapping* _find_mapping(GlobalIndex global_index); |
221 | | |
222 | | TableColumnMapperOptions _options; |
223 | | // Column mapping for each projected column, in the same order as projected_columns. Each entry describes how to get one table/global column from file-local sources, and carries metadata for filter localization and result finalize. |
224 | | std::vector<ColumnMapping> _mappings; |
225 | | std::map<GlobalIndex, FilterEntry> _filter_entries; |
226 | | ConstantMap _constant_map; |
227 | | }; |
228 | | |
229 | | } // namespace doris::format |