be/src/format_v2/column_data.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstddef> |
22 | | #include <cstdint> |
23 | | #include <ostream> |
24 | | #include <string> |
25 | | #include <utility> |
26 | | #include <vector> |
27 | | |
28 | | #include "common/status.h" |
29 | | #include "core/data_type/data_type.h" |
30 | | #include "core/field.h" |
31 | | #include "exprs/vexpr_fwd.h" |
32 | | |
33 | | namespace doris::format { |
34 | | |
// Strongly typed id of a top-level column inside one physical file schema.
//
// Scope:
// - Meaningful only within the schema returned by FileReader::get_schema() for a single file.
// - For Parquet it is the ordinal of the top-level field in the new reader schema.
// - The synthetic row-position column is expressed with this type as well, using a reserved
//   negative id.
//
// This is not a table/global column unique id, a block position, a nested child id, or a slot
// id. Nested child ids travel in LocalColumnIndex::index instead.
class LocalColumnId {
public:
    // Default construction yields the unset id (-1).
    constexpr LocalColumnId() = default;
    explicit constexpr LocalColumnId(int32_t id) : _id {id} {}

    // Named spelling of the default-constructed (unset) id.
    static constexpr LocalColumnId invalid() { return LocalColumnId {}; }

    // Raw id, for boundaries that still need the plain integer.
    constexpr int32_t value() const { return _id; }
    // True only for non-negative ids; every negative id reports false.
    constexpr bool is_valid() const { return !(_id < 0); }

    constexpr bool operator==(const LocalColumnId& rhs) const { return _id == rhs._id; }
    constexpr bool operator!=(const LocalColumnId& rhs) const { return _id != rhs._id; }
    constexpr bool operator<(const LocalColumnId& rhs) const { return _id < rhs._id; }

private:
    int32_t _id = -1;
};
61 | | |
// Strongly typed position of a file-local column inside the Block produced by one
// FileScanRequest.
//
// TableColumnMapper/TableReader assign it after predicate and non-predicate columns have been
// deduplicated. It is neither a file schema id nor stable across requests. Call value() only at
// a boundary where an existing Block or expression API still expects a size_t/int position.
class LocalIndex {
public:
    // Default construction yields position 0.
    constexpr LocalIndex() = default;
    explicit constexpr LocalIndex(size_t index) : _index {index} {}

    // Raw position for APIs that take a plain integer.
    constexpr size_t value() const { return _index; }
    constexpr bool operator==(const LocalIndex& rhs) const { return rhs._index == _index; }
    constexpr bool operator<(const LocalIndex& rhs) const { return rhs._index > _index; }

private:
    size_t _index = 0;
};
79 | | |
// Strongly typed position of a table/global output column in the final Block returned by
// TableReader.
//
// Reserved for boundaries that must refer to the caller-visible column order. Never use it to
// index a file-local Block: schema evolution and lazy materialization can make the file-local
// order differ from the table output order.
class GlobalIndex {
public:
    // Default construction yields position 0.
    constexpr GlobalIndex() = default;
    explicit constexpr GlobalIndex(size_t index) : _index {index} {}

    // Raw position for APIs that take a plain integer.
    constexpr size_t value() const { return _index; }
    constexpr bool operator==(const GlobalIndex& rhs) const { return rhs._index == _index; }
    constexpr bool operator<(const GlobalIndex& rhs) const { return rhs._index > _index; }

private:
    size_t _index = 0;
};
97 | | |
// Strongly typed index of a split-local constant/default value. Such values materialize columns
// that are not read from the physical file: partition columns, added columns with default
// values, and virtual table-format columns.
//
// Kept distinct from LocalIndex because a constant does not occupy a position in the file
// reader output block unless an expression explicitly materializes it.
class ConstantIndex {
public:
    // Default construction yields index 0.
    constexpr ConstantIndex() = default;
    explicit constexpr ConstantIndex(size_t index) : _index {index} {}

    // Raw index for APIs that take a plain integer.
    constexpr size_t value() const { return _index; }
    constexpr bool operator==(const ConstantIndex& rhs) const { return rhs._index == _index; }
    constexpr bool operator<(const ConstantIndex& rhs) const { return rhs._index > _index; }

private:
    size_t _index = 0;
};
116 | | |
117 | 0 | inline std::ostream& operator<<(std::ostream& os, const LocalColumnId& id) { |
118 | 0 | return os << id.value(); |
119 | 0 | } |
120 | | |
121 | 0 | inline std::ostream& operator<<(std::ostream& os, const LocalIndex& index) { |
122 | 0 | return os << index.value(); |
123 | 0 | } |
124 | | |
125 | 0 | inline std::ostream& operator<<(std::ostream& os, const GlobalIndex& index) { |
126 | 0 | return os << index.value(); |
127 | 0 | } |
128 | | |
129 | 0 | inline std::ostream& operator<<(std::ostream& os, const ConstantIndex& index) { |
130 | 0 | return os << index.value(); |
131 | 0 | } |
132 | | |
133 | | // A split/file-local constant value used to materialize a table/global column without reading a |
134 | | // physical file column. |
135 | | // |
136 | | // Common producers are partition values, schema-evolution default expressions, generated columns |
137 | | // and table-format virtual columns. The entry is keyed by ConstantIndex in ConstantMap; global_index |
138 | | // keeps the link back to the caller-visible output column. |
139 | | struct ConstantEntry { |
140 | | GlobalIndex global_index; |
141 | | VExprContextSPtr expr; |
142 | | DataTypePtr type; |
143 | | }; |
144 | | |
145 | | // Per mapping/split collection of constants. |
146 | | // |
147 | | // ConstantIndex only has meaning within this container. Keeping constants separate from LocalIndex |
148 | | // makes it explicit that these values do not occupy positions in the file reader output Block. |
149 | | class ConstantMap { |
150 | | public: |
151 | 0 | ConstantIndex add(ConstantEntry entry) { |
152 | 0 | const auto index = ConstantIndex(_entries.size()); |
153 | 0 | _entries.push_back(std::move(entry)); |
154 | 0 | return index; |
155 | 0 | } |
156 | | |
157 | 0 | const ConstantEntry& get(ConstantIndex index) const { |
158 | 0 | DORIS_CHECK(index.value() < _entries.size()); |
159 | 0 | return _entries[index.value()]; |
160 | 0 | } |
161 | | |
162 | 120 | void clear() { _entries.clear(); } |
163 | 0 | bool empty() const { return _entries.empty(); } |
164 | 0 | size_t size() const { return _entries.size(); } |
165 | | |
166 | 0 | const std::vector<ConstantEntry>& entries() const { return _entries; } |
167 | | |
168 | | private: |
169 | | std::vector<ConstantEntry> _entries; |
170 | | }; |
171 | | |
172 | | // Target of a localized filter. |
173 | | // |
174 | | // A filter can either reference a file-local Block position or a constant entry. Unset entries mean |
175 | | // the filter cannot be evaluated below the table-reader finalize stage. |
176 | | struct FilterEntry { |
177 | | enum class Kind { |
178 | | UNSET, |
179 | | LOCAL, |
180 | | CONSTANT, |
181 | | }; |
182 | | |
183 | 210 | static FilterEntry local(LocalIndex index) { |
184 | 210 | return {.kind = Kind::LOCAL, .index = index.value()}; |
185 | 210 | } |
186 | | |
187 | 0 | static FilterEntry constant(ConstantIndex index) { |
188 | 0 | return {.kind = Kind::CONSTANT, .index = index.value()}; |
189 | 0 | } |
190 | | |
191 | 0 | bool is_set() const { return kind != Kind::UNSET; } |
192 | 423 | bool is_local() const { return kind == Kind::LOCAL; } |
193 | 1 | bool is_constant() const { return kind == Kind::CONSTANT; } |
194 | | |
195 | 210 | LocalIndex local_index() const { |
196 | 210 | DORIS_CHECK(is_local()); |
197 | 210 | return LocalIndex(index); |
198 | 210 | } |
199 | | |
200 | 0 | ConstantIndex constant_index() const { |
201 | 0 | DORIS_CHECK(is_constant()); |
202 | 0 | return ConstantIndex(index); |
203 | 0 | } |
204 | | |
205 | | Kind kind = Kind::UNSET; |
206 | | size_t index = 0; |
207 | | }; |
208 | | |
// Kind of a column.
enum ColumnType {
    // Normal data column.
    DATA_COLUMN = 0,
    // Row number in a file.
    ROW_NUMBER = 1,
};
213 | | |
214 | | // Column schema definition shared by table/global projection and file-local schema matching. |
215 | | // |
216 | | // ColumnDefinition intentionally carries schema identity only. FE column unique ids are translated |
217 | | // to GlobalIndex at the FileScannerV2 boundary and must not appear in table/file reader APIs. |
218 | | struct ColumnDefinition { |
219 | | // Typed identifier value used to match a column against another schema. |
220 | | // |
221 | | // - TYPE_NULL: no explicit identifier. BY_NAME falls back to ColumnDefinition::name. |
222 | | // - TYPE_INT: interpreted by TableColumnMapperOptions::mode as a field id or file position. |
223 | | // - TYPE_STRING: explicit name identifier. |
224 | | // |
225 | | // This is not the id that FileReader uses to read data. For example, a Parquet column can be |
226 | | // matched by its optional Parquet field_id, while the reader still addresses it by a file-local |
227 | | // ordinal. |
228 | | Field identifier; |
229 | | // Reader-local id of this node inside the file schema returned by FileReader::get_schema(). |
230 | | // Top-level fields use the root column ordinal and nested fields use the child ordinal under |
231 | | // their parent. -1 means unset; special virtual file columns may use other negative ids. |
232 | | // Table/global ColumnDefinition values can leave this as -1 because they are not read directly |
233 | | // by a FileReader. |
234 | | int32_t local_id = -1; |
235 | | // Logical table column name. This is also the matching name for by-name file formats. |
236 | | std::string name; |
237 | | // Historical or external names for the same logical field. Table formats such as Iceberg can |
238 | | // use this to resolve partition path keys after column rename. |
239 | | std::vector<std::string> name_mapping {}; |
240 | | DataTypePtr type; |
241 | | // Projected nested table children. Children use table/global identifiers; they are resolved to |
242 | | // file-local child ids by TableColumnMapper before reaching FileReader. |
243 | | std::vector<ColumnDefinition> children {}; |
244 | | // Expression used to materialize missing/default/generated values when the column is not read |
245 | | // directly from the file. |
246 | | VExprContextSPtr default_expr = nullptr; |
247 | | // Partition columns are constants from split metadata and should not be matched against file |
248 | | // schema unless table-format logic explicitly asks for it. |
249 | | bool is_partition_key = false; |
250 | | // File-local column kind. For table/global columns this remains DATA_COLUMN. |
251 | | ColumnType column_type = ColumnType::DATA_COLUMN; |
252 | | |
253 | 0 | bool has_identifier() const { return !identifier.is_null(); } |
254 | 20 | bool has_identifier_field_id() const { return identifier.get_type() == TYPE_INT; } |
255 | 5.46k | bool has_identifier_name() const { return identifier.get_type() == TYPE_STRING; } |
256 | | |
257 | | // DuckDB-style helper for BY_FIELD_ID matching. The mapper binds the matching mode once, so a |
258 | | // TYPE_INT identifier is interpreted as a field id only by the field-id matcher. |
259 | 10 | int32_t get_identifier_field_id() const { |
260 | 10 | DORIS_CHECK(has_identifier_field_id()); |
261 | 10 | return identifier.get<TYPE_INT>(); |
262 | 10 | } |
263 | | // DuckDB-style helper for BY_NAME matching. When no explicit string identifier is present, the |
264 | | // logical column name is the identifier. |
265 | 2.73k | const std::string& get_identifier_name() const { |
266 | 2.73k | if (identifier.is_null()) { |
267 | 0 | return name; |
268 | 0 | } |
269 | 2.73k | DORIS_CHECK(has_identifier_name()); |
270 | 2.73k | return identifier.get<TYPE_STRING>(); |
271 | 2.73k | } |
272 | | // Helper for BY_INDEX matching. BY_INDEX reuses the TYPE_INT identifier as the table-side file |
273 | | // position, matching DuckDB's typed identifier plus mapper-mode interpretation. |
274 | 0 | int32_t get_identifier_position() const { |
275 | 0 | DORIS_CHECK(has_identifier_field_id()); |
276 | 0 | return identifier.get<TYPE_INT>(); |
277 | 0 | } |
278 | | |
279 | | // Helper for reader-local projection and scan requests. |
280 | 1.01k | int32_t file_local_id() const { |
281 | 1.01k | if (local_id != -1) { |
282 | 1.01k | return local_id; |
283 | 1.01k | } |
284 | 0 | return get_identifier_field_id(); |
285 | 1.01k | } |
286 | | |
287 | | std::string debug_string() const; |
288 | | }; |
289 | | |
290 | | // Recursive file-local projection path. |
291 | | // |
292 | | // For a root entry in FileScanRequest::{predicate_columns, non_predicate_columns}, index is the |
293 | | // top-level file column id and column_id() is valid. For children, index is the file-local child id |
294 | | // under the parent node, not a table child id and not a child output ordinal. |
295 | | // |
296 | | // project_all_children=true means the whole subtree under this node is needed. When false, children |
297 | | // lists the selected child paths. File readers can use this to avoid constructing readers for |
298 | | // unprojected nested children. |
299 | | struct LocalColumnIndex { |
300 | | int32_t index = -1; |
301 | | bool project_all_children = true; |
302 | | std::vector<LocalColumnIndex> children {}; |
303 | | |
304 | 210 | static LocalColumnIndex top_level(LocalColumnId column_id) { |
305 | 210 | return {.index = column_id.value()}; |
306 | 210 | } |
307 | | |
308 | 0 | static LocalColumnIndex field(int32_t field_id) { return {.index = field_id}; } |
309 | | |
310 | 0 | static LocalColumnIndex partial_field(int32_t field_id) { |
311 | 0 | return {.index = field_id, .project_all_children = false}; |
312 | 0 | } |
313 | | |
314 | 2.03k | LocalColumnId column_id() const { return LocalColumnId(index); } |
315 | 371 | int32_t field_id() const { return index; } |
316 | | std::string debug_string() const; |
317 | | }; |
318 | | |
319 | 42 | inline bool is_full_projection(const LocalColumnIndex* projection) { |
320 | 42 | return projection == nullptr || projection->project_all_children; |
321 | 42 | } |
322 | | |
323 | 40 | inline bool is_partial_projection(const LocalColumnIndex* projection) { |
324 | 40 | return projection != nullptr && !projection->project_all_children; |
325 | 40 | } |
326 | | |
327 | | inline const LocalColumnIndex* find_child_projection(const LocalColumnIndex* projection, |
328 | 29 | int32_t field_id) { |
329 | 29 | if (is_full_projection(projection)) { |
330 | 29 | return nullptr; |
331 | 29 | } |
332 | 0 | const auto child_it = std::find_if( |
333 | 0 | projection->children.begin(), projection->children.end(), |
334 | 0 | [&](const LocalColumnIndex& child) { return child.field_id() == field_id; }); |
335 | 0 | return child_it == projection->children.end() ? nullptr : &*child_it; |
336 | 29 | } |
337 | | |
338 | 13 | inline bool is_child_projected(const LocalColumnIndex* projection, int32_t field_id) { |
339 | 13 | return is_full_projection(projection) || find_child_projection(projection, field_id) != nullptr; |
340 | 13 | } |
341 | | |
342 | | // Merge two projection trees that point to the same file-local node. |
343 | | // |
344 | | // A full projection dominates a partial projection. Two partial projections are merged by child id |
345 | | // and recursively union their child paths. The caller must only merge projections for the same |
346 | | // root/child node. |
347 | 0 | inline Status merge_local_column_index(LocalColumnIndex* target, const LocalColumnIndex& source) { |
348 | 0 | DORIS_CHECK(target != nullptr); |
349 | 0 | DORIS_CHECK(target->index == source.index); |
350 | 0 | if (target->project_all_children) { |
351 | 0 | return Status::OK(); |
352 | 0 | } |
353 | 0 | if (source.project_all_children) { |
354 | 0 | target->project_all_children = true; |
355 | 0 | target->children.clear(); |
356 | 0 | return Status::OK(); |
357 | 0 | } |
358 | 0 | for (const auto& source_child : source.children) { |
359 | 0 | auto target_child_it = std::find_if( |
360 | 0 | target->children.begin(), target->children.end(), |
361 | 0 | [&](const LocalColumnIndex& child) { return child.index == source_child.index; }); |
362 | 0 | if (target_child_it == target->children.end()) { |
363 | 0 | target->children.push_back(source_child); |
364 | 0 | continue; |
365 | 0 | } |
366 | 0 | RETURN_IF_ERROR(merge_local_column_index(&*target_child_it, source_child)); |
367 | 0 | } |
368 | 0 | return Status::OK(); |
369 | 0 | } |
370 | | |
371 | | } // namespace doris::format |