Coverage Report

Created: 2026-06-09 07:29

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/column_data.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <algorithm>
21
#include <cstddef>
22
#include <cstdint>
23
#include <ostream>
24
#include <string>
25
#include <utility>
26
#include <vector>
27
28
#include "common/status.h"
29
#include "core/data_type/data_type.h"
30
#include "core/field.h"
31
#include "exprs/vexpr_fwd.h"
32
33
namespace doris::format {
34
35
// File-local top-level column id.
36
//
37
// Scope:
38
// - Only valid inside one physical file schema returned by FileReader::get_schema().
39
// - For Parquet, this is the top-level field ordinal in the new reader schema.
40
// - The synthetic row-position column also uses this type, with a reserved negative id.
41
//
42
// Do not use this for table/global column unique ids, block positions, nested child ids, or
43
// slot ids. Nested child ids are carried by LocalColumnIndex::index below.
44
// Strongly typed wrapper around a raw int32_t column id. A default-constructed value holds -1.
class LocalColumnId {
45
public:
46
212
    // Default-constructs the unset id (_id keeps its in-class initializer, -1).
    constexpr LocalColumnId() = default;
47
2.80k
    // Wraps a raw id. Explicit, so a plain int32_t never converts implicitly.
    explicit constexpr LocalColumnId(int32_t id) : _id(id) {}
48
49
212
    // Returns a default-constructed id, for which is_valid() is false.
    static constexpr LocalColumnId invalid() { return LocalColumnId(); }
50
51
1.22k
    // Returns the wrapped raw id.
    constexpr int32_t value() const { return _id; }
52
3
    // True only for non-negative ids: every negative id, the default -1 included, reports false.
    constexpr bool is_valid() const { return _id >= 0; }
53
54
1.81k
    // Equality, inequality and ordering all compare the wrapped raw ids.
    constexpr bool operator==(const LocalColumnId& other) const { return _id == other._id; }
55
0
    constexpr bool operator!=(const LocalColumnId& other) const { return !(*this == other); }
56
3.94k
    constexpr bool operator<(const LocalColumnId& other) const { return _id < other._id; }
57
58
private:
59
    // Raw id; stays -1 unless a value is supplied through the explicit constructor.
    int32_t _id = -1;
60
};
61
62
// Position of a file-local column in the Block produced by one FileScanRequest.
63
//
64
// This is assigned by TableColumnMapper/TableReader after predicate/non-predicate columns are
65
// deduplicated. It is not a file schema id and it is not stable across requests. Use value() only
66
// at the boundary where an existing Block or expression API still expects a size_t/int position.
67
// Strongly typed wrapper around a size_t position.
class LocalIndex {
68
public:
69
    // Default-constructs position 0. There is no "unset" sentinel, so a default-constructed
    // LocalIndex compares equal to LocalIndex(0).
    constexpr LocalIndex() = default;
70
420
    // Wraps a raw position. Explicit, so a plain size_t never converts implicitly.
    explicit constexpr LocalIndex(size_t index) : _index(index) {}
71
72
1.61k
    // Returns the wrapped raw position.
    constexpr size_t value() const { return _index; }
73
0
    // Equality and ordering compare the wrapped raw positions.
    constexpr bool operator==(const LocalIndex& other) const { return _index == other._index; }
74
0
    constexpr bool operator<(const LocalIndex& other) const { return _index < other._index; }
75
76
private:
77
    // Raw position; 0 unless a value is supplied through the explicit constructor.
    size_t _index = 0;
78
};
79
80
// Position of a table/global output column in the final Block returned by TableReader.
81
//
82
// This type is reserved for boundaries that need to refer to caller-visible column order. It must
83
// not be used to index a file-local Block, because schema evolution and lazy materialization can
84
// make file-local order different from table output order.
85
// Strongly typed wrapper around a size_t position.
class GlobalIndex {
86
public:
87
218
    // Default-constructs position 0. There is no "unset" sentinel, so a default-constructed
    // GlobalIndex compares equal to GlobalIndex(0).
    constexpr GlobalIndex() = default;
88
640
    // Wraps a raw position. Explicit, so a plain size_t never converts implicitly.
    explicit constexpr GlobalIndex(size_t index) : _index(index) {}
89
90
6
    // Returns the wrapped raw position.
    constexpr size_t value() const { return _index; }
91
2.03k
    // Equality and ordering compare the wrapped raw positions.
    constexpr bool operator==(const GlobalIndex& other) const { return _index == other._index; }
92
3.75k
    constexpr bool operator<(const GlobalIndex& other) const { return _index < other._index; }
93
94
private:
95
    // Raw position; 0 unless a value is supplied through the explicit constructor.
    size_t _index = 0;
96
};
97
98
// Index of a split-local constant/default value used to materialize columns that are not read from
99
// the physical file, such as partition columns, added columns with default values, and virtual
100
// table-format columns.
101
//
102
// It is separate from LocalIndex because constants do not occupy a position in the file reader
103
// output block unless an expression explicitly materializes them.
104
// Strongly typed wrapper around a size_t index.
class ConstantIndex {
105
public:
106
    // Default-constructs index 0. There is no "unset" sentinel, so a default-constructed
    // ConstantIndex compares equal to ConstantIndex(0).
    constexpr ConstantIndex() = default;
107
0
    // Wraps a raw index. Explicit, so a plain size_t never converts implicitly.
    explicit constexpr ConstantIndex(size_t index) : _index(index) {}
108
109
0
    // Returns the wrapped raw index.
    constexpr size_t value() const { return _index; }
110
0
    // Equality and ordering compare the wrapped raw indexes.
    constexpr bool operator==(const ConstantIndex& other) const { return _index == other._index; }
111
0
    constexpr bool operator<(const ConstantIndex& other) const { return _index < other._index; }
112
113
private:
114
    // Raw index; 0 unless a value is supplied through the explicit constructor.
    size_t _index = 0;
115
};
116
117
0
// Streams the raw id returned by LocalColumnId::value() and returns the stream for chaining.
inline std::ostream& operator<<(std::ostream& os, const LocalColumnId& id) {
118
0
    return os << id.value();
119
0
}
120
121
0
// Streams the raw position returned by LocalIndex::value() and returns the stream for chaining.
inline std::ostream& operator<<(std::ostream& os, const LocalIndex& index) {
122
0
    return os << index.value();
123
0
}
124
125
0
// Streams the raw position returned by GlobalIndex::value() and returns the stream for chaining.
inline std::ostream& operator<<(std::ostream& os, const GlobalIndex& index) {
126
0
    return os << index.value();
127
0
}
128
129
0
// Streams the raw index returned by ConstantIndex::value() and returns the stream for chaining.
inline std::ostream& operator<<(std::ostream& os, const ConstantIndex& index) {
130
0
    return os << index.value();
131
0
}
132
133
// A split/file-local constant value used to materialize a table/global column without reading a
134
// physical file column.
135
//
136
// Common producers are partition values, schema-evolution default expressions, generated columns
137
// and table-format virtual columns. The entry is keyed by ConstantIndex in ConstantMap; global_index
138
// keeps the link back to the caller-visible output column.
139
// Plain aggregate describing one constant; stored by value inside ConstantMap.
struct ConstantEntry {
140
    // Output column this constant belongs to. Default-constructed GlobalIndex is position 0.
    GlobalIndex global_index;
141
    // Expression context for the constant value.
    // NOTE(review): presumably evaluated by the consumer to produce the value -- confirm.
    VExprContextSPtr expr;
142
    // Data type stored alongside the expression.
    // NOTE(review): presumably the type of the materialized column -- confirm against callers.
    DataTypePtr type;
143
};
144
145
// Per mapping/split collection of constants.
146
//
147
// ConstantIndex only has meaning within this container. Keeping constants separate from LocalIndex
148
// makes it explicit that these values do not occupy positions in the file reader output Block.
149
// Append-only vector of ConstantEntry values addressed by ConstantIndex (the vector position).
class ConstantMap {
150
public:
151
0
    // Appends `entry` and returns the index it was stored at, i.e. the container size before the
    // append. The entry is taken by value and moved into the container.
    ConstantIndex add(ConstantEntry entry) {
152
0
        // Capture the index first: after push_back, size() would already be one past the new slot.
        const auto index = ConstantIndex(_entries.size());
153
0
        _entries.push_back(std::move(entry));
154
0
        return index;
155
0
    }
156
157
0
    // Returns the entry stored at `index`. The index must be in range; this is enforced with
    // DORIS_CHECK. The returned reference points into the internal vector, so it is only usable
    // until the vector is next modified by add() or clear().
    const ConstantEntry& get(ConstantIndex index) const {
158
0
        DORIS_CHECK(index.value() < _entries.size());
159
0
        return _entries[index.value()];
160
0
    }
161
162
120
    // Removes every entry. Indexes handed out earlier by add() no longer refer to an entry.
    void clear() { _entries.clear(); }
163
0
    // True when no entries are stored.
    bool empty() const { return _entries.empty(); }
164
0
    // Number of stored entries; also the index the next add() will return.
    size_t size() const { return _entries.size(); }
165
166
0
    // Read-only view of all entries in insertion order (position == ConstantIndex value).
    const std::vector<ConstantEntry>& entries() const { return _entries; }
167
168
private:
169
    // Entries in insertion order.
    std::vector<ConstantEntry> _entries;
170
};
171
172
// Target of a localized filter.
173
//
174
// A filter can either reference a file-local Block position or a constant entry. Unset entries mean
175
// the filter cannot be evaluated below the table-reader finalize stage.
176
// Tagged index: `kind` says whether `index` is a LocalIndex, a ConstantIndex, or unused.
struct FilterEntry {
177
    // Discriminator for `index`.
    enum class Kind {
178
        // No target; `index` carries no meaning. This is the default.
        UNSET,
179
        // `index` is the raw value of a LocalIndex.
        LOCAL,
180
        // `index` is the raw value of a ConstantIndex.
        CONSTANT,
181
    };
182
183
210
    // Builds an entry of kind LOCAL that stores index.value().
    static FilterEntry local(LocalIndex index) {
184
210
        return {.kind = Kind::LOCAL, .index = index.value()};
185
210
    }
186
187
0
    // Builds an entry of kind CONSTANT that stores index.value().
    static FilterEntry constant(ConstantIndex index) {
188
0
        return {.kind = Kind::CONSTANT, .index = index.value()};
189
0
    }
190
191
0
    // True for LOCAL and CONSTANT entries, false for UNSET.
    bool is_set() const { return kind != Kind::UNSET; }
192
423
    // True only when the entry is of kind LOCAL.
    bool is_local() const { return kind == Kind::LOCAL; }
193
1
    // True only when the entry is of kind CONSTANT.
    bool is_constant() const { return kind == Kind::CONSTANT; }
194
195
210
    // Returns the stored index re-wrapped as LocalIndex. Only legal for LOCAL entries; this is
    // enforced with DORIS_CHECK.
    LocalIndex local_index() const {
196
210
        DORIS_CHECK(is_local());
197
210
        return LocalIndex(index);
198
210
    }
199
200
0
    // Returns the stored index re-wrapped as ConstantIndex. Only legal for CONSTANT entries; this
    // is enforced with DORIS_CHECK.
    ConstantIndex constant_index() const {
201
0
        DORIS_CHECK(is_constant());
202
0
        return ConstantIndex(index);
203
0
    }
204
205
    // Both fields are public; the typed accessors above are what check `kind` before handing out
    // `index`, so reading `index` directly bypasses that check.
    Kind kind = Kind::UNSET;
206
    size_t index = 0;
207
};
208
209
// Kind of a column; stored in ColumnDefinition::column_type, where DATA_COLUMN is the default.
// This is an unscoped enum, so the enumerators are also visible without the ColumnType::
// qualifier and convert implicitly to int.
enum ColumnType {
210
    DATA_COLUMN = 0, // normal data column
211
    ROW_NUMBER = 1,  // row number in a file
212
};
213
214
// Column schema definition shared by table/global projection and file-local schema matching.
215
//
216
// ColumnDefinition intentionally carries schema identity only. FE column unique ids are translated
217
// to GlobalIndex at the FileScannerV2 boundary and must not appear in table/file reader APIs.
218
// Recursive schema node: one column plus its nested `children`, with helpers that interpret
// `identifier` according to the matching mode.
struct ColumnDefinition {
219
    // Typed identifier value used to match a column against another schema.
220
    //
221
    // - TYPE_NULL: no explicit identifier. BY_NAME falls back to ColumnDefinition::name.
222
    // - TYPE_INT: interpreted by TableColumnMapperOptions::mode as a field id or file position.
223
    // - TYPE_STRING: explicit name identifier.
224
    //
225
    // This is not the id that FileReader uses to read data. For example, a Parquet column can be
226
    // matched by its optional Parquet field_id, while the reader still addresses it by a file-local
227
    // ordinal.
228
    Field identifier;
229
    // Reader-local id of this node inside the file schema returned by FileReader::get_schema().
230
    // Top-level fields use the root column ordinal and nested fields use the child ordinal under
231
    // their parent. -1 means unset; special virtual file columns may use other negative ids.
232
    // Table/global ColumnDefinition values can leave this as -1 because they are not read directly
233
    // by a FileReader.
234
    int32_t local_id = -1;
235
    // Logical table column name. This is also the matching name for by-name file formats.
236
    std::string name;
237
    // Historical or external names for the same logical field. Table formats such as Iceberg can
238
    // use this to resolve partition path keys after column rename.
239
    std::vector<std::string> name_mapping {};
240
    // Data type of this column node.
    DataTypePtr type;
241
    // Projected nested table children. Children use table/global identifiers; they are resolved to
242
    // file-local child ids by TableColumnMapper before reaching FileReader.
243
    std::vector<ColumnDefinition> children {};
244
    // Expression used to materialize missing/default/generated values when the column is not read
245
    // directly from the file.
246
    VExprContextSPtr default_expr = nullptr;
247
    // Partition columns are constants from split metadata and should not be matched against file
248
    // schema unless table-format logic explicitly asks for it.
249
    bool is_partition_key = false;
250
    // File-local column kind. For table/global columns this remains DATA_COLUMN.
251
    ColumnType column_type = ColumnType::DATA_COLUMN;
252
253
0
    // True when `identifier` is not null, whatever its type.
    bool has_identifier() const { return !identifier.is_null(); }
254
20
    // True when `identifier` is TYPE_INT. The same check guards both get_identifier_field_id()
    // and get_identifier_position(); it cannot tell a field id from a file position.
    bool has_identifier_field_id() const { return identifier.get_type() == TYPE_INT; }
255
5.46k
    // True when `identifier` is TYPE_STRING.
    bool has_identifier_name() const { return identifier.get_type() == TYPE_STRING; }
256
257
    // DuckDB-style helper for BY_FIELD_ID matching. The mapper binds the matching mode once, so a
258
    // TYPE_INT identifier is interpreted as a field id only by the field-id matcher.
    // Requires a TYPE_INT identifier; this is enforced with DORIS_CHECK.
259
10
    int32_t get_identifier_field_id() const {
260
10
        DORIS_CHECK(has_identifier_field_id());
261
10
        return identifier.get<TYPE_INT>();
262
10
    }
263
    // DuckDB-style helper for BY_NAME matching. When no explicit string identifier is present, the
264
    // logical column name is the identifier.
    // A non-null identifier must be TYPE_STRING (enforced with DORIS_CHECK), so calling this on a
    // column whose identifier is TYPE_INT trips the check rather than falling back to `name`.
    // The returned reference points into this object (`name` or `identifier`).
265
2.73k
    const std::string& get_identifier_name() const {
266
2.73k
        if (identifier.is_null()) {
267
0
            return name;
268
0
        }
269
2.73k
        DORIS_CHECK(has_identifier_name());
270
2.73k
        return identifier.get<TYPE_STRING>();
271
2.73k
    }
272
    // Helper for BY_INDEX matching. BY_INDEX reuses the TYPE_INT identifier as the table-side file
273
    // position, matching DuckDB's typed identifier plus mapper-mode interpretation.
    // Requires a TYPE_INT identifier; this is enforced with DORIS_CHECK. Returns the same stored
    // integer as get_identifier_field_id(); only the caller's interpretation differs.
274
0
    int32_t get_identifier_position() const {
275
0
        DORIS_CHECK(has_identifier_field_id());
276
0
        return identifier.get<TYPE_INT>();
277
0
    }
278
279
    // Helper for reader-local projection and scan requests. Returns local_id whenever it is set
    // (any value other than -1, so other negative ids are returned as-is). When local_id is -1 it
    // falls back to get_identifier_field_id(), which requires a TYPE_INT identifier.
280
1.01k
    int32_t file_local_id() const {
281
1.01k
        if (local_id != -1) {
282
1.01k
            return local_id;
283
1.01k
        }
284
0
        return get_identifier_field_id();
285
1.01k
    }
286
287
    // Declared only; the definition is not part of this header section.
    std::string debug_string() const;
288
};
289
290
// Recursive file-local projection path.
291
//
292
// For a root entry in FileScanRequest::{predicate_columns, non_predicate_columns}, index is the
293
// top-level file column id and column_id() is valid. For children, index is the file-local child id
294
// under the parent node, not a table child id and not a child output ordinal.
295
//
296
// project_all_children=true means the whole subtree under this node is needed. When false, children
297
// lists the selected child paths. File readers can use this to avoid constructing readers for
298
// unprojected nested children.
299
// Recursive projection node: an id plus either "all children" or an explicit child list.
struct LocalColumnIndex {
300
    // Id of this node; -1 until set by one of the factories below or by the caller.
    int32_t index = -1;
301
    // When true the whole subtree is selected. Defaults to true, so a node built with top_level()
    // or field() is a full projection.
    bool project_all_children = true;
302
    // Selected child paths. The helpers in this header ignore it while project_all_children is
    // true.
    std::vector<LocalColumnIndex> children {};
303
304
210
    // Builds a full-projection node whose index is column_id.value().
    static LocalColumnIndex top_level(LocalColumnId column_id) {
305
210
        return {.index = column_id.value()};
306
210
    }
307
308
0
    // Builds a full-projection node for the given id.
    static LocalColumnIndex field(int32_t field_id) { return {.index = field_id}; }
309
310
0
    // Builds a partial-projection node for the given id. `children` starts empty, so until the
    // caller adds children no child of this node is selected.
    static LocalColumnIndex partial_field(int32_t field_id) {
311
0
        return {.index = field_id, .project_all_children = false};
312
0
    }
313
314
2.03k
    // Returns `index` wrapped as a LocalColumnId. The wrap is unconditional; it does not check
    // that this node is a root entry.
    LocalColumnId column_id() const { return LocalColumnId(index); }
315
371
    // Returns `index` as a raw id.
    int32_t field_id() const { return index; }
316
    // Declared only; the definition is not part of this header section.
    std::string debug_string() const;
317
};
318
319
42
// True when `projection` selects the whole subtree: either no projection was supplied (nullptr)
// or the node has project_all_children set.
inline bool is_full_projection(const LocalColumnIndex* projection) {
320
42
    return projection == nullptr || projection->project_all_children;
321
42
}
322
323
40
// True when a projection was supplied and it does not select all children. This is the exact
// negation of is_full_projection(); in particular nullptr is never partial.
inline bool is_partial_projection(const LocalColumnIndex* projection) {
324
40
    return projection != nullptr && !projection->project_all_children;
325
40
}
326
327
// Looks up the child of `projection` whose field_id() equals `field_id`.
//
// Returns a pointer into projection->children, or nullptr. nullptr has two meanings:
// - `projection` is a full projection (including nullptr), so there is no narrower child
//   projection to hand out;
// - `projection` is partial and has no child with that id.
// Use is_child_projected() to tell whether the child is actually selected.
inline const LocalColumnIndex* find_child_projection(const LocalColumnIndex* projection,
328
29
                                                     int32_t field_id) {
329
29
    if (is_full_projection(projection)) {
330
29
        return nullptr;
331
29
    }
332
0
    // Past the guard above `projection` is non-null and partial. Linear scan; the first match wins.
    const auto child_it = std::find_if(
333
0
            projection->children.begin(), projection->children.end(),
334
0
            [&](const LocalColumnIndex& child) { return child.field_id() == field_id; });
335
0
    return child_it == projection->children.end() ? nullptr : &*child_it;
336
29
}
337
338
13
// True when the child with id `field_id` is selected by `projection`: always for a full
// projection (including nullptr), otherwise only when the child is listed in its children.
inline bool is_child_projected(const LocalColumnIndex* projection, int32_t field_id) {
339
13
    return is_full_projection(projection) || find_child_projection(projection, field_id) != nullptr;
340
13
}
341
342
// Merge two projection trees that point to the same file-local node.
343
//
344
// A full projection dominates a partial projection. Two partial projections are merged by child id
345
// and recursively union their child paths. The caller must only merge projections for the same
346
// root/child node.
347
0
// Merges `source` into `*target` in place.
//
// Preconditions (enforced with DORIS_CHECK): `target` is non-null and both nodes carry the same
// index.
// Returns Status::OK() on every path in this function; a non-OK status could only come back
// from the recursive call through RETURN_IF_ERROR.
inline Status merge_local_column_index(LocalColumnIndex* target, const LocalColumnIndex& source) {
348
0
    DORIS_CHECK(target != nullptr);
349
0
    DORIS_CHECK(target->index == source.index);
350
0
    // Target already selects everything: nothing in source can widen it.
    if (target->project_all_children) {
351
0
        return Status::OK();
352
0
    }
353
0
    // Source selects everything: promote target to a full projection and drop its child list,
    // which is not consulted for a full projection.
    if (source.project_all_children) {
354
0
        target->project_all_children = true;
355
0
        target->children.clear();
356
0
        return Status::OK();
357
0
    }
358
0
    // Both sides are partial: union the child lists, matching children by index.
    for (const auto& source_child : source.children) {
359
0
        auto target_child_it = std::find_if(
360
0
                target->children.begin(), target->children.end(),
361
0
                [&](const LocalColumnIndex& child) { return child.index == source_child.index; });
362
0
        if (target_child_it == target->children.end()) {
363
0
            // Child only present in source: copy its whole subtree into target.
            target->children.push_back(source_child);
364
0
            continue;
365
0
        }
366
0
        // Child present on both sides: merge the two subtrees recursively.
        RETURN_IF_ERROR(merge_local_column_index(&*target_child_it, source_child));
367
0
    }
368
0
    return Status::OK();
369
0
}
370
371
} // namespace doris::format