Coverage Report

Created: 2026-07-03 18:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/column_data.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <algorithm>
21
#include <cstddef>
22
#include <cstdint>
23
#include <memory>
24
#include <ostream>
25
#include <string>
26
#include <utility>
27
#include <vector>
28
29
#include "common/consts.h"
30
#include "common/status.h"
31
#include "core/data_type/data_type.h"
32
#include "core/data_type/data_type_number.h"
33
#include "core/data_type/data_type_string.h"
34
#include "core/field.h"
35
#include "exprs/vexpr_fwd.h"
36
37
namespace doris::format {
38
39
// File-local top-level column id.
40
//
41
// Scope:
42
// - Only valid inside one physical file schema returned by FileReader::get_schema().
43
// - For Parquet, this is the top-level field ordinal in the new reader schema.
44
// - The synthetic row-position column also uses this type, with a reserved negative id.
45
//
46
// Do not use this for table/global column unique ids, block positions, nested child ids, or
47
// slot ids. Nested child ids are carried by LocalColumnIndex::index below.
48
class LocalColumnId {
49
public:
50
1.02M
    constexpr LocalColumnId() = default;
51
23.8M
    explicit constexpr LocalColumnId(int32_t id) : _id(id) {}
52
53
1.02M
    static constexpr LocalColumnId invalid() { return LocalColumnId(); }
54
55
49.2M
    constexpr int32_t value() const { return _id; }
56
222k
    constexpr bool is_valid() const { return _id >= 0; }
57
58
21.7M
    constexpr bool operator==(const LocalColumnId& other) const { return _id == other._id; }
59
315k
    constexpr bool operator!=(const LocalColumnId& other) const { return !(*this == other); }
60
12.0M
    constexpr bool operator<(const LocalColumnId& other) const { return _id < other._id; }
61
62
private:
63
    int32_t _id = -1;
64
};
65
66
// Position of a file-local column in the Block produced by one FileScanRequest.
67
//
68
// This is assigned by TableColumnMapper/TableReader after predicate/non-predicate columns are
69
// deduplicated. It is not a file schema id and it is not stable across requests. Use value() only
70
// at the boundary where an existing Block or expression API still expects a size_t/int position.
71
class LocalIndex {
72
public:
73
392k
    constexpr LocalIndex() = default;
74
620k
    explicit constexpr LocalIndex(size_t index) : _index(index) {}
75
76
51.3M
    constexpr size_t value() const { return _index; }
77
    constexpr bool operator==(const LocalIndex& other) const { return _index == other._index; }
78
0
    constexpr bool operator<(const LocalIndex& other) const { return _index < other._index; }
79
80
private:
81
    size_t _index = 0;
82
};
83
84
// Position of a table/global output column in the final Block returned by TableReader.
85
//
86
// This type is reserved for boundaries that need to refer to caller-visible column order. It must
87
// not be used to index a file-local Block, because schema evolution and lazy materialization can
88
// make file-local order different from table output order.
89
class GlobalIndex {
90
public:
91
976k
    constexpr GlobalIndex() = default;
92
925k
    explicit constexpr GlobalIndex(size_t index) : _index(index) {}
93
94
42.7k
    constexpr size_t value() const { return _index; }
95
8.75M
    constexpr bool operator==(const GlobalIndex& other) const { return _index == other._index; }
96
8.25M
    constexpr bool operator<(const GlobalIndex& other) const { return _index < other._index; }
97
98
private:
99
    size_t _index = 0;
100
};
101
102
// Index of a split-local constant/default value used to materialize columns that are not read from
103
// the physical file, such as partition columns, added columns with default values, and virtual
104
// table-format columns.
105
//
106
// It is separate from LocalIndex because constants do not occupy a position in the file reader
107
// output block unless an expression explicitly materializes them.
108
class ConstantIndex {
109
public:
110
    constexpr ConstantIndex() = default;
111
17.0k
    explicit constexpr ConstantIndex(size_t index) : _index(index) {}
112
113
19.8k
    constexpr size_t value() const { return _index; }
114
    constexpr bool operator==(const ConstantIndex& other) const { return _index == other._index; }
115
0
    constexpr bool operator<(const ConstantIndex& other) const { return _index < other._index; }
116
117
private:
118
    size_t _index = 0;
119
};
120
121
2
inline std::ostream& operator<<(std::ostream& os, const LocalColumnId& id) {
122
2
    return os << id.value();
123
2
}
124
125
0
inline std::ostream& operator<<(std::ostream& os, const LocalIndex& index) {
126
0
    return os << index.value();
127
0
}
128
129
12
inline std::ostream& operator<<(std::ostream& os, const GlobalIndex& index) {
130
12
    return os << index.value();
131
12
}
132
133
5
inline std::ostream& operator<<(std::ostream& os, const ConstantIndex& index) {
134
5
    return os << index.value();
135
5
}
136
137
// A split/file-local constant value used to materialize a table/global column without reading a
138
// physical file column.
139
//
140
// Common producers are partition values, schema-evolution default expressions, generated columns
141
// and table-format virtual columns. The entry is keyed by ConstantIndex in ConstantMap; global_index
142
// keeps the link back to the caller-visible output column.
143
struct ConstantEntry {
144
    GlobalIndex global_index;
145
    VExprContextSPtr expr;
146
    DataTypePtr type;
147
};
148
149
// Per mapping/split collection of constants.
150
//
151
// ConstantIndex only has meaning within this container. Keeping constants separate from LocalIndex
152
// makes it explicit that these values do not occupy positions in the file reader output Block.
153
class ConstantMap {
154
public:
155
14.3k
    ConstantIndex add(ConstantEntry entry) {
156
14.3k
        const auto index = ConstantIndex(_entries.size());
157
14.3k
        _entries.push_back(std::move(entry));
158
14.3k
        return index;
159
14.3k
    }
160
161
2.74k
    const ConstantEntry& get(ConstantIndex index) const {
162
2.74k
        DORIS_CHECK(index.value() < _entries.size());
163
2.74k
        return _entries[index.value()];
164
2.74k
    }
165
166
77.6k
    void clear() { _entries.clear(); }
167
    bool empty() const { return _entries.empty(); }
168
8
    size_t size() const { return _entries.size(); }
169
170
0
    const std::vector<ConstantEntry>& entries() const { return _entries; }
171
172
private:
173
    std::vector<ConstantEntry> _entries;
174
};
175
176
// Target of a localized filter.
177
//
178
// A filter can either reference a file-local Block position or a constant entry. Unset entries mean
179
// the filter cannot be evaluated below the table-reader finalize stage.
180
struct FilterEntry {
181
    enum class Kind {
182
        UNSET,
183
        LOCAL,
184
        CONSTANT,
185
    };
186
187
306k
    static FilterEntry local(LocalIndex index) {
188
306k
        return {.kind = Kind::LOCAL, .index = index.value()};
189
306k
    }
190
191
14.3k
    static FilterEntry constant(ConstantIndex index) {
192
14.3k
        return {.kind = Kind::CONSTANT, .index = index.value()};
193
14.3k
    }
194
195
0
    bool is_set() const { return kind != Kind::UNSET; }
196
658k
    bool is_local() const { return kind == Kind::LOCAL; }
197
21.6k
    bool is_constant() const { return kind == Kind::CONSTANT; }
198
199
306k
    LocalIndex local_index() const {
200
306k
        DORIS_CHECK(is_local());
201
306k
        return LocalIndex(index);
202
306k
    }
203
204
2.74k
    ConstantIndex constant_index() const {
205
2.74k
        DORIS_CHECK(is_constant());
206
2.74k
        return ConstantIndex(index);
207
2.74k
    }
208
209
    Kind kind = Kind::UNSET;
210
    size_t index = 0;
211
};
212
213
enum ColumnType {
214
    DATA_COLUMN = 0,  // normal data column
215
    ROW_NUMBER = 1,   // row number in a file
216
    GLOBAL_ROWID = 2, // global unique row id across files, used by TopN filter
217
};
218
219
struct GlobalRowIdContext {
220
    uint8_t version = 0;
221
    int64_t backend_id = 0;
222
    uint32_t file_id = 0;
223
};
224
225
// Column schema definition shared by table/global projection and file-local schema matching.
226
//
227
// ColumnDefinition intentionally carries schema identity only. FE column unique ids are translated
228
// to GlobalIndex at the FileScannerV2 boundary and must not appear in table/file reader APIs.
229
struct ColumnDefinition {
230
    // Typed identifier value used to match a column against another schema.
231
    //
232
    // - TYPE_NULL: no explicit identifier. BY_NAME falls back to ColumnDefinition::name.
233
    // - TYPE_INT: interpreted by TableColumnMapperOptions::mode as a field id or file position.
234
    // - TYPE_STRING: explicit name identifier.
235
    //
236
    // This is not the id that FileReader uses to read data. For example, a Parquet column can be
237
    // matched by its optional Parquet field_id, while the reader still addresses it by a file-local
238
    // ordinal.
239
    Field identifier;
240
    // Reader-local id of this node inside the file schema returned by FileReader::get_schema().
241
    // Top-level fields use the root column ordinal and nested fields use the child ordinal under
242
    // their parent. -1 means unset; special virtual file columns may use other negative ids.
243
    // Table/global ColumnDefinition values can leave this as -1 because they are not read directly
244
    // by a FileReader.
245
    int32_t local_id = -1;
246
    // Logical table column name. This is also the matching name for by-name file formats.
247
    std::string name;
248
    // Historical or external names for the same logical field. Table formats such as Iceberg can
249
    // use this to resolve partition path keys after column rename.
250
    std::vector<std::string> name_mapping {};
251
    DataTypePtr type;
252
    // Semantic nested children for this schema node.
253
    //
254
    // Table/global columns carry projected table children. File-local schemas returned by
255
    // FileReader::get_schema() also expose semantic children, not physical reader wrappers. For
256
    // example, MAP children are key/value and ARRAY children contain only the element field.
257
    std::vector<ColumnDefinition> children {};
258
    // Expression used to materialize missing/default/generated values when the column is not read
259
    // directly from the file.
260
    VExprContextSPtr default_expr = nullptr;
261
    // Partition columns are constants from split metadata and should not be matched against file
262
    // schema unless table-format logic explicitly asks for it.
263
    bool is_partition_key = false;
264
    // File-local column kind. For table/global columns this remains DATA_COLUMN.
265
    ColumnType column_type = ColumnType::DATA_COLUMN;
266
267
0
    bool has_identifier() const { return !identifier.is_null(); }
268
1.66M
    bool has_identifier_field_id() const { return identifier.get_type() == TYPE_INT; }
269
38.5M
    bool has_identifier_name() const { return identifier.get_type() == TYPE_STRING; }
270
271
    // DuckDB-style helper for BY_FIELD_ID matching. The mapper binds the matching mode once, so a
272
    // TYPE_INT identifier is interpreted as a field id only by the field-id matcher.
273
746k
    int32_t get_identifier_field_id() const {
274
746k
        DORIS_CHECK(has_identifier_field_id());
275
746k
        return identifier.get<TYPE_INT>();
276
746k
    }
277
    // DuckDB-style helper for BY_NAME matching. When no explicit string identifier is present, the
278
    // logical column name is the identifier.
279
19.5M
    const std::string& get_identifier_name() const {
280
19.5M
        if (identifier.is_null()) {
281
0
            return name;
282
0
        }
283
19.5M
        DORIS_CHECK(has_identifier_name());
284
19.5M
        return identifier.get<TYPE_STRING>();
285
19.5M
    }
286
    // Helper for BY_INDEX matching. BY_INDEX reuses the TYPE_INT identifier as the table-side file
287
    // position, matching DuckDB's typed identifier plus mapper-mode interpretation.
288
722
    int32_t get_identifier_position() const {
289
722
        DORIS_CHECK(has_identifier_field_id());
290
722
        return identifier.get<TYPE_INT>();
291
722
    }
292
293
    // Helper for reader-local projection and scan requests.
294
7.92M
    int32_t file_local_id() const {
295
7.92M
        if (local_id != -1) {
296
7.92M
            return local_id;
297
7.92M
        }
298
18.4E
        return get_identifier_field_id();
299
7.92M
    }
300
301
    std::string debug_string() const;
302
};
303
304
static constexpr int ROW_POSITION_COLUMN_ID = -10001;
305
static constexpr const char* ROW_POSITION_COLUMN_NAME = "__file_row_position";
306
static constexpr int GLOBAL_ROWID_COLUMN_ID = -10002;
307
308
3.28k
inline ColumnDefinition row_position_column_definition() {
309
3.28k
    ColumnDefinition field;
310
3.28k
    field.identifier = Field::create_field<TYPE_INT>(ROW_POSITION_COLUMN_ID);
311
3.28k
    field.local_id = ROW_POSITION_COLUMN_ID;
312
3.28k
    field.name = ROW_POSITION_COLUMN_NAME;
313
3.28k
    field.type = std::make_shared<DataTypeInt64>();
314
3.28k
    field.column_type = ColumnType::ROW_NUMBER;
315
3.28k
    return field;
316
3.28k
}
317
318
2.13k
inline ColumnDefinition global_rowid_column_definition() {
319
2.13k
    ColumnDefinition field;
320
2.13k
    field.identifier = Field::create_field<TYPE_STRING>(BeConsts::GLOBAL_ROWID_COL);
321
2.13k
    field.local_id = GLOBAL_ROWID_COLUMN_ID;
322
2.13k
    field.name = BeConsts::GLOBAL_ROWID_COL;
323
2.13k
    field.type = std::make_shared<DataTypeString>();
324
2.13k
    field.column_type = ColumnType::GLOBAL_ROWID;
325
2.13k
    return field;
326
2.13k
}
327
328
// Recursive file-local projection path.
329
//
330
// For a root entry in FileScanRequest::{predicate_columns, non_predicate_columns}, index is the
331
// top-level file column id and column_id() is valid. For children, index is the file-local child id
332
// under the parent node. This is the reader schema local id, not an Iceberg/Parquet field id, not a
333
// table child id, and not a child output ordinal.
334
//
335
// project_all_children=true means the whole subtree under this node is needed. When false, children
336
// lists the selected child paths. File readers can use this to avoid constructing readers for
337
// unprojected nested children.
338
struct LocalColumnIndex {
339
    int32_t index = -1;
340
    bool project_all_children = true;
341
    std::vector<LocalColumnIndex> children {};
342
343
319k
    static LocalColumnIndex top_level(LocalColumnId column_id) {
344
319k
        return {.index = column_id.value()};
345
319k
    }
346
347
20.8k
    static LocalColumnIndex local(int32_t local_id) { return {.index = local_id}; }
348
349
9.45k
    static LocalColumnIndex partial_local(int32_t local_id) {
350
9.45k
        return {.index = local_id, .project_all_children = false};
351
9.45k
    }
352
353
21.9M
    LocalColumnId column_id() const { return LocalColumnId(index); }
354
683k
    int32_t local_id() const { return index; }
355
    std::string debug_string() const;
356
};
357
358
91.9k
inline bool is_full_projection(const LocalColumnIndex* projection) {
359
91.9k
    return projection == nullptr || projection->project_all_children;
360
91.9k
}
361
362
137k
inline bool is_partial_projection(const LocalColumnIndex* projection) {
363
137k
    return projection != nullptr && !projection->project_all_children;
364
137k
}
365
366
inline const LocalColumnIndex* find_child_projection(const LocalColumnIndex* projection,
367
62.9k
                                                     int32_t local_id) {
368
62.9k
    if (is_full_projection(projection)) {
369
50.7k
        return nullptr;
370
50.7k
    }
371
12.2k
    const auto child_it = std::find_if(
372
12.2k
            projection->children.begin(), projection->children.end(),
373
15.6k
            [&](const LocalColumnIndex& child) { return child.local_id() == local_id; });
374
12.2k
    return child_it == projection->children.end() ? nullptr : &*child_it;
375
62.9k
}
376
377
28.9k
inline bool is_child_projected(const LocalColumnIndex* projection, int32_t local_id) {
378
28.9k
    return is_full_projection(projection) || find_child_projection(projection, local_id) != nullptr;
379
28.9k
}
380
381
// Merge two projection trees that point to the same file-local node.
382
//
383
// A full projection dominates a partial projection. Two partial projections are merged by child id
384
// and recursively union their child paths. The caller must only merge projections for the same
385
// root/child node.
386
5.93k
inline Status merge_local_column_index(LocalColumnIndex* target, const LocalColumnIndex& source) {
387
5.93k
    DORIS_CHECK(target != nullptr);
388
5.93k
    DORIS_CHECK(target->index == source.index);
389
5.93k
    if (target->project_all_children) {
390
5.69k
        return Status::OK();
391
5.69k
    }
392
239
    if (source.project_all_children) {
393
1
        target->project_all_children = true;
394
1
        target->children.clear();
395
1
        return Status::OK();
396
1
    }
397
239
    for (const auto& source_child : source.children) {
398
239
        auto target_child_it = std::find_if(
399
239
                target->children.begin(), target->children.end(),
400
282
                [&](const LocalColumnIndex& child) { return child.index == source_child.index; });
401
239
        if (target_child_it == target->children.end()) {
402
45
            target->children.push_back(source_child);
403
45
            continue;
404
45
        }
405
194
        RETURN_IF_ERROR(merge_local_column_index(&*target_child_it, source_child));
406
194
    }
407
238
    return Status::OK();
408
238
}
409
410
} // namespace doris::format