Coverage Report

Created: 2026-05-17 12:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/parquet/schema_desc.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/Planner_types.h>
21
#include <gen_cpp/parquet_types.h>
22
#include <stddef.h>
23
#include <stdint.h>
24
25
#include <string>
26
#include <unordered_map>
27
#include <unordered_set>
28
#include <vector>
29
30
#include "common/cast_set.h"
31
#include "common/status.h"
32
#include "core/data_type/data_type.h"
33
#include "core/data_type/data_type_nothing.h"
34
#include "util/slice.h"
35
36
namespace doris {
37
38
// Sentinel column ID meaning "no ID has been assigned yet".
// Declared `inline` so that every translation unit including this header
// shares one definition instead of getting its own internal-linkage copy.
inline constexpr uint64_t UNASSIGNED_COLUMN_ID = UINT64_MAX;
40
41
struct FieldSchema {
42
    std::string name;
43
    std::string lower_case_name; // for hms column name case insensitive match
44
    // the referenced parquet schema element
45
    tparquet::SchemaElement parquet_schema;
46
47
    // Used to identify whether this field is a nested field.
48
    DataTypePtr data_type;
49
50
    // Only valid when this field is a leaf node
51
    tparquet::Type::type physical_type;
52
    // The index order in FieldDescriptor._physical_fields
53
    int physical_column_index = -1;
54
    int16_t definition_level = 0;
55
    int16_t repetition_level = 0;
56
    int16_t repeated_parent_def_level = 0;
57
    std::vector<FieldSchema> children;
58
59
    //For UInt8 -> Int16,UInt16 -> Int32,UInt32 -> Int64,UInt64 -> Int128.
60
    bool is_type_compatibility = false;
61
    bool is_in_variant = false;
62
63
    FieldSchema()
64
2.82k
            : data_type(std::make_shared<DataTypeNothing>()), column_id(UNASSIGNED_COLUMN_ID) {}
65
6.64k
    ~FieldSchema() = default;
66
3.81k
    FieldSchema(const FieldSchema& fieldSchema) = default;
67
    std::string debug_string() const;
68
69
    int32_t field_id = -1;
70
    uint64_t column_id = UNASSIGNED_COLUMN_ID;
71
    uint64_t max_column_id = 0; // Maximum column ID for this field and its children
72
73
    // Column ID assignment and lookup methods
74
    void assign_ids(uint64_t& next_id);
75
    const FieldSchema* find_column_by_id(uint64_t target_id) const;
76
    uint64_t get_column_id() const;
77
    void set_column_id(uint64_t id);
78
    uint64_t get_max_column_id() const;
79
};
80
81
class FieldDescriptor {
82
private:
83
    // Only the schema elements at the first level
84
    std::vector<FieldSchema> _fields;
85
    // The leaf node of schema elements
86
    std::vector<FieldSchema*> _physical_fields;
87
    // Name to _fields, not all schema elements
88
    std::unordered_map<std::string, FieldSchema*> _name_to_field;
89
    // Used in from_thrift, marking the next schema position that should be parsed
90
    size_t _next_schema_pos;
91
    // useful for parse_node_field to decide whether to convert byte_array to VARBINARY type
92
    bool _enable_mapping_varbinary = false;
93
    bool _enable_mapping_timestamp_tz = false;
94
95
private:
96
    void parse_physical_field(const tparquet::SchemaElement& physical_schema, bool is_nullable,
97
                              FieldSchema* physical_field);
98
99
    Status parse_list_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
100
                            FieldSchema* list_field);
101
102
    Status parse_map_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
103
                           FieldSchema* map_field);
104
105
    Status parse_variant_field(const std::vector<tparquet::SchemaElement>& t_schemas,
106
                               size_t curr_pos, FieldSchema* variant_field);
107
108
    Status parse_struct_field(const std::vector<tparquet::SchemaElement>& t_schemas,
109
                              size_t curr_pos, FieldSchema* struct_field);
110
111
    Status parse_group_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
112
                             FieldSchema* group_field);
113
114
    Status parse_node_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
115
                            FieldSchema* node_field);
116
117
    void rebuild_indexes();
118
119
    std::pair<DataTypePtr, bool> convert_to_doris_type(tparquet::LogicalType logicalType,
120
                                                       bool nullable);
121
    std::pair<DataTypePtr, bool> convert_to_doris_type(
122
            const tparquet::SchemaElement& physical_schema, bool nullable);
123
    std::pair<DataTypePtr, bool> get_doris_type(const tparquet::SchemaElement& physical_schema,
124
                                                bool nullable);
125
126
public:
127
121
    FieldDescriptor() = default;
128
    FieldDescriptor(const FieldDescriptor& other)
129
90
            : _fields(other._fields),
130
90
              _next_schema_pos(other._next_schema_pos),
131
90
              _enable_mapping_varbinary(other._enable_mapping_varbinary),
132
90
              _enable_mapping_timestamp_tz(other._enable_mapping_timestamp_tz) {
133
90
        rebuild_indexes();
134
90
    }
135
0
    FieldDescriptor& operator=(const FieldDescriptor& other) {
136
0
        if (this != &other) {
137
0
            _fields = other._fields;
138
0
            _next_schema_pos = other._next_schema_pos;
139
0
            _enable_mapping_varbinary = other._enable_mapping_varbinary;
140
0
            _enable_mapping_timestamp_tz = other._enable_mapping_timestamp_tz;
141
0
            rebuild_indexes();
142
0
        }
143
0
        return *this;
144
0
    }
145
211
    ~FieldDescriptor() = default;
146
147
    /**
148
     * Parse FieldDescriptor from parquet thrift FileMetaData.
149
     * @param t_schemas list of schema elements
150
     */
151
    Status parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas);
152
153
    int get_column_index(const std::string& column) const;
154
155
    /**
156
     * Get the column(the first level schema element, maybe nested field) by index.
157
     * @param index Column index in _fields
158
     */
159
1.07k
    const FieldSchema* get_column(size_t index) const { return &_fields[index]; }
160
161
    /**
162
     * Get the column(the first level schema element, maybe nested field) by name.
163
     * @param name Column name
164
     * @return FieldSchema or nullptr if not exists
165
     */
166
    FieldSchema* get_column(const std::string& name) const;
167
168
    void get_column_names(std::unordered_set<std::string>* names) const;
169
170
    std::string debug_string() const;
171
172
1.10k
    int32_t size() const { return cast_set<int32_t>(_fields.size()); }
173
174
31
    const std::vector<FieldSchema>& get_fields_schema() const { return _fields; }
175
176
    /**
177
     * Assign stable column IDs to schema fields.
178
     *
179
     * This uses an ORC-compatible encoding so that the results of
180
     * create_column_ids() are consistent across formats. IDs start from 1
181
     * and are assigned in a pre-order traversal (parent before children).
182
     * After calling this, each FieldSchema will have column_id and
183
     * max_column_id populated.
184
     */
185
    void assign_ids();
186
187
    FieldDescriptor copy_with_assigned_ids() const;
188
189
    const FieldSchema* find_column_by_id(uint64_t column_id) const;
190
107
    void set_enable_mapping_varbinary(bool enable) { _enable_mapping_varbinary = enable; }
191
107
    void set_enable_mapping_timestamp_tz(bool enable) { _enable_mapping_timestamp_tz = enable; }
192
};
193
194
} // namespace doris