be/src/format/parquet/schema_desc.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <gen_cpp/Planner_types.h> |
21 | | #include <gen_cpp/parquet_types.h> |
22 | | #include <stddef.h> |
23 | | #include <stdint.h> |
24 | | |
25 | | #include <string> |
26 | | #include <unordered_map> |
27 | | #include <unordered_set> |
28 | | #include <vector> |
29 | | |
30 | | #include "common/cast_set.h" |
31 | | #include "common/status.h" |
32 | | #include "core/data_type/data_type.h" |
33 | | #include "core/data_type/data_type_nothing.h" |
34 | | #include "util/slice.h" |
35 | | |
36 | | namespace doris { |
37 | | #include "common/compile_check_begin.h" |
38 | | |
39 | | // Sentinel column ID meaning "no ID assigned"; FieldSchema::column_id defaults to this value.
40 | | constexpr uint64_t UNASSIGNED_COLUMN_ID = UINT64_MAX;
41 | | |
42 | | struct FieldSchema { // One schema node: a tparquet::SchemaElement plus derived fields; sub-fields live in `children`.
43 | | std::string name;
44 | | std::string lower_case_name; // for hms column name case insensitive match
45 | | // the referenced parquet schema element
46 | | tparquet::SchemaElement parquet_schema;
47 | |
48 | | // Used to identify whether this field is a nested field. The default constructor sets it to DataTypeNothing.
49 | | DataTypePtr data_type;
50 | |
51 | | // Only valid when this field is a leaf node
52 | | tparquet::Type::type physical_type; // NOTE(review): no initializer here or in the constructor, so indeterminate until assigned
53 | | // The index order in FieldDescriptor._physical_fields (-1 by default)
54 | | int physical_column_index = -1;
55 | | int16_t definition_level = 0;
56 | | int16_t repetition_level = 0;
57 | | int16_t repeated_parent_def_level = 0;
58 | | std::vector<FieldSchema> children;
59 | |
60 | | // For UInt8 -> Int16, UInt16 -> Int32, UInt32 -> Int64, UInt64 -> Int128.
61 | | bool is_type_compatibility = false;
62 | |
63 | | FieldSchema()
64 | 4.24k | : data_type(std::make_shared<DataTypeNothing>()), column_id(UNASSIGNED_COLUMN_ID) {} // column_id gets the same value from its member initializer below
65 | 8.62k | ~FieldSchema() = default; // NOTE(review): with the destructor and copy constructor user-declared, no implicit move operations are generated
66 | 5.08k | FieldSchema(const FieldSchema& fieldSchema) = default; // member-wise copy, including the whole `children` vector
67 | | std::string debug_string() const;
68 | |
69 | | int32_t field_id = -1;
70 | | uint64_t column_id = UNASSIGNED_COLUMN_ID;
71 | | uint64_t max_column_id = 0; // Maximum column ID for this field and its children
72 | |
73 | | // Column ID assignment and lookup methods (declared here, defined elsewhere)
74 | | void assign_ids(uint64_t& next_id); // next_id is taken by non-const reference, so the callee may update it
75 | | const FieldSchema* find_column_by_id(uint64_t target_id) const;
76 | | uint64_t get_column_id() const;
77 | | void set_column_id(uint64_t id);
78 | | uint64_t get_max_column_id() const;
79 | | };
80 | | |
81 | | class FieldDescriptor { // Holds the first-level FieldSchema entries of a parquet schema plus lookup helpers over them.
82 | | private:
83 | | // Only the schema elements at the first level
84 | | std::vector<FieldSchema> _fields;
85 | | // The leaf nodes of the schema elements
86 | | std::vector<FieldSchema*> _physical_fields;
87 | | // Name to _fields, not all schema elements
88 | | std::unordered_map<std::string, FieldSchema*> _name_to_field;
89 | | // Used in from_thrift, marking the next schema position that should be parsed
90 | | size_t _next_schema_pos; // NOTE(review): no initializer and the constructor is defaulted, so indeterminate until assigned
91 | | // Used by parse_node_field to decide whether to convert byte_array to the VARBINARY type
92 | | bool _enable_mapping_varbinary = false;
93 | | bool _enable_mapping_timestamp_tz = false; // false by default; changed through set_enable_mapping_timestamp_tz()
94 | |
95 | | private: // redundant specifier: the members above are already private
96 | | void parse_physical_field(const tparquet::SchemaElement& physical_schema, bool is_nullable,
97 | | FieldSchema* physical_field);
98 | |
99 | | Status parse_list_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
100 | | FieldSchema* list_field);
101 | |
102 | | Status parse_map_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
103 | | FieldSchema* map_field);
104 | |
105 | | Status parse_struct_field(const std::vector<tparquet::SchemaElement>& t_schemas,
106 | | size_t curr_pos, FieldSchema* struct_field);
107 | |
108 | | Status parse_group_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
109 | | FieldSchema* group_field);
110 | |
111 | | Status parse_node_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
112 | | FieldSchema* node_field);
113 | |
114 | | std::pair<DataTypePtr, bool> convert_to_doris_type(tparquet::LogicalType logicalType, // NOTE(review): LogicalType is taken by value in this overload
115 | | bool nullable);
116 | | std::pair<DataTypePtr, bool> convert_to_doris_type(
117 | | const tparquet::SchemaElement& physical_schema, bool nullable);
118 | | std::pair<DataTypePtr, bool> get_doris_type(const tparquet::SchemaElement& physical_schema,
119 | | bool nullable);
120 | |
121 | | public:
122 | 289 | FieldDescriptor() = default;
123 | 476 | ~FieldDescriptor() = default;
124 | |
125 | | /**
126 | | * Parse FieldDescriptor from parquet thrift FileMetaData.
127 | | * @param t_schemas list of schema elements
128 | | */
129 | | Status parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas);
130 | |
131 | | int get_column_index(const std::string& column) const;
132 | |
133 | | /**
134 | | * Get the column (the first level schema element, maybe a nested field) by index.
135 | | * @param index Column index in _fields; not bounds-checked (plain operator[] access)
136 | | */
137 | 3.08k | const FieldSchema* get_column(size_t index) const { return &_fields[index]; }
138 | |
139 | | /**
140 | | * Get the column (the first level schema element, maybe a nested field) by name.
141 | | * @param name Column name
142 | | * @return FieldSchema or nullptr if not exists
143 | | */
144 | | FieldSchema* get_column(const std::string& name) const; // NOTE(review): const method that returns a mutable pointer
145 | |
146 | | void get_column_names(std::unordered_set<std::string>* names) const;
147 | |
148 | | std::string debug_string() const;
149 | |
150 | 2.82k | int32_t size() const { return cast_set<int32_t>(_fields.size()); } // number of first-level fields, converted from size_t via cast_set
151 | |
152 | 131 | const std::vector<FieldSchema>& get_fields_schema() const { return _fields; }
153 | |
154 | | /**
155 | | * Assign stable column IDs to schema fields.
156 | | *
157 | | * This uses an ORC-compatible encoding so that the results of
158 | | * create_column_ids() are consistent across formats. IDs start from 1
159 | | * and are assigned in a pre-order traversal (parent before children).
160 | | * After calling this, each FieldSchema will have column_id and
161 | | * max_column_id populated.
162 | | */
163 | | void assign_ids();
164 | |
165 | | const FieldSchema* find_column_by_id(uint64_t column_id) const;
166 | 284 | void set_enable_mapping_varbinary(bool enable) { _enable_mapping_varbinary = enable; }
167 | 284 | void set_enable_mapping_timestamp_tz(bool enable) { _enable_mapping_timestamp_tz = enable; }
168 | | };
169 | | #include "common/compile_check_end.h" |
170 | | |
171 | | } // namespace doris |