be/src/format/parquet/schema_desc.h

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <gen_cpp/Planner_types.h>
#include <gen_cpp/parquet_types.h>
#include <stddef.h>
#include <stdint.h>

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "common/cast_set.h"
#include "common/status.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_nothing.h"
#include "util/slice.h"

namespace doris {
#include "common/compile_check_begin.h"

// Sentinel column ID: the value FieldSchema::column_id holds by default,
// i.e. before any real ID has been stored in it.
constexpr uint64_t UNASSIGNED_COLUMN_ID {UINT64_MAX};

// One node of the schema tree built by FieldDescriptor from parquet schema
// elements. Sub-fields of a nested field are stored by value in `children`.
struct FieldSchema {
    std::string name;
    std::string lower_case_name; // for hms column name case insensitive match
    // the referenced parquet schema element
    tparquet::SchemaElement parquet_schema;

    // Used to identify whether this field is a nested field.
    // Starts out as DataTypeNothing (see the default constructor).
    DataTypePtr data_type;

    // Only valid when this field is a leaf node.
    // Value-initialized (zero) so that copying a node whose physical type was
    // never set does not read an indeterminate enum value.
    tparquet::Type::type physical_type {};
    // The index order in FieldDescriptor._physical_fields
    int physical_column_index = -1;
    int16_t definition_level = 0;
    int16_t repetition_level = 0;
    int16_t repeated_parent_def_level = 0;
    std::vector<FieldSchema> children;

    //For UInt8 -> Int16,UInt16 -> Int32,UInt32 -> Int64,UInt64 -> Int128.
    bool is_type_compatibility = false;

    // column_id is not listed here: its default member initializer below
    // already sets it to UNASSIGNED_COLUMN_ID.
    FieldSchema() : data_type(std::make_shared<DataTypeNothing>()) {}
    ~FieldSchema() = default;

    // All copy/move operations are declared explicitly. A user-declared
    // destructor or copy constructor suppresses the implicit move operations,
    // which would silently turn every move of a FieldSchema (including its
    // whole `children` subtree) into a deep copy.
    FieldSchema(const FieldSchema& fieldSchema) = default;
    FieldSchema& operator=(const FieldSchema& fieldSchema) = default;
    FieldSchema(FieldSchema&& fieldSchema) = default;
    FieldSchema& operator=(FieldSchema&& fieldSchema) = default;

    std::string debug_string() const;

    int32_t field_id = -1;
    uint64_t column_id = UNASSIGNED_COLUMN_ID;
    uint64_t max_column_id = 0; // Maximum column ID for this field and its children

    // Column ID assignment and lookup methods (defined out of line).
    // NOTE(review): next_id is taken by non-const reference, so it is
    // presumably advanced as IDs are handed out to this field and its
    // children - confirm against the definition.
    void assign_ids(uint64_t& next_id);
    // NOTE(review): presumably searches this field and its children for
    // target_id and returns nullptr when nothing matches - confirm against
    // the definition.
    const FieldSchema* find_column_by_id(uint64_t target_id) const;
    uint64_t get_column_id() const;
    void set_column_id(uint64_t id);
    uint64_t get_max_column_id() const;
};

// Holds the schema of a parquet file as a tree of FieldSchema nodes, built
// from the flat list of thrift schema elements by parse_from_thrift().
//
// NOTE(review): _physical_fields and _name_to_field store raw FieldSchema
// pointers, presumably pointing into _fields (and its nested children). If so,
// a copy of a FieldDescriptor would still point into the source object's
// storage - confirm before copying instances of this class.
class FieldDescriptor {
private:
    // Only the schema elements at the first level
    std::vector<FieldSchema> _fields;
    // The leaf node of schema elements
    std::vector<FieldSchema*> _physical_fields;
    // Name to _fields, not all schema elements
    std::unordered_map<std::string, FieldSchema*> _name_to_field;
    // Used in parse_from_thrift, marking the next schema position that should be parsed.
    // Initialized so that a default-constructed descriptor never carries an
    // indeterminate value.
    size_t _next_schema_pos = 0;
    // useful for parse_node_field to decide whether to convert byte_array to VARBINARY type
    bool _enable_mapping_varbinary = false;
    bool _enable_mapping_timestamp_tz = false;

    // Parsing helpers, defined out of line. The list/map/struct/group/node
    // variants receive the full schema element list plus the position of the
    // element to parse, and fill in the FieldSchema passed by pointer.
    void parse_physical_field(const tparquet::SchemaElement& physical_schema, bool is_nullable,
                              FieldSchema* physical_field);

    Status parse_list_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
                            FieldSchema* list_field);

    Status parse_map_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
                           FieldSchema* map_field);

    Status parse_struct_field(const std::vector<tparquet::SchemaElement>& t_schemas,
                              size_t curr_pos, FieldSchema* struct_field);

    Status parse_group_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
                             FieldSchema* group_field);

    Status parse_node_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
                            FieldSchema* node_field);

    // Type mapping helpers, defined out of line.
    // NOTE(review): the meaning of the bool in the returned pair is not
    // visible in this header - see the definitions.
    std::pair<DataTypePtr, bool> convert_to_doris_type(tparquet::LogicalType logicalType,
                                                       bool nullable);
    std::pair<DataTypePtr, bool> convert_to_doris_type(
            const tparquet::SchemaElement& physical_schema, bool nullable);
    std::pair<DataTypePtr, bool> get_doris_type(const tparquet::SchemaElement& physical_schema,
                                                bool nullable);

public:
    FieldDescriptor() = default;
    ~FieldDescriptor() = default;

    /**
     * Parse FieldDescriptor from parquet thrift FileMetaData.
     * @param t_schemas list of schema elements
     */
    Status parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas);

    int get_column_index(const std::string& column) const;

    /**
     * Get the column(the first level schema element, maybe nested field) by index.
     * @param index Column index in _fields. It is not bounds-checked here, so
     *              the caller must pass a value smaller than size().
     */
    const FieldSchema* get_column(size_t index) const { return &_fields[index]; }

    /**
     * Get the column(the first level schema element, maybe nested field) by name.
     * @param name Column name
     * @return FieldSchema or nullptr if not exists
     */
    FieldSchema* get_column(const std::string& name) const;

    void get_column_names(std::unordered_set<std::string>* names) const;

    std::string debug_string() const;

    /** Number of first level schema elements, narrowed to int32_t through cast_set. */
    int32_t size() const { return cast_set<int32_t>(_fields.size()); }

    /** Read-only access to the first level schema elements. */
    const std::vector<FieldSchema>& get_fields_schema() const { return _fields; }

    /**
     * Assign stable column IDs to schema fields.
     *
     * This uses an ORC-compatible encoding so that the results of
     * create_column_ids() are consistent across formats. IDs start from 1
     * and are assigned in a pre-order traversal (parent before children).
     * After calling this, each FieldSchema will have column_id and
     * max_column_id populated.
     */
    void assign_ids();

    const FieldSchema* find_column_by_id(uint64_t column_id) const;

    // Setters for the two mapping flags above; both default to false.
    void set_enable_mapping_varbinary(bool enable) { _enable_mapping_varbinary = enable; }
    void set_enable_mapping_timestamp_tz(bool enable) { _enable_mapping_timestamp_tz = enable; }
};
#include "common/compile_check_end.h"

} // namespace doris

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#pragma once
19
20		#include <gen_cpp/Planner_types.h>
21		#include <gen_cpp/parquet_types.h>
22		#include <stddef.h>
23		#include <stdint.h>
24
25		#include <string>
26		#include <unordered_map>
27		#include <unordered_set>
28		#include <vector>
29
30		#include "common/cast_set.h"
31		#include "common/status.h"
32		#include "core/data_type/data_type.h"
33		#include "core/data_type/data_type_nothing.h"
34		#include "util/slice.h"
35
36		namespace doris {
37		#include "common/compile_check_begin.h"
38
39		// Constant for unassigned column IDs
40		constexpr uint64_t UNASSIGNED_COLUMN_ID = UINT64_MAX;
41
42		struct FieldSchema {
43		std::string name;
44		std::string lower_case_name; // for hms column name case insensitive match
45		// the referenced parquet schema element
46		tparquet::SchemaElement parquet_schema;
47
48		// Used to identify whether this field is a nested field.
49		DataTypePtr data_type;
50
51		// Only valid when this field is a leaf node
52		tparquet::Type::type physical_type;
53		// The index order in FieldDescriptor._physical_fields
54		int physical_column_index = -1;
55		int16_t definition_level = 0;
56		int16_t repetition_level = 0;
57		int16_t repeated_parent_def_level = 0;
58		std::vector<FieldSchema> children;
59
60		//For UInt8 -> Int16,UInt16 -> Int32,UInt32 -> Int64,UInt64 -> Int128.
61		bool is_type_compatibility = false;
62
63		FieldSchema()
64	4.24k	: data_type(std::make_shared<DataTypeNothing>()), column_id(UNASSIGNED_COLUMN_ID) {}
65	8.62k	~FieldSchema() = default;
66	5.08k	FieldSchema(const FieldSchema& fieldSchema) = default;
67		std::string debug_string() const;
68
69		int32_t field_id = -1;
70		uint64_t column_id = UNASSIGNED_COLUMN_ID;
71		uint64_t max_column_id = 0; // Maximum column ID for this field and its children
72
73		// Column ID assignment and lookup methods
74		void assign_ids(uint64_t& next_id);
75		const FieldSchema* find_column_by_id(uint64_t target_id) const;
76		uint64_t get_column_id() const;
77		void set_column_id(uint64_t id);
78		uint64_t get_max_column_id() const;
79		};
80
81		class FieldDescriptor {
82		private:
83		// Only the schema elements at the first level
84		std::vector<FieldSchema> _fields;
85		// The leaf node of schema elements
86		std::vector<FieldSchema*> _physical_fields;
87		// Name to _fields, not all schema elements
88		std::unordered_map<std::string, FieldSchema*> _name_to_field;
89		// Used in from_thrift, marking the next schema position that should be parsed
90		size_t _next_schema_pos;
91		// useful for parse_node_field to decide whether to convert byte_array to VARBINARY type
92		bool _enable_mapping_varbinary = false;
93		bool _enable_mapping_timestamp_tz = false;
94
95		private:
96		void parse_physical_field(const tparquet::SchemaElement& physical_schema, bool is_nullable,
97		FieldSchema* physical_field);
98
99		Status parse_list_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
100		FieldSchema* list_field);
101
102		Status parse_map_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
103		FieldSchema* map_field);
104
105		Status parse_struct_field(const std::vector<tparquet::SchemaElement>& t_schemas,
106		size_t curr_pos, FieldSchema* struct_field);
107
108		Status parse_group_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
109		FieldSchema* group_field);
110
111		Status parse_node_field(const std::vector<tparquet::SchemaElement>& t_schemas, size_t curr_pos,
112		FieldSchema* node_field);
113
114		std::pair<DataTypePtr, bool> convert_to_doris_type(tparquet::LogicalType logicalType,
115		bool nullable);
116		std::pair<DataTypePtr, bool> convert_to_doris_type(
117		const tparquet::SchemaElement& physical_schema, bool nullable);
118		std::pair<DataTypePtr, bool> get_doris_type(const tparquet::SchemaElement& physical_schema,
119		bool nullable);
120
121		public:
122	289	FieldDescriptor() = default;
123	476	~FieldDescriptor() = default;
124
125		/**
126		* Parse FieldDescriptor from parquet thrift FileMetaData.
127		* @param t_schemas list of schema elements
128		*/
129		Status parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas);
130
131		int get_column_index(const std::string& column) const;
132
133		/**
134		* Get the column(the first level schema element, maybe nested field) by index.
135		* @param index Column index in _fields
136		*/
137	3.08k	const FieldSchema* get_column(size_t index) const { return &_fields[index]; }
138
139		/**
140		* Get the column(the first level schema element, maybe nested field) by name.
141		* @param name Column name
142		* @return FieldSchema or nullptr if not exists
143		*/
144		FieldSchema* get_column(const std::string& name) const;
145
146		void get_column_names(std::unordered_set<std::string>* names) const;
147
148		std::string debug_string() const;
149
150	2.82k	int32_t size() const { return cast_set<int32_t>(_fields.size()); }
151
152	131	const std::vector<FieldSchema>& get_fields_schema() const { return _fields; }
153
154		/**
155		* Assign stable column IDs to schema fields.
156		*
157		* This uses an ORC-compatible encoding so that the results of
158		* create_column_ids() are consistent across formats. IDs start from 1
159		* and are assigned in a pre-order traversal (parent before children).
160		* After calling this, each FieldSchema will have column_id and
161		* max_column_id populated.
162		*/
163		void assign_ids();
164
165		const FieldSchema* find_column_by_id(uint64_t column_id) const;
166	284	void set_enable_mapping_varbinary(bool enable) { _enable_mapping_varbinary = enable; }
167	284	void set_enable_mapping_timestamp_tz(bool enable) { _enable_mapping_timestamp_tz = enable; }
168		};
169		#include "common/compile_check_end.h"
170
171		} // namespace doris

Coverage Report

Created: 2026-03-15 18:33