be/src/format/parquet/parquet_common.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <gen_cpp/parquet_types.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <cstdint> |
24 | | #include <ostream> |
25 | | #include <regex> |
26 | | #include <string> |
27 | | #include <unordered_set> |
28 | | #include <vector> |
29 | | |
30 | | #include "core/column/column_nullable.h" |
31 | | #include "storage/segment/row_ranges.h" |
32 | | |
33 | | namespace doris { |
34 | | #include "common/compile_check_begin.h" |
35 | | using level_t = int16_t; |
36 | | |
37 | | using segment_v2::RowRange; |
38 | | using segment_v2::RowRanges; |
39 | | |
40 | | #pragma pack(1) |
41 | | struct ParquetInt96 { |
42 | | int64_t lo; // time of nanoseconds in a day |
43 | | int32_t hi; // days from julian epoch |
44 | | |
45 | 76.6M | NO_SANITIZE_UNDEFINED inline int64_t to_timestamp_micros() const { |
46 | 76.6M | return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND; |
47 | 76.6M | } |
48 | 0 | inline __int128 to_int128() const { |
49 | 0 | __int128 ans = 0; |
50 | 0 | ans = (((__int128)hi) << 64) + lo; |
51 | 0 | return ans; |
52 | 0 | } |
53 | | |
54 | | static const int32_t JULIAN_EPOCH_OFFSET_DAYS; |
55 | | static const int64_t MICROS_IN_DAY; |
56 | | static const int64_t NANOS_PER_MICROSECOND; |
57 | | }; |
58 | | #pragma pack() |
59 | | static_assert(sizeof(ParquetInt96) == 12, "The size of ParquetInt96 is not 12."); |
60 | | |
61 | | class FilterMap { |
62 | | public: |
63 | 322k | FilterMap() = default; |
64 | | Status init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all); |
65 | | |
66 | | Status generate_nested_filter_map(const std::vector<level_t>& rep_levels, |
67 | | std::vector<uint8_t>& nested_filter_map_data, |
68 | | std::unique_ptr<FilterMap>* nested_filter_map, |
69 | | size_t* current_row_ptr, size_t start_index = 0) const; |
70 | | |
71 | 711M | const uint8_t* filter_map_data() const { return _filter_map_data; } |
72 | 680M | size_t filter_map_size() const { return _filter_map_size; } |
73 | 3.77M | bool has_filter() const { return _has_filter; } |
74 | 27.2k | bool filter_all() const { return _filter_all; } |
75 | 351k | double filter_ratio() const { return _has_filter ? _filter_ratio : 0; } |
76 | | |
77 | | bool can_filter_all(size_t remaining_num_values, size_t filter_map_index); |
78 | | |
79 | | private: |
80 | | bool _has_filter = false; |
81 | | bool _filter_all = false; |
82 | | const uint8_t* _filter_map_data = nullptr; |
83 | | size_t _filter_map_size = 0; |
84 | | double _filter_ratio = 0; |
85 | | }; |
86 | | |
87 | | class ColumnSelectVector { |
88 | | public: |
89 | | enum DataReadType : uint8_t { CONTENT = 0, NULL_DATA, FILTERED_CONTENT, FILTERED_NULL }; |
90 | | |
91 | 1.17M | ColumnSelectVector() = default; |
92 | | |
93 | | Status init(const std::vector<uint16_t>& run_length_null_map, size_t num_values, |
94 | | NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, |
95 | | const std::unordered_set<size_t>* skipped_indices = nullptr); |
96 | | |
97 | 5.43M | size_t num_values() const { return _num_values; } |
98 | | |
99 | 973k | size_t num_nulls() const { return _num_nulls; } |
100 | | |
101 | 942k | size_t num_filtered() const { return _num_filtered; } |
102 | | |
103 | 1.17M | bool has_filter() const { return _has_filter; } |
104 | | |
105 | | template <bool has_filter> |
106 | 61.3M | size_t get_next_run(DataReadType* data_read_type) { |
107 | 61.3M | DCHECK_EQ(_has_filter, has_filter); |
108 | 61.3M | if constexpr (has_filter) { |
109 | 57.5M | if (_read_index == _num_values) { |
110 | 315k | return 0; |
111 | 315k | } |
112 | 57.2M | const DataReadType& type = _data_map[_read_index++]; |
113 | 57.2M | size_t run_length = 1; |
114 | 676M | while (_read_index < _num_values) { |
115 | 673M | if (_data_map[_read_index] == type) { |
116 | 619M | run_length++; |
117 | 619M | _read_index++; |
118 | 619M | } else { |
119 | 54.1M | break; |
120 | 54.1M | } |
121 | 673M | } |
122 | 57.2M | *data_read_type = type; |
123 | 57.2M | return run_length; |
124 | 57.5M | } else { |
125 | 3.74M | size_t run_length = 0; |
126 | 6.64M | while (run_length == 0) { |
127 | 3.75M | if (_read_index == (*_run_length_null_map).size()) { |
128 | 855k | return 0; |
129 | 855k | } |
130 | 2.90M | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; |
131 | 2.90M | run_length = (*_run_length_null_map)[_read_index++]; |
132 | 2.90M | } |
133 | 2.88M | return run_length; |
134 | 3.74M | } |
135 | 61.3M | } _ZN5doris18ColumnSelectVector12get_next_runILb1EEEmPNS0_12DataReadTypeE Line | Count | Source | 106 | 57.5M | size_t get_next_run(DataReadType* data_read_type) { | 107 | 57.5M | DCHECK_EQ(_has_filter, has_filter); | 108 | 57.5M | if constexpr (has_filter) { | 109 | 57.5M | if (_read_index == _num_values) { | 110 | 315k | return 0; | 111 | 315k | } | 112 | 57.2M | const DataReadType& type = _data_map[_read_index++]; | 113 | 57.2M | size_t run_length = 1; | 114 | 676M | while (_read_index < _num_values) { | 115 | 673M | if (_data_map[_read_index] == type) { | 116 | 619M | run_length++; | 117 | 619M | _read_index++; | 118 | 619M | } else { | 119 | 54.1M | break; | 120 | 54.1M | } | 121 | 673M | } | 122 | 57.2M | *data_read_type = type; | 123 | 57.2M | return run_length; | 124 | | } else { | 125 | | size_t run_length = 0; | 126 | | while (run_length == 0) { | 127 | | if (_read_index == (*_run_length_null_map).size()) { | 128 | | return 0; | 129 | | } | 130 | | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; | 131 | | run_length = (*_run_length_null_map)[_read_index++]; | 132 | | } | 133 | | return run_length; | 134 | | } | 135 | 57.5M | } |
_ZN5doris18ColumnSelectVector12get_next_runILb0EEEmPNS0_12DataReadTypeE Line | Count | Source | 106 | 3.74M | size_t get_next_run(DataReadType* data_read_type) { | 107 | 3.74M | DCHECK_EQ(_has_filter, has_filter); | 108 | | if constexpr (has_filter) { | 109 | | if (_read_index == _num_values) { | 110 | | return 0; | 111 | | } | 112 | | const DataReadType& type = _data_map[_read_index++]; | 113 | | size_t run_length = 1; | 114 | | while (_read_index < _num_values) { | 115 | | if (_data_map[_read_index] == type) { | 116 | | run_length++; | 117 | | _read_index++; | 118 | | } else { | 119 | | break; | 120 | | } | 121 | | } | 122 | | *data_read_type = type; | 123 | | return run_length; | 124 | 3.74M | } else { | 125 | 3.74M | size_t run_length = 0; | 126 | 6.64M | while (run_length == 0) { | 127 | 3.75M | if (_read_index == (*_run_length_null_map).size()) { | 128 | 855k | return 0; | 129 | 855k | } | 130 | 2.90M | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; | 131 | 2.90M | run_length = (*_run_length_null_map)[_read_index++]; | 132 | 2.90M | } | 133 | 2.88M | return run_length; | 134 | 3.74M | } | 135 | 3.74M | } |
|
136 | | |
137 | | private: |
138 | | std::vector<DataReadType> _data_map; |
139 | | // the length of non-null values and null values are arranged in turn. |
140 | | const std::vector<uint16_t>* _run_length_null_map; |
141 | | bool _has_filter; |
142 | | size_t _num_values; |
143 | | size_t _num_nulls; |
144 | | size_t _num_filtered; |
145 | | size_t _read_index; |
146 | | }; |
147 | | |
// Column-order kinds. Presumably mirrors the Parquet `ColumnOrder` metadata -
// TODO confirm against the users of this enum.
enum class ColumnOrderName {
    UNDEFINED,
    TYPE_DEFINED_ORDER,
};
149 | | |
// Comparison flavour for column values: signed, unsigned, or not known.
enum class SortOrder {
    SIGNED,
    UNSIGNED,
    UNKNOWN,
};
151 | | |
// Value object holding the three parts of a parsed version string: the
// application name plus an optional version and an optional build hash.
class ParsedVersion {
public:
    ParsedVersion(std::string application, std::optional<std::string> version,
                  std::optional<std::string> app_build_hash);

    // Read-only accessors; each returns a reference to the stored member.
    const std::string& application() const {
        return _application;
    }

    const std::optional<std::string>& version() const {
        return _version;
    }

    const std::optional<std::string>& app_build_hash() const {
        return _app_build_hash;
    }

    // Comparison, hashing and formatting are implemented out of line.
    bool operator==(const ParsedVersion& other) const;

    bool operator!=(const ParsedVersion& other) const;

    size_t hash() const;

    std::string to_string() const;

private:
    std::string _application;
    std::optional<std::string> _version;
    std::optional<std::string> _app_build_hash;
};
176 | | |
177 | | class VersionParser { |
178 | | public: |
179 | | static Status parse(const std::string& created_by, |
180 | | std::unique_ptr<ParsedVersion>* parsed_version); |
181 | | }; |
182 | | |
183 | | class SemanticVersion { |
184 | | public: |
185 | | SemanticVersion(int major, int minor, int patch); |
186 | | |
187 | | #ifdef BE_TEST |
188 | | SemanticVersion(int major, int minor, int patch, bool has_unknown); |
189 | | #endif |
190 | | |
191 | | SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown, |
192 | | std::optional<std::string> pre, std::optional<std::string> build_info); |
193 | | |
194 | | static Status parse(const std::string& version, |
195 | | std::unique_ptr<SemanticVersion>* semantic_version); |
196 | | |
197 | | int compare_to(const SemanticVersion& other) const; |
198 | | |
199 | | bool operator==(const SemanticVersion& other) const; |
200 | | |
201 | | bool operator!=(const SemanticVersion& other) const; |
202 | | |
203 | | std::string to_string() const; |
204 | | |
205 | | private: |
206 | | class NumberOrString { |
207 | | public: |
208 | | explicit NumberOrString(const std::string& value_string); |
209 | | |
210 | | NumberOrString(const NumberOrString& other); |
211 | | |
212 | | int compare_to(const NumberOrString& that) const; |
213 | | std::string to_string() const; |
214 | | |
215 | | bool operator<(const NumberOrString& that) const; |
216 | | bool operator==(const NumberOrString& that) const; |
217 | | bool operator!=(const NumberOrString& that) const; |
218 | | bool operator>(const NumberOrString& that) const; |
219 | | bool operator<=(const NumberOrString& that) const; |
220 | | bool operator>=(const NumberOrString& that) const; |
221 | | |
222 | | private: |
223 | | std::string _original; |
224 | | bool _is_numeric; |
225 | | int _number; |
226 | | }; |
227 | | |
228 | | class Prerelease { |
229 | | public: |
230 | | explicit Prerelease(std::string original); |
231 | | |
232 | | int compare_to(const Prerelease& that) const; |
233 | | std::string to_string() const; |
234 | | |
235 | | bool operator<(const Prerelease& that) const; |
236 | | bool operator==(const Prerelease& that) const; |
237 | | bool operator!=(const Prerelease& that) const; |
238 | | bool operator>(const Prerelease& that) const; |
239 | | bool operator<=(const Prerelease& that) const; |
240 | | bool operator>=(const Prerelease& that) const; |
241 | | |
242 | 0 | const std::string& original() const { return _original; } |
243 | | |
244 | | private: |
245 | | static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter); |
246 | | |
247 | | std::string _original; |
248 | | std::vector<NumberOrString> _identifiers; |
249 | | }; |
250 | | |
251 | | static int _compare_integers(int x, int y); |
252 | | static int _compare_booleans(bool x, bool y); |
253 | | |
254 | | int _major; |
255 | | int _minor; |
256 | | int _patch; |
257 | | bool _prerelease; |
258 | | std::optional<std::string> _unknown; |
259 | | std::optional<Prerelease> _pre; |
260 | | std::optional<std::string> _build_info; |
261 | | }; |
262 | | |
263 | | class CorruptStatistics { |
264 | | public: |
265 | | static bool should_ignore_statistics(const std::string& created_by, |
266 | | tparquet::Type::type physical_type); |
267 | | |
268 | | private: |
269 | | static const SemanticVersion PARQUET_251_FIXED_VERSION; |
270 | | static const SemanticVersion CDH_5_PARQUET_251_FIXED_START; |
271 | | static const SemanticVersion CDH_5_PARQUET_251_FIXED_END; |
272 | | }; |
273 | | #include "common/compile_check_end.h" |
274 | | |
275 | | } // namespace doris |