be/src/format/parquet/parquet_common.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <gen_cpp/parquet_types.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <cstdint> |
24 | | #include <ostream> |
25 | | #include <regex> |
26 | | #include <string> |
27 | | #include <unordered_set> |
28 | | #include <vector> |
29 | | |
30 | | #include "core/column/column_nullable.h" |
31 | | #include "storage/segment/row_ranges.h" |
32 | | |
33 | | namespace doris { |
34 | | using level_t = int16_t; |
35 | | |
36 | | using segment_v2::RowRange; |
37 | | using segment_v2::RowRanges; |
38 | | |
39 | | #pragma pack(1) |
40 | | struct ParquetInt96 { |
41 | | int64_t lo; // nanoseconds elapsed within the day |
42 | | int32_t hi; // day count measured from the Julian epoch |
43 | | |
44 | 0 | NO_SANITIZE_UNDEFINED inline int64_t to_timestamp_micros() const { |
45 | 0 | return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND; |
46 | 0 | } |
47 | 0 | inline __int128 to_int128() const { |
48 | 0 | __int128 ans = 0; |
49 | 0 | ans = (((__int128)hi) << 64) + lo; |
50 | 0 | return ans; |
51 | 0 | } |
52 | | |
53 | | static const int32_t JULIAN_EPOCH_OFFSET_DAYS; |
54 | | static const int64_t MICROS_IN_DAY; |
55 | | static const int64_t NANOS_PER_MICROSECOND; |
56 | | }; |
57 | | #pragma pack() |
58 | | static_assert(sizeof(ParquetInt96) == 12, "The size of ParquetInt96 is not 12."); |
59 | | |
60 | | class FilterMap { |
61 | | public: |
62 | 191 | FilterMap() = default; |
63 | | Status init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all); |
64 | | |
65 | | Status generate_nested_filter_map(const std::vector<level_t>& rep_levels, |
66 | | std::vector<uint8_t>& nested_filter_map_data, |
67 | | std::unique_ptr<FilterMap>* nested_filter_map, |
68 | | size_t* current_row_ptr, size_t start_index = 0) const; |
69 | | |
70 | 195 | const uint8_t* filter_map_data() const { return _filter_map_data; } |
71 | 196 | size_t filter_map_size() const { return _filter_map_size; } |
72 | 703 | bool has_filter() const { return _has_filter; } |
73 | 24 | bool filter_all() const { return _filter_all; } |
74 | 5 | double filter_ratio() const { return _has_filter ? _filter_ratio : 0; } |
75 | | |
76 | | bool can_filter_all(size_t remaining_num_values, size_t filter_map_index); |
77 | | |
78 | | private: |
79 | | bool _has_filter = false; |
80 | | bool _filter_all = false; |
81 | | const uint8_t* _filter_map_data = nullptr; |
82 | | size_t _filter_map_size = 0; |
83 | | double _filter_ratio = 0; |
84 | | }; |
85 | | |
86 | | class ColumnSelectVector { |
87 | | public: |
88 | | enum DataReadType : uint8_t { CONTENT = 0, NULL_DATA, FILTERED_CONTENT, FILTERED_NULL }; |
89 | | |
90 | 268 | ColumnSelectVector() = default; |
91 | | |
92 | | Status init(const std::vector<uint16_t>& run_length_null_map, size_t num_values, |
93 | | NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, |
94 | | const std::unordered_set<size_t>* skipped_indices = nullptr); |
95 | | |
96 | 1.00k | size_t num_values() const { return _num_values; } |
97 | | |
98 | 225 | size_t num_nulls() const { return _num_nulls; } |
99 | | |
100 | 194 | size_t num_filtered() const { return _num_filtered; } |
101 | | |
102 | 263 | bool has_filter() const { return _has_filter; } |
103 | | |
104 | | template <bool has_filter> |
105 | 669 | size_t get_next_run(DataReadType* data_read_type) { |
106 | 669 | DCHECK_EQ(_has_filter, has_filter); |
107 | 669 | if constexpr (has_filter) { |
108 | 205 | if (_read_index == _num_values) { |
109 | 32 | return 0; |
110 | 32 | } |
111 | 173 | const DataReadType& type = _data_map[_read_index++]; |
112 | 173 | size_t run_length = 1; |
113 | 189 | while (_read_index < _num_values) { |
114 | 154 | if (_data_map[_read_index] == type) { |
115 | 16 | run_length++; |
116 | 16 | _read_index++; |
117 | 138 | } else { |
118 | 138 | break; |
119 | 138 | } |
120 | 154 | } |
121 | 173 | *data_read_type = type; |
122 | 173 | return run_length; |
123 | 464 | } else { |
124 | 464 | size_t run_length = 0; |
125 | 714 | while (run_length == 0) { |
126 | 481 | if (_read_index == (*_run_length_null_map).size()) { |
127 | 231 | return 0; |
128 | 231 | } |
129 | 250 | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; |
130 | 250 | run_length = (*_run_length_null_map)[_read_index++]; |
131 | 250 | } |
132 | 233 | return run_length; |
133 | 464 | } |
134 | 669 | } _ZN5doris18ColumnSelectVector12get_next_runILb1EEEmPNS0_12DataReadTypeE Line | Count | Source | 105 | 205 | size_t get_next_run(DataReadType* data_read_type) { | 106 | 205 | DCHECK_EQ(_has_filter, has_filter); | 107 | 205 | if constexpr (has_filter) { | 108 | 205 | if (_read_index == _num_values) { | 109 | 32 | return 0; | 110 | 32 | } | 111 | 173 | const DataReadType& type = _data_map[_read_index++]; | 112 | 173 | size_t run_length = 1; | 113 | 189 | while (_read_index < _num_values) { | 114 | 154 | if (_data_map[_read_index] == type) { | 115 | 16 | run_length++; | 116 | 16 | _read_index++; | 117 | 138 | } else { | 118 | 138 | break; | 119 | 138 | } | 120 | 154 | } | 121 | 173 | *data_read_type = type; | 122 | 173 | return run_length; | 123 | | } else { | 124 | | size_t run_length = 0; | 125 | | while (run_length == 0) { | 126 | | if (_read_index == (*_run_length_null_map).size()) { | 127 | | return 0; | 128 | | } | 129 | | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; | 130 | | run_length = (*_run_length_null_map)[_read_index++]; | 131 | | } | 132 | | return run_length; | 133 | | } | 134 | 205 | } |
_ZN5doris18ColumnSelectVector12get_next_runILb0EEEmPNS0_12DataReadTypeE Line | Count | Source | 105 | 464 | size_t get_next_run(DataReadType* data_read_type) { | 106 | 464 | DCHECK_EQ(_has_filter, has_filter); | 107 | | if constexpr (has_filter) { | 108 | | if (_read_index == _num_values) { | 109 | | return 0; | 110 | | } | 111 | | const DataReadType& type = _data_map[_read_index++]; | 112 | | size_t run_length = 1; | 113 | | while (_read_index < _num_values) { | 114 | | if (_data_map[_read_index] == type) { | 115 | | run_length++; | 116 | | _read_index++; | 117 | | } else { | 118 | | break; | 119 | | } | 120 | | } | 121 | | *data_read_type = type; | 122 | | return run_length; | 123 | 464 | } else { | 124 | 464 | size_t run_length = 0; | 125 | 714 | while (run_length == 0) { | 126 | 481 | if (_read_index == (*_run_length_null_map).size()) { | 127 | 231 | return 0; | 128 | 231 | } | 129 | 250 | *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA; | 130 | 250 | run_length = (*_run_length_null_map)[_read_index++]; | 131 | 250 | } | 132 | 233 | return run_length; | 133 | 464 | } | 134 | 464 | } |
|
135 | | |
136 | | private: |
137 | | std::vector<DataReadType> _data_map; |
138 | | // Run lengths of non-null and null values, stored alternately: even indices hold non-null (content) runs, odd indices hold null runs. |
139 | | const std::vector<uint16_t>* _run_length_null_map; |
140 | | bool _has_filter; |
141 | | size_t _num_values; |
142 | | size_t _num_nulls; |
143 | | size_t _num_filtered; |
144 | | size_t _read_index; |
145 | | }; |
146 | | |
147 | | enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER }; |
148 | | |
149 | | enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN }; |
150 | | |
151 | | class ParsedVersion { |
152 | | public: |
153 | | ParsedVersion(std::string application, std::optional<std::string> version, |
154 | | std::optional<std::string> app_build_hash); |
155 | | |
156 | 39 | const std::string& application() const { return _application; } |
157 | | |
158 | 90 | const std::optional<std::string>& version() const { return _version; } |
159 | | |
160 | 0 | const std::optional<std::string>& app_build_hash() const { return _app_build_hash; } |
161 | | |
162 | | bool operator==(const ParsedVersion& other) const; |
163 | | |
164 | | bool operator!=(const ParsedVersion& other) const; |
165 | | |
166 | | size_t hash() const; |
167 | | |
168 | | std::string to_string() const; |
169 | | |
170 | | private: |
171 | | std::string _application; |
172 | | std::optional<std::string> _version; |
173 | | std::optional<std::string> _app_build_hash; |
174 | | }; |
175 | | |
176 | | class VersionParser { |
177 | | public: |
178 | | static Status parse(const std::string& created_by, |
179 | | std::unique_ptr<ParsedVersion>* parsed_version); |
180 | | }; |
181 | | |
182 | | class SemanticVersion { |
183 | | public: |
184 | | SemanticVersion(int major, int minor, int patch); |
185 | | |
186 | | #ifdef BE_TEST |
187 | | SemanticVersion(int major, int minor, int patch, bool has_unknown); |
188 | | #endif |
189 | | |
190 | | SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown, |
191 | | std::optional<std::string> pre, std::optional<std::string> build_info); |
192 | | |
193 | | static Status parse(const std::string& version, |
194 | | std::unique_ptr<SemanticVersion>* semantic_version); |
195 | | |
196 | | int compare_to(const SemanticVersion& other) const; |
197 | | |
198 | | bool operator==(const SemanticVersion& other) const; |
199 | | |
200 | | bool operator!=(const SemanticVersion& other) const; |
201 | | |
202 | | std::string to_string() const; |
203 | | |
204 | | private: |
205 | | class NumberOrString { |
206 | | public: |
207 | | explicit NumberOrString(const std::string& value_string); |
208 | | |
209 | | NumberOrString(const NumberOrString& other); |
210 | | |
211 | | int compare_to(const NumberOrString& that) const; |
212 | | std::string to_string() const; |
213 | | |
214 | | bool operator<(const NumberOrString& that) const; |
215 | | bool operator==(const NumberOrString& that) const; |
216 | | bool operator!=(const NumberOrString& that) const; |
217 | | bool operator>(const NumberOrString& that) const; |
218 | | bool operator<=(const NumberOrString& that) const; |
219 | | bool operator>=(const NumberOrString& that) const; |
220 | | |
221 | | private: |
222 | | std::string _original; |
223 | | bool _is_numeric; |
224 | | int _number; |
225 | | }; |
226 | | |
227 | | class Prerelease { |
228 | | public: |
229 | | explicit Prerelease(std::string original); |
230 | | |
231 | | int compare_to(const Prerelease& that) const; |
232 | | std::string to_string() const; |
233 | | |
234 | | bool operator<(const Prerelease& that) const; |
235 | | bool operator==(const Prerelease& that) const; |
236 | | bool operator!=(const Prerelease& that) const; |
237 | | bool operator>(const Prerelease& that) const; |
238 | | bool operator<=(const Prerelease& that) const; |
239 | | bool operator>=(const Prerelease& that) const; |
240 | | |
241 | 0 | const std::string& original() const { return _original; } |
242 | | |
243 | | private: |
244 | | static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter); |
245 | | |
246 | | std::string _original; |
247 | | std::vector<NumberOrString> _identifiers; |
248 | | }; |
249 | | |
250 | | static int _compare_integers(int x, int y); |
251 | | static int _compare_booleans(bool x, bool y); |
252 | | |
253 | | int _major; |
254 | | int _minor; |
255 | | int _patch; |
256 | | bool _prerelease; |
257 | | std::optional<std::string> _unknown; |
258 | | std::optional<Prerelease> _pre; |
259 | | std::optional<std::string> _build_info; |
260 | | }; |
261 | | |
262 | | class CorruptStatistics { |
263 | | public: |
264 | | static bool should_ignore_statistics(const std::string& created_by, |
265 | | tparquet::Type::type physical_type); |
266 | | |
267 | | private: |
268 | | static const SemanticVersion PARQUET_251_FIXED_VERSION; |
269 | | static const SemanticVersion CDH_5_PARQUET_251_FIXED_START; |
270 | | static const SemanticVersion CDH_5_PARQUET_251_FIXED_END; |
271 | | }; |
272 | | |
273 | | } // namespace doris |