be/src/format/parquet/parquet_common.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include "format/parquet/parquet_common.h"

#include <glog/logging.h>

#include <stdexcept>

#include "common/cast_set.h"
#include "core/types.h"
#include "util/simd/bits.h"
25 | | |
26 | | namespace doris { |
27 | | const int32_t ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS = 2440588; |
28 | | const int64_t ParquetInt96::MICROS_IN_DAY = 86400000000; |
29 | | const int64_t ParquetInt96::NANOS_PER_MICROSECOND = 1000; |
30 | | |
31 | 53.6k | Status FilterMap::init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all) { |
32 | 53.6k | _filter_all = filter_all; |
33 | 53.6k | _filter_map_data = filter_map_data; |
34 | 53.6k | _filter_map_size = filter_map_size; |
35 | 53.6k | if (filter_all) { |
36 | 2.90k | _has_filter = true; |
37 | 2.90k | _filter_ratio = 1; |
38 | 50.7k | } else if (filter_map_data == nullptr) { |
39 | 34 | _has_filter = false; |
40 | 34 | _filter_ratio = 0; |
41 | 50.7k | } else { |
42 | 50.7k | size_t filter_count = simd::count_zero_num(reinterpret_cast<const int8_t*>(filter_map_data), |
43 | 50.7k | filter_map_size); |
44 | 50.7k | if (filter_count == filter_map_size) { |
45 | 14.6k | _has_filter = true; |
46 | 14.6k | _filter_all = true; |
47 | 14.6k | _filter_ratio = 1; |
48 | 36.0k | } else if (filter_count > 0 && filter_map_size > 0) { |
49 | 18.0k | _has_filter = true; |
50 | 18.0k | _filter_ratio = (double)filter_count / (double)filter_map_size; |
51 | 18.0k | } else { |
52 | 18.0k | _has_filter = false; |
53 | 18.0k | _filter_ratio = 0; |
54 | 18.0k | } |
55 | 50.7k | } |
56 | 53.6k | return Status::OK(); |
57 | 53.6k | } |
58 | | |
59 | 163k | bool FilterMap::can_filter_all(size_t remaining_num_values, size_t filter_map_index) { |
60 | 163k | if (!_has_filter) { |
61 | 0 | return false; |
62 | 0 | } |
63 | 163k | if (_filter_all) { |
64 | | // all data in normal columns can be skipped when _filter_all = true, |
65 | | // so the remaining_num_values should be less than the remaining filter map size. |
66 | 2 | DCHECK_LE(remaining_num_values + filter_map_index, _filter_map_size); |
67 | | // return true always, to make sure that the data in normal columns can be skipped. |
68 | 2 | return true; |
69 | 2 | } |
70 | 163k | if (remaining_num_values + filter_map_index > _filter_map_size) { |
71 | 0 | return false; |
72 | 0 | } |
73 | 163k | return simd::count_zero_num( |
74 | 163k | reinterpret_cast<const int8_t*>(_filter_map_data + filter_map_index), |
75 | 163k | remaining_num_values) == remaining_num_values; |
76 | 163k | } |
77 | | |
78 | | Status FilterMap::generate_nested_filter_map(const std::vector<level_t>& rep_levels, |
79 | | std::vector<uint8_t>& nested_filter_map_data, |
80 | | std::unique_ptr<FilterMap>* nested_filter_map, |
81 | 13 | size_t* current_row_ptr, size_t start_index) const { |
82 | 13 | if (!has_filter() || filter_all()) { |
83 | 1 | return Status::InternalError(fmt::format( |
84 | 1 | "FilterMap::generate_nested_filter_map failed: has_filter={}, filter_all={}", |
85 | 1 | has_filter(), filter_all())); |
86 | 1 | } |
87 | | |
88 | 12 | if (rep_levels.empty()) { |
89 | 1 | return Status::OK(); |
90 | 1 | } |
91 | | |
92 | 11 | nested_filter_map_data.resize(rep_levels.size()); |
93 | | |
94 | 11 | size_t current_row = current_row_ptr ? *current_row_ptr : 0; |
95 | | |
96 | 88 | for (size_t i = start_index; i < rep_levels.size(); i++) { |
97 | 77 | if (i != start_index && rep_levels[i] == 0) { |
98 | 19 | current_row++; |
99 | 19 | if (current_row >= _filter_map_size) { |
100 | 0 | return Status::InvalidArgument(fmt::format( |
101 | 0 | "current_row >= _filter_map_size. current_row: {}, _filter_map_size: {}", |
102 | 0 | current_row, _filter_map_size)); |
103 | 0 | } |
104 | 19 | } |
105 | 77 | nested_filter_map_data[i] = _filter_map_data[current_row]; |
106 | 77 | } |
107 | | |
108 | 11 | if (current_row_ptr) { |
109 | 11 | *current_row_ptr = current_row; |
110 | 11 | } |
111 | | |
112 | 11 | auto new_filter = std::make_unique<FilterMap>(); |
113 | 11 | RETURN_IF_ERROR( |
114 | 11 | new_filter->init(nested_filter_map_data.data(), nested_filter_map_data.size(), false)); |
115 | 11 | *nested_filter_map = std::move(new_filter); |
116 | | |
117 | 11 | return Status::OK(); |
118 | 11 | } |
119 | | |
120 | | Status ColumnSelectVector::init(const std::vector<uint16_t>& run_length_null_map, size_t num_values, |
121 | | NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, |
122 | 845k | const std::unordered_set<size_t>* skipped_indices) { |
123 | 845k | _num_values = num_values; |
124 | 845k | _num_nulls = 0; |
125 | 845k | _read_index = 0; |
126 | 845k | size_t map_index = 0; |
127 | 845k | bool is_null = false; |
128 | 845k | _has_filter = filter_map->has_filter(); |
129 | | |
130 | 845k | if (filter_map->has_filter()) { |
131 | | // No run length null map is generated when _filter_all = true |
132 | | // DCHECK(!filter_map->filter_all()); |
133 | 255k | _data_map.resize(num_values); |
134 | 816k | for (auto& run_length : run_length_null_map) { |
135 | 816k | if (is_null) { |
136 | 280k | _num_nulls += run_length; |
137 | 573k | for (int i = 0; i < run_length; ++i) { |
138 | 293k | _data_map[map_index++] = FILTERED_NULL; |
139 | 293k | } |
140 | 536k | } else { |
141 | 478M | for (int i = 0; i < run_length; ++i) { |
142 | 477M | _data_map[map_index++] = FILTERED_CONTENT; |
143 | 477M | } |
144 | 536k | } |
145 | 816k | is_null = !is_null; |
146 | 816k | } |
147 | | |
148 | 255k | size_t num_read = 0; |
149 | 255k | size_t i = 0; |
150 | 255k | size_t valid_count = 0; |
151 | | |
152 | 472M | while (valid_count < num_values) { |
153 | 471M | DCHECK_LT(filter_map_index + i, filter_map->filter_map_size()); |
154 | | |
155 | 471M | if (skipped_indices != nullptr && skipped_indices->count(filter_map_index + i) > 0) { |
156 | 1.08M | ++i; |
157 | 1.08M | continue; |
158 | 1.08M | } |
159 | | |
160 | 470M | if (filter_map->filter_map_data()[filter_map_index + i]) { |
161 | 258M | _data_map[valid_count] = |
162 | 258M | _data_map[valid_count] == FILTERED_NULL ? NULL_DATA : CONTENT; |
163 | 258M | num_read++; |
164 | 258M | } |
165 | 470M | ++valid_count; |
166 | 470M | ++i; |
167 | 470M | } |
168 | | |
169 | 255k | _num_filtered = num_values - num_read; |
170 | | |
171 | 255k | if (null_map != nullptr && num_read > 0) { |
172 | 221k | NullMap& map_data_column = *null_map; |
173 | 221k | auto null_map_index = map_data_column.size(); |
174 | 221k | map_data_column.resize(null_map_index + num_read); |
175 | | |
176 | 221k | if (_num_nulls == 0) { |
177 | 220k | memset(map_data_column.data() + null_map_index, 0, num_read); |
178 | 220k | } else if (_num_nulls == num_values) { |
179 | 0 | memset(map_data_column.data() + null_map_index, 1, num_read); |
180 | 1.04k | } else { |
181 | 6.22M | for (i = 0; i < num_values; ++i) { |
182 | 6.22M | if (_data_map[i] == CONTENT) { |
183 | 1.56k | map_data_column[null_map_index++] = (UInt8) false; |
184 | 6.22M | } else if (_data_map[i] == NULL_DATA) { |
185 | 19.4k | map_data_column[null_map_index++] = (UInt8) true; |
186 | 19.4k | } |
187 | 6.22M | } |
188 | 1.04k | } |
189 | 221k | } |
190 | 589k | } else { |
191 | 589k | _num_filtered = 0; |
192 | 589k | _run_length_null_map = &run_length_null_map; |
193 | 589k | if (null_map != nullptr) { |
194 | 587k | NullMap& map_data_column = *null_map; |
195 | 587k | auto null_map_index = map_data_column.size(); |
196 | 587k | map_data_column.resize(null_map_index + num_values); |
197 | | |
198 | 2.61M | for (auto& run_length : run_length_null_map) { |
199 | 2.61M | if (is_null) { |
200 | 1.01M | memset(map_data_column.data() + null_map_index, 1, run_length); |
201 | 1.01M | null_map_index += run_length; |
202 | 1.01M | _num_nulls += run_length; |
203 | 1.59M | } else { |
204 | 1.59M | memset(map_data_column.data() + null_map_index, 0, run_length); |
205 | 1.59M | null_map_index += run_length; |
206 | 1.59M | } |
207 | 2.61M | is_null = !is_null; |
208 | 2.61M | } |
209 | 587k | } else { |
210 | 1.74k | for (auto& run_length : run_length_null_map) { |
211 | 1.74k | if (is_null) { |
212 | 1 | _num_nulls += run_length; |
213 | 1 | } |
214 | 1.74k | is_null = !is_null; |
215 | 1.74k | } |
216 | 1.71k | } |
217 | 589k | } |
218 | 845k | return Status::OK(); |
219 | 845k | } |
220 | | |
221 | | ParsedVersion::ParsedVersion(std::string application, std::optional<std::string> version, |
222 | | std::optional<std::string> app_build_hash) |
223 | 79 | : _application(std::move(application)), |
224 | 79 | _version(std::move(version)), |
225 | 79 | _app_build_hash(std::move(app_build_hash)) {} |
226 | | |
227 | 20 | bool ParsedVersion::operator==(const ParsedVersion& other) const { |
228 | 20 | return _application == other._application && _version == other._version && |
229 | 20 | _app_build_hash == other._app_build_hash; |
230 | 20 | } |
231 | | |
232 | 0 | bool ParsedVersion::operator!=(const ParsedVersion& other) const { |
233 | 0 | return !(*this == other); |
234 | 0 | } |
235 | | |
236 | 0 | size_t ParsedVersion::hash() const { |
237 | 0 | std::hash<std::string> hasher; |
238 | 0 | return hasher(_application) ^ (_version ? hasher(*_version) : 0) ^ |
239 | 0 | (_app_build_hash ? hasher(*_app_build_hash) : 0); |
240 | 0 | } |
241 | | |
242 | 0 | std::string ParsedVersion::to_string() const { |
243 | 0 | return "ParsedVersion(application=" + _application + |
244 | 0 | ", semver=" + (_version ? *_version : "null") + |
245 | 0 | ", app_build_hash=" + (_app_build_hash ? *_app_build_hash : "null") + ")"; |
246 | 0 | } |
247 | | |
248 | | Status VersionParser::parse(const std::string& created_by, |
249 | 61 | std::unique_ptr<ParsedVersion>* parsed_version) { |
250 | 61 | static const std::string FORMAT = |
251 | 61 | "(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?"; |
252 | 61 | static const std::regex PATTERN(FORMAT); |
253 | | |
254 | 61 | std::smatch matcher; |
255 | 61 | if (!std::regex_match(created_by, matcher, PATTERN)) { |
256 | 2 | return Status::InternalError(fmt::format("Could not parse created_by: {}, using format: {}", |
257 | 2 | created_by, FORMAT)); |
258 | 2 | } |
259 | | |
260 | 59 | std::string application = matcher[1].str(); |
261 | 59 | if (application.empty()) { |
262 | 0 | return Status::InternalError("application cannot be null or empty"); |
263 | 0 | } |
264 | 59 | std::optional<std::string> semver = |
265 | 59 | matcher[2].str().empty() ? std::nullopt : std::optional<std::string>(matcher[2].str()); |
266 | 59 | std::optional<std::string> app_build_hash = |
267 | 59 | matcher[3].str().empty() ? std::nullopt : std::optional<std::string>(matcher[3].str()); |
268 | 59 | *parsed_version = std::make_unique<ParsedVersion>(application, semver, app_build_hash); |
269 | 59 | return Status::OK(); |
270 | 59 | } |
271 | | |
272 | | SemanticVersion::SemanticVersion(int major, int minor, int patch) |
273 | 43 | : _major(major), |
274 | 43 | _minor(minor), |
275 | 43 | _patch(patch), |
276 | 43 | _prerelease(false), |
277 | 43 | _unknown(std::nullopt), |
278 | 43 | _pre(std::nullopt), |
279 | 43 | _build_info(std::nullopt) {} |
280 | | |
281 | | #ifdef BE_TEST |
282 | | SemanticVersion::SemanticVersion(int major, int minor, int patch, bool has_unknown) |
283 | | : _major(major), |
284 | | _minor(minor), |
285 | | _patch(patch), |
286 | | _prerelease(has_unknown), |
287 | | _unknown(std::nullopt), |
288 | | _pre(std::nullopt), |
289 | | _build_info(std::nullopt) {} |
290 | | #endif |
291 | | |
292 | | SemanticVersion::SemanticVersion(int major, int minor, int patch, |
293 | | std::optional<std::string> unknown, std::optional<std::string> pre, |
294 | | std::optional<std::string> build_info) |
295 | 104 | : _major(major), |
296 | 104 | _minor(minor), |
297 | 104 | _patch(patch), |
298 | 104 | _prerelease(unknown.has_value() && !unknown.value().empty()), |
299 | 104 | _unknown(std::move(unknown)), |
300 | 104 | _pre(pre.has_value() ? std::optional<Prerelease>(Prerelease(std::move(pre.value()))) |
301 | 104 | : std::nullopt), |
302 | 104 | _build_info(std::move(build_info)) {} |
303 | | |
304 | | Status SemanticVersion::parse(const std::string& version, |
305 | 92 | std::unique_ptr<SemanticVersion>* semantic_version) { |
306 | 92 | static const std::regex pattern(R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)"); |
307 | 92 | std::smatch match; |
308 | | |
309 | 92 | if (!std::regex_match(version, match, pattern)) { |
310 | 0 | return Status::InternalError(version + " does not match format"); |
311 | 0 | } |
312 | | |
313 | 92 | int major = std::stoi(match[1].str()); |
314 | 92 | int minor = std::stoi(match[2].str()); |
315 | 92 | int patch = std::stoi(match[3].str()); |
316 | 92 | std::optional<std::string> unknown = |
317 | 92 | match[4].str().empty() ? std::nullopt : std::optional<std::string>(match[4].str()); |
318 | 92 | std::optional<std::string> prerelease = |
319 | 92 | match[5].str().empty() ? std::nullopt : std::optional<std::string>(match[5].str()); |
320 | 92 | std::optional<std::string> build_info = |
321 | 92 | match[6].str().empty() ? std::nullopt : std::optional<std::string>(match[6].str()); |
322 | 92 | if (major < 0 || minor < 0 || patch < 0) { |
323 | 0 | return Status::InternalError("major({}), minor({}), and patch({}) must all be >= 0", major, |
324 | 0 | minor, patch); |
325 | 0 | } |
326 | 92 | *semantic_version = |
327 | 92 | std::make_unique<SemanticVersion>(major, minor, patch, unknown, prerelease, build_info); |
328 | 92 | return Status::OK(); |
329 | 92 | } |
330 | | |
331 | 132 | int SemanticVersion::compare_to(const SemanticVersion& other) const { |
332 | 132 | if (int cmp = _compare_integers(_major, other._major); cmp != 0) { |
333 | 3 | return cmp; |
334 | 3 | } |
335 | 129 | if (int cmp = _compare_integers(_minor, other._minor); cmp != 0) { |
336 | 55 | return cmp; |
337 | 55 | } |
338 | 74 | if (int cmp = _compare_integers(_patch, other._patch); cmp != 0) { |
339 | 9 | return cmp; |
340 | 9 | } |
341 | 65 | if (int cmp = _compare_booleans(other._prerelease, _prerelease); cmp != 0) { |
342 | 4 | return cmp; |
343 | 4 | } |
344 | 61 | if (_pre.has_value()) { |
345 | 48 | if (other._pre.has_value()) { |
346 | 42 | return _pre.value().compare_to(other._pre.value()); |
347 | 42 | } else { |
348 | 6 | return -1; |
349 | 6 | } |
350 | 48 | } else if (other._pre.has_value()) { |
351 | 3 | return 1; |
352 | 3 | } |
353 | 10 | return 0; |
354 | 61 | } |
355 | | |
356 | 5 | bool SemanticVersion::operator==(const SemanticVersion& other) const { |
357 | 5 | return compare_to(other) == 0; |
358 | 5 | } |
359 | | |
360 | 0 | bool SemanticVersion::operator!=(const SemanticVersion& other) const { |
361 | 0 | return !(*this == other); |
362 | 0 | } |
363 | | |
364 | 0 | std::string SemanticVersion::to_string() const { |
365 | 0 | std::string result = |
366 | 0 | std::to_string(_major) + "." + std::to_string(_minor) + "." + std::to_string(_patch); |
367 | 0 | if (_prerelease && _unknown) result += _unknown.value(); |
368 | 0 | if (_pre) result += _pre.value().to_string(); |
369 | 0 | if (_build_info) result += _build_info.value(); |
370 | 0 | return result; |
371 | 0 | } |
372 | | |
373 | | SemanticVersion::NumberOrString::NumberOrString(const std::string& value_string) |
374 | 153 | : _original(value_string) { |
375 | 153 | const static std::regex NUMERIC("\\d+"); |
376 | 153 | _is_numeric = std::regex_match(_original, NUMERIC); |
377 | 153 | _number = -1; |
378 | 153 | if (_is_numeric) { |
379 | 71 | _number = std::stoi(_original); |
380 | 71 | } |
381 | 153 | } |
382 | | |
383 | | SemanticVersion::NumberOrString::NumberOrString(const NumberOrString& other) |
384 | 260 | : _original(other._original), _is_numeric(other._is_numeric), _number(other._number) {} |
385 | | |
386 | 77 | int SemanticVersion::NumberOrString::compare_to(const SemanticVersion::NumberOrString& that) const { |
387 | 77 | if (this->_is_numeric != that._is_numeric) { |
388 | 7 | return this->_is_numeric ? -1 : 1; |
389 | 7 | } |
390 | | |
391 | 70 | if (_is_numeric) { |
392 | 27 | return this->_number - that._number; |
393 | 27 | } |
394 | | |
395 | 43 | return this->_original.compare(that._original); |
396 | 70 | } |
397 | | |
398 | 0 | std::string SemanticVersion::NumberOrString::to_string() const { |
399 | 0 | return _original; |
400 | 0 | } |
401 | | |
402 | 0 | bool SemanticVersion::NumberOrString::operator<(const SemanticVersion::NumberOrString& that) const { |
403 | 0 | return compare_to(that) < 0; |
404 | 0 | } |
405 | | |
406 | | bool SemanticVersion::NumberOrString::operator==( |
407 | 0 | const SemanticVersion::NumberOrString& that) const { |
408 | 0 | return compare_to(that) == 0; |
409 | 0 | } |
410 | | |
411 | | bool SemanticVersion::NumberOrString::operator!=( |
412 | 0 | const SemanticVersion::NumberOrString& that) const { |
413 | 0 | return !(*this == that); |
414 | 0 | } |
415 | | |
416 | 0 | bool SemanticVersion::NumberOrString::operator>(const SemanticVersion::NumberOrString& that) const { |
417 | 0 | return compare_to(that) > 0; |
418 | 0 | } |
419 | | |
420 | | bool SemanticVersion::NumberOrString::operator<=( |
421 | 0 | const SemanticVersion::NumberOrString& that) const { |
422 | 0 | return !(*this > that); |
423 | 0 | } |
424 | | |
425 | | bool SemanticVersion::NumberOrString::operator>=( |
426 | 0 | const SemanticVersion::NumberOrString& that) const { |
427 | 0 | return !(*this < that); |
428 | 0 | } |
429 | | |
430 | 335 | int SemanticVersion::_compare_integers(int x, int y) { |
431 | 335 | return (x < y) ? -1 : ((x == y) ? 0 : 1); |
432 | 335 | } |
433 | | |
434 | 65 | int SemanticVersion::_compare_booleans(bool x, bool y) { |
435 | 65 | return (x == y) ? 0 : (x ? 1 : -1); |
436 | 65 | } |
437 | | |
438 | | std::vector<std::string> SemanticVersion::Prerelease::_split(const std::string& s, |
439 | 75 | const std::regex& delimiter) { |
440 | 75 | std::sregex_token_iterator iter(s.begin(), s.end(), delimiter, -1); |
441 | 75 | std::sregex_token_iterator end; |
442 | 75 | std::vector<std::string> tokens(iter, end); |
443 | 75 | return tokens; |
444 | 75 | } |
445 | | |
446 | 75 | SemanticVersion::Prerelease::Prerelease(std::string original) : _original(std::move(original)) { |
447 | 75 | static const std::regex DOT("\\."); |
448 | 75 | auto parts = _split(_original, DOT); |
449 | 153 | for (const auto& part : parts) { |
450 | 153 | NumberOrString number_or_string(part); |
451 | 153 | _identifiers.emplace_back(number_or_string); |
452 | 153 | } |
453 | 75 | } |
454 | | |
455 | 42 | int SemanticVersion::Prerelease::compare_to(const Prerelease& that) const { |
456 | 42 | auto size = std::min(this->_identifiers.size(), that._identifiers.size()); |
457 | 97 | for (int i = 0; i < size; ++i) { |
458 | 77 | int cmp = this->_identifiers[i].compare_to(that._identifiers[i]); |
459 | 77 | if (cmp != 0) { |
460 | 22 | return cmp; |
461 | 22 | } |
462 | 77 | } |
463 | 20 | return static_cast<int>(this->_identifiers.size()) - static_cast<int>(that._identifiers.size()); |
464 | 42 | } |
465 | | |
466 | 0 | std::string SemanticVersion::Prerelease::to_string() const { |
467 | 0 | return _original; |
468 | 0 | } |
469 | | |
470 | 0 | bool SemanticVersion::Prerelease::operator<(const Prerelease& that) const { |
471 | 0 | return compare_to(that) < 0; |
472 | 0 | } |
473 | | |
474 | 0 | bool SemanticVersion::Prerelease::operator==(const Prerelease& that) const { |
475 | 0 | return compare_to(that) == 0; |
476 | 0 | } |
477 | | |
478 | 0 | bool SemanticVersion::Prerelease::operator!=(const Prerelease& that) const { |
479 | 0 | return !(*this == that); |
480 | 0 | } |
481 | | |
482 | 0 | bool SemanticVersion::Prerelease::operator>(const Prerelease& that) const { |
483 | 0 | return compare_to(that) > 0; |
484 | 0 | } |
485 | | |
486 | 0 | bool SemanticVersion::Prerelease::operator<=(const Prerelease& that) const { |
487 | 0 | return !(*this > that); |
488 | 0 | } |
489 | | |
490 | 0 | bool SemanticVersion::Prerelease::operator>=(const Prerelease& that) const { |
491 | 0 | return !(*this < that); |
492 | 0 | } |
493 | | |
494 | | const SemanticVersion CorruptStatistics::PARQUET_251_FIXED_VERSION(1, 8, 0); |
495 | | const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_START(1, 5, 0, std::nullopt, |
496 | | "cdh5.5.0", std::nullopt); |
497 | | const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_END(1, 5, 0); |
498 | | |
499 | | bool CorruptStatistics::should_ignore_statistics(const std::string& created_by, |
500 | 261 | tparquet::Type::type physical_type) { |
501 | 261 | if (physical_type != tparquet::Type::BYTE_ARRAY && |
502 | 261 | physical_type != tparquet::Type::FIXED_LEN_BYTE_ARRAY) { |
503 | | // The bug only applies to binary columns |
504 | 221 | return false; |
505 | 221 | } |
506 | | |
507 | 40 | if (created_by.empty()) { |
508 | | // created_by is not populated |
509 | 0 | VLOG_DEBUG |
510 | 0 | << "Ignoring statistics because created_by is null or empty! See PARQUET-251 and " |
511 | 0 | "PARQUET-297"; |
512 | 0 | return true; |
513 | 0 | } |
514 | | |
515 | 40 | Status status; |
516 | 40 | std::unique_ptr<ParsedVersion> parsed_version; |
517 | 40 | status = VersionParser::parse(created_by, &parsed_version); |
518 | 40 | if (!status.ok()) { |
519 | 1 | VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " |
520 | 0 | "PARQUET-251)." |
521 | 0 | " CreatedBy: " |
522 | 0 | << created_by << ", msg: " << status.msg(); |
523 | 1 | return true; |
524 | 1 | } |
525 | | |
526 | 39 | if (parsed_version->application() != "parquet-mr") { |
527 | | // Assume other applications don't have this bug |
528 | 7 | return false; |
529 | 7 | } |
530 | | |
531 | 32 | if ((!parsed_version->version().has_value()) || parsed_version->version().value().empty()) { |
532 | 3 | VLOG_DEBUG << "Ignoring statistics because created_by did not contain a semver (see " |
533 | 0 | "PARQUET-251): " |
534 | 0 | << created_by; |
535 | 3 | return true; |
536 | 3 | } |
537 | | |
538 | 29 | std::unique_ptr<SemanticVersion> semantic_version; |
539 | 29 | status = SemanticVersion::parse(parsed_version->version().value(), &semantic_version); |
540 | 29 | if (!status.ok()) { |
541 | 0 | VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " |
542 | 0 | "PARQUET-251)." |
543 | 0 | " CreatedBy: " |
544 | 0 | << created_by << ", msg: " << status.msg(); |
545 | 0 | return true; |
546 | 0 | } |
547 | 29 | if (semantic_version->compare_to(PARQUET_251_FIXED_VERSION) < 0 && |
548 | 29 | !(semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_START) >= 0 && |
549 | 22 | semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_END) < 0)) { |
550 | 18 | VLOG_DEBUG |
551 | 0 | << "Ignoring statistics because this file was created prior to the fixed version, " |
552 | 0 | "see PARQUET-251"; |
553 | 18 | return true; |
554 | 18 | } |
555 | | |
556 | | // This file was created after the fix |
557 | 11 | return false; |
558 | 29 | } |
559 | | |
560 | | } // namespace doris |