be/src/format/parquet/parquet_common.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include "format/parquet/parquet_common.h"

#include <glog/logging.h>

#include <exception>

#include "common/cast_set.h"
#include "core/types.h"
#include "util/simd/bits.h"
25 | | |
26 | | namespace doris { |
27 | | #include "common/compile_check_begin.h" |
28 | | const int32_t ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS = 2440588; |
29 | | const int64_t ParquetInt96::MICROS_IN_DAY = 86400000000; |
30 | | const int64_t ParquetInt96::NANOS_PER_MICROSECOND = 1000; |
31 | | |
32 | 92.0k | Status FilterMap::init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all) { |
33 | 92.0k | _filter_all = filter_all; |
34 | 92.0k | _filter_map_data = filter_map_data; |
35 | 92.0k | _filter_map_size = filter_map_size; |
36 | 92.0k | if (filter_all) { |
37 | 3.46k | _has_filter = true; |
38 | 3.46k | _filter_ratio = 1; |
39 | 88.6k | } else if (filter_map_data == nullptr) { |
40 | 34 | _has_filter = false; |
41 | 34 | _filter_ratio = 0; |
42 | 88.5k | } else { |
43 | 88.5k | size_t filter_count = simd::count_zero_num(reinterpret_cast<const int8_t*>(filter_map_data), |
44 | 88.5k | filter_map_size); |
45 | 88.5k | if (filter_count == filter_map_size) { |
46 | 28.4k | _has_filter = true; |
47 | 28.4k | _filter_all = true; |
48 | 28.4k | _filter_ratio = 1; |
49 | 60.1k | } else if (filter_count > 0 && filter_map_size > 0) { |
50 | 28.3k | _has_filter = true; |
51 | 28.3k | _filter_ratio = (double)filter_count / (double)filter_map_size; |
52 | 31.8k | } else { |
53 | 31.8k | _has_filter = false; |
54 | 31.8k | _filter_ratio = 0; |
55 | 31.8k | } |
56 | 88.5k | } |
57 | 92.0k | return Status::OK(); |
58 | 92.0k | } |
59 | | |
60 | 197k | bool FilterMap::can_filter_all(size_t remaining_num_values, size_t filter_map_index) { |
61 | 197k | if (!_has_filter) { |
62 | 0 | return false; |
63 | 0 | } |
64 | 197k | if (_filter_all) { |
65 | | // all data in normal columns can be skipped when _filter_all = true, |
66 | | // so the remaining_num_values should be less than the remaining filter map size. |
67 | 2 | DCHECK_LE(remaining_num_values + filter_map_index, _filter_map_size); |
68 | | // return true always, to make sure that the data in normal columns can be skipped. |
69 | 2 | return true; |
70 | 2 | } |
71 | 197k | if (remaining_num_values + filter_map_index > _filter_map_size) { |
72 | 0 | return false; |
73 | 0 | } |
74 | 197k | return simd::count_zero_num( |
75 | 197k | reinterpret_cast<const int8_t*>(_filter_map_data + filter_map_index), |
76 | 197k | remaining_num_values) == remaining_num_values; |
77 | 197k | } |
78 | | |
79 | | Status FilterMap::generate_nested_filter_map(const std::vector<level_t>& rep_levels, |
80 | | std::vector<uint8_t>& nested_filter_map_data, |
81 | | std::unique_ptr<FilterMap>* nested_filter_map, |
82 | 13 | size_t* current_row_ptr, size_t start_index) const { |
83 | 13 | if (!has_filter() || filter_all()) { |
84 | 1 | return Status::InternalError(fmt::format( |
85 | 1 | "FilterMap::generate_nested_filter_map failed: has_filter={}, filter_all={}", |
86 | 1 | has_filter(), filter_all())); |
87 | 1 | } |
88 | | |
89 | 12 | if (rep_levels.empty()) { |
90 | 1 | return Status::OK(); |
91 | 1 | } |
92 | | |
93 | 11 | nested_filter_map_data.resize(rep_levels.size()); |
94 | | |
95 | 11 | size_t current_row = current_row_ptr ? *current_row_ptr : 0; |
96 | | |
97 | 88 | for (size_t i = start_index; i < rep_levels.size(); i++) { |
98 | 77 | if (i != start_index && rep_levels[i] == 0) { |
99 | 19 | current_row++; |
100 | 19 | if (current_row >= _filter_map_size) { |
101 | 0 | return Status::InvalidArgument(fmt::format( |
102 | 0 | "current_row >= _filter_map_size. current_row: {}, _filter_map_size: {}", |
103 | 0 | current_row, _filter_map_size)); |
104 | 0 | } |
105 | 19 | } |
106 | 77 | nested_filter_map_data[i] = _filter_map_data[current_row]; |
107 | 77 | } |
108 | | |
109 | 11 | if (current_row_ptr) { |
110 | 11 | *current_row_ptr = current_row; |
111 | 11 | } |
112 | | |
113 | 11 | auto new_filter = std::make_unique<FilterMap>(); |
114 | 11 | RETURN_IF_ERROR( |
115 | 11 | new_filter->init(nested_filter_map_data.data(), nested_filter_map_data.size(), false)); |
116 | 11 | *nested_filter_map = std::move(new_filter); |
117 | | |
118 | 11 | return Status::OK(); |
119 | 11 | } |
120 | | |
121 | | Status ColumnSelectVector::init(const std::vector<uint16_t>& run_length_null_map, size_t num_values, |
122 | | NullMap* null_map, FilterMap* filter_map, size_t filter_map_index, |
123 | 1.10M | const std::unordered_set<size_t>* skipped_indices) { |
124 | 1.10M | _num_values = num_values; |
125 | 1.10M | _num_nulls = 0; |
126 | 1.10M | _read_index = 0; |
127 | 1.10M | size_t map_index = 0; |
128 | 1.10M | bool is_null = false; |
129 | 1.10M | _has_filter = filter_map->has_filter(); |
130 | | |
131 | 1.10M | if (filter_map->has_filter()) { |
132 | | // No run length null map is generated when _filter_all = true |
133 | | // DCHECK(!filter_map->filter_all()); |
134 | 291k | _data_map.resize(num_values); |
135 | 906k | for (auto& run_length : run_length_null_map) { |
136 | 906k | if (is_null) { |
137 | 308k | _num_nulls += run_length; |
138 | 629k | for (int i = 0; i < run_length; ++i) { |
139 | 321k | _data_map[map_index++] = FILTERED_NULL; |
140 | 321k | } |
141 | 598k | } else { |
142 | 540M | for (int i = 0; i < run_length; ++i) { |
143 | 540M | _data_map[map_index++] = FILTERED_CONTENT; |
144 | 540M | } |
145 | 598k | } |
146 | 906k | is_null = !is_null; |
147 | 906k | } |
148 | | |
149 | 291k | size_t num_read = 0; |
150 | 291k | size_t i = 0; |
151 | 291k | size_t valid_count = 0; |
152 | | |
153 | 537M | while (valid_count < num_values) { |
154 | 537M | DCHECK_LT(filter_map_index + i, filter_map->filter_map_size()); |
155 | | |
156 | 537M | if (skipped_indices != nullptr && skipped_indices->count(filter_map_index + i) > 0) { |
157 | 1.08M | ++i; |
158 | 1.08M | continue; |
159 | 1.08M | } |
160 | | |
161 | 536M | if (filter_map->filter_map_data()[filter_map_index + i]) { |
162 | 131M | _data_map[valid_count] = |
163 | 131M | _data_map[valid_count] == FILTERED_NULL ? NULL_DATA : CONTENT; |
164 | 131M | num_read++; |
165 | 131M | } |
166 | 536M | ++valid_count; |
167 | 536M | ++i; |
168 | 536M | } |
169 | | |
170 | 291k | _num_filtered = num_values - num_read; |
171 | | |
172 | 291k | if (null_map != nullptr && num_read > 0) { |
173 | 243k | NullMap& map_data_column = *null_map; |
174 | 243k | auto null_map_index = map_data_column.size(); |
175 | 243k | map_data_column.resize(null_map_index + num_read); |
176 | | |
177 | 243k | if (_num_nulls == 0) { |
178 | 242k | memset(map_data_column.data() + null_map_index, 0, num_read); |
179 | 242k | } else if (_num_nulls == num_values) { |
180 | 0 | memset(map_data_column.data() + null_map_index, 1, num_read); |
181 | 1.12k | } else { |
182 | 6.60M | for (i = 0; i < num_values; ++i) { |
183 | 6.59M | if (_data_map[i] == CONTENT) { |
184 | 1.62k | map_data_column[null_map_index++] = (UInt8) false; |
185 | 6.59M | } else if (_data_map[i] == NULL_DATA) { |
186 | 37.9k | map_data_column[null_map_index++] = (UInt8) true; |
187 | 37.9k | } |
188 | 6.59M | } |
189 | 1.12k | } |
190 | 243k | } |
191 | 815k | } else { |
192 | 815k | _num_filtered = 0; |
193 | 815k | _run_length_null_map = &run_length_null_map; |
194 | 815k | if (null_map != nullptr) { |
195 | 814k | NullMap& map_data_column = *null_map; |
196 | 814k | auto null_map_index = map_data_column.size(); |
197 | 814k | map_data_column.resize(null_map_index + num_values); |
198 | | |
199 | 2.86M | for (auto& run_length : run_length_null_map) { |
200 | 2.86M | if (is_null) { |
201 | 1.03M | memset(map_data_column.data() + null_map_index, 1, run_length); |
202 | 1.03M | null_map_index += run_length; |
203 | 1.03M | _num_nulls += run_length; |
204 | 1.83M | } else { |
205 | 1.83M | memset(map_data_column.data() + null_map_index, 0, run_length); |
206 | 1.83M | null_map_index += run_length; |
207 | 1.83M | } |
208 | 2.86M | is_null = !is_null; |
209 | 2.86M | } |
210 | 814k | } else { |
211 | 1.43k | for (auto& run_length : run_length_null_map) { |
212 | 1.43k | if (is_null) { |
213 | 1 | _num_nulls += run_length; |
214 | 1 | } |
215 | 1.43k | is_null = !is_null; |
216 | 1.43k | } |
217 | 1.32k | } |
218 | 815k | } |
219 | 1.10M | return Status::OK(); |
220 | 1.10M | } |
221 | | |
222 | | ParsedVersion::ParsedVersion(std::string application, std::optional<std::string> version, |
223 | | std::optional<std::string> app_build_hash) |
224 | 79 | : _application(std::move(application)), |
225 | 79 | _version(std::move(version)), |
226 | 79 | _app_build_hash(std::move(app_build_hash)) {} |
227 | | |
228 | 20 | bool ParsedVersion::operator==(const ParsedVersion& other) const { |
229 | 20 | return _application == other._application && _version == other._version && |
230 | 20 | _app_build_hash == other._app_build_hash; |
231 | 20 | } |
232 | | |
233 | 0 | bool ParsedVersion::operator!=(const ParsedVersion& other) const { |
234 | 0 | return !(*this == other); |
235 | 0 | } |
236 | | |
237 | 0 | size_t ParsedVersion::hash() const { |
238 | 0 | std::hash<std::string> hasher; |
239 | 0 | return hasher(_application) ^ (_version ? hasher(*_version) : 0) ^ |
240 | 0 | (_app_build_hash ? hasher(*_app_build_hash) : 0); |
241 | 0 | } |
242 | | |
243 | 0 | std::string ParsedVersion::to_string() const { |
244 | 0 | return "ParsedVersion(application=" + _application + |
245 | 0 | ", semver=" + (_version ? *_version : "null") + |
246 | 0 | ", app_build_hash=" + (_app_build_hash ? *_app_build_hash : "null") + ")"; |
247 | 0 | } |
248 | | |
249 | | Status VersionParser::parse(const std::string& created_by, |
250 | 61 | std::unique_ptr<ParsedVersion>* parsed_version) { |
251 | 61 | static const std::string FORMAT = |
252 | 61 | "(.*?)\\s+version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?"; |
253 | 61 | static const std::regex PATTERN(FORMAT); |
254 | | |
255 | 61 | std::smatch matcher; |
256 | 61 | if (!std::regex_match(created_by, matcher, PATTERN)) { |
257 | 2 | return Status::InternalError(fmt::format("Could not parse created_by: {}, using format: {}", |
258 | 2 | created_by, FORMAT)); |
259 | 2 | } |
260 | | |
261 | 59 | std::string application = matcher[1].str(); |
262 | 59 | if (application.empty()) { |
263 | 0 | return Status::InternalError("application cannot be null or empty"); |
264 | 0 | } |
265 | 59 | std::optional<std::string> semver = |
266 | 59 | matcher[2].str().empty() ? std::nullopt : std::optional<std::string>(matcher[2].str()); |
267 | 59 | std::optional<std::string> app_build_hash = |
268 | 59 | matcher[3].str().empty() ? std::nullopt : std::optional<std::string>(matcher[3].str()); |
269 | 59 | *parsed_version = std::make_unique<ParsedVersion>(application, semver, app_build_hash); |
270 | 59 | return Status::OK(); |
271 | 59 | } |
272 | | |
273 | | SemanticVersion::SemanticVersion(int major, int minor, int patch) |
274 | 43 | : _major(major), |
275 | 43 | _minor(minor), |
276 | 43 | _patch(patch), |
277 | 43 | _prerelease(false), |
278 | 43 | _unknown(std::nullopt), |
279 | 43 | _pre(std::nullopt), |
280 | 43 | _build_info(std::nullopt) {} |
281 | | |
#ifdef BE_TEST
// Test-only constructor: lets a test force the prerelease flag via `has_unknown`
// while leaving the unknown suffix, prerelease identifiers and build info unset.
SemanticVersion::SemanticVersion(int major, int minor, int patch, bool has_unknown)
        : _major(major),
          _minor(minor),
          _patch(patch),
          _prerelease(has_unknown),
          _unknown(std::nullopt),
          _pre(std::nullopt),
          _build_info(std::nullopt) {
    // All state is set in the initializer list.
}
#endif
292 | | |
293 | | SemanticVersion::SemanticVersion(int major, int minor, int patch, |
294 | | std::optional<std::string> unknown, std::optional<std::string> pre, |
295 | | std::optional<std::string> build_info) |
296 | 104 | : _major(major), |
297 | 104 | _minor(minor), |
298 | 104 | _patch(patch), |
299 | 104 | _prerelease(unknown.has_value() && !unknown.value().empty()), |
300 | 104 | _unknown(std::move(unknown)), |
301 | 104 | _pre(pre.has_value() ? std::optional<Prerelease>(Prerelease(std::move(pre.value()))) |
302 | 104 | : std::nullopt), |
303 | 104 | _build_info(std::move(build_info)) {} |
304 | | |
305 | | Status SemanticVersion::parse(const std::string& version, |
306 | 92 | std::unique_ptr<SemanticVersion>* semantic_version) { |
307 | 92 | static const std::regex pattern(R"(^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$)"); |
308 | 92 | std::smatch match; |
309 | | |
310 | 92 | if (!std::regex_match(version, match, pattern)) { |
311 | 0 | return Status::InternalError(version + " does not match format"); |
312 | 0 | } |
313 | | |
314 | 92 | int major = std::stoi(match[1].str()); |
315 | 92 | int minor = std::stoi(match[2].str()); |
316 | 92 | int patch = std::stoi(match[3].str()); |
317 | 92 | std::optional<std::string> unknown = |
318 | 92 | match[4].str().empty() ? std::nullopt : std::optional<std::string>(match[4].str()); |
319 | 92 | std::optional<std::string> prerelease = |
320 | 92 | match[5].str().empty() ? std::nullopt : std::optional<std::string>(match[5].str()); |
321 | 92 | std::optional<std::string> build_info = |
322 | 92 | match[6].str().empty() ? std::nullopt : std::optional<std::string>(match[6].str()); |
323 | 92 | if (major < 0 || minor < 0 || patch < 0) { |
324 | 0 | return Status::InternalError("major({}), minor({}), and patch({}) must all be >= 0", major, |
325 | 0 | minor, patch); |
326 | 0 | } |
327 | 92 | *semantic_version = |
328 | 92 | std::make_unique<SemanticVersion>(major, minor, patch, unknown, prerelease, build_info); |
329 | 92 | return Status::OK(); |
330 | 92 | } |
331 | | |
332 | 132 | int SemanticVersion::compare_to(const SemanticVersion& other) const { |
333 | 132 | if (int cmp = _compare_integers(_major, other._major); cmp != 0) { |
334 | 3 | return cmp; |
335 | 3 | } |
336 | 129 | if (int cmp = _compare_integers(_minor, other._minor); cmp != 0) { |
337 | 55 | return cmp; |
338 | 55 | } |
339 | 74 | if (int cmp = _compare_integers(_patch, other._patch); cmp != 0) { |
340 | 9 | return cmp; |
341 | 9 | } |
342 | 65 | if (int cmp = _compare_booleans(other._prerelease, _prerelease); cmp != 0) { |
343 | 4 | return cmp; |
344 | 4 | } |
345 | 61 | if (_pre.has_value()) { |
346 | 48 | if (other._pre.has_value()) { |
347 | 42 | return _pre.value().compare_to(other._pre.value()); |
348 | 42 | } else { |
349 | 6 | return -1; |
350 | 6 | } |
351 | 48 | } else if (other._pre.has_value()) { |
352 | 3 | return 1; |
353 | 3 | } |
354 | 10 | return 0; |
355 | 61 | } |
356 | | |
357 | 5 | bool SemanticVersion::operator==(const SemanticVersion& other) const { |
358 | 5 | return compare_to(other) == 0; |
359 | 5 | } |
360 | | |
361 | 0 | bool SemanticVersion::operator!=(const SemanticVersion& other) const { |
362 | 0 | return !(*this == other); |
363 | 0 | } |
364 | | |
365 | 0 | std::string SemanticVersion::to_string() const { |
366 | 0 | std::string result = |
367 | 0 | std::to_string(_major) + "." + std::to_string(_minor) + "." + std::to_string(_patch); |
368 | 0 | if (_prerelease && _unknown) result += _unknown.value(); |
369 | 0 | if (_pre) result += _pre.value().to_string(); |
370 | 0 | if (_build_info) result += _build_info.value(); |
371 | 0 | return result; |
372 | 0 | } |
373 | | |
374 | | SemanticVersion::NumberOrString::NumberOrString(const std::string& value_string) |
375 | 153 | : _original(value_string) { |
376 | 153 | const static std::regex NUMERIC("\\d+"); |
377 | 153 | _is_numeric = std::regex_match(_original, NUMERIC); |
378 | 153 | _number = -1; |
379 | 153 | if (_is_numeric) { |
380 | 71 | _number = std::stoi(_original); |
381 | 71 | } |
382 | 153 | } |
383 | | |
384 | | SemanticVersion::NumberOrString::NumberOrString(const NumberOrString& other) |
385 | 260 | : _original(other._original), _is_numeric(other._is_numeric), _number(other._number) {} |
386 | | |
387 | 77 | int SemanticVersion::NumberOrString::compare_to(const SemanticVersion::NumberOrString& that) const { |
388 | 77 | if (this->_is_numeric != that._is_numeric) { |
389 | 7 | return this->_is_numeric ? -1 : 1; |
390 | 7 | } |
391 | | |
392 | 70 | if (_is_numeric) { |
393 | 27 | return this->_number - that._number; |
394 | 27 | } |
395 | | |
396 | 43 | return this->_original.compare(that._original); |
397 | 70 | } |
398 | | |
399 | 0 | std::string SemanticVersion::NumberOrString::to_string() const { |
400 | 0 | return _original; |
401 | 0 | } |
402 | | |
403 | 0 | bool SemanticVersion::NumberOrString::operator<(const SemanticVersion::NumberOrString& that) const { |
404 | 0 | return compare_to(that) < 0; |
405 | 0 | } |
406 | | |
407 | | bool SemanticVersion::NumberOrString::operator==( |
408 | 0 | const SemanticVersion::NumberOrString& that) const { |
409 | 0 | return compare_to(that) == 0; |
410 | 0 | } |
411 | | |
412 | | bool SemanticVersion::NumberOrString::operator!=( |
413 | 0 | const SemanticVersion::NumberOrString& that) const { |
414 | 0 | return !(*this == that); |
415 | 0 | } |
416 | | |
417 | 0 | bool SemanticVersion::NumberOrString::operator>(const SemanticVersion::NumberOrString& that) const { |
418 | 0 | return compare_to(that) > 0; |
419 | 0 | } |
420 | | |
421 | | bool SemanticVersion::NumberOrString::operator<=( |
422 | 0 | const SemanticVersion::NumberOrString& that) const { |
423 | 0 | return !(*this > that); |
424 | 0 | } |
425 | | |
426 | | bool SemanticVersion::NumberOrString::operator>=( |
427 | 0 | const SemanticVersion::NumberOrString& that) const { |
428 | 0 | return !(*this < that); |
429 | 0 | } |
430 | | |
431 | 335 | int SemanticVersion::_compare_integers(int x, int y) { |
432 | 335 | return (x < y) ? -1 : ((x == y) ? 0 : 1); |
433 | 335 | } |
434 | | |
435 | 65 | int SemanticVersion::_compare_booleans(bool x, bool y) { |
436 | 65 | return (x == y) ? 0 : (x ? 1 : -1); |
437 | 65 | } |
438 | | |
439 | | std::vector<std::string> SemanticVersion::Prerelease::_split(const std::string& s, |
440 | 75 | const std::regex& delimiter) { |
441 | 75 | std::sregex_token_iterator iter(s.begin(), s.end(), delimiter, -1); |
442 | 75 | std::sregex_token_iterator end; |
443 | 75 | std::vector<std::string> tokens(iter, end); |
444 | 75 | return tokens; |
445 | 75 | } |
446 | | |
447 | 75 | SemanticVersion::Prerelease::Prerelease(std::string original) : _original(std::move(original)) { |
448 | 75 | static const std::regex DOT("\\."); |
449 | 75 | auto parts = _split(_original, DOT); |
450 | 153 | for (const auto& part : parts) { |
451 | 153 | NumberOrString number_or_string(part); |
452 | 153 | _identifiers.emplace_back(number_or_string); |
453 | 153 | } |
454 | 75 | } |
455 | | |
456 | 42 | int SemanticVersion::Prerelease::compare_to(const Prerelease& that) const { |
457 | 42 | auto size = std::min(this->_identifiers.size(), that._identifiers.size()); |
458 | 97 | for (int i = 0; i < size; ++i) { |
459 | 77 | int cmp = this->_identifiers[i].compare_to(that._identifiers[i]); |
460 | 77 | if (cmp != 0) { |
461 | 22 | return cmp; |
462 | 22 | } |
463 | 77 | } |
464 | 20 | return static_cast<int>(this->_identifiers.size()) - static_cast<int>(that._identifiers.size()); |
465 | 42 | } |
466 | | |
467 | 0 | std::string SemanticVersion::Prerelease::to_string() const { |
468 | 0 | return _original; |
469 | 0 | } |
470 | | |
471 | 0 | bool SemanticVersion::Prerelease::operator<(const Prerelease& that) const { |
472 | 0 | return compare_to(that) < 0; |
473 | 0 | } |
474 | | |
475 | 0 | bool SemanticVersion::Prerelease::operator==(const Prerelease& that) const { |
476 | 0 | return compare_to(that) == 0; |
477 | 0 | } |
478 | | |
479 | 0 | bool SemanticVersion::Prerelease::operator!=(const Prerelease& that) const { |
480 | 0 | return !(*this == that); |
481 | 0 | } |
482 | | |
483 | 0 | bool SemanticVersion::Prerelease::operator>(const Prerelease& that) const { |
484 | 0 | return compare_to(that) > 0; |
485 | 0 | } |
486 | | |
487 | 0 | bool SemanticVersion::Prerelease::operator<=(const Prerelease& that) const { |
488 | 0 | return !(*this > that); |
489 | 0 | } |
490 | | |
491 | 0 | bool SemanticVersion::Prerelease::operator>=(const Prerelease& that) const { |
492 | 0 | return !(*this < that); |
493 | 0 | } |
494 | | |
495 | | const SemanticVersion CorruptStatistics::PARQUET_251_FIXED_VERSION(1, 8, 0); |
496 | | const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_START(1, 5, 0, std::nullopt, |
497 | | "cdh5.5.0", std::nullopt); |
498 | | const SemanticVersion CorruptStatistics::CDH_5_PARQUET_251_FIXED_END(1, 5, 0); |
499 | | |
500 | | bool CorruptStatistics::should_ignore_statistics(const std::string& created_by, |
501 | 263 | tparquet::Type::type physical_type) { |
502 | 263 | if (physical_type != tparquet::Type::BYTE_ARRAY && |
503 | 263 | physical_type != tparquet::Type::FIXED_LEN_BYTE_ARRAY) { |
504 | | // The bug only applies to binary columns |
505 | 223 | return false; |
506 | 223 | } |
507 | | |
508 | 40 | if (created_by.empty()) { |
509 | | // created_by is not populated |
510 | 0 | VLOG_DEBUG |
511 | 0 | << "Ignoring statistics because created_by is null or empty! See PARQUET-251 and " |
512 | 0 | "PARQUET-297"; |
513 | 0 | return true; |
514 | 0 | } |
515 | | |
516 | 40 | Status status; |
517 | 40 | std::unique_ptr<ParsedVersion> parsed_version; |
518 | 40 | status = VersionParser::parse(created_by, &parsed_version); |
519 | 40 | if (!status.ok()) { |
520 | 1 | VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " |
521 | 0 | "PARQUET-251)." |
522 | 0 | " CreatedBy: " |
523 | 0 | << created_by << ", msg: " << status.msg(); |
524 | 1 | return true; |
525 | 1 | } |
526 | | |
527 | 39 | if (parsed_version->application() != "parquet-mr") { |
528 | | // Assume other applications don't have this bug |
529 | 7 | return false; |
530 | 7 | } |
531 | | |
532 | 32 | if ((!parsed_version->version().has_value()) || parsed_version->version().value().empty()) { |
533 | 3 | VLOG_DEBUG << "Ignoring statistics because created_by did not contain a semver (see " |
534 | 0 | "PARQUET-251): " |
535 | 0 | << created_by; |
536 | 3 | return true; |
537 | 3 | } |
538 | | |
539 | 29 | std::unique_ptr<SemanticVersion> semantic_version; |
540 | 29 | status = SemanticVersion::parse(parsed_version->version().value(), &semantic_version); |
541 | 29 | if (!status.ok()) { |
542 | 0 | VLOG_DEBUG << "Ignoring statistics because created_by could not be parsed (see " |
543 | 0 | "PARQUET-251)." |
544 | 0 | " CreatedBy: " |
545 | 0 | << created_by << ", msg: " << status.msg(); |
546 | 0 | return true; |
547 | 0 | } |
548 | 29 | if (semantic_version->compare_to(PARQUET_251_FIXED_VERSION) < 0 && |
549 | 29 | !(semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_START) >= 0 && |
550 | 22 | semantic_version->compare_to(CDH_5_PARQUET_251_FIXED_END) < 0)) { |
551 | 18 | VLOG_DEBUG |
552 | 0 | << "Ignoring statistics because this file was created prior to the fixed version, " |
553 | 0 | "see PARQUET-251"; |
554 | 18 | return true; |
555 | 18 | } |
556 | | |
557 | | // This file was created after the fix |
558 | 11 | return false; |
559 | 29 | } |
560 | | #include "common/compile_check_end.h" |
561 | | |
562 | | } // namespace doris |