Coverage Report

Created: 2026-04-10 04:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/parquet/parquet_common.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/parquet_types.h>
21
#include <stddef.h>
22
23
#include <cstdint>
24
#include <ostream>
25
#include <regex>
26
#include <string>
27
#include <unordered_set>
28
#include <vector>
29
30
#include "core/column/column_nullable.h"
31
#include "storage/segment/row_ranges.h"
32
33
namespace doris {
34
using level_t = int16_t;
35
36
using segment_v2::RowRange;
37
using segment_v2::RowRanges;
38
39
#pragma pack(1)
40
struct ParquetInt96 {
41
    int64_t lo; // time of nanoseconds in a day
42
    int32_t hi; // days from julian epoch
43
44
0
    NO_SANITIZE_UNDEFINED inline int64_t to_timestamp_micros() const {
45
0
        return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND;
46
0
    }
47
0
    inline __int128 to_int128() const {
48
0
        __int128 ans = 0;
49
0
        ans = (((__int128)hi) << 64) + lo;
50
0
        return ans;
51
0
    }
52
53
    static const int32_t JULIAN_EPOCH_OFFSET_DAYS;
54
    static const int64_t MICROS_IN_DAY;
55
    static const int64_t NANOS_PER_MICROSECOND;
56
};
57
#pragma pack()
58
static_assert(sizeof(ParquetInt96) == 12, "The size of ParquetInt96 is not 12.");
59
60
class FilterMap {
61
public:
62
191
    FilterMap() = default;
63
    Status init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all);
64
65
    Status generate_nested_filter_map(const std::vector<level_t>& rep_levels,
66
                                      std::vector<uint8_t>& nested_filter_map_data,
67
                                      std::unique_ptr<FilterMap>* nested_filter_map,
68
                                      size_t* current_row_ptr, size_t start_index = 0) const;
69
70
195
    const uint8_t* filter_map_data() const { return _filter_map_data; }
71
196
    size_t filter_map_size() const { return _filter_map_size; }
72
703
    bool has_filter() const { return _has_filter; }
73
24
    bool filter_all() const { return _filter_all; }
74
5
    double filter_ratio() const { return _has_filter ? _filter_ratio : 0; }
75
76
    bool can_filter_all(size_t remaining_num_values, size_t filter_map_index);
77
78
private:
79
    bool _has_filter = false;
80
    bool _filter_all = false;
81
    const uint8_t* _filter_map_data = nullptr;
82
    size_t _filter_map_size = 0;
83
    double _filter_ratio = 0;
84
};
85
86
class ColumnSelectVector {
87
public:
88
    enum DataReadType : uint8_t { CONTENT = 0, NULL_DATA, FILTERED_CONTENT, FILTERED_NULL };
89
90
268
    ColumnSelectVector() = default;
91
92
    Status init(const std::vector<uint16_t>& run_length_null_map, size_t num_values,
93
                NullMap* null_map, FilterMap* filter_map, size_t filter_map_index,
94
                const std::unordered_set<size_t>* skipped_indices = nullptr);
95
96
1.00k
    size_t num_values() const { return _num_values; }
97
98
225
    size_t num_nulls() const { return _num_nulls; }
99
100
194
    size_t num_filtered() const { return _num_filtered; }
101
102
263
    bool has_filter() const { return _has_filter; }
103
104
    template <bool has_filter>
105
669
    size_t get_next_run(DataReadType* data_read_type) {
106
669
        DCHECK_EQ(_has_filter, has_filter);
107
669
        if constexpr (has_filter) {
108
205
            if (_read_index == _num_values) {
109
32
                return 0;
110
32
            }
111
173
            const DataReadType& type = _data_map[_read_index++];
112
173
            size_t run_length = 1;
113
189
            while (_read_index < _num_values) {
114
154
                if (_data_map[_read_index] == type) {
115
16
                    run_length++;
116
16
                    _read_index++;
117
138
                } else {
118
138
                    break;
119
138
                }
120
154
            }
121
173
            *data_read_type = type;
122
173
            return run_length;
123
464
        } else {
124
464
            size_t run_length = 0;
125
714
            while (run_length == 0) {
126
481
                if (_read_index == (*_run_length_null_map).size()) {
127
231
                    return 0;
128
231
                }
129
250
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
130
250
                run_length = (*_run_length_null_map)[_read_index++];
131
250
            }
132
233
            return run_length;
133
464
        }
134
669
    }
_ZN5doris18ColumnSelectVector12get_next_runILb1EEEmPNS0_12DataReadTypeE
Line
Count
Source
105
205
    size_t get_next_run(DataReadType* data_read_type) {
106
205
        DCHECK_EQ(_has_filter, has_filter);
107
205
        if constexpr (has_filter) {
108
205
            if (_read_index == _num_values) {
109
32
                return 0;
110
32
            }
111
173
            const DataReadType& type = _data_map[_read_index++];
112
173
            size_t run_length = 1;
113
189
            while (_read_index < _num_values) {
114
154
                if (_data_map[_read_index] == type) {
115
16
                    run_length++;
116
16
                    _read_index++;
117
138
                } else {
118
138
                    break;
119
138
                }
120
154
            }
121
173
            *data_read_type = type;
122
173
            return run_length;
123
        } else {
124
            size_t run_length = 0;
125
            while (run_length == 0) {
126
                if (_read_index == (*_run_length_null_map).size()) {
127
                    return 0;
128
                }
129
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
130
                run_length = (*_run_length_null_map)[_read_index++];
131
            }
132
            return run_length;
133
        }
134
205
    }
_ZN5doris18ColumnSelectVector12get_next_runILb0EEEmPNS0_12DataReadTypeE
Line
Count
Source
105
464
    size_t get_next_run(DataReadType* data_read_type) {
106
464
        DCHECK_EQ(_has_filter, has_filter);
107
        if constexpr (has_filter) {
108
            if (_read_index == _num_values) {
109
                return 0;
110
            }
111
            const DataReadType& type = _data_map[_read_index++];
112
            size_t run_length = 1;
113
            while (_read_index < _num_values) {
114
                if (_data_map[_read_index] == type) {
115
                    run_length++;
116
                    _read_index++;
117
                } else {
118
                    break;
119
                }
120
            }
121
            *data_read_type = type;
122
            return run_length;
123
464
        } else {
124
464
            size_t run_length = 0;
125
714
            while (run_length == 0) {
126
481
                if (_read_index == (*_run_length_null_map).size()) {
127
231
                    return 0;
128
231
                }
129
250
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
130
250
                run_length = (*_run_length_null_map)[_read_index++];
131
250
            }
132
233
            return run_length;
133
464
        }
134
464
    }
135
136
private:
137
    std::vector<DataReadType> _data_map;
138
    // the length of non-null values and null values are arranged in turn.
139
    const std::vector<uint16_t>* _run_length_null_map;
140
    bool _has_filter;
141
    size_t _num_values;
142
    size_t _num_nulls;
143
    size_t _num_filtered;
144
    size_t _read_index;
145
};
146
147
enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER };
148
149
enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN };
150
151
class ParsedVersion {
152
public:
153
    ParsedVersion(std::string application, std::optional<std::string> version,
154
                  std::optional<std::string> app_build_hash);
155
156
39
    const std::string& application() const { return _application; }
157
158
90
    const std::optional<std::string>& version() const { return _version; }
159
160
0
    const std::optional<std::string>& app_build_hash() const { return _app_build_hash; }
161
162
    bool operator==(const ParsedVersion& other) const;
163
164
    bool operator!=(const ParsedVersion& other) const;
165
166
    size_t hash() const;
167
168
    std::string to_string() const;
169
170
private:
171
    std::string _application;
172
    std::optional<std::string> _version;
173
    std::optional<std::string> _app_build_hash;
174
};
175
176
class VersionParser {
177
public:
178
    static Status parse(const std::string& created_by,
179
                        std::unique_ptr<ParsedVersion>* parsed_version);
180
};
181
182
class SemanticVersion {
183
public:
184
    SemanticVersion(int major, int minor, int patch);
185
186
#ifdef BE_TEST
187
    SemanticVersion(int major, int minor, int patch, bool has_unknown);
188
#endif
189
190
    SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown,
191
                    std::optional<std::string> pre, std::optional<std::string> build_info);
192
193
    static Status parse(const std::string& version,
194
                        std::unique_ptr<SemanticVersion>* semantic_version);
195
196
    int compare_to(const SemanticVersion& other) const;
197
198
    bool operator==(const SemanticVersion& other) const;
199
200
    bool operator!=(const SemanticVersion& other) const;
201
202
    std::string to_string() const;
203
204
private:
205
    class NumberOrString {
206
    public:
207
        explicit NumberOrString(const std::string& value_string);
208
209
        NumberOrString(const NumberOrString& other);
210
211
        int compare_to(const NumberOrString& that) const;
212
        std::string to_string() const;
213
214
        bool operator<(const NumberOrString& that) const;
215
        bool operator==(const NumberOrString& that) const;
216
        bool operator!=(const NumberOrString& that) const;
217
        bool operator>(const NumberOrString& that) const;
218
        bool operator<=(const NumberOrString& that) const;
219
        bool operator>=(const NumberOrString& that) const;
220
221
    private:
222
        std::string _original;
223
        bool _is_numeric;
224
        int _number;
225
    };
226
227
    class Prerelease {
228
    public:
229
        explicit Prerelease(std::string original);
230
231
        int compare_to(const Prerelease& that) const;
232
        std::string to_string() const;
233
234
        bool operator<(const Prerelease& that) const;
235
        bool operator==(const Prerelease& that) const;
236
        bool operator!=(const Prerelease& that) const;
237
        bool operator>(const Prerelease& that) const;
238
        bool operator<=(const Prerelease& that) const;
239
        bool operator>=(const Prerelease& that) const;
240
241
0
        const std::string& original() const { return _original; }
242
243
    private:
244
        static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter);
245
246
        std::string _original;
247
        std::vector<NumberOrString> _identifiers;
248
    };
249
250
    static int _compare_integers(int x, int y);
251
    static int _compare_booleans(bool x, bool y);
252
253
    int _major;
254
    int _minor;
255
    int _patch;
256
    bool _prerelease;
257
    std::optional<std::string> _unknown;
258
    std::optional<Prerelease> _pre;
259
    std::optional<std::string> _build_info;
260
};
261
262
class CorruptStatistics {
263
public:
264
    static bool should_ignore_statistics(const std::string& created_by,
265
                                         tparquet::Type::type physical_type);
266
267
private:
268
    static const SemanticVersion PARQUET_251_FIXED_VERSION;
269
    static const SemanticVersion CDH_5_PARQUET_251_FIXED_START;
270
    static const SemanticVersion CDH_5_PARQUET_251_FIXED_END;
271
};
272
273
} // namespace doris