Coverage Report

Created: 2026-03-13 14:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/parquet/parquet_common.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/parquet_types.h>
21
#include <stddef.h>
22
23
#include <cstdint>
24
#include <ostream>
25
#include <regex>
26
#include <string>
27
#include <unordered_set>
28
#include <vector>
29
30
#include "core/column/column_nullable.h"
31
#include "storage/segment/row_ranges.h"
32
33
namespace doris {
34
#include "common/compile_check_begin.h"
35
using level_t = int16_t; // 16-bit level value; FilterMap::generate_nested_filter_map takes its rep_levels as a vector of this type
36
37
using segment_v2::RowRange; // made visible inside namespace doris without the segment_v2:: qualifier
38
using segment_v2::RowRanges;
39
40
#pragma pack(1)
41
struct ParquetInt96 { // 96-bit value laid out as a 64-bit low part followed by a 32-bit high part; declared under #pragma pack(1)
42
    int64_t lo; // nanoseconds elapsed within the day
43
    int32_t hi; // day count since the Julian epoch
44
45
76.6M
    NO_SANITIZE_UNDEFINED inline int64_t to_timestamp_micros() const { // (hi - JULIAN_EPOCH_OFFSET_DAYS) days expressed in microseconds, plus lo integer-divided down to microseconds
46
76.6M
        return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND;
47
76.6M
    }
48
0
    inline __int128 to_int128() const { // hi shifted up by 64 bits, plus lo; lo is signed, so it is sign-extended before the addition
49
0
        __int128 ans = 0;
50
0
        ans = (((__int128)hi) << 64) + lo;
51
0
        return ans;
52
0
    }
53
54
    static const int32_t JULIAN_EPOCH_OFFSET_DAYS; // declared only; the three constants get their values outside this class body
55
    static const int64_t MICROS_IN_DAY;
56
    static const int64_t NANOS_PER_MICROSECOND;
57
};
58
#pragma pack()
59
static_assert(sizeof(ParquetInt96) == 12, "The size of ParquetInt96 is not 12.");
60
61
class FilterMap { // Wraps a pointer/length pair of filter bytes plus summary flags; a default-constructed map reports "no filter"
62
public:
63
322k
    FilterMap() = default;
64
    Status init(const uint8_t* filter_map_data, size_t filter_map_size, bool filter_all); // defined out of line; NOTE(review): presumably stores the buffer and derives _has_filter/_filter_ratio — confirm in the .cpp
65
66
    Status generate_nested_filter_map(const std::vector<level_t>& rep_levels,
67
                                      std::vector<uint8_t>& nested_filter_map_data,
68
                                      std::unique_ptr<FilterMap>* nested_filter_map,
69
                                      size_t* current_row_ptr, size_t start_index = 0) const; // defined out of line; results are handed back through the non-const pointer/reference parameters
70
71
711M
    const uint8_t* filter_map_data() const { return _filter_map_data; } // nullptr on a default-constructed map
72
680M
    size_t filter_map_size() const { return _filter_map_size; }
73
3.77M
    bool has_filter() const { return _has_filter; }
74
27.2k
    bool filter_all() const { return _filter_all; }
75
351k
    double filter_ratio() const { return _has_filter ? _filter_ratio : 0; } // always 0 while no filter is active, whatever _filter_ratio holds
76
77
    bool can_filter_all(size_t remaining_num_values, size_t filter_map_index); // defined out of line; note it is not const-qualified
78
79
private:
80
    bool _has_filter = false;
81
    bool _filter_all = false;
82
    const uint8_t* _filter_map_data = nullptr; // raw pointer; this class declares no destructor, so nothing here releases it
83
    size_t _filter_map_size = 0;
84
    double _filter_ratio = 0;
85
};
86
87
class ColumnSelectVector { // Hands out runs of values tagged with a DataReadType; state is supplied by init() and consumed through get_next_run()
88
public:
89
    enum DataReadType : uint8_t { CONTENT = 0, NULL_DATA, FILTERED_CONTENT, FILTERED_NULL }; // tag attached to each run returned by get_next_run
90
91
1.17M
    ColumnSelectVector() = default; // every member starts from its in-class default below, so the accessors are well-defined before init()
92
93
    Status init(const std::vector<uint16_t>& run_length_null_map, size_t num_values,
94
                NullMap* null_map, FilterMap* filter_map, size_t filter_map_index,
95
                const std::unordered_set<size_t>* skipped_indices = nullptr); // defined out of line
96
97
5.43M
    size_t num_values() const { return _num_values; }
98
99
973k
    size_t num_nulls() const { return _num_nulls; }
100
101
942k
    size_t num_filtered() const { return _num_filtered; }
102
103
1.17M
    bool has_filter() const { return _has_filter; }
104
105
    template <bool has_filter> // must match has_filter() (DCHECK'd). Returns the length of the next run and stores its type in *data_read_type; returns 0 when exhausted, in which case *data_read_type is not meaningful. <true> groups equal neighbours of _data_map; <false> walks _run_length_null_map, skipping zero-length runs
106
61.3M
    size_t get_next_run(DataReadType* data_read_type) {
107
61.3M
        DCHECK_EQ(_has_filter, has_filter);
108
61.3M
        if constexpr (has_filter) {
109
57.5M
            if (_read_index == _num_values) {
110
315k
                return 0;
111
315k
            }
112
57.2M
            const DataReadType& type = _data_map[_read_index++];
113
57.2M
            size_t run_length = 1;
114
676M
            while (_read_index < _num_values) {
115
673M
                if (_data_map[_read_index] == type) {
116
619M
                    run_length++;
117
619M
                    _read_index++;
118
619M
                } else {
119
54.1M
                    break;
120
54.1M
                }
121
673M
            }
122
57.2M
            *data_read_type = type;
123
57.2M
            return run_length;
124
57.5M
        } else {
125
3.74M
            size_t run_length = 0;
126
6.64M
            while (run_length == 0) {
127
3.75M
                if (_read_index == (*_run_length_null_map).size()) {
128
855k
                    return 0;
129
855k
                }
130
2.90M
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
131
2.90M
                run_length = (*_run_length_null_map)[_read_index++];
132
2.90M
            }
133
2.88M
            return run_length;
134
3.74M
        }
135
61.3M
    }
_ZN5doris18ColumnSelectVector12get_next_runILb1EEEmPNS0_12DataReadTypeE
Line
Count
Source
106
57.5M
    size_t get_next_run(DataReadType* data_read_type) {
107
57.5M
        DCHECK_EQ(_has_filter, has_filter);
108
57.5M
        if constexpr (has_filter) {
109
57.5M
            if (_read_index == _num_values) {
110
315k
                return 0;
111
315k
            }
112
57.2M
            const DataReadType& type = _data_map[_read_index++];
113
57.2M
            size_t run_length = 1;
114
676M
            while (_read_index < _num_values) {
115
673M
                if (_data_map[_read_index] == type) {
116
619M
                    run_length++;
117
619M
                    _read_index++;
118
619M
                } else {
119
54.1M
                    break;
120
54.1M
                }
121
673M
            }
122
57.2M
            *data_read_type = type;
123
57.2M
            return run_length;
124
        } else {
125
            size_t run_length = 0;
126
            while (run_length == 0) {
127
                if (_read_index == (*_run_length_null_map).size()) {
128
                    return 0;
129
                }
130
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
131
                run_length = (*_run_length_null_map)[_read_index++];
132
            }
133
            return run_length;
134
        }
135
57.5M
    }
_ZN5doris18ColumnSelectVector12get_next_runILb0EEEmPNS0_12DataReadTypeE
Line
Count
Source
106
3.74M
    size_t get_next_run(DataReadType* data_read_type) {
107
3.74M
        DCHECK_EQ(_has_filter, has_filter);
108
        if constexpr (has_filter) {
109
            if (_read_index == _num_values) {
110
                return 0;
111
            }
112
            const DataReadType& type = _data_map[_read_index++];
113
            size_t run_length = 1;
114
            while (_read_index < _num_values) {
115
                if (_data_map[_read_index] == type) {
116
                    run_length++;
117
                    _read_index++;
118
                } else {
119
                    break;
120
                }
121
            }
122
            *data_read_type = type;
123
            return run_length;
124
3.74M
        } else {
125
3.74M
            size_t run_length = 0;
126
6.64M
            while (run_length == 0) {
127
3.75M
                if (_read_index == (*_run_length_null_map).size()) {
128
855k
                    return 0;
129
855k
                }
130
2.90M
                *data_read_type = _read_index % 2 == 0 ? CONTENT : NULL_DATA;
131
2.90M
                run_length = (*_run_length_null_map)[_read_index++];
132
2.90M
            }
133
2.88M
            return run_length;
134
3.74M
        }
135
3.74M
    }
136
137
private:
138
    std::vector<DataReadType> _data_map; // one entry per value; scanned by get_next_run<true>
139
    // Run lengths alternating between non-null and null values: even indices are non-null (CONTENT) runs, odd indices are null (NULL_DATA) runs.
140
    const std::vector<uint16_t>* _run_length_null_map = nullptr; // read by get_next_run<false>; stays null until a map is supplied, so an early call fails on a null dereference rather than a wild pointer
141
    bool _has_filter = false;
142
    size_t _num_values = 0;
143
    size_t _num_nulls = 0;
144
    size_t _num_filtered = 0;
145
    size_t _read_index = 0; // cursor into _data_map (filtered path) or *_run_length_null_map (unfiltered path)
146
};
147
148
enum class ColumnOrderName { UNDEFINED, TYPE_DEFINED_ORDER }; // NOTE(review): presumably mirrors the column-order field of Parquet file metadata — confirm against the readers that use it
149
150
enum class SortOrder { SIGNED, UNSIGNED, UNKNOWN }; // NOTE(review): presumably the comparison order assumed for a column's min/max statistics — confirm against callers
151
152
class ParsedVersion { // Value holder for an application name plus an optional version string and an optional build hash
153
public:
154
    ParsedVersion(std::string application, std::optional<std::string> version,
155
                  std::optional<std::string> app_build_hash); // defined out of line; arguments are taken by value
156
157
39
    const std::string& application() const { return _application; } // the accessors return references to members, valid only while this object is alive
158
159
90
    const std::optional<std::string>& version() const { return _version; }
160
161
0
    const std::optional<std::string>& app_build_hash() const { return _app_build_hash; }
162
163
    bool operator==(const ParsedVersion& other) const;
164
165
    bool operator!=(const ParsedVersion& other) const;
166
167
    size_t hash() const;
168
169
    std::string to_string() const;
170
171
private:
172
    std::string _application;
173
    std::optional<std::string> _version; // may be empty (std::nullopt)
174
    std::optional<std::string> _app_build_hash; // may be empty (std::nullopt)
175
};
176
177
class VersionParser { // Static-only helper; holds no state
178
public:
179
    static Status parse(const std::string& created_by,
180
                        std::unique_ptr<ParsedVersion>* parsed_version); // defined out of line; NOTE(review): presumably fills *parsed_version from the created_by text on success — confirm in the .cpp
181
};
182
183
class SemanticVersion { // Version built from major/minor/patch integers with optional "unknown", pre-release and build-info parts
184
public:
185
    SemanticVersion(int major, int minor, int patch);
186
187
#ifdef BE_TEST
188
    SemanticVersion(int major, int minor, int patch, bool has_unknown); // available only when BE_TEST is defined
189
#endif
190
191
    SemanticVersion(int major, int minor, int patch, std::optional<std::string> unknown,
192
                    std::optional<std::string> pre, std::optional<std::string> build_info);
193
194
    static Status parse(const std::string& version,
195
                        std::unique_ptr<SemanticVersion>* semantic_version); // defined out of line; the result is handed back through semantic_version
196
197
    int compare_to(const SemanticVersion& other) const; // NOTE(review): presumably negative/zero/positive ordering — confirm in the .cpp
198
199
    bool operator==(const SemanticVersion& other) const;
200
201
    bool operator!=(const SemanticVersion& other) const;
202
203
    std::string to_string() const;
204
205
private:
206
    class NumberOrString { // One identifier that keeps its original text plus a numeric flag and an int value
207
    public:
208
        explicit NumberOrString(const std::string& value_string);
209
210
        NumberOrString(const NumberOrString& other); // NOTE(review): copy constructor is user-declared but no copy assignment operator is — confirm the implicit assignment is acceptable
211
212
        int compare_to(const NumberOrString& that) const;
213
        std::string to_string() const;
214
215
        bool operator<(const NumberOrString& that) const;
216
        bool operator==(const NumberOrString& that) const;
217
        bool operator!=(const NumberOrString& that) const;
218
        bool operator>(const NumberOrString& that) const;
219
        bool operator<=(const NumberOrString& that) const;
220
        bool operator>=(const NumberOrString& that) const;
221
222
    private:
223
        std::string _original;
224
        bool _is_numeric; // no in-class default; must be set by the constructors
225
        int _number; // no in-class default; must be set by the constructors
226
    };
227
228
    class Prerelease { // Pre-release part: the original text plus a list of NumberOrString identifiers
229
    public:
230
        explicit Prerelease(std::string original);
231
232
        int compare_to(const Prerelease& that) const;
233
        std::string to_string() const;
234
235
        bool operator<(const Prerelease& that) const;
236
        bool operator==(const Prerelease& that) const;
237
        bool operator!=(const Prerelease& that) const;
238
        bool operator>(const Prerelease& that) const;
239
        bool operator<=(const Prerelease& that) const;
240
        bool operator>=(const Prerelease& that) const;
241
242
0
        const std::string& original() const { return _original; } // reference to the stored text, valid while this object is alive
243
244
    private:
245
        static std::vector<std::string> _split(const std::string& s, const std::regex& delimiter);
246
247
        std::string _original;
248
        std::vector<NumberOrString> _identifiers; // NOTE(review): presumably the pieces of _original produced by _split — confirm in the .cpp
249
    };
250
251
    static int _compare_integers(int x, int y);
252
    static int _compare_booleans(bool x, bool y);
253
254
    int _major; // the scalar members below have no in-class defaults; every constructor is defined out of line
255
    int _minor;
256
    int _patch;
257
    bool _prerelease; // NOTE(review): presumably true when _pre holds a value — confirm in the .cpp
258
    std::optional<std::string> _unknown;
259
    std::optional<Prerelease> _pre;
260
    std::optional<std::string> _build_info;
261
};
262
263
class CorruptStatistics { // Static-only helper exposing a single predicate over a writer string and a physical type
264
public:
265
    static bool should_ignore_statistics(const std::string& created_by,
266
                                         tparquet::Type::type physical_type); // defined out of line; NOTE(review): presumably true when the writer named in created_by is known to emit unreliable statistics for physical_type — confirm in the .cpp
267
268
private:
269
    static const SemanticVersion PARQUET_251_FIXED_VERSION; // declared only, defined elsewhere; NOTE(review): names point at issue PARQUET-251 — presumably the version bounds where its fix landed, confirm
270
    static const SemanticVersion CDH_5_PARQUET_251_FIXED_START;
271
    static const SemanticVersion CDH_5_PARQUET_251_FIXED_END;
272
};
273
#include "common/compile_check_end.h"
274
275
} // namespace doris