Coverage Report

Created: 2025-08-27 05:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/olap/hll.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <cstdint>
21
#include <cstring>
22
#include <map>
23
#include <string>
24
#include <utility>
25
26
#include "vec/common/hash_table/phmap_fwd_decl.h"
27
28
namespace doris {
29
#include "common/compile_check_begin.h"
30
struct Slice;
31
32
inline const int HLL_COLUMN_PRECISION = 14;
33
inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
34
inline const int HLL_EXPLICIT_INT64_NUM = 160;
35
inline const int HLL_SPARSE_THRESHOLD = 4096;
36
inline const uint16_t HLL_REGISTERS_COUNT = 16 * 1024;
37
// maximum size in byte of serialized HLL: type(1) + registers (2^14)
38
inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1;
39
40
// 1 for type; 1 for hash values count; 8 for hash value
41
inline const int HLL_SINGLE_VALUE_SIZE = 10;
42
inline const int HLL_EMPTY_SIZE = 1;
43
44
// Hyperloglog distinct estimate algorithm.
45
// See these papers for more details.
46
// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation
47
// algorithm (2007)
48
// 2) HyperLogLog in Practice (paper from google with some improvements)
49
50
// Each HLL value is a set of values. To save space, Doris store HLL value
51
// in different format according to its cardinality.
52
//
53
// HLL_DATA_EMPTY: when set is empty.
54
//
55
// HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit.
56
// If the number of hash values is not greater than 160, set is encoded in this format.
57
// The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen,
58
// maybe can be other number. If you are interested, you can try other number and see
59
// if it will be better.
60
//
61
// HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers
62
// is not greater than 4096, set is encoded in this format. The max space occupied is
63
// (1 + 4 + 3 * 4096) = 12293.
64
//
65
// HLL_DATA_FULL: most space-consuming, store all registers
66
//
67
// A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not
68
// allow reverse.
69
//
70
// NOTE: This values are persisted in storage devices, so don't change exist
71
// enum values.
72
enum HllDataType {
73
    HLL_DATA_EMPTY = 0,
74
    HLL_DATA_EXPLICIT = 1,
75
    HLL_DATA_SPARSE = 2,
76
    HLL_DATA_FULL = 3,
77
};
78
79
class HyperLogLog {
80
public:
81
802k
    HyperLogLog() = default;
82
    explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) {
83
        _hash_set.emplace(hash_value);
84
    }
85
    explicit HyperLogLog(const Slice& src);
86
87
58.7k
    HyperLogLog(const HyperLogLog& other) {
88
58.7k
        this->_type = other._type;
89
58.7k
        switch (other._type) {
90
1.77k
        case HLL_DATA_EMPTY:
91
1.77k
            break;
92
56.9k
        case HLL_DATA_EXPLICIT: {
93
56.9k
            this->_hash_set = other._hash_set;
94
56.9k
            break;
95
0
        }
96
2
        case HLL_DATA_SPARSE:
97
8
        case HLL_DATA_FULL: {
98
8
            _registers = new uint8_t[HLL_REGISTERS_COUNT];
99
8
            memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
100
8
            break;
101
2
        }
102
0
        default:
103
0
            break;
104
58.7k
        }
105
58.7k
    }
106
107
55.1k
    HyperLogLog(HyperLogLog&& other) noexcept {
108
55.1k
        this->_type = other._type;
109
55.1k
        switch (other._type) {
110
3.00k
        case HLL_DATA_EMPTY:
111
3.00k
            break;
112
52.1k
        case HLL_DATA_EXPLICIT: {
113
52.1k
            this->_hash_set = std::move(other._hash_set);
114
52.1k
            other._type = HLL_DATA_EMPTY;
115
52.1k
            break;
116
0
        }
117
2
        case HLL_DATA_SPARSE:
118
8
        case HLL_DATA_FULL: {
119
8
            this->_registers = other._registers;
120
8
            other._registers = nullptr;
121
8
            other._type = HLL_DATA_EMPTY;
122
8
            break;
123
2
        }
124
0
        default:
125
0
            break;
126
55.1k
        }
127
55.1k
    }
128
129
0
    HyperLogLog& operator=(HyperLogLog&& other) noexcept {
130
0
        if (this != &other) {
131
0
            if (_registers != nullptr) {
132
0
                delete[] _registers;
133
0
                _registers = nullptr;
134
0
            }
135
136
0
            this->_type = other._type;
137
0
            switch (other._type) {
138
0
            case HLL_DATA_EMPTY:
139
0
                break;
140
0
            case HLL_DATA_EXPLICIT: {
141
0
                this->_hash_set = std::move(other._hash_set);
142
0
                other._type = HLL_DATA_EMPTY;
143
0
                break;
144
0
            }
145
0
            case HLL_DATA_SPARSE:
146
0
            case HLL_DATA_FULL: {
147
0
                this->_registers = other._registers;
148
0
                other._registers = nullptr;
149
0
                other._type = HLL_DATA_EMPTY;
150
0
                break;
151
0
            }
152
0
            default:
153
0
                break;
154
0
            }
155
0
        }
156
0
        return *this;
157
0
    }
158
159
1.75k
    HyperLogLog& operator=(const HyperLogLog& other) {
160
1.75k
        if (this != &other) {
161
1.71k
            if (_registers != nullptr) {
162
0
                delete[] _registers;
163
0
                _registers = nullptr;
164
0
            }
165
166
1.71k
            this->_type = other._type;
167
1.71k
            switch (other._type) {
168
55
            case HLL_DATA_EMPTY:
169
55
                break;
170
1.65k
            case HLL_DATA_EXPLICIT: {
171
1.65k
                this->_hash_set = other._hash_set;
172
1.65k
                break;
173
0
            }
174
0
            case HLL_DATA_SPARSE:
175
0
            case HLL_DATA_FULL: {
176
0
                _registers = new uint8_t[HLL_REGISTERS_COUNT];
177
0
                memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
178
0
                break;
179
0
            }
180
0
            default:
181
0
                break;
182
1.71k
            }
183
1.71k
        }
184
1.75k
        return *this;
185
1.75k
    }
186
187
907k
    ~HyperLogLog() { clear(); }
188
907k
    void clear() {
189
907k
        _type = HLL_DATA_EMPTY;
190
907k
        _hash_set.clear();
191
907k
        delete[] _registers;
192
907k
        _registers = nullptr;
193
907k
    }
194
195
    using SetTypeValueType = uint8_t;
196
    using SparseLengthValueType = int32_t;
197
    using SparseIndexType = uint16_t;
198
    using SparseValueType = uint8_t;
199
200
    // Add a hash value to this HLL value
201
    // NOTE: input must be a hash_value
202
    void update(uint64_t hash_value);
203
204
    void merge(const HyperLogLog& other);
205
206
    // Return max size of serialized binary
207
    size_t max_serialized_size() const;
208
209
0
    size_t memory_consumed() const {
210
0
        size_t size = sizeof(*this);
211
0
        if (_type == HLL_DATA_EXPLICIT) {
212
0
            size += _hash_set.size() * sizeof(uint64_t);
213
0
        } else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) {
214
0
            size += HLL_REGISTERS_COUNT;
215
0
        }
216
0
        return size;
217
0
    }
218
219
    // Input slice should has enough capacity for serialize, which
220
    // can be get through max_serialized_size(). If insufficient buffer
221
    // is given, this will cause process crash.
222
    // Return actual size of serialized binary.
223
    size_t serialize(uint8_t* dst) const;
224
225
    // Now, only empty HLL support this function.
226
    bool deserialize(const Slice& slice);
227
228
    int64_t estimate_cardinality() const;
229
230
21
    static HyperLogLog empty() { return HyperLogLog {}; }
231
232
    // Check if input slice is a valid serialized binary of HyperLogLog.
233
    // This function only check the encoded type in slice, whose complex
234
    // function is O(1).
235
    static bool is_valid(const Slice& slice);
236
237
    // only for debug
238
    std::string to_string() const {
239
        switch (_type) {
240
        case HLL_DATA_EMPTY:
241
            return {};
242
        case HLL_DATA_EXPLICIT:
243
        case HLL_DATA_SPARSE:
244
        case HLL_DATA_FULL: {
245
            std::string str {"hash set size: "};
246
            str.append(std::to_string(_hash_set.size()));
247
            str.append("\ncardinality:\t");
248
            str.append(std::to_string(estimate_cardinality()));
249
            str.append("\ntype:\t");
250
            str.append(std::to_string(_type));
251
            return str;
252
        }
253
        default:
254
            return {};
255
        }
256
    }
257
258
private:
259
    void _convert_explicit_to_register();
260
261
    // update one hash value into this registers
262
49.6M
    void _update_registers(uint64_t hash_value) {
263
        // Use the lower bits to index into the number of streams and then
264
        // find the first 1 bit after the index bits.
265
49.6M
        int idx = hash_value % HLL_REGISTERS_COUNT;
266
49.6M
        hash_value >>= HLL_COLUMN_PRECISION;
267
        // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
268
49.6M
        hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
269
49.6M
        auto first_one_bit = uint8_t(__builtin_ctzl(hash_value) + 1);
270
49.6M
        _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]);
271
49.6M
    }
272
273
    // absorb other registers into this registers
274
3
    void _merge_registers(const uint8_t* other_registers) {
275
3
        _do_simd_merge(_registers, other_registers);
276
3
    }
277
278
3
    void _do_simd_merge(uint8_t* __restrict registers, const uint8_t* __restrict other_registers) {
279
49.1k
        for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
280
49.1k
            registers[i] = (registers[i] < other_registers[i] ? other_registers[i] : registers[i]);
281
49.1k
        }
282
3
    }
283
284
    HllDataType _type = HLL_DATA_EMPTY;
285
    vectorized::flat_hash_set<uint64_t> _hash_set;
286
287
    // This field is much space consuming(HLL_REGISTERS_COUNT), we create
288
    // it only when it is really needed.
289
    uint8_t* _registers = nullptr;
290
};
291
#include "common/compile_check_end.h"
292
} // namespace doris