Coverage Report

Created: 2026-04-14 13:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/core/value/hll.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <cstdint>
21
#include <cstring>
22
#include <map>
23
#include <string>
24
#include <utility>
25
26
#include "exec/common/hash_table/phmap_fwd_decl.h"
27
28
namespace doris {
29
struct Slice;
30
31
inline const int HLL_COLUMN_PRECISION = 14;
32
inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
33
inline const int HLL_EXPLICIT_INT64_NUM = 160;
34
inline const int HLL_SPARSE_THRESHOLD = 4096;
35
inline const uint16_t HLL_REGISTERS_COUNT = 16 * 1024;
36
// maximum size in byte of serialized HLL: type(1) + registers (2^14)
37
inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1;
38
39
// 1 for type; 1 for hash values count; 8 for hash value
40
inline const int HLL_SINGLE_VALUE_SIZE = 10;
41
inline const int HLL_EMPTY_SIZE = 1;
42
43
// Hyperloglog distinct estimate algorithm.
44
// See these papers for more details.
45
// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation
46
// algorithm (2007)
47
// 2) HyperLogLog in Practice (paper from google with some improvements)
48
49
// Each HLL value is a set of values. To save space, Doris store HLL value
50
// in different format according to its cardinality.
51
//
52
// HLL_DATA_EMPTY: when set is empty.
53
//
54
// HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit.
55
// If the number of hash values is not greater than 160, set is encoded in this format.
56
// The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen,
57
// maybe can be other number. If you are interested, you can try other number and see
58
// if it will be better.
59
//
60
// HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers
61
// is not greater than 4096, set is encoded in this format. The max space occupied is
62
// (1 + 4 + 3 * 4096) = 12293.
63
//
64
// HLL_DATA_FULL: most space-consuming, store all registers
65
//
66
// A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not
67
// allow reverse.
68
//
69
// NOTE: This values are persisted in storage devices, so don't change exist
70
// enum values.
71
enum HllDataType {
72
    HLL_DATA_EMPTY = 0,
73
    HLL_DATA_EXPLICIT = 1,
74
    HLL_DATA_SPARSE = 2,
75
    HLL_DATA_FULL = 3,
76
};
77
78
class HyperLogLog {
79
public:
80
6.81k
    HyperLogLog() = default;
81
10
    explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) {
82
10
        _hash_set.emplace(hash_value);
83
10
    }
84
    explicit HyperLogLog(const Slice& src);
85
86
6.67k
    HyperLogLog(const HyperLogLog& other) {
87
6.67k
        this->_type = other._type;
88
6.67k
        switch (other._type) {
89
139
        case HLL_DATA_EMPTY:
90
139
            break;
91
6.53k
        case HLL_DATA_EXPLICIT: {
92
6.53k
            this->_hash_set = other._hash_set;
93
6.53k
            break;
94
0
        }
95
0
        case HLL_DATA_SPARSE:
96
0
        case HLL_DATA_FULL: {
97
0
            _registers = new uint8_t[HLL_REGISTERS_COUNT];
98
0
            memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
99
0
            break;
100
0
        }
101
0
        default:
102
0
            break;
103
6.67k
        }
104
6.67k
    }
105
106
9.88k
    HyperLogLog(HyperLogLog&& other) noexcept {
107
9.88k
        this->_type = other._type;
108
9.88k
        switch (other._type) {
109
56
        case HLL_DATA_EMPTY:
110
56
            break;
111
9.82k
        case HLL_DATA_EXPLICIT: {
112
9.82k
            this->_hash_set = std::move(other._hash_set);
113
9.82k
            other._type = HLL_DATA_EMPTY;
114
9.82k
            break;
115
0
        }
116
0
        case HLL_DATA_SPARSE:
117
0
        case HLL_DATA_FULL: {
118
0
            this->_registers = other._registers;
119
0
            other._registers = nullptr;
120
0
            other._type = HLL_DATA_EMPTY;
121
0
            break;
122
0
        }
123
0
        default:
124
0
            break;
125
9.88k
        }
126
9.88k
    }
127
128
1
    HyperLogLog& operator=(HyperLogLog&& other) noexcept {
129
1
        if (this != &other) {
130
1
            if (_registers != nullptr) {
131
0
                delete[] _registers;
132
0
                _registers = nullptr;
133
0
            }
134
135
1
            this->_type = other._type;
136
1
            switch (other._type) {
137
0
            case HLL_DATA_EMPTY:
138
0
                break;
139
1
            case HLL_DATA_EXPLICIT: {
140
1
                this->_hash_set = std::move(other._hash_set);
141
1
                other._type = HLL_DATA_EMPTY;
142
1
                break;
143
0
            }
144
0
            case HLL_DATA_SPARSE:
145
0
            case HLL_DATA_FULL: {
146
0
                this->_registers = other._registers;
147
0
                other._registers = nullptr;
148
0
                other._type = HLL_DATA_EMPTY;
149
0
                break;
150
0
            }
151
0
            default:
152
0
                break;
153
1
            }
154
1
        }
155
1
        return *this;
156
1
    }
157
158
24
    HyperLogLog& operator=(const HyperLogLog& other) {
159
24
        if (this != &other) {
160
19
            if (_registers != nullptr) {
161
0
                delete[] _registers;
162
0
                _registers = nullptr;
163
0
            }
164
165
19
            this->_type = other._type;
166
19
            switch (other._type) {
167
8
            case HLL_DATA_EMPTY:
168
8
                break;
169
11
            case HLL_DATA_EXPLICIT: {
170
11
                this->_hash_set = other._hash_set;
171
11
                break;
172
0
            }
173
0
            case HLL_DATA_SPARSE:
174
0
            case HLL_DATA_FULL: {
175
0
                _registers = new uint8_t[HLL_REGISTERS_COUNT];
176
0
                memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
177
0
                break;
178
0
            }
179
0
            default:
180
0
                break;
181
19
            }
182
19
        }
183
24
        return *this;
184
24
    }
185
186
24.4k
    ~HyperLogLog() { clear(); }
187
24.4k
    void clear() {
188
24.4k
        _type = HLL_DATA_EMPTY;
189
24.4k
        _hash_set.clear();
190
24.4k
        delete[] _registers;
191
24.4k
        _registers = nullptr;
192
24.4k
    }
193
194
    using SetTypeValueType = uint8_t;
195
    using SparseLengthValueType = int32_t;
196
    using SparseIndexType = uint16_t;
197
    using SparseValueType = uint8_t;
198
199
    // Add a hash value to this HLL value
200
    // NOTE: input must be a hash_value
201
    void update(uint64_t hash_value);
202
203
    void merge(const HyperLogLog& other);
204
205
    // Return max size of serialized binary
206
    size_t max_serialized_size() const;
207
208
0
    size_t memory_consumed() const {
209
0
        size_t size = sizeof(*this);
210
0
        if (_type == HLL_DATA_EXPLICIT) {
211
0
            size += _hash_set.size() * sizeof(uint64_t);
212
0
        } else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) {
213
0
            size += HLL_REGISTERS_COUNT;
214
0
        }
215
0
        return size;
216
0
    }
217
218
    // Input slice should has enough capacity for serialize, which
219
    // can be get through max_serialized_size(). If insufficient buffer
220
    // is given, this will cause process crash.
221
    // Return actual size of serialized binary.
222
    size_t serialize(uint8_t* dst) const;
223
224
    // Now, only empty HLL support this function.
225
    bool deserialize(const Slice& slice);
226
227
    int64_t estimate_cardinality() const;
228
229
20
    static HyperLogLog empty() { return HyperLogLog {}; }
230
231
    // Check if input slice is a valid serialized binary of HyperLogLog.
232
    // This function only check the encoded type in slice, whose complex
233
    // function is O(1).
234
    static bool is_valid(const Slice& slice);
235
236
    // only for debug
237
485
    std::string to_string() const {
238
485
        switch (_type) {
239
10
        case HLL_DATA_EMPTY:
240
10
            return {};
241
475
        case HLL_DATA_EXPLICIT:
242
475
        case HLL_DATA_SPARSE:
243
475
        case HLL_DATA_FULL: {
244
475
            std::string str {"hash set size: "};
245
475
            str.append(std::to_string(_hash_set.size()));
246
475
            str.append("\ncardinality:\t");
247
475
            str.append(std::to_string(estimate_cardinality()));
248
475
            str.append("\ntype:\t");
249
475
            str.append(std::to_string(_type));
250
475
            return str;
251
475
        }
252
0
        default:
253
0
            return {};
254
485
        }
255
485
    }
256
257
private:
258
    void _convert_explicit_to_register();
259
260
    // update one hash value into this registers
261
67.9k
    void _update_registers(uint64_t hash_value) {
262
        // Use the lower bits to index into the number of streams and then
263
        // find the first 1 bit after the index bits.
264
67.9k
        int idx = hash_value % HLL_REGISTERS_COUNT;
265
67.9k
        hash_value >>= HLL_COLUMN_PRECISION;
266
        // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
267
67.9k
        hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
268
67.9k
        auto first_one_bit = uint8_t(__builtin_ctzl(hash_value) + 1);
269
67.9k
        _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]);
270
67.9k
    }
271
272
    // absorb other registers into this registers
273
3
    void _merge_registers(const uint8_t* other_registers) {
274
3
        _do_simd_merge(_registers, other_registers);
275
3
    }
276
277
3
    void _do_simd_merge(uint8_t* __restrict registers, const uint8_t* __restrict other_registers) {
278
49.1k
        for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
279
49.1k
            registers[i] = (registers[i] < other_registers[i] ? other_registers[i] : registers[i]);
280
49.1k
        }
281
3
    }
282
283
    HllDataType _type = HLL_DATA_EMPTY;
284
    flat_hash_set<uint64_t> _hash_set;
285
286
    // This field is much space consuming(HLL_REGISTERS_COUNT), we create
287
    // it only when it is really needed.
288
    uint8_t* _registers = nullptr;
289
};
290
} // namespace doris