Coverage Report

Created: 2024-11-22 00:22

/root/doris/be/src/olap/hll.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <cstdint>
21
#include <cstring>
22
#include <map>
23
#include <string>
24
#include <utility>
25
26
#ifdef __x86_64__
27
#include <immintrin.h>
28
#endif
29
30
#include "vec/common/hash_table/phmap_fwd_decl.h"
31
32
namespace doris {
33
34
struct Slice;
35
36
inline const int HLL_COLUMN_PRECISION = 14;
37
inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
38
inline const int HLL_EXPLICIT_INT64_NUM = 160;
39
inline const int HLL_SPARSE_THRESHOLD = 4096;
40
inline const int HLL_REGISTERS_COUNT = 16 * 1024;
41
// maximum size in byte of serialized HLL: type(1) + registers (2^14)
42
inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1;
43
44
// 1 for type; 1 for hash values count; 8 for hash value
45
inline const int HLL_SINGLE_VALUE_SIZE = 10;
46
inline const int HLL_EMPTY_SIZE = 1;
47
48
// Hyperloglog distinct estimate algorithm.
49
// See these papers for more details.
50
// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation
51
// algorithm (2007)
52
// 2) HyperLogLog in Practice (paper from google with some improvements)
53
54
// Each HLL value is a set of values. To save space, Doris store HLL value
55
// in different format according to its cardinality.
56
//
57
// HLL_DATA_EMPTY: when set is empty.
58
//
59
// HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit.
60
// If the number of hash values is not greater than 160, set is encoded in this format.
61
// The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen,
62
// maybe can be other number. If you are interested, you can try other number and see
63
// if it will be better.
64
//
65
// HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers
66
// is not greater than 4096, set is encoded in this format. The max space occupied is
67
// (1 + 4 + 3 * 4096) = 12293.
68
//
69
// HLL_DATA_FULL: most space-consuming, store all registers
70
//
71
// A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not
72
// allow reverse.
73
//
74
// NOTE: This values are persisted in storage devices, so don't change exist
75
// enum values.
76
enum HllDataType {
77
    HLL_DATA_EMPTY = 0,
78
    HLL_DATA_EXPLICIT = 1,
79
    HLL_DATA_SPARSE = 2,
80
    HLL_DATA_FULL = 3,
81
};
82
83
class HyperLogLog {
84
public:
85
5.16k
    HyperLogLog() = default;
86
0
    explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) {
87
0
        _hash_set.emplace(hash_value);
88
0
    }
89
    explicit HyperLogLog(const Slice& src);
90
91
6.19k
    HyperLogLog(const HyperLogLog& other) {
92
6.19k
        this->_type = other._type;
93
6.19k
        switch (other._type) {
94
1.02k
        case HLL_DATA_EMPTY:
95
1.02k
            break;
96
5.16k
        case HLL_DATA_EXPLICIT: {
97
5.16k
            this->_hash_set = other._hash_set;
98
5.16k
            break;
99
0
        }
100
0
        case HLL_DATA_SPARSE:
101
0
        case HLL_DATA_FULL: {
102
0
            _registers = new uint8_t[HLL_REGISTERS_COUNT];
103
0
            memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
104
0
            break;
105
0
        }
106
0
        default:
107
0
            break;
108
6.19k
        }
109
6.19k
    }
110
111
9.27k
    HyperLogLog(HyperLogLog&& other) noexcept {
112
9.27k
        this->_type = other._type;
113
9.27k
        switch (other._type) {
114
2.04k
        case HLL_DATA_EMPTY:
115
2.04k
            break;
116
7.22k
        case HLL_DATA_EXPLICIT: {
117
7.22k
            this->_hash_set = std::move(other._hash_set);
118
7.22k
            other._type = HLL_DATA_EMPTY;
119
7.22k
            break;
120
0
        }
121
0
        case HLL_DATA_SPARSE:
122
0
        case HLL_DATA_FULL: {
123
0
            this->_registers = other._registers;
124
0
            other._registers = nullptr;
125
0
            other._type = HLL_DATA_EMPTY;
126
0
            break;
127
0
        }
128
0
        default:
129
0
            break;
130
9.27k
        }
131
9.27k
    }
132
133
0
    HyperLogLog& operator=(HyperLogLog&& other) noexcept {
134
0
        if (this != &other) {
135
0
            if (_registers != nullptr) {
136
0
                delete[] _registers;
137
0
                _registers = nullptr;
138
0
            }
139
140
0
            this->_type = other._type;
141
0
            switch (other._type) {
142
0
            case HLL_DATA_EMPTY:
143
0
                break;
144
0
            case HLL_DATA_EXPLICIT: {
145
0
                this->_hash_set = std::move(other._hash_set);
146
0
                other._type = HLL_DATA_EMPTY;
147
0
                break;
148
0
            }
149
0
            case HLL_DATA_SPARSE:
150
0
            case HLL_DATA_FULL: {
151
0
                this->_registers = other._registers;
152
0
                other._registers = nullptr;
153
0
                other._type = HLL_DATA_EMPTY;
154
0
                break;
155
0
            }
156
0
            default:
157
0
                break;
158
0
            }
159
0
        }
160
0
        return *this;
161
0
    }
162
163
0
    HyperLogLog& operator=(const HyperLogLog& other) {
164
0
        if (this != &other) {
165
0
            if (_registers != nullptr) {
166
0
                delete[] _registers;
167
0
                _registers = nullptr;
168
0
            }
169
170
0
            this->_type = other._type;
171
0
            switch (other._type) {
172
0
            case HLL_DATA_EMPTY:
173
0
                break;
174
0
            case HLL_DATA_EXPLICIT: {
175
0
                this->_hash_set = other._hash_set;
176
0
                break;
177
0
            }
178
0
            case HLL_DATA_SPARSE:
179
0
            case HLL_DATA_FULL: {
180
0
                _registers = new uint8_t[HLL_REGISTERS_COUNT];
181
0
                memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
182
0
                break;
183
0
            }
184
0
            default:
185
0
                break;
186
0
            }
187
0
        }
188
0
        return *this;
189
0
    }
190
191
21.6k
    ~HyperLogLog() { clear(); }
192
21.6k
    void clear() {
193
21.6k
        _type = HLL_DATA_EMPTY;
194
21.6k
        _hash_set.clear();
195
21.6k
        delete[] _registers;
196
21.6k
        _registers = nullptr;
197
21.6k
    }
198
199
    using SetTypeValueType = uint8_t;
200
    using SparseLengthValueType = int32_t;
201
    using SparseIndexType = uint16_t;
202
    using SparseValueType = uint8_t;
203
204
    // Add a hash value to this HLL value
205
    // NOTE: input must be a hash_value
206
    void update(uint64_t hash_value);
207
208
    void merge(const HyperLogLog& other);
209
210
    // Return max size of serialized binary
211
    size_t max_serialized_size() const;
212
213
0
    size_t memory_consumed() const {
214
0
        size_t size = sizeof(*this);
215
0
        if (_type == HLL_DATA_EXPLICIT) {
216
0
            size += _hash_set.size() * sizeof(uint64_t);
217
0
        } else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) {
218
0
            size += HLL_REGISTERS_COUNT;
219
0
        }
220
0
        return size;
221
0
    }
222
223
    // Input slice should has enough capacity for serialize, which
224
    // can be get through max_serialized_size(). If insufficient buffer
225
    // is given, this will cause process crash.
226
    // Return actual size of serialized binary.
227
    size_t serialize(uint8_t* dst) const;
228
229
    // Now, only empty HLL support this function.
230
    bool deserialize(const Slice& slice);
231
232
    int64_t estimate_cardinality() const;
233
234
0
    static HyperLogLog empty() { return HyperLogLog {}; }
235
236
    // Check if input slice is a valid serialized binary of HyperLogLog.
237
    // This function only check the encoded type in slice, whose complex
238
    // function is O(1).
239
    static bool is_valid(const Slice& slice);
240
241
    // only for debug
242
0
    std::string to_string() {
243
0
        switch (_type) {
244
0
        case HLL_DATA_EMPTY:
245
0
            return {};
246
0
        case HLL_DATA_EXPLICIT:
247
0
        case HLL_DATA_SPARSE:
248
0
        case HLL_DATA_FULL: {
249
0
            std::string str {"hash set size: "};
250
0
            str.append(std::to_string(_hash_set.size()));
251
0
            str.append("\ncardinality:\t");
252
0
            str.append(std::to_string(estimate_cardinality()));
253
0
            str.append("\ntype:\t");
254
0
            str.append(std::to_string(_type));
255
0
            return str;
256
0
        }
257
0
        default:
258
0
            return {};
259
0
        }
260
0
    }
261
262
private:
263
    void _convert_explicit_to_register();
264
265
    // update one hash value into this registers
266
67.9k
    void _update_registers(uint64_t hash_value) {
267
        // Use the lower bits to index into the number of streams and then
268
        // find the first 1 bit after the index bits.
269
67.9k
        int idx = hash_value % HLL_REGISTERS_COUNT;
270
67.9k
        hash_value >>= HLL_COLUMN_PRECISION;
271
        // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
272
67.9k
        hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
273
67.9k
        uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
274
67.9k
        _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]);
275
67.9k
    }
276
277
    // absorb other registers into this registers
278
3
    void _merge_registers(const uint8_t* other_registers) {
279
#ifdef __AVX2__
280
        int loop = HLL_REGISTERS_COUNT / 32; // 32 = 256/8
281
        uint8_t* dst = _registers;
282
        const uint8_t* src = other_registers;
283
        for (int i = 0; i < loop; i++) {
284
            __m256i xa = _mm256_loadu_si256((const __m256i*)dst);
285
            __m256i xb = _mm256_loadu_si256((const __m256i*)src);
286
            _mm256_storeu_si256((__m256i*)dst, _mm256_max_epu8(xa, xb));
287
            src += 32;
288
            dst += 32;
289
        }
290
#else
291
49.1k
        for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
292
49.1k
            _registers[i] =
293
49.1k
                    (_registers[i] < other_registers[i] ? other_registers[i] : _registers[i]);
294
49.1k
        }
295
3
#endif
296
3
    }
297
298
    HllDataType _type = HLL_DATA_EMPTY;
299
    vectorized::flat_hash_set<uint64_t> _hash_set;
300
301
    // This field is much space consuming(HLL_REGISTERS_COUNT), we create
302
    // it only when it is really needed.
303
    uint8_t* _registers = nullptr;
304
};
305
306
// todo(kks): remove this when dpp_sink class was removed
307
class HllSetResolver {
308
public:
309
    HllSetResolver() = default;
310
311
    ~HllSetResolver() = default;
312
313
    using SetTypeValueType = uint8_t;
314
    using ExplicitLengthValueType = uint8_t;
315
    using SparseLengthValueType = int32_t;
316
    using SparseIndexType = uint16_t;
317
    using SparseValueType = uint8_t;
318
319
    // only save pointer
320
0
    void init(char* buf, int len) {
321
0
        this->_buf_ref = buf;
322
0
        this->_buf_len = len;
323
0
    }
324
325
    // hll set type
326
0
    HllDataType get_hll_data_type() { return _set_type; }
327
328
    // explicit value num
329
0
    int get_explicit_count() const { return (int)_explicit_num; }
330
331
    // get explicit index value 64bit
332
0
    uint64_t get_explicit_value(int index) {
333
0
        if (index >= _explicit_num) {
334
0
            return -1;
335
0
        }
336
0
        return _explicit_value[index];
337
0
    }
338
339
    // get full register value
340
0
    char* get_full_value() { return _full_value_position; }
341
342
    // get (index, value) map
343
0
    std::map<SparseIndexType, SparseValueType>& get_sparse_map() { return _sparse_map; }
344
345
    // parse set , call after copy() or init()
346
    void parse();
347
348
private:
349
    char* _buf_ref = nullptr; // set
350
    int _buf_len {};          // set len
351
    HllDataType _set_type {}; //set type
352
    char* _full_value_position = nullptr;
353
    uint64_t* _explicit_value = nullptr;
354
    ExplicitLengthValueType _explicit_num {};
355
    std::map<SparseIndexType, SparseValueType> _sparse_map;
356
    SparseLengthValueType* _sparse_count;
357
};
358
359
} // namespace doris