Coverage Report

Created: 2024-11-18 10:37

/root/doris/be/src/olap/hll.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <stdint.h>
21
#include <string.h>
22
23
#include <map>
24
#include <string>
25
#include <utility>
26
27
#ifdef __x86_64__
28
#include <immintrin.h>
29
#endif
30
31
#include "vec/common/hash_table/phmap_fwd_decl.h"
32
33
namespace doris {
34
35
struct Slice;
36
37
inline const int HLL_COLUMN_PRECISION = 14;
38
inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION);
39
inline const int HLL_EXPLICIT_INT64_NUM = 160;
40
inline const int HLL_SPARSE_THRESHOLD = 4096;
41
inline const int HLL_REGISTERS_COUNT = 16 * 1024;
42
// maximum size in byte of serialized HLL: type(1) + registers (2^14)
43
inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1;
44
45
// 1 for type; 1 for hash values count; 8 for hash value
46
inline const int HLL_SINGLE_VALUE_SIZE = 10;
47
inline const int HLL_EMPTY_SIZE = 1;
48
49
// Hyperloglog distinct estimate algorithm.
50
// See these papers for more details.
51
// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation
52
// algorithm (2007)
53
// 2) HyperLogLog in Practice (paper from google with some improvements)
54
55
// Each HLL value is a set of values. To save space, Doris store HLL value
56
// in different format according to its cardinality.
57
//
58
// HLL_DATA_EMPTY: when set is empty.
59
//
60
// HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit.
61
// If the number of hash values is not greater than 160, set is encoded in this format.
62
// The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen,
63
// maybe can be other number. If you are interested, you can try other number and see
64
// if it will be better.
65
//
66
// HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers
67
// is not greater than 4096, set is encoded in this format. The max space occupied is
68
// (1 + 4 + 3 * 4096) = 12293.
69
//
70
// HLL_DATA_FULL: most space-consuming, store all registers
71
//
72
// A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not
73
// allow reverse.
74
//
75
// NOTE: This values are persisted in storage devices, so don't change exist
76
// enum values.
77
enum HllDataType {
78
    HLL_DATA_EMPTY = 0,
79
    HLL_DATA_EXPLICIT = 1,
80
    HLL_DATA_SPARSE = 2,
81
    HLL_DATA_FULL = 3,
82
};
83
84
class HyperLogLog {
85
public:
86
5.16k
    HyperLogLog() = default;
87
0
    explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) {
88
0
        _hash_set.emplace(hash_value);
89
0
    }
90
    explicit HyperLogLog(const Slice& src);
91
92
12.3k
    HyperLogLog(const HyperLogLog& other) {
93
12.3k
        this->_type = other._type;
94
12.3k
        switch (other._type) {
95
2.04k
        case HLL_DATA_EMPTY:
96
2.04k
            break;
97
10.3k
        case HLL_DATA_EXPLICIT: {
98
10.3k
            this->_hash_set = other._hash_set;
99
10.3k
            break;
100
0
        }
101
0
        case HLL_DATA_SPARSE:
102
0
        case HLL_DATA_FULL: {
103
0
            _registers = new uint8_t[HLL_REGISTERS_COUNT];
104
0
            memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
105
0
            break;
106
0
        }
107
0
        default:
108
0
            break;
109
12.3k
        }
110
12.3k
    }
111
112
3.07k
    HyperLogLog(HyperLogLog&& other) {
113
3.07k
        this->_type = other._type;
114
3.07k
        switch (other._type) {
115
1.02k
        case HLL_DATA_EMPTY:
116
1.02k
            break;
117
2.04k
        case HLL_DATA_EXPLICIT: {
118
2.04k
            this->_hash_set = std::move(other._hash_set);
119
2.04k
            other._type = HLL_DATA_EMPTY;
120
2.04k
            break;
121
0
        }
122
0
        case HLL_DATA_SPARSE:
123
0
        case HLL_DATA_FULL: {
124
0
            this->_registers = other._registers;
125
0
            other._registers = nullptr;
126
0
            other._type = HLL_DATA_EMPTY;
127
0
            break;
128
0
        }
129
0
        default:
130
0
            break;
131
3.07k
        }
132
3.07k
    }
133
134
0
    HyperLogLog& operator=(HyperLogLog&& other) {
135
0
        if (this != &other) {
136
0
            if (_registers != nullptr) {
137
0
                delete[] _registers;
138
0
                _registers = nullptr;
139
0
            }
140
141
0
            this->_type = other._type;
142
0
            switch (other._type) {
143
0
            case HLL_DATA_EMPTY:
144
0
                break;
145
0
            case HLL_DATA_EXPLICIT: {
146
0
                this->_hash_set = std::move(other._hash_set);
147
0
                other._type = HLL_DATA_EMPTY;
148
0
                break;
149
0
            }
150
0
            case HLL_DATA_SPARSE:
151
0
            case HLL_DATA_FULL: {
152
0
                this->_registers = other._registers;
153
0
                other._registers = nullptr;
154
0
                other._type = HLL_DATA_EMPTY;
155
0
                break;
156
0
            }
157
0
            default:
158
0
                break;
159
0
            }
160
0
        }
161
0
        return *this;
162
0
    }
163
164
0
    HyperLogLog& operator=(const HyperLogLog& other) {
165
0
        if (this != &other) {
166
0
            if (_registers != nullptr) {
167
0
                delete[] _registers;
168
0
                _registers = nullptr;
169
0
            }
170
171
0
            this->_type = other._type;
172
0
            switch (other._type) {
173
0
            case HLL_DATA_EMPTY:
174
0
                break;
175
0
            case HLL_DATA_EXPLICIT: {
176
0
                this->_hash_set = other._hash_set;
177
0
                break;
178
0
            }
179
0
            case HLL_DATA_SPARSE:
180
0
            case HLL_DATA_FULL: {
181
0
                _registers = new uint8_t[HLL_REGISTERS_COUNT];
182
0
                memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
183
0
                break;
184
0
            }
185
0
            default:
186
0
                break;
187
0
            }
188
0
        }
189
0
        return *this;
190
0
    }
191
192
21.6k
    ~HyperLogLog() { clear(); }
193
21.6k
    void clear() {
194
21.6k
        _type = HLL_DATA_EMPTY;
195
21.6k
        _hash_set.clear();
196
21.6k
        delete[] _registers;
197
21.6k
        _registers = nullptr;
198
21.6k
    }
199
200
    typedef uint8_t SetTypeValueType;
201
    typedef int32_t SparseLengthValueType;
202
    typedef uint16_t SparseIndexType;
203
    typedef uint8_t SparseValueType;
204
205
    // Add a hash value to this HLL value
206
    // NOTE: input must be a hash_value
207
    void update(uint64_t hash_value);
208
209
    void merge(const HyperLogLog& other);
210
211
    // Return max size of serialized binary
212
    size_t max_serialized_size() const;
213
214
0
    size_t memory_consumed() const {
215
0
        size_t size = sizeof(*this);
216
0
        if (_type == HLL_DATA_EXPLICIT)
217
0
            size += _hash_set.size() * sizeof(uint64_t);
218
0
        else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL)
219
0
            size += HLL_REGISTERS_COUNT;
220
0
        return size;
221
0
    }
222
223
    // Input slice should has enough capacity for serialize, which
224
    // can be get through max_serialized_size(). If insufficient buffer
225
    // is given, this will cause process crash.
226
    // Return actual size of serialized binary.
227
    size_t serialize(uint8_t* dst) const;
228
229
    // Now, only empty HLL support this function.
230
    bool deserialize(const Slice& slice);
231
232
    int64_t estimate_cardinality() const;
233
234
0
    static std::string empty() {
235
0
        static HyperLogLog hll;
236
0
        std::string buf;
237
0
        buf.resize(HLL_EMPTY_SIZE);
238
0
        hll.serialize((uint8_t*)buf.c_str());
239
0
        return buf;
240
0
    }
241
242
    // Check if input slice is a valid serialized binary of HyperLogLog.
243
    // This function only check the encoded type in slice, whose complex
244
    // function is O(1).
245
    static bool is_valid(const Slice& slice);
246
247
    // only for debug
248
0
    std::string to_string() {
249
0
        switch (_type) {
250
0
        case HLL_DATA_EMPTY:
251
0
            return {};
252
0
        case HLL_DATA_EXPLICIT:
253
0
        case HLL_DATA_SPARSE:
254
0
        case HLL_DATA_FULL: {
255
0
            std::string str {"hash set size: "};
256
0
            str.append(std::to_string(_hash_set.size()));
257
0
            str.append("\ncardinality:\t");
258
0
            str.append(std::to_string(estimate_cardinality()));
259
0
            str.append("\ntype:\t");
260
0
            str.append(std::to_string(_type));
261
0
            return str;
262
0
        }
263
0
        default:
264
0
            return {};
265
0
        }
266
0
    }
267
268
private:
269
    HllDataType _type = HLL_DATA_EMPTY;
270
    vectorized::flat_hash_set<uint64_t> _hash_set;
271
272
    // This field is much space consuming(HLL_REGISTERS_COUNT), we create
273
    // it only when it is really needed.
274
    uint8_t* _registers = nullptr;
275
276
private:
277
    void _convert_explicit_to_register();
278
279
    // update one hash value into this registers
280
67.9k
    void _update_registers(uint64_t hash_value) {
281
        // Use the lower bits to index into the number of streams and then
282
        // find the first 1 bit after the index bits.
283
67.9k
        int idx = hash_value % HLL_REGISTERS_COUNT;
284
67.9k
        hash_value >>= HLL_COLUMN_PRECISION;
285
        // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1
286
67.9k
        hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS);
287
67.9k
        uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1;
288
67.9k
        _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]);
289
67.9k
    }
290
291
    // absorb other registers into this registers
292
3
    void _merge_registers(const uint8_t* other_registers) {
293
#ifdef __AVX2__
294
        int loop = HLL_REGISTERS_COUNT / 32; // 32 = 256/8
295
        uint8_t* dst = _registers;
296
        const uint8_t* src = other_registers;
297
        for (int i = 0; i < loop; i++) {
298
            __m256i xa = _mm256_loadu_si256((const __m256i*)dst);
299
            __m256i xb = _mm256_loadu_si256((const __m256i*)src);
300
            _mm256_storeu_si256((__m256i*)dst, _mm256_max_epu8(xa, xb));
301
            src += 32;
302
            dst += 32;
303
        }
304
#else
305
49.1k
        for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
306
49.1k
            _registers[i] =
307
49.1k
                    (_registers[i] < other_registers[i] ? other_registers[i] : _registers[i]);
308
49.1k
        }
309
3
#endif
310
3
    }
311
};
312
313
// todo(kks): remove this when dpp_sink class was removed
314
class HllSetResolver {
315
public:
316
    HllSetResolver()
317
            : _buf_ref(nullptr),
318
              _buf_len(0),
319
              _set_type(HLL_DATA_EMPTY),
320
              _full_value_position(nullptr),
321
              _explicit_value(nullptr),
322
0
              _explicit_num(0) {}
323
324
0
    ~HllSetResolver() {}
325
326
    typedef uint8_t SetTypeValueType;
327
    typedef uint8_t ExplicitLengthValueType;
328
    typedef int32_t SparseLengthValueType;
329
    typedef uint16_t SparseIndexType;
330
    typedef uint8_t SparseValueType;
331
332
    // only save pointer
333
0
    void init(char* buf, int len) {
334
0
        this->_buf_ref = buf;
335
0
        this->_buf_len = len;
336
0
    }
337
338
    // hll set type
339
0
    HllDataType get_hll_data_type() { return _set_type; }
340
341
    // explicit value num
342
0
    int get_explicit_count() { return (int)_explicit_num; }
343
344
    // get explicit index value 64bit
345
0
    uint64_t get_explicit_value(int index) {
346
0
        if (index >= _explicit_num) {
347
0
            return -1;
348
0
        }
349
0
        return _explicit_value[index];
350
0
    }
351
352
    // get full register value
353
0
    char* get_full_value() { return _full_value_position; }
354
355
    // get (index, value) map
356
0
    std::map<SparseIndexType, SparseValueType>& get_sparse_map() { return _sparse_map; }
357
358
    // parse set , call after copy() or init()
359
    void parse();
360
361
private:
362
    char* _buf_ref;        // set
363
    int _buf_len;          // set len
364
    HllDataType _set_type; //set type
365
    char* _full_value_position;
366
    uint64_t* _explicit_value;
367
    ExplicitLengthValueType _explicit_num;
368
    std::map<SparseIndexType, SparseValueType> _sparse_map;
369
    SparseLengthValueType* _sparse_count;
370
};
371
372
} // namespace doris