Coverage Report

Created: 2025-07-26 12:05

/root/doris/be/src/olap/hll.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/hll.h"
19
20
#include <cmath>
21
#include <map>
22
#include <ostream>
23
24
#include "common/logging.h"
25
#include "util/coding.h"
26
#include "util/slice.h"
27
28
using std::string;
29
using std::stringstream;
30
31
namespace doris {
32
33
1.03k
HyperLogLog::HyperLogLog(const Slice& src) {
34
    // When deserialize return false, we make this object a empty
35
1.03k
    if (!deserialize(src)) {
36
1.02k
        _type = HLL_DATA_EMPTY;
37
1.02k
    }
38
1.03k
}
39
40
// Convert explicit values to register format, and clear explicit values.
41
// NOTE: this function won't modify _type.
42
443
void HyperLogLog::_convert_explicit_to_register() {
43
443
    DCHECK(_type == HLL_DATA_EXPLICIT)
44
0
            << "_type(" << _type << ") should be explicit(" << HLL_DATA_EXPLICIT << ")";
45
443
    _registers = new uint8_t[HLL_REGISTERS_COUNT];
46
443
    memset(_registers, 0, HLL_REGISTERS_COUNT);
47
70.8k
    for (auto value : _hash_set) {
48
70.8k
        _update_registers(value);
49
70.8k
    }
50
    // clear _hash_set
51
443
    vectorized::flat_hash_set<uint64_t>().swap(_hash_set);
52
443
}
53
54
// Change HLL_DATA_EXPLICIT to HLL_DATA_FULL directly, because HLL_DATA_SPARSE
55
// is implemented in the same way in memory with HLL_DATA_FULL.
56
54.2M
void HyperLogLog::update(uint64_t hash_value) {
57
54.2M
    switch (_type) {
58
48.5k
    case HLL_DATA_EMPTY:
59
48.5k
        _hash_set.insert(hash_value);
60
48.5k
        _type = HLL_DATA_EXPLICIT;
61
48.5k
        break;
62
22.4M
    case HLL_DATA_EXPLICIT:
63
22.4M
        if (_hash_set.size() < HLL_EXPLICIT_INT64_NUM) {
64
22.4M
            _hash_set.insert(hash_value);
65
22.4M
            break;
66
22.4M
        }
67
461
        _convert_explicit_to_register();
68
461
        _type = HLL_DATA_FULL;
69
        // fall through
70
462
    case HLL_DATA_SPARSE:
71
31.7M
    case HLL_DATA_FULL:
72
31.7M
        _update_registers(hash_value);
73
31.7M
        break;
74
54.2M
    }
75
54.2M
}
76
77
46.4k
void HyperLogLog::merge(const HyperLogLog& other) {
78
    // fast path
79
46.4k
    if (other._type == HLL_DATA_EMPTY) {
80
516
        return;
81
516
    }
82
45.9k
    switch (_type) {
83
31.8k
    case HLL_DATA_EMPTY: {
84
        // _type must change
85
31.8k
        _type = other._type;
86
31.8k
        switch (other._type) {
87
31.3k
        case HLL_DATA_EXPLICIT:
88
31.3k
            _hash_set = other._hash_set;
89
31.3k
            break;
90
304
        case HLL_DATA_SPARSE:
91
439
        case HLL_DATA_FULL:
92
439
            _registers = new uint8_t[HLL_REGISTERS_COUNT];
93
439
            memcpy(_registers, other._registers, HLL_REGISTERS_COUNT);
94
439
            break;
95
0
        default:
96
0
            break;
97
31.8k
        }
98
31.8k
        break;
99
31.8k
    }
100
31.8k
    case HLL_DATA_EXPLICIT: {
101
3.56k
        switch (other._type) {
102
3.56k
        case HLL_DATA_EXPLICIT: {
103
            // Merge other's explicit values first, then check if the number is exceed
104
            // HLL_EXPLICIT_INT64_NUM. This is OK because the max value is 2 * 160.
105
3.56k
            _hash_set.insert(other._hash_set.begin(), other._hash_set.end());
106
3.56k
            if (_hash_set.size() > HLL_EXPLICIT_INT64_NUM) {
107
15
                _convert_explicit_to_register();
108
15
                _type = HLL_DATA_FULL;
109
15
            }
110
3.56k
        } break;
111
0
        case HLL_DATA_SPARSE:
112
1
        case HLL_DATA_FULL:
113
1
            _convert_explicit_to_register();
114
1
            _merge_registers(other._registers);
115
1
            _type = HLL_DATA_FULL;
116
1
            break;
117
0
        default:
118
0
            break;
119
3.56k
        }
120
3.56k
        break;
121
3.56k
    }
122
3.56k
    case HLL_DATA_SPARSE:
123
10.5k
    case HLL_DATA_FULL: {
124
10.5k
        switch (other._type) {
125
10.5k
        case HLL_DATA_EXPLICIT:
126
10.6k
            for (auto hash_value : other._hash_set) {
127
10.6k
                _update_registers(hash_value);
128
10.6k
            }
129
10.5k
            break;
130
0
        case HLL_DATA_SPARSE:
131
2
        case HLL_DATA_FULL:
132
2
            _merge_registers(other._registers);
133
2
            break;
134
0
        default:
135
0
            break;
136
10.5k
        }
137
10.5k
        break;
138
10.5k
    }
139
45.9k
    }
140
45.9k
}
141
142
56.6k
size_t HyperLogLog::max_serialized_size() const {
143
56.6k
    switch (_type) {
144
1.25k
    case HLL_DATA_EMPTY:
145
1.25k
    default:
146
1.25k
        return 1;
147
54.9k
    case HLL_DATA_EXPLICIT:
148
54.9k
        return 2 + _hash_set.size() * 8;
149
4
    case HLL_DATA_SPARSE:
150
442
    case HLL_DATA_FULL:
151
442
        return 1 + HLL_REGISTERS_COUNT;
152
56.6k
    }
153
56.6k
}
154
155
56.3k
size_t HyperLogLog::serialize(uint8_t* dst) const {
156
56.3k
    uint8_t* ptr = dst;
157
56.3k
    switch (_type) {
158
1.24k
    case HLL_DATA_EMPTY:
159
1.24k
    default: {
160
        // When the _type is unknown, which may not happen, we encode it as
161
        // Empty HyperLogLog object.
162
1.24k
        *ptr++ = HLL_DATA_EMPTY;
163
1.24k
        break;
164
1.24k
    }
165
54.7k
    case HLL_DATA_EXPLICIT: {
166
18.4E
        DCHECK(_hash_set.size() <= HLL_EXPLICIT_INT64_NUM)
167
18.4E
                << "Number of explicit elements(" << _hash_set.size()
168
18.4E
                << ") should be less or equal than " << HLL_EXPLICIT_INT64_NUM;
169
54.7k
        *ptr++ = _type;
170
54.7k
        *ptr++ = (uint8_t)_hash_set.size();
171
120k
        for (auto hash_value : _hash_set) {
172
120k
            encode_fixed64_le(ptr, hash_value);
173
120k
            ptr += 8;
174
120k
        }
175
54.7k
        break;
176
1.24k
    }
177
2
    case HLL_DATA_SPARSE:
178
442
    case HLL_DATA_FULL: {
179
442
        uint32_t num_non_zero_registers = 0;
180
7.24M
        for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
181
7.24M
            num_non_zero_registers += (_registers[i] != 0);
182
7.24M
        }
183
184
        // each register in sparse format will occupy 3bytes, 2 for index and
185
        // 1 for register value. So if num_non_zero_registers is greater than
186
        // 4K we use full encode format.
187
442
        if (num_non_zero_registers > HLL_SPARSE_THRESHOLD) {
188
135
            *ptr++ = HLL_DATA_FULL;
189
135
            memcpy(ptr, _registers, HLL_REGISTERS_COUNT);
190
135
            ptr += HLL_REGISTERS_COUNT;
191
307
        } else {
192
307
            *ptr++ = HLL_DATA_SPARSE;
193
            // 2-5(4 byte): number of registers
194
307
            encode_fixed32_le(ptr, num_non_zero_registers);
195
307
            ptr += 4;
196
197
5.03M
            for (uint32_t i = 0; i < HLL_REGISTERS_COUNT; ++i) {
198
5.02M
                if (_registers[i] == 0) {
199
4.58M
                    continue;
200
4.58M
                }
201
                // 2 bytes: register index
202
                // 1 byte: register value
203
444k
                encode_fixed16_le(ptr, i);
204
444k
                ptr += 2;
205
444k
                *ptr++ = _registers[i];
206
444k
            }
207
307
        }
208
442
        break;
209
2
    }
210
56.3k
    }
211
56.3k
    return ptr - dst;
212
56.3k
}
213
214
54.2k
bool HyperLogLog::is_valid(const Slice& slice) {
215
54.2k
    if (slice.size < 1) {
216
1
        return false;
217
1
    }
218
54.2k
    const uint8_t* ptr = (uint8_t*)slice.data;
219
54.2k
    const uint8_t* end = (uint8_t*)slice.data + slice.size;
220
54.2k
    auto type = (HllDataType)*ptr++;
221
54.2k
    switch (type) {
222
1.31k
    case HLL_DATA_EMPTY:
223
1.31k
        break;
224
52.4k
    case HLL_DATA_EXPLICIT: {
225
52.4k
        if ((ptr + 1) > end) {
226
1
            return false;
227
1
        }
228
52.4k
        uint8_t num_explicits = *ptr++;
229
52.4k
        ptr += num_explicits * 8;
230
52.4k
        break;
231
52.4k
    }
232
311
    case HLL_DATA_SPARSE: {
233
311
        if ((ptr + 4) > end) {
234
1
            return false;
235
1
        }
236
310
        uint32_t num_registers = decode_fixed32_le(ptr);
237
310
        ptr += 4 + 3 * num_registers;
238
310
        break;
239
311
    }
240
137
    case HLL_DATA_FULL: {
241
137
        ptr += HLL_REGISTERS_COUNT;
242
137
        break;
243
311
    }
244
2
    default:
245
2
        return false;
246
54.2k
    }
247
54.2k
    return ptr == end;
248
54.2k
}
249
250
// TODO(zc): check input string's length
251
54.2k
bool HyperLogLog::deserialize(const Slice& slice) {
252
    // can be called only when type is empty
253
54.2k
    DCHECK(_type == HLL_DATA_EMPTY);
254
255
    // NOTE(zc): Don't remove this check unless you known what
256
    // you are doing. Because of history bug, we ingest some
257
    // invalid HLL data in storage, which ptr is nullptr.
258
    // we must handle this case to avoid process crash.
259
    // This bug is in release 0.10, I think we can remove this
260
    // in release 0.12 or later.
261
54.2k
    if (slice.data == nullptr || slice.size <= 0) {
262
1
        return false;
263
1
    }
264
    // check if input length is valid
265
54.2k
    if (!is_valid(slice)) {
266
1.02k
        return false;
267
1.02k
    }
268
269
53.2k
    const uint8_t* ptr = (uint8_t*)slice.data;
270
    // first byte : type
271
53.2k
    _type = (HllDataType)*ptr++;
272
53.2k
    switch (_type) {
273
1.31k
    case HLL_DATA_EMPTY:
274
1.31k
        break;
275
51.4k
    case HLL_DATA_EXPLICIT: {
276
        // 2: number of explicit values
277
        // make sure that num_explicit is positive
278
51.4k
        uint8_t num_explicits = *ptr++;
279
        // 3+: 8 bytes hash value
280
168k
        for (int i = 0; i < num_explicits; ++i) {
281
117k
            _hash_set.insert(decode_fixed64_le(ptr));
282
117k
            ptr += 8;
283
117k
        }
284
51.4k
        break;
285
0
    }
286
309
    case HLL_DATA_SPARSE: {
287
309
        _registers = new uint8_t[HLL_REGISTERS_COUNT];
288
309
        memset(_registers, 0, HLL_REGISTERS_COUNT);
289
        // 2-5(4 byte): number of registers
290
309
        uint32_t num_registers = decode_fixed32_le(ptr);
291
309
        ptr += 4;
292
446k
        for (uint32_t i = 0; i < num_registers; ++i) {
293
            // 2 bytes: register index
294
            // 1 byte: register value
295
446k
            uint16_t register_idx = decode_fixed16_le(ptr);
296
446k
            ptr += 2;
297
446k
            _registers[register_idx] = *ptr++;
298
446k
        }
299
309
        break;
300
0
    }
301
135
    case HLL_DATA_FULL: {
302
135
        _registers = new uint8_t[HLL_REGISTERS_COUNT];
303
        // 2+ : hll register value
304
135
        memcpy(_registers, ptr, HLL_REGISTERS_COUNT);
305
135
        break;
306
0
    }
307
0
    default:
308
        // revert type to EMPTY
309
0
        _type = HLL_DATA_EMPTY;
310
0
        return false;
311
53.2k
    }
312
53.2k
    return true;
313
53.2k
}
314
315
31.5k
int64_t HyperLogLog::estimate_cardinality() const {
316
31.5k
    if (_type == HLL_DATA_EMPTY) {
317
1.07k
        return 0;
318
1.07k
    }
319
30.4k
    if (_type == HLL_DATA_EXPLICIT) {
320
29.9k
        return _hash_set.size();
321
29.9k
    }
322
323
453
    const int num_streams = HLL_REGISTERS_COUNT;
324
    // Empirical constants for the algorithm.
325
453
    float alpha = 0;
326
327
453
    if (num_streams == 16) {
328
0
        alpha = 0.673F;
329
453
    } else if (num_streams == 32) {
330
0
        alpha = 0.697F;
331
453
    } else if (num_streams == 64) {
332
0
        alpha = 0.709F;
333
453
    } else {
334
453
        alpha = 0.7213F / (1 + 1.079F / num_streams);
335
453
    }
336
337
453
    float harmonic_mean = 0;
338
453
    int num_zero_registers = 0;
339
340
7.42M
    for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) {
341
7.42M
        harmonic_mean += powf(2.0F, -_registers[i]);
342
343
7.42M
        if (_registers[i] == 0) {
344
5.21M
            ++num_zero_registers;
345
5.21M
        }
346
7.42M
    }
347
348
453
    harmonic_mean = 1.0F / harmonic_mean;
349
453
    double estimate = alpha * num_streams * num_streams * harmonic_mean;
350
    // according to HyperLogLog current correction, if E is cardinal
351
    // E =< num_streams * 2.5 , LC has higher accuracy.
352
    // num_streams * 2.5 < E , HyperLogLog has higher accuracy.
353
    // Generally , we can use HyperLogLog to produce value as E.
354
453
    if (estimate <= num_streams * 2.5 && num_zero_registers != 0) {
355
        // Estimated cardinality is too low. Hll is too inaccurate here, instead use
356
        // linear counting.
357
389
        estimate = num_streams * log(static_cast<float>(num_streams) / num_zero_registers);
358
389
    } else if (num_streams == 16384 && estimate < 72000) {
359
        // when Linear Couint change to HyperLogLog according to HyperLogLog Correction,
360
        // there are relatively large fluctuations, we fixed the problem refer to redis.
361
12
        double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) -
362
12
                      1.4253 * 1.0e-12 * (estimate * estimate * estimate) +
363
12
                      1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate +
364
12
                      83.3216;
365
12
        estimate -= estimate * (bias / 100);
366
12
    }
367
453
    return (int64_t)(estimate + 0.5);
368
30.4k
}
369
370
0
void HllSetResolver::parse() {
371
    // skip LengthValueType
372
0
    char* pdata = _buf_ref;
373
0
    _set_type = (HllDataType)pdata[0];
374
0
    char* sparse_data = nullptr;
375
0
    switch (_set_type) {
376
0
    case HLL_DATA_EXPLICIT:
377
        // first byte : type
378
        // second~five byte : hash values's number
379
        // five byte later : hash value
380
0
        _explicit_num = (ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]);
381
0
        _explicit_value =
382
0
                (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExplicitLengthValueType));
383
0
        break;
384
0
    case HLL_DATA_SPARSE:
385
        // first byte : type
386
        // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero
387
        // 2^HLL_COLUMN_PRECISION)/8 + 1以后value
388
0
        _sparse_count = (SparseLengthValueType*)(pdata + sizeof(SetTypeValueType));
389
0
        sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType);
390
0
        for (int i = 0; i < *_sparse_count; i++) {
391
0
            auto* index = (SparseIndexType*)sparse_data;
392
0
            sparse_data += sizeof(SparseIndexType);
393
0
            auto* value = (SparseValueType*)sparse_data;
394
0
            _sparse_map[*index] = *value;
395
0
            sparse_data += sizeof(SetTypeValueType);
396
0
        }
397
0
        break;
398
0
    case HLL_DATA_FULL:
399
        // first byte : type
400
        // second byte later : hll register value
401
0
        _full_value_position = pdata + sizeof(SetTypeValueType);
402
0
        break;
403
0
    default:
404
        // HLL_DATA_EMPTY
405
0
        break;
406
0
    }
407
0
}
408
409
} // namespace doris