/root/doris/be/src/olap/hll.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "olap/hll.h" |
19 | | |
20 | | #include <cmath> |
21 | | #include <map> |
22 | | #include <ostream> |
23 | | |
24 | | #include "common/logging.h" |
25 | | #include "util/coding.h" |
26 | | #include "util/slice.h" |
27 | | |
28 | | using std::string; |
29 | | using std::stringstream; |
30 | | |
31 | | namespace doris { |
32 | | |
33 | 1.03k | HyperLogLog::HyperLogLog(const Slice& src) { |
34 | | // When deserialize return false, we make this object a empty |
35 | 1.03k | if (!deserialize(src)) { |
36 | 1.02k | _type = HLL_DATA_EMPTY; |
37 | 1.02k | } |
38 | 1.03k | } |
39 | | |
40 | | // Convert explicit values to register format, and clear explicit values. |
41 | | // NOTE: this function won't modify _type. |
42 | 5 | void HyperLogLog::_convert_explicit_to_register() { |
43 | 5 | DCHECK(_type == HLL_DATA_EXPLICIT) |
44 | 0 | << "_type(" << _type << ") should be explicit(" << HLL_DATA_EXPLICIT << ")"; |
45 | 5 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
46 | 5 | memset(_registers, 0, HLL_REGISTERS_COUNT); |
47 | 780 | for (auto value : _hash_set) { |
48 | 780 | _update_registers(value); |
49 | 780 | } |
50 | | // clear _hash_set |
51 | 5 | vectorized::flat_hash_set<uint64_t>().swap(_hash_set); |
52 | 5 | } |
53 | | |
54 | | // Change HLL_DATA_EXPLICIT to HLL_DATA_FULL directly, because HLL_DATA_SPARSE |
55 | | // is implemented in the same way in memory with HLL_DATA_FULL. |
56 | 71.0k | void HyperLogLog::update(uint64_t hash_value) { |
57 | 71.0k | switch (_type) { |
58 | 3.11k | case HLL_DATA_EMPTY: |
59 | 3.11k | _hash_set.insert(hash_value); |
60 | 3.11k | _type = HLL_DATA_EXPLICIT; |
61 | 3.11k | break; |
62 | 782 | case HLL_DATA_EXPLICIT: |
63 | 782 | if (_hash_set.size() < HLL_EXPLICIT_INT64_NUM) { |
64 | 779 | _hash_set.insert(hash_value); |
65 | 779 | break; |
66 | 779 | } |
67 | 3 | _convert_explicit_to_register(); |
68 | 3 | _type = HLL_DATA_FULL; |
69 | | // fall through |
70 | 4 | case HLL_DATA_SPARSE: |
71 | 67.1k | case HLL_DATA_FULL: |
72 | 67.1k | _update_registers(hash_value); |
73 | 67.1k | break; |
74 | 71.0k | } |
75 | 71.0k | } |
76 | | |
77 | 8 | void HyperLogLog::merge(const HyperLogLog& other) { |
78 | | // fast path |
79 | 8 | if (other._type == HLL_DATA_EMPTY) { |
80 | 0 | return; |
81 | 0 | } |
82 | 8 | switch (_type) { |
83 | 2 | case HLL_DATA_EMPTY: { |
84 | | // _type must change |
85 | 2 | _type = other._type; |
86 | 2 | switch (other._type) { |
87 | 1 | case HLL_DATA_EXPLICIT: |
88 | 1 | _hash_set = other._hash_set; |
89 | 1 | break; |
90 | 0 | case HLL_DATA_SPARSE: |
91 | 1 | case HLL_DATA_FULL: |
92 | 1 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
93 | 1 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
94 | 1 | break; |
95 | 0 | default: |
96 | 0 | break; |
97 | 2 | } |
98 | 2 | break; |
99 | 2 | } |
100 | 3 | case HLL_DATA_EXPLICIT: { |
101 | 3 | switch (other._type) { |
102 | 2 | case HLL_DATA_EXPLICIT: { |
103 | | // Merge other's explicit values first, then check if the number is exceed |
104 | | // HLL_EXPLICIT_INT64_NUM. This is OK because the max value is 2 * 160. |
105 | 2 | _hash_set.insert(other._hash_set.begin(), other._hash_set.end()); |
106 | 2 | if (_hash_set.size() > HLL_EXPLICIT_INT64_NUM) { |
107 | 1 | _convert_explicit_to_register(); |
108 | 1 | _type = HLL_DATA_FULL; |
109 | 1 | } |
110 | 2 | } break; |
111 | 0 | case HLL_DATA_SPARSE: |
112 | 1 | case HLL_DATA_FULL: |
113 | 1 | _convert_explicit_to_register(); |
114 | 1 | _merge_registers(other._registers); |
115 | 1 | _type = HLL_DATA_FULL; |
116 | 1 | break; |
117 | 0 | default: |
118 | 0 | break; |
119 | 3 | } |
120 | 3 | break; |
121 | 3 | } |
122 | 3 | case HLL_DATA_SPARSE: |
123 | 3 | case HLL_DATA_FULL: { |
124 | 3 | switch (other._type) { |
125 | 1 | case HLL_DATA_EXPLICIT: |
126 | 100 | for (auto hash_value : other._hash_set) { |
127 | 100 | _update_registers(hash_value); |
128 | 100 | } |
129 | 1 | break; |
130 | 0 | case HLL_DATA_SPARSE: |
131 | 2 | case HLL_DATA_FULL: |
132 | 2 | _merge_registers(other._registers); |
133 | 2 | break; |
134 | 0 | default: |
135 | 0 | break; |
136 | 3 | } |
137 | 3 | break; |
138 | 3 | } |
139 | 8 | } |
140 | 8 | } |
141 | | |
142 | 5.12k | size_t HyperLogLog::max_serialized_size() const { |
143 | 5.12k | switch (_type) { |
144 | 0 | case HLL_DATA_EMPTY: |
145 | 0 | default: |
146 | 0 | return 1; |
147 | 5.12k | case HLL_DATA_EXPLICIT: |
148 | 5.12k | return 2 + _hash_set.size() * 8; |
149 | 0 | case HLL_DATA_SPARSE: |
150 | 0 | case HLL_DATA_FULL: |
151 | 0 | return 1 + HLL_REGISTERS_COUNT; |
152 | 5.12k | } |
153 | 5.12k | } |
154 | | |
155 | 5.12k | size_t HyperLogLog::serialize(uint8_t* dst) const { |
156 | 5.12k | uint8_t* ptr = dst; |
157 | 5.12k | switch (_type) { |
158 | 1 | case HLL_DATA_EMPTY: |
159 | 1 | default: { |
160 | | // When the _type is unknown, which may not happen, we encode it as |
161 | | // Empty HyperLogLog object. |
162 | 1 | *ptr++ = HLL_DATA_EMPTY; |
163 | 1 | break; |
164 | 1 | } |
165 | 5.12k | case HLL_DATA_EXPLICIT: { |
166 | 5.12k | DCHECK(_hash_set.size() <= HLL_EXPLICIT_INT64_NUM) |
167 | 0 | << "Number of explicit elements(" << _hash_set.size() |
168 | 0 | << ") should be less or equal than " << HLL_EXPLICIT_INT64_NUM; |
169 | 5.12k | *ptr++ = _type; |
170 | 5.12k | *ptr++ = (uint8_t)_hash_set.size(); |
171 | 5.22k | for (auto hash_value : _hash_set) { |
172 | 5.22k | encode_fixed64_le(ptr, hash_value); |
173 | 5.22k | ptr += 8; |
174 | 5.22k | } |
175 | 5.12k | break; |
176 | 1 | } |
177 | 0 | case HLL_DATA_SPARSE: |
178 | 2 | case HLL_DATA_FULL: { |
179 | 2 | uint32_t num_non_zero_registers = 0; |
180 | 32.7k | for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
181 | 32.7k | num_non_zero_registers += (_registers[i] != 0); |
182 | 32.7k | } |
183 | | |
184 | | // each register in sparse format will occupy 3bytes, 2 for index and |
185 | | // 1 for register value. So if num_non_zero_registers is greater than |
186 | | // 4K we use full encode format. |
187 | 2 | if (num_non_zero_registers > HLL_SPARSE_THRESHOLD) { |
188 | 1 | *ptr++ = HLL_DATA_FULL; |
189 | 1 | memcpy(ptr, _registers, HLL_REGISTERS_COUNT); |
190 | 1 | ptr += HLL_REGISTERS_COUNT; |
191 | 1 | } else { |
192 | 1 | *ptr++ = HLL_DATA_SPARSE; |
193 | | // 2-5(4 byte): number of registers |
194 | 1 | encode_fixed32_le(ptr, num_non_zero_registers); |
195 | 1 | ptr += 4; |
196 | | |
197 | 16.3k | for (uint32_t i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
198 | 16.3k | if (_registers[i] == 0) { |
199 | 15.4k | continue; |
200 | 15.4k | } |
201 | | // 2 bytes: register index |
202 | | // 1 byte: register value |
203 | 984 | encode_fixed16_le(ptr, i); |
204 | 984 | ptr += 2; |
205 | 984 | *ptr++ = _registers[i]; |
206 | 984 | } |
207 | 1 | } |
208 | 2 | break; |
209 | 0 | } |
210 | 5.12k | } |
211 | 5.12k | return ptr - dst; |
212 | 5.12k | } |
213 | | |
214 | 3.08k | bool HyperLogLog::is_valid(const Slice& slice) { |
215 | 3.08k | if (slice.size < 1) { |
216 | 1 | return false; |
217 | 1 | } |
218 | 3.08k | const uint8_t* ptr = (uint8_t*)slice.data; |
219 | 3.08k | const uint8_t* end = (uint8_t*)slice.data + slice.size; |
220 | 3.08k | auto type = (HllDataType)*ptr++; |
221 | 3.08k | switch (type) { |
222 | 3 | case HLL_DATA_EMPTY: |
223 | 3 | break; |
224 | 3.07k | case HLL_DATA_EXPLICIT: { |
225 | 3.07k | if ((ptr + 1) > end) { |
226 | 1 | return false; |
227 | 1 | } |
228 | 3.07k | uint8_t num_explicits = *ptr++; |
229 | 3.07k | ptr += num_explicits * 8; |
230 | 3.07k | break; |
231 | 3.07k | } |
232 | 3 | case HLL_DATA_SPARSE: { |
233 | 3 | if ((ptr + 4) > end) { |
234 | 1 | return false; |
235 | 1 | } |
236 | 2 | uint32_t num_registers = decode_fixed32_le(ptr); |
237 | 2 | ptr += 4 + 3 * num_registers; |
238 | 2 | break; |
239 | 3 | } |
240 | 3 | case HLL_DATA_FULL: { |
241 | 3 | ptr += HLL_REGISTERS_COUNT; |
242 | 3 | break; |
243 | 3 | } |
244 | 2 | default: |
245 | 2 | return false; |
246 | 3.08k | } |
247 | 3.08k | return ptr == end; |
248 | 3.08k | } |
249 | | |
250 | | // TODO(zc): check input string's length |
251 | 3.07k | bool HyperLogLog::deserialize(const Slice& slice) { |
252 | | // can be called only when type is empty |
253 | 3.07k | DCHECK(_type == HLL_DATA_EMPTY); |
254 | | |
255 | | // NOTE(zc): Don't remove this check unless you known what |
256 | | // you are doing. Because of history bug, we ingest some |
257 | | // invalid HLL data in storage, which ptr is nullptr. |
258 | | // we must handle this case to avoid process crash. |
259 | | // This bug is in release 0.10, I think we can remove this |
260 | | // in release 0.12 or later. |
261 | 3.07k | if (slice.data == nullptr || slice.size <= 0) { |
262 | 1 | return false; |
263 | 1 | } |
264 | | // check if input length is valid |
265 | 3.07k | if (!is_valid(slice)) { |
266 | 1.02k | return false; |
267 | 1.02k | } |
268 | | |
269 | 2.05k | const uint8_t* ptr = (uint8_t*)slice.data; |
270 | | // first byte : type |
271 | 2.05k | _type = (HllDataType)*ptr++; |
272 | 2.05k | switch (_type) { |
273 | 1 | case HLL_DATA_EMPTY: |
274 | 1 | break; |
275 | 2.04k | case HLL_DATA_EXPLICIT: { |
276 | | // 2: number of explicit values |
277 | | // make sure that num_explicit is positive |
278 | 2.04k | uint8_t num_explicits = *ptr++; |
279 | | // 3+: 8 bytes hash value |
280 | 4.19k | for (int i = 0; i < num_explicits; ++i) { |
281 | 2.14k | _hash_set.insert(decode_fixed64_le(ptr)); |
282 | 2.14k | ptr += 8; |
283 | 2.14k | } |
284 | 2.04k | break; |
285 | 0 | } |
286 | 1 | case HLL_DATA_SPARSE: { |
287 | 1 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
288 | 1 | memset(_registers, 0, HLL_REGISTERS_COUNT); |
289 | | // 2-5(4 byte): number of registers |
290 | 1 | uint32_t num_registers = decode_fixed32_le(ptr); |
291 | 1 | ptr += 4; |
292 | 985 | for (uint32_t i = 0; i < num_registers; ++i) { |
293 | | // 2 bytes: register index |
294 | | // 1 byte: register value |
295 | 984 | uint16_t register_idx = decode_fixed16_le(ptr); |
296 | 984 | ptr += 2; |
297 | 984 | _registers[register_idx] = *ptr++; |
298 | 984 | } |
299 | 1 | break; |
300 | 0 | } |
301 | 1 | case HLL_DATA_FULL: { |
302 | 1 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
303 | | // 2+ : hll register value |
304 | 1 | memcpy(_registers, ptr, HLL_REGISTERS_COUNT); |
305 | 1 | break; |
306 | 0 | } |
307 | 0 | default: |
308 | | // revert type to EMPTY |
309 | 0 | _type = HLL_DATA_EMPTY; |
310 | 0 | return false; |
311 | 2.05k | } |
312 | 2.05k | return true; |
313 | 2.05k | } |
314 | | |
315 | 26 | int64_t HyperLogLog::estimate_cardinality() const { |
316 | 26 | if (_type == HLL_DATA_EMPTY) { |
317 | 3 | return 0; |
318 | 3 | } |
319 | 23 | if (_type == HLL_DATA_EXPLICIT) { |
320 | 10 | return _hash_set.size(); |
321 | 10 | } |
322 | | |
323 | 13 | const int num_streams = HLL_REGISTERS_COUNT; |
324 | | // Empirical constants for the algorithm. |
325 | 13 | float alpha = 0; |
326 | | |
327 | 13 | if (num_streams == 16) { |
328 | 0 | alpha = 0.673F; |
329 | 13 | } else if (num_streams == 32) { |
330 | 0 | alpha = 0.697F; |
331 | 13 | } else if (num_streams == 64) { |
332 | 0 | alpha = 0.709F; |
333 | 13 | } else { |
334 | 13 | alpha = 0.7213F / (1 + 1.079F / num_streams); |
335 | 13 | } |
336 | | |
337 | 13 | float harmonic_mean = 0; |
338 | 13 | int num_zero_registers = 0; |
339 | | |
340 | 213k | for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
341 | 212k | harmonic_mean += powf(2.0F, -_registers[i]); |
342 | | |
343 | 212k | if (_registers[i] == 0) { |
344 | 110k | ++num_zero_registers; |
345 | 110k | } |
346 | 212k | } |
347 | | |
348 | 13 | harmonic_mean = 1.0F / harmonic_mean; |
349 | 13 | double estimate = alpha * num_streams * num_streams * harmonic_mean; |
350 | | // according to HyperLogLog current correction, if E is cardinal |
351 | | // E =< num_streams * 2.5 , LC has higher accuracy. |
352 | | // num_streams * 2.5 < E , HyperLogLog has higher accuracy. |
353 | | // Generally , we can use HyperLogLog to produce value as E. |
354 | 13 | if (estimate <= num_streams * 2.5 && num_zero_registers != 0) { |
355 | | // Estimated cardinality is too low. Hll is too inaccurate here, instead use |
356 | | // linear counting. |
357 | 7 | estimate = num_streams * log(static_cast<float>(num_streams) / num_zero_registers); |
358 | 7 | } else if (num_streams == 16384 && estimate < 72000) { |
359 | | // when Linear Couint change to HyperLogLog according to HyperLogLog Correction, |
360 | | // there are relatively large fluctuations, we fixed the problem refer to redis. |
361 | 6 | double bias = 5.9119 * 1.0e-18 * (estimate * estimate * estimate * estimate) - |
362 | 6 | 1.4253 * 1.0e-12 * (estimate * estimate * estimate) + |
363 | 6 | 1.2940 * 1.0e-7 * (estimate * estimate) - 5.2921 * 1.0e-3 * estimate + |
364 | 6 | 83.3216; |
365 | 6 | estimate -= estimate * (bias / 100); |
366 | 6 | } |
367 | 13 | return (int64_t)(estimate + 0.5); |
368 | 23 | } |
369 | | |
370 | 0 | void HllSetResolver::parse() { |
371 | | // skip LengthValueType |
372 | 0 | char* pdata = _buf_ref; |
373 | 0 | _set_type = (HllDataType)pdata[0]; |
374 | 0 | char* sparse_data = nullptr; |
375 | 0 | switch (_set_type) { |
376 | 0 | case HLL_DATA_EXPLICIT: |
377 | | // first byte : type |
378 | | // second~five byte : hash values's number |
379 | | // five byte later : hash value |
380 | 0 | _explicit_num = (ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]); |
381 | 0 | _explicit_value = |
382 | 0 | (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExplicitLengthValueType)); |
383 | 0 | break; |
384 | 0 | case HLL_DATA_SPARSE: |
385 | | // first byte : type |
386 | | // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero |
387 | | // 2^HLL_COLUMN_PRECISION)/8 + 1以后value |
388 | 0 | _sparse_count = (SparseLengthValueType*)(pdata + sizeof(SetTypeValueType)); |
389 | 0 | sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType); |
390 | 0 | for (int i = 0; i < *_sparse_count; i++) { |
391 | 0 | auto* index = (SparseIndexType*)sparse_data; |
392 | 0 | sparse_data += sizeof(SparseIndexType); |
393 | 0 | auto* value = (SparseValueType*)sparse_data; |
394 | 0 | _sparse_map[*index] = *value; |
395 | 0 | sparse_data += sizeof(SetTypeValueType); |
396 | 0 | } |
397 | 0 | break; |
398 | 0 | case HLL_DATA_FULL: |
399 | | // first byte : type |
400 | | // second byte later : hll register value |
401 | 0 | _full_value_position = pdata + sizeof(SetTypeValueType); |
402 | 0 | break; |
403 | 0 | default: |
404 | | // HLL_DATA_EMPTY |
405 | 0 | break; |
406 | 0 | } |
407 | 0 | } |
408 | | |
409 | | } // namespace doris |