/root/doris/be/src/olap/hll.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <stdint.h> |
21 | | #include <string.h> |
22 | | |
23 | | #include <map> |
24 | | #include <string> |
25 | | #include <utility> |
26 | | |
27 | | #ifdef __x86_64__ |
28 | | #include <immintrin.h> |
29 | | #endif |
30 | | |
31 | | #include "vec/common/hash_table/phmap_fwd_decl.h" |
32 | | |
33 | | namespace doris { |
34 | | |
35 | | struct Slice; |
36 | | |
37 | | inline const int HLL_COLUMN_PRECISION = 14; |
38 | | inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); |
39 | | inline const int HLL_EXPLICIT_INT64_NUM = 160; |
40 | | inline const int HLL_SPARSE_THRESHOLD = 4096; |
41 | | inline const int HLL_REGISTERS_COUNT = 16 * 1024; |
42 | | // maximum size in byte of serialized HLL: type(1) + registers (2^14) |
43 | | inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1; |
44 | | |
45 | | // 1 for type; 1 for hash values count; 8 for hash value |
46 | | inline const int HLL_SINGLE_VALUE_SIZE = 10; |
47 | | inline const int HLL_EMPTY_SIZE = 1; |
48 | | |
49 | | // Hyperloglog distinct estimate algorithm. |
50 | | // See these papers for more details. |
51 | | // 1) Hyperloglog: The analysis of a near-optimal cardinality estimation |
52 | | // algorithm (2007) |
53 | | // 2) HyperLogLog in Practice (paper from google with some improvements) |
54 | | |
55 | | // Each HLL value is a set of values. To save space, Doris store HLL value |
56 | | // in different format according to its cardinality. |
57 | | // |
58 | | // HLL_DATA_EMPTY: when set is empty. |
59 | | // |
60 | | // HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit. |
61 | | // If the number of hash values is not greater than 160, set is encoded in this format. |
62 | | // The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen, |
63 | | // maybe can be other number. If you are interested, you can try other number and see |
64 | | // if it will be better. |
65 | | // |
66 | | // HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers |
67 | | // is not greater than 4096, set is encoded in this format. The max space occupied is |
68 | | // (1 + 4 + 3 * 4096) = 12293. |
69 | | // |
70 | | // HLL_DATA_FULL: most space-consuming, store all registers |
71 | | // |
72 | | // A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not |
73 | | // allow reverse. |
74 | | // |
75 | | // NOTE: This values are persisted in storage devices, so don't change exist |
76 | | // enum values. |
77 | | enum HllDataType { |
78 | | HLL_DATA_EMPTY = 0, |
79 | | HLL_DATA_EXPLICIT = 1, |
80 | | HLL_DATA_SPARSE = 2, |
81 | | HLL_DATA_FULL = 3, |
82 | | }; |
83 | | |
84 | | class HyperLogLog { |
85 | | public: |
86 | 5.16k | HyperLogLog() = default; |
87 | 0 | explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) { |
88 | 0 | _hash_set.emplace(hash_value); |
89 | 0 | } |
90 | | explicit HyperLogLog(const Slice& src); |
91 | | |
92 | 12.3k | HyperLogLog(const HyperLogLog& other) { |
93 | 12.3k | this->_type = other._type; |
94 | 12.3k | switch (other._type) { |
95 | 2.04k | case HLL_DATA_EMPTY: |
96 | 2.04k | break; |
97 | 10.3k | case HLL_DATA_EXPLICIT: { |
98 | 10.3k | this->_hash_set = other._hash_set; |
99 | 10.3k | break; |
100 | 0 | } |
101 | 0 | case HLL_DATA_SPARSE: |
102 | 0 | case HLL_DATA_FULL: { |
103 | 0 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
104 | 0 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
105 | 0 | break; |
106 | 0 | } |
107 | 0 | default: |
108 | 0 | break; |
109 | 12.3k | } |
110 | 12.3k | } |
111 | | |
112 | 3.07k | HyperLogLog(HyperLogLog&& other) { |
113 | 3.07k | this->_type = other._type; |
114 | 3.07k | switch (other._type) { |
115 | 1.02k | case HLL_DATA_EMPTY: |
116 | 1.02k | break; |
117 | 2.04k | case HLL_DATA_EXPLICIT: { |
118 | 2.04k | this->_hash_set = std::move(other._hash_set); |
119 | 2.04k | other._type = HLL_DATA_EMPTY; |
120 | 2.04k | break; |
121 | 0 | } |
122 | 0 | case HLL_DATA_SPARSE: |
123 | 0 | case HLL_DATA_FULL: { |
124 | 0 | this->_registers = other._registers; |
125 | 0 | other._registers = nullptr; |
126 | 0 | other._type = HLL_DATA_EMPTY; |
127 | 0 | break; |
128 | 0 | } |
129 | 0 | default: |
130 | 0 | break; |
131 | 3.07k | } |
132 | 3.07k | } |
133 | | |
134 | 0 | HyperLogLog& operator=(HyperLogLog&& other) { |
135 | 0 | if (this != &other) { |
136 | 0 | if (_registers != nullptr) { |
137 | 0 | delete[] _registers; |
138 | 0 | _registers = nullptr; |
139 | 0 | } |
140 | |
|
141 | 0 | this->_type = other._type; |
142 | 0 | switch (other._type) { |
143 | 0 | case HLL_DATA_EMPTY: |
144 | 0 | break; |
145 | 0 | case HLL_DATA_EXPLICIT: { |
146 | 0 | this->_hash_set = std::move(other._hash_set); |
147 | 0 | other._type = HLL_DATA_EMPTY; |
148 | 0 | break; |
149 | 0 | } |
150 | 0 | case HLL_DATA_SPARSE: |
151 | 0 | case HLL_DATA_FULL: { |
152 | 0 | this->_registers = other._registers; |
153 | 0 | other._registers = nullptr; |
154 | 0 | other._type = HLL_DATA_EMPTY; |
155 | 0 | break; |
156 | 0 | } |
157 | 0 | default: |
158 | 0 | break; |
159 | 0 | } |
160 | 0 | } |
161 | 0 | return *this; |
162 | 0 | } |
163 | | |
164 | 0 | HyperLogLog& operator=(const HyperLogLog& other) { |
165 | 0 | if (this != &other) { |
166 | 0 | if (_registers != nullptr) { |
167 | 0 | delete[] _registers; |
168 | 0 | _registers = nullptr; |
169 | 0 | } |
170 | |
|
171 | 0 | this->_type = other._type; |
172 | 0 | switch (other._type) { |
173 | 0 | case HLL_DATA_EMPTY: |
174 | 0 | break; |
175 | 0 | case HLL_DATA_EXPLICIT: { |
176 | 0 | this->_hash_set = other._hash_set; |
177 | 0 | break; |
178 | 0 | } |
179 | 0 | case HLL_DATA_SPARSE: |
180 | 0 | case HLL_DATA_FULL: { |
181 | 0 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
182 | 0 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
183 | 0 | break; |
184 | 0 | } |
185 | 0 | default: |
186 | 0 | break; |
187 | 0 | } |
188 | 0 | } |
189 | 0 | return *this; |
190 | 0 | } |
191 | | |
192 | 21.6k | ~HyperLogLog() { clear(); } |
193 | 21.6k | void clear() { |
194 | 21.6k | _type = HLL_DATA_EMPTY; |
195 | 21.6k | _hash_set.clear(); |
196 | 21.6k | delete[] _registers; |
197 | 21.6k | _registers = nullptr; |
198 | 21.6k | } |
199 | | |
200 | | typedef uint8_t SetTypeValueType; |
201 | | typedef int32_t SparseLengthValueType; |
202 | | typedef uint16_t SparseIndexType; |
203 | | typedef uint8_t SparseValueType; |
204 | | |
205 | | // Add a hash value to this HLL value |
206 | | // NOTE: input must be a hash_value |
207 | | void update(uint64_t hash_value); |
208 | | |
209 | | void merge(const HyperLogLog& other); |
210 | | |
211 | | // Return max size of serialized binary |
212 | | size_t max_serialized_size() const; |
213 | | |
214 | 0 | size_t memory_consumed() const { |
215 | 0 | size_t size = sizeof(*this); |
216 | 0 | if (_type == HLL_DATA_EXPLICIT) |
217 | 0 | size += _hash_set.size() * sizeof(uint64_t); |
218 | 0 | else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) |
219 | 0 | size += HLL_REGISTERS_COUNT; |
220 | 0 | return size; |
221 | 0 | } |
222 | | |
223 | | // Input slice should has enough capacity for serialize, which |
224 | | // can be get through max_serialized_size(). If insufficient buffer |
225 | | // is given, this will cause process crash. |
226 | | // Return actual size of serialized binary. |
227 | | size_t serialize(uint8_t* dst) const; |
228 | | |
229 | | // Now, only empty HLL support this function. |
230 | | bool deserialize(const Slice& slice); |
231 | | |
232 | | int64_t estimate_cardinality() const; |
233 | | |
234 | 0 | static std::string empty() { |
235 | 0 | static HyperLogLog hll; |
236 | 0 | std::string buf; |
237 | 0 | buf.resize(HLL_EMPTY_SIZE); |
238 | 0 | hll.serialize((uint8_t*)buf.c_str()); |
239 | 0 | return buf; |
240 | 0 | } |
241 | | |
242 | | // Check if input slice is a valid serialized binary of HyperLogLog. |
243 | | // This function only check the encoded type in slice, whose complex |
244 | | // function is O(1). |
245 | | static bool is_valid(const Slice& slice); |
246 | | |
247 | | // only for debug |
248 | 0 | std::string to_string() { |
249 | 0 | switch (_type) { |
250 | 0 | case HLL_DATA_EMPTY: |
251 | 0 | return {}; |
252 | 0 | case HLL_DATA_EXPLICIT: |
253 | 0 | case HLL_DATA_SPARSE: |
254 | 0 | case HLL_DATA_FULL: { |
255 | 0 | std::string str {"hash set size: "}; |
256 | 0 | str.append(std::to_string(_hash_set.size())); |
257 | 0 | str.append("\ncardinality:\t"); |
258 | 0 | str.append(std::to_string(estimate_cardinality())); |
259 | 0 | str.append("\ntype:\t"); |
260 | 0 | str.append(std::to_string(_type)); |
261 | 0 | return str; |
262 | 0 | } |
263 | 0 | default: |
264 | 0 | return {}; |
265 | 0 | } |
266 | 0 | } |
267 | | |
268 | | private: |
269 | | HllDataType _type = HLL_DATA_EMPTY; |
270 | | vectorized::flat_hash_set<uint64_t> _hash_set; |
271 | | |
272 | | // This field is much space consuming(HLL_REGISTERS_COUNT), we create |
273 | | // it only when it is really needed. |
274 | | uint8_t* _registers = nullptr; |
275 | | |
276 | | private: |
277 | | void _convert_explicit_to_register(); |
278 | | |
279 | | // update one hash value into this registers |
280 | 67.9k | void _update_registers(uint64_t hash_value) { |
281 | | // Use the lower bits to index into the number of streams and then |
282 | | // find the first 1 bit after the index bits. |
283 | 67.9k | int idx = hash_value % HLL_REGISTERS_COUNT; |
284 | 67.9k | hash_value >>= HLL_COLUMN_PRECISION; |
285 | | // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1 |
286 | 67.9k | hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS); |
287 | 67.9k | uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1; |
288 | 67.9k | _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]); |
289 | 67.9k | } |
290 | | |
291 | | // absorb other registers into this registers |
292 | 3 | void _merge_registers(const uint8_t* other_registers) { |
293 | | #ifdef __AVX2__ |
294 | | int loop = HLL_REGISTERS_COUNT / 32; // 32 = 256/8 |
295 | | uint8_t* dst = _registers; |
296 | | const uint8_t* src = other_registers; |
297 | | for (int i = 0; i < loop; i++) { |
298 | | __m256i xa = _mm256_loadu_si256((const __m256i*)dst); |
299 | | __m256i xb = _mm256_loadu_si256((const __m256i*)src); |
300 | | _mm256_storeu_si256((__m256i*)dst, _mm256_max_epu8(xa, xb)); |
301 | | src += 32; |
302 | | dst += 32; |
303 | | } |
304 | | #else |
305 | 49.1k | for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
306 | 49.1k | _registers[i] = |
307 | 49.1k | (_registers[i] < other_registers[i] ? other_registers[i] : _registers[i]); |
308 | 49.1k | } |
309 | 3 | #endif |
310 | 3 | } |
311 | | }; |
312 | | |
313 | | // todo(kks): remove this when dpp_sink class was removed |
314 | | class HllSetResolver { |
315 | | public: |
316 | | HllSetResolver() |
317 | | : _buf_ref(nullptr), |
318 | | _buf_len(0), |
319 | | _set_type(HLL_DATA_EMPTY), |
320 | | _full_value_position(nullptr), |
321 | | _explicit_value(nullptr), |
322 | 0 | _explicit_num(0) {} |
323 | | |
324 | 0 | ~HllSetResolver() {} |
325 | | |
326 | | typedef uint8_t SetTypeValueType; |
327 | | typedef uint8_t ExplicitLengthValueType; |
328 | | typedef int32_t SparseLengthValueType; |
329 | | typedef uint16_t SparseIndexType; |
330 | | typedef uint8_t SparseValueType; |
331 | | |
332 | | // only save pointer |
333 | 0 | void init(char* buf, int len) { |
334 | 0 | this->_buf_ref = buf; |
335 | 0 | this->_buf_len = len; |
336 | 0 | } |
337 | | |
338 | | // hll set type |
339 | 0 | HllDataType get_hll_data_type() { return _set_type; } |
340 | | |
341 | | // explicit value num |
342 | 0 | int get_explicit_count() { return (int)_explicit_num; } |
343 | | |
344 | | // get explicit index value 64bit |
345 | 0 | uint64_t get_explicit_value(int index) { |
346 | 0 | if (index >= _explicit_num) { |
347 | 0 | return -1; |
348 | 0 | } |
349 | 0 | return _explicit_value[index]; |
350 | 0 | } |
351 | | |
352 | | // get full register value |
353 | 0 | char* get_full_value() { return _full_value_position; } |
354 | | |
355 | | // get (index, value) map |
356 | 0 | std::map<SparseIndexType, SparseValueType>& get_sparse_map() { return _sparse_map; } |
357 | | |
358 | | // parse set , call after copy() or init() |
359 | | void parse(); |
360 | | |
361 | | private: |
362 | | char* _buf_ref; // set |
363 | | int _buf_len; // set len |
364 | | HllDataType _set_type; //set type |
365 | | char* _full_value_position; |
366 | | uint64_t* _explicit_value; |
367 | | ExplicitLengthValueType _explicit_num; |
368 | | std::map<SparseIndexType, SparseValueType> _sparse_map; |
369 | | SparseLengthValueType* _sparse_count; |
370 | | }; |
371 | | |
372 | | } // namespace doris |