/root/doris/be/src/olap/hll.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <cstdint> |
21 | | #include <cstring> |
22 | | #include <map> |
23 | | #include <string> |
24 | | #include <utility> |
25 | | |
26 | | #ifdef __x86_64__ |
27 | | #include <immintrin.h> |
28 | | #endif |
29 | | |
30 | | #include "vec/common/hash_table/phmap_fwd_decl.h" |
31 | | |
32 | | namespace doris { |
33 | | |
34 | | struct Slice; |
35 | | |
36 | | inline const int HLL_COLUMN_PRECISION = 14; |
37 | | inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); |
38 | | inline const int HLL_EXPLICIT_INT64_NUM = 160; |
39 | | inline const int HLL_SPARSE_THRESHOLD = 4096; |
40 | | inline const int HLL_REGISTERS_COUNT = 16 * 1024; |
41 | | // maximum size in byte of serialized HLL: type(1) + registers (2^14) |
42 | | inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1; |
43 | | |
44 | | // 1 for type; 1 for hash values count; 8 for hash value |
45 | | inline const int HLL_SINGLE_VALUE_SIZE = 10; |
46 | | inline const int HLL_EMPTY_SIZE = 1; |
47 | | |
48 | | // Hyperloglog distinct estimate algorithm. |
49 | | // See these papers for more details. |
50 | | // 1) Hyperloglog: The analysis of a near-optimal cardinality estimation |
51 | | // algorithm (2007) |
52 | | // 2) HyperLogLog in Practice (paper from google with some improvements) |
53 | | |
54 | | // Each HLL value is a set of values. To save space, Doris store HLL value |
55 | | // in different format according to its cardinality. |
56 | | // |
57 | | // HLL_DATA_EMPTY: when set is empty. |
58 | | // |
59 | | // HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit. |
60 | | // If the number of hash values is not greater than 160, set is encoded in this format. |
61 | | // The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen, |
62 | | // maybe can be other number. If you are interested, you can try other number and see |
63 | | // if it will be better. |
64 | | // |
65 | | // HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers |
66 | | // is not greater than 4096, set is encoded in this format. The max space occupied is |
67 | | // (1 + 4 + 3 * 4096) = 12293. |
68 | | // |
69 | | // HLL_DATA_FULL: most space-consuming, store all registers |
70 | | // |
71 | | // A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not |
72 | | // allow reverse. |
73 | | // |
74 | | // NOTE: This values are persisted in storage devices, so don't change exist |
75 | | // enum values. |
76 | | enum HllDataType { |
77 | | HLL_DATA_EMPTY = 0, |
78 | | HLL_DATA_EXPLICIT = 1, |
79 | | HLL_DATA_SPARSE = 2, |
80 | | HLL_DATA_FULL = 3, |
81 | | }; |
82 | | |
83 | | class HyperLogLog { |
84 | | public: |
85 | 5.16k | HyperLogLog() = default; |
86 | 0 | explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) { |
87 | 0 | _hash_set.emplace(hash_value); |
88 | 0 | } |
89 | | explicit HyperLogLog(const Slice& src); |
90 | | |
91 | 6.19k | HyperLogLog(const HyperLogLog& other) { |
92 | 6.19k | this->_type = other._type; |
93 | 6.19k | switch (other._type) { |
94 | 1.02k | case HLL_DATA_EMPTY: |
95 | 1.02k | break; |
96 | 5.16k | case HLL_DATA_EXPLICIT: { |
97 | 5.16k | this->_hash_set = other._hash_set; |
98 | 5.16k | break; |
99 | 0 | } |
100 | 0 | case HLL_DATA_SPARSE: |
101 | 0 | case HLL_DATA_FULL: { |
102 | 0 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
103 | 0 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
104 | 0 | break; |
105 | 0 | } |
106 | 0 | default: |
107 | 0 | break; |
108 | 6.19k | } |
109 | 6.19k | } |
110 | | |
111 | 9.27k | HyperLogLog(HyperLogLog&& other) noexcept { |
112 | 9.27k | this->_type = other._type; |
113 | 9.27k | switch (other._type) { |
114 | 2.04k | case HLL_DATA_EMPTY: |
115 | 2.04k | break; |
116 | 7.22k | case HLL_DATA_EXPLICIT: { |
117 | 7.22k | this->_hash_set = std::move(other._hash_set); |
118 | 7.22k | other._type = HLL_DATA_EMPTY; |
119 | 7.22k | break; |
120 | 0 | } |
121 | 0 | case HLL_DATA_SPARSE: |
122 | 0 | case HLL_DATA_FULL: { |
123 | 0 | this->_registers = other._registers; |
124 | 0 | other._registers = nullptr; |
125 | 0 | other._type = HLL_DATA_EMPTY; |
126 | 0 | break; |
127 | 0 | } |
128 | 0 | default: |
129 | 0 | break; |
130 | 9.27k | } |
131 | 9.27k | } |
132 | | |
133 | 0 | HyperLogLog& operator=(HyperLogLog&& other) noexcept { |
134 | 0 | if (this != &other) { |
135 | 0 | if (_registers != nullptr) { |
136 | 0 | delete[] _registers; |
137 | 0 | _registers = nullptr; |
138 | 0 | } |
139 | |
|
140 | 0 | this->_type = other._type; |
141 | 0 | switch (other._type) { |
142 | 0 | case HLL_DATA_EMPTY: |
143 | 0 | break; |
144 | 0 | case HLL_DATA_EXPLICIT: { |
145 | 0 | this->_hash_set = std::move(other._hash_set); |
146 | 0 | other._type = HLL_DATA_EMPTY; |
147 | 0 | break; |
148 | 0 | } |
149 | 0 | case HLL_DATA_SPARSE: |
150 | 0 | case HLL_DATA_FULL: { |
151 | 0 | this->_registers = other._registers; |
152 | 0 | other._registers = nullptr; |
153 | 0 | other._type = HLL_DATA_EMPTY; |
154 | 0 | break; |
155 | 0 | } |
156 | 0 | default: |
157 | 0 | break; |
158 | 0 | } |
159 | 0 | } |
160 | 0 | return *this; |
161 | 0 | } |
162 | | |
163 | 0 | HyperLogLog& operator=(const HyperLogLog& other) { |
164 | 0 | if (this != &other) { |
165 | 0 | if (_registers != nullptr) { |
166 | 0 | delete[] _registers; |
167 | 0 | _registers = nullptr; |
168 | 0 | } |
169 | |
|
170 | 0 | this->_type = other._type; |
171 | 0 | switch (other._type) { |
172 | 0 | case HLL_DATA_EMPTY: |
173 | 0 | break; |
174 | 0 | case HLL_DATA_EXPLICIT: { |
175 | 0 | this->_hash_set = other._hash_set; |
176 | 0 | break; |
177 | 0 | } |
178 | 0 | case HLL_DATA_SPARSE: |
179 | 0 | case HLL_DATA_FULL: { |
180 | 0 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
181 | 0 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
182 | 0 | break; |
183 | 0 | } |
184 | 0 | default: |
185 | 0 | break; |
186 | 0 | } |
187 | 0 | } |
188 | 0 | return *this; |
189 | 0 | } |
190 | | |
191 | 21.6k | ~HyperLogLog() { clear(); } |
192 | 21.6k | void clear() { |
193 | 21.6k | _type = HLL_DATA_EMPTY; |
194 | 21.6k | _hash_set.clear(); |
195 | 21.6k | delete[] _registers; |
196 | 21.6k | _registers = nullptr; |
197 | 21.6k | } |
198 | | |
199 | | using SetTypeValueType = uint8_t; |
200 | | using SparseLengthValueType = int32_t; |
201 | | using SparseIndexType = uint16_t; |
202 | | using SparseValueType = uint8_t; |
203 | | |
204 | | // Add a hash value to this HLL value |
205 | | // NOTE: input must be a hash_value |
206 | | void update(uint64_t hash_value); |
207 | | |
208 | | void merge(const HyperLogLog& other); |
209 | | |
210 | | // Return max size of serialized binary |
211 | | size_t max_serialized_size() const; |
212 | | |
213 | 0 | size_t memory_consumed() const { |
214 | 0 | size_t size = sizeof(*this); |
215 | 0 | if (_type == HLL_DATA_EXPLICIT) { |
216 | 0 | size += _hash_set.size() * sizeof(uint64_t); |
217 | 0 | } else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) { |
218 | 0 | size += HLL_REGISTERS_COUNT; |
219 | 0 | } |
220 | 0 | return size; |
221 | 0 | } |
222 | | |
223 | | // Input slice should has enough capacity for serialize, which |
224 | | // can be get through max_serialized_size(). If insufficient buffer |
225 | | // is given, this will cause process crash. |
226 | | // Return actual size of serialized binary. |
227 | | size_t serialize(uint8_t* dst) const; |
228 | | |
229 | | // Now, only empty HLL support this function. |
230 | | bool deserialize(const Slice& slice); |
231 | | |
232 | | int64_t estimate_cardinality() const; |
233 | | |
234 | 0 | static HyperLogLog empty() { return HyperLogLog {}; } |
235 | | |
236 | | // Check if input slice is a valid serialized binary of HyperLogLog. |
237 | | // This function only check the encoded type in slice, whose complex |
238 | | // function is O(1). |
239 | | static bool is_valid(const Slice& slice); |
240 | | |
241 | | // only for debug |
242 | 0 | std::string to_string() { |
243 | 0 | switch (_type) { |
244 | 0 | case HLL_DATA_EMPTY: |
245 | 0 | return {}; |
246 | 0 | case HLL_DATA_EXPLICIT: |
247 | 0 | case HLL_DATA_SPARSE: |
248 | 0 | case HLL_DATA_FULL: { |
249 | 0 | std::string str {"hash set size: "}; |
250 | 0 | str.append(std::to_string(_hash_set.size())); |
251 | 0 | str.append("\ncardinality:\t"); |
252 | 0 | str.append(std::to_string(estimate_cardinality())); |
253 | 0 | str.append("\ntype:\t"); |
254 | 0 | str.append(std::to_string(_type)); |
255 | 0 | return str; |
256 | 0 | } |
257 | 0 | default: |
258 | 0 | return {}; |
259 | 0 | } |
260 | 0 | } |
261 | | |
262 | | private: |
263 | | void _convert_explicit_to_register(); |
264 | | |
265 | | // update one hash value into this registers |
266 | 67.9k | void _update_registers(uint64_t hash_value) { |
267 | | // Use the lower bits to index into the number of streams and then |
268 | | // find the first 1 bit after the index bits. |
269 | 67.9k | int idx = hash_value % HLL_REGISTERS_COUNT; |
270 | 67.9k | hash_value >>= HLL_COLUMN_PRECISION; |
271 | | // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1 |
272 | 67.9k | hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS); |
273 | 67.9k | uint8_t first_one_bit = __builtin_ctzl(hash_value) + 1; |
274 | 67.9k | _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]); |
275 | 67.9k | } |
276 | | |
277 | | // absorb other registers into this registers |
278 | 3 | void _merge_registers(const uint8_t* other_registers) { |
279 | | #ifdef __AVX2__ |
280 | | int loop = HLL_REGISTERS_COUNT / 32; // 32 = 256/8 |
281 | | uint8_t* dst = _registers; |
282 | | const uint8_t* src = other_registers; |
283 | | for (int i = 0; i < loop; i++) { |
284 | | __m256i xa = _mm256_loadu_si256((const __m256i*)dst); |
285 | | __m256i xb = _mm256_loadu_si256((const __m256i*)src); |
286 | | _mm256_storeu_si256((__m256i*)dst, _mm256_max_epu8(xa, xb)); |
287 | | src += 32; |
288 | | dst += 32; |
289 | | } |
290 | | #else |
291 | 49.1k | for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
292 | 49.1k | _registers[i] = |
293 | 49.1k | (_registers[i] < other_registers[i] ? other_registers[i] : _registers[i]); |
294 | 49.1k | } |
295 | 3 | #endif |
296 | 3 | } |
297 | | |
298 | | HllDataType _type = HLL_DATA_EMPTY; |
299 | | vectorized::flat_hash_set<uint64_t> _hash_set; |
300 | | |
301 | | // This field is much space consuming(HLL_REGISTERS_COUNT), we create |
302 | | // it only when it is really needed. |
303 | | uint8_t* _registers = nullptr; |
304 | | }; |
305 | | |
306 | | // todo(kks): remove this when dpp_sink class was removed |
307 | | class HllSetResolver { |
308 | | public: |
309 | | HllSetResolver() = default; |
310 | | |
311 | | ~HllSetResolver() = default; |
312 | | |
313 | | using SetTypeValueType = uint8_t; |
314 | | using ExplicitLengthValueType = uint8_t; |
315 | | using SparseLengthValueType = int32_t; |
316 | | using SparseIndexType = uint16_t; |
317 | | using SparseValueType = uint8_t; |
318 | | |
319 | | // only save pointer |
320 | 0 | void init(char* buf, int len) { |
321 | 0 | this->_buf_ref = buf; |
322 | 0 | this->_buf_len = len; |
323 | 0 | } |
324 | | |
325 | | // hll set type |
326 | 0 | HllDataType get_hll_data_type() { return _set_type; } |
327 | | |
328 | | // explicit value num |
329 | 0 | int get_explicit_count() const { return (int)_explicit_num; } |
330 | | |
331 | | // get explicit index value 64bit |
332 | 0 | uint64_t get_explicit_value(int index) { |
333 | 0 | if (index >= _explicit_num) { |
334 | 0 | return -1; |
335 | 0 | } |
336 | 0 | return _explicit_value[index]; |
337 | 0 | } |
338 | | |
339 | | // get full register value |
340 | 0 | char* get_full_value() { return _full_value_position; } |
341 | | |
342 | | // get (index, value) map |
343 | 0 | std::map<SparseIndexType, SparseValueType>& get_sparse_map() { return _sparse_map; } |
344 | | |
345 | | // parse set , call after copy() or init() |
346 | | void parse(); |
347 | | |
348 | | private: |
349 | | char* _buf_ref = nullptr; // set |
350 | | int _buf_len {}; // set len |
351 | | HllDataType _set_type {}; //set type |
352 | | char* _full_value_position = nullptr; |
353 | | uint64_t* _explicit_value = nullptr; |
354 | | ExplicitLengthValueType _explicit_num {}; |
355 | | std::map<SparseIndexType, SparseValueType> _sparse_map; |
356 | | SparseLengthValueType* _sparse_count; |
357 | | }; |
358 | | |
359 | | } // namespace doris |