/root/doris/be/src/olap/hll.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <cstdint> |
21 | | #include <cstring> |
22 | | #include <map> |
23 | | #include <string> |
24 | | #include <utility> |
25 | | |
26 | | #include "vec/common/hash_table/phmap_fwd_decl.h" |
27 | | |
28 | | namespace doris { |
29 | | #include "common/compile_check_begin.h" |
30 | | struct Slice; |
31 | | |
32 | | inline const int HLL_COLUMN_PRECISION = 14; |
33 | | inline const int HLL_ZERO_COUNT_BITS = (64 - HLL_COLUMN_PRECISION); |
34 | | inline const int HLL_EXPLICIT_INT64_NUM = 160; |
35 | | inline const int HLL_SPARSE_THRESHOLD = 4096; |
36 | | inline const uint16_t HLL_REGISTERS_COUNT = 16 * 1024; |
37 | | // maximum size in byte of serialized HLL: type(1) + registers (2^14) |
38 | | inline const int HLL_COLUMN_DEFAULT_LEN = HLL_REGISTERS_COUNT + 1; |
39 | | |
40 | | // 1 for type; 1 for hash values count; 8 for hash value |
41 | | inline const int HLL_SINGLE_VALUE_SIZE = 10; |
42 | | inline const int HLL_EMPTY_SIZE = 1; |
43 | | |
44 | | // Hyperloglog distinct estimate algorithm. |
45 | | // See these papers for more details. |
46 | | // 1) Hyperloglog: The analysis of a near-optimal cardinality estimation |
47 | | // algorithm (2007) |
48 | | // 2) HyperLogLog in Practice (paper from google with some improvements) |
49 | | |
50 | | // Each HLL value is a set of values. To save space, Doris store HLL value |
51 | | // in different format according to its cardinality. |
52 | | // |
53 | | // HLL_DATA_EMPTY: when set is empty. |
54 | | // |
55 | | // HLL_DATA_EXPLICIT: when there is only few values in set, store these values explicit. |
56 | | // If the number of hash values is not greater than 160, set is encoded in this format. |
57 | | // The max space occupied is (1 + 1 + 160 * 8) = 1282. I don't know why 160 is chosen, |
58 | | // maybe can be other number. If you are interested, you can try other number and see |
59 | | // if it will be better. |
60 | | // |
61 | | // HLL_DATA_SPARSE: only store non-zero registers. If the number of non-zero registers |
62 | | // is not greater than 4096, set is encoded in this format. The max space occupied is |
63 | | // (1 + 4 + 3 * 4096) = 12293. |
64 | | // |
65 | | // HLL_DATA_FULL: most space-consuming, store all registers |
66 | | // |
67 | | // A HLL value will change in the sequence empty -> explicit -> sparse -> full, and not |
68 | | // allow reverse. |
69 | | // |
70 | | // NOTE: This values are persisted in storage devices, so don't change exist |
71 | | // enum values. |
72 | | enum HllDataType { |
73 | | HLL_DATA_EMPTY = 0, |
74 | | HLL_DATA_EXPLICIT = 1, |
75 | | HLL_DATA_SPARSE = 2, |
76 | | HLL_DATA_FULL = 3, |
77 | | }; |
78 | | |
79 | | class HyperLogLog { |
80 | | public: |
81 | 802k | HyperLogLog() = default; |
82 | | explicit HyperLogLog(uint64_t hash_value) : _type(HLL_DATA_EXPLICIT) { |
83 | | _hash_set.emplace(hash_value); |
84 | | } |
85 | | explicit HyperLogLog(const Slice& src); |
86 | | |
87 | 58.7k | HyperLogLog(const HyperLogLog& other) { |
88 | 58.7k | this->_type = other._type; |
89 | 58.7k | switch (other._type) { |
90 | 1.77k | case HLL_DATA_EMPTY: |
91 | 1.77k | break; |
92 | 56.9k | case HLL_DATA_EXPLICIT: { |
93 | 56.9k | this->_hash_set = other._hash_set; |
94 | 56.9k | break; |
95 | 0 | } |
96 | 2 | case HLL_DATA_SPARSE: |
97 | 8 | case HLL_DATA_FULL: { |
98 | 8 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
99 | 8 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
100 | 8 | break; |
101 | 2 | } |
102 | 0 | default: |
103 | 0 | break; |
104 | 58.7k | } |
105 | 58.7k | } |
106 | | |
107 | 55.1k | HyperLogLog(HyperLogLog&& other) noexcept { |
108 | 55.1k | this->_type = other._type; |
109 | 55.1k | switch (other._type) { |
110 | 3.00k | case HLL_DATA_EMPTY: |
111 | 3.00k | break; |
112 | 52.1k | case HLL_DATA_EXPLICIT: { |
113 | 52.1k | this->_hash_set = std::move(other._hash_set); |
114 | 52.1k | other._type = HLL_DATA_EMPTY; |
115 | 52.1k | break; |
116 | 0 | } |
117 | 2 | case HLL_DATA_SPARSE: |
118 | 8 | case HLL_DATA_FULL: { |
119 | 8 | this->_registers = other._registers; |
120 | 8 | other._registers = nullptr; |
121 | 8 | other._type = HLL_DATA_EMPTY; |
122 | 8 | break; |
123 | 2 | } |
124 | 0 | default: |
125 | 0 | break; |
126 | 55.1k | } |
127 | 55.1k | } |
128 | | |
129 | 0 | HyperLogLog& operator=(HyperLogLog&& other) noexcept { |
130 | 0 | if (this != &other) { |
131 | 0 | if (_registers != nullptr) { |
132 | 0 | delete[] _registers; |
133 | 0 | _registers = nullptr; |
134 | 0 | } |
135 | |
|
136 | 0 | this->_type = other._type; |
137 | 0 | switch (other._type) { |
138 | 0 | case HLL_DATA_EMPTY: |
139 | 0 | break; |
140 | 0 | case HLL_DATA_EXPLICIT: { |
141 | 0 | this->_hash_set = std::move(other._hash_set); |
142 | 0 | other._type = HLL_DATA_EMPTY; |
143 | 0 | break; |
144 | 0 | } |
145 | 0 | case HLL_DATA_SPARSE: |
146 | 0 | case HLL_DATA_FULL: { |
147 | 0 | this->_registers = other._registers; |
148 | 0 | other._registers = nullptr; |
149 | 0 | other._type = HLL_DATA_EMPTY; |
150 | 0 | break; |
151 | 0 | } |
152 | 0 | default: |
153 | 0 | break; |
154 | 0 | } |
155 | 0 | } |
156 | 0 | return *this; |
157 | 0 | } |
158 | | |
159 | 1.75k | HyperLogLog& operator=(const HyperLogLog& other) { |
160 | 1.75k | if (this != &other) { |
161 | 1.71k | if (_registers != nullptr) { |
162 | 0 | delete[] _registers; |
163 | 0 | _registers = nullptr; |
164 | 0 | } |
165 | | |
166 | 1.71k | this->_type = other._type; |
167 | 1.71k | switch (other._type) { |
168 | 55 | case HLL_DATA_EMPTY: |
169 | 55 | break; |
170 | 1.65k | case HLL_DATA_EXPLICIT: { |
171 | 1.65k | this->_hash_set = other._hash_set; |
172 | 1.65k | break; |
173 | 0 | } |
174 | 0 | case HLL_DATA_SPARSE: |
175 | 0 | case HLL_DATA_FULL: { |
176 | 0 | _registers = new uint8_t[HLL_REGISTERS_COUNT]; |
177 | 0 | memcpy(_registers, other._registers, HLL_REGISTERS_COUNT); |
178 | 0 | break; |
179 | 0 | } |
180 | 0 | default: |
181 | 0 | break; |
182 | 1.71k | } |
183 | 1.71k | } |
184 | 1.75k | return *this; |
185 | 1.75k | } |
186 | | |
187 | 907k | ~HyperLogLog() { clear(); } |
188 | 907k | void clear() { |
189 | 907k | _type = HLL_DATA_EMPTY; |
190 | 907k | _hash_set.clear(); |
191 | 907k | delete[] _registers; |
192 | 907k | _registers = nullptr; |
193 | 907k | } |
194 | | |
195 | | using SetTypeValueType = uint8_t; |
196 | | using SparseLengthValueType = int32_t; |
197 | | using SparseIndexType = uint16_t; |
198 | | using SparseValueType = uint8_t; |
199 | | |
200 | | // Add a hash value to this HLL value |
201 | | // NOTE: input must be a hash_value |
202 | | void update(uint64_t hash_value); |
203 | | |
204 | | void merge(const HyperLogLog& other); |
205 | | |
206 | | // Return max size of serialized binary |
207 | | size_t max_serialized_size() const; |
208 | | |
209 | 0 | size_t memory_consumed() const { |
210 | 0 | size_t size = sizeof(*this); |
211 | 0 | if (_type == HLL_DATA_EXPLICIT) { |
212 | 0 | size += _hash_set.size() * sizeof(uint64_t); |
213 | 0 | } else if (_type == HLL_DATA_SPARSE || _type == HLL_DATA_FULL) { |
214 | 0 | size += HLL_REGISTERS_COUNT; |
215 | 0 | } |
216 | 0 | return size; |
217 | 0 | } |
218 | | |
219 | | // Input slice should has enough capacity for serialize, which |
220 | | // can be get through max_serialized_size(). If insufficient buffer |
221 | | // is given, this will cause process crash. |
222 | | // Return actual size of serialized binary. |
223 | | size_t serialize(uint8_t* dst) const; |
224 | | |
225 | | // Now, only empty HLL support this function. |
226 | | bool deserialize(const Slice& slice); |
227 | | |
228 | | int64_t estimate_cardinality() const; |
229 | | |
230 | 21 | static HyperLogLog empty() { return HyperLogLog {}; } |
231 | | |
232 | | // Check if input slice is a valid serialized binary of HyperLogLog. |
233 | | // This function only check the encoded type in slice, whose complex |
234 | | // function is O(1). |
235 | | static bool is_valid(const Slice& slice); |
236 | | |
237 | | // only for debug |
238 | | std::string to_string() const { |
239 | | switch (_type) { |
240 | | case HLL_DATA_EMPTY: |
241 | | return {}; |
242 | | case HLL_DATA_EXPLICIT: |
243 | | case HLL_DATA_SPARSE: |
244 | | case HLL_DATA_FULL: { |
245 | | std::string str {"hash set size: "}; |
246 | | str.append(std::to_string(_hash_set.size())); |
247 | | str.append("\ncardinality:\t"); |
248 | | str.append(std::to_string(estimate_cardinality())); |
249 | | str.append("\ntype:\t"); |
250 | | str.append(std::to_string(_type)); |
251 | | return str; |
252 | | } |
253 | | default: |
254 | | return {}; |
255 | | } |
256 | | } |
257 | | |
258 | | private: |
259 | | void _convert_explicit_to_register(); |
260 | | |
261 | | // update one hash value into this registers |
262 | 49.6M | void _update_registers(uint64_t hash_value) { |
263 | | // Use the lower bits to index into the number of streams and then |
264 | | // find the first 1 bit after the index bits. |
265 | 49.6M | int idx = hash_value % HLL_REGISTERS_COUNT; |
266 | 49.6M | hash_value >>= HLL_COLUMN_PRECISION; |
267 | | // make sure max first_one_bit is HLL_ZERO_COUNT_BITS + 1 |
268 | 49.6M | hash_value |= ((uint64_t)1 << HLL_ZERO_COUNT_BITS); |
269 | 49.6M | auto first_one_bit = uint8_t(__builtin_ctzl(hash_value) + 1); |
270 | 49.6M | _registers[idx] = (_registers[idx] < first_one_bit ? first_one_bit : _registers[idx]); |
271 | 49.6M | } |
272 | | |
273 | | // absorb other registers into this registers |
274 | 3 | void _merge_registers(const uint8_t* other_registers) { |
275 | 3 | _do_simd_merge(_registers, other_registers); |
276 | 3 | } |
277 | | |
278 | 3 | void _do_simd_merge(uint8_t* __restrict registers, const uint8_t* __restrict other_registers) { |
279 | 49.1k | for (int i = 0; i < HLL_REGISTERS_COUNT; ++i) { |
280 | 49.1k | registers[i] = (registers[i] < other_registers[i] ? other_registers[i] : registers[i]); |
281 | 49.1k | } |
282 | 3 | } |
283 | | |
284 | | HllDataType _type = HLL_DATA_EMPTY; |
285 | | vectorized::flat_hash_set<uint64_t> _hash_set; |
286 | | |
287 | | // This field is much space consuming(HLL_REGISTERS_COUNT), we create |
288 | | // it only when it is really needed. |
289 | | uint8_t* _registers = nullptr; |
290 | | }; |
291 | | #include "common/compile_check_end.h" |
292 | | } // namespace doris |