be/src/exec/common/hash_table/hash.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/Hash.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <type_traits> |
24 | | |
25 | | #include "core/extended_types.h" |
26 | | #include "core/string_ref.h" |
27 | | #include "core/types.h" |
28 | | #include "core/uint128.h" |
29 | | #include "parallel_hashmap/phmap_utils.h" |
30 | | |
31 | | // Here is an empirical value. |
32 | | static constexpr size_t HASH_MAP_PREFETCH_DIST = 16; |
33 | | |
34 | | /** Hash functions that are better than the trivial function std::hash. |
35 | | * |
36 | | * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times. |
37 | | * This is because of following reasons: |
38 | | * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits; |
39 | | * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits; |
40 | | * - traffic is non-uniformly distributed across a day; |
41 | | * - we are using open-addressing linear probing hash tables that are most critical to hash function quality, |
42 | | * and trivial hash function gives disastrous results. |
43 | | */ |
44 | | |
45 | | /** Taken from MurmurHash. This is Murmur finalizer. |
46 | | * Faster than int_hash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID. |
47 | | */ |
48 | 57 | inline doris::UInt64 int_hash64(doris::UInt64 x) { |
49 | 57 | x ^= x >> 33; |
50 | 57 | x *= 0xff51afd7ed558ccdULL; |
51 | 57 | x ^= x >> 33; |
52 | 57 | x *= 0xc4ceb9fe1a85ec53ULL; |
53 | 57 | x ^= x >> 33; |
54 | | |
55 | 57 | return x; |
56 | 57 | } |
57 | | |
58 | | /** CRC32C is not very high-quality as a hash function, |
59 | | * according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits, |
60 | | * but can behave well when used in hash tables, |
61 | | * due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle). |
62 | | * Works only with SSE 4.2 support. |
63 | | */ |
64 | | #include "util/sse_util.hpp" |
65 | | |
66 | 4.07M | inline doris::UInt64 int_hash_crc32(doris::UInt64 x) { |
67 | 4.07M | #if defined(__SSE4_2__) || (defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)) |
68 | 4.07M | return _mm_crc32_u64(-1ULL, x); |
69 | | #else |
70 | | /// On other platforms we do not have CRC32. NOTE This can be confusing. |
71 | | return int_hash64(x); |
72 | | #endif |
73 | 4.07M | } |
74 | | |
75 | | template <typename T> |
76 | 57 | inline size_t default_hash64(T key) { |
77 | 57 | union { |
78 | 57 | T in; |
79 | 57 | doris::UInt64 out; |
80 | 57 | } u; |
81 | 57 | u.out = 0; |
82 | 57 | u.in = key; |
83 | 57 | return int_hash64(u.out); |
84 | 57 | } Unexecuted instantiation: _Z14default_hash64IhEmT_ Unexecuted instantiation: _Z14default_hash64IaEmT_ Unexecuted instantiation: _Z14default_hash64IsEmT_ Line | Count | Source | 76 | 13 | inline size_t default_hash64(T key) { | 77 | 13 | union { | 78 | 13 | T in; | 79 | 13 | doris::UInt64 out; | 80 | 13 | } u; | 81 | 13 | u.out = 0; | 82 | 13 | u.in = key; | 83 | 13 | return int_hash64(u.out); | 84 | 13 | } |
Unexecuted instantiation: _Z14default_hash64IlEmT_ Line | Count | Source | 76 | 41 | inline size_t default_hash64(T key) { | 77 | 41 | union { | 78 | 41 | T in; | 79 | 41 | doris::UInt64 out; | 80 | 41 | } u; | 81 | 41 | u.out = 0; | 82 | 41 | u.in = key; | 83 | 41 | return int_hash64(u.out); | 84 | 41 | } |
Unexecuted instantiation: _Z14default_hash64IfEmT_ Line | Count | Source | 76 | 3 | inline size_t default_hash64(T key) { | 77 | 3 | union { | 78 | 3 | T in; | 79 | 3 | doris::UInt64 out; | 80 | 3 | } u; | 81 | 3 | u.out = 0; | 82 | 3 | u.in = key; | 83 | 3 | return int_hash64(u.out); | 84 | 3 | } |
Unexecuted instantiation: _Z14default_hash64IjEmT_ Unexecuted instantiation: _Z14default_hash64IoEmT_ |
85 | | |
86 | | template <typename T, typename Enable = void> |
87 | | struct DefaultHash; |
88 | | |
89 | | template <typename T> |
90 | | requires std::is_arithmetic_v<T> |
91 | | struct DefaultHash<T> { |
92 | 57 | size_t operator()(T key) const { return default_hash64<T>(key); }Unexecuted instantiation: _ZNK11DefaultHashIhvEclEh Unexecuted instantiation: _ZNK11DefaultHashIavEclEa Unexecuted instantiation: _ZNK11DefaultHashIsvEclEs _ZNK11DefaultHashIivEclEi Line | Count | Source | 92 | 13 | size_t operator()(T key) const { return default_hash64<T>(key); } |
Unexecuted instantiation: _ZNK11DefaultHashIlvEclEl _ZNK11DefaultHashInvEclEn Line | Count | Source | 92 | 41 | size_t operator()(T key) const { return default_hash64<T>(key); } |
Unexecuted instantiation: _ZNK11DefaultHashIfvEclEf _ZNK11DefaultHashIdvEclEd Line | Count | Source | 92 | 3 | size_t operator()(T key) const { return default_hash64<T>(key); } |
Unexecuted instantiation: _ZNK11DefaultHashIjvEclEj Unexecuted instantiation: _ZNK11DefaultHashIovEclEo |
93 | | }; |
94 | | |
95 | | template <> |
96 | | struct DefaultHash<doris::VecDateTimeValue> { |
97 | 0 | size_t operator()(doris::VecDateTimeValue key) const { return int_hash64(*(int64_t*)&key); } |
98 | | }; |
99 | | |
100 | | template <> |
101 | | struct DefaultHash<doris::DateV2Value<doris::DateTimeV2ValueType>> { |
102 | 0 | size_t operator()(doris::DateV2Value<doris::DateTimeV2ValueType> key) const { |
103 | 0 | return int_hash64(key.to_date_int_val()); |
104 | 0 | } |
105 | | }; |
106 | | |
107 | | template <> |
108 | | struct DefaultHash<doris::DateV2Value<doris::DateV2ValueType>> { |
109 | 0 | size_t operator()(doris::DateV2Value<doris::DateV2ValueType> key) const { |
110 | 0 | return int_hash64(key.to_date_int_val()); |
111 | 0 | } |
112 | | }; |
113 | | |
114 | | template <> |
115 | | struct DefaultHash<doris::TimestampTzValue> { |
116 | 0 | size_t operator()(doris::TimestampTzValue key) const { |
117 | 0 | return int_hash64(key.to_date_int_val()); |
118 | 0 | } |
119 | | }; |
120 | | |
121 | | template <> |
122 | | struct DefaultHash<doris::StringRef> : public doris::StringRefHash {}; |
123 | | |
124 | | template <> |
125 | | struct DefaultHash<wide::Int256> : public std::hash<wide::Int256> {}; |
126 | | |
127 | | template <typename T> |
128 | | struct HashCRC32; |
129 | | |
130 | | template <typename T> |
131 | 4.07M | inline size_t hash_crc32(T key) { |
132 | 4.07M | union { |
133 | 4.07M | T in; |
134 | 4.07M | doris::UInt64 out; |
135 | 4.07M | } u; |
136 | 4.07M | u.out = 0; |
137 | 4.07M | u.in = key; |
138 | 4.07M | return int_hash_crc32(u.out); |
139 | 4.07M | } Line | Count | Source | 131 | 8.50k | inline size_t hash_crc32(T key) { | 132 | 8.50k | union { | 133 | 8.50k | T in; | 134 | 8.50k | doris::UInt64 out; | 135 | 8.50k | } u; | 136 | 8.50k | u.out = 0; | 137 | 8.50k | u.in = key; | 138 | 8.50k | return int_hash_crc32(u.out); | 139 | 8.50k | } |
Line | Count | Source | 131 | 29.0k | inline size_t hash_crc32(T key) { | 132 | 29.0k | union { | 133 | 29.0k | T in; | 134 | 29.0k | doris::UInt64 out; | 135 | 29.0k | } u; | 136 | 29.0k | u.out = 0; | 137 | 29.0k | u.in = key; | 138 | 29.0k | return int_hash_crc32(u.out); | 139 | 29.0k | } |
Line | Count | Source | 131 | 8.50k | inline size_t hash_crc32(T key) { | 132 | 8.50k | union { | 133 | 8.50k | T in; | 134 | 8.50k | doris::UInt64 out; | 135 | 8.50k | } u; | 136 | 8.50k | u.out = 0; | 137 | 8.50k | u.in = key; | 138 | 8.50k | return int_hash_crc32(u.out); | 139 | 8.50k | } |
Line | Count | Source | 131 | 18.7k | inline size_t hash_crc32(T key) { | 132 | 18.7k | union { | 133 | 18.7k | T in; | 134 | 18.7k | doris::UInt64 out; | 135 | 18.7k | } u; | 136 | 18.7k | u.out = 0; | 137 | 18.7k | u.in = key; | 138 | 18.7k | return int_hash_crc32(u.out); | 139 | 18.7k | } |
Unexecuted instantiation: _Z10hash_crc32IlEmT_ Line | Count | Source | 131 | 37.4k | inline size_t hash_crc32(T key) { | 132 | 37.4k | union { | 133 | 37.4k | T in; | 134 | 37.4k | doris::UInt64 out; | 135 | 37.4k | } u; | 136 | 37.4k | u.out = 0; | 137 | 37.4k | u.in = key; | 138 | 37.4k | return int_hash_crc32(u.out); | 139 | 37.4k | } |
Line | Count | Source | 131 | 3.97M | inline size_t hash_crc32(T key) { | 132 | 3.97M | union { | 133 | 3.97M | T in; | 134 | 3.97M | doris::UInt64 out; | 135 | 3.97M | } u; | 136 | 3.97M | u.out = 0; | 137 | 3.97M | u.in = key; | 138 | 3.97M | return int_hash_crc32(u.out); | 139 | 3.97M | } |
Unexecuted instantiation: _Z10hash_crc32IiEmT_ Unexecuted instantiation: _Z10hash_crc32IfEmT_ Unexecuted instantiation: _Z10hash_crc32IdEmT_ |
140 | | |
141 | | template <> |
142 | 31.5k | inline size_t hash_crc32(doris::UInt128 u) { |
143 | 31.5k | return doris::UInt128HashCRC32()(u); |
144 | 31.5k | } |
145 | | |
146 | | template <> |
147 | 0 | inline size_t hash_crc32(unsigned __int128 u) { |
148 | 0 | return doris::UInt128HashCRC32()(u); |
149 | 0 | } |
150 | | |
151 | | template <> |
152 | 0 | inline size_t hash_crc32(doris::Int128 u) { |
153 | 0 | return doris::UInt128HashCRC32()({(u >> 64) & int64_t(-1), u & int64_t(-1)}); |
154 | 0 | } |
155 | | |
156 | | template <> |
157 | 0 | inline size_t hash_crc32(doris::VecDateTimeValue u) { |
158 | 0 | return hash_crc32(*(int64_t*)&u); |
159 | 0 | } |
160 | | |
161 | | template <> |
162 | 0 | inline size_t hash_crc32(doris::DateV2Value<doris::DateTimeV2ValueType> u) { |
163 | 0 | return hash_crc32(u.to_date_int_val()); |
164 | 0 | } |
165 | | |
166 | | template <> |
167 | 0 | inline size_t hash_crc32(doris::DateV2Value<doris::DateV2ValueType> u) { |
168 | 0 | return hash_crc32(u.to_date_int_val()); |
169 | 0 | } |
170 | | |
171 | | template <> |
172 | 0 | inline size_t hash_crc32(doris::TimestampTzValue u) { |
173 | 0 | return hash_crc32(u.to_date_int_val()); |
174 | 0 | } |
175 | | |
176 | | #define DEFINE_HASH(T) \ |
177 | | template <> \ |
178 | | struct HashCRC32<T> { \ |
179 | 4.10M | size_t operator()(T key) const { \ |
180 | 4.10M | return hash_crc32<T>(key); \ |
181 | 4.10M | } \ Line | Count | Source | 179 | 8.50k | size_t operator()(T key) const { \ | 180 | 8.50k | return hash_crc32<T>(key); \ | 181 | 8.50k | } \ |
Line | Count | Source | 179 | 29.0k | size_t operator()(T key) const { \ | 180 | 29.0k | return hash_crc32<T>(key); \ | 181 | 29.0k | } \ |
Line | Count | Source | 179 | 8.50k | size_t operator()(T key) const { \ | 180 | 8.50k | return hash_crc32<T>(key); \ | 181 | 8.50k | } \ |
Line | Count | Source | 179 | 18.7k | size_t operator()(T key) const { \ | 180 | 18.7k | return hash_crc32<T>(key); \ | 181 | 18.7k | } \ |
Line | Count | Source | 179 | 3.97M | size_t operator()(T key) const { \ | 180 | 3.97M | return hash_crc32<T>(key); \ | 181 | 3.97M | } \ |
Line | Count | Source | 179 | 37.4k | size_t operator()(T key) const { \ | 180 | 37.4k | return hash_crc32<T>(key); \ | 181 | 37.4k | } \ |
_ZNK9HashCRC32IN4wide7integerILm128EjEEEclES2_ Line | Count | Source | 179 | 31.5k | size_t operator()(T key) const { \ | 180 | 31.5k | return hash_crc32<T>(key); \ | 181 | 31.5k | } \ |
Unexecuted instantiation: _ZNK9HashCRC32IiEclEi Unexecuted instantiation: _ZNK9HashCRC32IlEclEl Unexecuted instantiation: _ZNK9HashCRC32InEclEn Unexecuted instantiation: _ZNK9HashCRC32IfEclEf Unexecuted instantiation: _ZNK9HashCRC32IdEclEd Unexecuted instantiation: _ZNK9HashCRC32IN5doris16VecDateTimeValueEEclES1_ Unexecuted instantiation: _ZNK9HashCRC32IN5doris11DateV2ValueINS0_19DateTimeV2ValueTypeEEEEclES3_ Unexecuted instantiation: _ZNK9HashCRC32IN5doris11DateV2ValueINS0_15DateV2ValueTypeEEEEclES3_ Unexecuted instantiation: _ZNK9HashCRC32IN5doris16TimestampTzValueEEclES1_ Unexecuted instantiation: _ZNK9HashCRC32IoEclEo |
182 | | }; |
183 | | |
184 | | DEFINE_HASH(doris::UInt8) |
185 | | DEFINE_HASH(doris::UInt16) |
186 | | DEFINE_HASH(doris::UInt32) |
187 | | DEFINE_HASH(doris::UInt64) |
188 | | DEFINE_HASH(doris::UInt128) |
189 | | DEFINE_HASH(doris::Int8) |
190 | | DEFINE_HASH(doris::Int16) |
191 | | DEFINE_HASH(doris::Int32) |
192 | | DEFINE_HASH(doris::Int64) |
193 | | DEFINE_HASH(doris::Int128) |
194 | | DEFINE_HASH(doris::Float32) |
195 | | DEFINE_HASH(doris::Float64) |
196 | | DEFINE_HASH(doris::VecDateTimeValue) |
197 | | DEFINE_HASH(doris::DateV2Value<doris::DateTimeV2ValueType>) |
198 | | DEFINE_HASH(doris::DateV2Value<doris::DateV2ValueType>) |
199 | | DEFINE_HASH(doris::TimestampTzValue) |
200 | | DEFINE_HASH(unsigned __int128) |
201 | | |
202 | | #undef DEFINE_HASH |
203 | | |
204 | | template <typename Key, typename Hash = HashCRC32<Key>> |
205 | | struct HashMixWrapper { |
206 | 3.93M | size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); }_ZNK14HashMixWrapperIm9HashCRC32ImEEclEm Line | Count | Source | 206 | 209 | size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); } |
_ZNK14HashMixWrapperIj9HashCRC32IjEEclEj Line | Count | Source | 206 | 3.93M | size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); } |
|
207 | | }; |
208 | | |
209 | | template <> |
210 | | struct HashCRC32<doris::UInt256> { |
211 | 10.8k | size_t operator()(const doris::UInt256& x) const { |
212 | 10.8k | #if defined(__SSE4_2__) || defined(__aarch64__) |
213 | 10.8k | doris::UInt64 crc = -1ULL; |
214 | 10.8k | crc = _mm_crc32_u64(crc, x.items[0]); |
215 | 10.8k | crc = _mm_crc32_u64(crc, x.items[1]); |
216 | 10.8k | crc = _mm_crc32_u64(crc, x.items[2]); |
217 | 10.8k | crc = _mm_crc32_u64(crc, x.items[3]); |
218 | 10.8k | return crc; |
219 | | #else |
220 | | return Hash128to64({Hash128to64({x.a, x.b}), Hash128to64({x.c, x.d})}); |
221 | | #endif |
222 | 10.8k | } |
223 | | }; |
224 | | |
225 | | template <> |
226 | | struct HashCRC32<wide::Int256> { |
227 | 8.19k | size_t operator()(const wide::Int256& x) const { |
228 | 8.19k | #if defined(__SSE4_2__) || defined(__aarch64__) |
229 | 8.19k | doris::UInt64 crc = -1ULL; |
230 | 8.19k | crc = _mm_crc32_u64(crc, x.items[0]); |
231 | 8.19k | crc = _mm_crc32_u64(crc, x.items[1]); |
232 | 8.19k | crc = _mm_crc32_u64(crc, x.items[2]); |
233 | 8.19k | crc = _mm_crc32_u64(crc, x.items[3]); |
234 | 8.19k | return crc; |
235 | | #else |
236 | | return Hash128to64( |
237 | | {Hash128to64({x.items[0], x.items[1]}), Hash128to64({x.items[2], x.items[3]})}); |
238 | | #endif |
239 | 8.19k | } |
240 | | }; |
241 | | |
242 | | template <> |
243 | | struct HashCRC32<doris::Decimal256> { |
244 | 0 | size_t operator()(const doris::Decimal256& value) const { |
245 | 0 | return HashCRC32<wide::Int256>()(value.value); |
246 | 0 | } |
247 | | }; |
248 | | |
249 | | template <> |
250 | | struct HashCRC32<doris::Decimal32> { |
251 | 0 | size_t operator()(const doris::Decimal32& value) const { |
252 | 0 | return HashCRC32<int32_t>()(value.value); |
253 | 0 | } |
254 | | }; |
255 | | |
256 | | template <> |
257 | | struct HashCRC32<doris::Decimal64> { |
258 | 0 | size_t operator()(const doris::Decimal64& value) const { |
259 | 0 | return HashCRC32<int64_t>()(value.value); |
260 | 0 | } |
261 | | }; |
262 | | |
263 | | template <> |
264 | | struct HashCRC32<doris::Decimal128V3> { |
265 | 0 | size_t operator()(const doris::Decimal128V3& value) const { |
266 | 0 | return HashCRC32<__int128>()(value.value); |
267 | 0 | } |
268 | | }; |
269 | | |
270 | | template <> |
271 | | struct HashCRC32<doris::Decimal128V2> { |
272 | 0 | size_t operator()(const doris::Decimal128V2& value) const { |
273 | 0 | return HashCRC32<__int128>()(value.value); |
274 | 0 | } |
275 | | }; |
276 | | |
277 | | template <> |
278 | | struct HashCRC32<doris::DecimalV2Value> { |
279 | 0 | size_t operator()(const doris::DecimalV2Value& value) const { |
280 | 0 | return HashCRC32<__int128>()(value.value()); |
281 | 0 | } |
282 | | }; |
283 | | |
284 | | #include "common/compile_check_avoid_begin.h" |
285 | | |
286 | | template <> |
287 | | struct HashCRC32<doris::UInt72> { |
288 | 4.40k | size_t operator()(const doris::UInt72& x) const { |
289 | 4.40k | doris::UInt64 crc = -1ULL; |
290 | 4.40k | crc = _mm_crc32_u8(crc, x.a); |
291 | 4.40k | crc = _mm_crc32_u64(crc, x.b); |
292 | 4.40k | return crc; |
293 | 4.40k | } |
294 | | }; |
295 | | |
296 | | template <> |
297 | | struct HashCRC32<doris::UInt96> { |
298 | 11.0k | size_t operator()(const doris::UInt96& x) const { |
299 | 11.0k | doris::UInt64 crc = -1ULL; |
300 | 11.0k | crc = _mm_crc32_u32(crc, x.a); |
301 | 11.0k | crc = _mm_crc32_u64(crc, x.b); |
302 | 11.0k | return crc; |
303 | 11.0k | } |
304 | | }; |
305 | | |
306 | | template <> |
307 | | struct HashCRC32<doris::UInt104> { |
308 | 0 | size_t operator()(const doris::UInt104& x) const { |
309 | 0 | doris::UInt64 crc = -1ULL; |
310 | 0 | crc = _mm_crc32_u8(crc, x.a); |
311 | 0 | crc = _mm_crc32_u32(crc, x.b); |
312 | 0 | crc = _mm_crc32_u64(crc, x.c); |
313 | 0 | return crc; |
314 | 0 | } |
315 | | }; |
316 | | |
317 | | template <> |
318 | | struct HashCRC32<doris::UInt136> { |
319 | 1.32k | size_t operator()(const doris::UInt136& x) const { |
320 | 1.32k | doris::UInt64 crc = -1ULL; |
321 | 1.32k | crc = _mm_crc32_u8(crc, x.a); |
322 | 1.32k | crc = _mm_crc32_u64(crc, x.b); |
323 | 1.32k | crc = _mm_crc32_u64(crc, x.c); |
324 | 1.32k | return crc; |
325 | 1.32k | } |
326 | | }; |
327 | | |
328 | | #include "common/compile_check_avoid_end.h" |