Coverage Report

Created: 2026-03-13 19:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/hash_table/hash.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/Hash.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <type_traits>
24
25
#include "core/extended_types.h"
26
#include "core/string_ref.h"
27
#include "core/types.h"
28
#include "core/uint128.h"
29
#include "parallel_hashmap/phmap_utils.h"
30
31
// Here is an empirical value.
32
static constexpr size_t HASH_MAP_PREFETCH_DIST = 16;
33
34
/** Hash functions that are better than the trivial function std::hash.
35
  *
36
  * Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
37
  * This is because of following reasons:
38
  * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
39
  * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits;
40
  * - traffic is non-uniformly distributed across a day;
41
  * - we are using open-addressing linear probing hash tables that are most critical to hash function quality,
42
  *   and trivial hash function gives disastrous results.
43
  */
44
45
/** Taken from MurmurHash. This is Murmur finalizer.
46
  * Faster than int_hash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
47
  */
48
57
inline doris::UInt64 int_hash64(doris::UInt64 x) {
49
57
    x ^= x >> 33;
50
57
    x *= 0xff51afd7ed558ccdULL;
51
57
    x ^= x >> 33;
52
57
    x *= 0xc4ceb9fe1a85ec53ULL;
53
57
    x ^= x >> 33;
54
55
57
    return x;
56
57
}
57
58
/** CRC32C is not very high-quality as a hash function,
59
  *  according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits,
60
  *  but can behave well when used in hash tables,
61
  *  due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle).
62
  * Works only with SSE 4.2 support.
63
  */
64
#include "util/sse_util.hpp"
65
66
4.07M
inline doris::UInt64 int_hash_crc32(doris::UInt64 x) {
67
4.07M
#if defined(__SSE4_2__) || (defined(__aarch64__) && defined(__ARM_FEATURE_CRC32))
68
4.07M
    return _mm_crc32_u64(-1ULL, x);
69
#else
70
    /// On other platforms we do not have CRC32. NOTE This can be confusing.
71
    return int_hash64(x);
72
#endif
73
4.07M
}
74
75
template <typename T>
76
57
inline size_t default_hash64(T key) {
77
57
    union {
78
57
        T in;
79
57
        doris::UInt64 out;
80
57
    } u;
81
57
    u.out = 0;
82
57
    u.in = key;
83
57
    return int_hash64(u.out);
84
57
}
Unexecuted instantiation: _Z14default_hash64IhEmT_
Unexecuted instantiation: _Z14default_hash64IaEmT_
Unexecuted instantiation: _Z14default_hash64IsEmT_
_Z14default_hash64IiEmT_
Line
Count
Source
76
13
inline size_t default_hash64(T key) {
77
13
    union {
78
13
        T in;
79
13
        doris::UInt64 out;
80
13
    } u;
81
13
    u.out = 0;
82
13
    u.in = key;
83
13
    return int_hash64(u.out);
84
13
}
Unexecuted instantiation: _Z14default_hash64IlEmT_
_Z14default_hash64InEmT_
Line
Count
Source
76
41
inline size_t default_hash64(T key) {
77
41
    union {
78
41
        T in;
79
41
        doris::UInt64 out;
80
41
    } u;
81
41
    u.out = 0;
82
41
    u.in = key;
83
41
    return int_hash64(u.out);
84
41
}
Unexecuted instantiation: _Z14default_hash64IfEmT_
_Z14default_hash64IdEmT_
Line
Count
Source
76
3
inline size_t default_hash64(T key) {
77
3
    union {
78
3
        T in;
79
3
        doris::UInt64 out;
80
3
    } u;
81
3
    u.out = 0;
82
3
    u.in = key;
83
3
    return int_hash64(u.out);
84
3
}
Unexecuted instantiation: _Z14default_hash64IjEmT_
Unexecuted instantiation: _Z14default_hash64IoEmT_
85
86
template <typename T, typename Enable = void>
87
struct DefaultHash;
88
89
template <typename T>
90
    requires std::is_arithmetic_v<T>
91
struct DefaultHash<T> {
92
57
    size_t operator()(T key) const { return default_hash64<T>(key); }
Unexecuted instantiation: _ZNK11DefaultHashIhvEclEh
Unexecuted instantiation: _ZNK11DefaultHashIavEclEa
Unexecuted instantiation: _ZNK11DefaultHashIsvEclEs
_ZNK11DefaultHashIivEclEi
Line
Count
Source
92
13
    size_t operator()(T key) const { return default_hash64<T>(key); }
Unexecuted instantiation: _ZNK11DefaultHashIlvEclEl
_ZNK11DefaultHashInvEclEn
Line
Count
Source
92
41
    size_t operator()(T key) const { return default_hash64<T>(key); }
Unexecuted instantiation: _ZNK11DefaultHashIfvEclEf
_ZNK11DefaultHashIdvEclEd
Line
Count
Source
92
3
    size_t operator()(T key) const { return default_hash64<T>(key); }
Unexecuted instantiation: _ZNK11DefaultHashIjvEclEj
Unexecuted instantiation: _ZNK11DefaultHashIovEclEo
93
};
94
95
template <>
96
struct DefaultHash<doris::VecDateTimeValue> {
97
0
    size_t operator()(doris::VecDateTimeValue key) const { return int_hash64(*(int64_t*)&key); }
98
};
99
100
template <>
101
struct DefaultHash<doris::DateV2Value<doris::DateTimeV2ValueType>> {
102
0
    size_t operator()(doris::DateV2Value<doris::DateTimeV2ValueType> key) const {
103
0
        return int_hash64(key.to_date_int_val());
104
0
    }
105
};
106
107
template <>
108
struct DefaultHash<doris::DateV2Value<doris::DateV2ValueType>> {
109
0
    size_t operator()(doris::DateV2Value<doris::DateV2ValueType> key) const {
110
0
        return int_hash64(key.to_date_int_val());
111
0
    }
112
};
113
114
template <>
115
struct DefaultHash<doris::TimestampTzValue> {
116
0
    size_t operator()(doris::TimestampTzValue key) const {
117
0
        return int_hash64(key.to_date_int_val());
118
0
    }
119
};
120
121
template <>
122
struct DefaultHash<doris::StringRef> : public doris::StringRefHash {};
123
124
template <>
125
struct DefaultHash<wide::Int256> : public std::hash<wide::Int256> {};
126
127
template <typename T>
128
struct HashCRC32;
129
130
template <typename T>
131
4.07M
inline size_t hash_crc32(T key) {
132
4.07M
    union {
133
4.07M
        T in;
134
4.07M
        doris::UInt64 out;
135
4.07M
    } u;
136
4.07M
    u.out = 0;
137
4.07M
    u.in = key;
138
4.07M
    return int_hash_crc32(u.out);
139
4.07M
}
_Z10hash_crc32IaEmT_
Line
Count
Source
131
8.50k
inline size_t hash_crc32(T key) {
132
8.50k
    union {
133
8.50k
        T in;
134
8.50k
        doris::UInt64 out;
135
8.50k
    } u;
136
8.50k
    u.out = 0;
137
8.50k
    u.in = key;
138
8.50k
    return int_hash_crc32(u.out);
139
8.50k
}
_Z10hash_crc32IhEmT_
Line
Count
Source
131
29.0k
inline size_t hash_crc32(T key) {
132
29.0k
    union {
133
29.0k
        T in;
134
29.0k
        doris::UInt64 out;
135
29.0k
    } u;
136
29.0k
    u.out = 0;
137
29.0k
    u.in = key;
138
29.0k
    return int_hash_crc32(u.out);
139
29.0k
}
_Z10hash_crc32IsEmT_
Line
Count
Source
131
8.50k
inline size_t hash_crc32(T key) {
132
8.50k
    union {
133
8.50k
        T in;
134
8.50k
        doris::UInt64 out;
135
8.50k
    } u;
136
8.50k
    u.out = 0;
137
8.50k
    u.in = key;
138
8.50k
    return int_hash_crc32(u.out);
139
8.50k
}
_Z10hash_crc32ItEmT_
Line
Count
Source
131
18.7k
inline size_t hash_crc32(T key) {
132
18.7k
    union {
133
18.7k
        T in;
134
18.7k
        doris::UInt64 out;
135
18.7k
    } u;
136
18.7k
    u.out = 0;
137
18.7k
    u.in = key;
138
18.7k
    return int_hash_crc32(u.out);
139
18.7k
}
Unexecuted instantiation: _Z10hash_crc32IlEmT_
_Z10hash_crc32ImEmT_
Line
Count
Source
131
37.4k
inline size_t hash_crc32(T key) {
132
37.4k
    union {
133
37.4k
        T in;
134
37.4k
        doris::UInt64 out;
135
37.4k
    } u;
136
37.4k
    u.out = 0;
137
37.4k
    u.in = key;
138
37.4k
    return int_hash_crc32(u.out);
139
37.4k
}
_Z10hash_crc32IjEmT_
Line
Count
Source
131
3.97M
inline size_t hash_crc32(T key) {
132
3.97M
    union {
133
3.97M
        T in;
134
3.97M
        doris::UInt64 out;
135
3.97M
    } u;
136
3.97M
    u.out = 0;
137
3.97M
    u.in = key;
138
3.97M
    return int_hash_crc32(u.out);
139
3.97M
}
Unexecuted instantiation: _Z10hash_crc32IiEmT_
Unexecuted instantiation: _Z10hash_crc32IfEmT_
Unexecuted instantiation: _Z10hash_crc32IdEmT_
140
141
template <>
142
31.5k
inline size_t hash_crc32(doris::UInt128 u) {
143
31.5k
    return doris::UInt128HashCRC32()(u);
144
31.5k
}
145
146
template <>
147
0
inline size_t hash_crc32(unsigned __int128 u) {
148
0
    return doris::UInt128HashCRC32()(u);
149
0
}
150
151
template <>
152
0
inline size_t hash_crc32(doris::Int128 u) {
153
0
    return doris::UInt128HashCRC32()({(u >> 64) & int64_t(-1), u & int64_t(-1)});
154
0
}
155
156
template <>
157
0
inline size_t hash_crc32(doris::VecDateTimeValue u) {
158
0
    return hash_crc32(*(int64_t*)&u);
159
0
}
160
161
template <>
162
0
inline size_t hash_crc32(doris::DateV2Value<doris::DateTimeV2ValueType> u) {
163
0
    return hash_crc32(u.to_date_int_val());
164
0
}
165
166
template <>
167
0
inline size_t hash_crc32(doris::DateV2Value<doris::DateV2ValueType> u) {
168
0
    return hash_crc32(u.to_date_int_val());
169
0
}
170
171
template <>
172
0
inline size_t hash_crc32(doris::TimestampTzValue u) {
173
0
    return hash_crc32(u.to_date_int_val());
174
0
}
175
176
#define DEFINE_HASH(T)                   \
177
    template <>                          \
178
    struct HashCRC32<T> {                \
179
4.10M
        size_t operator()(T key) const { \
180
4.10M
            return hash_crc32<T>(key);   \
181
4.10M
        }                                \
_ZNK9HashCRC32IaEclEa
Line
Count
Source
179
8.50k
        size_t operator()(T key) const { \
180
8.50k
            return hash_crc32<T>(key);   \
181
8.50k
        }                                \
_ZNK9HashCRC32IhEclEh
Line
Count
Source
179
29.0k
        size_t operator()(T key) const { \
180
29.0k
            return hash_crc32<T>(key);   \
181
29.0k
        }                                \
_ZNK9HashCRC32IsEclEs
Line
Count
Source
179
8.50k
        size_t operator()(T key) const { \
180
8.50k
            return hash_crc32<T>(key);   \
181
8.50k
        }                                \
_ZNK9HashCRC32ItEclEt
Line
Count
Source
179
18.7k
        size_t operator()(T key) const { \
180
18.7k
            return hash_crc32<T>(key);   \
181
18.7k
        }                                \
_ZNK9HashCRC32IjEclEj
Line
Count
Source
179
3.97M
        size_t operator()(T key) const { \
180
3.97M
            return hash_crc32<T>(key);   \
181
3.97M
        }                                \
_ZNK9HashCRC32ImEclEm
Line
Count
Source
179
37.4k
        size_t operator()(T key) const { \
180
37.4k
            return hash_crc32<T>(key);   \
181
37.4k
        }                                \
_ZNK9HashCRC32IN4wide7integerILm128EjEEEclES2_
Line
Count
Source
179
31.5k
        size_t operator()(T key) const { \
180
31.5k
            return hash_crc32<T>(key);   \
181
31.5k
        }                                \
Unexecuted instantiation: _ZNK9HashCRC32IiEclEi
Unexecuted instantiation: _ZNK9HashCRC32IlEclEl
Unexecuted instantiation: _ZNK9HashCRC32InEclEn
Unexecuted instantiation: _ZNK9HashCRC32IfEclEf
Unexecuted instantiation: _ZNK9HashCRC32IdEclEd
Unexecuted instantiation: _ZNK9HashCRC32IN5doris16VecDateTimeValueEEclES1_
Unexecuted instantiation: _ZNK9HashCRC32IN5doris11DateV2ValueINS0_19DateTimeV2ValueTypeEEEEclES3_
Unexecuted instantiation: _ZNK9HashCRC32IN5doris11DateV2ValueINS0_15DateV2ValueTypeEEEEclES3_
Unexecuted instantiation: _ZNK9HashCRC32IN5doris16TimestampTzValueEEclES1_
Unexecuted instantiation: _ZNK9HashCRC32IoEclEo
182
    };
183
184
DEFINE_HASH(doris::UInt8)
185
DEFINE_HASH(doris::UInt16)
186
DEFINE_HASH(doris::UInt32)
187
DEFINE_HASH(doris::UInt64)
188
DEFINE_HASH(doris::UInt128)
189
DEFINE_HASH(doris::Int8)
190
DEFINE_HASH(doris::Int16)
191
DEFINE_HASH(doris::Int32)
192
DEFINE_HASH(doris::Int64)
193
DEFINE_HASH(doris::Int128)
194
DEFINE_HASH(doris::Float32)
195
DEFINE_HASH(doris::Float64)
196
DEFINE_HASH(doris::VecDateTimeValue)
197
DEFINE_HASH(doris::DateV2Value<doris::DateTimeV2ValueType>)
198
DEFINE_HASH(doris::DateV2Value<doris::DateV2ValueType>)
199
DEFINE_HASH(doris::TimestampTzValue)
200
DEFINE_HASH(unsigned __int128)
201
202
#undef DEFINE_HASH
203
204
template <typename Key, typename Hash = HashCRC32<Key>>
205
struct HashMixWrapper {
206
3.93M
    size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); }
_ZNK14HashMixWrapperIm9HashCRC32ImEEclEm
Line
Count
Source
206
209
    size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); }
_ZNK14HashMixWrapperIj9HashCRC32IjEEclEj
Line
Count
Source
206
3.93M
    size_t operator()(Key key) const { return phmap::phmap_mix<sizeof(size_t)>()(Hash()(key)); }
207
};
208
209
template <>
210
struct HashCRC32<doris::UInt256> {
211
10.8k
    size_t operator()(const doris::UInt256& x) const {
212
10.8k
#if defined(__SSE4_2__) || defined(__aarch64__)
213
10.8k
        doris::UInt64 crc = -1ULL;
214
10.8k
        crc = _mm_crc32_u64(crc, x.items[0]);
215
10.8k
        crc = _mm_crc32_u64(crc, x.items[1]);
216
10.8k
        crc = _mm_crc32_u64(crc, x.items[2]);
217
10.8k
        crc = _mm_crc32_u64(crc, x.items[3]);
218
10.8k
        return crc;
219
#else
220
        return Hash128to64({Hash128to64({x.a, x.b}), Hash128to64({x.c, x.d})});
221
#endif
222
10.8k
    }
223
};
224
225
template <>
226
struct HashCRC32<wide::Int256> {
227
8.19k
    size_t operator()(const wide::Int256& x) const {
228
8.19k
#if defined(__SSE4_2__) || defined(__aarch64__)
229
8.19k
        doris::UInt64 crc = -1ULL;
230
8.19k
        crc = _mm_crc32_u64(crc, x.items[0]);
231
8.19k
        crc = _mm_crc32_u64(crc, x.items[1]);
232
8.19k
        crc = _mm_crc32_u64(crc, x.items[2]);
233
8.19k
        crc = _mm_crc32_u64(crc, x.items[3]);
234
8.19k
        return crc;
235
#else
236
        return Hash128to64(
237
                {Hash128to64({x.items[0], x.items[1]}), Hash128to64({x.items[2], x.items[3]})});
238
#endif
239
8.19k
    }
240
};
241
242
template <>
243
struct HashCRC32<doris::Decimal256> {
244
0
    size_t operator()(const doris::Decimal256& value) const {
245
0
        return HashCRC32<wide::Int256>()(value.value);
246
0
    }
247
};
248
249
template <>
250
struct HashCRC32<doris::Decimal32> {
251
0
    size_t operator()(const doris::Decimal32& value) const {
252
0
        return HashCRC32<int32_t>()(value.value);
253
0
    }
254
};
255
256
template <>
257
struct HashCRC32<doris::Decimal64> {
258
0
    size_t operator()(const doris::Decimal64& value) const {
259
0
        return HashCRC32<int64_t>()(value.value);
260
0
    }
261
};
262
263
template <>
264
struct HashCRC32<doris::Decimal128V3> {
265
0
    size_t operator()(const doris::Decimal128V3& value) const {
266
0
        return HashCRC32<__int128>()(value.value);
267
0
    }
268
};
269
270
template <>
271
struct HashCRC32<doris::Decimal128V2> {
272
0
    size_t operator()(const doris::Decimal128V2& value) const {
273
0
        return HashCRC32<__int128>()(value.value);
274
0
    }
275
};
276
277
template <>
278
struct HashCRC32<doris::DecimalV2Value> {
279
0
    size_t operator()(const doris::DecimalV2Value& value) const {
280
0
        return HashCRC32<__int128>()(value.value());
281
0
    }
282
};
283
284
#include "common/compile_check_avoid_begin.h"
285
286
template <>
287
struct HashCRC32<doris::UInt72> {
288
4.40k
    size_t operator()(const doris::UInt72& x) const {
289
4.40k
        doris::UInt64 crc = -1ULL;
290
4.40k
        crc = _mm_crc32_u8(crc, x.a);
291
4.40k
        crc = _mm_crc32_u64(crc, x.b);
292
4.40k
        return crc;
293
4.40k
    }
294
};
295
296
template <>
297
struct HashCRC32<doris::UInt96> {
298
11.0k
    size_t operator()(const doris::UInt96& x) const {
299
11.0k
        doris::UInt64 crc = -1ULL;
300
11.0k
        crc = _mm_crc32_u32(crc, x.a);
301
11.0k
        crc = _mm_crc32_u64(crc, x.b);
302
11.0k
        return crc;
303
11.0k
    }
304
};
305
306
template <>
307
struct HashCRC32<doris::UInt104> {
308
0
    size_t operator()(const doris::UInt104& x) const {
309
0
        doris::UInt64 crc = -1ULL;
310
0
        crc = _mm_crc32_u8(crc, x.a);
311
0
        crc = _mm_crc32_u32(crc, x.b);
312
0
        crc = _mm_crc32_u64(crc, x.c);
313
0
        return crc;
314
0
    }
315
};
316
317
template <>
318
struct HashCRC32<doris::UInt136> {
319
1.32k
    size_t operator()(const doris::UInt136& x) const {
320
1.32k
        doris::UInt64 crc = -1ULL;
321
1.32k
        crc = _mm_crc32_u8(crc, x.a);
322
1.32k
        crc = _mm_crc32_u64(crc, x.b);
323
1.32k
        crc = _mm_crc32_u64(crc, x.c);
324
1.32k
        return crc;
325
1.32k
    }
326
};
327
328
#include "common/compile_check_avoid_end.h"