Coverage Report

Created: 2025-09-16 21:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/util/hash_util.hpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/hash-util.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <gen_cpp/Types_types.h>
24
#include <xxh3.h>
25
#include <zlib.h>
26
27
#include <bit>
28
#include <functional>
29
30
#include "common/compiler_util.h" // IWYU pragma: keep
31
#include "util/cpu_info.h"
32
#include "util/hash/city.h"
33
#include "util/murmur_hash3.h"
34
#include "util/sse_util.hpp"
35
36
namespace doris {
37
#include "common/compile_check_begin.h"
38
// Utility class to compute hash values.
39
class HashUtil {
40
public:
41
3.78M
    static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
42
3.78M
        return (uint32_t)crc32(hash, (const unsigned char*)data, bytes);
43
3.78M
    }
44
45
1.08M
    static uint32_t zlib_crc_hash_null(uint32_t hash) {
46
        // null is treat as 0 when hash
47
1.08M
        static const int INT_VALUE = 0;
48
1.08M
        return (uint32_t)crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
49
1.08M
    }
50
51
#if defined(__SSE4_2__) || defined(__aarch64__)
52
    // Compute the Crc32 hash for data using SSE4 instructions.  The input hash parameter is
53
    // the current hash/seed value.
54
    // This should only be called if SSE is supported.
55
    // This is ~4x faster than Fnv/Boost Hash.
56
    // NOTE: DO NOT use this method for checksum! This does not generate the standard CRC32 checksum!
57
    //       For checksum, use CRC-32C algorithm from crc32c.h
58
    // NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn.
59
    // TODO: crc32 hashes with different seeds do not result in different hash functions.
60
    // The resulting hashes are correlated.
61
173k
    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
62
173k
        if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
63
0
            return zlib_crc_hash(data, bytes, hash);
64
0
        }
65
173k
        uint32_t words = bytes / sizeof(uint32_t);
66
173k
        bytes = bytes % sizeof(uint32_t);
67
68
173k
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
69
70
312k
        while (words--) {
71
139k
            hash = _mm_crc32_u32(hash, *p);
72
139k
            ++p;
73
139k
        }
74
75
173k
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
76
77
174k
        while (bytes--) {
78
1.52k
            hash = _mm_crc32_u8(hash, *s);
79
1.52k
            ++s;
80
1.52k
        }
81
82
        // The lower half of the CRC hash has has poor uniformity, so swap the halves
83
        // for anyone who only uses the first several bits of the hash.
84
173k
        hash = (hash << 16) | (hash >> 16);
85
173k
        return hash;
86
173k
    }
87
88
0
    static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t hash) {
89
0
        uint32_t words = bytes / sizeof(uint32_t);
90
0
        bytes = bytes % sizeof(uint32_t);
91
0
92
0
        uint32_t h1 = hash >> 32;
93
0
        uint32_t h2 = (hash << 32) >> 32;
94
0
95
0
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
96
0
        while (words--) {
97
0
            (words & 1) ? (h1 = _mm_crc32_u32(h1, *p)) : (h2 = _mm_crc32_u32(h2, *p));
98
0
            ++p;
99
0
        }
100
0
101
0
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
102
0
        while (bytes--) {
103
0
            (bytes & 1) ? (h1 = _mm_crc32_u8(h1, *s)) : (h2 = _mm_crc32_u8(h2, *s));
104
0
            ++s;
105
0
        }
106
0
        union {
107
0
            uint64_t u64;
108
0
            uint32_t u32[2];
109
0
        } converter;
110
0
        converter.u64 = hash;
111
0
112
0
        h1 = (h1 << 16) | (h1 >> 16);
113
0
        h2 = (h2 << 16) | (h2 >> 16);
114
0
        converter.u32[0] = h1;
115
0
        converter.u32[1] = h2;
116
0
117
0
        return converter.u64;
118
0
    }
119
#else
120
    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
121
        return zlib_crc_hash(data, bytes, hash);
122
    }
123
#endif
124
125
    // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
126
    static const uint32_t MURMUR3_32_SEED = 104729;
127
128
    // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
129
20
    static uint32_t murmur_hash3_32(const void* key, int64_t len, uint32_t seed) {
130
20
        uint32_t out = 0;
131
20
        murmur_hash3_x86_32(key, len, seed, &out);
132
20
        return out;
133
20
    }
134
135
    static const int MURMUR_R = 47;
136
137
    // Murmur2 hash implementation returning 64-bit hashes.
138
0
    static uint64_t murmur_hash2_64(const void* input, int len, uint64_t seed) {
139
0
        uint64_t h = seed ^ (len * MURMUR_PRIME);
140
0
141
0
        const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
142
0
        const uint64_t* end = data + (len / sizeof(uint64_t));
143
0
144
0
        while (data != end) {
145
0
            uint64_t k = *data++;
146
0
            k *= MURMUR_PRIME;
147
0
            k ^= k >> MURMUR_R;
148
0
            k *= MURMUR_PRIME;
149
0
            h ^= k;
150
0
            h *= MURMUR_PRIME;
151
0
        }
152
0
153
0
        const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
154
0
        switch (len & 7) {
155
0
        case 7:
156
0
            h ^= uint64_t(data2[6]) << 48;
157
0
            [[fallthrough]];
158
0
        case 6:
159
0
            h ^= uint64_t(data2[5]) << 40;
160
0
            [[fallthrough]];
161
0
        case 5:
162
0
            h ^= uint64_t(data2[4]) << 32;
163
0
            [[fallthrough]];
164
0
        case 4:
165
0
            h ^= uint64_t(data2[3]) << 24;
166
0
            [[fallthrough]];
167
0
        case 3:
168
0
            h ^= uint64_t(data2[2]) << 16;
169
0
            [[fallthrough]];
170
0
        case 2:
171
0
            h ^= uint64_t(data2[1]) << 8;
172
0
            [[fallthrough]];
173
0
        case 1:
174
0
            h ^= uint64_t(data2[0]);
175
0
            h *= MURMUR_PRIME;
176
0
        }
177
0
178
0
        h ^= h >> MURMUR_R;
179
0
        h *= MURMUR_PRIME;
180
0
        h ^= h >> MURMUR_R;
181
0
        return h;
182
0
    }
183
184
    // default values recommended by http://isthe.com/chongo/tech/comp/fnv/
185
    static const uint32_t FNV_PRIME = 0x01000193; //   16777619
186
    static const uint32_t FNV_SEED = 0x811C9DC5;  // 2166136261
187
    static const uint64_t FNV64_PRIME = 1099511628211UL;
188
    static const uint64_t FNV64_SEED = 14695981039346656037UL;
189
    static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995ULL;
190
    static const uint32_t MURMUR_SEED = 0xadc83b19ULL;
191
    // Implementation of the Fowler–Noll–Vo hash function.  This is not as performant
192
    // as boost's hash on int types (2x slower) but has bit entropy.
193
    // For ints, boost just returns the value of the int which can be pathological.
194
    // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
195
    // is taken on the hash, all values will collide to the same bucket.
196
    // For string values, Fnv is slightly faster than boost.
197
0
    static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) {
198
0
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
199
200
0
        while (bytes--) {
201
0
            hash = (*ptr ^ hash) * FNV_PRIME;
202
0
            ++ptr;
203
0
        }
204
205
0
        return hash;
206
0
    }
207
208
0
    static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t hash) {
209
0
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
210
0
211
0
        while (bytes--) {
212
0
            hash = (*ptr ^ hash) * FNV64_PRIME;
213
0
            ++ptr;
214
0
        }
215
0
216
0
        return hash;
217
0
    }
218
219
    // Our hash function is MurmurHash2, 64 bit version.
220
    // It was modified in order to provide the same result in
221
    // big and little endian archs (endian neutral).
222
67.8k
    static uint64_t murmur_hash64A(const void* key, int64_t len, unsigned int seed) {
223
67.8k
        const uint64_t m = MURMUR_PRIME;
224
67.8k
        const int r = 47;
225
67.8k
        uint64_t h = seed ^ (len * m);
226
67.8k
        const uint8_t* data = (const uint8_t*)key;
227
67.8k
        const uint8_t* end = data + (len - (len & 7));
228
229
135k
        while (data != end) {
230
67.8k
            uint64_t k;
231
            if constexpr (std::endian::native == std::endian::big) {
232
                k = (uint64_t)data[0];
233
                k |= (uint64_t)data[1] << 8;
234
                k |= (uint64_t)data[2] << 16;
235
                k |= (uint64_t)data[3] << 24;
236
                k |= (uint64_t)data[4] << 32;
237
                k |= (uint64_t)data[5] << 40;
238
                k |= (uint64_t)data[6] << 48;
239
                k |= (uint64_t)data[7] << 56;
240
67.8k
            } else if constexpr (std::endian::native == std::endian::little) {
241
67.8k
                memcpy(&k, data, sizeof(k));
242
            } else {
243
                static_assert(std::endian::native == std::endian::big ||
244
                                      std::endian::native == std::endian::little,
245
                              "Unsupported endianness");
246
            }
247
248
67.8k
            k *= m;
249
67.8k
            k ^= k >> r;
250
67.8k
            k *= m;
251
67.8k
            h ^= k;
252
67.8k
            h *= m;
253
67.8k
            data += 8;
254
67.8k
        }
255
256
67.8k
        switch (len & 7) {
257
0
        case 7:
258
0
            h ^= (uint64_t)data[6] << 48;
259
0
            [[fallthrough]];
260
0
        case 6:
261
0
            h ^= (uint64_t)data[5] << 40;
262
0
            [[fallthrough]];
263
0
        case 5:
264
0
            h ^= (uint64_t)data[4] << 32;
265
0
            [[fallthrough]];
266
3
        case 4:
267
3
            h ^= (uint64_t)data[3] << 24;
268
3
            [[fallthrough]];
269
3
        case 3:
270
3
            h ^= (uint64_t)data[2] << 16;
271
3
            [[fallthrough]];
272
3
        case 2:
273
3
            h ^= (uint64_t)data[1] << 8;
274
3
            [[fallthrough]];
275
6
        case 1:
276
6
            h ^= (uint64_t)data[0];
277
6
            h *= m;
278
67.8k
        }
279
280
67.8k
        h ^= h >> r;
281
67.8k
        h *= m;
282
67.8k
        h ^= h >> r;
283
67.8k
        return h;
284
67.8k
    }
285
286
    // Computes the hash value for data.  Will call either CrcHash or FnvHash
287
    // depending on hardware capabilities.
288
    // Seed values for different steps of the query execution should use different seeds
289
    // to prevent accidental key collisions. (See IMPALA-219 for more details).
290
173k
    static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) {
291
173k
#ifdef __SSE4_2__
292
293
173k
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
294
173k
            return crc_hash(data, bytes, seed);
295
173k
        } else {
296
0
            return fnv_hash(data, bytes, seed);
297
0
        }
298
299
#else
300
        return fnv_hash(data, bytes, seed);
301
#endif
302
173k
    }
303
304
24.0k
    static uint64_t hash64(const void* data, uint64_t bytes, uint64_t seed) {
305
#ifdef _SSE4_2_
306
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
307
            return crc_hash64(data, bytes, seed);
308
309
        } else {
310
            uint64_t hash = 0;
311
            murmur_hash3_x64_64(data, bytes, seed, &hash);
312
            return hash;
313
        }
314
#else
315
24.0k
        uint64_t hash = 0;
316
24.0k
        murmur_hash3_x64_64(data, bytes, seed, &hash);
317
24.0k
        return hash;
318
24.0k
#endif
319
24.0k
    }
320
    // hash_combine is the same with boost hash_combine,
321
    // except replace boost::hash with std::hash
322
    template <class T>
323
349
    static inline void hash_combine(std::size_t& seed, const T& v) {
324
349
        std::hash<T> hasher;
325
349
        seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
326
349
    }
_ZN5doris8HashUtil12hash_combineINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEEvRmRKT_
Line
Count
Source
323
347
    static inline void hash_combine(std::size_t& seed, const T& v) {
324
347
        std::hash<T> hasher;
325
347
        seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
326
347
    }
_ZN5doris8HashUtil12hash_combineIlEEvRmRKT_
Line
Count
Source
323
2
    static inline void hash_combine(std::size_t& seed, const T& v) {
324
2
        std::hash<T> hasher;
325
2
        seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
326
2
    }
327
328
#if defined(__clang__)
329
#pragma clang diagnostic push
330
#pragma clang diagnostic ignored "-Wused-but-marked-unused"
331
#endif
332
    // xxHash function for a byte array.  For convenience, a 64-bit seed is also
333
    // hashed into the result.  The mapping may change from time to time.
334
12
    static xxh_u32 xxHash32WithSeed(const char* s, size_t len, xxh_u32 seed) {
335
12
        return XXH32(s, len, seed);
336
12
    }
337
338
    // same to the up function, just for null value
339
0
    static xxh_u32 xxHash32NullWithSeed(xxh_u32 seed) {
340
0
        static const int INT_VALUE = 0;
341
0
        return XXH32(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
342
0
    }
343
344
145k
    static xxh_u64 xxHash64WithSeed(const char* s, size_t len, xxh_u64 seed) {
345
145k
        return XXH3_64bits_withSeed(s, len, seed);
346
145k
    }
347
348
    // same to the up function, just for null value
349
1.08M
    static xxh_u64 xxHash64NullWithSeed(xxh_u64 seed) {
350
1.08M
        static const int INT_VALUE = 0;
351
1.08M
        return XXH3_64bits_withSeed(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
352
1.08M
    }
353
354
#if defined(__clang__)
355
#pragma clang diagnostic pop
356
#endif
357
};
358
359
} // namespace doris
360
361
template <>
362
struct std::hash<doris::TUniqueId> {
363
7.64k
    size_t operator()(const doris::TUniqueId& id) const {
364
7.64k
        uint32_t seed = 0;
365
7.64k
        seed = doris::HashUtil::hash(&id.lo, sizeof(id.lo), seed);
366
7.64k
        seed = doris::HashUtil::hash(&id.hi, sizeof(id.hi), seed);
367
7.64k
        return seed;
368
7.64k
    }
369
};
370
371
template <>
372
struct std::hash<doris::TNetworkAddress> {
373
0
    size_t operator()(const doris::TNetworkAddress& address) const {
374
0
        uint32_t seed = 0;
375
0
        seed = doris::HashUtil::hash(address.hostname.data(), (uint32_t)address.hostname.size(),
376
0
                                     seed);
377
0
        seed = doris::HashUtil::hash(&address.port, 4, seed);
378
0
        return seed;
379
0
    }
380
};
381
382
template <>
383
struct std::hash<std::pair<doris::TUniqueId, int64_t>> {
384
0
    size_t operator()(const std::pair<doris::TUniqueId, int64_t>& pair) const {
385
0
        uint32_t seed = 0;
386
0
        seed = doris::HashUtil::hash(&pair.first.lo, sizeof(pair.first.lo), seed);
387
0
        seed = doris::HashUtil::hash(&pair.first.hi, sizeof(pair.first.hi), seed);
388
0
        seed = doris::HashUtil::hash(&pair.second, sizeof(pair.second), seed);
389
0
        return seed;
390
0
    }
391
};
392
393
template <class First, class Second>
394
struct std::hash<std::pair<First, Second>> {
395
84.8k
    size_t operator()(const pair<First, Second>& p) const {
396
84.8k
        size_t h1 = std::hash<First>()(p.first);
397
84.8k
        size_t h2 = std::hash<Second>()(p.second);
398
84.8k
        return doris::util_hash::HashLen16(h1, h2);
399
84.8k
    }
Unexecuted instantiation: _ZNKSt4hashISt4pairIlN5doris8RowsetIdEEEclERKS3_
_ZNKSt4hashISt4pairINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEElEEclERKS7_
Line
Count
Source
395
26
    size_t operator()(const pair<First, Second>& p) const {
396
26
        size_t h1 = std::hash<First>()(p.first);
397
26
        size_t h2 = std::hash<Second>()(p.second);
398
26
        return doris::util_hash::HashLen16(h1, h2);
399
26
    }
_ZNKSt4hashISt4pairIiN5doris10vectorized10PathInDataEEEclERKS4_
Line
Count
Source
395
84.7k
    size_t operator()(const pair<First, Second>& p) const {
396
84.7k
        size_t h1 = std::hash<First>()(p.first);
397
84.7k
        size_t h2 = std::hash<Second>()(p.second);
398
84.7k
        return doris::util_hash::HashLen16(h1, h2);
399
84.7k
    }
_ZNKSt4hashISt4pairIllEEclERKS1_
Line
Count
Source
395
60
    size_t operator()(const pair<First, Second>& p) const {
396
60
        size_t h1 = std::hash<First>()(p.first);
397
60
        size_t h2 = std::hash<Second>()(p.second);
398
60
        return doris::util_hash::HashLen16(h1, h2);
399
60
    }
Unexecuted instantiation: _ZNKSt4hashISt4pairIN5doris9TUniqueIdEiEEclERKS3_
400
};
401
402
#include "common/compile_check_end.h"