Coverage Report

Created: 2025-07-24 22:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/util/hash_util.hpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/hash-util.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <gen_cpp/Types_types.h>
24
#include <xxh3.h>
25
#include <zlib.h>
26
27
#include <bit>
28
#include <functional>
29
30
#include "common/compiler_util.h" // IWYU pragma: keep
31
#include "gutil/endian.h"
32
#include "gutil/hash/city.h"
33
#include "runtime/define_primitive_type.h"
34
#include "util/cpu_info.h"
35
#include "util/murmur_hash3.h"
36
#include "util/sse_util.hpp"
37
38
namespace doris {
39
#include "common/compile_check_begin.h"
40
// Utility class to compute hash values.
41
class HashUtil {
42
public:
43
3.75M
    static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
44
3.75M
        return (uint32_t)crc32(hash, (const unsigned char*)data, bytes);
45
3.75M
    }
46
47
0
    static uint32_t zlib_crc_hash_null(uint32_t hash) {
48
0
        // null is treat as 0 when hash
49
0
        static const int INT_VALUE = 0;
50
0
        return (uint32_t)crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
51
0
    }
52
53
#if defined(__SSE4_2__) || defined(__aarch64__)
54
    // Compute the Crc32 hash for data using SSE4 instructions.  The input hash parameter is
55
    // the current hash/seed value.
56
    // This should only be called if SSE is supported.
57
    // This is ~4x faster than Fnv/Boost Hash.
58
    // NOTE: DO NOT use this method for checksum! This does not generate the standard CRC32 checksum!
59
    //       For checksum, use CRC-32C algorithm from crc32c.h
60
    // NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn.
61
    // TODO: crc32 hashes with different seeds do not result in different hash functions.
62
    // The resulting hashes are correlated.
63
166k
    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
64
166k
        if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
65
0
            return zlib_crc_hash(data, bytes, hash);
66
0
        }
67
166k
        uint32_t words = bytes / sizeof(uint32_t);
68
166k
        bytes = bytes % sizeof(uint32_t);
69
70
166k
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
71
72
301k
        while (words--) {
73
134k
            hash = _mm_crc32_u32(hash, *p);
74
134k
            ++p;
75
134k
        }
76
77
166k
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
78
79
167k
        while (bytes--) {
80
1.28k
            hash = _mm_crc32_u8(hash, *s);
81
1.28k
            ++s;
82
1.28k
        }
83
84
        // The lower half of the CRC hash has has poor uniformity, so swap the halves
85
        // for anyone who only uses the first several bits of the hash.
86
166k
        hash = (hash << 16) | (hash >> 16);
87
166k
        return hash;
88
166k
    }
89
90
0
    static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t hash) {
91
0
        uint32_t words = bytes / sizeof(uint32_t);
92
0
        bytes = bytes % sizeof(uint32_t);
93
0
94
0
        uint32_t h1 = hash >> 32;
95
0
        uint32_t h2 = (hash << 32) >> 32;
96
0
97
0
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
98
0
        while (words--) {
99
0
            (words & 1) ? (h1 = _mm_crc32_u32(h1, *p)) : (h2 = _mm_crc32_u32(h2, *p));
100
0
            ++p;
101
0
        }
102
0
103
0
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
104
0
        while (bytes--) {
105
0
            (bytes & 1) ? (h1 = _mm_crc32_u8(h1, *s)) : (h2 = _mm_crc32_u8(h2, *s));
106
0
            ++s;
107
0
        }
108
0
        union {
109
0
            uint64_t u64;
110
0
            uint32_t u32[2];
111
0
        } converter;
112
0
        converter.u64 = hash;
113
0
114
0
        h1 = (h1 << 16) | (h1 >> 16);
115
0
        h2 = (h2 << 16) | (h2 >> 16);
116
0
        converter.u32[0] = h1;
117
0
        converter.u32[1] = h2;
118
0
119
0
        return converter.u64;
120
0
    }
121
#else
122
    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
123
        return zlib_crc_hash(data, bytes, hash);
124
    }
125
#endif
126
127
    // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
128
    static const uint32_t MURMUR3_32_SEED = 104729;
129
130
    // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
131
20
    static uint32_t murmur_hash3_32(const void* key, int64_t len, uint32_t seed) {
132
20
        uint32_t out = 0;
133
20
        murmur_hash3_x86_32(key, len, seed, &out);
134
20
        return out;
135
20
    }
136
137
    static const int MURMUR_R = 47;
138
139
    // Murmur2 hash implementation returning 64-bit hashes.
140
0
    static uint64_t murmur_hash2_64(const void* input, int len, uint64_t seed) {
141
0
        uint64_t h = seed ^ (len * MURMUR_PRIME);
142
0
143
0
        const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
144
0
        const uint64_t* end = data + (len / sizeof(uint64_t));
145
0
146
0
        while (data != end) {
147
0
            uint64_t k = *data++;
148
0
            k *= MURMUR_PRIME;
149
0
            k ^= k >> MURMUR_R;
150
0
            k *= MURMUR_PRIME;
151
0
            h ^= k;
152
0
            h *= MURMUR_PRIME;
153
0
        }
154
0
155
0
        const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
156
0
        switch (len & 7) {
157
0
        case 7:
158
0
            h ^= uint64_t(data2[6]) << 48;
159
0
            [[fallthrough]];
160
0
        case 6:
161
0
            h ^= uint64_t(data2[5]) << 40;
162
0
            [[fallthrough]];
163
0
        case 5:
164
0
            h ^= uint64_t(data2[4]) << 32;
165
0
            [[fallthrough]];
166
0
        case 4:
167
0
            h ^= uint64_t(data2[3]) << 24;
168
0
            [[fallthrough]];
169
0
        case 3:
170
0
            h ^= uint64_t(data2[2]) << 16;
171
0
            [[fallthrough]];
172
0
        case 2:
173
0
            h ^= uint64_t(data2[1]) << 8;
174
0
            [[fallthrough]];
175
0
        case 1:
176
0
            h ^= uint64_t(data2[0]);
177
0
            h *= MURMUR_PRIME;
178
0
        }
179
0
180
0
        h ^= h >> MURMUR_R;
181
0
        h *= MURMUR_PRIME;
182
0
        h ^= h >> MURMUR_R;
183
0
        return h;
184
0
    }
185
186
    // default values recommended by http://isthe.com/chongo/tech/comp/fnv/
187
    static const uint32_t FNV_PRIME = 0x01000193; //   16777619
188
    static const uint32_t FNV_SEED = 0x811C9DC5;  // 2166136261
189
    static const uint64_t FNV64_PRIME = 1099511628211UL;
190
    static const uint64_t FNV64_SEED = 14695981039346656037UL;
191
    static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995ULL;
192
    static const uint32_t MURMUR_SEED = 0xadc83b19ULL;
193
    // Implementation of the Fowler–Noll–Vo hash function.  This is not as performant
194
    // as boost's hash on int types (2x slower) but has bit entropy.
195
    // For ints, boost just returns the value of the int which can be pathological.
196
    // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
197
    // is taken on the hash, all values will collide to the same bucket.
198
    // For string values, Fnv is slightly faster than boost.
199
0
    static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) {
200
0
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
201
202
0
        while (bytes--) {
203
0
            hash = (*ptr ^ hash) * FNV_PRIME;
204
0
            ++ptr;
205
0
        }
206
207
0
        return hash;
208
0
    }
209
210
0
    static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t hash) {
211
0
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
212
0
213
0
        while (bytes--) {
214
0
            hash = (*ptr ^ hash) * FNV64_PRIME;
215
0
            ++ptr;
216
0
        }
217
0
218
0
        return hash;
219
0
    }
220
221
    // Our hash function is MurmurHash2, 64 bit version.
222
    // It was modified in order to provide the same result in
223
    // big and little endian archs (endian neutral).
224
67.8k
    static uint64_t murmur_hash64A(const void* key, int64_t len, unsigned int seed) {
225
67.8k
        const uint64_t m = MURMUR_PRIME;
226
67.8k
        const int r = 47;
227
67.8k
        uint64_t h = seed ^ (len * m);
228
67.8k
        const uint8_t* data = (const uint8_t*)key;
229
67.8k
        const uint8_t* end = data + (len - (len & 7));
230
231
135k
        while (data != end) {
232
67.8k
            uint64_t k;
233
            if constexpr (std::endian::native == std::endian::big) {
234
                k = (uint64_t)data[0];
235
                k |= (uint64_t)data[1] << 8;
236
                k |= (uint64_t)data[2] << 16;
237
                k |= (uint64_t)data[3] << 24;
238
                k |= (uint64_t)data[4] << 32;
239
                k |= (uint64_t)data[5] << 40;
240
                k |= (uint64_t)data[6] << 48;
241
                k |= (uint64_t)data[7] << 56;
242
67.8k
            } else if constexpr (std::endian::native == std::endian::little) {
243
67.8k
                memcpy(&k, data, sizeof(k));
244
            } else {
245
                static_assert(std::endian::native == std::endian::big ||
246
                                      std::endian::native == std::endian::little,
247
                              "Unsupported endianness");
248
            }
249
250
67.8k
            k *= m;
251
67.8k
            k ^= k >> r;
252
67.8k
            k *= m;
253
67.8k
            h ^= k;
254
67.8k
            h *= m;
255
67.8k
            data += 8;
256
67.8k
        }
257
258
67.8k
        switch (len & 7) {
259
0
        case 7:
260
0
            h ^= (uint64_t)data[6] << 48;
261
0
            [[fallthrough]];
262
0
        case 6:
263
0
            h ^= (uint64_t)data[5] << 40;
264
0
            [[fallthrough]];
265
0
        case 5:
266
0
            h ^= (uint64_t)data[4] << 32;
267
0
            [[fallthrough]];
268
3
        case 4:
269
3
            h ^= (uint64_t)data[3] << 24;
270
3
            [[fallthrough]];
271
3
        case 3:
272
3
            h ^= (uint64_t)data[2] << 16;
273
3
            [[fallthrough]];
274
3
        case 2:
275
3
            h ^= (uint64_t)data[1] << 8;
276
3
            [[fallthrough]];
277
6
        case 1:
278
6
            h ^= (uint64_t)data[0];
279
6
            h *= m;
280
67.8k
        }
281
282
67.8k
        h ^= h >> r;
283
67.8k
        h *= m;
284
67.8k
        h ^= h >> r;
285
67.8k
        return h;
286
67.8k
    }
287
288
    // Computes the hash value for data.  Will call either CrcHash or FnvHash
289
    // depending on hardware capabilities.
290
    // Seed values for different steps of the query execution should use different seeds
291
    // to prevent accidental key collisions. (See IMPALA-219 for more details).
292
166k
    static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) {
293
166k
#ifdef __SSE4_2__
294
295
166k
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
296
166k
            return crc_hash(data, bytes, seed);
297
166k
        } else {
298
0
            return fnv_hash(data, bytes, seed);
299
0
        }
300
301
#else
302
        return fnv_hash(data, bytes, seed);
303
#endif
304
166k
    }
305
306
23.8k
    static uint64_t hash64(const void* data, uint64_t bytes, uint64_t seed) {
307
#ifdef _SSE4_2_
308
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
309
            return crc_hash64(data, bytes, seed);
310
311
        } else {
312
            uint64_t hash = 0;
313
            murmur_hash3_x64_64(data, bytes, seed, &hash);
314
            return hash;
315
        }
316
#else
317
23.8k
        uint64_t hash = 0;
318
23.8k
        murmur_hash3_x64_64(data, bytes, seed, &hash);
319
23.8k
        return hash;
320
23.8k
#endif
321
23.8k
    }
322
    // hash_combine is the same with boost hash_combine,
323
    // except replace boost::hash with std::hash
324
    template <class T>
325
    static inline void hash_combine(std::size_t& seed, const T& v) {
326
        std::hash<T> hasher;
327
        seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
328
    }
329
330
#if defined(__clang__)
331
#pragma clang diagnostic push
332
#pragma clang diagnostic ignored "-Wused-but-marked-unused"
333
#endif
334
    // xxHash function for a byte array.  For convenience, a 64-bit seed is also
335
    // hashed into the result.  The mapping may change from time to time.
336
0
    static xxh_u32 xxHash32WithSeed(const char* s, size_t len, xxh_u32 seed) {
337
0
        return XXH32(s, len, seed);
338
0
    }
339
340
    // same to the up function, just for null value
341
0
    static xxh_u32 xxHash32NullWithSeed(xxh_u32 seed) {
342
0
        static const int INT_VALUE = 0;
343
0
        return XXH32(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
344
0
    }
345
346
115k
    static xxh_u64 xxHash64WithSeed(const char* s, size_t len, xxh_u64 seed) {
347
115k
        return XXH3_64bits_withSeed(s, len, seed);
348
115k
    }
349
350
    // same to the up function, just for null value
351
1.61k
    static xxh_u64 xxHash64NullWithSeed(xxh_u64 seed) {
352
1.61k
        static const int INT_VALUE = 0;
353
1.61k
        return XXH3_64bits_withSeed(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
354
1.61k
    }
355
356
#if defined(__clang__)
357
#pragma clang diagnostic pop
358
#endif
359
};
360
361
} // namespace doris
362
363
template <>
364
struct std::hash<doris::TUniqueId> {
365
7.65k
    size_t operator()(const doris::TUniqueId& id) const {
366
7.65k
        uint32_t seed = 0;
367
7.65k
        seed = doris::HashUtil::hash(&id.lo, sizeof(id.lo), seed);
368
7.65k
        seed = doris::HashUtil::hash(&id.hi, sizeof(id.hi), seed);
369
7.65k
        return seed;
370
7.65k
    }
371
};
372
373
template <>
374
struct std::hash<doris::TNetworkAddress> {
375
0
    size_t operator()(const doris::TNetworkAddress& address) const {
376
0
        uint32_t seed = 0;
377
0
        seed = doris::HashUtil::hash(address.hostname.data(), (uint32_t)address.hostname.size(),
378
0
                                     seed);
379
0
        seed = doris::HashUtil::hash(&address.port, 4, seed);
380
0
        return seed;
381
0
    }
382
};
383
384
template <>
385
struct std::hash<std::pair<doris::TUniqueId, int64_t>> {
386
0
    size_t operator()(const std::pair<doris::TUniqueId, int64_t>& pair) const {
387
0
        uint32_t seed = 0;
388
0
        seed = doris::HashUtil::hash(&pair.first.lo, sizeof(pair.first.lo), seed);
389
0
        seed = doris::HashUtil::hash(&pair.first.hi, sizeof(pair.first.hi), seed);
390
0
        seed = doris::HashUtil::hash(&pair.second, sizeof(pair.second), seed);
391
0
        return seed;
392
0
    }
393
};
394
395
template <class First, class Second>
396
struct std::hash<std::pair<First, Second>> {
397
0
    size_t operator()(const pair<First, Second>& p) const {
398
0
        size_t h1 = std::hash<First>()(p.first);
399
0
        size_t h2 = std::hash<Second>()(p.second);
400
0
        return util_hash::HashLen16(h1, h2);
401
0
    }
402
};
403
404
#include "common/compile_check_end.h"