Coverage Report

Created: 2024-11-21 20:39

/root/doris/be/src/util/hash_util.hpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/hash-util.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <gen_cpp/Types_types.h>
24
#include <xxh3.h>
25
#include <zlib.h>
26
27
#include <functional>
28
29
#include "common/compiler_util.h" // IWYU pragma: keep
30
#include "gutil/hash/city.h"
31
#include "runtime/define_primitive_type.h"
32
#include "util/cpu_info.h"
33
#include "util/murmur_hash3.h"
34
#include "util/sse_util.hpp"
35
36
namespace doris {
37
38
// Utility class to compute hash values.
39
class HashUtil {
40
public:
41
    template <typename T>
42
    static uint32_t fixed_len_to_uint32(T value) {
43
        if constexpr (sizeof(T) <= sizeof(uint32_t)) {
44
            return (uint32_t)value;
45
        }
46
        return std::hash<T>()(value);
47
    }
48
49
500k
    static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) {
50
500k
        return crc32(hash, (const unsigned char*)data, bytes);
51
500k
    }
52
53
    static uint32_t zlib_crc_hash_null(uint32_t hash) {
54
        // null is treat as 0 when hash
55
        static const int INT_VALUE = 0;
56
        return crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
57
    }
58
59
#if defined(__SSE4_2__) || defined(__aarch64__)
60
    // Compute the Crc32 hash for data using SSE4 instructions.  The input hash parameter is
61
    // the current hash/seed value.
62
    // This should only be called if SSE is supported.
63
    // This is ~4x faster than Fnv/Boost Hash.
64
    // NOTE: DO NOT use this method for checksum! This does not generate the standard CRC32 checksum!
65
    //       For checksum, use CRC-32C algorithm from crc32c.h
66
    // NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn.
67
    // TODO: crc32 hashes with different seeds do not result in different hash functions.
68
    // The resulting hashes are correlated.
69
14.7k
    static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
70
14.7k
        if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
71
0
            return zlib_crc_hash(data, bytes, hash);
72
0
        }
73
14.7k
        uint32_t words = bytes / sizeof(uint32_t);
74
14.7k
        bytes = bytes % sizeof(uint32_t);
75
76
14.7k
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
77
78
44.5k
        while (words--) {
79
29.7k
            hash = _mm_crc32_u32(hash, *p);
80
29.7k
            ++p;
81
29.7k
        }
82
83
14.7k
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
84
85
14.8k
        while (bytes--) {
86
48
            hash = _mm_crc32_u8(hash, *s);
87
48
            ++s;
88
48
        }
89
90
        // The lower half of the CRC hash has has poor uniformity, so swap the halves
91
        // for anyone who only uses the first several bits of the hash.
92
14.7k
        hash = (hash << 16) | (hash >> 16);
93
14.7k
        return hash;
94
14.7k
    }
95
96
0
    static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash) {
97
0
        uint32_t words = bytes / sizeof(uint32_t);
98
0
        bytes = bytes % sizeof(uint32_t);
99
0
100
0
        uint32_t h1 = hash >> 32;
101
0
        uint32_t h2 = (hash << 32) >> 32;
102
0
103
0
        const uint32_t* p = reinterpret_cast<const uint32_t*>(data);
104
0
        while (words--) {
105
0
            (words & 1) ? (h1 = _mm_crc32_u32(h1, *p)) : (h2 = _mm_crc32_u32(h2, *p));
106
0
            ++p;
107
0
        }
108
0
109
0
        const uint8_t* s = reinterpret_cast<const uint8_t*>(p);
110
0
        while (bytes--) {
111
0
            (bytes & 1) ? (h1 = _mm_crc32_u8(h1, *s)) : (h2 = _mm_crc32_u8(h2, *s));
112
0
            ++s;
113
0
        }
114
0
        union {
115
0
            uint64_t u64;
116
0
            uint32_t u32[2];
117
0
        } converter;
118
0
        converter.u64 = hash;
119
0
120
0
        h1 = (h1 << 16) | (h1 >> 16);
121
0
        h2 = (h2 << 16) | (h2 >> 16);
122
0
        converter.u32[0] = h1;
123
0
        converter.u32[1] = h2;
124
0
125
0
        return converter.u64;
126
0
    }
127
#else
128
    static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
129
        return zlib_crc_hash(data, bytes, hash);
130
    }
131
#endif
132
133
    // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
134
    static const uint32_t MURMUR3_32_SEED = 104729;
135
136
    // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
137
20
    static uint32_t murmur_hash3_32(const void* key, int64_t len, uint32_t seed) {
138
20
        uint32_t out = 0;
139
20
        murmur_hash3_x86_32(key, len, seed, &out);
140
20
        return out;
141
20
    }
142
143
    static const int MURMUR_R = 47;
144
145
    // Murmur2 hash implementation returning 64-bit hashes.
146
0
    static uint64_t murmur_hash2_64(const void* input, int len, uint64_t seed) {
147
0
        uint64_t h = seed ^ (len * MURMUR_PRIME);
148
0
149
0
        const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
150
0
        const uint64_t* end = data + (len / sizeof(uint64_t));
151
0
152
0
        while (data != end) {
153
0
            uint64_t k = *data++;
154
0
            k *= MURMUR_PRIME;
155
0
            k ^= k >> MURMUR_R;
156
0
            k *= MURMUR_PRIME;
157
0
            h ^= k;
158
0
            h *= MURMUR_PRIME;
159
0
        }
160
0
161
0
        const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
162
0
        switch (len & 7) {
163
0
        case 7:
164
0
            h ^= uint64_t(data2[6]) << 48;
165
0
            [[fallthrough]];
166
0
        case 6:
167
0
            h ^= uint64_t(data2[5]) << 40;
168
0
            [[fallthrough]];
169
0
        case 5:
170
0
            h ^= uint64_t(data2[4]) << 32;
171
0
            [[fallthrough]];
172
0
        case 4:
173
0
            h ^= uint64_t(data2[3]) << 24;
174
0
            [[fallthrough]];
175
0
        case 3:
176
0
            h ^= uint64_t(data2[2]) << 16;
177
0
            [[fallthrough]];
178
0
        case 2:
179
0
            h ^= uint64_t(data2[1]) << 8;
180
0
            [[fallthrough]];
181
0
        case 1:
182
0
            h ^= uint64_t(data2[0]);
183
0
            h *= MURMUR_PRIME;
184
0
        }
185
0
186
0
        h ^= h >> MURMUR_R;
187
0
        h *= MURMUR_PRIME;
188
0
        h ^= h >> MURMUR_R;
189
0
        return h;
190
0
    }
191
192
    // default values recommended by http://isthe.com/chongo/tech/comp/fnv/
193
    static const uint32_t FNV_PRIME = 0x01000193; //   16777619
194
    static const uint32_t FNV_SEED = 0x811C9DC5;  // 2166136261
195
    static const uint64_t FNV64_PRIME = 1099511628211UL;
196
    static const uint64_t FNV64_SEED = 14695981039346656037UL;
197
    static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995ULL;
198
    static const uint32_t MURMUR_SEED = 0xadc83b19ULL;
199
    // Implementation of the Fowler–Noll–Vo hash function.  This is not as performant
200
    // as boost's hash on int types (2x slower) but has bit entropy.
201
    // For ints, boost just returns the value of the int which can be pathological.
202
    // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000
203
    // is taken on the hash, all values will collide to the same bucket.
204
    // For string values, Fnv is slightly faster than boost.
205
    static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) {
206
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
207
208
        while (bytes--) {
209
            hash = (*ptr ^ hash) * FNV_PRIME;
210
            ++ptr;
211
        }
212
213
        return hash;
214
    }
215
216
0
    static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash) {
217
0
        const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
218
0
219
0
        while (bytes--) {
220
0
            hash = (*ptr ^ hash) * FNV64_PRIME;
221
0
            ++ptr;
222
0
        }
223
0
224
0
        return hash;
225
0
    }
226
227
    // Our hash function is MurmurHash2, 64 bit version.
228
    // It was modified in order to provide the same result in
229
    // big and little endian archs (endian neutral).
230
67.8k
    static uint64_t murmur_hash64A(const void* key, int64_t len, unsigned int seed) {
231
67.8k
        const uint64_t m = MURMUR_PRIME;
232
67.8k
        const int r = 47;
233
67.8k
        uint64_t h = seed ^ (len * m);
234
67.8k
        const uint8_t* data = (const uint8_t*)key;
235
67.8k
        const uint8_t* end = data + (len - (len & 7));
236
237
135k
        while (data != end) {
238
67.8k
            uint64_t k;
239
#if (BYTE_ORDER == BIG_ENDIAN)
240
            k = (uint64_t)data[0];
241
            k |= (uint64_t)data[1] << 8;
242
            k |= (uint64_t)data[2] << 16;
243
            k |= (uint64_t)data[3] << 24;
244
            k |= (uint64_t)data[4] << 32;
245
            k |= (uint64_t)data[5] << 40;
246
            k |= (uint64_t)data[6] << 48;
247
            k |= (uint64_t)data[7] << 56;
248
#else
249
67.8k
            k = *((uint64_t*)data);
250
67.8k
#endif
251
252
67.8k
            k *= m;
253
67.8k
            k ^= k >> r;
254
67.8k
            k *= m;
255
67.8k
            h ^= k;
256
67.8k
            h *= m;
257
67.8k
            data += 8;
258
67.8k
        }
259
260
67.8k
        switch (len & 7) {
261
0
        case 7:
262
0
            h ^= (uint64_t)data[6] << 48;
263
0
            [[fallthrough]];
264
0
        case 6:
265
0
            h ^= (uint64_t)data[5] << 40;
266
0
            [[fallthrough]];
267
0
        case 5:
268
0
            h ^= (uint64_t)data[4] << 32;
269
0
            [[fallthrough]];
270
0
        case 4:
271
0
            h ^= (uint64_t)data[3] << 24;
272
0
            [[fallthrough]];
273
0
        case 3:
274
0
            h ^= (uint64_t)data[2] << 16;
275
0
            [[fallthrough]];
276
0
        case 2:
277
0
            h ^= (uint64_t)data[1] << 8;
278
0
            [[fallthrough]];
279
0
        case 1:
280
0
            h ^= (uint64_t)data[0];
281
0
            h *= m;
282
67.8k
        }
283
284
67.8k
        h ^= h >> r;
285
67.8k
        h *= m;
286
67.8k
        h ^= h >> r;
287
67.8k
        return h;
288
67.8k
    }
289
290
    // Computes the hash value for data.  Will call either CrcHash or FnvHash
291
    // depending on hardware capabilities.
292
    // Seed values for different steps of the query execution should use different seeds
293
    // to prevent accidental key collisions. (See IMPALA-219 for more details).
294
    static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) {
295
#ifdef __SSE4_2__
296
297
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
298
            return crc_hash(data, bytes, seed);
299
        } else {
300
            return fnv_hash(data, bytes, seed);
301
        }
302
303
#else
304
        return fnv_hash(data, bytes, seed);
305
#endif
306
    }
307
308
2.29k
    static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) {
309
#ifdef _SSE4_2_
310
        if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
311
            return crc_hash64(data, bytes, seed);
312
313
        } else {
314
            uint64_t hash = 0;
315
            murmur_hash3_x64_64(data, bytes, seed, &hash);
316
            return hash;
317
        }
318
#else
319
2.29k
        uint64_t hash = 0;
320
2.29k
        murmur_hash3_x64_64(data, bytes, seed, &hash);
321
2.29k
        return hash;
322
2.29k
#endif
323
2.29k
    }
324
    // hash_combine is the same with boost hash_combine,
325
    // except replace boost::hash with std::hash
326
    template <class T>
327
    static inline void hash_combine(std::size_t& seed, const T& v) {
328
        std::hash<T> hasher;
329
        seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
330
    }
331
332
#if defined(__clang__)
333
#pragma clang diagnostic push
334
#pragma clang diagnostic ignored "-Wused-but-marked-unused"
335
#endif
336
    // xxHash function for a byte array.  For convenience, a 64-bit seed is also
337
    // hashed into the result.  The mapping may change from time to time.
338
    static xxh_u32 xxHash32WithSeed(const char* s, size_t len, xxh_u32 seed) {
339
        return XXH32(s, len, seed);
340
    }
341
342
    // same to the up function, just for null value
343
0
    static xxh_u32 xxHash32NullWithSeed(xxh_u32 seed) {
344
0
        static const int INT_VALUE = 0;
345
0
        return XXH32(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
346
0
    }
347
348
4.43k
    static xxh_u64 xxHash64WithSeed(const char* s, size_t len, xxh_u64 seed) {
349
4.43k
        return XXH3_64bits_withSeed(s, len, seed);
350
4.43k
    }
351
352
    // same to the up function, just for null value
353
    static xxh_u64 xxHash64NullWithSeed(xxh_u64 seed) {
354
        static const int INT_VALUE = 0;
355
        return XXH3_64bits_withSeed(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed);
356
    }
357
358
#if defined(__clang__)
359
#pragma clang diagnostic pop
360
#endif
361
};
362
363
} // namespace doris
364
365
template <>
366
struct std::hash<doris::TUniqueId> {
367
    std::size_t operator()(const doris::TUniqueId& id) const {
368
        std::size_t seed = 0;
369
        seed = doris::HashUtil::hash(&id.lo, sizeof(id.lo), seed);
370
        seed = doris::HashUtil::hash(&id.hi, sizeof(id.hi), seed);
371
        return seed;
372
    }
373
};
374
375
template <>
376
struct std::hash<doris::TNetworkAddress> {
377
    size_t operator()(const doris::TNetworkAddress& address) const {
378
        std::size_t seed = 0;
379
        seed = doris::HashUtil::hash(address.hostname.data(), address.hostname.size(), seed);
380
        seed = doris::HashUtil::hash(&address.port, 4, seed);
381
        return seed;
382
    }
383
};
384
385
template <>
386
struct std::hash<std::pair<doris::TUniqueId, int64_t>> {
387
0
    size_t operator()(const std::pair<doris::TUniqueId, int64_t>& pair) const {
388
0
        size_t seed = 0;
389
0
        seed = doris::HashUtil::hash(&pair.first.lo, sizeof(pair.first.lo), seed);
390
0
        seed = doris::HashUtil::hash(&pair.first.hi, sizeof(pair.first.hi), seed);
391
0
        seed = doris::HashUtil::hash(&pair.second, sizeof(pair.second), seed);
392
0
        return seed;
393
0
    }
394
};
395
396
template <class First, class Second>
397
struct std::hash<std::pair<First, Second>> {
398
    size_t operator()(const pair<First, Second>& p) const {
399
        size_t h1 = std::hash<First>()(p.first);
400
        size_t h2 = std::hash<Second>()(p.second);
401
        return util_hash::HashLen16(h1, h2);
402
    }
403
};