Coverage Report

Created: 2025-06-08 11:30

/root/doris/be/src/gutil/hash/hash.h
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright (C) 1999 and onwards Google, Inc.
3
//
4
//
5
// This file contains routines for hashing and fingerprinting.
6
//
7
// A hash function takes an arbitrary input bitstring (string, char*,
8
// number) and turns it into a hash value (a fixed-size number) such
9
// that unequal input values have a high likelihood of generating
10
// unequal hash values.  A fingerprint is a hash whose design is
11
// biased towards avoiding hash collisions, possibly at the expense of
12
// other characteristics such as execution speed.
13
//
14
// In general, if you are only using the hash values inside a single
15
// executable -- you're not writing the values to disk, and you don't
16
// depend on another instance of your program, running on another
17
// machine, generating the same hash values as you -- you want to use
18
// a HASH.  Otherwise, you want to use a FINGERPRINT.
19
//
20
// RECOMMENDED HASH FOR STRINGS:    GoodFastHash
21
//
22
// It is a functor, so you can use it like this:
23
//     hash_map<string, xxx, GoodFastHash<string> >
24
//     hash_set<char *, GoodFastHash<char*> >
25
//
26
// RECOMMENDED HASH FOR NUMBERS:    hash<>
27
//
28
// Note that this is likely the identity hash, so if your
29
// numbers are "non-random" (especially in the low bits), another
30
// choice is better.  You can use it like this:
31
//     hash_map<int, xxx>
32
//     hash_set<uint64>
33
//
34
// RECOMMENDED HASH FOR POINTERS:    hash<>
35
//
36
// This is also likely the identity hash.
37
//
38
// RECOMMENDED HASH FOR STRUCTS:    hash<some_fingerprint(struct)>
39
//
40
// Take a fingerprint of the struct, and use that as the key.
41
// For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) };
42
//    uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data),
43
//                                     sizeof(hash_data));
44
//    hash_map[fprint] = whatever;
45
//
46
// RECOMMENDED FINGERPRINT:         Fingerprint2011
47
//
48
// (In util/hash/fingerprint2011.h)
49
// In particular, do *not* use Fingerprint in new code; it has
50
// problems with excess collisions.
51
//
52
// OTHER HASHES AND FINGERPRINTS:
53
//
54
//
55
// The wiki page also has good advice for when to use a fingerprint vs
56
// a hash.
57
//
58
//
59
// Note: if your file declares hash_map<string, ...> or
60
// hash_set<string>, it will use the default hash function,
61
// hash<string>.  This is not a great choice.  Always provide an
62
// explicit functor, such as GoodFastHash, as a template argument.
63
// (Either way, you will need to #include this file to get the
64
// necessary definition.)
65
//
66
// Some of the hash functions below are documented to be fixed
67
// forever; the rest (whether they're documented as so or not) may
68
// change over time.  If you require a hash function that does not
69
// change over time, you should have unittests enforcing this
70
// property.  We already have several such functions; see
71
// hash_unittest.cc for the details and unittests.
72
73
#pragma once
74
75
#include <stddef.h>
76
#include <string.h>
77
#include <string>
78
#include <cstddef>
79
#include <functional>
80
#include <iterator>
81
#include <string_view>
82
83
#include "gutil/hash/hash128to64.h"
84
#include "gutil/hash/jenkins.h"
85
#include "gutil/hash/jenkins_lookup2.h"
86
#include "gutil/hash/legacy_hash.h"
87
#include "gutil/hash/string_hash.h"
88
#include "gutil/int128.h"
89
#include "gutil/integral_types.h"
90
#include "gutil/hash/builtin_type_hash.h"
91
92
// ----------------------------------------------------------------------
93
// Fingerprint()
94
//   Not recommended for new code.  Instead, use Fingerprint2011(),
95
//   a higher-quality and faster hash function.  See fingerprint2011.h.
96
//
97
//   Fingerprinting a string (or char*) will never return 0 or 1,
98
//   in case you want a couple of special values.  However,
99
//   fingerprinting a numeric type may produce 0 or 1.
100
//
101
//   The hash mapping of Fingerprint() will never change.
102
//
103
//   Note: AVOID USING FINGERPRINT if at all possible.  Use
104
//   Fingerprint2011 (in fingerprint2011.h) instead.
105
//   Fingerprint() is susceptible to collisions for even short
106
//   strings with low edit distance; see
107
//   Example collisions:
108
//     "01056/02" vs. "11057/02"
109
//     "LTA 02" vs. "MTA 12"
110
//   The same study found only one collision each for CityHash64() and
111
//   MurmurHash64(), from more than 2^32 inputs, and on medium-length
112
//   strings with large edit distances.These issues, among others,
113
//   led to the recommendation that new code should avoid Fingerprint().
114
// ----------------------------------------------------------------------
115
extern uint64 FingerprintReferenceImplementation(const char* s, uint32 len);
116
extern uint64 FingerprintInterleavedImplementation(const char* s, uint32 len);
117
inline uint64 Fingerprint(const char* s, uint32 len) {
118
    if constexpr (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers.
119
        // The better choice when we have a decent number of registers.
120
        return FingerprintInterleavedImplementation(s, len);
121
    } else {
122
        return FingerprintReferenceImplementation(s, len);
123
    }
124
}
125
126
// Routine that combines the hi/lo part of a fingerprint
127
// and changes the result appropriately to avoid returning 0/1.
128
7
inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) {
129
7
    uint64 result = (static_cast<uint64>(hi) << 32) | static_cast<uint64>(lo);
130
7
    if ((hi == 0) && (lo < 2)) {
131
0
        result ^= GG_ULONGLONG(0x130f9bef94a0a928);
132
0
    }
133
7
    return result;
134
7
}
135
136
inline uint64 Fingerprint(const std::string& s) {
137
    return Fingerprint(s.data(), static_cast<uint32>(s.size()));
138
}
139
0
inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) {
140
0
    return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c);
141
0
}
142
0
inline uint64 Fingerprint(schar c) {
143
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
144
0
}
145
0
inline uint64 Fingerprint(char c) {
146
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
147
0
}
148
0
inline uint64 Fingerprint(uint16 c) {
149
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
150
0
}
151
0
inline uint64 Fingerprint(int16 c) {
152
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
153
0
}
154
0
inline uint64 Fingerprint(uint32 c) {
155
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
156
0
}
157
inline uint64 Fingerprint(int32 c) {
158
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
159
}
160
0
inline uint64 Fingerprint(uint64 c) {
161
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
162
0
}
163
0
inline uint64 Fingerprint(int64 c) {
164
0
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
165
0
}
166
167
// This concatenates two 64-bit fingerprints. It is a convenience function to
168
// get a fingerprint for a combination of already fingerprinted components.
169
// It assumes that each input is already a good fingerprint itself.
170
// Note that this is legacy code and new code should use its replacement
171
// FingerprintCat2011().
172
//
173
// Note that in general it's impossible to construct Fingerprint(str)
174
// from the fingerprints of substrings of str.  One shouldn't expect
175
// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate
176
// anything about Fingerprint(StrCat(x, y)).
177
0
inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) {
178
0
    return Hash64NumWithSeed(fp1, fp2);
179
0
}
180
181
// This intended to be a "good" hash function.  It may change from time to time.
182
template <>
183
struct std::hash<uint128> {
184
0
    size_t operator()(const uint128& x) const {
185
0
        if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers.
186
0
            return Hash128to64(x);
187
0
        } else {
188
0
            uint32 a = static_cast<uint32>(Uint128Low64(x)) + static_cast<uint32>(0x9e3779b9UL);
189
0
            uint32 b =
190
0
                    static_cast<uint32>(Uint128Low64(x) >> 32) + static_cast<uint32>(0x9e3779b9UL);
191
0
            uint32 c = static_cast<uint32>(Uint128High64(x)) + MIX32;
192
0
            mix(a, b, c);
193
0
            a += static_cast<uint32>(Uint128High64(x) >> 32);
194
0
            mix(a, b, c);
195
0
            return c;
196
0
        }
197
0
    }
198
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
199
};
200
201
// Hasher for STL pairs. Requires hashers for both members to be defined
202
template <class First, class Second>
203
struct std::hash<std::pair<First, Second> > {
204
    size_t operator()(const pair<First, Second>& p) const {
205
        size_t h1 = std::hash<First>()(p.first);
206
        size_t h2 = std::hash<Second>()(p.second);
207
        // The decision below is at compile time
208
        return (sizeof(h1) <= sizeof(uint32)) ? Hash32NumWithSeed(h1, h2)
209
                                              : Hash64NumWithSeed(h1, h2);
210
    }
211
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
212
};
213
214
// If you want an excellent string hash function, and you don't mind if it
215
// might change when you sync and recompile, please use GoodFastHash<>.
216
// For most applications, GoodFastHash<> is a good choice, better than
217
// hash<string> or hash<char*> or similar.  GoodFastHash<> can change
218
// from time to time and may differ across platforms, and we'll strive
219
// to keep improving it.
220
//
221
// By the way, when deleting the contents of a hash_set of pointers, it is
222
// unsafe to delete *iterator because the hash function may be called on
223
// the next iterator advance.  Use STLDeleteContainerPointers().
224
225
template <class X>
226
struct GoodFastHash;
227
228
// This intended to be a "good" hash function.  It may change from time to time.
229
template <>
230
struct GoodFastHash<char*> {
231
0
    size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
232
    // Less than operator for MSVC.
233
0
    bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
234
    static const size_t bucket_size = 4; // These are required by MSVC
235
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
236
};
237
238
// This intended to be a "good" hash function.  It may change from time to time.
239
template <>
240
struct GoodFastHash<const char*> {
241
0
    size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
242
    // Less than operator for MSVC.
243
0
    bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
244
    static const size_t bucket_size = 4; // These are required by MSVC
245
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
246
};
247
248
// This intended to be a "good" hash function.  It may change from time to time.
249
template <class _CharT, class _Traits, class _Alloc>
250
struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > {
251
    size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
252
        return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
253
    }
254
    // Less than operator for MSVC.
255
    bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
256
                    const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
257
        return a < b;
258
    }
259
    static const size_t bucket_size = 4; // These are required by MSVC
260
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
261
};
262
263
// This intended to be a "good" hash function.  It may change from time to time.
264
template <class _CharT, class _Traits, class _Alloc>
265
struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > {
266
    size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
267
        return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
268
    }
269
    // Less than operator for MSVC.
270
    bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
271
                    const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
272
        return a < b;
273
    }
274
    static const size_t bucket_size = 4; // These are required by MSVC
275
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
276
};