/root/doris/be/src/gutil/hash/hash.h
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // Copyright (C) 1999 and onwards Google, Inc. |
3 | | // |
4 | | // |
5 | | // This file contains routines for hashing and fingerprinting. |
6 | | // |
7 | | // A hash function takes an arbitrary input bitstring (string, char*, |
8 | | // number) and turns it into a hash value (a fixed-size number) such |
9 | | // that unequal input values have a high likelihood of generating |
10 | | // unequal hash values. A fingerprint is a hash whose design is |
11 | | // biased towards avoiding hash collisions, possibly at the expense of |
12 | | // other characteristics such as execution speed. |
13 | | // |
14 | | // In general, if you are only using the hash values inside a single |
15 | | // executable -- you're not writing the values to disk, and you don't |
16 | | // depend on another instance of your program, running on another |
17 | | // machine, generating the same hash values as you -- you want to use |
18 | | // a HASH. Otherwise, you want to use a FINGERPRINT. |
19 | | // |
20 | | // RECOMMENDED HASH FOR STRINGS: GoodFastHash |
21 | | // |
22 | | // It is a functor, so you can use it like this: |
23 | | // hash_map<string, xxx, GoodFastHash<string> > |
24 | | // hash_set<char *, GoodFastHash<char*> > |
25 | | // |
26 | | // RECOMMENDED HASH FOR NUMBERS: hash<> |
27 | | // |
28 | | // Note that this is likely the identity hash, so if your |
29 | | // numbers are "non-random" (especially in the low bits), another |
30 | | // choice is better. You can use it like this: |
31 | | // hash_map<int, xxx> |
32 | | // hash_set<uint64> |
33 | | // |
34 | | // RECOMMENDED HASH FOR POINTERS: hash<> |
35 | | // |
36 | | // This is also likely the identity hash. |
37 | | // |
38 | | // RECOMMENDED HASH FOR STRUCTS: hash<some_fingerprint(struct)> |
39 | | // |
40 | | // Take a fingerprint of the struct, and use that as the key. |
41 | | // For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) }; |
42 | | // uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data), |
43 | | // sizeof(hash_data)); |
44 | | // hash_map[fprint] = whatever; |
45 | | // |
46 | | // RECOMMENDED FINGERPRINT: Fingerprint2011 |
47 | | // |
48 | | // (In util/hash/fingerprint2011.h) |
49 | | // In particular, do *not* use Fingerprint in new code; it has |
50 | | // problems with excess collisions. |
51 | | // |
52 | | // OTHER HASHES AND FINGERPRINTS: |
53 | | // |
54 | | // |
55 | | // The wiki page also has good advice for when to use a fingerprint vs |
56 | | // a hash. |
57 | | // |
58 | | // |
59 | | // Note: if your file declares hash_map<string, ...> or |
60 | | // hash_set<string>, it will use the default hash function, |
61 | | // hash<string>. This is not a great choice. Always provide an |
62 | | // explicit functor, such as GoodFastHash, as a template argument. |
63 | | // (Either way, you will need to #include this file to get the |
64 | | // necessary definition.) |
65 | | // |
66 | | // Some of the hash functions below are documented to be fixed |
67 | | // forever; the rest (whether they're documented as so or not) may |
68 | | // change over time. If you require a hash function that does not |
69 | | // change over time, you should have unittests enforcing this |
70 | | // property. We already have several such functions; see |
71 | | // hash_unittest.cc for the details and unittests. |
72 | | |
73 | | #pragma once |
74 | | |
75 | | #include <stddef.h> |
76 | | #include <string.h> |
77 | | #include <string> |
78 | | #include <cstddef> |
79 | | #include <functional> |
80 | | #include <iterator> |
81 | | #include <string_view> |
82 | | |
83 | | #include "gutil/hash/hash128to64.h" |
84 | | #include "gutil/hash/jenkins.h" |
85 | | #include "gutil/hash/jenkins_lookup2.h" |
86 | | #include "gutil/hash/legacy_hash.h" |
87 | | #include "gutil/hash/string_hash.h" |
88 | | #include "gutil/int128.h" |
89 | | #include "gutil/integral_types.h" |
90 | | #include "gutil/hash/builtin_type_hash.h" |
91 | | |
92 | | // ---------------------------------------------------------------------- |
93 | | // Fingerprint() |
94 | | // Not recommended for new code. Instead, use Fingerprint2011(), |
95 | | // a higher-quality and faster hash function. See fingerprint2011.h. |
96 | | // |
97 | | // Fingerprinting a string (or char*) will never return 0 or 1, |
98 | | // in case you want a couple of special values. However, |
99 | | // fingerprinting a numeric type may produce 0 or 1. |
100 | | // |
101 | | // The hash mapping of Fingerprint() will never change. |
102 | | // |
103 | | // Note: AVOID USING FINGERPRINT if at all possible. Use |
104 | | // Fingerprint2011 (in fingerprint2011.h) instead. |
105 | | // Fingerprint() is susceptible to collisions for even short |
106 | | // strings with low edit distance; see |
107 | | // Example collisions: |
108 | | // "01056/02" vs. "11057/02" |
109 | | // "LTA 02" vs. "MTA 12" |
110 | | // The same study found only one collision each for CityHash64() and |
111 | | // MurmurHash64(), from more than 2^32 inputs, and on medium-length |
112 | | // strings with large edit distances.These issues, among others, |
113 | | // led to the recommendation that new code should avoid Fingerprint(). |
114 | | // ---------------------------------------------------------------------- |
115 | | extern uint64 FingerprintReferenceImplementation(const char* s, uint32 len); |
116 | | extern uint64 FingerprintInterleavedImplementation(const char* s, uint32 len); |
117 | | inline uint64 Fingerprint(const char* s, uint32 len) { |
118 | | if constexpr (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers. |
119 | | // The better choice when we have a decent number of registers. |
120 | | return FingerprintInterleavedImplementation(s, len); |
121 | | } else { |
122 | | return FingerprintReferenceImplementation(s, len); |
123 | | } |
124 | | } |
125 | | |
126 | | // Routine that combines the hi/lo part of a fingerprint |
127 | | // and changes the result appropriately to avoid returning 0/1. |
128 | 7 | inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) { |
129 | 7 | uint64 result = (static_cast<uint64>(hi) << 32) | static_cast<uint64>(lo); |
130 | 7 | if ((hi == 0) && (lo < 2)) { |
131 | 0 | result ^= GG_ULONGLONG(0x130f9bef94a0a928); |
132 | 0 | } |
133 | 7 | return result; |
134 | 7 | } |
135 | | |
136 | | inline uint64 Fingerprint(const std::string& s) { |
137 | | return Fingerprint(s.data(), static_cast<uint32>(s.size())); |
138 | | } |
139 | 0 | inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) { |
140 | 0 | return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c); |
141 | 0 | } |
142 | 0 | inline uint64 Fingerprint(schar c) { |
143 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
144 | 0 | } |
145 | 0 | inline uint64 Fingerprint(char c) { |
146 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
147 | 0 | } |
148 | 0 | inline uint64 Fingerprint(uint16 c) { |
149 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
150 | 0 | } |
151 | 0 | inline uint64 Fingerprint(int16 c) { |
152 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
153 | 0 | } |
154 | 0 | inline uint64 Fingerprint(uint32 c) { |
155 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
156 | 0 | } |
157 | | inline uint64 Fingerprint(int32 c) { |
158 | | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
159 | | } |
160 | 0 | inline uint64 Fingerprint(uint64 c) { |
161 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
162 | 0 | } |
163 | 0 | inline uint64 Fingerprint(int64 c) { |
164 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
165 | 0 | } |
166 | | |
167 | | // This concatenates two 64-bit fingerprints. It is a convenience function to |
168 | | // get a fingerprint for a combination of already fingerprinted components. |
169 | | // It assumes that each input is already a good fingerprint itself. |
170 | | // Note that this is legacy code and new code should use its replacement |
171 | | // FingerprintCat2011(). |
172 | | // |
173 | | // Note that in general it's impossible to construct Fingerprint(str) |
174 | | // from the fingerprints of substrings of str. One shouldn't expect |
175 | | // FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate |
176 | | // anything about Fingerprint(StrCat(x, y)). |
177 | 0 | inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) { |
178 | 0 | return Hash64NumWithSeed(fp1, fp2); |
179 | 0 | } |
180 | | |
181 | | // This intended to be a "good" hash function. It may change from time to time. |
182 | | template <> |
183 | | struct std::hash<uint128> { |
184 | 0 | size_t operator()(const uint128& x) const { |
185 | 0 | if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers. |
186 | 0 | return Hash128to64(x); |
187 | 0 | } else { |
188 | 0 | uint32 a = static_cast<uint32>(Uint128Low64(x)) + static_cast<uint32>(0x9e3779b9UL); |
189 | 0 | uint32 b = |
190 | 0 | static_cast<uint32>(Uint128Low64(x) >> 32) + static_cast<uint32>(0x9e3779b9UL); |
191 | 0 | uint32 c = static_cast<uint32>(Uint128High64(x)) + MIX32; |
192 | 0 | mix(a, b, c); |
193 | 0 | a += static_cast<uint32>(Uint128High64(x) >> 32); |
194 | 0 | mix(a, b, c); |
195 | 0 | return c; |
196 | 0 | } |
197 | 0 | } |
198 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
199 | | }; |
200 | | |
201 | | // Hasher for STL pairs. Requires hashers for both members to be defined |
202 | | template <class First, class Second> |
203 | | struct std::hash<std::pair<First, Second> > { |
204 | | size_t operator()(const pair<First, Second>& p) const { |
205 | | size_t h1 = std::hash<First>()(p.first); |
206 | | size_t h2 = std::hash<Second>()(p.second); |
207 | | // The decision below is at compile time |
208 | | return (sizeof(h1) <= sizeof(uint32)) ? Hash32NumWithSeed(h1, h2) |
209 | | : Hash64NumWithSeed(h1, h2); |
210 | | } |
211 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
212 | | }; |
213 | | |
214 | | // If you want an excellent string hash function, and you don't mind if it |
215 | | // might change when you sync and recompile, please use GoodFastHash<>. |
216 | | // For most applications, GoodFastHash<> is a good choice, better than |
217 | | // hash<string> or hash<char*> or similar. GoodFastHash<> can change |
218 | | // from time to time and may differ across platforms, and we'll strive |
219 | | // to keep improving it. |
220 | | // |
221 | | // By the way, when deleting the contents of a hash_set of pointers, it is |
222 | | // unsafe to delete *iterator because the hash function may be called on |
223 | | // the next iterator advance. Use STLDeleteContainerPointers(). |
224 | | |
225 | | template <class X> |
226 | | struct GoodFastHash; |
227 | | |
228 | | // This intended to be a "good" hash function. It may change from time to time. |
229 | | template <> |
230 | | struct GoodFastHash<char*> { |
231 | 0 | size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); } |
232 | | // Less than operator for MSVC. |
233 | 0 | bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; } |
234 | | static const size_t bucket_size = 4; // These are required by MSVC |
235 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
236 | | }; |
237 | | |
238 | | // This intended to be a "good" hash function. It may change from time to time. |
239 | | template <> |
240 | | struct GoodFastHash<const char*> { |
241 | 0 | size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); } |
242 | | // Less than operator for MSVC. |
243 | 0 | bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; } |
244 | | static const size_t bucket_size = 4; // These are required by MSVC |
245 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
246 | | }; |
247 | | |
248 | | // This intended to be a "good" hash function. It may change from time to time. |
249 | | template <class _CharT, class _Traits, class _Alloc> |
250 | | struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > { |
251 | | size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { |
252 | | return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); |
253 | | } |
254 | | // Less than operator for MSVC. |
255 | | bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, |
256 | | const std::basic_string<_CharT, _Traits, _Alloc>& b) const { |
257 | | return a < b; |
258 | | } |
259 | | static const size_t bucket_size = 4; // These are required by MSVC |
260 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
261 | | }; |
262 | | |
263 | | // This intended to be a "good" hash function. It may change from time to time. |
264 | | template <class _CharT, class _Traits, class _Alloc> |
265 | | struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > { |
266 | | size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { |
267 | | return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); |
268 | | } |
269 | | // Less than operator for MSVC. |
270 | | bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, |
271 | | const std::basic_string<_CharT, _Traits, _Alloc>& b) const { |
272 | | return a < b; |
273 | | } |
274 | | static const size_t bucket_size = 4; // These are required by MSVC |
275 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
276 | | }; |