/root/doris/be/src/gutil/hash/hash.h

Source (jump to first uncovered line)
//
// Copyright (C) 1999 and onwards Google, Inc.
//
//
// This file contains routines for hashing and fingerprinting.
//
// A hash function takes an arbitrary input bitstring (string, char*,
// number) and turns it into a hash value (a fixed-size number) such
// that unequal input values have a high likelihood of generating
// unequal hash values.  A fingerprint is a hash whose design is
// biased towards avoiding hash collisions, possibly at the expense of
// other characteristics such as execution speed.
//
// In general, if you are only using the hash values inside a single
// executable -- you're not writing the values to disk, and you don't
// depend on another instance of your program, running on another
// machine, generating the same hash values as you -- you want to use
// a HASH.  Otherwise, you want to use a FINGERPRINT.
//
// RECOMMENDED HASH FOR STRINGS:    GoodFastHash
//
// It is a functor, so you can use it like this:
//     hash_map<string, xxx, GoodFastHash<string> >
//     hash_set<char *, GoodFastHash<char*> >
//
// RECOMMENDED HASH FOR NUMBERS:    hash<>
//
// Note that this is likely the identity hash, so if your
// numbers are "non-random" (especially in the low bits), another
// choice is better.  You can use it like this:
//     hash_map<int, xxx>
//     hash_set<uint64>
//
// RECOMMENDED HASH FOR POINTERS:    hash<>
//
// This is also likely the identity hash.
//
// RECOMMENDED HASH FOR STRUCTS:    hash<some_fingerprint(struct)>
//
// Take a fingerprint of the struct, and use that as the key.
// For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) };
//    uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data),
//                                     sizeof(hash_data));
//    hash_map[fprint] = whatever;
//
// RECOMMENDED FINGERPRINT:         Fingerprint2011
//
// (In util/hash/fingerprint2011.h)
// In particular, do *not* use Fingerprint in new code; it has
// problems with excess collisions.
//
// OTHER HASHES AND FINGERPRINTS:
//
//
// The wiki page also has good advice for when to use a fingerprint vs
// a hash.
//
//
// Note: if your file declares hash_map<string, ...> or
// hash_set<string>, it will use the default hash function,
// hash<string>.  This is not a great choice.  Always provide an
// explicit functor, such as GoodFastHash, as a template argument.
// (Either way, you will need to #include this file to get the
// necessary definition.)
//
// Some of the hash functions below are documented to be fixed
// forever; the rest (whether they're documented as so or not) may
// change over time.  If you require a hash function that does not
// change over time, you should have unittests enforcing this
// property.  We already have several such functions; see
// hash_unittest.cc for the details and unittests.

#pragma once

#include <stddef.h>
#include <string.h>
#include <string>
#include <cstddef>
#include <functional>
#include <iterator>
#include <string_view>

#include "gutil/hash/hash128to64.h"
#include "gutil/hash/jenkins.h"
#include "gutil/hash/jenkins_lookup2.h"
#include "gutil/hash/legacy_hash.h"
#include "gutil/hash/string_hash.h"
#include "gutil/int128.h"
#include "gutil/integral_types.h"
#include "gutil/hash/builtin_type_hash.h"

// ----------------------------------------------------------------------
// Fingerprint()
//   Not recommended for new code.  Instead, use Fingerprint2011(),
//   a higher-quality and faster hash function.  See fingerprint2011.h.
//
//   Fingerprinting a string (or char*) will never return 0 or 1,
//   in case you want a couple of special values.  However,
//   fingerprinting a numeric type may produce 0 or 1.
//
//   The hash mapping of Fingerprint() will never change.
//
//   Note: AVOID USING FINGERPRINT if at all possible.  Use
//   Fingerprint2011 (in fingerprint2011.h) instead.
//   Fingerprint() is susceptible to collisions for even short
//   strings with low edit distance; see
//   Example collisions:
//     "01056/02" vs. "11057/02"
//     "LTA 02" vs. "MTA 12"
//   The same study found only one collision each for CityHash64() and
//   MurmurHash64(), from more than 2^32 inputs, and on medium-length
//   strings with large edit distances.These issues, among others,
//   led to the recommendation that new code should avoid Fingerprint().
// ----------------------------------------------------------------------
extern uint64 FingerprintReferenceImplementation(const char* s, uint32 len);
extern uint64 FingerprintInterleavedImplementation(const char* s, uint32 len);
inline uint64 Fingerprint(const char* s, uint32 len) {
    if constexpr (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers.
        // The better choice when we have a decent number of registers.
        return FingerprintInterleavedImplementation(s, len);
    } else {
        return FingerprintReferenceImplementation(s, len);
    }
}

// Routine that combines the hi/lo part of a fingerprint
// and changes the result appropriately to avoid returning 0/1.
inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) {
    uint64 result = (static_cast<uint64>(hi) << 32) | static_cast<uint64>(lo);
    if ((hi == 0) && (lo < 2)) {
        result ^= GG_ULONGLONG(0x130f9bef94a0a928);
    }
    return result;
}

inline uint64 Fingerprint(const std::string& s) {
    return Fingerprint(s.data(), static_cast<uint32>(s.size()));
}
inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) {
    return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c);
}
inline uint64 Fingerprint(schar c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(char c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(uint16 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(int16 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(uint32 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(int32 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(uint64 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}
inline uint64 Fingerprint(int64 c) {
    return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
}

// This concatenates two 64-bit fingerprints. It is a convenience function to
// get a fingerprint for a combination of already fingerprinted components.
// It assumes that each input is already a good fingerprint itself.
// Note that this is legacy code and new code should use its replacement
// FingerprintCat2011().
//
// Note that in general it's impossible to construct Fingerprint(str)
// from the fingerprints of substrings of str.  One shouldn't expect
// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate
// anything about Fingerprint(StrCat(x, y)).
inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) {
    return Hash64NumWithSeed(fp1, fp2);
}

// This intended to be a "good" hash function.  It may change from time to time.
template <>
struct std::hash<uint128> {
    size_t operator()(const uint128& x) const {
        if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers.
            return Hash128to64(x);
        } else {
            uint32 a = static_cast<uint32>(Uint128Low64(x)) + static_cast<uint32>(0x9e3779b9UL);
            uint32 b =
                    static_cast<uint32>(Uint128Low64(x) >> 32) + static_cast<uint32>(0x9e3779b9UL);
            uint32 c = static_cast<uint32>(Uint128High64(x)) + MIX32;
            mix(a, b, c);
            a += static_cast<uint32>(Uint128High64(x) >> 32);
            mix(a, b, c);
            return c;
        }
    }
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

// Hasher for STL pairs. Requires hashers for both members to be defined
template <class First, class Second>
struct std::hash<std::pair<First, Second> > {
    size_t operator()(const pair<First, Second>& p) const {
        size_t h1 = std::hash<First>()(p.first);
        size_t h2 = std::hash<Second>()(p.second);
        // The decision below is at compile time
        return (sizeof(h1) <= sizeof(uint32)) ? Hash32NumWithSeed(h1, h2)
                                              : Hash64NumWithSeed(h1, h2);
    }
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

// If you want an excellent string hash function, and you don't mind if it
// might change when you sync and recompile, please use GoodFastHash<>.
// For most applications, GoodFastHash<> is a good choice, better than
// hash<string> or hash<char*> or similar.  GoodFastHash<> can change
// from time to time and may differ across platforms, and we'll strive
// to keep improving it.
//
// By the way, when deleting the contents of a hash_set of pointers, it is
// unsafe to delete *iterator because the hash function may be called on
// the next iterator advance.  Use STLDeleteContainerPointers().

template <class X>
struct GoodFastHash;

// This intended to be a "good" hash function.  It may change from time to time.
template <>
struct GoodFastHash<char*> {
    size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
    // Less than operator for MSVC.
    bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
    static const size_t bucket_size = 4; // These are required by MSVC
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

// This intended to be a "good" hash function.  It may change from time to time.
template <>
struct GoodFastHash<const char*> {
    size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
    // Less than operator for MSVC.
    bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
    static const size_t bucket_size = 4; // These are required by MSVC
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

// This intended to be a "good" hash function.  It may change from time to time.
template <class _CharT, class _Traits, class _Alloc>
struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > {
    size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
        return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
    }
    // Less than operator for MSVC.
    bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
                    const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
        return a < b;
    }
    static const size_t bucket_size = 4; // These are required by MSVC
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

// This intended to be a "good" hash function.  It may change from time to time.
template <class _CharT, class _Traits, class _Alloc>
struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > {
    size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
        return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
    }
    // Less than operator for MSVC.
    bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
                    const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
        return a < b;
    }
    static const size_t bucket_size = 4; // These are required by MSVC
    static const size_t min_buckets = 8; // 4 and 8 are defaults.
};

Coverage Report

Created: 2025-06-08 11:30

Line	Count	Source (jump to first uncovered line)
1		//
2		// Copyright (C) 1999 and onwards Google, Inc.
3		//
4		//
5		// This file contains routines for hashing and fingerprinting.
6		//
7		// A hash function takes an arbitrary input bitstring (string, char*,
8		// number) and turns it into a hash value (a fixed-size number) such
9		// that unequal input values have a high likelihood of generating
10		// unequal hash values. A fingerprint is a hash whose design is
11		// biased towards avoiding hash collisions, possibly at the expense of
12		// other characteristics such as execution speed.
13		//
14		// In general, if you are only using the hash values inside a single
15		// executable -- you're not writing the values to disk, and you don't
16		// depend on another instance of your program, running on another
17		// machine, generating the same hash values as you -- you want to use
18		// a HASH. Otherwise, you want to use a FINGERPRINT.
19		//
20		// RECOMMENDED HASH FOR STRINGS: GoodFastHash
21		//
22		// It is a functor, so you can use it like this:
23		// hash_map<string, xxx, GoodFastHash<string> >
24		// hash_set<char , GoodFastHash<char> >
25		//
26		// RECOMMENDED HASH FOR NUMBERS: hash<>
27		//
28		// Note that this is likely the identity hash, so if your
29		// numbers are "non-random" (especially in the low bits), another
30		// choice is better. You can use it like this:
31		// hash_map<int, xxx>
32		// hash_set<uint64>
33		//
34		// RECOMMENDED HASH FOR POINTERS: hash<>
35		//
36		// This is also likely the identity hash.
37		//
38		// RECOMMENDED HASH FOR STRUCTS: hash<some_fingerprint(struct)>
39		//
40		// Take a fingerprint of the struct, and use that as the key.
41		// For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) };
42		// uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data),
43		// sizeof(hash_data));
44		// hash_map[fprint] = whatever;
45		//
46		// RECOMMENDED FINGERPRINT: Fingerprint2011
47		//
48		// (In util/hash/fingerprint2011.h)
49		// In particular, do not use Fingerprint in new code; it has
50		// problems with excess collisions.
51		//
52		// OTHER HASHES AND FINGERPRINTS:
53		//
54		//
55		// The wiki page also has good advice for when to use a fingerprint vs
56		// a hash.
57		//
58		//
59		// Note: if your file declares hash_map<string, ...> or
60		// hash_set<string>, it will use the default hash function,
61		// hash<string>. This is not a great choice. Always provide an
62		// explicit functor, such as GoodFastHash, as a template argument.
63		// (Either way, you will need to #include this file to get the
64		// necessary definition.)
65		//
66		// Some of the hash functions below are documented to be fixed
67		// forever; the rest (whether they're documented as so or not) may
68		// change over time. If you require a hash function that does not
69		// change over time, you should have unittests enforcing this
70		// property. We already have several such functions; see
71		// hash_unittest.cc for the details and unittests.
72
73		#pragma once
74
75		#include <stddef.h>
76		#include <string.h>
77		#include <string>
78		#include <cstddef>
79		#include <functional>
80		#include <iterator>
81		#include <string_view>
82
83		#include "gutil/hash/hash128to64.h"
84		#include "gutil/hash/jenkins.h"
85		#include "gutil/hash/jenkins_lookup2.h"
86		#include "gutil/hash/legacy_hash.h"
87		#include "gutil/hash/string_hash.h"
88		#include "gutil/int128.h"
89		#include "gutil/integral_types.h"
90		#include "gutil/hash/builtin_type_hash.h"
91
92		// ----------------------------------------------------------------------
93		// Fingerprint()
94		// Not recommended for new code. Instead, use Fingerprint2011(),
95		// a higher-quality and faster hash function. See fingerprint2011.h.
96		//
97		// Fingerprinting a string (or char*) will never return 0 or 1,
98		// in case you want a couple of special values. However,
99		// fingerprinting a numeric type may produce 0 or 1.
100		//
101		// The hash mapping of Fingerprint() will never change.
102		//
103		// Note: AVOID USING FINGERPRINT if at all possible. Use
104		// Fingerprint2011 (in fingerprint2011.h) instead.
105		// Fingerprint() is susceptible to collisions for even short
106		// strings with low edit distance; see
107		// Example collisions:
108		// "01056/02" vs. "11057/02"
109		// "LTA 02" vs. "MTA 12"
110		// The same study found only one collision each for CityHash64() and
111		// MurmurHash64(), from more than 2^32 inputs, and on medium-length
112		// strings with large edit distances.These issues, among others,
113		// led to the recommendation that new code should avoid Fingerprint().
114		// ----------------------------------------------------------------------
115		extern uint64 FingerprintReferenceImplementation(const char* s, uint32 len);
116		extern uint64 FingerprintInterleavedImplementation(const char* s, uint32 len);
117		inline uint64 Fingerprint(const char* s, uint32 len) {
118		if constexpr (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers.
119		// The better choice when we have a decent number of registers.
120		return FingerprintInterleavedImplementation(s, len);
121		} else {
122		return FingerprintReferenceImplementation(s, len);
123		}
124		}
125
126		// Routine that combines the hi/lo part of a fingerprint
127		// and changes the result appropriately to avoid returning 0/1.
128	7	inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) {
129	7	uint64 result = (static_cast<uint64>(hi) << 32) \| static_cast<uint64>(lo);
130	7	if ((hi == 0) && (lo < 2)) {
131	0	result ^= GG_ULONGLONG(0x130f9bef94a0a928);
132	0	}
133	7	return result;
134	7	}
135
136		inline uint64 Fingerprint(const std::string& s) {
137		return Fingerprint(s.data(), static_cast<uint32>(s.size()));
138		}
139	0	inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) {
140	0	return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c);
141	0	}
142	0	inline uint64 Fingerprint(schar c) {
143	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
144	0	}
145	0	inline uint64 Fingerprint(char c) {
146	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
147	0	}
148	0	inline uint64 Fingerprint(uint16 c) {
149	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
150	0	}
151	0	inline uint64 Fingerprint(int16 c) {
152	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
153	0	}
154	0	inline uint64 Fingerprint(uint32 c) {
155	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
156	0	}
157		inline uint64 Fingerprint(int32 c) {
158		return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
159		}
160	0	inline uint64 Fingerprint(uint64 c) {
161	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
162	0	}
163	0	inline uint64 Fingerprint(int64 c) {
164	0	return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
165	0	}
166
167		// This concatenates two 64-bit fingerprints. It is a convenience function to
168		// get a fingerprint for a combination of already fingerprinted components.
169		// It assumes that each input is already a good fingerprint itself.
170		// Note that this is legacy code and new code should use its replacement
171		// FingerprintCat2011().
172		//
173		// Note that in general it's impossible to construct Fingerprint(str)
174		// from the fingerprints of substrings of str. One shouldn't expect
175		// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate
176		// anything about Fingerprint(StrCat(x, y)).
177	0	inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) {
178	0	return Hash64NumWithSeed(fp1, fp2);
179	0	}
180
181		// This intended to be a "good" hash function. It may change from time to time.
182		template <>
183		struct std::hash<uint128> {
184	0	size_t operator()(const uint128& x) const {
185	0	if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers.
186	0	return Hash128to64(x);
187	0	} else {
188	0	uint32 a = static_cast<uint32>(Uint128Low64(x)) + static_cast<uint32>(0x9e3779b9UL);
189	0	uint32 b =
190	0	static_cast<uint32>(Uint128Low64(x) >> 32) + static_cast<uint32>(0x9e3779b9UL);
191	0	uint32 c = static_cast<uint32>(Uint128High64(x)) + MIX32;
192	0	mix(a, b, c);
193	0	a += static_cast<uint32>(Uint128High64(x) >> 32);
194	0	mix(a, b, c);
195	0	return c;
196	0	}
197	0	}
198		static const size_t min_buckets = 8; // 4 and 8 are defaults.
199		};
200
201		// Hasher for STL pairs. Requires hashers for both members to be defined
202		template <class First, class Second>
203		struct std::hash<std::pair<First, Second> > {
204		size_t operator()(const pair<First, Second>& p) const {
205		size_t h1 = std::hash<First>()(p.first);
206		size_t h2 = std::hash<Second>()(p.second);
207		// The decision below is at compile time
208		return (sizeof(h1) <= sizeof(uint32)) ? Hash32NumWithSeed(h1, h2)
209		: Hash64NumWithSeed(h1, h2);
210		}
211		static const size_t min_buckets = 8; // 4 and 8 are defaults.
212		};
213
214		// If you want an excellent string hash function, and you don't mind if it
215		// might change when you sync and recompile, please use GoodFastHash<>.
216		// For most applications, GoodFastHash<> is a good choice, better than
217		// hash<string> or hash<char*> or similar. GoodFastHash<> can change
218		// from time to time and may differ across platforms, and we'll strive
219		// to keep improving it.
220		//
221		// By the way, when deleting the contents of a hash_set of pointers, it is
222		// unsafe to delete *iterator because the hash function may be called on
223		// the next iterator advance. Use STLDeleteContainerPointers().
224
225		template <class X>
226		struct GoodFastHash;
227
228		// This intended to be a "good" hash function. It may change from time to time.
229		template <>
230		struct GoodFastHash<char*> {
231	0	size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
232		// Less than operator for MSVC.
233	0	bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
234		static const size_t bucket_size = 4; // These are required by MSVC
235		static const size_t min_buckets = 8; // 4 and 8 are defaults.
236		};
237
238		// This intended to be a "good" hash function. It may change from time to time.
239		template <>
240		struct GoodFastHash<const char*> {
241	0	size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); }
242		// Less than operator for MSVC.
243	0	bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; }
244		static const size_t bucket_size = 4; // These are required by MSVC
245		static const size_t min_buckets = 8; // 4 and 8 are defaults.
246		};
247
248		// This intended to be a "good" hash function. It may change from time to time.
249		template <class _CharT, class _Traits, class _Alloc>
250		struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > {
251		size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
252		return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
253		}
254		// Less than operator for MSVC.
255		bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
256		const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
257		return a < b;
258		}
259		static const size_t bucket_size = 4; // These are required by MSVC
260		static const size_t min_buckets = 8; // 4 and 8 are defaults.
261		};
262
263		// This intended to be a "good" hash function. It may change from time to time.
264		template <class _CharT, class _Traits, class _Alloc>
265		struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > {
266		size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
267		return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
268		}
269		// Less than operator for MSVC.
270		bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
271		const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
272		return a < b;
273		}
274		static const size_t bucket_size = 4; // These are required by MSVC
275		static const size_t min_buckets = 8; // 4 and 8 are defaults.
276		};