be/src/exec/common/sip_hash.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/SipHash.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | /** SipHash is a fast cryptographic hash function for short strings. |
24 | | * Taken from here: https://www.131002.net/siphash/ |
25 | | * |
26 | | * This is SipHash 2-4 variant. |
27 | | * |
28 | | * Two changes are made: |
29 | | * - returns also 128 bits, not only 64; |
30 | | * - done streaming (can be calculated in parts). |
31 | | * |
32 | | * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. |
33 | | * (~ 700 MB/sec, 15 million strings per second) |
34 | | */ |
35 | | |
36 | | #include <string> |
37 | | #include <type_traits> |
38 | | |
39 | | #include "common/compiler_util.h" // IWYU pragma: keep |
40 | | #include "core/types.h" |
41 | | #include "util/unaligned.h" |
42 | | |
43 | | namespace doris { |
44 | | #include "common/compile_check_begin.h" |
45 | 3.12G | #define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b)))) |
46 | | |
47 | | #define SIPROUND \ |
48 | 520M | do { \ |
49 | 520M | v0 += v1; \ |
50 | 520M | v1 = ROTL(v1, 13); \ |
51 | 520M | v1 ^= v0; \ |
52 | 520M | v0 = ROTL(v0, 32); \ |
53 | 520M | v2 += v3; \ |
54 | 520M | v3 = ROTL(v3, 16); \ |
55 | 520M | v3 ^= v2; \ |
56 | 520M | v0 += v3; \ |
57 | 520M | v3 = ROTL(v3, 21); \ |
58 | 520M | v3 ^= v0; \ |
59 | 520M | v2 += v1; \ |
60 | 520M | v1 = ROTL(v1, 17); \ |
61 | 520M | v1 ^= v2; \ |
62 | 520M | v2 = ROTL(v2, 32); \ |
63 | 520M | } while (0) |
64 | | |
65 | | class SipHash { |
66 | | private: |
67 | | /// State. |
68 | | UInt64 v0; |
69 | | UInt64 v1; |
70 | | UInt64 v2; |
71 | | UInt64 v3; |
72 | | |
73 | | /// How many bytes have been processed. |
74 | | UInt64 cnt; |
75 | | |
76 | | /// The current 8 bytes of input data. |
77 | | union { |
78 | | UInt64 current_word; |
79 | | UInt8 current_bytes[8]; |
80 | | }; |
81 | | |
82 | 59.6M | ALWAYS_INLINE void finalize() { |
83 | | /// In the last free byte, we write the remainder of the division by 256. |
84 | 59.6M | current_bytes[7] = uint8_t(cnt); |
85 | | |
86 | 59.6M | v3 ^= current_word; |
87 | 59.6M | SIPROUND; |
88 | 59.6M | SIPROUND; |
89 | 59.6M | v0 ^= current_word; |
90 | | |
91 | 59.6M | v2 ^= 0xff; |
92 | 59.6M | SIPROUND; |
93 | 59.6M | SIPROUND; |
94 | 59.6M | SIPROUND; |
95 | 59.6M | SIPROUND; |
96 | 59.6M | } |
97 | | |
98 | | public: |
99 | | /// Arguments - seed. |
100 | 59.8M | SipHash(UInt64 k0 = 0, UInt64 k1 = 0) { |
101 | | /// Initialize the state with some random bytes and seed. |
102 | 59.8M | v0 = 0x736f6d6570736575ULL ^ k0; |
103 | 59.8M | v1 = 0x646f72616e646f6dULL ^ k1; |
104 | 59.8M | v2 = 0x6c7967656e657261ULL ^ k0; |
105 | 59.8M | v3 = 0x7465646279746573ULL ^ k1; |
106 | | |
107 | 59.8M | cnt = 0; |
108 | 59.8M | current_word = 0; |
109 | 59.8M | } |
110 | | |
111 | 117M | void update(const char* data, UInt64 size) { |
112 | 117M | const char* end = data + size; |
113 | | |
114 | | /// We'll finish to process the remainder of the previous update, if any. |
115 | 117M | if (cnt & 7) { |
116 | 1.66M | while (cnt & 7 && data < end) { |
117 | 973k | current_bytes[cnt & 7] = *data; |
118 | 973k | ++data; |
119 | 973k | ++cnt; |
120 | 973k | } |
121 | | |
122 | | /// If we still do not have enough bytes to an 8-byte word. |
123 | 692k | if (cnt & 7) return; |
124 | | |
125 | 139k | v3 ^= current_word; |
126 | 139k | SIPROUND; |
127 | 139k | SIPROUND; |
128 | 139k | v0 ^= current_word; |
129 | 139k | } |
130 | | |
131 | 117M | cnt += end - data; |
132 | | |
133 | 198M | while (data + 8 <= end) { |
134 | 81.3M | current_word = unaligned_load<UInt64>(data); |
135 | | |
136 | 81.3M | v3 ^= current_word; |
137 | 81.3M | SIPROUND; |
138 | 81.3M | SIPROUND; |
139 | 81.3M | v0 ^= current_word; |
140 | | |
141 | 81.3M | data += 8; |
142 | 81.3M | } |
143 | | |
144 | | /// Pad the remainder, which is missing up to an 8-byte word. |
145 | 117M | current_word = 0; |
146 | 117M | switch (end - data) { |
147 | 23.3k | case 7: |
148 | 23.3k | current_bytes[6] = data[6]; |
149 | 23.3k | [[fallthrough]]; |
150 | 2.05M | case 6: |
151 | 2.05M | current_bytes[5] = data[5]; |
152 | 2.05M | [[fallthrough]]; |
153 | 2.22M | case 5: |
154 | 2.22M | current_bytes[4] = data[4]; |
155 | 2.22M | [[fallthrough]]; |
156 | 2.28M | case 4: |
157 | 2.28M | current_bytes[3] = data[3]; |
158 | 2.28M | [[fallthrough]]; |
159 | 2.31M | case 3: |
160 | 2.31M | current_bytes[2] = data[2]; |
161 | 2.31M | [[fallthrough]]; |
162 | 2.36M | case 2: |
163 | 2.36M | current_bytes[1] = data[1]; |
164 | 2.36M | [[fallthrough]]; |
165 | 59.7M | case 1: |
166 | 59.7M | current_bytes[0] = data[0]; |
167 | 59.7M | [[fallthrough]]; |
168 | 117M | case 0: |
169 | 117M | break; |
170 | 117M | } |
171 | 117M | } |
172 | | |
173 | | template <typename T> |
174 | 115M | void update(const T& x) { |
175 | | if constexpr (std::is_same_v<T, std::string>) { |
176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); |
177 | | } |
178 | 115M | update(reinterpret_cast<const char*>(&x), sizeof(x)); |
179 | 115M | } _ZN5doris7SipHash6updateINS_7DecimalIiEEEEvRKT_ Line | Count | Source | 174 | 354 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 354 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 354 | } |
_ZN5doris7SipHash6updateINS_7DecimalIlEEEEvRKT_ Line | Count | Source | 174 | 576 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 576 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 576 | } |
_ZN5doris7SipHash6updateINS_14DecimalV2ValueEEEvRKT_ Line | Count | Source | 174 | 1.34k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.34k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.34k | } |
_ZN5doris7SipHash6updateINS_12Decimal128V3EEEvRKT_ Line | Count | Source | 174 | 662 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 662 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 662 | } |
_ZN5doris7SipHash6updateINS_7DecimalIN4wide7integerILm256EiEEEEEEvRKT_ Line | Count | Source | 174 | 1.25k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.25k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.25k | } |
_ZN5doris7SipHash6updateIhEEvRKT_ Line | Count | Source | 174 | 230k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 230k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 230k | } |
_ZN5doris7SipHash6updateIaEEvRKT_ Line | Count | Source | 174 | 974 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 974 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 974 | } |
_ZN5doris7SipHash6updateIsEEvRKT_ Line | Count | Source | 174 | 888 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 888 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 888 | } |
_ZN5doris7SipHash6updateIiEEvRKT_ Line | Count | Source | 174 | 1.97k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.97k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.97k | } |
_ZN5doris7SipHash6updateIlEEvRKT_ Line | Count | Source | 174 | 2.14k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 2.14k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 2.14k | } |
_ZN5doris7SipHash6updateInEEvRKT_ Line | Count | Source | 174 | 674 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 674 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 674 | } |
_ZN5doris7SipHash6updateIfEEvRKT_ Line | Count | Source | 174 | 737 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 737 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 737 | } |
_ZN5doris7SipHash6updateIdEEvRKT_ Line | Count | Source | 174 | 821 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 821 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 821 | } |
_ZN5doris7SipHash6updateIjEEvRKT_ Line | Count | Source | 174 | 1.13k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.13k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.13k | } |
_ZN5doris7SipHash6updateIoEEvRKT_ Line | Count | Source | 174 | 752 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 752 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 752 | } |
_ZN5doris7SipHash6updateINS_16VecDateTimeValueEEEvRKT_ Line | Count | Source | 174 | 816 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 816 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 816 | } |
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_15DateV2ValueTypeEEEEEvRKT_ Line | Count | Source | 174 | 256 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 256 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 256 | } |
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_19DateTimeV2ValueTypeEEEEEvRKT_ Line | Count | Source | 174 | 1.95k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.95k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.95k | } |
Unexecuted instantiation: _ZN5doris7SipHash6updateINS_16TimestampTzValueEEEvRKT_ _ZN5doris7SipHash6updateImEEvRKT_ Line | Count | Source | 174 | 57.5M | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 57.5M | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 57.5M | } |
_ZN5doris7SipHash6updateIbEEvRKT_ Line | Count | Source | 174 | 57.7M | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 57.7M | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 57.7M | } |
|
180 | | |
181 | | /// Get the result in some form. This can only be done once! |
182 | | |
183 | 59.4M | void get128(char* out) { |
184 | 59.4M | finalize(); |
185 | 59.4M | reinterpret_cast<UInt64*>(out)[0] = v0 ^ v1; |
186 | 59.4M | reinterpret_cast<UInt64*>(out)[1] = v2 ^ v3; |
187 | 59.4M | } |
188 | | |
189 | | /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos |
190 | | template <typename T> |
191 | | ALWAYS_INLINE void get128(T& lo, T& hi) { |
192 | | static_assert(sizeof(T) == 8); |
193 | | finalize(); |
194 | | lo = v0 ^ v1; |
195 | | hi = v2 ^ v3; |
196 | | } |
197 | | |
198 | 229k | UInt64 get64() { |
199 | 229k | finalize(); |
200 | 229k | return v0 ^ v1 ^ v2 ^ v3; |
201 | 229k | } |
202 | | |
203 | | template <typename T> |
204 | 57.4M | ALWAYS_INLINE void get128(T& dst) { |
205 | 57.4M | static_assert(sizeof(T) == 16); |
206 | 57.4M | get128(reinterpret_cast<char*>(&dst)); |
207 | 57.4M | } |
208 | | }; |
209 | | |
210 | | #undef ROTL |
211 | | #undef SIPROUND |
212 | | |
213 | | #include <cstddef> |
214 | | |
215 | 2.02M | inline void sip_hash128(const char* data, const size_t size, char* out) { |
216 | 2.02M | SipHash hash; |
217 | 2.02M | hash.update(data, size); |
218 | 2.02M | hash.get128(out); |
219 | 2.02M | } |
220 | | #include "common/compile_check_end.h" |
221 | | } // namespace doris |