be/src/exec/common/sip_hash.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/SipHash.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | /** SipHash is a fast cryptographic hash function for short strings. |
24 | | * Taken from here: https://www.131002.net/siphash/ |
25 | | * |
26 | | * This is SipHash 2-4 variant. |
27 | | * |
28 | | * Two changes are made: |
29 | | * - returns also 128 bits, not only 64; |
30 | | * - done streaming (can be calculated in parts). |
31 | | * |
32 | | * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL. |
33 | | * (~ 700 MB/sec, 15 million strings per second) |
34 | | */ |
35 | | |
36 | | #include <string> |
37 | | #include <type_traits> |
38 | | |
39 | | #include "common/compiler_util.h" // IWYU pragma: keep |
40 | | #include "core/types.h" |
41 | | #include "util/unaligned.h" |
42 | | |
43 | | namespace doris { |
44 | | #include "common/compile_check_begin.h" |
45 | 168M | #define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b)))) |
46 | | |
47 | | #define SIPROUND \ |
48 | 28.0M | do { \ |
49 | 28.0M | v0 += v1; \ |
50 | 28.0M | v1 = ROTL(v1, 13); \ |
51 | 28.0M | v1 ^= v0; \ |
52 | 28.0M | v0 = ROTL(v0, 32); \ |
53 | 28.0M | v2 += v3; \ |
54 | 28.0M | v3 = ROTL(v3, 16); \ |
55 | 28.0M | v3 ^= v2; \ |
56 | 28.0M | v0 += v3; \ |
57 | 28.0M | v3 = ROTL(v3, 21); \ |
58 | 28.0M | v3 ^= v0; \ |
59 | 28.0M | v2 += v1; \ |
60 | 28.0M | v1 = ROTL(v1, 17); \ |
61 | 28.0M | v1 ^= v2; \ |
62 | 28.0M | v2 = ROTL(v2, 32); \ |
63 | 28.0M | } while (0) |
64 | | |
65 | | class SipHash { |
66 | | private: |
67 | | /// State. |
68 | | UInt64 v0; |
69 | | UInt64 v1; |
70 | | UInt64 v2; |
71 | | UInt64 v3; |
72 | | |
73 | | /// How many bytes have been processed. |
74 | | UInt64 cnt; |
75 | | |
76 | | /// The current 8 bytes of input data. |
77 | | union { |
78 | | UInt64 current_word; |
79 | | UInt8 current_bytes[8]; |
80 | | }; |
81 | | |
82 | 610k | ALWAYS_INLINE void finalize() { |
83 | | /// In the last free byte, we write the remainder of the division by 256. |
84 | 610k | current_bytes[7] = uint8_t(cnt); |
85 | | |
86 | 610k | v3 ^= current_word; |
87 | 610k | SIPROUND; |
88 | 610k | SIPROUND; |
89 | 610k | v0 ^= current_word; |
90 | | |
91 | 610k | v2 ^= 0xff; |
92 | 610k | SIPROUND; |
93 | 610k | SIPROUND; |
94 | 610k | SIPROUND; |
95 | 610k | SIPROUND; |
96 | 610k | } |
97 | | |
98 | | public: |
99 | | /// Arguments - seed. |
100 | 610k | SipHash(UInt64 k0 = 0, UInt64 k1 = 0) { |
101 | | /// Initialize the state with some random bytes and seed. |
102 | 610k | v0 = 0x736f6d6570736575ULL ^ k0; |
103 | 610k | v1 = 0x646f72616e646f6dULL ^ k1; |
104 | 610k | v2 = 0x6c7967656e657261ULL ^ k0; |
105 | 610k | v3 = 0x7465646279746573ULL ^ k1; |
106 | | |
107 | 610k | cnt = 0; |
108 | 610k | current_word = 0; |
109 | 610k | } |
110 | | |
111 | 1.31M | void update(const char* data, UInt64 size) { |
112 | 1.31M | const char* end = data + size; |
113 | | |
114 | | /// We'll finish to process the remainder of the previous update, if any. |
115 | 1.31M | if (cnt & 7) { |
116 | 317k | while (cnt & 7 && data < end) { |
117 | 228k | current_bytes[cnt & 7] = *data; |
118 | 228k | ++data; |
119 | 228k | ++cnt; |
120 | 228k | } |
121 | | |
122 | | /// If we still do not have enough bytes to an 8-byte word. |
123 | 88.4k | if (cnt & 7) return; |
124 | | |
125 | 46.9k | v3 ^= current_word; |
126 | 46.9k | SIPROUND; |
127 | 46.9k | SIPROUND; |
128 | 46.9k | v0 ^= current_word; |
129 | 46.9k | } |
130 | | |
131 | 1.27M | cnt += end - data; |
132 | | |
133 | 13.4M | while (data + 8 <= end) { |
134 | 12.1M | current_word = unaligned_load<UInt64>(data); |
135 | | |
136 | 12.1M | v3 ^= current_word; |
137 | 12.1M | SIPROUND; |
138 | 12.1M | SIPROUND; |
139 | 12.1M | v0 ^= current_word; |
140 | | |
141 | 12.1M | data += 8; |
142 | 12.1M | } |
143 | | |
144 | | /// Pad the remainder, which is missing up to an 8-byte word. |
145 | 1.27M | current_word = 0; |
146 | 1.27M | switch (end - data) { |
147 | 4.94k | case 7: |
148 | 4.94k | current_bytes[6] = data[6]; |
149 | 4.94k | [[fallthrough]]; |
150 | 10.8k | case 6: |
151 | 10.8k | current_bytes[5] = data[5]; |
152 | 10.8k | [[fallthrough]]; |
153 | 16.5k | case 5: |
154 | 16.5k | current_bytes[4] = data[4]; |
155 | 16.5k | [[fallthrough]]; |
156 | 36.1k | case 4: |
157 | 36.1k | current_bytes[3] = data[3]; |
158 | 36.1k | [[fallthrough]]; |
159 | 43.3k | case 3: |
160 | 43.3k | current_bytes[2] = data[2]; |
161 | 43.3k | [[fallthrough]]; |
162 | 49.7k | case 2: |
163 | 49.7k | current_bytes[1] = data[1]; |
164 | 49.7k | [[fallthrough]]; |
165 | 655k | case 1: |
166 | 655k | current_bytes[0] = data[0]; |
167 | 655k | [[fallthrough]]; |
168 | 1.27M | case 0: |
169 | 1.27M | break; |
170 | 1.27M | } |
171 | 1.27M | } |
172 | | |
173 | | template <typename T> |
174 | 1.24M | void update(const T& x) { |
175 | | if constexpr (std::is_same_v<T, std::string>) { |
176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); |
177 | | } |
178 | 1.24M | update(reinterpret_cast<const char*>(&x), sizeof(x)); |
179 | 1.24M | } _ZN5doris7SipHash6updateINS_7DecimalIiEEEEvRKT_ Line | Count | Source | 174 | 354 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 354 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 354 | } |
_ZN5doris7SipHash6updateINS_7DecimalIlEEEEvRKT_ Line | Count | Source | 174 | 576 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 576 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 576 | } |
_ZN5doris7SipHash6updateINS_14DecimalV2ValueEEEvRKT_ Line | Count | Source | 174 | 1.34k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.34k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.34k | } |
_ZN5doris7SipHash6updateINS_12Decimal128V3EEEvRKT_ Line | Count | Source | 174 | 662 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 662 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 662 | } |
_ZN5doris7SipHash6updateINS_7DecimalIN4wide7integerILm256EiEEEEEEvRKT_ Line | Count | Source | 174 | 1.25k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.25k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.25k | } |
_ZN5doris7SipHash6updateIiEEvRKT_ Line | Count | Source | 174 | 1.97k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.97k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.97k | } |
_ZN5doris7SipHash6updateIhEEvRKT_ Line | Count | Source | 174 | 13.2k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 13.2k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 13.2k | } |
_ZN5doris7SipHash6updateIaEEvRKT_ Line | Count | Source | 174 | 974 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 974 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 974 | } |
_ZN5doris7SipHash6updateIsEEvRKT_ Line | Count | Source | 174 | 888 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 888 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 888 | } |
_ZN5doris7SipHash6updateIlEEvRKT_ Line | Count | Source | 174 | 2.14k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 2.14k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 2.14k | } |
_ZN5doris7SipHash6updateInEEvRKT_ Line | Count | Source | 174 | 674 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 674 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 674 | } |
_ZN5doris7SipHash6updateIfEEvRKT_ Line | Count | Source | 174 | 737 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 737 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 737 | } |
_ZN5doris7SipHash6updateIdEEvRKT_ Line | Count | Source | 174 | 821 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 821 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 821 | } |
_ZN5doris7SipHash6updateIjEEvRKT_ Line | Count | Source | 174 | 1.13k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.13k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.13k | } |
_ZN5doris7SipHash6updateIoEEvRKT_ Line | Count | Source | 174 | 752 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 752 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 752 | } |
_ZN5doris7SipHash6updateINS_16VecDateTimeValueEEEvRKT_ Line | Count | Source | 174 | 816 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 816 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 816 | } |
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_15DateV2ValueTypeEEEEEvRKT_ Line | Count | Source | 174 | 256 | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 256 | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 256 | } |
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_19DateTimeV2ValueTypeEEEEEvRKT_ Line | Count | Source | 174 | 1.95k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 1.95k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 1.95k | } |
Unexecuted instantiation: _ZN5doris7SipHash6updateINS_16TimestampTzValueEEEvRKT_ _ZN5doris7SipHash6updateImEEvRKT_ Line | Count | Source | 174 | 600k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 600k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 600k | } |
_ZN5doris7SipHash6updateIbEEvRKT_ Line | Count | Source | 174 | 613k | void update(const T& x) { | 175 | | if constexpr (std::is_same_v<T, std::string>) { | 176 | | throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!"); | 177 | | } | 178 | 613k | update(reinterpret_cast<const char*>(&x), sizeof(x)); | 179 | 613k | } |
|
180 | | |
181 | | /// Get the result in some form. This can only be done once! |
182 | | |
183 | 604k | void get128(char* out) { |
184 | 604k | finalize(); |
185 | 604k | reinterpret_cast<UInt64*>(out)[0] = v0 ^ v1; |
186 | 604k | reinterpret_cast<UInt64*>(out)[1] = v2 ^ v3; |
187 | 604k | } |
188 | | |
189 | | /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos |
190 | | template <typename T> |
191 | | ALWAYS_INLINE void get128(T& lo, T& hi) { |
192 | | static_assert(sizeof(T) == 8); |
193 | | finalize(); |
194 | | lo = v0 ^ v1; |
195 | | hi = v2 ^ v3; |
196 | | } |
197 | | |
198 | 5.41k | UInt64 get64() { |
199 | 5.41k | finalize(); |
200 | 5.41k | return v0 ^ v1 ^ v2 ^ v3; |
201 | 5.41k | } |
202 | | |
203 | | template <typename T> |
204 | 600k | ALWAYS_INLINE void get128(T& dst) { |
205 | 600k | static_assert(sizeof(T) == 16); |
206 | 600k | get128(reinterpret_cast<char*>(&dst)); |
207 | 600k | } |
208 | | }; |
209 | | |
210 | | #undef ROTL |
211 | | #undef SIPROUND |
212 | | |
213 | | #include <cstddef> |
214 | | |
215 | 4.24k | inline void sip_hash128(const char* data, const size_t size, char* out) { |
216 | 4.24k | SipHash hash; |
217 | 4.24k | hash.update(data, size); |
218 | 4.24k | hash.get128(out); |
219 | 4.24k | } |
220 | | #include "common/compile_check_end.h" |
221 | | } // namespace doris |