Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/md5.h" |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstring> |
22 | | #include <vector> |
23 | | |
24 | | #ifdef __AVX2__ |
25 | | #include <immintrin.h> |
26 | | #endif |
27 | | |
28 | | #include "exec/common/endian.h" |
29 | | |
30 | | namespace doris { |
31 | | |
32 | | namespace { |
33 | | |
// Initial values of the four 32-bit MD5 state words; every lane of the
// multi-buffer state is seeded with these before the first block is processed.
constexpr uint32_t MD5_A0 {0x67452301U};
constexpr uint32_t MD5_B0 {0xefcdab89U};
constexpr uint32_t MD5_C0 {0x98badcfeU};
constexpr uint32_t MD5_D0 {0x10325476U};
// One addressable byte that stands in for the input pointer of zero-length
// messages and of unused batch slots.
constexpr unsigned char MD5_DUMMY_INPUT {0};
39 | | |
40 | 1.66k | void md5_to_hex(const unsigned char* digest, char* out) { |
41 | 1.66k | static constexpr char DIGITS[] = "0123456789abcdef"; |
42 | 28.2k | for (int i = 0; i < MD5_DIGEST_LENGTH; ++i) { |
43 | 26.6k | *out++ = DIGITS[digest[i] >> 4]; |
44 | 26.6k | *out++ = DIGITS[digest[i] & 0x0F]; |
45 | 26.6k | } |
46 | 1.66k | } |
47 | | |
// Returns how many 64-byte blocks a message of `len` bytes occupies once MD5
// padding is appended: the message, at least 9 extra bytes (the 0x80 marker plus
// the 8-byte length field), rounded up to a whole number of blocks.
size_t md5_num_blocks(size_t len) {
    constexpr size_t BLOCK_BYTES = 64;
    constexpr size_t MIN_PADDING_BYTES = 9;
    const size_t padded_len = len + MIN_PADDING_BYTES;
    return (padded_len + (BLOCK_BYTES - 1)) / BLOCK_BYTES;
}
51 | | |
52 | 2.73k | size_t md5_pad_final_blocks(const unsigned char* data, size_t len, unsigned char* out) { |
53 | 2.73k | size_t full_blocks = len / 64; |
54 | 2.73k | size_t tail = len % 64; |
55 | 2.73k | size_t num_blocks = md5_num_blocks(len); |
56 | 2.73k | size_t final_count = num_blocks - full_blocks; |
57 | | |
58 | 2.73k | std::memset(out, 0, final_count * 64); |
59 | 2.73k | std::memcpy(out, data + full_blocks * 64, tail); |
60 | 2.73k | out[tail] = 0x80; |
61 | 2.73k | LittleEndian::Store64(out + final_count * 64 - 8, static_cast<uint64_t>(len) * 8); |
62 | | |
63 | 2.73k | return final_count; |
64 | 2.73k | } |
65 | | |
66 | | #ifdef __AVX2__ |
67 | | |
// AVX2 implementation of the vector operations used by the multi-buffer MD5
// routines in this file. A Vec is a 256-bit register treated as LANES (8)
// 32-bit words, one word per message being hashed in parallel.
struct AVX2MD5Ops {
    using Vec = __m256i;
    static constexpr size_t LANES = 8;

    // Lane-wise 32-bit addition.
    static Vec add(Vec a, Vec b) { return _mm256_add_epi32(a, b); }

    // Broadcasts `v` into every 32-bit lane.
    static Vec set1(uint32_t v) { return _mm256_set1_epi32(static_cast<int>(v)); }

    // Loads 32 bytes from `p`; no alignment is required.
    static Vec loadu(const void* p) {
        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
    }

    // Stores the 32 bytes of `v` to `p`; no alignment is required.
    static void storeu(void* p, Vec v) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); }

    // Rotates every 32-bit lane left by the compile-time amount N.
    template <int N>
    static Vec rotl(Vec x) {
        return _mm256_or_si256(_mm256_slli_epi32(x, N), _mm256_srli_epi32(x, 32 - N));
    }

    // Mixing function of the first 16 MD5 steps: d ^ (b & (c ^ d)),
    // i.e. "b selects c, otherwise d".
    static Vec F(Vec b, Vec c, Vec d) {
        return _mm256_xor_si256(d, _mm256_and_si256(b, _mm256_xor_si256(c, d)));
    }

    // Mixing function of steps 17-32: c ^ (d & (b ^ c)),
    // i.e. "d selects b, otherwise c".
    static Vec G(Vec b, Vec c, Vec d) {
        return _mm256_xor_si256(c, _mm256_and_si256(d, _mm256_xor_si256(b, c)));
    }

    // Mixing function of steps 33-48: b ^ c ^ d.
    static Vec H(Vec b, Vec c, Vec d) { return _mm256_xor_si256(b, _mm256_xor_si256(c, d)); }

    // Mixing function of steps 49-64: c ^ (b | ~d). The XOR with all-ones
    // produces the bitwise NOT of d.
    static Vec I(Vec b, Vec c, Vec d) {
        return _mm256_xor_si256(c, _mm256_or_si256(b, _mm256_xor_si256(d, _mm256_set1_epi32(-1))));
    }

    // Loads one 64-byte block from each of block_ptrs[0..7] and transposes them so
    // that msg[w] holds 32-bit word w of every block, with lane i taken from
    // block_ptrs[i]. Each pass of the loop handles one 32-byte half of the blocks
    // (8 words) as an 8x8 transpose of 32-bit elements.
    static void gather_all_message_words(const unsigned char* const block_ptrs[], Vec msg[16]) {
        for (int half = 0; half < 2; ++half) {
            size_t off = half * 32;
            Vec r0 = loadu(block_ptrs[0] + off);
            Vec r1 = loadu(block_ptrs[1] + off);
            Vec r2 = loadu(block_ptrs[2] + off);
            Vec r3 = loadu(block_ptrs[3] + off);
            Vec r4 = loadu(block_ptrs[4] + off);
            Vec r5 = loadu(block_ptrs[5] + off);
            Vec r6 = loadu(block_ptrs[6] + off);
            Vec r7 = loadu(block_ptrs[7] + off);

            // Stage 1: interleave 32-bit words of neighbouring rows (within each
            // 128-bit half of the register).
            Vec t0 = _mm256_unpacklo_epi32(r0, r1);
            Vec t1 = _mm256_unpackhi_epi32(r0, r1);
            Vec t2 = _mm256_unpacklo_epi32(r2, r3);
            Vec t3 = _mm256_unpackhi_epi32(r2, r3);
            Vec t4 = _mm256_unpacklo_epi32(r4, r5);
            Vec t5 = _mm256_unpackhi_epi32(r4, r5);
            Vec t6 = _mm256_unpacklo_epi32(r6, r7);
            Vec t7 = _mm256_unpackhi_epi32(r6, r7);

            // Stage 2: interleave 64-bit pairs, so each 128-bit half now holds the
            // same word index from four consecutive rows.
            Vec u0 = _mm256_unpacklo_epi64(t0, t2);
            Vec u1 = _mm256_unpackhi_epi64(t0, t2);
            Vec u2 = _mm256_unpacklo_epi64(t1, t3);
            Vec u3 = _mm256_unpackhi_epi64(t1, t3);
            Vec u4 = _mm256_unpacklo_epi64(t4, t6);
            Vec u5 = _mm256_unpackhi_epi64(t4, t6);
            Vec u6 = _mm256_unpacklo_epi64(t5, t7);
            Vec u7 = _mm256_unpackhi_epi64(t5, t7);

            // Stage 3: combine 128-bit halves of the rows 0-3 and rows 4-7 results.
            // Selector 0x20 joins the two low halves (words 0-3 of this half),
            // 0x31 joins the two high halves (words 4-7 of this half).
            size_t base = half * 8;
            msg[base + 0] = _mm256_permute2x128_si256(u0, u4, 0x20);
            msg[base + 4] = _mm256_permute2x128_si256(u0, u4, 0x31);
            msg[base + 1] = _mm256_permute2x128_si256(u1, u5, 0x20);
            msg[base + 5] = _mm256_permute2x128_si256(u1, u5, 0x31);
            msg[base + 2] = _mm256_permute2x128_si256(u2, u6, 0x20);
            msg[base + 6] = _mm256_permute2x128_si256(u2, u6, 0x31);
            msg[base + 3] = _mm256_permute2x128_si256(u3, u7, 0x20);
            msg[base + 7] = _mm256_permute2x128_si256(u3, u7, 0x31);
        }
    }
};
143 | | |
// One MD5 step applied to two independent vector states at once.
//
// For each state k in {1, 2} this computes
//     wk = xk + rotl<s>(func(xk, yk, zk) + wk + ti + msgk[g])
// where `func` names one of Ops::F/G/H/I, `g` is the message word index, `s` the
// rotate amount and `ti` the step constant. The two chains do not depend on each
// other, so their operations are interleaved statement by statement.
//
// The expansion relies on `Vec`, `Ops`, `msg1` and `msg2` being in scope at the
// point of use.
#define MD5_STEP_X2(func, w1, x1, y1, z1, w2, x2, y2, z2, g, s, ti) \
    {                                                               \
        Vec t1 = Ops::func(x1, y1, z1);                             \
        Vec t2 = Ops::func(x2, y2, z2);                             \
        t1 = Ops::add(t1, w1);                                      \
        t2 = Ops::add(t2, w2);                                      \
        Vec k = Ops::set1(ti);                                      \
        t1 = Ops::add(t1, k);                                       \
        t2 = Ops::add(t2, k);                                       \
        t1 = Ops::add(t1, msg1[g]);                                 \
        t2 = Ops::add(t2, msg2[g]);                                 \
        (w1) = Ops::add(x1, Ops::template rotl<s>(t1));             \
        (w2) = Ops::add(x2, Ops::template rotl<s>(t2));             \
    }
158 | | |
// MD5 chaining state for two groups of lanes processed side by side:
// (a1, b1, c1, d1) belongs to the first group and (a2, b2, c2, d2) to the second.
// The member order is relied upon by aggregate initialisation.
template <typename Ops>
struct MD5X2State {
    typename Ops::Vec a1, b1, c1, d1;
    typename Ops::Vec a2, b2, c2, d2;
};
163 | | |
// Processes one 64-byte block per lane for two independent groups of lanes.
//
// (a1, b1, c1, d1) with msg1 is the first group and (a2, b2, c2, d2) with msg2 the
// second; msgK[w] must hold message word w of every lane in group K. The 64 MD5
// steps are run on both groups in lockstep via MD5_STEP_X2, and the returned
// state is the incoming state plus the result of those steps.
template <typename Ops>
MD5X2State<Ops> md5_multi_buffer_block_x2(typename Ops::Vec a1, typename Ops::Vec b1,
                                          typename Ops::Vec c1, typename Ops::Vec d1,
                                          typename Ops::Vec a2, typename Ops::Vec b2,
                                          typename Ops::Vec c2, typename Ops::Vec d2,
                                          const typename Ops::Vec msg1[16],
                                          const typename Ops::Vec msg2[16]) {
    using Vec = typename Ops::Vec;
    // Keep the incoming state; it is added back after the 64 steps.
    Vec aa1 = a1;
    Vec bb1 = b1;
    Vec cc1 = c1;
    Vec dd1 = d1;
    Vec aa2 = a2;
    Vec bb2 = b2;
    Vec cc2 = c2;
    Vec dd2 = d2;

    // Steps 1-16 (function F): message words are used in order 0..15.
    MD5_STEP_X2(F, a1, b1, c1, d1, a2, b2, c2, d2, 0, 7, 0xd76aa478)
    MD5_STEP_X2(F, d1, a1, b1, c1, d2, a2, b2, c2, 1, 12, 0xe8c7b756)
    MD5_STEP_X2(F, c1, d1, a1, b1, c2, d2, a2, b2, 2, 17, 0x242070db)
    MD5_STEP_X2(F, b1, c1, d1, a1, b2, c2, d2, a2, 3, 22, 0xc1bdceee)
    MD5_STEP_X2(F, a1, b1, c1, d1, a2, b2, c2, d2, 4, 7, 0xf57c0faf)
    MD5_STEP_X2(F, d1, a1, b1, c1, d2, a2, b2, c2, 5, 12, 0x4787c62a)
    MD5_STEP_X2(F, c1, d1, a1, b1, c2, d2, a2, b2, 6, 17, 0xa8304613)
    MD5_STEP_X2(F, b1, c1, d1, a1, b2, c2, d2, a2, 7, 22, 0xfd469501)
    MD5_STEP_X2(F, a1, b1, c1, d1, a2, b2, c2, d2, 8, 7, 0x698098d8)
    MD5_STEP_X2(F, d1, a1, b1, c1, d2, a2, b2, c2, 9, 12, 0x8b44f7af)
    MD5_STEP_X2(F, c1, d1, a1, b1, c2, d2, a2, b2, 10, 17, 0xffff5bb1)
    MD5_STEP_X2(F, b1, c1, d1, a1, b2, c2, d2, a2, 11, 22, 0x895cd7be)
    MD5_STEP_X2(F, a1, b1, c1, d1, a2, b2, c2, d2, 12, 7, 0x6b901122)
    MD5_STEP_X2(F, d1, a1, b1, c1, d2, a2, b2, c2, 13, 12, 0xfd987193)
    MD5_STEP_X2(F, c1, d1, a1, b1, c2, d2, a2, b2, 14, 17, 0xa679438e)
    MD5_STEP_X2(F, b1, c1, d1, a1, b2, c2, d2, a2, 15, 22, 0x49b40821)

    // Steps 17-32 (function G): word index advances by 5 modulo 16, starting at 1.
    MD5_STEP_X2(G, a1, b1, c1, d1, a2, b2, c2, d2, 1, 5, 0xf61e2562)
    MD5_STEP_X2(G, d1, a1, b1, c1, d2, a2, b2, c2, 6, 9, 0xc040b340)
    MD5_STEP_X2(G, c1, d1, a1, b1, c2, d2, a2, b2, 11, 14, 0x265e5a51)
    MD5_STEP_X2(G, b1, c1, d1, a1, b2, c2, d2, a2, 0, 20, 0xe9b6c7aa)
    MD5_STEP_X2(G, a1, b1, c1, d1, a2, b2, c2, d2, 5, 5, 0xd62f105d)
    MD5_STEP_X2(G, d1, a1, b1, c1, d2, a2, b2, c2, 10, 9, 0x02441453)
    MD5_STEP_X2(G, c1, d1, a1, b1, c2, d2, a2, b2, 15, 14, 0xd8a1e681)
    MD5_STEP_X2(G, b1, c1, d1, a1, b2, c2, d2, a2, 4, 20, 0xe7d3fbc8)
    MD5_STEP_X2(G, a1, b1, c1, d1, a2, b2, c2, d2, 9, 5, 0x21e1cde6)
    MD5_STEP_X2(G, d1, a1, b1, c1, d2, a2, b2, c2, 14, 9, 0xc33707d6)
    MD5_STEP_X2(G, c1, d1, a1, b1, c2, d2, a2, b2, 3, 14, 0xf4d50d87)
    MD5_STEP_X2(G, b1, c1, d1, a1, b2, c2, d2, a2, 8, 20, 0x455a14ed)
    MD5_STEP_X2(G, a1, b1, c1, d1, a2, b2, c2, d2, 13, 5, 0xa9e3e905)
    MD5_STEP_X2(G, d1, a1, b1, c1, d2, a2, b2, c2, 2, 9, 0xfcefa3f8)
    MD5_STEP_X2(G, c1, d1, a1, b1, c2, d2, a2, b2, 7, 14, 0x676f02d9)
    MD5_STEP_X2(G, b1, c1, d1, a1, b2, c2, d2, a2, 12, 20, 0x8d2a4c8a)

    // Steps 33-48 (function H): word index advances by 3 modulo 16, starting at 5.
    MD5_STEP_X2(H, a1, b1, c1, d1, a2, b2, c2, d2, 5, 4, 0xfffa3942)
    MD5_STEP_X2(H, d1, a1, b1, c1, d2, a2, b2, c2, 8, 11, 0x8771f681)
    MD5_STEP_X2(H, c1, d1, a1, b1, c2, d2, a2, b2, 11, 16, 0x6d9d6122)
    MD5_STEP_X2(H, b1, c1, d1, a1, b2, c2, d2, a2, 14, 23, 0xfde5380c)
    MD5_STEP_X2(H, a1, b1, c1, d1, a2, b2, c2, d2, 1, 4, 0xa4beea44)
    MD5_STEP_X2(H, d1, a1, b1, c1, d2, a2, b2, c2, 4, 11, 0x4bdecfa9)
    MD5_STEP_X2(H, c1, d1, a1, b1, c2, d2, a2, b2, 7, 16, 0xf6bb4b60)
    MD5_STEP_X2(H, b1, c1, d1, a1, b2, c2, d2, a2, 10, 23, 0xbebfbc70)
    MD5_STEP_X2(H, a1, b1, c1, d1, a2, b2, c2, d2, 13, 4, 0x289b7ec6)
    MD5_STEP_X2(H, d1, a1, b1, c1, d2, a2, b2, c2, 0, 11, 0xeaa127fa)
    MD5_STEP_X2(H, c1, d1, a1, b1, c2, d2, a2, b2, 3, 16, 0xd4ef3085)
    MD5_STEP_X2(H, b1, c1, d1, a1, b2, c2, d2, a2, 6, 23, 0x04881d05)
    MD5_STEP_X2(H, a1, b1, c1, d1, a2, b2, c2, d2, 9, 4, 0xd9d4d039)
    MD5_STEP_X2(H, d1, a1, b1, c1, d2, a2, b2, c2, 12, 11, 0xe6db99e5)
    MD5_STEP_X2(H, c1, d1, a1, b1, c2, d2, a2, b2, 15, 16, 0x1fa27cf8)
    MD5_STEP_X2(H, b1, c1, d1, a1, b2, c2, d2, a2, 2, 23, 0xc4ac5665)

    // Steps 49-64 (function I): word index advances by 7 modulo 16, starting at 0.
    MD5_STEP_X2(I, a1, b1, c1, d1, a2, b2, c2, d2, 0, 6, 0xf4292244)
    MD5_STEP_X2(I, d1, a1, b1, c1, d2, a2, b2, c2, 7, 10, 0x432aff97)
    MD5_STEP_X2(I, c1, d1, a1, b1, c2, d2, a2, b2, 14, 15, 0xab9423a7)
    MD5_STEP_X2(I, b1, c1, d1, a1, b2, c2, d2, a2, 5, 21, 0xfc93a039)
    MD5_STEP_X2(I, a1, b1, c1, d1, a2, b2, c2, d2, 12, 6, 0x655b59c3)
    MD5_STEP_X2(I, d1, a1, b1, c1, d2, a2, b2, c2, 3, 10, 0x8f0ccc92)
    MD5_STEP_X2(I, c1, d1, a1, b1, c2, d2, a2, b2, 10, 15, 0xffeff47d)
    MD5_STEP_X2(I, b1, c1, d1, a1, b2, c2, d2, a2, 1, 21, 0x85845dd1)
    MD5_STEP_X2(I, a1, b1, c1, d1, a2, b2, c2, d2, 8, 6, 0x6fa87e4f)
    MD5_STEP_X2(I, d1, a1, b1, c1, d2, a2, b2, c2, 15, 10, 0xfe2ce6e0)
    MD5_STEP_X2(I, c1, d1, a1, b1, c2, d2, a2, b2, 6, 15, 0xa3014314)
    MD5_STEP_X2(I, b1, c1, d1, a1, b2, c2, d2, a2, 13, 21, 0x4e0811a1)
    MD5_STEP_X2(I, a1, b1, c1, d1, a2, b2, c2, d2, 4, 6, 0xf7537e82)
    MD5_STEP_X2(I, d1, a1, b1, c1, d2, a2, b2, c2, 11, 10, 0xbd3af235)
    MD5_STEP_X2(I, c1, d1, a1, b1, c2, d2, a2, b2, 2, 15, 0x2ad7d2bb)
    MD5_STEP_X2(I, b1, c1, d1, a1, b2, c2, d2, a2, 9, 21, 0xeb86d391)

    // Add the saved incoming state back in to form the new chaining values.
    return {Ops::add(a1, aa1), Ops::add(b1, bb1), Ops::add(c1, cc1), Ops::add(d1, dd1),
            Ops::add(a2, aa2), Ops::add(b2, bb2), Ops::add(c2, cc2), Ops::add(d2, dd2)};
}
252 | | |
253 | | #undef MD5_STEP_X2 |
254 | | |
// Returns the 32-bit word held in lane `lane` of `v`.
// The vector is spilled to a local array with Ops::storeu and the requested
// element is read back, so `lane` must be less than Ops::LANES.
template <typename Ops>
uint32_t extract_lane(typename Ops::Vec v, size_t lane) {
    alignas(32) uint32_t spilled[Ops::LANES];
    Ops::storeu(static_cast<void*>(spilled), v);
    const uint32_t* const picked = spilled + lane;
    return *picked;
}
261 | | |
262 | | template <typename Ops> |
263 | | void md5_multi_buffer_compute(const unsigned char* const inputs[], const size_t lengths[], |
264 | 171 | unsigned char* outputs, size_t count) { |
265 | 171 | constexpr size_t N = Ops::LANES; |
266 | 171 | using Vec = typename Ops::Vec; |
267 | 171 | size_t count1 = std::min(count, N); |
268 | 171 | size_t count2 = count > N ? count - N : 0; |
269 | | |
270 | 171 | size_t num_blocks[2 * N]; |
271 | 171 | size_t max_blocks = 0; |
272 | 826 | for (size_t i = 0; i < count; ++i) { |
273 | 655 | num_blocks[i] = md5_num_blocks(lengths[i]); |
274 | 655 | max_blocks = std::max(max_blocks, num_blocks[i]); |
275 | 655 | } |
276 | 2.25k | for (size_t i = count; i < 2 * N; ++i) { |
277 | 2.08k | num_blocks[i] = 1; |
278 | 2.08k | } |
279 | | |
280 | 171 | alignas(32) unsigned char final_buf[2 * N][128]; |
281 | 171 | size_t final_block_start[2 * N]; |
282 | 171 | size_t final_block_count[2 * N]; |
283 | 826 | for (size_t i = 0; i < count; ++i) { |
284 | 655 | final_block_start[i] = lengths[i] / 64; |
285 | 655 | final_block_count[i] = md5_pad_final_blocks(inputs[i], lengths[i], final_buf[i]); |
286 | 655 | } |
287 | 2.25k | for (size_t i = count; i < 2 * N; ++i) { |
288 | 2.08k | final_block_start[i] = 0; |
289 | 2.08k | final_block_count[i] = md5_pad_final_blocks(&MD5_DUMMY_INPUT, 0, final_buf[i]); |
290 | 2.08k | } |
291 | | |
292 | 171 | Vec a1 = Ops::set1(MD5_A0); |
293 | 171 | Vec b1 = Ops::set1(MD5_B0); |
294 | 171 | Vec c1 = Ops::set1(MD5_C0); |
295 | 171 | Vec d1 = Ops::set1(MD5_D0); |
296 | 171 | Vec a2 = Ops::set1(MD5_A0); |
297 | 171 | Vec b2 = Ops::set1(MD5_B0); |
298 | 171 | Vec c2 = Ops::set1(MD5_C0); |
299 | 171 | Vec d2 = Ops::set1(MD5_D0); |
300 | | |
301 | 18.0k | for (size_t block = 0; block < max_blocks; ++block) { |
302 | 17.8k | const unsigned char* block_ptrs[2 * N]; |
303 | 303k | for (size_t i = 0; i < 2 * N; ++i) { |
304 | 286k | if (block < final_block_start[i]) { |
305 | 17.9k | block_ptrs[i] = inputs[i] + block * 64; |
306 | 268k | } else { |
307 | 268k | size_t final_index = block - final_block_start[i]; |
308 | 268k | block_ptrs[i] = final_index < final_block_count[i] ? final_buf[i] + final_index * 64 |
309 | 268k | : final_buf[i]; |
310 | 268k | } |
311 | 286k | } |
312 | | |
313 | 17.8k | Vec msg1[16]; |
314 | 17.8k | Vec msg2[16]; |
315 | 17.8k | Ops::gather_all_message_words(block_ptrs, msg1); |
316 | 17.8k | Ops::gather_all_message_words(block_ptrs + N, msg2); |
317 | | |
318 | 17.8k | auto st = md5_multi_buffer_block_x2<Ops>(a1, b1, c1, d1, a2, b2, c2, d2, msg1, msg2); |
319 | 17.8k | a1 = st.a1; |
320 | 17.8k | b1 = st.b1; |
321 | 17.8k | c1 = st.c1; |
322 | 17.8k | d1 = st.d1; |
323 | 17.8k | a2 = st.a2; |
324 | 17.8k | b2 = st.b2; |
325 | 17.8k | c2 = st.c2; |
326 | 17.8k | d2 = st.d2; |
327 | | |
328 | 53.3k | for (size_t lane = 0; lane < count1; ++lane) { |
329 | 35.5k | if (block + 1 == num_blocks[lane]) { |
330 | 477 | unsigned char* out = outputs + lane * MD5_DIGEST_LENGTH; |
331 | 477 | LittleEndian::Store32(out, extract_lane<Ops>(a1, lane)); |
332 | 477 | LittleEndian::Store32(out + 4, extract_lane<Ops>(b1, lane)); |
333 | 477 | LittleEndian::Store32(out + 8, extract_lane<Ops>(c1, lane)); |
334 | 477 | LittleEndian::Store32(out + 12, extract_lane<Ops>(d1, lane)); |
335 | 477 | } |
336 | 35.5k | } |
337 | 18.5k | for (size_t lane = 0; lane < count2; ++lane) { |
338 | 670 | if (block + 1 == num_blocks[N + lane]) { |
339 | 178 | unsigned char* out = outputs + (N + lane) * MD5_DIGEST_LENGTH; |
340 | 178 | LittleEndian::Store32(out, extract_lane<Ops>(a2, lane)); |
341 | 178 | LittleEndian::Store32(out + 4, extract_lane<Ops>(b2, lane)); |
342 | 178 | LittleEndian::Store32(out + 8, extract_lane<Ops>(c2, lane)); |
343 | 178 | LittleEndian::Store32(out + 12, extract_lane<Ops>(d2, lane)); |
344 | 178 | } |
345 | 670 | } |
346 | 17.8k | } |
347 | 171 | } |
348 | | |
349 | | void md5_binary_batch_avx2(const unsigned char* const inputs[], const size_t lengths[], |
350 | 156 | unsigned char* outputs, size_t count) { |
351 | 156 | constexpr size_t BATCH = 2 * AVX2MD5Ops::LANES; |
352 | 327 | for (size_t base = 0; base < count; base += BATCH) { |
353 | 171 | size_t batch = std::min(BATCH, count - base); |
354 | 171 | const unsigned char* batch_inputs[BATCH]; |
355 | 171 | size_t batch_lengths[BATCH]; |
356 | 826 | for (size_t i = 0; i < batch; ++i) { |
357 | 655 | batch_inputs[i] = lengths[base + i] == 0 ? &MD5_DUMMY_INPUT : inputs[base + i]; |
358 | 655 | batch_lengths[i] = lengths[base + i]; |
359 | 655 | } |
360 | 2.25k | for (size_t i = batch; i < BATCH; ++i) { |
361 | 2.08k | batch_inputs[i] = &MD5_DUMMY_INPUT; |
362 | 2.08k | batch_lengths[i] = 0; |
363 | 2.08k | } |
364 | 171 | md5_multi_buffer_compute<AVX2MD5Ops>(batch_inputs, batch_lengths, |
365 | 171 | outputs + base * MD5_DIGEST_LENGTH, batch); |
366 | 171 | } |
367 | 156 | } |
368 | | |
369 | | #endif |
370 | | |
371 | | } // namespace |
372 | | |
// Prepares the embedded MD5 context so that data can be fed in with update().
Md5Digest::Md5Digest() {
    MD5_Init(&_md5_ctx);
}
376 | | |
// Feeds `length` bytes starting at `data` into the running MD5 computation.
void Md5Digest::update(const void* data, size_t length) {
    MD5_Update(&_md5_ctx, data, length);
}
380 | | |
381 | 1.00k | void Md5Digest::digest() { |
382 | 1.00k | unsigned char buf[MD5_DIGEST_LENGTH]; |
383 | 1.00k | MD5_Final(buf, &_md5_ctx); |
384 | | |
385 | 1.00k | char hex_buf[MD5_HEX_LENGTH]; |
386 | 1.00k | md5_to_hex(buf, hex_buf); |
387 | 1.00k | _hex.assign(hex_buf, MD5_HEX_LENGTH); |
388 | 1.00k | } |
389 | | |
390 | | void md5_hex_batch(const unsigned char* const inputs[], const size_t lengths[], char* outputs, |
391 | 157 | size_t count) { |
392 | 157 | if (count == 0) { |
393 | 1 | return; |
394 | 1 | } |
395 | | |
396 | 156 | #ifdef __AVX2__ |
397 | 156 | std::vector<unsigned char> digests(count * MD5_DIGEST_LENGTH); |
398 | 156 | md5_binary_batch_avx2(inputs, lengths, digests.data(), count); |
399 | 811 | for (size_t i = 0; i < count; ++i) { |
400 | 655 | md5_to_hex(digests.data() + i * MD5_DIGEST_LENGTH, outputs + i * MD5_HEX_LENGTH); |
401 | 655 | } |
402 | | #else |
403 | | for (size_t i = 0; i < count; ++i) { |
404 | | unsigned char digest[MD5_DIGEST_LENGTH]; |
405 | | MD5(lengths[i] == 0 ? &MD5_DUMMY_INPUT : inputs[i], lengths[i], digest); |
406 | | md5_to_hex(digest, outputs + i * MD5_HEX_LENGTH); |
407 | | } |
408 | | #endif |
409 | 156 | } |
410 | | |
411 | | } // namespace doris |