/root/doris/be/src/util/simdutf8check.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <stdbool.h> |
21 | | #include <stddef.h> |
22 | | #include <stdint.h> |
23 | | #include <string.h> |
24 | | #include <x86intrin.h> |
25 | | |
26 | | /* |
27 | | * These functions are used for validating utf8 string. |
28 | | * Details can be seen here: https://github.com/lemire/fastvalidate-utf-8 |
29 | | */ |
30 | | |
31 | | /* |
32 | | * legal utf-8 byte sequence |
33 | | * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 |
34 | | * |
35 | | * Code Points 1st 2s 3s 4s |
36 | | * U+0000..U+007F 00..7F |
37 | | * U+0080..U+07FF C2..DF 80..BF |
38 | | * U+0800..U+0FFF E0 A0..BF 80..BF |
39 | | * U+1000..U+CFFF E1..EC 80..BF 80..BF |
40 | | * U+D000..U+D7FF ED 80..9F 80..BF |
41 | | * U+E000..U+FFFF EE..EF 80..BF 80..BF |
42 | | * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
43 | | * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
44 | | * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
45 | | * |
46 | | */ |
47 | | |
48 | | // all byte values must be no larger than 0xF4 |
49 | 3 | static inline void checkSmallerThan0xF4(__m128i current_bytes, __m128i* has_error) { |
50 | | // unsigned, saturates to 0 below max |
51 | 3 | *has_error = _mm_or_si128(*has_error, _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4))); |
52 | 3 | } |
53 | | |
54 | 3 | static inline __m128i continuationLengths(__m128i high_nibbles) { |
55 | 3 | return _mm_shuffle_epi8(_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) |
56 | 3 | 0, 0, 0, 0, // 10xx (continuation) |
57 | 3 | 2, 2, // 110x |
58 | 3 | 3, // 1110 |
59 | 3 | 4), // 1111, next should be 0 (not checked here) |
60 | 3 | high_nibbles); |
61 | 3 | } |
62 | | |
63 | 3 | static inline __m128i carryContinuations(__m128i initial_lengths, __m128i previous_carries) { |
64 | 3 | __m128i right1 = _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1), |
65 | 3 | _mm_set1_epi8(1)); |
66 | 3 | __m128i sum = _mm_add_epi8(initial_lengths, right1); |
67 | | |
68 | 3 | __m128i right2 = |
69 | 3 | _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2), _mm_set1_epi8(2)); |
70 | 3 | return _mm_add_epi8(sum, right2); |
71 | 3 | } |
72 | | |
73 | | static inline void checkContinuations(__m128i initial_lengths, __m128i carries, |
74 | 3 | __m128i* has_error) { |
75 | | // overlap || underlap |
76 | | // carry > length && length > 0 || !(carry > length) && !(length > 0) |
77 | | // (carries > length) == (lengths > 0) |
78 | 3 | __m128i overunder = _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths), |
79 | 3 | _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128())); |
80 | | |
81 | 3 | *has_error = _mm_or_si128(*has_error, overunder); |
82 | 3 | } |
83 | | |
84 | | // when 0xED is found, next byte must be no larger than 0x9F |
85 | | // when 0xF4 is found, next byte must be no larger than 0x8F |
86 | | // next byte must be continuation, ie sign bit is set, so signed < is ok |
87 | | static inline void checkFirstContinuationMax(__m128i current_bytes, __m128i off1_current_bytes, |
88 | 3 | __m128i* has_error) { |
89 | 3 | __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED)); |
90 | 3 | __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4)); |
91 | | |
92 | 3 | __m128i badfollowED = _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED); |
93 | 3 | __m128i badfollowF4 = _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4); |
94 | | |
95 | 3 | *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4)); |
96 | 3 | } |
97 | | |
98 | | // map off1_hibits => error condition |
99 | | // hibits off1 cur |
100 | | // C => < C2 && true |
101 | | // E => < E1 && < A0 |
102 | | // F => < F1 && < 90 |
103 | | // else false && false |
104 | | static inline void checkOverlong(__m128i current_bytes, __m128i off1_current_bytes, __m128i hibits, |
105 | 3 | __m128i previous_hibits, __m128i* has_error) { |
106 | 3 | __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1); |
107 | 3 | __m128i initial_mins = |
108 | 3 | _mm_shuffle_epi8(_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, |
109 | 3 | -128, -128, -128, // 10xx => false |
110 | 3 | 0xC2, -128, // 110x |
111 | 3 | 0xE1, // 1110 |
112 | 3 | 0xF1), |
113 | 3 | off1_hibits); |
114 | | |
115 | 3 | __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); |
116 | | |
117 | 3 | __m128i second_mins = |
118 | 3 | _mm_shuffle_epi8(_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, |
119 | 3 | -128, -128, -128, // 10xx => false |
120 | 3 | 127, 127, // 110x => true |
121 | 3 | 0xA0, // 1110 |
122 | 3 | 0x90), |
123 | 3 | off1_hibits); |
124 | 3 | __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes); |
125 | 3 | *has_error = _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); |
126 | 3 | } |
127 | | |
128 | | struct processed_utf_bytes { |
129 | | __m128i rawbytes; |
130 | | __m128i high_nibbles; |
131 | | __m128i carried_continuations; |
132 | | }; |
133 | | |
134 | 3 | static inline void count_nibbles(__m128i bytes, struct processed_utf_bytes* answer) { |
135 | 3 | answer->rawbytes = bytes; |
136 | 3 | answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)); |
137 | 3 | } |
138 | | |
139 | | // check whether the current bytes are valid UTF-8 |
140 | | // at the end of the function, previous gets updated |
141 | | static struct processed_utf_bytes checkUTF8Bytes(__m128i current_bytes, |
142 | | struct processed_utf_bytes* previous, |
143 | 3 | __m128i* has_error) { |
144 | 3 | struct processed_utf_bytes pb; |
145 | 3 | count_nibbles(current_bytes, &pb); |
146 | | |
147 | 3 | checkSmallerThan0xF4(current_bytes, has_error); |
148 | | |
149 | 3 | __m128i initial_lengths = continuationLengths(pb.high_nibbles); |
150 | | |
151 | 3 | pb.carried_continuations = carryContinuations(initial_lengths, previous->carried_continuations); |
152 | | |
153 | 3 | checkContinuations(initial_lengths, pb.carried_continuations, has_error); |
154 | | |
155 | 3 | __m128i off1_current_bytes = _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1); |
156 | 3 | checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); |
157 | | |
158 | 3 | checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, previous->high_nibbles, |
159 | 3 | has_error); |
160 | 3 | return pb; |
161 | 3 | } |
162 | | |
163 | 4 | static bool validate_utf8_fast(const char* src, size_t len) { |
164 | 4 | size_t i = 0; |
165 | 4 | __m128i has_error = _mm_setzero_si128(); |
166 | 4 | struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(), |
167 | 4 | .high_nibbles = _mm_setzero_si128(), |
168 | 4 | .carried_continuations = _mm_setzero_si128()}; |
169 | 4 | if (len >= 16) { |
170 | 2 | for (; i <= len - 16; i += 16) { |
171 | 1 | __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i)); |
172 | 1 | previous = checkUTF8Bytes(current_bytes, &previous, &has_error); |
173 | 1 | } |
174 | 1 | } |
175 | | |
176 | | // last part |
177 | 4 | if (i < len) { |
178 | 2 | char buffer[16]; |
179 | 2 | memset(buffer, 0, 16); |
180 | 2 | memcpy(buffer, src + i, len - i); |
181 | 2 | __m128i current_bytes = _mm_loadu_si128((const __m128i*)(buffer)); |
182 | 2 | previous = checkUTF8Bytes(current_bytes, &previous, &has_error); |
183 | 2 | } else { |
184 | 2 | has_error = _mm_or_si128( |
185 | 2 | _mm_cmpgt_epi8(previous.carried_continuations, |
186 | 2 | _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), |
187 | 2 | has_error); |
188 | 2 | } |
189 | | |
190 | 4 | return _mm_testz_si128(has_error, has_error); |
191 | 4 | } |
192 | | |
193 | | #ifdef __AVX2__ |
194 | | |
195 | | /*****************************/ |
196 | | static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) { |
197 | | return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15); |
198 | | } |
199 | | |
200 | | static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) { |
201 | | return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14); |
202 | | } |
203 | | |
204 | | // all byte values must be no larger than 0xF4 |
205 | | static inline void avxcheckSmallerThan0xF4(__m256i current_bytes, __m256i* has_error) { |
206 | | // unsigned, saturates to 0 below max |
207 | | *has_error = |
208 | | _mm256_or_si256(*has_error, _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4))); |
209 | | } |
210 | | |
211 | | static inline __m256i avxcontinuationLengths(__m256i high_nibbles) { |
212 | | return _mm256_shuffle_epi8(_mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) |
213 | | 0, 0, 0, 0, // 10xx (continuation) |
214 | | 2, 2, // 110x |
215 | | 3, // 1110 |
216 | | 4, // 1111, next should be 0 (not checked here) |
217 | | 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) |
218 | | 0, 0, 0, 0, // 10xx (continuation) |
219 | | 2, 2, // 110x |
220 | | 3, // 1110 |
221 | | 4 // 1111, next should be 0 (not checked here) |
222 | | ), |
223 | | high_nibbles); |
224 | | } |
225 | | |
226 | | static inline __m256i avxcarryContinuations(__m256i initial_lengths, __m256i previous_carries) { |
227 | | __m256i right1 = _mm256_subs_epu8(push_last_byte_of_a_to_b(previous_carries, initial_lengths), |
228 | | _mm256_set1_epi8(1)); |
229 | | __m256i sum = _mm256_add_epi8(initial_lengths, right1); |
230 | | |
231 | | __m256i right2 = _mm256_subs_epu8(push_last_2bytes_of_a_to_b(previous_carries, sum), |
232 | | _mm256_set1_epi8(2)); |
233 | | return _mm256_add_epi8(sum, right2); |
234 | | } |
235 | | |
236 | | static inline void avxcheckContinuations(__m256i initial_lengths, __m256i carries, |
237 | | __m256i* has_error) { |
238 | | // overlap || underlap |
239 | | // carry > length && length > 0 || !(carry > length) && !(length > 0) |
240 | | // (carries > length) == (lengths > 0) |
241 | | __m256i overunder = |
242 | | _mm256_cmpeq_epi8(_mm256_cmpgt_epi8(carries, initial_lengths), |
243 | | _mm256_cmpgt_epi8(initial_lengths, _mm256_setzero_si256())); |
244 | | |
245 | | *has_error = _mm256_or_si256(*has_error, overunder); |
246 | | } |
247 | | |
248 | | // when 0xED is found, next byte must be no larger than 0x9F |
249 | | // when 0xF4 is found, next byte must be no larger than 0x8F |
250 | | // next byte must be continuation, ie sign bit is set, so signed < is ok |
251 | | static inline void avxcheckFirstContinuationMax(__m256i current_bytes, __m256i off1_current_bytes, |
252 | | __m256i* has_error) { |
253 | | __m256i maskED = _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED)); |
254 | | __m256i maskF4 = _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xF4)); |
255 | | |
256 | | __m256i badfollowED = |
257 | | _mm256_and_si256(_mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x9F)), maskED); |
258 | | __m256i badfollowF4 = |
259 | | _mm256_and_si256(_mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x8F)), maskF4); |
260 | | |
261 | | *has_error = _mm256_or_si256(*has_error, _mm256_or_si256(badfollowED, badfollowF4)); |
262 | | } |
263 | | |
264 | | // map off1_hibits => error condition |
265 | | // hibits off1 cur |
266 | | // C => < C2 && true |
267 | | // E => < E1 && < A0 |
268 | | // F => < F1 && < 90 |
269 | | // else false && false |
270 | | static inline void avxcheckOverlong(__m256i current_bytes, __m256i off1_current_bytes, |
271 | | __m256i hibits, __m256i previous_hibits, __m256i* has_error) { |
272 | | __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits); |
273 | | __m256i initial_mins = |
274 | | _mm256_shuffle_epi8(_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, |
275 | | -128, -128, -128, -128, // 10xx => false |
276 | | 0xC2, -128, // 110x |
277 | | 0xE1, // 1110 |
278 | | 0xF1, -128, -128, -128, -128, -128, -128, -128, |
279 | | -128, -128, -128, -128, -128, // 10xx => false |
280 | | 0xC2, -128, // 110x |
281 | | 0xE1, // 1110 |
282 | | 0xF1), |
283 | | off1_hibits); |
284 | | |
285 | | __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); |
286 | | |
287 | | __m256i second_mins = |
288 | | _mm256_shuffle_epi8(_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, |
289 | | -128, -128, -128, -128, // 10xx => false |
290 | | 127, 127, // 110x => true |
291 | | 0xA0, // 1110 |
292 | | 0x90, -128, -128, -128, -128, -128, -128, -128, |
293 | | -128, -128, -128, -128, -128, // 10xx => false |
294 | | 127, 127, // 110x => true |
295 | | 0xA0, // 1110 |
296 | | 0x90), |
297 | | off1_hibits); |
298 | | __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); |
299 | | *has_error = _mm256_or_si256(*has_error, _mm256_and_si256(initial_under, second_under)); |
300 | | } |
301 | | |
302 | | struct avx_processed_utf_bytes { |
303 | | __m256i rawbytes; |
304 | | __m256i high_nibbles; |
305 | | __m256i carried_continuations; |
306 | | }; |
307 | | |
308 | | static inline void avx_count_nibbles(__m256i bytes, struct avx_processed_utf_bytes* answer) { |
309 | | answer->rawbytes = bytes; |
310 | | answer->high_nibbles = _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F)); |
311 | | } |
312 | | |
313 | | #endif // __AVX2__ |