/root/doris/be/src/util/simdutf8check.h
| Line | Count | Source | 
| 1 |  | // Licensed to the Apache Software Foundation (ASF) under one | 
| 2 |  | // or more contributor license agreements.  See the NOTICE file | 
| 3 |  | // distributed with this work for additional information | 
| 4 |  | // regarding copyright ownership.  The ASF licenses this file | 
| 5 |  | // to you under the Apache License, Version 2.0 (the | 
| 6 |  | // "License"); you may not use this file except in compliance | 
| 7 |  | // with the License.  You may obtain a copy of the License at | 
| 8 |  | // | 
| 9 |  | //   http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  | // | 
| 11 |  | // Unless required by applicable law or agreed to in writing, | 
| 12 |  | // software distributed under the License is distributed on an | 
| 13 |  | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
| 14 |  | // KIND, either express or implied.  See the License for the | 
| 15 |  | // specific language governing permissions and limitations | 
| 16 |  | // under the License. | 
| 17 |  |  | 
| 18 |  | #pragma once | 
| 19 |  |  | 
| 20 |  | #include <stdbool.h> | 
| 21 |  | #include <stddef.h> | 
| 22 |  | #include <stdint.h> | 
| 23 |  | #include <string.h> | 
| 24 |  | #include <x86intrin.h> | 
| 25 |  |  | 
| 26 |  | /* | 
| 27 |  |  * These functions are used for validating utf8 string. | 
| 28 |  |  * Details can be seen here: https://github.com/lemire/fastvalidate-utf-8 | 
| 29 |  |  */ | 
| 30 |  |  | 
| 31 |  | /* | 
| 32 |  |  * legal utf-8 byte sequence | 
| 33 |  |  * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 | 
| 34 |  |  * | 
| 35 |  |  *  Code Points        1st       2s       3s       4s | 
| 36 |  |  * U+0000..U+007F     00..7F | 
| 37 |  |  * U+0080..U+07FF     C2..DF   80..BF | 
| 38 |  |  * U+0800..U+0FFF     E0       A0..BF   80..BF | 
| 39 |  |  * U+1000..U+CFFF     E1..EC   80..BF   80..BF | 
| 40 |  |  * U+D000..U+D7FF     ED       80..9F   80..BF | 
| 41 |  |  * U+E000..U+FFFF     EE..EF   80..BF   80..BF | 
| 42 |  |  * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF | 
| 43 |  |  * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF | 
| 44 |  |  * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF | 
| 45 |  |  * | 
| 46 |  |  */ | 
| 47 |  |  | 
| 48 |  | // all byte values must be no larger than 0xF4 | 
| 49 | 10 | static inline void checkSmallerThan0xF4(__m128i current_bytes, __m128i* has_error) { | 
| 50 |  |     // unsigned, saturates to 0 below max | 
| 51 | 10 |     *has_error = _mm_or_si128(*has_error, _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4))); | 
| 52 | 10 | } | 
| 53 |  |  | 
| 54 | 10 | static inline __m128i continuationLengths(__m128i high_nibbles) { | 
| 55 | 10 |     return _mm_shuffle_epi8(_mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) | 
| 56 | 10 |                                           0, 0, 0, 0,             // 10xx (continuation) | 
| 57 | 10 |                                           2, 2,                   // 110x | 
| 58 | 10 |                                           3,                      // 1110 | 
| 59 | 10 |                                           4), // 1111, next should be 0 (not checked here) | 
| 60 | 10 |                             high_nibbles); | 
| 61 | 10 | } | 
| 62 |  |  | 
| 63 | 10 | static inline __m128i carryContinuations(__m128i initial_lengths, __m128i previous_carries) { | 
| 64 | 10 |     __m128i right1 = _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1), | 
| 65 | 10 |                                    _mm_set1_epi8(1)); | 
| 66 | 10 |     __m128i sum = _mm_add_epi8(initial_lengths, right1); | 
| 67 |  |  | 
| 68 | 10 |     __m128i right2 = | 
| 69 | 10 |             _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2), _mm_set1_epi8(2)); | 
| 70 | 10 |     return _mm_add_epi8(sum, right2); | 
| 71 | 10 | } | 
| 72 |  |  | 
| 73 |  | static inline void checkContinuations(__m128i initial_lengths, __m128i carries, | 
| 74 | 10 |                                       __m128i* has_error) { | 
| 75 |  |     // overlap || underlap | 
| 76 |  |     // carry > length && length > 0 || !(carry > length) && !(length > 0) | 
| 77 |  |     // (carries > length) == (lengths > 0) | 
| 78 | 10 |     __m128i overunder = _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths), | 
| 79 | 10 |                                        _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128())); | 
| 80 |  |  | 
| 81 | 10 |     *has_error = _mm_or_si128(*has_error, overunder); | 
| 82 | 10 | } | 
| 83 |  |  | 
| 84 |  | // when 0xED is found, next byte must be no larger than 0x9F | 
| 85 |  | // when 0xF4 is found, next byte must be no larger than 0x8F | 
| 86 |  | // next byte must be continuation, ie sign bit is set, so signed < is ok | 
| 87 |  | static inline void checkFirstContinuationMax(__m128i current_bytes, __m128i off1_current_bytes, | 
| 88 | 10 |                                              __m128i* has_error) { | 
| 89 | 10 |     __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED)); | 
| 90 | 10 |     __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4)); | 
| 91 |  |  | 
| 92 | 10 |     __m128i badfollowED = _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED); | 
| 93 | 10 |     __m128i badfollowF4 = _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4); | 
| 94 |  |  | 
| 95 | 10 |     *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4)); | 
| 96 | 10 | } | 
| 97 |  |  | 
| 98 |  | // map off1_hibits => error condition | 
| 99 |  | // hibits     off1    cur | 
| 100 |  | // C       => < C2 && true | 
| 101 |  | // E       => < E1 && < A0 | 
| 102 |  | // F       => < F1 && < 90 | 
| 103 |  | // else      false && false | 
| 104 |  | static inline void checkOverlong(__m128i current_bytes, __m128i off1_current_bytes, __m128i hibits, | 
| 105 | 10 |                                  __m128i previous_hibits, __m128i* has_error) { | 
| 106 | 10 |     __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1); | 
| 107 | 10 |     __m128i initial_mins = | 
| 108 | 10 |             _mm_shuffle_epi8(_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, | 
| 109 | 10 |                                            -128, -128, -128, // 10xx => false | 
| 110 | 10 |                                            0xC2, -128,       // 110x | 
| 111 | 10 |                                            0xE1,             // 1110 | 
| 112 | 10 |                                            0xF1), | 
| 113 | 10 |                              off1_hibits); | 
| 114 |  |  | 
| 115 | 10 |     __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); | 
| 116 |  |  | 
| 117 | 10 |     __m128i second_mins = | 
| 118 | 10 |             _mm_shuffle_epi8(_mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, | 
| 119 | 10 |                                            -128, -128, -128, // 10xx => false | 
| 120 | 10 |                                            127, 127,         // 110x => true | 
| 121 | 10 |                                            0xA0,             // 1110 | 
| 122 | 10 |                                            0x90), | 
| 123 | 10 |                              off1_hibits); | 
| 124 | 10 |     __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes); | 
| 125 | 10 |     *has_error = _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); | 
| 126 | 10 | } | 
| 127 |  |  | 
| 128 |  | struct processed_utf_bytes { | 
| 129 |  |     __m128i rawbytes; | 
| 130 |  |     __m128i high_nibbles; | 
| 131 |  |     __m128i carried_continuations; | 
| 132 |  | }; | 
| 133 |  |  | 
| 134 | 10 | static inline void count_nibbles(__m128i bytes, struct processed_utf_bytes* answer) { | 
| 135 | 10 |     answer->rawbytes = bytes; | 
| 136 | 10 |     answer->high_nibbles = _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)); | 
| 137 | 10 | } | 
| 138 |  |  | 
| 139 |  | // check whether the current bytes are valid UTF-8 | 
| 140 |  | // at the end of the function, previous gets updated | 
| 141 |  | static struct processed_utf_bytes checkUTF8Bytes(__m128i current_bytes, | 
| 142 |  |                                                  struct processed_utf_bytes* previous, | 
| 143 | 10 |                                                  __m128i* has_error) { | 
| 144 | 10 |     struct processed_utf_bytes pb; | 
| 145 | 10 |     count_nibbles(current_bytes, &pb); | 
| 146 |  |  | 
| 147 | 10 |     checkSmallerThan0xF4(current_bytes, has_error); | 
| 148 |  |  | 
| 149 | 10 |     __m128i initial_lengths = continuationLengths(pb.high_nibbles); | 
| 150 |  |  | 
| 151 | 10 |     pb.carried_continuations = carryContinuations(initial_lengths, previous->carried_continuations); | 
| 152 |  |  | 
| 153 | 10 |     checkContinuations(initial_lengths, pb.carried_continuations, has_error); | 
| 154 |  |  | 
| 155 | 10 |     __m128i off1_current_bytes = _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1); | 
| 156 | 10 |     checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error); | 
| 157 |  |  | 
| 158 | 10 |     checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles, previous->high_nibbles, | 
| 159 | 10 |                   has_error); | 
| 160 | 10 |     return pb; | 
| 161 | 10 | } | 
| 162 |  |  | 
| 163 | 12 | static bool validate_utf8_fast(const char* src, size_t len) { | 
| 164 | 12 |     size_t i = 0; | 
| 165 | 12 |     __m128i has_error = _mm_setzero_si128(); | 
| 166 | 12 |     struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(), | 
| 167 | 12 |                                            .high_nibbles = _mm_setzero_si128(), | 
| 168 | 12 |                                            .carried_continuations = _mm_setzero_si128()}; | 
| 169 | 12 |     if (len >= 16) { | 
| 170 | 2 |         for (; i <= len - 16; i += 16) { | 
| 171 | 1 |             __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i)); | 
| 172 | 1 |             previous = checkUTF8Bytes(current_bytes, &previous, &has_error); | 
| 173 | 1 |         } | 
| 174 | 1 |     } | 
| 175 |  |  | 
| 176 |  |     // last part | 
| 177 | 12 |     if (i < len) { | 
| 178 | 9 |         char buffer[16]; | 
| 179 | 9 |         memset(buffer, 0, 16); | 
| 180 | 9 |         memcpy(buffer, src + i, len - i); | 
| 181 | 9 |         __m128i current_bytes = _mm_loadu_si128((const __m128i*)(buffer)); | 
| 182 | 9 |         previous = checkUTF8Bytes(current_bytes, &previous, &has_error); | 
| 183 | 9 |     } else { | 
| 184 | 3 |         has_error = _mm_or_si128( | 
| 185 | 3 |                 _mm_cmpgt_epi8(previous.carried_continuations, | 
| 186 | 3 |                                _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1)), | 
| 187 | 3 |                 has_error); | 
| 188 | 3 |     } | 
| 189 |  |  | 
| 190 | 12 |     return _mm_testz_si128(has_error, has_error); | 
| 191 | 12 | } | 
| 192 |  |  | 
| 193 |  | #ifdef __AVX2__ | 
| 194 |  |  | 
| 195 |  | /*****************************/ | 
| 196 | 0 | static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) { | 
| 197 | 0 |     return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15); | 
| 198 | 0 | } | 
| 199 |  |  | 
| 200 | 0 | static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) { | 
| 201 | 0 |     return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14); | 
| 202 | 0 | } | 
| 203 |  |  | 
| 204 |  | // all byte values must be no larger than 0xF4 | 
| 205 | 0 | static inline void avxcheckSmallerThan0xF4(__m256i current_bytes, __m256i* has_error) { | 
| 206 | 0 |     // unsigned, saturates to 0 below max | 
| 207 | 0 |     *has_error = | 
| 208 | 0 |             _mm256_or_si256(*has_error, _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4))); | 
| 209 | 0 | } | 
| 210 |  |  | 
| 211 | 0 | static inline __m256i avxcontinuationLengths(__m256i high_nibbles) { | 
| 212 | 0 |     return _mm256_shuffle_epi8(_mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) | 
| 213 | 0 |                                                 0, 0, 0, 0,             // 10xx (continuation) | 
| 214 | 0 |                                                 2, 2,                   // 110x | 
| 215 | 0 |                                                 3,                      // 1110 | 
| 216 | 0 |                                                 4, // 1111, next should be 0 (not checked here) | 
| 217 | 0 |                                                 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) | 
| 218 | 0 |                                                 0, 0, 0, 0,             // 10xx (continuation) | 
| 219 | 0 |                                                 2, 2,                   // 110x | 
| 220 | 0 |                                                 3,                      // 1110 | 
| 221 | 0 |                                                 4 // 1111, next should be 0 (not checked here) | 
| 222 | 0 |                                                 ), | 
| 223 | 0 |                                high_nibbles); | 
| 224 | 0 | } | 
| 225 |  |  | 
| 226 | 0 | static inline __m256i avxcarryContinuations(__m256i initial_lengths, __m256i previous_carries) { | 
| 227 | 0 |     __m256i right1 = _mm256_subs_epu8(push_last_byte_of_a_to_b(previous_carries, initial_lengths), | 
| 228 | 0 |                                       _mm256_set1_epi8(1)); | 
| 229 | 0 |     __m256i sum = _mm256_add_epi8(initial_lengths, right1); | 
| 230 | 0 | 
 | 
| 231 | 0 |     __m256i right2 = _mm256_subs_epu8(push_last_2bytes_of_a_to_b(previous_carries, sum), | 
| 232 | 0 |                                       _mm256_set1_epi8(2)); | 
| 233 | 0 |     return _mm256_add_epi8(sum, right2); | 
| 234 | 0 | } | 
| 235 |  |  | 
| 236 |  | static inline void avxcheckContinuations(__m256i initial_lengths, __m256i carries, | 
| 237 | 0 |                                          __m256i* has_error) { | 
| 238 | 0 |     // overlap || underlap | 
| 239 | 0 |     // carry > length && length > 0 || !(carry > length) && !(length > 0) | 
| 240 | 0 |     // (carries > length) == (lengths > 0) | 
| 241 | 0 |     __m256i overunder = | 
| 242 | 0 |             _mm256_cmpeq_epi8(_mm256_cmpgt_epi8(carries, initial_lengths), | 
| 243 | 0 |                               _mm256_cmpgt_epi8(initial_lengths, _mm256_setzero_si256())); | 
| 244 | 0 | 
 | 
| 245 | 0 |     *has_error = _mm256_or_si256(*has_error, overunder); | 
| 246 | 0 | } | 
| 247 |  |  | 
| 248 |  | // when 0xED is found, next byte must be no larger than 0x9F | 
| 249 |  | // when 0xF4 is found, next byte must be no larger than 0x8F | 
| 250 |  | // next byte must be continuation, ie sign bit is set, so signed < is ok | 
| 251 |  | static inline void avxcheckFirstContinuationMax(__m256i current_bytes, __m256i off1_current_bytes, | 
| 252 | 0 |                                                 __m256i* has_error) { | 
| 253 | 0 |     __m256i maskED = _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED)); | 
| 254 | 0 |     __m256i maskF4 = _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xF4)); | 
| 255 | 0 | 
 | 
| 256 | 0 |     __m256i badfollowED = | 
| 257 | 0 |             _mm256_and_si256(_mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x9F)), maskED); | 
| 258 | 0 |     __m256i badfollowF4 = | 
| 259 | 0 |             _mm256_and_si256(_mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x8F)), maskF4); | 
| 260 | 0 | 
 | 
| 261 | 0 |     *has_error = _mm256_or_si256(*has_error, _mm256_or_si256(badfollowED, badfollowF4)); | 
| 262 | 0 | } | 
| 263 |  |  | 
| 264 |  | // map off1_hibits => error condition | 
| 265 |  | // hibits     off1    cur | 
| 266 |  | // C       => < C2 && true | 
| 267 |  | // E       => < E1 && < A0 | 
| 268 |  | // F       => < F1 && < 90 | 
| 269 |  | // else      false && false | 
| 270 |  | static inline void avxcheckOverlong(__m256i current_bytes, __m256i off1_current_bytes, | 
| 271 | 0 |                                     __m256i hibits, __m256i previous_hibits, __m256i* has_error) { | 
| 272 | 0 |     __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits); | 
| 273 | 0 |     __m256i initial_mins = | 
| 274 | 0 |             _mm256_shuffle_epi8(_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, | 
| 275 | 0 |                                                  -128, -128, -128, -128, // 10xx => false | 
| 276 | 0 |                                                  0xC2, -128,             // 110x | 
| 277 | 0 |                                                  0xE1,                   // 1110 | 
| 278 | 0 |                                                  0xF1, -128, -128, -128, -128, -128, -128, -128, | 
| 279 | 0 |                                                  -128, -128, -128, -128, -128, // 10xx => false | 
| 280 | 0 |                                                  0xC2, -128,                   // 110x | 
| 281 | 0 |                                                  0xE1,                         // 1110 | 
| 282 | 0 |                                                  0xF1), | 
| 283 | 0 |                                 off1_hibits); | 
| 284 | 0 | 
 | 
| 285 | 0 |     __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes); | 
| 286 | 0 | 
 | 
| 287 | 0 |     __m256i second_mins = | 
| 288 | 0 |             _mm256_shuffle_epi8(_mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, | 
| 289 | 0 |                                                  -128, -128, -128, -128, // 10xx => false | 
| 290 | 0 |                                                  127, 127,               // 110x => true | 
| 291 | 0 |                                                  0xA0,                   // 1110 | 
| 292 | 0 |                                                  0x90, -128, -128, -128, -128, -128, -128, -128, | 
| 293 | 0 |                                                  -128, -128, -128, -128, -128, // 10xx => false | 
| 294 | 0 |                                                  127, 127,                     // 110x => true | 
| 295 | 0 |                                                  0xA0,                         // 1110 | 
| 296 | 0 |                                                  0x90), | 
| 297 | 0 |                                 off1_hibits); | 
| 298 | 0 |     __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes); | 
| 299 | 0 |     *has_error = _mm256_or_si256(*has_error, _mm256_and_si256(initial_under, second_under)); | 
| 300 | 0 | } | 
| 301 |  |  | 
| 302 |  | struct avx_processed_utf_bytes { | 
| 303 |  |     __m256i rawbytes; | 
| 304 |  |     __m256i high_nibbles; | 
| 305 |  |     __m256i carried_continuations; | 
| 306 |  | }; | 
| 307 |  |  | 
| 308 | 0 | static inline void avx_count_nibbles(__m256i bytes, struct avx_processed_utf_bytes* answer) { | 
| 309 | 0 |     answer->rawbytes = bytes; | 
| 310 | 0 |     answer->high_nibbles = _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F)); | 
| 311 | 0 | } | 
| 312 |  |  | 
| 313 |  | #endif // __AVX2__ |