Coverage Report

Created: 2024-11-21 10:56

/root/doris/be/src/util/utf8_check.cpp
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) cyb70289(https://github.com/cyb70289). All rights reserved.
2
// Use of this source code is governed by a MIT license that can be
3
// found in the LICENSE file.
4
5
/*
6
 * These functions are used for validating utf8 string.
7
 * Details can be seen here: https://github.com/cyb70289/utf8/
8
 */
9
10
#include "util/utf8_check.h"
11
12
#if defined(__i386) || defined(__x86_64__)
13
#include "util/simdutf8check.h"
14
#elif defined(__aarch64__)
15
#include <arm_neon.h>
16
#endif
17
18
/*
19
 * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
20
 *
21
 * Table 3-7. Well-Formed UTF-8 Byte Sequences
22
 *
23
 * +--------------------+------------+-------------+------------+-------------+
24
 * | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
25
 * +--------------------+------------+-------------+------------+-------------+
26
 * | U+0000..U+007F     | 00..7F     |             |            |             |
27
 * +--------------------+------------+-------------+------------+-------------+
28
 * | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
29
 * +--------------------+------------+-------------+------------+-------------+
30
 * | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
31
 * +--------------------+------------+-------------+------------+-------------+
32
 * | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
33
 * +--------------------+------------+-------------+------------+-------------+
34
 * | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
35
 * +--------------------+------------+-------------+------------+-------------+
36
 * | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
37
 * +--------------------+------------+-------------+------------+-------------+
38
 * | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
39
 * +--------------------+------------+-------------+------------+-------------+
40
 * | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
41
 * +--------------------+------------+-------------+------------+-------------+
42
 * | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
43
 * +--------------------+------------+-------------+------------+-------------+
44
 */
45
namespace doris {
46
2
bool validate_utf8_naive(const char* data, size_t len) {
47
2
    while (len) {
48
1
        int bytes;
49
1
        const unsigned char byte1 = data[0];
50
51
        /* 00..7F */
52
1
        if (byte1 <= 0x7F) {
53
0
            bytes = 1;
54
            /* C2..DF, 80..BF */
55
1
        } else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF &&
56
1
                   (signed char)data[1] <= (signed char)0xBF) {
57
0
            bytes = 2;
58
1
        } else if (len >= 3) {
59
0
            const unsigned char byte2 = data[1];
60
61
            /* Is byte2, byte3 between 0x80 ~ 0xBF */
62
0
            const int byte2_ok = (signed char)byte2 <= (signed char)0xBF;
63
0
            const int byte3_ok = (signed char)data[2] <= (signed char)0xBF;
64
65
0
            if (byte2_ok && byte3_ok &&
66
                /* E0, A0..BF, 80..BF */
67
0
                ((byte1 == 0xE0 && byte2 >= 0xA0) ||
68
                 /* E1..EC, 80..BF, 80..BF */
69
0
                 (byte1 >= 0xE1 && byte1 <= 0xEC) ||
70
                 /* ED, 80..9F, 80..BF */
71
0
                 (byte1 == 0xED && byte2 <= 0x9F) ||
72
                 /* EE..EF, 80..BF, 80..BF */
73
0
                 (byte1 >= 0xEE && byte1 <= 0xEF))) {
74
0
                bytes = 3;
75
0
            } else if (len >= 4) {
76
                /* Is byte4 between 0x80 ~ 0xBF */
77
0
                const int byte4_ok = (signed char)data[3] <= (signed char)0xBF;
78
79
0
                if (byte2_ok && byte3_ok && byte4_ok &&
80
                    /* F0, 90..BF, 80..BF, 80..BF */
81
0
                    ((byte1 == 0xF0 && byte2 >= 0x90) ||
82
                     /* F1..F3, 80..BF, 80..BF, 80..BF */
83
0
                     (byte1 >= 0xF1 && byte1 <= 0xF3) ||
84
                     /* F4, 80..8F, 80..BF, 80..BF */
85
0
                     (byte1 == 0xF4 && byte2 <= 0x8F))) {
86
0
                    bytes = 4;
87
0
                } else {
88
0
                    return false;
89
0
                }
90
0
            } else {
91
0
                return false;
92
0
            }
93
1
        } else {
94
1
            return false;
95
1
        }
96
97
0
        len -= bytes;
98
0
        data += bytes;
99
0
    }
100
101
1
    return true;
102
2
}
103
104
#if defined(__i386) || defined(__x86_64__)
105
4
bool validate_utf8(const char* src, size_t len) {
106
4
    return validate_utf8_fast(src, len);
107
4
}
108
#elif defined(__aarch64__)
109
/*
110
 * Map high nibble of "First Byte" to legal character length minus 1
111
 * 0x00 ~ 0xBF --> 0
112
 * 0xC0 ~ 0xDF --> 1
113
 * 0xE0 ~ 0xEF --> 2
114
 * 0xF0 ~ 0xFF --> 3
115
 */
116
const uint8_t _first_len_tbl[] = {
117
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
118
};
119
120
/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
121
static const uint8_t _first_range_tbl[] = {
122
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
123
};
124
125
/*
126
 * Range table, map range index to min and max values
127
 * Index 0    : 00 ~ 7F (First Byte, ascii)
128
 * Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte)
129
 * Index 4    : A0 ~ BF (Second Byte after E0)
130
 * Index 5    : 80 ~ 9F (Second Byte after ED)
131
 * Index 6    : 90 ~ BF (Second Byte after F0)
132
 * Index 7    : 80 ~ 8F (Second Byte after F4)
133
 * Index 8    : C2 ~ F4 (First Byte, non ascii)
134
 * Index 9~15 : illegal: u >= 255 && u <= 0
135
 */
136
static const uint8_t _range_min_tbl[] = {
137
        0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80,
138
        0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
139
};
140
static const uint8_t _range_max_tbl[] = {
141
        0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
142
        0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
143
};
144
145
/*
146
 * This table is for fast handling four special First Bytes(E0,ED,F0,F4), after
147
 * which the Second Byte are not 80~BF. It contains "range index adjustment".
148
 * - The idea is to minus byte with E0, use the result(0~31) as the index to
149
 *   lookup the "range index adjustment". Then add the adjustment to original
150
 *   range index to get the correct range.
151
 * - Range index adjustment
152
 *   +------------+---------------+------------------+----------------+
153
 *   | First Byte | original range| range adjustment | adjusted range |
154
 *   +------------+---------------+------------------+----------------+
155
 *   | E0         | 2             | 2                | 4              |
156
 *   +------------+---------------+------------------+----------------+
157
 *   | ED         | 2             | 3                | 5              |
158
 *   +------------+---------------+------------------+----------------+
159
 *   | F0         | 3             | 3                | 6              |
160
 *   +------------+---------------+------------------+----------------+
161
 *   | F4         | 4             | 4                | 8              |
162
 *   +------------+---------------+------------------+----------------+
163
 * - Below is a uint8x16x2 table, data is interleaved in NEON register. So I'm
164
 *   putting it vertically. 1st column is for E0~EF, 2nd column for F0~FF.
165
 */
166
static const uint8_t _range_adjust_tbl[] = {
167
        /* index -> 0~15  16~31 <- index */
168
        /*  E0 -> */ 2,
169
        3, /* <- F0  */
170
        0,
171
        0,
172
        0,
173
        0,
174
        0,
175
        0,
176
        0,
177
        4, /* <- F4  */
178
        0,
179
        0,
180
        0,
181
        0,
182
        0,
183
        0,
184
        0,
185
        0,
186
        0,
187
        0,
188
        0,
189
        0,
190
        0,
191
        0,
192
        0,
193
        0,
194
        /*  ED -> */ 3,
195
        0,
196
        0,
197
        0,
198
        0,
199
        0,
200
};
201
202
/* 2x ~ 4x faster than naive method */
203
/* Return true on success, false on error */
204
bool utf8_range(const char* data, size_t len) {
205
    if (len >= 16) {
206
        uint8x16_t prev_input = vdupq_n_u8(0);
207
        uint8x16_t prev_first_len = vdupq_n_u8(0);
208
209
        /* Cached tables */
210
        const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl);
211
        const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl);
212
        const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl);
213
        const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl);
214
        const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl);
215
216
        /* Cached values */
217
        const uint8x16_t const_1 = vdupq_n_u8(1);
218
        const uint8x16_t const_2 = vdupq_n_u8(2);
219
        const uint8x16_t const_e0 = vdupq_n_u8(0xE0);
220
221
        uint8x16_t error = vdupq_n_u8(0);
222
223
        while (len >= 16) {
224
            const uint8x16_t input = vld1q_u8((const uint8_t*)data);
225
226
            /* high_nibbles = input >> 4 */
227
            const uint8x16_t high_nibbles = vshrq_n_u8(input, 4);
228
229
            /* first_len = legal character length minus 1 */
230
            /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
231
            /* first_len = first_len_tbl[high_nibbles] */
232
            const uint8x16_t first_len = vqtbl1q_u8(first_len_tbl, high_nibbles);
233
234
            /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
235
            /* range = first_range_tbl[high_nibbles] */
236
            uint8x16_t range = vqtbl1q_u8(first_range_tbl, high_nibbles);
237
238
            /* Second Byte: set range index to first_len */
239
            /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
240
            /* range |= (first_len, prev_first_len) << 1 byte */
241
            range = vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15));
242
243
            /* Third Byte: set range index to saturate_sub(first_len, 1) */
244
            /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
245
            uint8x16_t tmp1, tmp2;
246
            /* tmp1 = saturate_sub(first_len, 1) */
247
            tmp1 = vqsubq_u8(first_len, const_1);
248
            /* tmp2 = saturate_sub(prev_first_len, 1) */
249
            tmp2 = vqsubq_u8(prev_first_len, const_1);
250
            /* range |= (tmp1, tmp2) << 2 bytes */
251
            range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 14));
252
253
            /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
254
            /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
255
            /* tmp1 = saturate_sub(first_len, 2) */
256
            tmp1 = vqsubq_u8(first_len, const_2);
257
            /* tmp2 = saturate_sub(prev_first_len, 2) */
258
            tmp2 = vqsubq_u8(prev_first_len, const_2);
259
            /* range |= (tmp1, tmp2) << 3 bytes */
260
            range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 13));
261
262
            /*
263
             * Now we have below range indices caluclated
264
             * Correct cases:
265
             * - 8 for C0~FF
266
             * - 3 for 1st byte after F0~FF
267
             * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
268
             * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
269
             *         3rd byte after F0~FF
270
             * - 0 for others
271
             * Error cases:
272
             *   9,10,11 if non ascii First Byte overlaps
273
             *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
274
             */
275
276
            /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
277
            /* See _range_adjust_tbl[] definition for details */
278
            /* Overlaps lead to index 9~15, which are illegal in range table */
279
            uint8x16_t shift1 = vextq_u8(prev_input, input, 15);
280
            uint8x16_t pos = vsubq_u8(shift1, const_e0);
281
            range = vaddq_u8(range, vqtbl2q_u8(range_adjust_tbl, pos));
282
283
            /* Load min and max values per calculated range index */
284
            uint8x16_t minv = vqtbl1q_u8(range_min_tbl, range);
285
            uint8x16_t maxv = vqtbl1q_u8(range_max_tbl, range);
286
287
            /* Check value range */
288
            error = vorrq_u8(error, vcltq_u8(input, minv));
289
            error = vorrq_u8(error, vcgtq_u8(input, maxv));
290
291
            prev_input = input;
292
            prev_first_len = first_len;
293
294
            data += 16;
295
            len -= 16;
296
        }
297
298
        /* Delay error check till loop ends */
299
        if (vmaxvq_u8(error)) return false;
300
301
        /* Find previous token (not 80~BF) */
302
        uint32_t token4;
303
        vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3);
304
305
        const int8_t* token = (const int8_t*)&token4;
306
        int lookahead = 0;
307
        if (token[3] > (int8_t)0xBF)
308
            lookahead = 1;
309
        else if (token[2] > (int8_t)0xBF)
310
            lookahead = 2;
311
        else if (token[1] > (int8_t)0xBF)
312
            lookahead = 3;
313
314
        data -= lookahead;
315
        len += lookahead;
316
    }
317
318
    /* Check remaining bytes with naive method */
319
    return validate_utf8_naive(data, len);
320
}
321
322
bool validate_utf8(const char* src, size_t len) {
323
    return utf8_range(src, len);
324
}
325
#else
326
bool validate_utf8(const char* src, size_t len) {
327
    return validate_utf8_naive(src, len);
328
}
329
#endif
330
} // namespace doris