Coverage Report

Created: 2024-11-20 15:52

/root/doris/be/src/gutil/utf/rune.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * The authors of this software are Rob Pike and Ken Thompson.
3
 *              Copyright (c) 2002 by Lucent Technologies.
4
 * Permission to use, copy, modify, and distribute this software for any
5
 * purpose without fee is hereby granted, provided that this entire notice
6
 * is included in all copies of any software which is or includes a copy
7
 * or modification of this software and in all copies of the supporting
8
 * documentation for such software.
9
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
 */
14
#include "gutil/utf/utf.h"
15
#include "gutil/utf/utfdef.h"
16
17
enum
18
{
19
  Bit1  = 7,
20
  Bitx  = 6,
21
  Bit2  = 5,
22
  Bit3  = 4,
23
  Bit4  = 3,
24
  Bit5  = 2, 
25
26
  T1  = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
27
  Tx  = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
28
  T2  = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
29
  T3  = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
30
  T4  = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
31
  T5  = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
32
33
  Rune1 = (1<<(Bit1+0*Bitx))-1,   /* 0000 0000 0111 1111 */
34
  Rune2 = (1<<(Bit2+1*Bitx))-1,   /* 0000 0111 1111 1111 */
35
  Rune3 = (1<<(Bit3+2*Bitx))-1,   /* 1111 1111 1111 1111 */
36
  Rune4 = (1<<(Bit4+3*Bitx))-1,
37
                                        /* 0001 1111 1111 1111 1111 1111 */
38
39
  Maskx = (1<<Bitx)-1,      /* 0011 1111 */
40
  Testx = Maskx ^ 0xFF,     /* 1100 0000 */
41
42
  Bad = Runeerror,
43
};
44
45
/*
46
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
47
 * This is a slower but "safe" version of the old chartorune 
48
 * that works on strings that are not necessarily null-terminated.
49
 * 
50
 * If you know for sure that your string is null-terminated,
51
 * chartorune will be a bit faster.
52
 *
53
 * It is guaranteed not to attempt to access "length"
54
 * past the incoming pointer.  This is to avoid
55
 * possible access violations.  If the string appears to be
56
 * well-formed but incomplete (i.e., to get the whole Rune
57
 * we'd need to read past str+length) then we'll set the Rune
58
 * to Bad and return 0.
59
 *
60
 * Note that if we have decoding problems for other
61
 * reasons, we return 1 instead of 0.
62
 */
63
int
64
charntorune(Rune *rune, const char *str, int length)
65
0
{
66
0
  int c, c1, c2, c3;
67
0
  long l;
68
69
  /* When we're not allowed to read anything */
70
0
  if(length <= 0) {
71
0
    goto badlen;
72
0
  }
73
74
  /*
75
   * one character sequence (7-bit value)
76
   *  00000-0007F => T1
77
   */
78
0
  c = *(uchar*)str;
79
0
  if(c < Tx) {
80
0
    *rune = c;
81
0
    return 1;
82
0
  }
83
84
  // If we can't read more than one character we must stop
85
0
  if(length <= 1) {
86
0
    goto badlen;
87
0
  }
88
89
  /*
90
   * two character sequence (11-bit value)
91
   *  0080-07FF => T2 Tx
92
   */
93
0
  c1 = *(uchar*)(str+1) ^ Tx;
94
0
  if(c1 & Testx)
95
0
    goto bad;
96
0
  if(c < T3) {
97
0
    if(c < T2)
98
0
      goto bad;
99
0
    l = ((c << Bitx) | c1) & Rune2;
100
0
    if(l <= Rune1)
101
0
      goto bad;
102
0
    *rune = l;
103
0
    return 2;
104
0
  }
105
106
  // If we can't read more than two characters we must stop
107
0
  if(length <= 2) {
108
0
    goto badlen;
109
0
  }
110
111
  /*
112
   * three character sequence (16-bit value)
113
   *  0800-FFFF => T3 Tx Tx
114
   */
115
0
  c2 = *(uchar*)(str+2) ^ Tx;
116
0
  if(c2 & Testx)
117
0
    goto bad;
118
0
  if(c < T4) {
119
0
    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
120
0
    if(l <= Rune2)
121
0
      goto bad;
122
0
    *rune = l;
123
0
    return 3;
124
0
  }
125
126
0
  if (length <= 3)
127
0
    goto badlen;
128
129
  /*
130
   * four character sequence (21-bit value)
131
   *  10000-1FFFFF => T4 Tx Tx Tx
132
   */
133
0
  c3 = *(uchar*)(str+3) ^ Tx;
134
0
  if (c3 & Testx)
135
0
    goto bad;
136
0
  if (c < T5) {
137
0
    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
138
0
    if (l <= Rune3)
139
0
      goto bad;
140
0
    *rune = l;
141
0
    return 4;
142
0
  }
143
144
  // Support for 5-byte or longer UTF-8 would go here, but
145
  // since we don't have that, we'll just fall through to bad.
146
147
  /*
148
   * bad decoding
149
   */
150
0
bad:
151
0
  *rune = Bad;
152
0
  return 1;
153
0
badlen:
154
0
  *rune = Bad;
155
0
  return 0;
156
157
0
}
158
159
160
/*
161
 * This is the older "unsafe" version, which works fine on 
162
 * null-terminated strings.
163
 */
164
int
165
chartorune(Rune *rune, const char *str)
166
0
{
167
0
  int c, c1, c2, c3;
168
0
  long l;
169
170
  /*
171
   * one character sequence
172
   *  00000-0007F => T1
173
   */
174
0
  c = *(uchar*)str;
175
0
  if(c < Tx) {
176
0
    *rune = c;
177
0
    return 1;
178
0
  }
179
180
  /*
181
   * two character sequence
182
   *  0080-07FF => T2 Tx
183
   */
184
0
  c1 = *(uchar*)(str+1) ^ Tx;
185
0
  if(c1 & Testx)
186
0
    goto bad;
187
0
  if(c < T3) {
188
0
    if(c < T2)
189
0
      goto bad;
190
0
    l = ((c << Bitx) | c1) & Rune2;
191
0
    if(l <= Rune1)
192
0
      goto bad;
193
0
    *rune = l;
194
0
    return 2;
195
0
  }
196
197
  /*
198
   * three character sequence
199
   *  0800-FFFF => T3 Tx Tx
200
   */
201
0
  c2 = *(uchar*)(str+2) ^ Tx;
202
0
  if(c2 & Testx)
203
0
    goto bad;
204
0
  if(c < T4) {
205
0
    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
206
0
    if(l <= Rune2)
207
0
      goto bad;
208
0
    *rune = l;
209
0
    return 3;
210
0
  }
211
212
  /*
213
   * four character sequence (21-bit value)
214
   *  10000-1FFFFF => T4 Tx Tx Tx
215
   */
216
0
  c3 = *(uchar*)(str+3) ^ Tx;
217
0
  if (c3 & Testx)
218
0
    goto bad;
219
0
  if (c < T5) {
220
0
    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
221
0
    if (l <= Rune3)
222
0
      goto bad;
223
0
    *rune = l;
224
0
    return 4;
225
0
  }
226
227
  /*
228
   * Support for 5-byte or longer UTF-8 would go here, but
229
   * since we don't have that, we'll just fall through to bad.
230
   */
231
232
  /*
233
   * bad decoding
234
   */
235
0
bad:
236
0
  *rune = Bad;
237
0
  return 1;
238
0
}
239
240
int
241
0
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
242
0
  *consumed = charntorune(rune, str, length);
243
0
  return *rune != Runeerror || *consumed == 3;
244
0
}
245
    
246
int
247
runetochar(char *str, const Rune *rune)
248
0
{
249
  /* Runes are signed, so convert to unsigned for range check. */
250
0
  unsigned long c;
251
252
  /*
253
   * one character sequence
254
   *  00000-0007F => 00-7F
255
   */
256
0
  c = *rune;
257
0
  if(c <= Rune1) {
258
0
    str[0] = c;
259
0
    return 1;
260
0
  }
261
262
  /*
263
   * two character sequence
264
   *  0080-07FF => T2 Tx
265
   */
266
0
  if(c <= Rune2) {
267
0
    str[0] = T2 | (c >> 1*Bitx);
268
0
    str[1] = Tx | (c & Maskx);
269
0
    return 2;
270
0
  }
271
272
  /*
273
   * If the Rune is out of range, convert it to the error rune.
274
   * Do this test here because the error rune encodes to three bytes.
275
   * Doing it earlier would duplicate work, since an out of range
276
   * Rune wouldn't have fit in one or two bytes.
277
   */
278
0
  if (c > Runemax)
279
0
    c = Runeerror;
280
281
  /*
282
   * three character sequence
283
   *  0800-FFFF => T3 Tx Tx
284
   */
285
0
  if (c <= Rune3) {
286
0
    str[0] = T3 |  (c >> 2*Bitx);
287
0
    str[1] = Tx | ((c >> 1*Bitx) & Maskx);
288
0
    str[2] = Tx |  (c & Maskx);
289
0
    return 3;
290
0
  }
291
292
  /*
293
   * four character sequence (21-bit value)
294
   *     10000-1FFFFF => T4 Tx Tx Tx
295
   */
296
0
  str[0] = T4 | (c >> 3*Bitx);
297
0
  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
298
0
  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
299
0
  str[3] = Tx | (c & Maskx);
300
0
  return 4;
301
0
}
302
303
int
304
runelen(Rune rune)
305
0
{
306
0
  char str[10];
307
308
0
  return runetochar(str, &rune);
309
0
}
310
311
int
312
runenlen(const Rune *r, int nrune)
313
0
{
314
0
  int nb, c;
315
316
0
  nb = 0;
317
0
  while(nrune--) {
318
0
    c = *r++;
319
0
    if (c <= Rune1)
320
0
      nb++;
321
0
    else if (c <= Rune2)
322
0
      nb += 2;
323
0
    else if (c <= Rune3)
324
0
      nb += 3;
325
0
    else /* assert(c <= Rune4) */ 
326
0
      nb += 4;
327
0
  }
328
0
  return nb;
329
0
}
330
331
int
332
fullrune(const char *str, int n)
333
0
{
334
0
  if (n > 0) {
335
0
    int c = *(uchar*)str;
336
0
    if (c < Tx)
337
0
      return 1;
338
0
    if (n > 1) {
339
0
      if (c < T3)
340
0
        return 1;
341
0
      if (n > 2) {
342
0
        if (c < T4 || n > 3)
343
0
          return 1;
344
0
      }
345
0
    }
346
0
  }
347
0
  return 0;
348
0
}