/root/doris/be/src/gutil/utf/rune.c

Source (jump to first uncovered line)
/*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
#include "gutil/utf/utf.h"
#include "gutil/utf/utfdef.h"

/*
 * Bit-layout constants shared by the UTF-8 encoder and decoder below.
 *
 *   BitN   number of payload bits carried by the lead byte of an
 *          N-byte sequence; Bitx is the payload width of every
 *          continuation byte.
 *   TN     tag (the fixed high bits) of the lead byte of an N-byte
 *          sequence; Tx is the tag of a continuation byte.  Because the
 *          tags are increasing, "c < T(N+1)" tests for a lead byte of
 *          at most N bytes.
 *   RuneN  largest value representable in an N-byte sequence.
 *   Maskx  selects the payload bits of a continuation byte.
 *   Testx  selects the tag bits of a continuation byte.
 *   Bad    value stored in the output rune when decoding fails.
 */
enum
{
  Bit1  = 7,
  Bitx  = 6,
  Bit2  = 5,
  Bit3  = 4,
  Bit4  = 3,
  Bit5  = 2, 

  /* Lead-byte tags: all bits above the payload are set, the rest clear. */
  T1  = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
  Tx  = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
  T2  = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
  T3  = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
  T4  = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
  T5  = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */

  /* Maximum value per sequence length: lead payload + N-1 continuation payloads. */
  Rune1 = (1<<(Bit1+0*Bitx))-1,   /* 0000 0000 0111 1111 */
  Rune2 = (1<<(Bit2+1*Bitx))-1,   /* 0000 0111 1111 1111 */
  Rune3 = (1<<(Bit3+2*Bitx))-1,   /* 1111 1111 1111 1111 */
  Rune4 = (1<<(Bit4+3*Bitx))-1,
                                        /* 0001 1111 1111 1111 1111 1111 */

  Maskx = (1<<Bitx)-1,      /* 0011 1111 */
  Testx = Maskx ^ 0xFF,     /* 1100 0000 */

  Bad = Runeerror,
};

/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 *
 * charntorune: bounds-checked variant of chartorune.
 *
 * Decodes the first rune of str while never reading a byte at or
 * beyond str+length, so str need not be null-terminated.  If the
 * string is known to be null-terminated, chartorune is a bit faster.
 *
 * Return value:
 *   n in 1..4  a rune occupying n bytes was decoded into *rune.
 *   1          the bytes examined are not a valid encoding;
 *              *rune is set to Bad.
 *   0          the decoder would have had to examine a byte at or
 *              beyond str+length (this includes length <= 0);
 *              *rune is set to Bad.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
  const uchar *src = (const uchar*)str;
  int lead, t1, t2, t3;
  long value;

  /* Not allowed to read even the lead byte. */
  if(length < 1)
    goto truncated;

  /*
   * one byte (7-bit value)
   *  00000-0007F => T1
   */
  lead = src[0];
  if(lead < Tx) {
    *rune = lead;
    return 1;
  }

  /* Every remaining form needs a second byte. */
  if(length < 2)
    goto truncated;

  /*
   * XOR with Tx strips the tag from a proper continuation byte, leaving
   * only payload bits; any bit left under Testx means it was not one.
   */
  t1 = src[1] ^ Tx;
  if((t1 & Testx) != 0)
    goto malformed;

  /*
   * two bytes (11-bit value)
   *  0080-07FF => T2 Tx
   */
  if(lead < T3) {
    value = ((lead << Bitx) | t1) & Rune2;
    /* lead < T2 is a stray continuation byte; value <= Rune1 is overlong. */
    if(lead >= T2 && value > Rune1) {
      *rune = value;
      return 2;
    }
    goto malformed;
  }

  /* Every remaining form needs a third byte. */
  if(length < 3)
    goto truncated;

  t2 = src[2] ^ Tx;
  if((t2 & Testx) != 0)
    goto malformed;

  /*
   * three bytes (16-bit value)
   *  0800-FFFF => T3 Tx Tx
   */
  if(lead < T4) {
    value = ((((lead << Bitx) | t1) << Bitx) | t2) & Rune3;
    /* value <= Rune2 would have fit in two bytes: overlong. */
    if(value > Rune2) {
      *rune = value;
      return 3;
    }
    goto malformed;
  }

  /* Every remaining form needs a fourth byte. */
  if(length < 4)
    goto truncated;

  t3 = src[3] ^ Tx;
  if((t3 & Testx) != 0)
    goto malformed;

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if(lead < T5) {
    value = ((((((lead << Bitx) | t1) << Bitx) | t2) << Bitx) | t3) & Rune4;
    /* value <= Rune3 would have fit in three bytes: overlong. */
    if(value > Rune3) {
      *rune = value;
      return 4;
    }
    goto malformed;
  }

  /*
   * Lead bytes at or above T5 would start a 5-byte or longer sequence,
   * which is not supported; they fall through as malformed.
   */

malformed:
  *rune = Bad;
  return 1;

truncated:
  *rune = Bad;
  return 0;
}


/*
 * chartorune: the older "unsafe" decoder, with no length argument.
 *
 * Decodes the first rune of str into *rune and returns the number of
 * bytes it occupied (1..4).  On a malformed sequence *rune is set to
 * Bad and 1 is returned.
 *
 * It works on null-terminated strings because each continuation byte
 * is validated before the next one is read, and a NUL byte fails that
 * validation, so decoding stops at the terminator.
 */
int
chartorune(Rune *rune, const char *str)
{
  const uchar *src = (const uchar*)str;
  int lead, t1, t2, t3;
  long value;

  /*
   * one byte
   *  00000-0007F => T1
   */
  lead = src[0];
  if(lead < Tx) {
    *rune = lead;
    return 1;
  }

  /*
   * XOR with Tx strips the tag from a proper continuation byte; any
   * bit left under Testx means the byte was not a continuation byte.
   */
  t1 = src[1] ^ Tx;
  if((t1 & Testx) != 0)
    goto malformed;

  /*
   * two bytes
   *  0080-07FF => T2 Tx
   */
  if(lead < T3) {
    value = ((lead << Bitx) | t1) & Rune2;
    /* lead < T2 is a stray continuation byte; value <= Rune1 is overlong. */
    if(lead >= T2 && value > Rune1) {
      *rune = value;
      return 2;
    }
    goto malformed;
  }

  t2 = src[2] ^ Tx;
  if((t2 & Testx) != 0)
    goto malformed;

  /*
   * three bytes
   *  0800-FFFF => T3 Tx Tx
   */
  if(lead < T4) {
    value = ((((lead << Bitx) | t1) << Bitx) | t2) & Rune3;
    /* value <= Rune2 would have fit in two bytes: overlong. */
    if(value > Rune2) {
      *rune = value;
      return 3;
    }
    goto malformed;
  }

  t3 = src[3] ^ Tx;
  if((t3 & Testx) != 0)
    goto malformed;

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if(lead < T5) {
    value = ((((((lead << Bitx) | t1) << Bitx) | t2) << Bitx) | t3) & Rune4;
    /* value <= Rune3 would have fit in three bytes: overlong. */
    if(value > Rune3) {
      *rune = value;
      return 4;
    }
    goto malformed;
  }

  /*
   * Lead bytes at or above T5 would start a 5-byte or longer sequence,
   * which is not supported; they fall through as malformed.
   */

malformed:
  *rune = Bad;
  return 1;
}

/*
 * Decodes one rune with charntorune and reports whether the input was
 * a valid encoding.
 *
 * *rune receives the decoded rune and *consumed the value returned by
 * charntorune.  The result is nonzero when the rune is not Runeerror,
 * or when it is Runeerror but three bytes were consumed: charntorune
 * returns 0 or 1 on failure, so a count of 3 means the input really
 * was an encoded Runeerror rather than a decoding failure.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
  *consumed = charntorune(rune, str, length);

  if (*rune != Runeerror)
    return 1;

  /* Runeerror: only valid if it was genuinely present in the input. */
  return *consumed == 3;
}
    
/*
 * Encodes *rune as UTF-8 into str and returns the number of bytes
 * written (1..4).  No terminating NUL is written.  A rune outside
 * 0..Runemax is encoded as Runeerror.
 */
int
runetochar(char *str, const Rune *rune)
{
  /*
   * Runes are signed; the unsigned copy makes negative values compare
   * as very large, so they fail the range checks below.
   */
  unsigned long code = *rune;

  /*
   * one byte
   *  00000-0007F => 00-7F
   */
  if(code <= Rune1) {
    str[0] = code;
    return 1;
  }

  /*
   * two bytes
   *  0080-07FF => T2 Tx
   */
  if(code <= Rune2) {
    str[0] = T2 | (code >> Bitx);
    str[1] = Tx | (code & Maskx);
    return 2;
  }

  /*
   * Substitute the error rune for anything out of range.  The check is
   * deferred to this point because an out-of-range value can never
   * have matched the one- or two-byte cases, and the error rune itself
   * takes the three-byte path below.
   */
  if(code > Runemax)
    code = Runeerror;

  /*
   * four bytes (21-bit value)
   *  10000-1FFFFF => T4 Tx Tx Tx
   */
  if(code > Rune3) {
    str[0] = T4 | (code >> 3*Bitx);
    str[1] = Tx | ((code >> 2*Bitx) & Maskx);
    str[2] = Tx | ((code >> Bitx) & Maskx);
    str[3] = Tx | (code & Maskx);
    return 4;
  }

  /*
   * three bytes
   *  0800-FFFF => T3 Tx Tx
   */
  str[0] = T3 | (code >> 2*Bitx);
  str[1] = Tx | ((code >> Bitx) & Maskx);
  str[2] = Tx | (code & Maskx);
  return 3;
}

/*
 * Returns the number of bytes runetochar produces for rune, found by
 * encoding it into a throwaway buffer.
 */
int
runelen(Rune rune)
{
  char scratch[10];
  int nbytes;

  nbytes = runetochar(scratch, &rune);
  return nbytes;
}

int
runenlen(const Rune *r, int nrune)
{
  int nb, c;

  nb = 0;
  while(nrune--) {
    c = *r++;
    if (c <= Rune1)
      nb++;
    else if (c <= Rune2)
      nb += 2;
    else if (c <= Rune3)
      nb += 3;
    else /* assert(c <= Rune4) */ 
      nb += 4;
  }
  return nb;
}

/*
 * Reports whether the first n bytes of str are enough to hold a
 * complete rune.  Only the lead byte is inspected: the answer is based
 * on the sequence length that byte announces, not on whether the
 * following bytes are valid.  Returns 1 if enough bytes are present,
 * 0 otherwise (including n <= 0).
 */
int
fullrune(const char *str, int n)
{
  int lead;

  if (n <= 0)
    return 0;

  lead = *(uchar*)str;

  /* One-byte sequence: always complete. */
  if (lead < Tx)
    return 1;
  if (n <= 1)
    return 0;

  /* Lead below T3 needs at most two bytes. */
  if (lead < T3)
    return 1;
  if (n <= 2)
    return 0;

  /* Lead below T4 needs three bytes; anything else is complete at four. */
  return (lead < T4 || n > 3) ? 1 : 0;
}

Coverage Report

Created: 2024-11-20 21:14

Line	Count	Source (jump to first uncovered line)
1		/*
2		* The authors of this software are Rob Pike and Ken Thompson.
3		* Copyright (c) 2002 by Lucent Technologies.
4		* Permission to use, copy, modify, and distribute this software for any
5		* purpose without fee is hereby granted, provided that this entire notice
6		* is included in all copies of any software which is or includes a copy
7		* or modification of this software and in all copies of the supporting
8		* documentation for such software.
9		* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10		* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11		* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12		* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13		*/
14		#include "gutil/utf/utf.h"
15		#include "gutil/utf/utfdef.h"
16
17		enum
18		{
19		Bit1 = 7,
20		Bitx = 6,
21		Bit2 = 5,
22		Bit3 = 4,
23		Bit4 = 3,
24		Bit5 = 2,
25
26		T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
27		Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
28		T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
29		T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
30		T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
31		T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
32
33		Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
34		Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
35		Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
36		Rune4 = (1<<(Bit4+3*Bitx))-1,
37		/* 0001 1111 1111 1111 1111 1111 */
38
39		Maskx = (1<<Bitx)-1, /* 0011 1111 */
40		Testx = Maskx ^ 0xFF, /* 1100 0000 */
41
42		Bad = Runeerror,
43		};
44
45		/*
46		* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
47		* This is a slower but "safe" version of the old chartorune
48		* that works on strings that are not necessarily null-terminated.
49		*
50		* If you know for sure that your string is null-terminated,
51		* chartorune will be a bit faster.
52		*
53		* It is guaranteed not to attempt to access "length"
54		* past the incoming pointer. This is to avoid
55		* possible access violations. If the string appears to be
56		* well-formed but incomplete (i.e., to get the whole Rune
57		* we'd need to read past str+length) then we'll set the Rune
58		* to Bad and return 0.
59		*
60		* Note that if we have decoding problems for other
61		* reasons, we return 1 instead of 0.
62		*/
63		int
64		charntorune(Rune *rune, const char *str, int length)
65	0	{
66	0	int c, c1, c2, c3;
67	0	long l;
68
69		/* When we're not allowed to read anything */
70	0	if(length <= 0) {
71	0	goto badlen;
72	0	}
73
74		/*
75		* one character sequence (7-bit value)
76		* 00000-0007F => T1
77		*/
78	0	c = *(uchar*)str;
79	0	if(c < Tx) {
80	0	*rune = c;
81	0	return 1;
82	0	}
83
84		// If we can't read more than one character we must stop
85	0	if(length <= 1) {
86	0	goto badlen;
87	0	}
88
89		/*
90		* two character sequence (11-bit value)
91		* 0080-07FF => T2 Tx
92		*/
93	0	c1 = *(uchar*)(str+1) ^ Tx;
94	0	if(c1 & Testx)
95	0	goto bad;
96	0	if(c < T3) {
97	0	if(c < T2)
98	0	goto bad;
99	0	l = ((c << Bitx) \| c1) & Rune2;
100	0	if(l <= Rune1)
101	0	goto bad;
102	0	*rune = l;
103	0	return 2;
104	0	}
105
106		// If we can't read more than two characters we must stop
107	0	if(length <= 2) {
108	0	goto badlen;
109	0	}
110
111		/*
112		* three character sequence (16-bit value)
113		* 0800-FFFF => T3 Tx Tx
114		*/
115	0	c2 = *(uchar*)(str+2) ^ Tx;
116	0	if(c2 & Testx)
117	0	goto bad;
118	0	if(c < T4) {
119	0	l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;
120	0	if(l <= Rune2)
121	0	goto bad;
122	0	*rune = l;
123	0	return 3;
124	0	}
125
126	0	if (length <= 3)
127	0	goto badlen;
128
129		/*
130		* four character sequence (21-bit value)
131		* 10000-1FFFFF => T4 Tx Tx Tx
132		*/
133	0	c3 = *(uchar*)(str+3) ^ Tx;
134	0	if (c3 & Testx)
135	0	goto bad;
136	0	if (c < T5) {
137	0	l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;
138	0	if (l <= Rune3)
139	0	goto bad;
140	0	*rune = l;
141	0	return 4;
142	0	}
143
144		// Support for 5-byte or longer UTF-8 would go here, but
145		// since we don't have that, we'll just fall through to bad.
146
147		/*
148		* bad decoding
149		*/
150	0	bad:
151	0	*rune = Bad;
152	0	return 1;
153	0	badlen:
154	0	*rune = Bad;
155	0	return 0;
156
157	0	}
158
159
160		/*
161		* This is the older "unsafe" version, which works fine on
162		* null-terminated strings.
163		*/
164		int
165		chartorune(Rune *rune, const char *str)
166	0	{
167	0	int c, c1, c2, c3;
168	0	long l;
169
170		/*
171		* one character sequence
172		* 00000-0007F => T1
173		*/
174	0	c = *(uchar*)str;
175	0	if(c < Tx) {
176	0	*rune = c;
177	0	return 1;
178	0	}
179
180		/*
181		* two character sequence
182		* 0080-07FF => T2 Tx
183		*/
184	0	c1 = *(uchar*)(str+1) ^ Tx;
185	0	if(c1 & Testx)
186	0	goto bad;
187	0	if(c < T3) {
188	0	if(c < T2)
189	0	goto bad;
190	0	l = ((c << Bitx) \| c1) & Rune2;
191	0	if(l <= Rune1)
192	0	goto bad;
193	0	*rune = l;
194	0	return 2;
195	0	}
196
197		/*
198		* three character sequence
199		* 0800-FFFF => T3 Tx Tx
200		*/
201	0	c2 = *(uchar*)(str+2) ^ Tx;
202	0	if(c2 & Testx)
203	0	goto bad;
204	0	if(c < T4) {
205	0	l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;
206	0	if(l <= Rune2)
207	0	goto bad;
208	0	*rune = l;
209	0	return 3;
210	0	}
211
212		/*
213		* four character sequence (21-bit value)
214		* 10000-1FFFFF => T4 Tx Tx Tx
215		*/
216	0	c3 = *(uchar*)(str+3) ^ Tx;
217	0	if (c3 & Testx)
218	0	goto bad;
219	0	if (c < T5) {
220	0	l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;
221	0	if (l <= Rune3)
222	0	goto bad;
223	0	*rune = l;
224	0	return 4;
225	0	}
226
227		/*
228		* Support for 5-byte or longer UTF-8 would go here, but
229		* since we don't have that, we'll just fall through to bad.
230		*/
231
232		/*
233		* bad decoding
234		*/
235	0	bad:
236	0	*rune = Bad;
237	0	return 1;
238	0	}
239
240		int
241	0	isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
242	0	*consumed = charntorune(rune, str, length);
243	0	return *rune != Runeerror || *consumed == 3;
244	0	}
245
246		int
247		runetochar(char *str, const Rune *rune)
248	0	{
249		/* Runes are signed, so convert to unsigned for range check. */
250	0	unsigned long c;
251
252		/*
253		* one character sequence
254		* 00000-0007F => 00-7F
255		*/
256	0	c = *rune;
257	0	if(c <= Rune1) {
258	0	str[0] = c;
259	0	return 1;
260	0	}
261
262		/*
263		* two character sequence
264		* 0080-07FF => T2 Tx
265		*/
266	0	if(c <= Rune2) {
267	0	str[0] = T2 \| (c >> 1*Bitx);
268	0	str[1] = Tx \| (c & Maskx);
269	0	return 2;
270	0	}
271
272		/*
273		* If the Rune is out of range, convert it to the error rune.
274		* Do this test here because the error rune encodes to three bytes.
275		* Doing it earlier would duplicate work, since an out of range
276		* Rune wouldn't have fit in one or two bytes.
277		*/
278	0	if (c > Runemax)
279	0	c = Runeerror;
280
281		/*
282		* three character sequence
283		* 0800-FFFF => T3 Tx Tx
284		*/
285	0	if (c <= Rune3) {
286	0	str[0] = T3 \| (c >> 2*Bitx);
287	0	str[1] = Tx \| ((c >> 1*Bitx) & Maskx);
288	0	str[2] = Tx \| (c & Maskx);
289	0	return 3;
290	0	}
291
292		/*
293		* four character sequence (21-bit value)
294		* 10000-1FFFFF => T4 Tx Tx Tx
295		*/
296	0	str[0] = T4 \| (c >> 3*Bitx);
297	0	str[1] = Tx \| ((c >> 2*Bitx) & Maskx);
298	0	str[2] = Tx \| ((c >> 1*Bitx) & Maskx);
299	0	str[3] = Tx \| (c & Maskx);
300	0	return 4;
301	0	}
302
303		int
304		runelen(Rune rune)
305	0	{
306	0	char str[10];
307
308	0	return runetochar(str, &rune);
309	0	}
310
311		int
312		runenlen(const Rune *r, int nrune)
313	0	{
314	0	int nb, c;
315
316	0	nb = 0;
317	0	while(nrune--) {
318	0	c = *r++;
319	0	if (c <= Rune1)
320	0	nb++;
321	0	else if (c <= Rune2)
322	0	nb += 2;
323	0	else if (c <= Rune3)
324	0	nb += 3;
325	0	else /* assert(c <= Rune4) */
326	0	nb += 4;
327	0	}
328	0	return nb;
329	0	}
330
331		int
332		fullrune(const char *str, int n)
333	0	{
334	0	if (n > 0) {
335	0	int c = *(uchar*)str;
336	0	if (c < Tx)
337	0	return 1;
338	0	if (n > 1) {
339	0	if (c < T3)
340	0	return 1;
341	0	if (n > 2) {
342	0	if (c < T4 \|\| n > 3)
343	0	return 1;
344	0	}
345	0	}
346	0	}
347	0	return 0;
348	0	}