Coverage Report

Created: 2024-04-26 11:14

/src/re2/util/rune.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * The authors of this software are Rob Pike and Ken Thompson.
3
 *              Copyright (c) 2002 by Lucent Technologies.
4
 * Permission to use, copy, modify, and distribute this software for any
5
 * purpose without fee is hereby granted, provided that this entire notice
6
 * is included in all copies of any software which is or includes a copy
7
 * or modification of this software and in all copies of the supporting
8
 * documentation for such software.
9
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
 */
14
15
#include <stdarg.h>
16
#include <string.h>
17
18
#include "util/utf.h"
19
20
namespace re2 {
21
22
enum
23
{
24
  Bit1  = 7,
25
  Bitx  = 6,
26
  Bit2  = 5,
27
  Bit3  = 4,
28
  Bit4  = 3,
29
  Bit5  = 2, 
30
31
  T1  = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
32
  Tx  = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
33
  T2  = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
34
  T3  = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
35
  T4  = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
36
  T5  = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
37
38
  Rune1 = (1<<(Bit1+0*Bitx))-1,   /* 0000 0000 0111 1111 */
39
  Rune2 = (1<<(Bit2+1*Bitx))-1,   /* 0000 0111 1111 1111 */
40
  Rune3 = (1<<(Bit3+2*Bitx))-1,   /* 1111 1111 1111 1111 */
41
  Rune4 = (1<<(Bit4+3*Bitx))-1,
42
                                        /* 0001 1111 1111 1111 1111 1111 */
43
44
  Maskx = (1<<Bitx)-1,      /* 0011 1111 */
45
  Testx = Maskx ^ 0xFF,     /* 1100 0000 */
46
47
  Bad = Runeerror,
48
};
49
50
int
51
chartorune(Rune *rune, const char *str)
52
34.9M
{
53
34.9M
  int c, c1, c2, c3;
54
34.9M
  Rune l;
55
56
  /*
57
   * one character sequence
58
   *  00000-0007F => T1
59
   */
60
34.9M
  c = *(unsigned char*)str;
61
34.9M
  if(c < Tx) {
62
28.5M
    *rune = c;
63
28.5M
    return 1;
64
28.5M
  }
65
66
  /*
67
   * two character sequence
68
   *  0080-07FF => T2 Tx
69
   */
70
6.40M
  c1 = *(unsigned char*)(str+1) ^ Tx;
71
6.40M
  if(c1 & Testx)
72
53.3k
    goto bad;
73
6.34M
  if(c < T3) {
74
6.27M
    if(c < T2)
75
5.41k
      goto bad;
76
6.26M
    l = ((c << Bitx) | c1) & Rune2;
77
6.26M
    if(l <= Rune1)
78
636
      goto bad;
79
6.26M
    *rune = l;
80
6.26M
    return 2;
81
6.26M
  }
82
83
  /*
84
   * three character sequence
85
   *  0800-FFFF => T3 Tx Tx
86
   */
87
73.5k
  c2 = *(unsigned char*)(str+2) ^ Tx;
88
73.5k
  if(c2 & Testx)
89
1.99k
    goto bad;
90
71.5k
  if(c < T4) {
91
21.4k
    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
92
21.4k
    if(l <= Rune2)
93
622
      goto bad;
94
20.8k
    *rune = l;
95
20.8k
    return 3;
96
21.4k
  }
97
98
  /*
99
   * four character sequence (21-bit value)
100
   *  10000-1FFFFF => T4 Tx Tx Tx
101
   */
102
50.0k
  c3 = *(unsigned char*)(str+3) ^ Tx;
103
50.0k
  if (c3 & Testx)
104
747
    goto bad;
105
49.3k
  if (c < T5) {
106
49.0k
    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
107
49.0k
    if (l <= Rune3)
108
622
      goto bad;
109
48.4k
    *rune = l;
110
48.4k
    return 4;
111
49.0k
  }
112
113
  /*
114
   * Support for 5-byte or longer UTF-8 would go here, but
115
   * since we don't have that, we'll just fall through to bad.
116
   */
117
118
  /*
119
   * bad decoding
120
   */
121
63.6k
bad:
122
63.6k
  *rune = Bad;
123
63.6k
  return 1;
124
49.3k
}
125
126
int
127
runetochar(char *str, const Rune *rune)
128
908M
{
129
  /* Runes are signed, so convert to unsigned for range check. */
130
908M
  unsigned int c;
131
132
  /*
133
   * one character sequence
134
   *  00000-0007F => 00-7F
135
   */
136
908M
  c = *rune;
137
908M
  if(c <= Rune1) {
138
36.8M
    str[0] = static_cast<char>(c);
139
36.8M
    return 1;
140
36.8M
  }
141
142
  /*
143
   * two character sequence
144
   *  0080-07FF => T2 Tx
145
   */
146
871M
  if(c <= Rune2) {
147
96.5M
    str[0] = T2 | static_cast<char>(c >> 1*Bitx);
148
96.5M
    str[1] = Tx | (c & Maskx);
149
96.5M
    return 2;
150
96.5M
  }
151
152
  /*
153
   * If the Rune is out of range, convert it to the error rune.
154
   * Do this test here because the error rune encodes to three bytes.
155
   * Doing it earlier would duplicate work, since an out of range
156
   * Rune wouldn't have fit in one or two bytes.
157
   */
158
774M
  if (c > Runemax)
159
0
    c = Runeerror;
160
161
  /*
162
   * three character sequence
163
   *  0800-FFFF => T3 Tx Tx
164
   */
165
774M
  if (c <= Rune3) {
166
438M
    str[0] = T3 | static_cast<char>(c >> 2*Bitx);
167
438M
    str[1] = Tx | ((c >> 1*Bitx) & Maskx);
168
438M
    str[2] = Tx | (c & Maskx);
169
438M
    return 3;
170
438M
  }
171
172
  /*
173
   * four character sequence (21-bit value)
174
   *     10000-1FFFFF => T4 Tx Tx Tx
175
   */
176
335M
  str[0] = T4 | static_cast<char>(c >> 3*Bitx);
177
335M
  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
178
335M
  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
179
335M
  str[3] = Tx | (c & Maskx);
180
335M
  return 4;
181
774M
}
182
183
int
184
runelen(Rune rune)
185
0
{
186
0
  char str[10];
187
188
0
  return runetochar(str, &rune);
189
0
}
190
191
int
192
fullrune(const char *str, int n)
193
34.9M
{
194
34.9M
  if (n > 0) {
195
34.9M
    int c = *(unsigned char*)str;
196
34.9M
    if (c < Tx)
197
28.5M
      return 1;
198
6.41M
    if (n > 1) {
199
6.40M
      if (c < T3)
200
6.30M
        return 1;
201
102k
      if (n > 2) {
202
98.6k
        if (c < T4 || n > 3)
203
95.9k
          return 1;
204
98.6k
      }
205
102k
    }
206
6.41M
  }
207
18.4k
  return 0;
208
34.9M
}
209
210
211
int
212
utflen(const char *s)
213
0
{
214
0
  int c;
215
0
  int n;
216
0
  Rune rune;
217
218
0
  n = 0;
219
0
  for(;;) {
220
0
    c = *(unsigned char*)s;
221
0
    if(c < Runeself) {
222
0
      if(c == 0)
223
0
        return n;
224
0
      s++;
225
0
    } else
226
0
      s += chartorune(&rune, s);
227
0
    n++;
228
0
  }
229
0
  return 0;
230
0
}
231
232
char*
233
utfrune(const char *s, Rune c)
234
0
{
235
0
  int c1;
236
0
  Rune r;
237
0
  int n;
238
239
0
  if(c < Runesync)   /* not part of utf sequence */
240
0
    return strchr((char*)s, c);
241
242
0
  for(;;) {
243
0
    c1 = *(unsigned char*)s;
244
0
    if(c1 < Runeself) { /* one byte rune */
245
0
      if(c1 == 0)
246
0
        return 0;
247
0
      if(c1 == c)
248
0
        return (char*)s;
249
0
      s++;
250
0
      continue;
251
0
    }
252
0
    n = chartorune(&r, s);
253
0
    if(r == c)
254
0
      return (char*)s;
255
0
    s += n;
256
0
  }
257
0
  return 0;
258
0
}
259
260
}  // namespace re2