Coverage Report

Created: 2024-06-09 08:58

/src/re2/util/rune.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * The authors of this software are Rob Pike and Ken Thompson.
3
 *              Copyright (c) 2002 by Lucent Technologies.
4
 * Permission to use, copy, modify, and distribute this software for any
5
 * purpose without fee is hereby granted, provided that this entire notice
6
 * is included in all copies of any software which is or includes a copy
7
 * or modification of this software and in all copies of the supporting
8
 * documentation for such software.
9
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13
 */
14
15
#include <stdarg.h>
16
#include <string.h>
17
18
#include "util/utf.h"
19
20
namespace re2 {
21
22
enum
23
{
24
  Bit1  = 7,
25
  Bitx  = 6,
26
  Bit2  = 5,
27
  Bit3  = 4,
28
  Bit4  = 3,
29
  Bit5  = 2, 
30
31
  T1  = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
32
  Tx  = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
33
  T2  = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
34
  T3  = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
35
  T4  = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
36
  T5  = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
37
38
  Rune1 = (1<<(Bit1+0*Bitx))-1,   /* 0000 0000 0111 1111 */
39
  Rune2 = (1<<(Bit2+1*Bitx))-1,   /* 0000 0111 1111 1111 */
40
  Rune3 = (1<<(Bit3+2*Bitx))-1,   /* 1111 1111 1111 1111 */
41
  Rune4 = (1<<(Bit4+3*Bitx))-1,
42
                                        /* 0001 1111 1111 1111 1111 1111 */
43
44
  Maskx = (1<<Bitx)-1,      /* 0011 1111 */
45
  Testx = Maskx ^ 0xFF,     /* 1100 0000 */
46
47
  Bad = Runeerror,
48
};
49
50
int
51
chartorune(Rune *rune, const char *str)
52
1.55M
{
53
1.55M
  int c, c1, c2, c3;
54
1.55M
  Rune l;
55
56
  /*
57
   * one character sequence
58
   *  00000-0007F => T1
59
   */
60
1.55M
  c = *(unsigned char*)str;
61
1.55M
  if(c < Tx) {
62
1.26M
    *rune = c;
63
1.26M
    return 1;
64
1.26M
  }
65
66
  /*
67
   * two character sequence
68
   *  0080-07FF => T2 Tx
69
   */
70
283k
  c1 = *(unsigned char*)(str+1) ^ Tx;
71
283k
  if(c1 & Testx)
72
2.46k
    goto bad;
73
280k
  if(c < T3) {
74
276k
    if(c < T2)
75
196
      goto bad;
76
276k
    l = ((c << Bitx) | c1) & Rune2;
77
276k
    if(l <= Rune1)
78
17
      goto bad;
79
276k
    *rune = l;
80
276k
    return 2;
81
276k
  }
82
83
  /*
84
   * three character sequence
85
   *  0800-FFFF => T3 Tx Tx
86
   */
87
4.00k
  c2 = *(unsigned char*)(str+2) ^ Tx;
88
4.00k
  if(c2 & Testx)
89
96
    goto bad;
90
3.91k
  if(c < T4) {
91
1.19k
    l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
92
1.19k
    if(l <= Rune2)
93
17
      goto bad;
94
1.18k
    *rune = l;
95
1.18k
    return 3;
96
1.19k
  }
97
98
  /*
99
   * four character sequence (21-bit value)
100
   *  10000-1FFFFF => T4 Tx Tx Tx
101
   */
102
2.71k
  c3 = *(unsigned char*)(str+3) ^ Tx;
103
2.71k
  if (c3 & Testx)
104
27
    goto bad;
105
2.68k
  if (c < T5) {
106
2.68k
    l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
107
2.68k
    if (l <= Rune3)
108
17
      goto bad;
109
2.66k
    *rune = l;
110
2.66k
    return 4;
111
2.68k
  }
112
113
  /*
114
   * Support for 5-byte or longer UTF-8 would go here, but
115
   * since we don't have that, we'll just fall through to bad.
116
   */
117
118
  /*
119
   * bad decoding
120
   */
121
2.84k
bad:
122
2.84k
  *rune = Bad;
123
2.84k
  return 1;
124
2.68k
}
125
126
int
127
runetochar(char *str, const Rune *rune)
128
1.08G
{
129
  /* Runes are signed, so convert to unsigned for range check. */
130
1.08G
  unsigned int c;
131
132
  /*
133
   * one character sequence
134
   *  00000-0007F => 00-7F
135
   */
136
1.08G
  c = *rune;
137
1.08G
  if(c <= Rune1) {
138
1.68M
    str[0] = static_cast<char>(c);
139
1.68M
    return 1;
140
1.68M
  }
141
142
  /*
143
   * two character sequence
144
   *  0080-07FF => T2 Tx
145
   */
146
1.08G
  if(c <= Rune2) {
147
99.2M
    str[0] = T2 | static_cast<char>(c >> 1*Bitx);
148
99.2M
    str[1] = Tx | (c & Maskx);
149
99.2M
    return 2;
150
99.2M
  }
151
152
  /*
153
   * If the Rune is out of range, convert it to the error rune.
154
   * Do this test here because the error rune encodes to three bytes.
155
   * Doing it earlier would duplicate work, since an out of range
156
   * Rune wouldn't have fit in one or two bytes.
157
   */
158
986M
  if (c > Runemax)
159
0
    c = Runeerror;
160
161
  /*
162
   * three character sequence
163
   *  0800-FFFF => T3 Tx Tx
164
   */
165
986M
  if (c <= Rune3) {
166
550M
    str[0] = T3 | static_cast<char>(c >> 2*Bitx);
167
550M
    str[1] = Tx | ((c >> 1*Bitx) & Maskx);
168
550M
    str[2] = Tx | (c & Maskx);
169
550M
    return 3;
170
550M
  }
171
172
  /*
173
   * four character sequence (21-bit value)
174
   *     10000-1FFFFF => T4 Tx Tx Tx
175
   */
176
436M
  str[0] = T4 | static_cast<char>(c >> 3*Bitx);
177
436M
  str[1] = Tx | ((c >> 2*Bitx) & Maskx);
178
436M
  str[2] = Tx | ((c >> 1*Bitx) & Maskx);
179
436M
  str[3] = Tx | (c & Maskx);
180
436M
  return 4;
181
986M
}
182
183
int
184
runelen(Rune rune)
185
0
{
186
0
  char str[10];
187
188
0
  return runetochar(str, &rune);
189
0
}
190
191
int
192
fullrune(const char *str, int n)
193
1.55M
{
194
1.55M
  if (n > 0) {
195
1.55M
    int c = *(unsigned char*)str;
196
1.55M
    if (c < Tx)
197
1.26M
      return 1;
198
283k
    if (n > 1) {
199
283k
      if (c < T3)
200
277k
        return 1;
201
5.33k
      if (n > 2) {
202
5.18k
        if (c < T4 || n > 3)
203
5.09k
          return 1;
204
5.18k
      }
205
5.33k
    }
206
283k
  }
207
710
  return 0;
208
1.55M
}
209
210
211
int
212
utflen(const char *s)
213
0
{
214
0
  int c;
215
0
  int n;
216
0
  Rune rune;
217
218
0
  n = 0;
219
0
  for(;;) {
220
0
    c = *(unsigned char*)s;
221
0
    if(c < Runeself) {
222
0
      if(c == 0)
223
0
        return n;
224
0
      s++;
225
0
    } else
226
0
      s += chartorune(&rune, s);
227
0
    n++;
228
0
  }
229
0
  return 0;
230
0
}
231
232
char*
233
utfrune(const char *s, Rune c)
234
0
{
235
0
  int c1;
236
0
  Rune r;
237
0
  int n;
238
239
0
  if(c < Runesync)   /* not part of utf sequence */
240
0
    return strchr((char*)s, c);
241
242
0
  for(;;) {
243
0
    c1 = *(unsigned char*)s;
244
0
    if(c1 < Runeself) { /* one byte rune */
245
0
      if(c1 == 0)
246
0
        return 0;
247
0
      if(c1 == c)
248
0
        return (char*)s;
249
0
      s++;
250
0
      continue;
251
0
    }
252
0
    n = chartorune(&r, s);
253
0
    if(r == c)
254
0
      return (char*)s;
255
0
    s += n;
256
0
  }
257
0
  return 0;
258
0
}
259
260
}  // namespace re2