Coverage Report

Created: 2025-08-26 06:02

/src/sentencepiece/third_party/protobuf-lite/structurally_valid.cc
Line
Count
Source (jump to first uncovered line)
1
// Protocol Buffers - Google's data interchange format
2
// Copyright 2008 Google Inc.  All rights reserved.
3
// https://developers.google.com/protocol-buffers/
4
//
5
// Redistribution and use in source and binary forms, with or without
6
// modification, are permitted provided that the following conditions are
7
// met:
8
//
9
//     * Redistributions of source code must retain the above copyright
10
// notice, this list of conditions and the following disclaimer.
11
//     * Redistributions in binary form must reproduce the above
12
// copyright notice, this list of conditions and the following disclaimer
13
// in the documentation and/or other materials provided with the
14
// distribution.
15
//     * Neither the name of Google Inc. nor the names of its
16
// contributors may be used to endorse or promote products derived from
17
// this software without specific prior written permission.
18
//
19
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31
// Author: jrm@google.com (Jim Meehan)
32
33
#include <google/protobuf/stubs/common.h>
34
35
#include <google/protobuf/stubs/stringpiece.h>
36
37
namespace google {
38
namespace protobuf {
39
namespace internal {
40
41
// These four-byte entries compactly encode how many bytes 0..255 to delete
42
// in making a string replacement, how many bytes to add 0..255, and the offset
43
// 0..64k-1 of the replacement string in remap_string.
44
struct RemapEntry {
45
  uint8 delete_bytes;
46
  uint8 add_bytes;
47
  uint16 bytes_offset;
48
};
49
50
// Exit type codes for state tables. All but the first get stuffed into
51
// signed one-byte entries. The first is only generated by executable code.
52
// To distinguish from next-state entries, these must be contiguous and
53
// all <= kExitNone
54
typedef enum {
55
  kExitDstSpaceFull = 239,
56
  kExitIllegalStructure,  // 240
57
  kExitOK,                // 241
58
  kExitReject,            // ...
59
  kExitReplace1,
60
  kExitReplace2,
61
  kExitReplace3,
62
  kExitReplace21,
63
  kExitReplace31,
64
  kExitReplace32,
65
  kExitReplaceOffset1,
66
  kExitReplaceOffset2,
67
  kExitReplace1S0,
68
  kExitSpecial,
69
  kExitDoAgain,
70
  kExitRejectAlt,
71
  kExitNone               // 255
72
} ExitReason;
73
74
75
// This struct represents one entire state table. The three initialized byte
76
// areas are state_table, remap_base, and remap_string. state0 and state0_size
77
// give the byte offset and length within state_table of the initial state --
78
// table lookups are expected to start and end in this state, but for
79
// truncated UTF-8 strings, may end in a different state. These allow a quick
80
// test for that condition. entry_shift is 8 for tables subscripted by a full
81
// byte value and 6 for space-optimized tables subscripted by only six
82
// significant bits in UTF-8 continuation bytes.
83
typedef struct {
84
  const uint32 state0;
85
  const uint32 state0_size;
86
  const uint32 total_size;
87
  const int max_expand;
88
  const int entry_shift;
89
  const int bytes_per_entry;
90
  const uint32 losub;
91
  const uint32 hiadd;
92
  const uint8* state_table;
93
  const RemapEntry* remap_base;
94
  const uint8* remap_string;
95
  const uint8* fast_state;
96
} UTF8StateMachineObj;
97
98
typedef UTF8StateMachineObj UTF8ScanObj;
99
100
#define X__ (kExitIllegalStructure)
101
#define RJ_ (kExitReject)
102
#define S1_ (kExitReplace1)
103
#define S2_ (kExitReplace2)
104
#define S3_ (kExitReplace3)
105
#define S21 (kExitReplace21)
106
#define S31 (kExitReplace31)
107
#define S32 (kExitReplace32)
108
#define T1_ (kExitReplaceOffset1)
109
#define T2_ (kExitReplaceOffset2)
110
#define S11 (kExitReplace1S0)
111
#define SP_ (kExitSpecial)
112
#define D__ (kExitDoAgain)
113
#define RJA (kExitRejectAlt)
114
115
//  Entire table has 9 state blocks of 256 entries each
116
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;     // state[0]
117
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;  // =[1]
118
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
119
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
120
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
121
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
122
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
123
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
124
125
static const uint8 utf8acceptnonsurrogates[] = {
126
// state[0] 0x000000 Byte 1
127
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
128
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
129
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
130
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
131
132
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
133
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
134
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
135
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
136
137
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
138
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
139
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
140
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
141
142
X__, X__,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
143
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
144
  2,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   7,   3,   3,
145
  4,   5,   5,   5,   6, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
146
147
// state[1] 0x000080 Byte 2 of 2
148
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
149
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
150
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
151
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
152
153
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
154
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
155
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
156
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
157
158
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
159
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
160
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
161
  0,   0,   0,   0,   0,   0,   0,   0,    0,   0,   0,   0,   0,   0,   0,   0,
162
163
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
164
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
165
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
166
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
167
168
// state[2] 0x000000 Byte 2 of 3
169
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
170
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
171
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
172
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
173
174
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
175
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
176
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
177
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
178
179
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
180
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
181
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
182
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
183
184
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
185
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
186
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
187
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
188
189
// state[3] 0x001000 Byte 2 of 3
190
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
191
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
192
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
193
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
194
195
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
196
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
197
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
198
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
199
200
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
201
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
202
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
203
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
204
205
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
206
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
207
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
208
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
209
210
// state[4] 0x000000 Byte 2 of 4
211
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
212
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
213
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
214
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
215
216
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
217
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
218
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
219
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
220
221
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
222
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
223
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
224
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
225
226
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
227
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
228
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
229
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
230
231
// state[5] 0x040000 Byte 2 of 4
232
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
233
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
234
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
235
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
236
237
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
238
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
239
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
240
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
241
242
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
243
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
244
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
245
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
246
247
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
248
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
249
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
250
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
251
252
// state[6] 0x100000 Byte 2 of 4
253
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
254
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
255
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
256
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
257
258
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
259
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
260
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
261
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
262
263
  3,   3,   3,   3,   3,   3,   3,   3,    3,   3,   3,   3,   3,   3,   3,   3,
264
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
265
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
266
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
267
268
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
269
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
270
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
271
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
272
273
// state[7] 0x00d000 Byte 2 of 3
274
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
275
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
276
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
277
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
278
279
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
280
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
281
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
282
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
283
284
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
285
  1,   1,   1,   1,   1,   1,   1,   1,    1,   1,   1,   1,   1,   1,   1,   1,
286
  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
287
  8,   8,   8,   8,   8,   8,   8,   8,    8,   8,   8,   8,   8,   8,   8,   8,
288
289
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
290
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
291
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
292
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
293
294
// state[8] 0x00d800 Byte 3 of 3
295
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
296
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
297
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
298
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
299
300
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
301
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
302
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
303
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
304
305
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
306
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
307
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
308
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,  RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
309
310
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
311
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
312
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
313
X__, X__, X__, X__, X__, X__, X__, X__,  X__, X__, X__, X__, X__, X__, X__, X__,
314
};
315
316
// Remap base[0] = (del, add, string_offset)
317
static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
318
{0, 0, 0} };
319
320
// Remap string[0]
321
static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
322
0 };
323
324
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
325
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
326
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
327
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
328
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
329
330
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
331
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
332
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
333
0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
334
335
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
336
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
337
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
338
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
339
340
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
341
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
342
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
343
1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
344
};
345
346
static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
347
  utf8acceptnonsurrogates_STATE0,
348
  utf8acceptnonsurrogates_STATE0_SIZE,
349
  utf8acceptnonsurrogates_TOTAL_SIZE,
350
  utf8acceptnonsurrogates_MAX_EXPAND_X4,
351
  utf8acceptnonsurrogates_SHIFT,
352
  utf8acceptnonsurrogates_BYTES,
353
  utf8acceptnonsurrogates_LOSUB,
354
  utf8acceptnonsurrogates_HIADD,
355
  utf8acceptnonsurrogates,
356
  utf8acceptnonsurrogates_remap_base,
357
  utf8acceptnonsurrogates_remap_string,
358
  utf8acceptnonsurrogates_fast
359
};
360
361
362
#undef X__
363
#undef RJ_
364
#undef S1_
365
#undef S2_
366
#undef S3_
367
#undef S21
368
#undef S31
369
#undef S32
370
#undef T1_
371
#undef T2_
372
#undef S11
373
#undef SP_
374
#undef D__
375
#undef RJA
376
377
// Return true if current Tbl pointer is within state0 range
378
// Note that unsigned compare checks both ends of range simultaneously
379
0
static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
380
0
  const uint8* Tbl0 = &st->state_table[st->state0];
381
0
  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
382
0
}
383
384
// Scan a UTF-8 string based on state table.
385
// Always scan complete UTF-8 characters
386
// Set number of bytes scanned. Return reason for exiting
387
int UTF8GenericScan(const UTF8ScanObj* st,
388
                    const char * str,
389
                    int str_length,
390
0
                    int* bytes_consumed) {
391
0
  *bytes_consumed = 0;
392
0
  if (str_length == 0) return kExitOK;
393
394
0
  int eshift = st->entry_shift;
395
0
  const uint8* isrc = reinterpret_cast<const uint8*>(str);
396
0
  const uint8* src = isrc;
397
0
  const uint8* srclimit = isrc + str_length;
398
0
  const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
399
0
  const uint8* Tbl_0 = &st->state_table[st->state0];
400
401
0
 DoAgain:
402
  // Do state-table scan
403
0
  int e = 0;
404
0
  uint8 c;
405
0
  const uint8* Tbl2 = &st->fast_state[0];
406
0
  const uint32 losub = st->losub;
407
0
  const uint32 hiadd = st->hiadd;
408
  // Check initial few bytes one at a time until 8-byte aligned
409
  //----------------------------
410
0
  while ((((uintptr_t)src & 0x07) != 0) &&
411
0
         (src < srclimit) &&
412
0
         Tbl2[src[0]] == 0) {
413
0
    src++;
414
0
  }
415
0
  if (((uintptr_t)src & 0x07) == 0) {
416
    // Do fast for groups of 8 identity bytes.
417
    // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
418
    // including slowing slightly on cr/lf/ht
419
    //----------------------------
420
0
    while (src < srclimit8) {
421
0
      uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
422
0
      uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
423
0
      src += 8;
424
      // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
425
0
      uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
426
0
                    (s4567 - losub) | (s4567 + hiadd);
427
0
      if ((temp & 0x80808080) != 0) {
428
        // We typically end up here on cr/lf/ht; src was incremented
429
0
        int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
430
0
                    (Tbl2[src[-6]] | Tbl2[src[-5]]);
431
0
        if (e0123 != 0) {
432
0
          src -= 8;
433
0
          break;
434
0
        }    // Exit on Non-interchange
435
0
        e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
436
0
                (Tbl2[src[-2]] | Tbl2[src[-1]]);
437
0
        if (e0123 != 0) {
438
0
          src -= 4;
439
0
          break;
440
0
        }    // Exit on Non-interchange
441
        // Else OK, go around again
442
0
      }
443
0
    }
444
0
  }
445
  //----------------------------
446
447
  // Byte-at-a-time scan
448
  //----------------------------
449
0
  const uint8* Tbl = Tbl_0;
450
0
  while (src < srclimit) {
451
0
    c = *src;
452
0
    e = Tbl[c];
453
0
    src++;
454
0
    if (e >= kExitIllegalStructure) {break;}
455
0
    Tbl = &Tbl_0[e << eshift];
456
0
  }
457
  //----------------------------
458
459
  // Exit possibilities:
460
  //  Some exit code, !state0, back up over last char
461
  //  Some exit code, state0, back up one byte exactly
462
  //  source consumed, !state0, back up over partial char
463
  //  source consumed, state0, exit OK
464
  // For illegal byte in state0, avoid backup up over PREVIOUS char
465
  // For truncated last char, back up to beginning of it
466
467
0
  if (e >= kExitIllegalStructure) {
468
    // Back up over exactly one byte of rejected/illegal UTF-8 character
469
0
    src--;
470
    // Back up more if needed
471
0
    if (!InStateZero(st, Tbl)) {
472
0
      do {
473
0
        src--;
474
0
      } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
475
0
    }
476
0
  } else if (!InStateZero(st, Tbl)) {
477
    // Back up over truncated UTF-8 character
478
0
    e = kExitIllegalStructure;
479
0
    do {
480
0
      src--;
481
0
    } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
482
0
  } else {
483
    // Normal termination, source fully consumed
484
0
    e = kExitOK;
485
0
  }
486
487
0
  if (e == kExitDoAgain) {
488
    // Loop back up to the fast scan
489
0
    goto DoAgain;
490
0
  }
491
492
0
  *bytes_consumed = src - isrc;
493
0
  return e;
494
0
}
495
496
int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
497
                    const char * str,
498
                    int str_length,
499
0
                    int* bytes_consumed) {
500
0
  *bytes_consumed = 0;
501
0
  if (str_length == 0) return kExitOK;
502
503
0
  const uint8* isrc =  reinterpret_cast<const uint8*>(str);
504
0
  const uint8* src = isrc;
505
0
  const uint8* srclimit = isrc + str_length;
506
0
  const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
507
0
  int n;
508
0
  int rest_consumed;
509
0
  int exit_reason;
510
0
  do {
511
    // Check initial few bytes one at a time until 8-byte aligned
512
0
    while ((((uintptr_t)src & 0x07) != 0) &&
513
0
           (src < srclimit) && (src[0] < 0x80)) {
514
0
      src++;
515
0
    }
516
0
    if (((uintptr_t)src & 0x07) == 0) {
517
0
      while ((src < srclimit8) &&
518
0
             (((reinterpret_cast<const uint32*>(src)[0] |
519
0
                reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
520
0
        src += 8;
521
0
      }
522
0
    }
523
0
    while ((src < srclimit) && (src[0] < 0x80)) {
524
0
      src++;
525
0
    }
526
    // Run state table on the rest
527
0
    n = src - isrc;
528
0
    exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
529
0
    src += rest_consumed;
530
0
  } while ( exit_reason == kExitDoAgain );
531
532
0
  *bytes_consumed = src - isrc;
533
0
  return exit_reason;
534
0
}
535
536
// Hack:  On some compilers the static tables are initialized at startup.
537
//   We can't use them until they are initialized.  However, some Protocol
538
//   Buffer parsing happens at static init time and may try to validate
539
//   UTF-8 strings.  Since UTF-8 validation is only used for debugging
540
//   anyway, we simply always return success if initialization hasn't
541
//   occurred yet.
542
namespace {
543
544
bool module_initialized_ = false;
545
546
struct InitDetector {
547
2
  InitDetector() {
548
2
    module_initialized_ = true;
549
2
  }
550
};
551
InitDetector init_detector;
552
553
}  // namespace
554
555
0
bool IsStructurallyValidUTF8(const char* buf, int len) {
556
0
  if (!module_initialized_) return true;
557
558
0
  int bytes_consumed = 0;
559
0
  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
560
0
                           buf, len, &bytes_consumed);
561
0
  return (bytes_consumed == len);
562
0
}
563
564
0
int UTF8SpnStructurallyValid(StringPiece str) {
565
0
  if (!module_initialized_) return str.size();
566
567
0
  int bytes_consumed = 0;
568
0
  UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
569
0
                           str.data(), str.size(), &bytes_consumed);
570
0
  return bytes_consumed;
571
0
}
572
573
// Coerce UTF-8 byte string in src_str to be
574
// a structurally-valid equal-length string by selectively
575
// overwriting illegal bytes with replace_char (typically blank).
576
// replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
577
// src_str is read-only. If any overwriting is needed, a modified byte string
578
// is created in idst, length isrclen.
579
//
580
// Returns pointer to output buffer, isrc if no changes were made,
581
//  or idst if some bytes were changed.
582
//
583
// Fast case: all is structurally valid and no byte copying is done.
584
//
585
char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst,
586
0
                                    const char replace_char) {
587
0
  const char* isrc = src_str.data();
588
0
  const int len = src_str.length();
589
0
  int n = UTF8SpnStructurallyValid(src_str);
590
0
  if (n == len) {               // Normal case -- all is cool, return
591
0
    return const_cast<char*>(isrc);
592
0
  } else {                      // Unusual case -- copy w/o bad bytes
593
0
    const char* src = isrc;
594
0
    const char* srclimit = isrc + len;
595
0
    char* dst = idst;
596
0
    memmove(dst, src, n);       // Copy initial good chunk
597
0
    src += n;
598
0
    dst += n;
599
0
    while (src < srclimit) {    // src points to bogus byte or is off the end
600
0
      dst[0] = replace_char;                    // replace one bad byte
601
0
      src++;
602
0
      dst++;
603
0
      StringPiece str2(src, srclimit - src);
604
0
      n = UTF8SpnStructurallyValid(str2);       // scan the remainder
605
0
      memmove(dst, src, n);                     // copy next good chunk
606
0
      src += n;
607
0
      dst += n;
608
0
    }
609
0
  }
610
0
  return idst;
611
0
}
612
613
}  // namespace internal
614
}  // namespace protobuf
615
}  // namespace google