/src/sentencepiece/third_party/protobuf-lite/structurally_valid.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Protocol Buffers - Google's data interchange format |
2 | | // Copyright 2008 Google Inc. All rights reserved. |
3 | | // https://developers.google.com/protocol-buffers/ |
4 | | // |
5 | | // Redistribution and use in source and binary forms, with or without |
6 | | // modification, are permitted provided that the following conditions are |
7 | | // met: |
8 | | // |
9 | | // * Redistributions of source code must retain the above copyright |
10 | | // notice, this list of conditions and the following disclaimer. |
11 | | // * Redistributions in binary form must reproduce the above |
12 | | // copyright notice, this list of conditions and the following disclaimer |
13 | | // in the documentation and/or other materials provided with the |
14 | | // distribution. |
15 | | // * Neither the name of Google Inc. nor the names of its |
16 | | // contributors may be used to endorse or promote products derived from |
17 | | // this software without specific prior written permission. |
18 | | // |
19 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
20 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
21 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
22 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
23 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | | |
31 | | // Author: jrm@google.com (Jim Meehan) |
32 | | |
33 | | #include <google/protobuf/stubs/common.h> |
34 | | |
35 | | #include <google/protobuf/stubs/stringpiece.h> |
36 | | |
37 | | namespace google { |
38 | | namespace protobuf { |
39 | | namespace internal { |
40 | | |
41 | | // These four-byte entries compactly encode how many bytes 0..255 to delete |
42 | | // in making a string replacement, how many bytes to add 0..255, and the offset |
43 | | // 0..64k-1 of the replacement string in remap_string. |
44 | | struct RemapEntry { |
45 | | uint8 delete_bytes; |
46 | | uint8 add_bytes; |
47 | | uint16 bytes_offset; |
48 | | }; |
49 | | |
50 | | // Exit type codes for state tables. All but the first get stuffed into |
51 | | // signed one-byte entries. The first is only generated by executable code. |
52 | | // To distinguish from next-state entries, these must be contiguous and |
53 | | // all <= kExitNone |
54 | | typedef enum { |
55 | | kExitDstSpaceFull = 239, |
56 | | kExitIllegalStructure, // 240 |
57 | | kExitOK, // 241 |
58 | | kExitReject, // ... |
59 | | kExitReplace1, |
60 | | kExitReplace2, |
61 | | kExitReplace3, |
62 | | kExitReplace21, |
63 | | kExitReplace31, |
64 | | kExitReplace32, |
65 | | kExitReplaceOffset1, |
66 | | kExitReplaceOffset2, |
67 | | kExitReplace1S0, |
68 | | kExitSpecial, |
69 | | kExitDoAgain, |
70 | | kExitRejectAlt, |
71 | | kExitNone // 255 |
72 | | } ExitReason; |
73 | | |
74 | | |
75 | | // This struct represents one entire state table. The three initialized byte |
76 | | // areas are state_table, remap_base, and remap_string. state0 and state0_size |
77 | | // give the byte offset and length within state_table of the initial state -- |
78 | | // table lookups are expected to start and end in this state, but for |
79 | | // truncated UTF-8 strings, may end in a different state. These allow a quick |
80 | | // test for that condition. entry_shift is 8 for tables subscripted by a full |
81 | | // byte value and 6 for space-optimized tables subscripted by only six |
82 | | // significant bits in UTF-8 continuation bytes. |
83 | | typedef struct { |
84 | | const uint32 state0; |
85 | | const uint32 state0_size; |
86 | | const uint32 total_size; |
87 | | const int max_expand; |
88 | | const int entry_shift; |
89 | | const int bytes_per_entry; |
90 | | const uint32 losub; |
91 | | const uint32 hiadd; |
92 | | const uint8* state_table; |
93 | | const RemapEntry* remap_base; |
94 | | const uint8* remap_string; |
95 | | const uint8* fast_state; |
96 | | } UTF8StateMachineObj; |
97 | | |
98 | | typedef UTF8StateMachineObj UTF8ScanObj; |
99 | | |
100 | | #define X__ (kExitIllegalStructure) |
101 | | #define RJ_ (kExitReject) |
102 | | #define S1_ (kExitReplace1) |
103 | | #define S2_ (kExitReplace2) |
104 | | #define S3_ (kExitReplace3) |
105 | | #define S21 (kExitReplace21) |
106 | | #define S31 (kExitReplace31) |
107 | | #define S32 (kExitReplace32) |
108 | | #define T1_ (kExitReplaceOffset1) |
109 | | #define T2_ (kExitReplaceOffset2) |
110 | | #define S11 (kExitReplace1S0) |
111 | | #define SP_ (kExitSpecial) |
112 | | #define D__ (kExitDoAgain) |
113 | | #define RJA (kExitRejectAlt) |
114 | | |
115 | | // Entire table has 9 state blocks of 256 entries each |
116 | | static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0] |
117 | | static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1] |
118 | | static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304; |
119 | | static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0; |
120 | | static const unsigned int utf8acceptnonsurrogates_SHIFT = 8; |
121 | | static const unsigned int utf8acceptnonsurrogates_BYTES = 1; |
122 | | static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020; |
123 | | static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000; |
124 | | |
125 | | static const uint8 utf8acceptnonsurrogates[] = { |
126 | | // state[0] 0x000000 Byte 1 |
127 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
128 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
129 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
130 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
131 | | |
132 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
133 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
134 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
135 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
136 | | |
137 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
138 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
139 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
140 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
141 | | |
142 | | X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
143 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
144 | | 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3, |
145 | | 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
146 | | |
147 | | // state[1] 0x000080 Byte 2 of 2 |
148 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
149 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
150 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
151 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
152 | | |
153 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
154 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
155 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
156 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
157 | | |
158 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
159 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
160 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
161 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
162 | | |
163 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
164 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
165 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
166 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
167 | | |
168 | | // state[2] 0x000000 Byte 2 of 3 |
169 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
170 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
171 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
172 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
173 | | |
174 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
175 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
176 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
177 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
178 | | |
179 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
180 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
181 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
182 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
183 | | |
184 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
185 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
186 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
187 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
188 | | |
189 | | // state[3] 0x001000 Byte 2 of 3 |
190 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
191 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
192 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
193 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
194 | | |
195 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
196 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
197 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
198 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
199 | | |
200 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
201 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
202 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
203 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
204 | | |
205 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
206 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
207 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
208 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
209 | | |
210 | | // state[4] 0x000000 Byte 2 of 4 |
211 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
212 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
213 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
214 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
215 | | |
216 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
217 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
218 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
219 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
220 | | |
221 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
222 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
223 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
224 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
225 | | |
226 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
227 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
228 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
229 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
230 | | |
231 | | // state[5] 0x040000 Byte 2 of 4 |
232 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
233 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
234 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
235 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
236 | | |
237 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
238 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
239 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
240 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
241 | | |
242 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
243 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
244 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
245 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
246 | | |
247 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
248 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
249 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
250 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
251 | | |
252 | | // state[6] 0x100000 Byte 2 of 4 |
253 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
254 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
255 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
256 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
257 | | |
258 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
259 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
260 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
261 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
262 | | |
263 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
264 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
265 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
266 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
267 | | |
268 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
269 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
270 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
271 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
272 | | |
273 | | // state[7] 0x00d000 Byte 2 of 3 |
274 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
275 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
276 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
277 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
278 | | |
279 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
280 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
281 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
282 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
283 | | |
284 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
285 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
286 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
287 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
288 | | |
289 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
290 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
291 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
292 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
293 | | |
294 | | // state[8] 0x00d800 Byte 3 of 3 |
295 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
296 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
297 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
298 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
299 | | |
300 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
301 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
302 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
303 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
304 | | |
305 | | RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, |
306 | | RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, |
307 | | RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, |
308 | | RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, |
309 | | |
310 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
311 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
312 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
313 | | X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, |
314 | | }; |
315 | | |
316 | | // Remap base[0] = (del, add, string_offset) |
317 | | static const RemapEntry utf8acceptnonsurrogates_remap_base[] = { |
318 | | {0, 0, 0} }; |
319 | | |
320 | | // Remap string[0] |
321 | | static const unsigned char utf8acceptnonsurrogates_remap_string[] = { |
322 | | 0 }; |
323 | | |
324 | | static const unsigned char utf8acceptnonsurrogates_fast[256] = { |
325 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
326 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
327 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
328 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
329 | | |
330 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
331 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
332 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
333 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
334 | | |
335 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
336 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
337 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
338 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
339 | | |
340 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
341 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
342 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
343 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
344 | | }; |
345 | | |
346 | | static const UTF8ScanObj utf8acceptnonsurrogates_obj = { |
347 | | utf8acceptnonsurrogates_STATE0, |
348 | | utf8acceptnonsurrogates_STATE0_SIZE, |
349 | | utf8acceptnonsurrogates_TOTAL_SIZE, |
350 | | utf8acceptnonsurrogates_MAX_EXPAND_X4, |
351 | | utf8acceptnonsurrogates_SHIFT, |
352 | | utf8acceptnonsurrogates_BYTES, |
353 | | utf8acceptnonsurrogates_LOSUB, |
354 | | utf8acceptnonsurrogates_HIADD, |
355 | | utf8acceptnonsurrogates, |
356 | | utf8acceptnonsurrogates_remap_base, |
357 | | utf8acceptnonsurrogates_remap_string, |
358 | | utf8acceptnonsurrogates_fast |
359 | | }; |
360 | | |
361 | | |
362 | | #undef X__ |
363 | | #undef RJ_ |
364 | | #undef S1_ |
365 | | #undef S2_ |
366 | | #undef S3_ |
367 | | #undef S21 |
368 | | #undef S31 |
369 | | #undef S32 |
370 | | #undef T1_ |
371 | | #undef T2_ |
372 | | #undef S11 |
373 | | #undef SP_ |
374 | | #undef D__ |
375 | | #undef RJA |
376 | | |
377 | | // Return true if current Tbl pointer is within state0 range |
378 | | // Note that unsigned compare checks both ends of range simultaneously |
379 | 0 | static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) { |
380 | 0 | const uint8* Tbl0 = &st->state_table[st->state0]; |
381 | 0 | return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); |
382 | 0 | } |
383 | | |
384 | | // Scan a UTF-8 string based on state table. |
385 | | // Always scan complete UTF-8 characters |
386 | | // Set number of bytes scanned. Return reason for exiting |
387 | | int UTF8GenericScan(const UTF8ScanObj* st, |
388 | | const char * str, |
389 | | int str_length, |
390 | 0 | int* bytes_consumed) { |
391 | 0 | *bytes_consumed = 0; |
392 | 0 | if (str_length == 0) return kExitOK; |
393 | | |
394 | 0 | int eshift = st->entry_shift; |
395 | 0 | const uint8* isrc = reinterpret_cast<const uint8*>(str); |
396 | 0 | const uint8* src = isrc; |
397 | 0 | const uint8* srclimit = isrc + str_length; |
398 | 0 | const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7; |
399 | 0 | const uint8* Tbl_0 = &st->state_table[st->state0]; |
400 | |
|
401 | 0 | DoAgain: |
402 | | // Do state-table scan |
403 | 0 | int e = 0; |
404 | 0 | uint8 c; |
405 | 0 | const uint8* Tbl2 = &st->fast_state[0]; |
406 | 0 | const uint32 losub = st->losub; |
407 | 0 | const uint32 hiadd = st->hiadd; |
408 | | // Check initial few bytes one at a time until 8-byte aligned |
409 | | //---------------------------- |
410 | 0 | while ((((uintptr_t)src & 0x07) != 0) && |
411 | 0 | (src < srclimit) && |
412 | 0 | Tbl2[src[0]] == 0) { |
413 | 0 | src++; |
414 | 0 | } |
415 | 0 | if (((uintptr_t)src & 0x07) == 0) { |
416 | | // Do fast for groups of 8 identity bytes. |
417 | | // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, |
418 | | // including slowing slightly on cr/lf/ht |
419 | | //---------------------------- |
420 | 0 | while (src < srclimit8) { |
421 | 0 | uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; |
422 | 0 | uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1]; |
423 | 0 | src += 8; |
424 | | // This is a fast range check for all bytes in [lowsub..0x80-hiadd) |
425 | 0 | uint32 temp = (s0123 - losub) | (s0123 + hiadd) | |
426 | 0 | (s4567 - losub) | (s4567 + hiadd); |
427 | 0 | if ((temp & 0x80808080) != 0) { |
428 | | // We typically end up here on cr/lf/ht; src was incremented |
429 | 0 | int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | |
430 | 0 | (Tbl2[src[-6]] | Tbl2[src[-5]]); |
431 | 0 | if (e0123 != 0) { |
432 | 0 | src -= 8; |
433 | 0 | break; |
434 | 0 | } // Exit on Non-interchange |
435 | 0 | e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | |
436 | 0 | (Tbl2[src[-2]] | Tbl2[src[-1]]); |
437 | 0 | if (e0123 != 0) { |
438 | 0 | src -= 4; |
439 | 0 | break; |
440 | 0 | } // Exit on Non-interchange |
441 | | // Else OK, go around again |
442 | 0 | } |
443 | 0 | } |
444 | 0 | } |
445 | | //---------------------------- |
446 | | |
447 | | // Byte-at-a-time scan |
448 | | //---------------------------- |
449 | 0 | const uint8* Tbl = Tbl_0; |
450 | 0 | while (src < srclimit) { |
451 | 0 | c = *src; |
452 | 0 | e = Tbl[c]; |
453 | 0 | src++; |
454 | 0 | if (e >= kExitIllegalStructure) {break;} |
455 | 0 | Tbl = &Tbl_0[e << eshift]; |
456 | 0 | } |
457 | | //---------------------------- |
458 | | |
459 | | // Exit possibilities: |
460 | | // Some exit code, !state0, back up over last char |
461 | | // Some exit code, state0, back up one byte exactly |
462 | | // source consumed, !state0, back up over partial char |
463 | | // source consumed, state0, exit OK |
464 | | // For illegal byte in state0, avoid backup up over PREVIOUS char |
465 | | // For truncated last char, back up to beginning of it |
466 | |
|
467 | 0 | if (e >= kExitIllegalStructure) { |
468 | | // Back up over exactly one byte of rejected/illegal UTF-8 character |
469 | 0 | src--; |
470 | | // Back up more if needed |
471 | 0 | if (!InStateZero(st, Tbl)) { |
472 | 0 | do { |
473 | 0 | src--; |
474 | 0 | } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
475 | 0 | } |
476 | 0 | } else if (!InStateZero(st, Tbl)) { |
477 | | // Back up over truncated UTF-8 character |
478 | 0 | e = kExitIllegalStructure; |
479 | 0 | do { |
480 | 0 | src--; |
481 | 0 | } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); |
482 | 0 | } else { |
483 | | // Normal termination, source fully consumed |
484 | 0 | e = kExitOK; |
485 | 0 | } |
486 | |
|
487 | 0 | if (e == kExitDoAgain) { |
488 | | // Loop back up to the fast scan |
489 | 0 | goto DoAgain; |
490 | 0 | } |
491 | | |
492 | 0 | *bytes_consumed = src - isrc; |
493 | 0 | return e; |
494 | 0 | } |
495 | | |
496 | | int UTF8GenericScanFastAscii(const UTF8ScanObj* st, |
497 | | const char * str, |
498 | | int str_length, |
499 | 0 | int* bytes_consumed) { |
500 | 0 | *bytes_consumed = 0; |
501 | 0 | if (str_length == 0) return kExitOK; |
502 | | |
503 | 0 | const uint8* isrc = reinterpret_cast<const uint8*>(str); |
504 | 0 | const uint8* src = isrc; |
505 | 0 | const uint8* srclimit = isrc + str_length; |
506 | 0 | const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7; |
507 | 0 | int n; |
508 | 0 | int rest_consumed; |
509 | 0 | int exit_reason; |
510 | 0 | do { |
511 | | // Check initial few bytes one at a time until 8-byte aligned |
512 | 0 | while ((((uintptr_t)src & 0x07) != 0) && |
513 | 0 | (src < srclimit) && (src[0] < 0x80)) { |
514 | 0 | src++; |
515 | 0 | } |
516 | 0 | if (((uintptr_t)src & 0x07) == 0) { |
517 | 0 | while ((src < srclimit8) && |
518 | 0 | (((reinterpret_cast<const uint32*>(src)[0] | |
519 | 0 | reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { |
520 | 0 | src += 8; |
521 | 0 | } |
522 | 0 | } |
523 | 0 | while ((src < srclimit) && (src[0] < 0x80)) { |
524 | 0 | src++; |
525 | 0 | } |
526 | | // Run state table on the rest |
527 | 0 | n = src - isrc; |
528 | 0 | exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed); |
529 | 0 | src += rest_consumed; |
530 | 0 | } while ( exit_reason == kExitDoAgain ); |
531 | |
|
532 | 0 | *bytes_consumed = src - isrc; |
533 | 0 | return exit_reason; |
534 | 0 | } |
535 | | |
536 | | // Hack: On some compilers the static tables are initialized at startup. |
537 | | // We can't use them until they are initialized. However, some Protocol |
538 | | // Buffer parsing happens at static init time and may try to validate |
539 | | // UTF-8 strings. Since UTF-8 validation is only used for debugging |
540 | | // anyway, we simply always return success if initialization hasn't |
541 | | // occurred yet. |
542 | | namespace { |
543 | | |
544 | | bool module_initialized_ = false; |
545 | | |
546 | | struct InitDetector { |
547 | 2 | InitDetector() { |
548 | 2 | module_initialized_ = true; |
549 | 2 | } |
550 | | }; |
551 | | InitDetector init_detector; |
552 | | |
553 | | } // namespace |
554 | | |
555 | 0 | bool IsStructurallyValidUTF8(const char* buf, int len) { |
556 | 0 | if (!module_initialized_) return true; |
557 | | |
558 | 0 | int bytes_consumed = 0; |
559 | 0 | UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, |
560 | 0 | buf, len, &bytes_consumed); |
561 | 0 | return (bytes_consumed == len); |
562 | 0 | } |
563 | | |
564 | 0 | int UTF8SpnStructurallyValid(StringPiece str) { |
565 | 0 | if (!module_initialized_) return str.size(); |
566 | | |
567 | 0 | int bytes_consumed = 0; |
568 | 0 | UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj, |
569 | 0 | str.data(), str.size(), &bytes_consumed); |
570 | 0 | return bytes_consumed; |
571 | 0 | } |
572 | | |
573 | | // Coerce UTF-8 byte string in src_str to be |
574 | | // a structurally-valid equal-length string by selectively |
575 | | // overwriting illegal bytes with replace_char (typically blank). |
576 | | // replace_char must be legal printable 7-bit Ascii 0x20..0x7e. |
577 | | // src_str is read-only. If any overwriting is needed, a modified byte string |
578 | | // is created in idst, length isrclen. |
579 | | // |
580 | | // Returns pointer to output buffer, isrc if no changes were made, |
581 | | // or idst if some bytes were changed. |
582 | | // |
583 | | // Fast case: all is structurally valid and no byte copying is done. |
584 | | // |
585 | | char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst, |
586 | 0 | const char replace_char) { |
587 | 0 | const char* isrc = src_str.data(); |
588 | 0 | const int len = src_str.length(); |
589 | 0 | int n = UTF8SpnStructurallyValid(src_str); |
590 | 0 | if (n == len) { // Normal case -- all is cool, return |
591 | 0 | return const_cast<char*>(isrc); |
592 | 0 | } else { // Unusual case -- copy w/o bad bytes |
593 | 0 | const char* src = isrc; |
594 | 0 | const char* srclimit = isrc + len; |
595 | 0 | char* dst = idst; |
596 | 0 | memmove(dst, src, n); // Copy initial good chunk |
597 | 0 | src += n; |
598 | 0 | dst += n; |
599 | 0 | while (src < srclimit) { // src points to bogus byte or is off the end |
600 | 0 | dst[0] = replace_char; // replace one bad byte |
601 | 0 | src++; |
602 | 0 | dst++; |
603 | 0 | StringPiece str2(src, srclimit - src); |
604 | 0 | n = UTF8SpnStructurallyValid(str2); // scan the remainder |
605 | 0 | memmove(dst, src, n); // copy next good chunk |
606 | 0 | src += n; |
607 | 0 | dst += n; |
608 | 0 | } |
609 | 0 | } |
610 | 0 | return idst; |
611 | 0 | } |
612 | | |
613 | | } // namespace internal |
614 | | } // namespace protobuf |
615 | | } // namespace google |