/src/sentencepiece/third_party/protobuf-lite/strutil.cc
Line | Count | Source |
1 | | // Protocol Buffers - Google's data interchange format |
2 | | // Copyright 2008 Google Inc. All rights reserved. |
3 | | // https://developers.google.com/protocol-buffers/ |
4 | | // |
5 | | // Redistribution and use in source and binary forms, with or without |
6 | | // modification, are permitted provided that the following conditions are |
7 | | // met: |
8 | | // |
9 | | // * Redistributions of source code must retain the above copyright |
10 | | // notice, this list of conditions and the following disclaimer. |
11 | | // * Redistributions in binary form must reproduce the above |
12 | | // copyright notice, this list of conditions and the following disclaimer |
13 | | // in the documentation and/or other materials provided with the |
14 | | // distribution. |
15 | | // * Neither the name of Google Inc. nor the names of its |
16 | | // contributors may be used to endorse or promote products derived from |
17 | | // this software without specific prior written permission. |
18 | | // |
19 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
20 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
21 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
22 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
23 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | | |
31 | | // from google3/strings/strutil.cc |
32 | | |
33 | | #include <google/protobuf/stubs/strutil.h> |
34 | | |
35 | | #include <errno.h> |
36 | | #include <float.h> // FLT_DIG and DBL_DIG |
37 | | #include <limits.h> |
38 | | #include <stdio.h> |
39 | | #include <cmath> |
40 | | #include <iterator> |
41 | | #include <limits> |
42 | | |
43 | | #include <google/protobuf/stubs/logging.h> |
44 | | #include <google/protobuf/stubs/stl_util.h> |
45 | | |
46 | | #ifdef _WIN32 |
47 | | // MSVC has only _snprintf, not snprintf. |
48 | | // |
49 | | // MinGW has both snprintf and _snprintf, but they appear to be different |
50 | | // functions. The former is buggy. When invoked like so: |
51 | | // char buffer[32]; |
52 | | // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f); |
53 | | // it prints "1.23000e+10". This is plainly wrong: %g should never print |
54 | | // trailing zeros after the decimal point. For some reason this bug only |
55 | | // occurs with some input values, not all. In any case, _snprintf does the |
56 | | // right thing, so we use it. |
57 | | #define snprintf _snprintf |
58 | | #endif |
59 | | |
60 | | namespace google { |
61 | | namespace protobuf { |
62 | | |
63 | | // These are defined as macros on some platforms. #undef them so that we can |
64 | | // redefine them. |
65 | | #undef isxdigit |
66 | | #undef isprint |
67 | | |
68 | | // The definitions of these in ctype.h change based on locale. Since our |
69 | | // string manipulation is all in relation to the protocol buffer and C++ |
70 | | // languages, we always want to use the C locale. So, we re-define these |
71 | | // exactly as we want them. |
// Locale-independent hexadecimal-digit test.
// Returns true only for the ASCII characters 0-9, a-f and A-F.
inline bool isxdigit(char c) {
  if (c >= '0' && c <= '9') return true;
  if (c >= 'a' && c <= 'f') return true;
  return c >= 'A' && c <= 'F';
}
77 | | |
// Locale-independent printable test.
// Returns true only for characters in the range 0x20 (space) .. 0x7E ('~').
inline bool isprint(char c) {
  const bool below_space = c < 0x20;
  const bool above_tilde = c > 0x7E;
  return !(below_space || above_tilde);
}
81 | | |
82 | | // ---------------------------------------------------------------------- |
83 | | // ReplaceCharacters |
84 | | // Replaces any occurrence of the character 'remove' (or the characters |
85 | | // in 'remove') with the character 'replacewith'. |
86 | | // ---------------------------------------------------------------------- |
// Overwrites, in place, every character of *s that appears in the
// NUL-terminated set 'remove' with 'replacewith'.
// Scanning uses strpbrk() on the string's C representation, so it stops at
// the first NUL byte found while scanning.
void ReplaceCharacters(std::string *s, const char *remove, char replacewith) {
  const char *const base = s->c_str();
  const char *hit = strpbrk(base, remove);
  while (hit != nullptr) {
    (*s)[hit - base] = replacewith;
    hit = strpbrk(hit + 1, remove);
  }
}
96 | | |
97 | 0 | void StripWhitespace(std::string *str) { |
98 | 0 | int str_length = str->length(); |
99 | | |
100 | | // Strip off leading whitespace. |
101 | 0 | int first = 0; |
102 | 0 | while (first < str_length && ascii_isspace(str->at(first))) { |
103 | 0 | ++first; |
104 | 0 | } |
105 | | // If entire string is white space. |
106 | 0 | if (first == str_length) { |
107 | 0 | str->clear(); |
108 | 0 | return; |
109 | 0 | } |
110 | 0 | if (first > 0) { |
111 | 0 | str->erase(0, first); |
112 | 0 | str_length -= first; |
113 | 0 | } |
114 | | |
115 | | // Strip off trailing whitespace. |
116 | 0 | int last = str_length - 1; |
117 | 0 | while (last >= 0 && ascii_isspace(str->at(last))) { |
118 | 0 | --last; |
119 | 0 | } |
120 | 0 | if (last != (str_length - 1) && last >= 0) { |
121 | 0 | str->erase(last + 1, std::string::npos); |
122 | 0 | } |
123 | 0 | } |
124 | | |
125 | | // ---------------------------------------------------------------------- |
126 | | // StringReplace() |
127 | | // Replace the "old" pattern with the "new" pattern in a string, |
128 | | // and append the result to "res". If replace_all is false, |
129 | | // it only replaces the first instance of "old." |
130 | | // ---------------------------------------------------------------------- |
131 | | |
// Appends to *res a copy of 's' in which 'oldsub' is replaced by 'newsub'.
// When replace_all is false only the first occurrence is replaced. An empty
// 'oldsub' appends 's' unchanged. Existing contents of *res are preserved.
void StringReplace(const std::string &s, const std::string &oldsub,
                   const std::string &newsub, bool replace_all,
                   std::string *res) {
  if (oldsub.empty()) {
    res->append(s);  // nothing to search for: copy the input verbatim.
    return;
  }

  std::string::size_type cursor = 0;
  for (;;) {
    const std::string::size_type hit = s.find(oldsub, cursor);
    if (hit == std::string::npos) {
      break;
    }
    res->append(s, cursor, hit - cursor);  // text before the match
    res->append(newsub);
    cursor = hit + oldsub.size();  // resume after the matched text
    if (!replace_all) {
      break;
    }
  }
  // Whatever follows the last replacement.
  res->append(s, cursor, s.length() - cursor);
}
153 | | |
154 | | // ---------------------------------------------------------------------- |
155 | | // StringReplace() |
156 | | // Give me a string and two patterns "old" and "new", and I replace |
157 | | // the first instance of "old" in the string with "new", if it |
158 | | // exists. If "global" is true; call this repeatedly until it |
159 | | // fails. RETURN a new string, regardless of whether the replacement |
160 | | // happened or not. |
161 | | // ---------------------------------------------------------------------- |
162 | | |
163 | | std::string StringReplace(const std::string &s, const std::string &oldsub, |
164 | 0 | const std::string &newsub, bool replace_all) { |
165 | 0 | std::string ret; |
166 | 0 | StringReplace(s, oldsub, newsub, replace_all, &ret); |
167 | 0 | return ret; |
168 | 0 | } |
169 | | |
170 | | // ---------------------------------------------------------------------- |
171 | | // SplitStringUsing() |
172 | | // Split a string using a character delimiter. Append the components |
173 | | // to 'result'. |
174 | | // |
175 | | // Note: For multi-character delimiters, this routine will split on *ANY* of |
176 | | // the characters in the string, not the entire string as a single delimiter. |
177 | | // ---------------------------------------------------------------------- |
178 | | template <typename ITR> |
179 | | static inline void SplitStringToIteratorUsing(StringPiece full, |
180 | 0 | const char *delim, ITR &result) { |
181 | | // Optimize the common case where delim is a single character. |
182 | 0 | if (delim[0] != '\0' && delim[1] == '\0') { |
183 | 0 | char c = delim[0]; |
184 | 0 | const char* p = full.data(); |
185 | 0 | const char* end = p + full.size(); |
186 | 0 | while (p != end) { |
187 | 0 | if (*p == c) { |
188 | 0 | ++p; |
189 | 0 | } else { |
190 | 0 | const char* start = p; |
191 | 0 | while (++p != end && *p != c); |
192 | 0 | *result++ = std::string(start, p - start); |
193 | 0 | } |
194 | 0 | } |
195 | 0 | return; |
196 | 0 | } |
197 | | |
198 | 0 | std::string::size_type begin_index, end_index; |
199 | 0 | begin_index = full.find_first_not_of(delim); |
200 | 0 | while (begin_index != std::string::npos) { |
201 | 0 | end_index = full.find_first_of(delim, begin_index); |
202 | 0 | if (end_index == std::string::npos) { |
203 | 0 | *result++ = std::string(full.substr(begin_index)); |
204 | 0 | return; |
205 | 0 | } |
206 | 0 | *result++ = |
207 | 0 | std::string(full.substr(begin_index, (end_index - begin_index))); |
208 | 0 | begin_index = full.find_first_not_of(delim, end_index); |
209 | 0 | } |
210 | 0 | } |
211 | | |
212 | | void SplitStringUsing(StringPiece full, const char *delim, |
213 | 0 | std::vector<std::string> *result) { |
214 | 0 | std::back_insert_iterator<std::vector<std::string> > it(*result); |
215 | 0 | SplitStringToIteratorUsing(full, delim, it); |
216 | 0 | } |
217 | | |
218 | | // Split a string using a character delimiter. Append the components |
219 | | // to 'result'. If there are consecutive delimiters, this function |
220 | | // will return corresponding empty strings. The string is split into |
221 | | // at most the specified number of pieces greedily. This means that the |
222 | | // last piece may possibly be split further. To split into as many pieces |
223 | | // as possible, specify 0 as the number of pieces. |
224 | | // |
225 | | // If "full" is the empty string, yields an empty string as the only value. |
226 | | // |
227 | | // If "pieces" is negative for some reason, it returns the whole string |
228 | | // ---------------------------------------------------------------------- |
229 | | template <typename ITR> |
230 | | static inline void SplitStringToIteratorAllowEmpty(StringPiece full, |
231 | | const char *delim, |
232 | 0 | int pieces, ITR &result) { |
233 | 0 | std::string::size_type begin_index, end_index; |
234 | 0 | begin_index = 0; |
235 | |
|
236 | 0 | for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { |
237 | 0 | end_index = full.find_first_of(delim, begin_index); |
238 | 0 | if (end_index == std::string::npos) { |
239 | 0 | *result++ = std::string(full.substr(begin_index)); |
240 | 0 | return; |
241 | 0 | } |
242 | 0 | *result++ = |
243 | 0 | std::string(full.substr(begin_index, (end_index - begin_index))); |
244 | 0 | begin_index = end_index + 1; |
245 | 0 | } |
246 | 0 | *result++ = std::string(full.substr(begin_index)); |
247 | 0 | } |
248 | | |
249 | | void SplitStringAllowEmpty(StringPiece full, const char *delim, |
250 | 0 | std::vector<std::string> *result) { |
251 | 0 | std::back_insert_iterator<std::vector<std::string> > it(*result); |
252 | 0 | SplitStringToIteratorAllowEmpty(full, delim, 0, it); |
253 | 0 | } |
254 | | |
255 | | // ---------------------------------------------------------------------- |
256 | | // JoinStrings() |
257 | | // This merges a vector of string components with delim inserted |
258 | | // as separators between components. |
259 | | // |
260 | | // ---------------------------------------------------------------------- |
261 | | template <class ITERATOR> |
262 | | static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end, |
263 | 0 | const char *delim, std::string *result) { |
264 | 0 | GOOGLE_CHECK(result != nullptr); |
265 | 0 | result->clear(); |
266 | 0 | int delim_length = strlen(delim); |
267 | | |
268 | | // Precompute resulting length so we can reserve() memory in one shot. |
269 | 0 | int length = 0; |
270 | 0 | for (ITERATOR iter = start; iter != end; ++iter) { |
271 | 0 | if (iter != start) { |
272 | 0 | length += delim_length; |
273 | 0 | } |
274 | 0 | length += iter->size(); |
275 | 0 | } |
276 | 0 | result->reserve(length); |
277 | | |
278 | | // Now combine everything. |
279 | 0 | for (ITERATOR iter = start; iter != end; ++iter) { |
280 | 0 | if (iter != start) { |
281 | 0 | result->append(delim, delim_length); |
282 | 0 | } |
283 | 0 | result->append(iter->data(), iter->size()); |
284 | 0 | } |
285 | 0 | } |
286 | | |
287 | | void JoinStrings(const std::vector<std::string> &components, const char *delim, |
288 | 0 | std::string *result) { |
289 | 0 | JoinStringsIterator(components.begin(), components.end(), delim, result); |
290 | 0 | } |
291 | | |
292 | | // ---------------------------------------------------------------------- |
293 | | // UnescapeCEscapeSequences() |
294 | | // This does all the unescaping that C does: \ooo, \r, \n, etc |
295 | | // Returns length of resulting string. |
296 | | // The implementation of \x parses any positive number of hex digits, |
297 | | // but it is an error if the value requires more than 8 bits, and the |
298 | | // result is truncated to 8 bits. |
299 | | // |
300 | | // The second call stores its errors in a supplied string vector. |
301 | | // If the string vector pointer is nullptr, it reports the errors with LOG(). |
302 | | // ---------------------------------------------------------------------- |
303 | | |
// True when 'c' is one of the characters '0' through '7'. The argument is
// evaluated twice, so it must not have side effects.
#define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))

// Protocol buffers doesn't ever care about errors, but I don't want to remove
// the code.
// Expands to GOOGLE_LOG_IF with a constant-false condition; the VECTOR
// argument is not used by the expansion.
#define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
309 | | |
310 | 0 | int UnescapeCEscapeSequences(const char* source, char* dest) { |
311 | 0 | return UnescapeCEscapeSequences(source, dest, nullptr); |
312 | 0 | } |
313 | | |
// Decodes C-style backslash escapes from the NUL-terminated 'source' into
// 'dest', NUL-terminates 'dest', and returns the number of bytes written (not
// counting the terminator).
//
// Handled escapes: \a \b \f \n \r \t \v \\ \? \' \" , one to three octal
// digits, and \x / \X followed by one or more hex digits (the value is
// narrowed to a single char when stored). An unknown escape, or \x without a
// hex digit, produces no output; a lone trailing backslash ends decoding.
//
// 'errors' must be nullptr (checked with GOOGLE_DCHECK). Diagnostics are
// routed through LOG_STRING, which this file defines with a constant-false
// log condition.
//
// 'dest' may be the same buffer as 'source'; every escape produces no more
// bytes than it consumes. NOTE(review): the required capacity of 'dest' is
// not checked here -- callers are presumably expected to provide at least
// strlen(source) + 1 bytes; confirm against the header.
int UnescapeCEscapeSequences(const char *source, char *dest,
                             std::vector<std::string> *errors) {
  GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";

  char* d = dest;          // write cursor
  const char* p = source;  // read cursor

  // Small optimization for case where source = dest and there's no escaping
  while ( p == d && *p != '\0' && *p != '\\' )
    p++, d++;

  while (*p != '\0') {
    if (*p != '\\') {
      *d++ = *p++;
    } else {
      switch ( *++p ) {                    // skip past the '\\'
        case '\0':
          LOG_STRING(ERROR, errors) << "String cannot end with \\";
          *d = '\0';
          return d - dest;   // we're done with p
        case 'a':  *d++ = '\a';  break;
        case 'b':  *d++ = '\b';  break;
        case 'f':  *d++ = '\f';  break;
        case 'n':  *d++ = '\n';  break;
        case 'r':  *d++ = '\r';  break;
        case 't':  *d++ = '\t';  break;
        case 'v':  *d++ = '\v';  break;
        case '\\': *d++ = '\\';  break;
        case '?':  *d++ = '\?';  break;    // \?  Who knew?
        case '\'': *d++ = '\'';  break;
        case '"':  *d++ = '\"';  break;
        case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
        case '4': case '5': case '6': case '7': {
          // Accumulate up to three octal digits; p is left on the last digit
          // consumed so the shared p++ below steps past it.
          char ch = *p - '0';
          if ( IS_OCTAL_DIGIT(p[1]) )
            ch = ch * 8 + *++p - '0';
          if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
            ch = ch * 8 + *++p - '0';      // now points at last digit
          *d++ = ch;
          break;
        }
        case 'x': case 'X': {
          if (!isxdigit(p[1])) {
            if (p[1] == '\0') {
              LOG_STRING(ERROR, errors) << "String cannot end with \\x";
            } else {
              LOG_STRING(ERROR, errors) <<
                "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
            }
            // Nothing is written for a malformed \x; the shared p++ below
            // skips the 'x' itself.
            break;
          }
          unsigned int ch = 0;
          const char *hex_start = p;
          while (isxdigit(p[1]))  // arbitrarily many hex digits
            ch = (ch << 4) + hex_digit_to_int(*++p);
          if (ch > 0xFF)
            LOG_STRING(ERROR, errors)
                << "Value of "
                << "\\" << std::string(hex_start, p + 1 - hex_start)
                << " exceeds 8 bits";
          *d++ = ch;  // stored into a char, so only the low bits are kept
          break;
        }
#if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
        case 'u': {
          // \uhhhh => convert 4 hex digits to UTF-8
          char32 rune = 0;
          const char *hex_start = p;
          for (int i = 0; i < 4; ++i) {
            if (isxdigit(p[1])) {  // Look one char ahead.
              rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
            } else {
              LOG_STRING(ERROR, errors)
                << "\\u must be followed by 4 hex digits: \\"
                <<  std::string(hex_start, p+1-hex_start);
              break;
            }
          }
          d += runetochar(d, &rune);
          break;
        }
        case 'U': {
          // \Uhhhhhhhh => convert 8 hex digits to UTF-8
          char32 rune = 0;
          const char *hex_start = p;
          for (int i = 0; i < 8; ++i) {
            if (isxdigit(p[1])) {  // Look one char ahead.
              // Don't change rune until we're sure this
              // is within the Unicode limit, but do advance p.
              char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
              if (newrune > 0x10FFFF) {
                LOG_STRING(ERROR, errors)
                  << "Value of \\"
                  << std::string(hex_start, p + 1 - hex_start)
                  << " exceeds Unicode limit (0x10FFFF)";
                break;
              } else {
                rune = newrune;
              }
            } else {
              LOG_STRING(ERROR, errors)
                << "\\U must be followed by 8 hex digits: \\"
                <<  std::string(hex_start, p+1-hex_start);
              break;
            }
          }
          d += runetochar(d, &rune);
          break;
        }
#endif
        default:
          // Unknown escape: nothing is written and the character after the
          // backslash is skipped by the p++ below.
          LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
      }
      p++;                                 // read past letter we escaped
    }
  }
  *d = '\0';
  return d - dest;
}
433 | | |
434 | | // ---------------------------------------------------------------------- |
435 | | // UnescapeCEscapeString() |
436 | | // This does the same thing as UnescapeCEscapeSequences, but creates |
437 | | // a new string. The caller does not need to worry about allocating |
438 | | // a dest buffer. This should be used for non performance critical |
439 | | // tasks such as printing debug messages. It is safe for src and dest |
440 | | // to be the same. |
441 | | // |
442 | | // The second call stores its errors in a supplied string vector. |
443 | | // If the string vector pointer is nullptr, it reports the errors with LOG(). |
444 | | // |
445 | | // In the first and second calls, the length of dest is returned. In the |
446 | | // third call, the new string is returned. |
447 | | // ---------------------------------------------------------------------- |
448 | 0 | int UnescapeCEscapeString(const std::string &src, std::string *dest) { |
449 | 0 | return UnescapeCEscapeString(src, dest, nullptr); |
450 | 0 | } |
451 | | |
452 | | int UnescapeCEscapeString(const std::string &src, std::string *dest, |
453 | 0 | std::vector<std::string> *errors) { |
454 | 0 | std::unique_ptr<char[]> unescaped(new char[src.size() + 1]); |
455 | 0 | int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors); |
456 | 0 | GOOGLE_CHECK(dest); |
457 | 0 | dest->assign(unescaped.get(), len); |
458 | 0 | return len; |
459 | 0 | } |
460 | | |
461 | 0 | std::string UnescapeCEscapeString(const std::string &src) { |
462 | 0 | std::unique_ptr<char[]> unescaped(new char[src.size() + 1]); |
463 | 0 | int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr); |
464 | 0 | return std::string(unescaped.get(), len); |
465 | 0 | } |
466 | | |
467 | | // ---------------------------------------------------------------------- |
468 | | // CEscapeString() |
469 | | // CHexEscapeString() |
470 | | // Copies 'src' to 'dest', escaping dangerous characters using |
471 | | // C-style escape sequences. This is very useful for preparing query |
472 | | // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses |
473 | | // hexadecimal rather than octal sequences. |
474 | | // Returns the number of bytes written to 'dest' (not including the \0) |
475 | | // or -1 if there was insufficient space. |
476 | | // |
477 | | // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped. |
478 | | // ---------------------------------------------------------------------- |
// Escapes the 'src_len' bytes at 'src' into 'dest' using C-style escape
// sequences and NUL-terminates the result.
//
//   dest_len   capacity of 'dest' in bytes, including room for the final NUL.
//   use_hex    emit \xNN (lowercase hex) instead of \NNN (octal) for bytes
//              that need a numeric escape.
//   utf8_safe  copy bytes >= 0x80 through unchanged instead of escaping them.
//
// Returns the number of bytes written, not counting the terminating NUL, or
// -1 if 'dest' is too small.
//
// Numeric escapes are written digit by digit rather than with sprintf():
// sprintf() appends a terminating NUL after the four escape characters, which
// could land one byte past 'dest_len' when exactly four bytes remained.
int CEscapeInternal(const char* src, int src_len, char* dest,
                    int dest_len, bool use_hex, bool utf8_safe) {
  static const char kHexDigits[] = "0123456789abcdef";
  const char* src_end = src + src_len;
  int used = 0;
  bool last_hex_escape = false;  // true if last output char was \xNN

  for (; src < src_end; src++) {
    if (dest_len - used < 2)   // Need space for two letter escape
      return -1;

    bool is_hex_escape = false;
    switch (*src) {
      case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
      case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
      case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
      case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
      case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
      case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
      default: {
        const unsigned char byte = static_cast<unsigned char>(*src);
        // Note that if we emit \xNN and the src character after that is a hex
        // digit then that digit must be escaped too to prevent it being
        // interpreted as part of the character code by C.
        if ((!utf8_safe || byte < 0x80) &&
            (!isprint(*src) ||
             (last_hex_escape && isxdigit(*src)))) {
          if (dest_len - used < 4)  // need space for 4 letter escape
            return -1;
          dest[used++] = '\\';
          if (use_hex) {
            dest[used++] = 'x';
            dest[used++] = kHexDigits[byte >> 4];
            dest[used++] = kHexDigits[byte & 0x0F];
          } else {
            dest[used++] = static_cast<char>('0' + (byte >> 6));
            dest[used++] = static_cast<char>('0' + ((byte >> 3) & 7));
            dest[used++] = static_cast<char>('0' + (byte & 7));
          }
          is_hex_escape = use_hex;
        } else {
          dest[used++] = *src;
        }
        break;
      }
    }
    last_hex_escape = is_hex_escape;
  }

  if (dest_len - used < 1)   // make sure that there is room for \0
    return -1;

  dest[used] = '\0';   // doesn't count towards return value though
  return used;
}
523 | | |
524 | | // Calculates the length of the C-style escaped version of 'src'. |
525 | | // Assumes that non-printable characters are escaped using octal sequences, and |
526 | | // that UTF-8 bytes are not handled specially. |
527 | 0 | static inline size_t CEscapedLength(StringPiece src) { |
528 | 0 | static char c_escaped_len[256] = { |
529 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r |
530 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
531 | 0 | 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", ' |
532 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9' |
533 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O' |
534 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\' |
535 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o' |
536 | 0 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL |
537 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
538 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
539 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
540 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
541 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
542 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
543 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
544 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
545 | 0 | }; |
546 | |
|
547 | 0 | size_t escaped_len = 0; |
548 | 0 | for (int i = 0; i < src.size(); ++i) { |
549 | 0 | unsigned char c = static_cast<unsigned char>(src[i]); |
550 | 0 | escaped_len += c_escaped_len[c]; |
551 | 0 | } |
552 | 0 | return escaped_len; |
553 | 0 | } |
554 | | |
555 | | // ---------------------------------------------------------------------- |
556 | | // Escapes 'src' using C-style escape sequences, and appends the escaped string |
557 | | // to 'dest'. This version is faster than calling CEscapeInternal as it computes |
558 | | // the required space using a lookup table, and also does not do any special |
559 | | // handling for Hex or UTF-8 characters. |
560 | | // ---------------------------------------------------------------------- |
561 | 0 | void CEscapeAndAppend(StringPiece src, std::string *dest) { |
562 | 0 | size_t escaped_len = CEscapedLength(src); |
563 | 0 | if (escaped_len == src.size()) { |
564 | 0 | dest->append(src.data(), src.size()); |
565 | 0 | return; |
566 | 0 | } |
567 | | |
568 | 0 | size_t cur_dest_len = dest->size(); |
569 | 0 | dest->resize(cur_dest_len + escaped_len); |
570 | 0 | char* append_ptr = &(*dest)[cur_dest_len]; |
571 | |
|
572 | 0 | for (int i = 0; i < src.size(); ++i) { |
573 | 0 | unsigned char c = static_cast<unsigned char>(src[i]); |
574 | 0 | switch (c) { |
575 | 0 | case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break; |
576 | 0 | case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break; |
577 | 0 | case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break; |
578 | 0 | case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break; |
579 | 0 | case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break; |
580 | 0 | case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break; |
581 | 0 | default: |
582 | 0 | if (!isprint(c)) { |
583 | 0 | *append_ptr++ = '\\'; |
584 | 0 | *append_ptr++ = '0' + c / 64; |
585 | 0 | *append_ptr++ = '0' + (c % 64) / 8; |
586 | 0 | *append_ptr++ = '0' + c % 8; |
587 | 0 | } else { |
588 | 0 | *append_ptr++ = c; |
589 | 0 | } |
590 | 0 | break; |
591 | 0 | } |
592 | 0 | } |
593 | 0 | } |
594 | | |
595 | 0 | std::string CEscape(const std::string &src) { |
596 | 0 | std::string dest; |
597 | 0 | CEscapeAndAppend(src, &dest); |
598 | 0 | return dest; |
599 | 0 | } |
600 | | |
601 | | namespace strings { |
602 | | |
603 | 0 | std::string Utf8SafeCEscape(const std::string &src) { |
604 | 0 | const int dest_length = src.size() * 4 + 1; // Maximum possible expansion |
605 | 0 | std::unique_ptr<char[]> dest(new char[dest_length]); |
606 | 0 | const int len = CEscapeInternal(src.data(), src.size(), |
607 | 0 | dest.get(), dest_length, false, true); |
608 | 0 | GOOGLE_DCHECK_GE(len, 0); |
609 | 0 | return std::string(dest.get(), len); |
610 | 0 | } |
611 | | |
612 | 0 | std::string CHexEscape(const std::string &src) { |
613 | 0 | const int dest_length = src.size() * 4 + 1; // Maximum possible expansion |
614 | 0 | std::unique_ptr<char[]> dest(new char[dest_length]); |
615 | 0 | const int len = CEscapeInternal(src.data(), src.size(), |
616 | 0 | dest.get(), dest_length, true, false); |
617 | 0 | GOOGLE_DCHECK_GE(len, 0); |
618 | 0 | return std::string(dest.get(), len); |
619 | 0 | } |
620 | | |
621 | | } // namespace strings |
622 | | |
623 | | // ---------------------------------------------------------------------- |
624 | | // strto32_adaptor() |
625 | | // strtou32_adaptor() |
626 | | // Implementation of strto[u]l replacements that have identical |
627 | | // overflow and underflow characteristics for both ILP-32 and LP-64 |
628 | | // platforms, including errno preservation in error-free calls. |
629 | | // ---------------------------------------------------------------------- |
630 | | |
631 | 0 | int32 strto32_adaptor(const char *nptr, char **endptr, int base) { |
632 | 0 | const int saved_errno = errno; |
633 | 0 | errno = 0; |
634 | 0 | const long result = strtol(nptr, endptr, base); |
635 | 0 | if (errno == ERANGE && result == LONG_MIN) { |
636 | 0 | return kint32min; |
637 | 0 | } else if (errno == ERANGE && result == LONG_MAX) { |
638 | 0 | return kint32max; |
639 | 0 | } else if (errno == 0 && result < kint32min) { |
640 | 0 | errno = ERANGE; |
641 | 0 | return kint32min; |
642 | 0 | } else if (errno == 0 && result > kint32max) { |
643 | 0 | errno = ERANGE; |
644 | 0 | return kint32max; |
645 | 0 | } |
646 | 0 | if (errno == 0) |
647 | 0 | errno = saved_errno; |
648 | 0 | return static_cast<int32>(result); |
649 | 0 | } |
650 | | |
651 | 0 | uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) { |
652 | 0 | const int saved_errno = errno; |
653 | 0 | errno = 0; |
654 | 0 | const unsigned long result = strtoul(nptr, endptr, base); |
655 | 0 | if (errno == ERANGE && result == ULONG_MAX) { |
656 | 0 | return kuint32max; |
657 | 0 | } else if (errno == 0 && result > kuint32max) { |
658 | 0 | errno = ERANGE; |
659 | 0 | return kuint32max; |
660 | 0 | } |
661 | 0 | if (errno == 0) |
662 | 0 | errno = saved_errno; |
663 | 0 | return static_cast<uint32>(result); |
664 | 0 | } |
665 | | |
// Trims leading/trailing spaces (' ' only) from *text and consumes one
// optional leading '+' or '-'.
// On success *text holds what remains after the sign, *negative_ptr says
// whether the sign was '-', and true is returned. Returns false, leaving
// *text unchanged, when nothing remains after trimming or after the sign.
inline bool safe_parse_sign(std::string *text /*inout*/,
                            bool *negative_ptr /*output*/) {
  const char* const base = text->data();
  const char* first = base;
  const char* last = base + text->size();

  // Consume whitespace at both ends.
  for (; first < last && *first == ' '; ++first) {
  }
  for (; first < last && *(last - 1) == ' '; --last) {
  }
  if (first >= last) {
    return false;
  }

  // Consume sign.
  const bool minus = (*first == '-');
  *negative_ptr = minus;
  if (minus || *first == '+') {
    ++first;
    if (first >= last) {
      return false;
    }
  }
  *text = text->substr(first - base, last - first);
  return true;
}
693 | | |
// Parses 'text' as an unsigned run of base-10 digits into *value_p.
// Returns true when every character is a digit and the value fits in IntType.
// On a non-digit character, returns false with *value_p set to the value
// accumulated so far; on overflow, returns false with *value_p set to the
// maximum IntType value. An empty string parses as 0.
template <typename IntType>
bool safe_parse_positive_int(std::string text, IntType *value_p) {
  const int base = 10;
  const IntType vmax = std::numeric_limits<IntType>::max();
  assert(vmax > 0);
  assert(vmax >= base);
  const IntType vmax_over_base = vmax / base;

  IntType acc = 0;
  for (const char ch : text) {
    const int digit = static_cast<unsigned char>(ch) - '0';
    if (digit < 0 || digit >= base) {
      *value_p = acc;
      return false;
    }
    // Multiplying by the base would overflow.
    if (acc > vmax_over_base) {
      *value_p = vmax;
      return false;
    }
    acc *= base;
    // Adding the digit would overflow.
    if (acc > vmax - digit) {
      *value_p = vmax;
      return false;
    }
    acc += digit;
  }
  *value_p = acc;
  return true;
}
726 | | |
// Parses a run of base-10 digits as the magnitude of a negative IntType.
//
// `text` holds digits only (no sign); the result is accumulated downwards
// from zero so that the most negative value of IntType can be represented.
//
// Returns true and stores the (non-positive) number in *value_p on success;
// an empty `text` yields 0.  Returns false when a non-digit is met (*value_p
// is the value accumulated so far) or when the number underflows (*value_p is
// std::numeric_limits<IntType>::min()).
template <typename IntType>
bool safe_parse_negative_int(const std::string &text, IntType *value_p) {
  int radix = 10;
  const IntType lowest = std::numeric_limits<IntType>::min();
  assert(lowest < 0);
  assert(lowest <= 0 - radix);

  // Smallest accumulator that can still be multiplied by the radix.
  // The 2003 C++ standard leaves the sign of a negative remainder
  // implementation-defined, so compensate if the quotient was rounded down.
  IntType lowest_over_radix = lowest / radix;
  if (lowest % radix > 0) {
    lowest_over_radix += 1;
  }

  IntType acc = 0;
  const char* cursor = text.data();
  const char* const stop = cursor + text.size();
  while (cursor < stop) {
    const unsigned char ch = static_cast<unsigned char>(*cursor);
    const int digit = ch - '0';
    if (digit < 0 || digit >= radix) {
      *value_p = acc;
      return false;
    }
    // Guard the multiplication.
    if (acc < lowest_over_radix) {
      *value_p = lowest;
      return false;
    }
    acc *= radix;
    // Guard the subtraction.
    if (acc < lowest + digit) {
      *value_p = lowest;
      return false;
    }
    acc -= digit;
    ++cursor;
  }
  *value_p = acc;
  return true;
}
766 | | |
767 | | template <typename IntType> |
768 | 0 | bool safe_int_internal(std::string text, IntType *value_p) { |
769 | 0 | *value_p = 0; |
770 | 0 | bool negative; |
771 | 0 | if (!safe_parse_sign(&text, &negative)) { |
772 | 0 | return false; |
773 | 0 | } |
774 | 0 | if (!negative) { |
775 | 0 | return safe_parse_positive_int(text, value_p); |
776 | 0 | } else { |
777 | 0 | return safe_parse_negative_int(text, value_p); |
778 | 0 | } |
779 | 0 | } Unexecuted instantiation: bool google::protobuf::safe_int_internal<int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, int*) Unexecuted instantiation: bool google::protobuf::safe_int_internal<long>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, long*) |
780 | | |
781 | | template <typename IntType> |
782 | 0 | bool safe_uint_internal(std::string text, IntType *value_p) { |
783 | 0 | *value_p = 0; |
784 | 0 | bool negative; |
785 | 0 | if (!safe_parse_sign(&text, &negative) || negative) { |
786 | 0 | return false; |
787 | 0 | } |
788 | 0 | return safe_parse_positive_int(text, value_p); |
789 | 0 | } Unexecuted instantiation: bool google::protobuf::safe_uint_internal<unsigned int>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, unsigned int*) Unexecuted instantiation: bool google::protobuf::safe_uint_internal<unsigned long>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, unsigned long*) |
790 | | |
791 | | // ---------------------------------------------------------------------- |
792 | | // FastIntToBuffer() |
793 | | // FastInt64ToBuffer() |
794 | | // FastHexToBuffer() |
795 | | // FastHex64ToBuffer() |
796 | | // FastHex32ToBuffer() |
797 | | // ---------------------------------------------------------------------- |
798 | | |
799 | | // Offset into buffer where FastInt64ToBuffer places the end of string |
800 | | // null character. Also used by FastInt64ToBufferLeft. |
801 | | static const int kFastInt64ToBufferOffset = 21; |
802 | | |
803 | 0 | char *FastInt64ToBuffer(int64 i, char* buffer) { |
804 | | // We could collapse the positive and negative sections, but that |
805 | | // would be slightly slower for positive numbers... |
806 | | // 22 bytes is enough to store -2**64, -18446744073709551616. |
807 | 0 | char* p = buffer + kFastInt64ToBufferOffset; |
808 | 0 | *p-- = '\0'; |
809 | 0 | if (i >= 0) { |
810 | 0 | do { |
811 | 0 | *p-- = '0' + i % 10; |
812 | 0 | i /= 10; |
813 | 0 | } while (i > 0); |
814 | 0 | return p + 1; |
815 | 0 | } else { |
816 | | // On different platforms, % and / have different behaviors for |
817 | | // negative numbers, so we need to jump through hoops to make sure |
818 | | // we don't divide negative numbers. |
819 | 0 | if (i > -10) { |
820 | 0 | i = -i; |
821 | 0 | *p-- = '0' + i; |
822 | 0 | *p = '-'; |
823 | 0 | return p; |
824 | 0 | } else { |
825 | | // Make sure we aren't at MIN_INT, in which case we can't say i = -i |
826 | 0 | i = i + 10; |
827 | 0 | i = -i; |
828 | 0 | *p-- = '0' + i % 10; |
829 | | // Undo what we did a moment ago |
830 | 0 | i = i / 10 + 1; |
831 | 0 | do { |
832 | 0 | *p-- = '0' + i % 10; |
833 | 0 | i /= 10; |
834 | 0 | } while (i > 0); |
835 | 0 | *p = '-'; |
836 | 0 | return p; |
837 | 0 | } |
838 | 0 | } |
839 | 0 | } |
840 | | |
841 | | // Offset into buffer where FastInt32ToBuffer places the end of string |
842 | | // null character. Also used by FastInt32ToBufferLeft |
843 | | static const int kFastInt32ToBufferOffset = 11; |
844 | | |
845 | | // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the |
846 | | // compiler to generate 32 bit arithmetic instructions. It's much faster, at |
847 | | // least with 32 bit binaries. |
848 | 0 | char *FastInt32ToBuffer(int32 i, char* buffer) { |
849 | | // We could collapse the positive and negative sections, but that |
850 | | // would be slightly slower for positive numbers... |
851 | | // 12 bytes is enough to store -2**32, -4294967296. |
852 | 0 | char* p = buffer + kFastInt32ToBufferOffset; |
853 | 0 | *p-- = '\0'; |
854 | 0 | if (i >= 0) { |
855 | 0 | do { |
856 | 0 | *p-- = '0' + i % 10; |
857 | 0 | i /= 10; |
858 | 0 | } while (i > 0); |
859 | 0 | return p + 1; |
860 | 0 | } else { |
861 | | // On different platforms, % and / have different behaviors for |
862 | | // negative numbers, so we need to jump through hoops to make sure |
863 | | // we don't divide negative numbers. |
864 | 0 | if (i > -10) { |
865 | 0 | i = -i; |
866 | 0 | *p-- = '0' + i; |
867 | 0 | *p = '-'; |
868 | 0 | return p; |
869 | 0 | } else { |
870 | | // Make sure we aren't at MIN_INT, in which case we can't say i = -i |
871 | 0 | i = i + 10; |
872 | 0 | i = -i; |
873 | 0 | *p-- = '0' + i % 10; |
874 | | // Undo what we did a moment ago |
875 | 0 | i = i / 10 + 1; |
876 | 0 | do { |
877 | 0 | *p-- = '0' + i % 10; |
878 | 0 | i /= 10; |
879 | 0 | } while (i > 0); |
880 | 0 | *p = '-'; |
881 | 0 | return p; |
882 | 0 | } |
883 | 0 | } |
884 | 0 | } |
885 | | |
886 | 0 | char *FastHexToBuffer(int i, char* buffer) { |
887 | 0 | GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i; |
888 | |
|
889 | 0 | static const char *hexdigits = "0123456789abcdef"; |
890 | 0 | char *p = buffer + 21; |
891 | 0 | *p-- = '\0'; |
892 | 0 | do { |
893 | 0 | *p-- = hexdigits[i & 15]; // mod by 16 |
894 | 0 | i >>= 4; // divide by 16 |
895 | 0 | } while (i > 0); |
896 | 0 | return p + 1; |
897 | 0 | } |
898 | | |
899 | 0 | char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) { |
900 | 0 | static const char *hexdigits = "0123456789abcdef"; |
901 | 0 | buffer[num_byte] = '\0'; |
902 | 0 | for (int i = num_byte - 1; i >= 0; i--) { |
903 | | #ifdef _M_X64 |
904 | | // MSVC x64 platform has a bug optimizing the uint32(value) in the #else |
905 | | // block. Given that the uint32 cast was to improve performance on 32-bit |
906 | | // platforms, we use 64-bit '&' directly. |
907 | | buffer[i] = hexdigits[value & 0xf]; |
908 | | #else |
909 | 0 | buffer[i] = hexdigits[uint32(value) & 0xf]; |
910 | 0 | #endif |
911 | 0 | value >>= 4; |
912 | 0 | } |
913 | 0 | return buffer; |
914 | 0 | } |
915 | | |
916 | 0 | char *FastHex64ToBuffer(uint64 value, char* buffer) { |
917 | 0 | return InternalFastHexToBuffer(value, buffer, 16); |
918 | 0 | } |
919 | | |
920 | 0 | char *FastHex32ToBuffer(uint32 value, char* buffer) { |
921 | 0 | return InternalFastHexToBuffer(value, buffer, 8); |
922 | 0 | } |
923 | | |
924 | | // ---------------------------------------------------------------------- |
925 | | // FastInt32ToBufferLeft() |
926 | | // FastUInt32ToBufferLeft() |
927 | | // FastInt64ToBufferLeft() |
928 | | // FastUInt64ToBufferLeft() |
929 | | // |
930 | | // Like the Fast*ToBuffer() functions above, these are intended for speed. |
931 | | // Unlike the Fast*ToBuffer() functions, however, these functions write |
932 | | // their output to the beginning of the buffer (hence the name, as the |
933 | | // output is left-aligned). The caller is responsible for ensuring that |
934 | | // the buffer has enough space to hold the output. |
935 | | // |
936 | | // Returns a pointer to the end of the string (i.e. the null character |
937 | | // terminating the string). |
938 | | // ---------------------------------------------------------------------- |
939 | | |
// Lookup table: entry N (0 <= N <= 99) holds the two ASCII characters of N
// written as a zero-padded two-digit decimal number.  Each row below covers
// one value of the tens digit.
static const char two_ASCII_digits[100][2] = {
  {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
  {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
  {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
  {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
  {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
  {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
  {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
  {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
  {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
  {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
};
962 | | |
// Writes the decimal representation of `u` at the start of `buffer`, followed
// by a terminating null, and returns a pointer to that null character.
// The caller must supply a buffer large enough for all digits plus the null.
//
// Control flow: the first `if` body is a ladder that emits two digits per
// step.  Each "ltN" label is the entry point for "u is known to be below N";
// each "subltN" label first subtracts the leading digits that were just
// written and then falls through into "ltN".  The range checks after the
// ladder write an odd leading digit (if any) and jump into the ladder at the
// matching rung.
char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
  uint32 digits;
  const char *ASCII_digits = nullptr;
  // The idea of this implementation is to trim the number of divides to as few
  // as possible by using multiplication and subtraction rather than mod (%),
  // and by outputting two digits at a time rather than one.
  // The huge-number case is first, in the hopes that the compiler will output
  // that case in one branch-free block of code, and only output conditional
  // branches into it from below.
  if (u >= 1000000000) {  // >= 1,000,000,000
    // Ten-digit number: emit the two leading digits.
    digits = u / 100000000;  // 100,000,000
    ASCII_digits = two_ASCII_digits[digits];
    buffer[0] = ASCII_digits[0];
    buffer[1] = ASCII_digits[1];
    buffer += 2;
sublt100_000_000:
    u -= digits * 100000000;  // 100,000,000
lt100_000_000:
    digits = u / 1000000;  // 1,000,000
    ASCII_digits = two_ASCII_digits[digits];
    buffer[0] = ASCII_digits[0];
    buffer[1] = ASCII_digits[1];
    buffer += 2;
sublt1_000_000:
    u -= digits * 1000000;  // 1,000,000
lt1_000_000:
    digits = u / 10000;  // 10,000
    ASCII_digits = two_ASCII_digits[digits];
    buffer[0] = ASCII_digits[0];
    buffer[1] = ASCII_digits[1];
    buffer += 2;
sublt10_000:
    u -= digits * 10000;  // 10,000
lt10_000:
    digits = u / 100;
    ASCII_digits = two_ASCII_digits[digits];
    buffer[0] = ASCII_digits[0];
    buffer[1] = ASCII_digits[1];
    buffer += 2;
sublt100:
    u -= digits * 100;
lt100:
    digits = u;
    ASCII_digits = two_ASCII_digits[digits];
    buffer[0] = ASCII_digits[0];
    buffer[1] = ASCII_digits[1];
    buffer += 2;
done:
    // Terminate the string and return the position of the terminator.
    *buffer = 0;
    return buffer;
  }

  // One or two digits.
  if (u < 100) {
    digits = u;
    if (u >= 10) goto lt100;
    *buffer++ = '0' + digits;
    goto done;
  }
  // Three or four digits.
  if (u  <  10000) {   // 10,000
    if (u >= 1000) goto lt10_000;
    digits = u / 100;
    *buffer++ = '0' + digits;
    goto sublt100;
  }
  // Five or six digits.
  if (u  <  1000000) {   // 1,000,000
    if (u >= 100000) goto lt1_000_000;
    digits = u / 10000;  //    10,000
    *buffer++ = '0' + digits;
    goto sublt10_000;
  }
  // Seven or eight digits.
  if (u  <  100000000) {   // 100,000,000
    if (u >= 10000000) goto lt100_000_000;
    digits = u / 1000000;  //   1,000,000
    *buffer++ = '0' + digits;
    goto sublt1_000_000;
  }
  // Nine digits: write the single leading digit, then join the ladder.
  // we already know that u < 1,000,000,000
  digits = u / 100000000;   // 100,000,000
  *buffer++ = '0' + digits;
  goto sublt100_000_000;
}
1044 | | |
1045 | 0 | char* FastInt32ToBufferLeft(int32 i, char* buffer) { |
1046 | 0 | uint32 u = 0; |
1047 | 0 | if (i < 0) { |
1048 | 0 | *buffer++ = '-'; |
1049 | 0 | u -= i; |
1050 | 0 | } else { |
1051 | 0 | u = i; |
1052 | 0 | } |
1053 | 0 | return FastUInt32ToBufferLeft(u, buffer); |
1054 | 0 | } |
1055 | | |
1056 | 0 | char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) { |
1057 | 0 | int digits; |
1058 | 0 | const char *ASCII_digits = nullptr; |
1059 | |
|
1060 | 0 | uint32 u = static_cast<uint32>(u64); |
1061 | 0 | if (u == u64) return FastUInt32ToBufferLeft(u, buffer); |
1062 | | |
1063 | 0 | uint64 top_11_digits = u64 / 1000000000; |
1064 | 0 | buffer = FastUInt64ToBufferLeft(top_11_digits, buffer); |
1065 | 0 | u = u64 - (top_11_digits * 1000000000); |
1066 | |
|
1067 | 0 | digits = u / 10000000; // 10,000,000 |
1068 | 0 | GOOGLE_DCHECK_LT(digits, 100); |
1069 | 0 | ASCII_digits = two_ASCII_digits[digits]; |
1070 | 0 | buffer[0] = ASCII_digits[0]; |
1071 | 0 | buffer[1] = ASCII_digits[1]; |
1072 | 0 | buffer += 2; |
1073 | 0 | u -= digits * 10000000; // 10,000,000 |
1074 | 0 | digits = u / 100000; // 100,000 |
1075 | 0 | ASCII_digits = two_ASCII_digits[digits]; |
1076 | 0 | buffer[0] = ASCII_digits[0]; |
1077 | 0 | buffer[1] = ASCII_digits[1]; |
1078 | 0 | buffer += 2; |
1079 | 0 | u -= digits * 100000; // 100,000 |
1080 | 0 | digits = u / 1000; // 1,000 |
1081 | 0 | ASCII_digits = two_ASCII_digits[digits]; |
1082 | 0 | buffer[0] = ASCII_digits[0]; |
1083 | 0 | buffer[1] = ASCII_digits[1]; |
1084 | 0 | buffer += 2; |
1085 | 0 | u -= digits * 1000; // 1,000 |
1086 | 0 | digits = u / 10; |
1087 | 0 | ASCII_digits = two_ASCII_digits[digits]; |
1088 | 0 | buffer[0] = ASCII_digits[0]; |
1089 | 0 | buffer[1] = ASCII_digits[1]; |
1090 | 0 | buffer += 2; |
1091 | 0 | u -= digits * 10; |
1092 | 0 | digits = u; |
1093 | 0 | *buffer++ = '0' + digits; |
1094 | 0 | *buffer = 0; |
1095 | 0 | return buffer; |
1096 | 0 | } |
1097 | | |
1098 | 0 | char* FastInt64ToBufferLeft(int64 i, char* buffer) { |
1099 | 0 | uint64 u = 0; |
1100 | 0 | if (i < 0) { |
1101 | 0 | *buffer++ = '-'; |
1102 | 0 | u -= i; |
1103 | 0 | } else { |
1104 | 0 | u = i; |
1105 | 0 | } |
1106 | 0 | return FastUInt64ToBufferLeft(u, buffer); |
1107 | 0 | } |
1108 | | |
1109 | | // ---------------------------------------------------------------------- |
1110 | | // SimpleItoa() |
1111 | | // Description: converts an integer to a string. |
1112 | | // |
1113 | | // Return value: string |
1114 | | // ---------------------------------------------------------------------- |
1115 | | |
1116 | 0 | std::string SimpleItoa(int i) { |
1117 | 0 | char buffer[kFastToBufferSize]; |
1118 | 0 | return (sizeof(i) == 4) ? |
1119 | 0 | FastInt32ToBuffer(i, buffer) : |
1120 | 0 | FastInt64ToBuffer(i, buffer); |
1121 | 0 | } |
1122 | | |
1123 | 0 | std::string SimpleItoa(unsigned int i) { |
1124 | 0 | char buffer[kFastToBufferSize]; |
1125 | 0 | return std::string(buffer, (sizeof(i) == 4) |
1126 | 0 | ? FastUInt32ToBufferLeft(i, buffer) |
1127 | 0 | : FastUInt64ToBufferLeft(i, buffer)); |
1128 | 0 | } |
1129 | | |
1130 | 0 | std::string SimpleItoa(long i) { |
1131 | 0 | char buffer[kFastToBufferSize]; |
1132 | 0 | return (sizeof(i) == 4) ? |
1133 | 0 | FastInt32ToBuffer(i, buffer) : |
1134 | 0 | FastInt64ToBuffer(i, buffer); |
1135 | 0 | } |
1136 | | |
1137 | 0 | std::string SimpleItoa(unsigned long i) { |
1138 | 0 | char buffer[kFastToBufferSize]; |
1139 | 0 | return std::string(buffer, (sizeof(i) == 4) |
1140 | 0 | ? FastUInt32ToBufferLeft(i, buffer) |
1141 | 0 | : FastUInt64ToBufferLeft(i, buffer)); |
1142 | 0 | } |
1143 | | |
1144 | 0 | std::string SimpleItoa(long long i) { |
1145 | 0 | char buffer[kFastToBufferSize]; |
1146 | 0 | return (sizeof(i) == 4) ? |
1147 | 0 | FastInt32ToBuffer(i, buffer) : |
1148 | 0 | FastInt64ToBuffer(i, buffer); |
1149 | 0 | } |
1150 | | |
1151 | 0 | std::string SimpleItoa(unsigned long long i) { |
1152 | 0 | char buffer[kFastToBufferSize]; |
1153 | 0 | return std::string(buffer, (sizeof(i) == 4) |
1154 | 0 | ? FastUInt32ToBufferLeft(i, buffer) |
1155 | 0 | : FastUInt64ToBufferLeft(i, buffer)); |
1156 | 0 | } |
1157 | | |
1158 | | // ---------------------------------------------------------------------- |
1159 | | // SimpleDtoa() |
1160 | | // SimpleFtoa() |
1161 | | // DoubleToBuffer() |
1162 | | // FloatToBuffer() |
1163 | | // We want to print the value without losing precision, but we also do |
1164 | | // not want to print more digits than necessary. This turns out to be |
1165 | | // trickier than it sounds. Numbers like 0.2 cannot be represented |
1166 | | // exactly in binary. If we print 0.2 with a very large precision, |
1167 | | // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". |
1168 | | // On the other hand, if we set the precision too low, we lose |
1169 | | // significant digits when printing numbers that actually need them. |
1170 | | // It turns out there is no precision value that does the right thing |
1171 | | // for all numbers. |
1172 | | // |
1173 | | // Our strategy is to first try printing with a precision that is never |
1174 | | // over-precise, then parse the result with strtod() to see if it |
1175 | | // matches. If not, we print again with a precision that will always |
1176 | | // give a precise result, but may use more digits than necessary. |
1177 | | // |
1178 | | // An arguably better strategy would be to use the algorithm described |
1179 | | // in "How to Print Floating-Point Numbers Accurately" by Steele & |
1180 | | // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, |
1181 | | // however, that the following implementation is about as fast as |
1182 | | // DMG's code. Furthermore, DMG's code locks mutexes, which means it |
1183 | | // will not scale well on multi-core machines. DMG's code is slightly |
1184 | | // more accurate (in that it will never use more digits than |
1185 | | // necessary), but this is probably irrelevant for most users. |
1186 | | // |
1187 | | // Rob Pike and Ken Thompson also have an implementation of dtoa() in |
1188 | | // third_party/fmt/fltfmt.cc. Their implementation is similar to this |
1189 | | // one in that it makes guesses and then uses strtod() to check them. |
1190 | | // Their implementation is faster because they use their own code to |
1191 | | // generate the digits in the first place rather than use snprintf(), |
1192 | | // thus avoiding format string parsing overhead. However, this makes |
1193 | | // it considerably more complicated than the following implementation, |
1194 | | // and it is embedded in a larger library. If speed turns out to be |
1195 | | // an issue, we could re-implement this in terms of their |
1196 | | // implementation. |
1197 | | // ---------------------------------------------------------------------- |
1198 | | |
1199 | 0 | std::string SimpleDtoa(double value) { |
1200 | 0 | char buffer[kDoubleToBufferSize]; |
1201 | 0 | return DoubleToBuffer(value, buffer); |
1202 | 0 | } |
1203 | | |
1204 | 0 | std::string SimpleFtoa(float value) { |
1205 | 0 | char buffer[kFloatToBufferSize]; |
1206 | 0 | return FloatToBuffer(value, buffer); |
1207 | 0 | } |
1208 | | |
// True for the characters a printed number may contain apart from the radix
// point: decimal digits, the exponent markers 'e'/'E' and the signs '+'/'-'.
static inline bool IsValidFloatChar(char c) {
  switch (c) {
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case 'e': case 'E':
    case '+': case '-':
      return true;
    default:
      return false;
  }
}

// Rewrites, in place, a locale-specific radix character in the
// null-terminated number text `buffer` to '.'.
//
// If the text already contains '.', or consists solely of valid float
// characters, it is left untouched.  Otherwise the first character that is
// not a valid float character is taken to be the radix and replaced by '.';
// any further non-float bytes directly following it (the tail of a
// multi-byte radix) are removed by shifting the rest of the string left.
void DelocalizeRadix(char* buffer) {
  // Fast path: a normal decimal point means no translation is needed.
  if (strchr(buffer, '.') != nullptr) return;

  // Locate the first character that cannot be part of a plain number.
  char* radix = buffer;
  while (IsValidFloatChar(*radix)) ++radix;

  // Reached the end: there is no radix character at all.
  if (*radix == '\0') return;

  *radix = '.';
  char* const after_radix = radix + 1;

  // Skip any trailing bytes of a multi-byte radix character.
  char* resume = after_radix;
  while (!IsValidFloatChar(*resume) && *resume != '\0') ++resume;

  if (resume != after_radix) {
    // Close the gap, moving the terminating null along with the text.
    memmove(after_radix, resume, strlen(resume) + 1);
  }
}
1241 | | |
1242 | 0 | char* DoubleToBuffer(double value, char* buffer) { |
1243 | | // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all |
1244 | | // platforms these days. Just in case some system exists where DBL_DIG |
1245 | | // is significantly larger -- and risks overflowing our buffer -- we have |
1246 | | // this assert. |
1247 | 0 | GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big); |
1248 | |
|
1249 | 0 | if (value == std::numeric_limits<double>::infinity()) { |
1250 | 0 | strcpy(buffer, "inf"); |
1251 | 0 | return buffer; |
1252 | 0 | } else if (value == -std::numeric_limits<double>::infinity()) { |
1253 | 0 | strcpy(buffer, "-inf"); |
1254 | 0 | return buffer; |
1255 | 0 | } else if (std::isnan(value)) { |
1256 | 0 | strcpy(buffer, "nan"); |
1257 | 0 | return buffer; |
1258 | 0 | } |
1259 | | |
1260 | 0 | int snprintf_result = |
1261 | 0 | snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value); |
1262 | | |
1263 | | // The snprintf should never overflow because the buffer is significantly |
1264 | | // larger than the precision we asked for. |
1265 | 0 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); |
1266 | | |
1267 | | // We need to make parsed_value volatile in order to force the compiler to |
1268 | | // write it out to the stack. Otherwise, it may keep the value in a |
1269 | | // register, and if it does that, it may keep it as a long double instead |
1270 | | // of a double. This long double may have extra bits that make it compare |
1271 | | // unequal to "value" even though it would be exactly equal if it were |
1272 | | // truncated to a double. |
1273 | 0 | volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr); |
1274 | 0 | if (parsed_value != value) { |
1275 | 0 | int snprintf_result = |
1276 | 0 | snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value); |
1277 | | |
1278 | | // Should never overflow; see above. |
1279 | 0 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); |
1280 | 0 | } |
1281 | |
|
1282 | 0 | DelocalizeRadix(buffer); |
1283 | 0 | return buffer; |
1284 | 0 | } |
1285 | | |
1286 | 0 | static int memcasecmp(const char *s1, const char *s2, size_t len) { |
1287 | 0 | const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1); |
1288 | 0 | const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2); |
1289 | |
|
1290 | 0 | for ( int i = 0; i < len; i++ ) { |
1291 | 0 | const int diff = |
1292 | 0 | static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) - |
1293 | 0 | static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i]))); |
1294 | 0 | if (diff != 0) return diff; |
1295 | 0 | } |
1296 | 0 | return 0; |
1297 | 0 | } |
1298 | | |
1299 | 0 | inline bool CaseEqual(StringPiece s1, StringPiece s2) { |
1300 | 0 | if (s1.size() != s2.size()) return false; |
1301 | 0 | return memcasecmp(s1.data(), s2.data(), s1.size()) == 0; |
1302 | 0 | } |
1303 | | |
1304 | 0 | bool safe_strtob(StringPiece str, bool* value) { |
1305 | 0 | GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given."; |
1306 | 0 | if (CaseEqual(str, "true") || CaseEqual(str, "t") || |
1307 | 0 | CaseEqual(str, "yes") || CaseEqual(str, "y") || |
1308 | 0 | CaseEqual(str, "1")) { |
1309 | 0 | *value = true; |
1310 | 0 | return true; |
1311 | 0 | } |
1312 | 0 | if (CaseEqual(str, "false") || CaseEqual(str, "f") || |
1313 | 0 | CaseEqual(str, "no") || CaseEqual(str, "n") || |
1314 | 0 | CaseEqual(str, "0")) { |
1315 | 0 | *value = false; |
1316 | 0 | return true; |
1317 | 0 | } |
1318 | 0 | return false; |
1319 | 0 | } |
1320 | | |
// Parses the null-terminated string `str` as a float into *value.
//
// Returns true only if `str` is non-empty, the whole string was consumed by
// the conversion, and the conversion did not set errno.  *value is written
// even when false is returned.
bool safe_strtof(const char* str, float* value) {
  char* parse_end;
  errno = 0;  // cleared first: the conversion only sets it on error
#if defined(_WIN32) || defined (__hpux)  // has no strtof()
  *value = internal::NoLocaleStrtod(str, &parse_end);
#else
  *value = strtof(str, &parse_end);
#endif
  if (*str == 0) return false;        // empty input
  if (*parse_end != 0) return false;  // trailing, unparsed characters
  return errno == 0;
}
1331 | | |
1332 | 0 | bool safe_strtod(const char* str, double* value) { |
1333 | 0 | char* endptr; |
1334 | 0 | *value = internal::NoLocaleStrtod(str, &endptr); |
1335 | 0 | if (endptr != str) { |
1336 | 0 | while (ascii_isspace(*endptr)) ++endptr; |
1337 | 0 | } |
1338 | | // Ignore range errors from strtod. The values it |
1339 | | // returns on underflow and overflow are the right |
1340 | | // fallback in a robust setting. |
1341 | 0 | return *str != '\0' && *endptr == '\0'; |
1342 | 0 | } |
1343 | | |
1344 | 0 | bool safe_strto32(const std::string &str, int32 *value) { |
1345 | 0 | return safe_int_internal(str, value); |
1346 | 0 | } |
1347 | | |
1348 | 0 | bool safe_strtou32(const std::string &str, uint32 *value) { |
1349 | 0 | return safe_uint_internal(str, value); |
1350 | 0 | } |
1351 | | |
1352 | 0 | bool safe_strto64(const std::string &str, int64 *value) { |
1353 | 0 | return safe_int_internal(str, value); |
1354 | 0 | } |
1355 | | |
1356 | 0 | bool safe_strtou64(const std::string &str, uint64 *value) { |
1357 | 0 | return safe_uint_internal(str, value); |
1358 | 0 | } |
1359 | | |
1360 | 0 | char* FloatToBuffer(float value, char* buffer) { |
1361 | | // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all |
1362 | | // platforms these days. Just in case some system exists where FLT_DIG |
1363 | | // is significantly larger -- and risks overflowing our buffer -- we have |
1364 | | // this assert. |
1365 | 0 | GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big); |
1366 | |
|
1367 | 0 | if (value == std::numeric_limits<double>::infinity()) { |
1368 | 0 | strcpy(buffer, "inf"); |
1369 | 0 | return buffer; |
1370 | 0 | } else if (value == -std::numeric_limits<double>::infinity()) { |
1371 | 0 | strcpy(buffer, "-inf"); |
1372 | 0 | return buffer; |
1373 | 0 | } else if (std::isnan(value)) { |
1374 | 0 | strcpy(buffer, "nan"); |
1375 | 0 | return buffer; |
1376 | 0 | } |
1377 | | |
1378 | 0 | int snprintf_result = |
1379 | 0 | snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value); |
1380 | | |
1381 | | // The snprintf should never overflow because the buffer is significantly |
1382 | | // larger than the precision we asked for. |
1383 | 0 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); |
1384 | |
|
1385 | 0 | float parsed_value; |
1386 | 0 | if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { |
1387 | 0 | int snprintf_result = |
1388 | 0 | snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value); |
1389 | | |
1390 | | // Should never overflow; see above. |
1391 | 0 | GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); |
1392 | 0 | } |
1393 | |
|
1394 | 0 | DelocalizeRadix(buffer); |
1395 | 0 | return buffer; |
1396 | 0 | } |
1397 | | |
1398 | | namespace strings { |
1399 | | |
1400 | 0 | AlphaNum::AlphaNum(strings::Hex hex) { |
1401 | 0 | char *const end = &digits[kFastToBufferSize]; |
1402 | 0 | char *writer = end; |
1403 | 0 | uint64 value = hex.value; |
1404 | 0 | uint64 width = hex.spec; |
1405 | | // We accomplish minimum width by OR'ing in 0x10000 to the user's value, |
1406 | | // where 0x10000 is the smallest hex number that is as wide as the user |
1407 | | // asked for. |
1408 | 0 | uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value; |
1409 | 0 | static const char hexdigits[] = "0123456789abcdef"; |
1410 | 0 | do { |
1411 | 0 | *--writer = hexdigits[value & 0xF]; |
1412 | 0 | value >>= 4; |
1413 | 0 | mask >>= 4; |
1414 | 0 | } while (mask != 0); |
1415 | 0 | piece_data_ = writer; |
1416 | 0 | piece_size_ = end - writer; |
1417 | 0 | } |
1418 | | |
1419 | | } // namespace strings |
1420 | | |
1421 | | // ---------------------------------------------------------------------- |
1422 | | // StrCat() |
1423 | | // This merges the given strings or integers, with no delimiter. This |
1424 | | // is designed to be the fastest possible way to construct a string out |
1425 | | // of a mix of raw C strings, C++ strings, and integer values. |
1426 | | // ---------------------------------------------------------------------- |
1427 | | |
1428 | | // Append is merely a version of memcpy that returns the address of the byte |
1429 | | // after the area just overwritten. It comes in multiple flavors to minimize |
1430 | | // call overhead. |
1431 | 0 | static char *Append1(char *out, const AlphaNum &x) { |
1432 | 0 | if (x.size() > 0) { |
1433 | 0 | memcpy(out, x.data(), x.size()); |
1434 | 0 | out += x.size(); |
1435 | 0 | } |
1436 | 0 | return out; |
1437 | 0 | } |
1438 | | |
1439 | 0 | static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { |
1440 | 0 | if (x1.size() > 0) { |
1441 | 0 | memcpy(out, x1.data(), x1.size()); |
1442 | 0 | out += x1.size(); |
1443 | 0 | } |
1444 | 0 | if (x2.size() > 0) { |
1445 | 0 | memcpy(out, x2.data(), x2.size()); |
1446 | 0 | out += x2.size(); |
1447 | 0 | } |
1448 | 0 | return out; |
1449 | 0 | } |
1450 | | |
1451 | | static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2, |
1452 | 0 | const AlphaNum &x3, const AlphaNum &x4) { |
1453 | 0 | if (x1.size() > 0) { |
1454 | 0 | memcpy(out, x1.data(), x1.size()); |
1455 | 0 | out += x1.size(); |
1456 | 0 | } |
1457 | 0 | if (x2.size() > 0) { |
1458 | 0 | memcpy(out, x2.data(), x2.size()); |
1459 | 0 | out += x2.size(); |
1460 | 0 | } |
1461 | 0 | if (x3.size() > 0) { |
1462 | 0 | memcpy(out, x3.data(), x3.size()); |
1463 | 0 | out += x3.size(); |
1464 | 0 | } |
1465 | 0 | if (x4.size() > 0) { |
1466 | 0 | memcpy(out, x4.data(), x4.size()); |
1467 | 0 | out += x4.size(); |
1468 | 0 | } |
1469 | 0 | return out; |
1470 | 0 | } |
1471 | | |
1472 | 0 | std::string StrCat(const AlphaNum &a, const AlphaNum &b) { |
1473 | 0 | std::string result; |
1474 | 0 | result.resize(a.size() + b.size()); |
1475 | 0 | char *const begin = &*result.begin(); |
1476 | 0 | char *out = Append2(begin, a, b); |
1477 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1478 | 0 | return result; |
1479 | 0 | } |
1480 | | |
1481 | 0 | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { |
1482 | 0 | std::string result; |
1483 | 0 | result.resize(a.size() + b.size() + c.size()); |
1484 | 0 | char *const begin = &*result.begin(); |
1485 | 0 | char *out = Append2(begin, a, b); |
1486 | 0 | out = Append1(out, c); |
1487 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1488 | 0 | return result; |
1489 | 0 | } |
1490 | | |
1491 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1492 | 0 | const AlphaNum &d) { |
1493 | 0 | std::string result; |
1494 | 0 | result.resize(a.size() + b.size() + c.size() + d.size()); |
1495 | 0 | char *const begin = &*result.begin(); |
1496 | 0 | char *out = Append4(begin, a, b, c, d); |
1497 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1498 | 0 | return result; |
1499 | 0 | } |
1500 | | |
1501 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1502 | 0 | const AlphaNum &d, const AlphaNum &e) { |
1503 | 0 | std::string result; |
1504 | 0 | result.resize(a.size() + b.size() + c.size() + d.size() + e.size()); |
1505 | 0 | char *const begin = &*result.begin(); |
1506 | 0 | char *out = Append4(begin, a, b, c, d); |
1507 | 0 | out = Append1(out, e); |
1508 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1509 | 0 | return result; |
1510 | 0 | } |
1511 | | |
1512 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1513 | 0 | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { |
1514 | 0 | std::string result; |
1515 | 0 | result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + |
1516 | 0 | f.size()); |
1517 | 0 | char *const begin = &*result.begin(); |
1518 | 0 | char *out = Append4(begin, a, b, c, d); |
1519 | 0 | out = Append2(out, e, f); |
1520 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1521 | 0 | return result; |
1522 | 0 | } |
1523 | | |
1524 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1525 | | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1526 | 0 | const AlphaNum &g) { |
1527 | 0 | std::string result; |
1528 | 0 | result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + |
1529 | 0 | f.size() + g.size()); |
1530 | 0 | char *const begin = &*result.begin(); |
1531 | 0 | char *out = Append4(begin, a, b, c, d); |
1532 | 0 | out = Append2(out, e, f); |
1533 | 0 | out = Append1(out, g); |
1534 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1535 | 0 | return result; |
1536 | 0 | } |
1537 | | |
1538 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1539 | | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1540 | 0 | const AlphaNum &g, const AlphaNum &h) { |
1541 | 0 | std::string result; |
1542 | 0 | result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + |
1543 | 0 | f.size() + g.size() + h.size()); |
1544 | 0 | char *const begin = &*result.begin(); |
1545 | 0 | char *out = Append4(begin, a, b, c, d); |
1546 | 0 | out = Append4(out, e, f, g, h); |
1547 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1548 | 0 | return result; |
1549 | 0 | } |
1550 | | |
1551 | | std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, |
1552 | | const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, |
1553 | 0 | const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { |
1554 | 0 | std::string result; |
1555 | 0 | result.resize(a.size() + b.size() + c.size() + d.size() + e.size() + |
1556 | 0 | f.size() + g.size() + h.size() + i.size()); |
1557 | 0 | char *const begin = &*result.begin(); |
1558 | 0 | char *out = Append4(begin, a, b, c, d); |
1559 | 0 | out = Append4(out, e, f, g, h); |
1560 | 0 | out = Append1(out, i); |
1561 | 0 | GOOGLE_DCHECK_EQ(out, begin + result.size()); |
1562 | 0 | return result; |
1563 | 0 | } |
1564 | | |
// It's possible to call StrAppend with a char * pointer that is partway into
// the string we're appending to.  However the results of this are
// unpredictable.  Therefore, check for this in debug mode.
//
// The check uses unsigned math so that only one comparison is needed: the
// offset of src from the start of dest is converted to uintptr_t, so a src
// that lies before dest wraps around to a very large value and passes, while
// a src that starts anywhere inside dest (offset <= dest.size()) fails.
#define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
    GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
                     uintptr_t((dest).size()))
1572 | | |
1573 | 0 | void StrAppend(std::string *result, const AlphaNum &a) { |
1574 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1575 | 0 | result->append(a.data(), a.size()); |
1576 | 0 | } |
1577 | | |
1578 | 0 | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) { |
1579 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1580 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1581 | 0 | std::string::size_type old_size = result->size(); |
1582 | 0 | result->resize(old_size + a.size() + b.size()); |
1583 | 0 | char *const begin = &*result->begin(); |
1584 | 0 | char *out = Append2(begin + old_size, a, b); |
1585 | 0 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1586 | 0 | } |
1587 | | |
1588 | | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, |
1589 | 0 | const AlphaNum &c) { |
1590 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1591 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1592 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, c); |
1593 | 0 | std::string::size_type old_size = result->size(); |
1594 | 0 | result->resize(old_size + a.size() + b.size() + c.size()); |
1595 | 0 | char *const begin = &*result->begin(); |
1596 | 0 | char *out = Append2(begin + old_size, a, b); |
1597 | 0 | out = Append1(out, c); |
1598 | 0 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1599 | 0 | } |
1600 | | |
1601 | | void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b, |
1602 | 0 | const AlphaNum &c, const AlphaNum &d) { |
1603 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, a); |
1604 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, b); |
1605 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, c); |
1606 | 0 | GOOGLE_DCHECK_NO_OVERLAP(*result, d); |
1607 | 0 | std::string::size_type old_size = result->size(); |
1608 | 0 | result->resize(old_size + a.size() + b.size() + c.size() + d.size()); |
1609 | 0 | char *const begin = &*result->begin(); |
1610 | 0 | char *out = Append4(begin + old_size, a, b, c, d); |
1611 | 0 | GOOGLE_DCHECK_EQ(out, begin + result->size()); |
1612 | 0 | } |
1613 | | |
1614 | | int GlobalReplaceSubstring(const std::string &substring, |
1615 | 0 | const std::string &replacement, std::string *s) { |
1616 | 0 | GOOGLE_CHECK(s != nullptr); |
1617 | 0 | if (s->empty() || substring.empty()) |
1618 | 0 | return 0; |
1619 | 0 | std::string tmp; |
1620 | 0 | int num_replacements = 0; |
1621 | 0 | int pos = 0; |
1622 | 0 | for (int match_pos = s->find(substring.data(), pos, substring.length()); |
1623 | 0 | match_pos != std::string::npos; pos = match_pos + substring.length(), |
1624 | 0 | match_pos = s->find(substring.data(), pos, substring.length())) { |
1625 | 0 | ++num_replacements; |
1626 | | // Append the original content before the match. |
1627 | 0 | tmp.append(*s, pos, match_pos - pos); |
1628 | | // Append the replacement for the match. |
1629 | 0 | tmp.append(replacement.begin(), replacement.end()); |
1630 | 0 | } |
1631 | | // Append the content after the last match. If no replacements were made, the |
1632 | | // original string is left untouched. |
1633 | 0 | if (num_replacements > 0) { |
1634 | 0 | tmp.append(*s, pos, s->length() - pos); |
1635 | 0 | s->swap(tmp); |
1636 | 0 | } |
1637 | 0 | return num_replacements; |
1638 | 0 | } |
1639 | | |
// Returns the number of characters needed to base64-encode |input_len| bytes.
//
// Base64 turns every complete group of three input bytes into four output
// characters.  A trailing partial group is handled as described in
// http://tools.ietf.org/html/rfc3548:
//   * one leftover byte  -> two characters, plus two "=" when padding;
//   * two leftover bytes -> three characters, plus one "=" when padding.
// With |do_padding| the result is therefore always a multiple of four.
int CalculateBase64EscapedLen(int input_len, bool do_padding) {
  const int full_groups = input_len / 3;
  const int leftover = input_len % 3;

  int len = full_groups * 4;
  switch (leftover) {
    case 0:
      // Input is a whole number of 24-bit quanta; nothing to add.
      break;
    case 1:
      // Final quantum is exactly 8 bits.
      len += do_padding ? 4 : 2;
      break;
    default:
      // Final quantum is exactly 16 bits.
      len += do_padding ? 4 : 3;
      break;
  }

  assert(len >= input_len);  // make sure we didn't overflow
  return len;
}

// Base64Escape does padding, so this calculation includes padding.
int CalculateBase64EscapedLen(int input_len) {
  const bool kWithPadding = true;
  return CalculateBase64EscapedLen(input_len, kWithPadding);
}
1691 | | |
1692 | | // ---------------------------------------------------------------------- |
1693 | | // int Base64Unescape() - base64 decoder |
1694 | | // int Base64Escape() - base64 encoder |
1695 | | // int WebSafeBase64Unescape() - Google's variation of base64 decoder |
1696 | | // int WebSafeBase64Escape() - Google's variation of base64 encoder |
1697 | | // |
1698 | | // Check out |
1699 | | // http://tools.ietf.org/html/rfc2045 for formal description, but what we |
1700 | | // care about is that... |
1701 | | // Take the encoded stuff in groups of 4 characters and turn each |
1702 | | // character into a code 0 to 63 thus: |
1703 | | // A-Z map to 0 to 25 |
1704 | | // a-z map to 26 to 51 |
1705 | | // 0-9 map to 52 to 61 |
1706 | | // +(- for WebSafe) maps to 62 |
1707 | | // /(_ for WebSafe) maps to 63 |
1708 | | // There will be four numbers, all less than 64 which can be represented |
1709 | | // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). |
1710 | | // Arrange the 6 digit binary numbers into three bytes as such: |
1711 | | // aaaaaabb bbbbcccc ccdddddd |
1712 | | // Equals signs (one or two) are used at the end of the encoded block to |
1713 | | // indicate that the text was not an integer multiple of three bytes long. |
1714 | | // ---------------------------------------------------------------------- |
1715 | | |
// Decodes up to |szsrc| bytes of base64 text starting at |src_param|, using
// |unbase64| as the byte -> 6-bit-value lookup table (negative entries mark
// bytes that are not part of the alphabet).
//
// If |dest| is non-null, decoded bytes are written to it; |szdest| is its
// capacity.  If |dest| is null, nothing is written and the input is only
// validated and measured.
//
// Returns the number of decoded bytes (or the number that would have been
// produced when |dest| is null), or -1 if the input is malformed or |dest| is
// too small.  Whitespace is skipped, and both '=' and '.' are accepted as pad
// characters.
int Base64UnescapeInternal(const char *src_param, int szsrc,
                           char *dest, int szdest,
                           const signed char* unbase64) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  int decode = 0;         // table value of the most recently read character
  int destidx = 0;        // number of output bytes produced so far
  int state = 0;          // number of data characters buffered in temp (0-3)
  unsigned int ch = 0;    // most recently read input character
  unsigned int temp = 0;  // accumulator for the 6-bit groups being assembled

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // Note that the 'break' inside the macro leaves the enclosing while loop,
  // with 'state' recording how many data characters were already consumed.
#define GET_INPUT(label, remain)                 \
  label:                                         \
    --szsrc;                                     \
    ch = *src++;                                 \
    decode = unbase64[ch];                       \
    if (decode < 0) {                            \
      if (ascii_isspace(ch) && szsrc >= remain)  \
        goto label;                              \
      state = 4 - remain;                        \
      break;                                     \
    }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4)  {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          (temp = ((unsigned(unbase64[src[0]]) << 18) |
                   (unsigned(unbase64[src[1]]) << 12) |
                   (unsigned(unbase64[src[2]]) << 6) |
                   (unsigned(unbase64[src[3]])))) & 0x80000000) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = decode;
        GET_INPUT(second, 3);
        temp = (temp << 6) | decode;
        GET_INPUT(third, 2);
        temp = (temp << 6) | decode;
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | decode;
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
        // Reset these so that the "bad character" test after the loops does
        // not see stale values from an earlier slow-path read.
        decode = -1;
        ch = '\0';
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx+3 > szdest) return -1;
      dest[destidx+2] = temp;
      temp >>= 8;
      dest[destidx+1] = temp;
      temp >>= 8;
      dest[destidx] = temp;
      destidx += 3;
    }
  } else {
    // Validation-only copy of the loop above: same input handling, but the
    // output is only counted, never stored.
    while (szsrc >= 4)  {
      if (!src[0] || !src[1] || !src[2] ||
          (temp = ((unsigned(unbase64[src[0]]) << 18) |
                   (unsigned(unbase64[src[1]]) << 12) |
                   (unsigned(unbase64[src[2]]) << 6) |
                   (unsigned(unbase64[src[3]])))) & 0x80000000) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
        decode = -1;
        ch = '\0';
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != '\0' &&
      ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
    return -1;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0)  {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (ascii_isspace(ch)) {
          continue;
        } else if (ch == '\0') {
          break;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return -1;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | decode;
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx+3 > szdest) return -1;
          dest[destidx+2] = temp;
          temp >>= 8;
          dest[destidx+1] = temp;
          temp >>= 8;
          dest[destidx] = temp;
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return -1;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx+1 > szdest) return -1;
        temp >>= 4;
        dest[destidx] = temp;
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx+2 > szdest) return -1;
        temp >>= 2;
        dest[destidx+1] = temp;
        temp >>= 8;
        dest[destidx] = temp;
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is a google extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0 && *src) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!ascii_isspace(*src))
      return -1;
    --szsrc;
    ++src;
  }

  return (equals == 0 || equals == expected_equals) ? destidx : -1;
}
1950 | | |
1951 | | // The arrays below were generated by the following code |
1952 | | // #include <sys/time.h> |
1953 | | // #include <stdlib.h> |
1954 | | // #include <string.h> |
1955 | | // #include <stdio.h> |
1956 | | // main() |
1957 | | // { |
1958 | | // static const char Base64[] = |
1959 | | // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; |
1960 | | // const char *pos; |
1961 | | // int idx, i, j; |
1962 | | // printf(" "); |
1963 | | // for (i = 0; i < 255; i += 8) { |
1964 | | // for (j = i; j < i + 8; j++) { |
1965 | | // pos = strchr(Base64, j); |
1966 | | // if ((pos == nullptr) || (j == 0)) |
1967 | | // idx = -1; |
1968 | | // else |
1969 | | // idx = pos - Base64; |
1970 | | // if (idx == -1) |
1971 | | // printf(" %2d, ", idx); |
1972 | | // else |
1973 | | // printf(" %2d/""*%c*""/,", idx, j); |
1974 | | // } |
1975 | | // printf("\n "); |
1976 | | // } |
1977 | | // } |
1978 | | // |
1979 | | // where the value of "Base64[]" was replaced by one of the base-64 conversion |
1980 | | // tables from the functions below. |
// Decode table for the standard base64 alphabet ('+' -> 62, '/' -> 63).
// Indexed by byte value (256 entries); each entry is the 6-bit value of that
// character, or -1 if the byte is not part of the alphabet.
static const signed char kUnBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
  07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
// Decode table for the web-safe base64 alphabet ('-' -> 62, '_' -> 63).
// Indexed by byte value (256 entries); each entry is the 6-bit value of that
// character, or -1 if the byte is not part of the alphabet.
static const signed char kUnWebSafeBase64[] = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
  07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
2049 | | |
2050 | 0 | int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { |
2051 | 0 | return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64); |
2052 | 0 | } |
2053 | | |
2054 | | static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest, |
2055 | 0 | const signed char *unbase64) { |
2056 | | // Determine the size of the output string. Base64 encodes every 3 bytes into |
2057 | | // 4 characters. any leftover chars are added directly for good measure. |
2058 | | // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548 |
2059 | 0 | const int dest_len = 3 * (slen / 4) + (slen % 4); |
2060 | |
|
2061 | 0 | dest->resize(dest_len); |
2062 | | |
2063 | | // We are getting the destination buffer by getting the beginning of the |
2064 | | // string and converting it into a char *. |
2065 | 0 | const int len = Base64UnescapeInternal(src, slen, string_as_array(dest), |
2066 | 0 | dest_len, unbase64); |
2067 | 0 | if (len < 0) { |
2068 | 0 | dest->clear(); |
2069 | 0 | return false; |
2070 | 0 | } |
2071 | | |
2072 | | // could be shorter if there was padding |
2073 | 0 | GOOGLE_DCHECK_LE(len, dest_len); |
2074 | 0 | dest->erase(len); |
2075 | |
|
2076 | 0 | return true; |
2077 | 0 | } |
2078 | | |
2079 | 0 | bool Base64Unescape(StringPiece src, std::string *dest) { |
2080 | 0 | return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64); |
2081 | 0 | } |
2082 | | |
2083 | 0 | bool WebSafeBase64Unescape(StringPiece src, std::string *dest) { |
2084 | 0 | return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64); |
2085 | 0 | } |
2086 | | |
2087 | | int Base64EscapeInternal(const unsigned char *src, int szsrc, |
2088 | | char *dest, int szdest, const char *base64, |
2089 | 0 | bool do_padding) { |
2090 | 0 | static const char kPad64 = '='; |
2091 | |
|
2092 | 0 | if (szsrc <= 0) return 0; |
2093 | | |
2094 | 0 | if (szsrc * 4 > szdest * 3) return 0; |
2095 | | |
2096 | 0 | char *cur_dest = dest; |
2097 | 0 | const unsigned char *cur_src = src; |
2098 | |
|
2099 | 0 | char *limit_dest = dest + szdest; |
2100 | 0 | const unsigned char *limit_src = src + szsrc; |
2101 | | |
2102 | | // Three bytes of data encodes to four characters of cyphertext. |
2103 | | // So we can pump through three-byte chunks atomically. |
2104 | 0 | while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits |
2105 | 0 | uint32 in = BigEndian::Load32(cur_src) >> 8; |
2106 | |
|
2107 | 0 | cur_dest[0] = base64[in >> 18]; |
2108 | 0 | in &= 0x3FFFF; |
2109 | 0 | cur_dest[1] = base64[in >> 12]; |
2110 | 0 | in &= 0xFFF; |
2111 | 0 | cur_dest[2] = base64[in >> 6]; |
2112 | 0 | in &= 0x3F; |
2113 | 0 | cur_dest[3] = base64[in]; |
2114 | |
|
2115 | 0 | cur_dest += 4; |
2116 | 0 | cur_src += 3; |
2117 | 0 | } |
2118 | | // To save time, we didn't update szdest or szsrc in the loop. So do it now. |
2119 | 0 | szdest = limit_dest - cur_dest; |
2120 | 0 | szsrc = limit_src - cur_src; |
2121 | | |
2122 | | /* now deal with the tail (<=3 bytes) */ |
2123 | 0 | switch (szsrc) { |
2124 | 0 | case 0: |
2125 | | // Nothing left; nothing more to do. |
2126 | 0 | break; |
2127 | 0 | case 1: { |
2128 | | // One byte left: this encodes to two characters, and (optionally) |
2129 | | // two pad characters to round out the four-character cypherblock. |
2130 | 0 | if ((szdest -= 2) < 0) return 0; |
2131 | 0 | uint32 in = cur_src[0]; |
2132 | 0 | cur_dest[0] = base64[in >> 2]; |
2133 | 0 | in &= 0x3; |
2134 | 0 | cur_dest[1] = base64[in << 4]; |
2135 | 0 | cur_dest += 2; |
2136 | 0 | if (do_padding) { |
2137 | 0 | if ((szdest -= 2) < 0) return 0; |
2138 | 0 | cur_dest[0] = kPad64; |
2139 | 0 | cur_dest[1] = kPad64; |
2140 | 0 | cur_dest += 2; |
2141 | 0 | } |
2142 | 0 | break; |
2143 | 0 | } |
2144 | 0 | case 2: { |
2145 | | // Two bytes left: this encodes to three characters, and (optionally) |
2146 | | // one pad character to round out the four-character cypherblock. |
2147 | 0 | if ((szdest -= 3) < 0) return 0; |
2148 | 0 | uint32 in = BigEndian::Load16(cur_src); |
2149 | 0 | cur_dest[0] = base64[in >> 10]; |
2150 | 0 | in &= 0x3FF; |
2151 | 0 | cur_dest[1] = base64[in >> 4]; |
2152 | 0 | in &= 0x00F; |
2153 | 0 | cur_dest[2] = base64[in << 2]; |
2154 | 0 | cur_dest += 3; |
2155 | 0 | if (do_padding) { |
2156 | 0 | if ((szdest -= 1) < 0) return 0; |
2157 | 0 | cur_dest[0] = kPad64; |
2158 | 0 | cur_dest += 1; |
2159 | 0 | } |
2160 | 0 | break; |
2161 | 0 | } |
2162 | 0 | case 3: { |
2163 | | // Three bytes left: same as in the big loop above. We can't do this in |
2164 | | // the loop because the loop above always reads 4 bytes, and the fourth |
2165 | | // byte is past the end of the input. |
2166 | 0 | if ((szdest -= 4) < 0) return 0; |
2167 | 0 | uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1); |
2168 | 0 | cur_dest[0] = base64[in >> 18]; |
2169 | 0 | in &= 0x3FFFF; |
2170 | 0 | cur_dest[1] = base64[in >> 12]; |
2171 | 0 | in &= 0xFFF; |
2172 | 0 | cur_dest[2] = base64[in >> 6]; |
2173 | 0 | in &= 0x3F; |
2174 | 0 | cur_dest[3] = base64[in]; |
2175 | 0 | cur_dest += 4; |
2176 | 0 | break; |
2177 | 0 | } |
2178 | 0 | default: |
2179 | | // Should not be reached: blocks of 4 bytes are handled |
2180 | | // in the while loop before this switch statement. |
2181 | 0 | GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc; |
2182 | 0 | break; |
2183 | 0 | } |
2184 | 0 | return (cur_dest - dest); |
2185 | 0 | } |
2186 | | |
// Standard base64 alphabet: the character at index i encodes the 6-bit value
// i.  Values 62 and 63 map to '+' and '/'.
static const char kBase64Chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2189 | | |
// Web-safe base64 alphabet: identical to the standard one except that values
// 62 and 63 map to '-' and '_'.
static const char kWebSafeBase64Chars[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2192 | | |
2193 | 0 | int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { |
2194 | 0 | return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true); |
2195 | 0 | } |
2196 | | int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest, |
2197 | 0 | int szdest, bool do_padding) { |
2198 | 0 | return Base64EscapeInternal(src, szsrc, dest, szdest, |
2199 | 0 | kWebSafeBase64Chars, do_padding); |
2200 | 0 | } |
2201 | | |
2202 | | void Base64EscapeInternal(const unsigned char *src, int szsrc, |
2203 | | std::string *dest, bool do_padding, |
2204 | 0 | const char *base64_chars) { |
2205 | 0 | const int calc_escaped_size = |
2206 | 0 | CalculateBase64EscapedLen(szsrc, do_padding); |
2207 | 0 | dest->resize(calc_escaped_size); |
2208 | 0 | const int escaped_len = Base64EscapeInternal(src, szsrc, |
2209 | 0 | string_as_array(dest), |
2210 | 0 | dest->size(), |
2211 | 0 | base64_chars, |
2212 | 0 | do_padding); |
2213 | 0 | GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len); |
2214 | 0 | dest->erase(escaped_len); |
2215 | 0 | } |
2216 | | |
2217 | | void Base64Escape(const unsigned char *src, int szsrc, std::string *dest, |
2218 | 0 | bool do_padding) { |
2219 | 0 | Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars); |
2220 | 0 | } |
2221 | | |
2222 | | void WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest, |
2223 | 0 | bool do_padding) { |
2224 | 0 | Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars); |
2225 | 0 | } |
2226 | | |
2227 | 0 | void Base64Escape(StringPiece src, std::string *dest) { |
2228 | 0 | Base64Escape(reinterpret_cast<const unsigned char*>(src.data()), |
2229 | 0 | src.size(), dest, true); |
2230 | 0 | } |
2231 | | |
2232 | 0 | void WebSafeBase64Escape(StringPiece src, std::string *dest) { |
2233 | 0 | WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), |
2234 | 0 | src.size(), dest, false); |
2235 | 0 | } |
2236 | | |
2237 | 0 | void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) { |
2238 | 0 | WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), |
2239 | 0 | src.size(), dest, true); |
2240 | 0 | } |
2241 | | |
2242 | | // Helper to append a Unicode code point to a string as UTF8, without bringing |
2243 | | // in any external dependencies. |
2244 | 0 | int EncodeAsUTF8Char(uint32 code_point, char* output) { |
2245 | 0 | uint32 tmp = 0; |
2246 | 0 | int len = 0; |
2247 | 0 | if (code_point <= 0x7f) { |
2248 | 0 | tmp = code_point; |
2249 | 0 | len = 1; |
2250 | 0 | } else if (code_point <= 0x07ff) { |
2251 | 0 | tmp = 0x0000c080 | |
2252 | 0 | ((code_point & 0x07c0) << 2) | |
2253 | 0 | (code_point & 0x003f); |
2254 | 0 | len = 2; |
2255 | 0 | } else if (code_point <= 0xffff) { |
2256 | 0 | tmp = 0x00e08080 | |
2257 | 0 | ((code_point & 0xf000) << 4) | |
2258 | 0 | ((code_point & 0x0fc0) << 2) | |
2259 | 0 | (code_point & 0x003f); |
2260 | 0 | len = 3; |
2261 | 0 | } else { |
2262 | | // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is |
2263 | | // normally only defined up to there as well. |
2264 | 0 | tmp = 0xf0808080 | |
2265 | 0 | ((code_point & 0x1c0000) << 6) | |
2266 | 0 | ((code_point & 0x03f000) << 4) | |
2267 | 0 | ((code_point & 0x000fc0) << 2) | |
2268 | 0 | (code_point & 0x003f); |
2269 | 0 | len = 4; |
2270 | 0 | } |
2271 | 0 | tmp = ghtonl(tmp); |
2272 | 0 | memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len); |
2273 | 0 | return len; |
2274 | 0 | } |
2275 | | |
// Table of UTF-8 character lengths, indexed by the value of the first byte.
// Laid out 16 entries per row; the trailing comment gives the row's first
// index.  Bytes that do not start a 2-, 3- or 4-byte sequence map to 1.
static const unsigned char kUTF8LenTbl[256] = {
    // 0x00 - 0xBF: length 1.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x70

    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0
    // 0xC0 - 0xDF: length 2.
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0
    // 0xE0 - 0xEF: length 3.
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0
    // 0xF0 - 0xF4: length 4; 0xF5 - 0xFF: length 1.
    4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; // 0xF0
2291 | | |
2292 | | // Return length of a single UTF-8 source character |
2293 | 0 | int UTF8FirstLetterNumBytes(const char* src, int len) { |
2294 | 0 | if (len == 0) { |
2295 | 0 | return 0; |
2296 | 0 | } |
2297 | 0 | return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; |
2298 | 0 | } |
2299 | | |
2300 | | // ---------------------------------------------------------------------- |
2301 | | // CleanStringLineEndings() |
2302 | | // Clean up a multi-line string to conform to Unix line endings. |
2303 | | // Reads from src and appends to dst, so usually dst should be empty. |
2304 | | // |
2305 | | // If there is no line ending at the end of a non-empty string, it can |
2306 | | // be added automatically. |
2307 | | // |
2308 | | // Four different types of input are correctly handled: |
2309 | | // |
2310 | | // - Unix/Linux files: line ending is LF: pass through unchanged |
2311 | | // |
2312 | | // - DOS/Windows files: line ending is CRLF: convert to LF |
2313 | | // |
2314 | | // - Legacy Mac files: line ending is CR: convert to LF |
2315 | | // |
2316 | | // - Garbled files: random line endings: convert gracefully |
2317 | | // lonely CR, lonely LF, CRLF: convert to LF |
2318 | | // |
2319 | | // @param src The multi-line string to convert |
2320 | | // @param dst The converted string is appended to this string |
2321 | | // @param auto_end_last_line Automatically terminate the last line |
2322 | | // |
2323 | | // Limitations: |
2324 | | // |
//     This does not do the right thing for CRCRLF files created by
//     broken programs that do another Unix->DOS conversion on files
//     that are already in CRLF format.  Handling those would require a
//     brute-force two-pass approach that
//
//       (1) determines whether any LF is present (finding the first is enough)
//       (2) if so, removes every CR; otherwise converts every CR to LF
2332 | | |
2333 | | void CleanStringLineEndings(const std::string &src, std::string *dst, |
2334 | 0 | bool auto_end_last_line) { |
2335 | 0 | if (dst->empty()) { |
2336 | 0 | dst->append(src); |
2337 | 0 | CleanStringLineEndings(dst, auto_end_last_line); |
2338 | 0 | } else { |
2339 | 0 | std::string tmp = src; |
2340 | 0 | CleanStringLineEndings(&tmp, auto_end_last_line); |
2341 | 0 | dst->append(tmp); |
2342 | 0 | } |
2343 | 0 | } |
2344 | | |
// In-place variant: rewrites *str so that CR, LF and CRLF line endings each
// become a single LF.  The string is scanned once with a read cursor
// (input_pos) and a write cursor (output_pos) over the same buffer, then
// resized to the final length.
//
// @param str                 The string to clean in place
// @param auto_end_last_line  If true, a non-empty result that does not end
//                            in LF gets one appended
void CleanStringLineEndings(std::string *str, bool auto_end_last_line) {
  ptrdiff_t output_pos = 0;  // Next index to write.
  bool r_seen = false;       // True while a CR has been read but not emitted.
  ptrdiff_t len = str->size();

  char *p = &(*str)[0];

  for (ptrdiff_t input_pos = 0; input_pos < len;) {
    // Fast path: only taken when no CR is pending and more than 8 bytes
    // remain, so the 8-byte load stays inside the string.
    if (!r_seen && input_pos + 8 < len) {
      uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
      // Loop over groups of 8 bytes at a time until we come across
      // a word that has a byte whose value is less than or equal to
      // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
      //
      // We use a has_less macro that quickly tests a whole 64-bit
      // word to see if any of the bytes has a value < N.
      //
      // For more details, see:
      //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
#define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
      if (!has_less(v, '\r' + 1)) {
#undef has_less
        // No byte in this word has a value that could be a \r or a \n
        // Skip the store when the word is already in its final position.
        if (output_pos != input_pos) {
          GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
        }
        input_pos += 8;
        output_pos += 8;
        continue;
      }
    }
    // Slow path: handle one byte at a time.
    std::string::const_reference in = p[input_pos];
    if (in == '\r') {
      // A CR is held back until the next byte is known; a second CR in a
      // row means the previous one was a line ending of its own.
      if (r_seen) p[output_pos++] = '\n';
      r_seen = true;
    } else if (in == '\n') {
      // Covers both a lone LF and the LF of a CRLF pair: one LF is emitted
      // and any pending CR is dropped.
      if (input_pos != output_pos)
        p[output_pos++] = '\n';
      else
        output_pos++;
      r_seen = false;
    } else {
      // Ordinary byte: a pending CR was a lone CR, so emit its LF first.
      if (r_seen) p[output_pos++] = '\n';
      r_seen = false;
      if (input_pos != output_pos)
        p[output_pos++] = in;
      else
        output_pos++;
    }
    input_pos++;
  }
  // A trailing CR still owes an LF; otherwise optionally terminate a
  // non-empty last line that lacks one.
  if (r_seen ||
      (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
    str->resize(output_pos + 1);
    // Index through the string rather than p, which predates the resize.
    str->operator[](output_pos) = '\n';
  } else if (output_pos < len) {
    // Bytes were dropped: trim the leftover tail.
    str->resize(output_pos);
  }
}
2404 | | |
2405 | | namespace internal { |
2406 | | |
2407 | | // ---------------------------------------------------------------------- |
2408 | | // NoLocaleStrtod() |
2409 | | // This code will make you cry. |
2410 | | // ---------------------------------------------------------------------- |
2411 | | |
2412 | | namespace { |
2413 | | |
2414 | | // Returns a string identical to *input except that the character pointed to |
2415 | | // by radix_pos (which should be '.') is replaced with the locale-specific |
2416 | | // radix character. |
2417 | 0 | std::string LocalizeRadix(const char *input, const char *radix_pos) { |
2418 | | // Determine the locale-specific radix character by calling sprintf() to |
2419 | | // print the number 1.5, then stripping off the digits. As far as I can |
2420 | | // tell, this is the only portable, thread-safe way to get the C library |
2421 | | // to divuldge the locale's radix character. No, localeconv() is NOT |
2422 | | // thread-safe. |
2423 | 0 | char temp[16]; |
2424 | 0 | int size = snprintf(temp, sizeof(temp), "%.1f", 1.5); |
2425 | 0 | GOOGLE_CHECK_EQ(temp[0], '1'); |
2426 | 0 | GOOGLE_CHECK_EQ(temp[size - 1], '5'); |
2427 | 0 | GOOGLE_CHECK_LE(size, 6); |
2428 | | |
2429 | | // Now replace the '.' in the input with it. |
2430 | 0 | std::string result; |
2431 | 0 | result.reserve(strlen(input) + size - 3); |
2432 | 0 | result.append(input, radix_pos); |
2433 | 0 | result.append(temp + 1, size - 2); |
2434 | 0 | result.append(radix_pos + 1); |
2435 | 0 | return result; |
2436 | 0 | } |
2437 | | |
2438 | | } // namespace |
2439 | | |
2440 | 0 | double NoLocaleStrtod(const char *str, char **endptr) { |
2441 | | // We cannot simply set the locale to "C" temporarily with setlocale() |
2442 | | // as this is not thread-safe. Instead, we try to parse in the current |
2443 | | // locale first. If parsing stops at a '.' character, then this is a |
2444 | | // pretty good hint that we're actually in some other locale in which |
2445 | | // '.' is not the radix character. |
2446 | |
|
2447 | 0 | char *temp_endptr; |
2448 | 0 | double result = strtod(str, &temp_endptr); |
2449 | 0 | if (endptr != NULL) *endptr = temp_endptr; |
2450 | 0 | if (*temp_endptr != '.') return result; |
2451 | | |
2452 | | // Parsing halted on a '.'. Perhaps we're in a different locale? Let's |
2453 | | // try to replace the '.' with a locale-specific radix character and |
2454 | | // try again. |
2455 | 0 | std::string localized = LocalizeRadix(str, temp_endptr); |
2456 | 0 | const char *localized_cstr = localized.c_str(); |
2457 | 0 | char *localized_endptr; |
2458 | 0 | result = strtod(localized_cstr, &localized_endptr); |
2459 | 0 | if ((localized_endptr - localized_cstr) > (temp_endptr - str)) { |
2460 | | // This attempt got further, so replacing the decimal must have helped. |
2461 | | // Update endptr to point at the right location. |
2462 | 0 | if (endptr != NULL) { |
2463 | | // size_diff is non-zero if the localized radix has multiple bytes. |
2464 | 0 | int size_diff = localized.size() - strlen(str); |
2465 | | // const_cast is necessary to match the strtod() interface. |
2466 | 0 | *endptr = const_cast<char *>( |
2467 | 0 | str + (localized_endptr - localized_cstr - size_diff)); |
2468 | 0 | } |
2469 | 0 | } |
2470 | |
|
2471 | 0 | return result; |
2472 | 0 | } |
2473 | | |
2474 | | } // namespace internal |
2475 | | |
2476 | | } // namespace protobuf |
2477 | | } // namespace google |