Coverage Report

Created: 2025-09-08 06:20

/proc/self/cwd/cpp/htmlparser/strings.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/strings.h"
2
3
#include <algorithm>
4
#include <array>
5
#include <functional>
6
#include <sstream>
7
#include <tuple>
8
#include "cpp/htmlparser/casetable.h"
9
#include "cpp/htmlparser/entity.h"
10
#include "cpp/htmlparser/whitespacetable.h"
11
12
namespace htmlparser {
13
14
// These replacements permit compatibility with old numeric entities that
15
// assumed Windows-1252 encoding.
16
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
17
// Replacement code points for the numeric character references 0x80..0x9F.
// Look up with (reference - 0x80); each row below covers four references.
constexpr std::array<char32_t, 32> kReplacementTable{
    0x20AC, 0x0081, 0x201A, 0x0192,  // 0x80 - 0x83
    0x201E, 0x2026, 0x2020, 0x2021,  // 0x84 - 0x87
    0x02C6, 0x2030, 0x0160, 0x2039,  // 0x88 - 0x8B
    0x0152, 0x008D, 0x017D, 0x008F,  // 0x8C - 0x8F
    0x0090, 0x2018, 0x2019, 0x201C,  // 0x90 - 0x93
    0x201D, 0x2022, 0x2013, 0x2014,  // 0x94 - 0x97
    0x02DC, 0x2122, 0x0161, 0x203A,  // 0x98 - 0x9B
    0x0153, 0x009D, 0x017E, 0x0178,  // 0x9C - 0x9F
    // Reference 0x00 (replaced by U+FFFD) is handled programmatically.
    // Reference 0x0D maps to itself, so it needs no entry.
};
53
54
// Copied from https://github.com/abseil/abseil-cpp/blob/master/absl/strings/ascii.cc
55
// Per-byte character-class bit masks. Only the first 128 entries (ASCII) are
// listed; the remaining 128 entries are value-initialized to zero.
// CountTerms() in this file tests bit 0x08 (whitespace) and bit 0x10
// (punctuation).
constexpr std::array<unsigned char, 256> kPropertyBits{
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00 - 0x07
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,  // 0x08 - 0x0f
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10 - 0x17
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x18 - 0x1f
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20 - 0x27
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x28 - 0x2f
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30 - 0x37
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x38 - 0x3f
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40 - 0x47
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x48 - 0x4f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50 - 0x57
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x58 - 0x5f
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60 - 0x67
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x68 - 0x6f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70 - 0x77
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,  // 0x78 - 0x7f
};
73
74
75
// Internal functions forward
76
// ==========================
77
namespace {
78
79
// Unescapes, in place, the entity that starts at b[src] and writes the result
// at b[dst]; e.g. &lt;html&gt; becomes <html>. The resulting string may be
80
// smaller than the original string. Returns the advanced (dst, src) cursors.
// Pass attribute = true when b holds an attribute value.
81
std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
82
    bool attribute = false);
83
84
// Converts the case of string s in place according to the character map in
85
// the case conversion table: upper case when to_upper is true, else lower.
86
void CaseTransformInternal(bool to_upper, std::string* s);
87
88
// For multi-byte utf-8 sequences: if byte is a continuation byte (0b10xxxxxx),
// stores its low six bits in *out and returns true. Returns false, leaving
89
// *out untouched, if byte is not a valid continuation byte.
90
bool ReadContinuationByte(uint8_t byte, uint8_t* out);
91
92
// Checks if the byte is a one-byte ASCII character, i.e. in range 0-127.
93
inline bool IsOneByteASCIIChar(uint8_t c);
94
95
// For a given string extracts all its chars (including multi-byte chars) and
// appends them to *chars as code points.
96
// Extraction may fail if there is an error decoding utf-8 bytes inside str.
97
// Returns false in case of error; *chars is cleared in that case.
98
bool ExtractChars(std::string_view str, std::vector<char32_t>* chars);
99
100
// Converts 0xFF to 255, 0x8d to 141 etc. Better and exception safe than
101
// std::stoi and others. Returns false for more than two characters or for a
// character that is not a hex digit.
102
bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out);
103
104
}  // namespace.
105
106
std::optional<std::string> Strings::DecodePercentEncodedURL(
107
0
    std::string_view uri) {
108
0
  if (uri.empty()) return "";
109
110
0
  std::stringbuf uri_decoded;
111
0
  while (!uri.empty()) {
112
0
    if (uri.front() != '%') {
113
0
      uri_decoded.sputc(uri.front());
114
0
      uri.remove_prefix(1);
115
0
      continue;
116
0
    }
117
118
0
    uint8_t x1 = 0;
119
0
    if (uri.size() < 3 ||
120
0
        !OneByteHexCodeToInt(uri.substr(1, 2), &x1)) {
121
0
      return std::nullopt;
122
0
    }
123
124
    // Consumed the first three percent encoded chars. eg. %a8.
125
0
    uri.remove_prefix(3);
126
127
    // Sequence byte without initial byte.
128
0
    if ((x1 & 0xc0) == 0x80) return std::nullopt;
129
130
0
    auto num_bytes = Strings::CodePointByteSequenceCount(x1);
131
0
    uri_decoded.sputc(x1);
132
0
    if (num_bytes == 1) {
133
      // Single byte char must be signed char.
134
0
      if (x1 > 127) return std::nullopt;
135
0
      continue;
136
0
    }
137
138
    // 2 bytes sequence.
139
0
    if (num_bytes > 1) {
140
0
      uint8_t x2 = 0;
141
0
      if (uri.size() < 3 ||
142
0
          uri.front() != '%' ||
143
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x2) ||
144
0
          (x2 & 0xc0) != 0x80) {
145
0
        return std::nullopt;
146
0
      }
147
0
      uri.remove_prefix(3);
148
0
      uri_decoded.sputc(x2);
149
0
    }
150
151
    // 3 byte sequence.
152
0
    if (num_bytes > 2) {
153
0
      uint8_t x3 = 0;
154
0
      if (uri.size() < 3 ||
155
0
          uri.front() != '%' ||
156
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x3) ||
157
0
          (x3 & 0xc0) != 0x80) {
158
0
        return std::nullopt;
159
0
      }
160
0
      uri.remove_prefix(3);
161
0
      uri_decoded.sputc(x3);
162
0
    }
163
164
    // 4 byte sequence.
165
0
    if (num_bytes > 3) {
166
0
      uint8_t x4 = 0;
167
0
      if (uri.size() < 3 ||
168
0
          uri.front() != '%' ||
169
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x4) ||
170
0
          (x4 & 0xc0) != 0x80) {
171
0
        return std::nullopt;
172
0
      }
173
0
      uri.remove_prefix(3);
174
0
      uri_decoded.sputc(x4);
175
0
    }
176
0
  }
177
178
0
  return uri_decoded.str();
179
0
}
180
181
18.3M
// Returns true if c is an ASCII letter (a-z or A-Z).
bool Strings::IsCharAlphabet(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
184
185
// Returns true if character is char 0-9.
186
2.46M
bool Strings::IsDigit(char c) {
  // ASCII decimal digits only.
  return c >= '0' && c <= '9';
}
189
190
2.27M
// Normalizes line breaks in *s in place:
//   "\r\n" -> "\n", a lone "\r" -> "\n", "\f" -> "\n".
// Examples:
//   \r\rfoo      becomes \n\nfoo
//   \r\r\nfoo    becomes \n\nfoo
//   \r\f\r\nfoo  becomes \n\n\nfoo
// Done in a single pass with a read cursor and a write cursor. The write
// cursor never overtakes the read cursor, so no byte is overwritten before it
// has been read.
void Strings::ConvertNewLines(std::string* s) {
  const std::size_t size = s->size();
  std::size_t write = 0;
  for (std::size_t read = 0; read < size; ++read) {
    char c = (*s)[read];
    if (c == '\r') {
      // "\r\n" collapses to one newline: swallow the '\n' as well.
      if (read + 1 < size && (*s)[read + 1] == '\n') ++read;
      c = '\n';
    } else if (c == '\f') {
      c = '\n';
    }
    (*s)[write++] = c;
  }
  s->resize(write);
}
228
229
0
// Formats c as lower-case hexadecimal with a "0x" prefix, e.g. 255 -> "0xff".
std::string Strings::ToHexString(uint32_t c) {
  std::stringstream hex_stream;
  hex_stream << "0x";
  hex_stream << std::hex << c;
  return hex_stream.str();
}
234
235
193M
// Given the first byte of a utf-8 sequence, returns how many bytes the
// sequence occupies (1-4). Bytes that are not a valid lead byte (continuation
// bytes 0x80-0xbf and 0xf8-0xff) are reported as 1.
int8_t Strings::CodePointByteSequenceCount(uint8_t c) {
  if (c < 0x80) return 1;               // 0xxxxxxx: ASCII.
  if (c >= 0xc0 && c <= 0xdf) return 2;  // 110xxxxx.
  if (c >= 0xe0 && c <= 0xef) return 3;  // 1110xxxx.
  if (c >= 0xf0 && c <= 0xf7) return 4;  // 11110xxx.

  // Defaults to 1 byte.
  return 1;
}
245
246
0
// Returns the number of bytes needed to encode code point c in utf-8, using
// the same ranges as EncodeUtf8Symbol:
//   c < 0x80      -> 1
//   c < 0x800     -> 2
//   c < 0x10000   -> 3
//   c < 0x200000  -> 4
// Values that do not fit in four bytes fall back to 1, the documented default.
//
// The previous version tested each mask for "any high bit set" instead of
// "no high bit set", so every input produced 1.
int8_t Strings::CodePointNumBytes(char32_t c) {
  if ((c & 0xffffff80) == 0) return 1;
  if ((c & 0xfffff800) == 0) return 2;
  if ((c & 0xffff0000) == 0) return 3;
  if ((c & 0xffe00000) == 0) return 4;

  // Defaults to 1 byte ascii.
  return 1;
}
255
256
2.10M
std::optional<char32_t> Strings::DecodeUtf8Symbol(std::string_view* s) {
257
2.10M
  if (!s || s->empty()) {
258
0
    return std::nullopt;
259
0
  }
260
261
  // Checks first byte is valid.
262
2.10M
  uint8_t c = *(s->data()) & 0xff;
263
264
  // 1 byte sequence.
265
2.10M
  if (IsOneByteASCIIChar(c)) {
266
0
    s->remove_prefix(1);
267
0
    return c;
268
0
  }
269
270
2.10M
  if (!(CodePointByteSequenceCount(c) > 1)) {
271
0
    return std::nullopt;
272
0
  }
273
274
  // 2 byte sequence.
275
2.10M
  if ((c & 0xe0) == 0xc0) {
276
955k
    if (s->size() < 2) return std::nullopt;
277
953k
    s->remove_prefix(1);
278
953k
    uint8_t c2;
279
953k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
280
953k
    s->remove_prefix(1);
281
    // Invalid byte in the sequence.
282
953k
    if (!c2_ok) return L'\uFFFD';
283
6.94k
    char32_t code_point = ((c & 0x1f) << 6) | c2;
284
6.94k
    if (code_point < 0x80) {
285
3.63k
      return std::nullopt;
286
3.63k
    }
287
3.31k
    return code_point;
288
6.94k
  }
289
290
  // 3 byte sequence.
291
1.15M
  if ((c &  0xf0) == 0xe0) {
292
132k
    if (s->size() < 3) return std::nullopt;
293
131k
    s->remove_prefix(1);
294
131k
    uint8_t c2;
295
131k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
296
131k
    s->remove_prefix(1);
297
131k
    uint8_t c3;
298
131k
    bool c3_ok = ReadContinuationByte(*(s->data()), &c3);
299
131k
    s->remove_prefix(1);
300
    // Invalid bytes in the sequence.
301
131k
    if (!(c2_ok && c3_ok)) return L'\uFFFD';
302
124k
    char32_t code_point = ((c & 0x0f) << 12) | (c2 << 6) | c3;
303
124k
    if (code_point < 0x0800) {
304
77
      return std::nullopt;
305
77
    }
306
    // Check if this is codepoint is low surrgates.
307
124k
    if (code_point >= 0xd800 && code_point <= 0xdfff) {
308
742
      return std::nullopt;
309
742
    }
310
311
123k
    return code_point;
312
124k
  }
313
314
  // 4 byte sequence.
315
1.01M
  if ((c & 0xf8) == 0xf0) {
316
1.01M
    if (s->size() < 4) return std::nullopt;
317
310k
    s->remove_prefix(1);
318
310k
    uint8_t c2;
319
310k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
320
310k
    s->remove_prefix(1);
321
310k
    uint8_t c3;
322
310k
    bool c3_ok = ReadContinuationByte(*(s->data()), &c3);
323
310k
    s->remove_prefix(1);
324
310k
    uint8_t c4;
325
310k
    bool c4_ok = ReadContinuationByte(*(s->data()), &c4);
326
310k
    s->remove_prefix(1);
327
    // Invalid bytes in the sequence.
328
310k
    if (!(c2_ok && c3_ok && c4_ok)) return L'\uFFFD';
329
31.5k
    char32_t code_point =  ((c & 0x07) << 0x12) |
330
31.5k
                           (c2 << 0x0c) |
331
31.5k
                           (c3 << 0x06) | c4;
332
31.5k
    if (!(code_point >= 0x010000 && code_point <= 0x10ffff)) {
333
486
      return std::nullopt;
334
486
    }
335
31.0k
    return code_point;
336
31.5k
  }
337
338
0
  return std::nullopt;
339
1.01M
}
340
341
35.0k
// Encodes code_point as utf-8. Produces 1 to 4 bytes; returns std::nullopt
// for values of 0x200000 and above, which do not fit in four bytes.
std::optional<std::string> Strings::EncodeUtf8Symbol(char32_t code_point) {
  // Continuation byte (0b10xxxxxx) carrying the six bits found at `shift`.
  const auto trail = [code_point](int shift) {
    return static_cast<char>(((code_point >> shift) & 0x3f) | 0x80);
  };

  std::string bytes;
  if (code_point < 0x80) {  // 1 byte sequence.
    bytes.push_back(static_cast<char>(code_point));
  } else if (code_point < 0x800) {  // 2 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 6) | 0xc0));
    bytes.push_back(trail(0));
  } else if (code_point < 0x10000) {  // 3 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 12) | 0xe0));
    bytes.push_back(trail(6));
    bytes.push_back(trail(0));
  } else if (code_point < 0x200000) {  // 4 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 18) | 0xf0));
    bytes.push_back(trail(12));
    bytes.push_back(trail(6));
    bytes.push_back(trail(0));
  } else {
    return std::nullopt;
  }

  return bytes;
}
366
367
0
std::string Strings::EscapeString(std::string_view s) {
368
0
  std::stringbuf buffer;
369
0
  Escape(s, &buffer);
370
0
  return buffer.str();
371
0
}
372
373
374
0
void Strings::Escape(std::string_view s, std::stringbuf* escaped) {
375
0
  for (auto c : s) {
376
0
    if (kEscapeChars.find(c) == std::string::npos) {
377
0
      escaped->sputc(c);
378
0
      continue;
379
0
    }
380
381
0
    std::string esc = "";
382
0
    switch (c) {
383
0
      case '"':
384
0
        esc = "&#34;";
385
0
        break;
386
0
      case '&':
387
0
        esc = "&amp;";
388
0
        break;
389
      // "&#39;" is shorter than "&apos;" and apos was not in HTML until
390
      // HTML5.
391
0
      case '\'':
392
0
        esc = "&#39;";
393
0
        break;
394
0
      case '<':
395
0
        esc = "&lt;";
396
0
        break;
397
0
      case '>':
398
0
        esc = "&gt;";
399
0
        break;
400
0
      default:
401
0
        continue;
402
0
    }
403
0
    escaped->sputn(esc.c_str(), esc.size());
404
0
  }
405
0
}
406
407
2.27M
void Strings::UnescapeString(std::string* s, bool attribute) {
408
2.27M
  if (s->empty()) return;
409
1.61M
  std::size_t src, dst = 0;
410
13.8M
  for (std::size_t i = 0; i < s->size() - 1; i++) {
411
12.2M
    if (s->at(i) == '&') {
412
33.7k
      std::tie(dst, src) = UnescapeEntity(s, i, i, attribute);
413
8.64M
      while (src < s->size()) {
414
8.61M
        auto c = s->at(src);
415
8.61M
        if (c == '&') {
416
537k
          std::tie(dst, src) = UnescapeEntity(s, dst, src, attribute);
417
8.07M
        } else {
418
8.07M
          s->at(dst) = c;
419
8.07M
          std::tie(dst, src) = std::tuple<int, int>(dst + 1, src + 1);
420
8.07M
        }
421
8.61M
      }
422
33.7k
      return s->resize(dst);
423
33.7k
    }
424
12.2M
  }
425
1.61M
}
426
427
14.0M
void Strings::ToLower(std::string* s) {
428
14.0M
  CaseTransformInternal(false, s);
429
14.0M
}
430
431
0
void Strings::ToUpper(std::string* s) {
432
0
  CaseTransformInternal(true, s);
433
0
}
434
435
// Returns the index of the first character of s that also occurs in chars,
// or std::string_view::npos when there is none.
std::size_t Strings::IndexAny(const std::string_view s,
                              std::string_view chars) {
  const std::size_t first_match = s.find_first_of(chars);
  return first_match;
}
439
440
18.8k
// Removes leading characters of *s that occur in chars_to_trim.
void Strings::TrimLeft(std::string* s, std::string_view chars_to_trim) {
  // npos (nothing to keep) makes erase() drop the whole string.
  const std::size_t first_kept = s->find_first_not_of(chars_to_trim);
  s->erase(0, first_kept);
}
443
444
0
// Removes trailing characters of *s that occur in chars_to_trim.
void Strings::TrimRight(std::string* s, std::string_view chars_to_trim) {
  // npos + 1 wraps around to 0, so a string made only of trimmable characters
  // is erased completely.
  const std::size_t last_kept = s->find_last_not_of(chars_to_trim);
  s->erase(last_kept + 1);
}
447
448
0
void Strings::Trim(std::string* s, std::string_view chars_to_trim) {
449
0
  TrimLeft(s, chars_to_trim);
450
0
  TrimRight(s, chars_to_trim);
451
0
}
452
453
0
// Shrinks the view from the front past every leading character that occurs
// in chars_to_trim.
void Strings::TrimLeft(std::string_view* s, std::string_view chars_to_trim) {
  const auto first_kept = s->find_first_not_of(chars_to_trim);
  // Nothing to keep: drop the whole view.
  const auto drop =
      first_kept == std::string_view::npos ? s->size() : first_kept;
  s->remove_prefix(drop);
}
462
463
0
// Shrinks the view from the back past every trailing character that occurs
// in chars_to_trim.
void Strings::TrimRight(std::string_view* s, std::string_view chars_to_trim) {
  const auto last_kept = s->find_last_not_of(chars_to_trim);
  // Nothing to keep: drop the whole view.
  const auto drop = last_kept == std::string_view::npos
                        ? s->size()
                        : s->size() - last_kept - 1;
  s->remove_suffix(drop);
}
472
473
0
void Strings::Trim(std::string_view* s, std::string_view chars_to_trim) {
474
0
  TrimLeft(s, chars_to_trim);
475
0
  TrimRight(s, chars_to_trim);
476
0
}
477
478
0
// Removes one trailing "\n" or "\r\n" from *s. Returns true if a line ending
// was removed, false if *s does not end in '\n'.
bool Strings::StripTrailingNewline(std::string* s) {
  if (s->empty() || s->back() != '\n') return false;

  s->pop_back();
  // Also take the carriage return of a "\r\n" pair.
  if (!s->empty() && s->back() == '\r') s->pop_back();
  return true;
}
488
489
0
// Collapses every run of consecutive ' ' characters in *s into a single
// space, in place. Other whitespace characters are left untouched.
void Strings::RemoveExtraSpaceChars(std::string* s) {
  // std::unique keeps the first element of each run of "equal" neighbours;
  // here two neighbours are equal only when both are spaces.
  const auto new_end =
      std::unique(s->begin(), s->end(), [](char previous, char current) {
        return previous == ' ' && current == ' ';
      });
  s->erase(new_end, s->end());
}
506
507
0
// Returns true if s begins with prefix (byte-wise, case-sensitive). An empty
// prefix matches every string.
bool Strings::StartsWith(std::string_view s, std::string_view prefix) {
  return s.size() >= prefix.size() &&
         s.compare(0, prefix.size(), prefix) == 0;
}
518
519
0
// Returns true if s ends with suffix (byte-wise, case-sensitive). An empty
// suffix matches every string.
//
// The previous loop ran while `i > 0`, so the first suffix character was
// never compared: a one-character suffix always "matched", and an empty
// suffix underflowed `suffix.size() - 1` and threw from at().
bool Strings::EndsWith(std::string_view s, std::string_view suffix) {
  if (suffix.size() > s.size()) return false;

  // Compare the tail of s, which has exactly the length of suffix.
  return s.substr(s.size() - suffix.size()) == suffix;
}
532
533
// Replaces the first occurrence of `from` in *s with `to`. Does nothing when
// `from` is empty or does not occur in *s.
void Strings::Replace(std::string* s, std::string_view from,
    std::string_view to) {
  if (from.empty()) return;

  const std::size_t i = s->find(from);
  // Without this guard replace() is called with npos and throws
  // std::out_of_range.
  if (i == std::string::npos) return;
  s->replace(i, from.size(), to);
}
540
541
// Replaces every occurrence of `from` in *s with `to`. Does nothing when
// `from` is empty.
void Strings::ReplaceAll(std::string* s, std::string_view from,
                         std::string_view to) {
  if (from.empty()) return;
  std::size_t i = s->find(from);
  while (i != std::string::npos) {
    s->replace(i, from.size(), to);
    // Resume after the text just inserted. Searching from `i` again would
    // rescan the replacement and never terminate when `to` contains `from`
    // (e.g. replacing "a" with "aa").
    i = s->find(from, i + to.size());
  }
}
550
551
// Replaces every character of *s that occurs in `chars` with the string `to`.
// Does nothing when `chars` is empty.
void Strings::ReplaceAny(std::string* s, std::string_view chars,
                         std::string_view to) {
  if (chars.empty()) return;
  std::size_t i = s->find_first_of(chars);
  while (i != std::string::npos) {
    s->replace(i, 1, to);
    // Continue after the inserted text. Restarting from the beginning makes
    // the scan quadratic and loops forever when `to` itself contains one of
    // `chars`.
    i = s->find_first_of(chars, i + to.size());
  }
}
560
561
std::optional<std::string> Strings::Translate(std::string_view str,
562
                                              std::string_view abc,
563
0
                                              std::string_view xyz) {
564
  // Contains sequence of characters found in abc string.
565
0
  std::vector<char32_t> abc_bytes;
566
  // Contains sequence of characters founds in xyz string.
567
0
  std::vector<char32_t> xyz_bytes;
568
569
  // Captures the characters.
570
0
  if (!(ExtractChars(abc, &abc_bytes) &&
571
0
        ExtractChars(xyz, &xyz_bytes))) {
572
0
    return std::nullopt;
573
0
  }
574
575
  // Helper function to find out index of matching char in the abc string.
576
  // Returns -1 if char is not found.
577
0
  std::function<int(char32_t)> getCharIndex =
578
0
      [&](char32_t c) -> std::size_t {
579
0
    for (std::size_t i = 0; i < abc_bytes.size(); ++i) {
580
0
      if (abc_bytes.at(i) == c) return i;
581
0
    }
582
0
    return std::string::npos;
583
0
  };
584
585
  // Evaluate and translate.
586
0
  std::stringbuf buf;
587
0
  while (!str.empty()) {
588
0
    uint8_t new_char = str.front() & 0xff;
589
0
    if (IsOneByteASCIIChar(new_char)) {
590
0
      std::size_t i = getCharIndex(new_char);
591
0
      if (i == std::string::npos) {
592
0
        buf.sputc(new_char);
593
0
      } else if (i >= xyz_bytes.size()) {
594
        // Ignore the character. i.e. remove from translated string.
595
0
      } else {
596
        // Replacement byte can be utf-8 code.
597
0
        std::string s = EncodeUtf8Symbol(xyz_bytes.at(i)).value_or("");
598
0
        buf.sputn(s.c_str(), s.size());
599
0
      }
600
0
      str.remove_prefix(1);
601
0
      continue;
602
0
    }
603
604
0
    auto big_char_or = DecodeUtf8Symbol(&str);
605
0
    if (!big_char_or.has_value()) {
606
      // Error decoding string.
607
0
      return std::nullopt;
608
0
    }
609
0
    char32_t big_char = big_char_or.value();
610
0
    std::size_t i = getCharIndex(big_char);
611
0
    if (i == std::string::npos) {
612
0
      auto s_or = EncodeUtf8Symbol(big_char);
613
0
      if (!s_or.has_value()) return std::nullopt;
614
0
      buf.sputn(s_or.value().c_str(), s_or.value().size());
615
0
    } else if (i >= xyz_bytes.size()) {
616
      // Ignore the character. i.e. remove from translated string.
617
0
    } else {
618
0
      auto s_or = EncodeUtf8Symbol(xyz_bytes.at(i));
619
0
      if (!s_or.has_value()) return std::nullopt;
620
0
      buf.sputn(s_or.value().c_str(), s_or.value().size());
621
0
    }
622
0
  }
623
624
0
  return buf.str();
625
0
}
626
627
// Returns true if every character of s occurs in whitespace_chars. An empty s
// yields true.
bool Strings::IsAllWhitespaceChars(std::string_view s,
      std::string_view whitespace_chars) {
  const bool has_other_char =
      s.find_first_not_of(whitespace_chars) != std::string::npos;
  return !has_other_char;
}
631
632
50.3k
bool Strings::EqualFold(std::string_view l, std::string_view r) {
633
73.5k
  while (!l.empty()) {
634
    // Reached the end of r, but more chars in l.
635
64.5k
    if (r.empty()) return false;
636
637
64.1k
    uint8_t l_char = l.front() & 0xff;
638
64.1k
    uint8_t r_char = r.front() & 0xff;
639
640
    // ASCII characters first.
641
64.1k
    if (IsOneByteASCIIChar(l_char)) {
642
58.1k
      if (('A' <= l_char && l_char <= 'Z') ||
643
58.1k
          ('a' <= l_char && l_char <= 'z')) {
644
        // Compare lower character for both the chars.
645
50.7k
        if ((l_char | 0x20) != (r_char | 0x20)) {
646
28.5k
          return false;
647
28.5k
        }
648
50.7k
      } else if (l_char != r_char) { // Compare other ascii character as-is.
649
6.66k
        return false;
650
6.66k
      }
651
652
22.9k
      l.remove_prefix(1);
653
22.9k
      r.remove_prefix(1);
654
22.9k
      continue;
655
58.1k
    }
656
657
5.99k
    if (!(CodePointByteSequenceCount(l_char) > 1 &&
658
5.99k
          CodePointByteSequenceCount(r_char) > 1)) {
659
2.49k
      return false;
660
2.49k
    }
661
662
3.49k
    auto l_char_opt = DecodeUtf8Symbol(&l);
663
3.49k
    auto r_char_opt = DecodeUtf8Symbol(&r);
664
665
    // Checks decoding succeeded.
666
3.49k
    if (!(l_char_opt.has_value() && r_char_opt.has_value())) return false;
667
668
1.47k
    char32_t l_char_wide = l_char_opt.value();
669
1.47k
    char32_t r_char_wide = r_char_opt.value();
670
671
    // Two characters matched. No case conversion needed.
672
1.47k
    if (l_char_wide == r_char_wide) {
673
233
      continue;
674
233
    }
675
676
    // Convert both to lowercase.
677
1.23k
    l_char_wide = ToLowerChar(l_char_wide);
678
1.23k
    r_char_wide = ToLowerChar(r_char_wide);
679
680
1.23k
    if (l_char_wide != r_char_wide) return false;
681
1.23k
  }
682
683
  // Checks all the bytes are processed in both the strings. If some bytes
684
  // left in either string, they are not equal.
685
8.93k
  return l.empty() && r.empty();
686
50.3k
}
687
688
// Splits s at every occurrence of delimiter and returns the non-empty pieces
// in order. Empty pieces (adjacent, leading or trailing delimiters) are
// skipped.
std::vector<std::string> Strings::SplitStringAt(
      std::string_view s, char delimiter) {
  std::vector<std::string> pieces;
  std::size_t begin = 0;

  while (begin < s.size()) {
    const std::size_t end = s.find(delimiter, begin);

    if (end == std::string_view::npos) {
      // No further delimiter: the remainder is the last piece.
      pieces.emplace_back(std::string(s.substr(begin)));
      break;
    }

    if (end != begin) {
      pieces.emplace_back(std::string(s.substr(begin, end - begin)));
    }
    begin = end + 1;
  }

  return pieces;
}
707
708
std::vector<std::string_view> Strings::SplitStrAtUtf8Whitespace(
709
0
    std::string_view s) {
710
0
  std::vector<std::string_view> columns;
711
0
  std::size_t start = 0;
712
0
  std::size_t end = 0;
713
0
  while (end < s.size()) {
714
0
    auto num_ws = IsUtf8WhiteSpaceChar(s, end);
715
0
    if (num_ws > 0) {
716
0
      if (start < end) {
717
0
        columns.emplace_back(s.substr(start, end - start));
718
0
      }
719
0
      start = end + num_ws;
720
0
      end = start;
721
0
    } else {
722
0
      end++;
723
0
    }
724
0
  }
725
0
  columns.emplace_back(s.substr(start, s.size()));
726
0
  return columns;
727
0
}
728
729
0
int Strings::IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position) {
730
0
  std::size_t i = position;
731
0
  int state = 0;
732
0
  while (i < s.size()) {
733
0
    uint8_t c = s.at(i++);
734
0
    state = kWhitespaceTable[state][c];
735
736
0
    if (state == 0) {
737
0
      return 0;
738
0
    }
739
740
0
    if (state == 1) {
741
0
      return i - position;
742
0
    }
743
0
  }
744
745
0
  return 0;
746
0
}
747
748
0
int Strings::CountTerms(std::string_view s) {
749
0
  bool in_term = false;
750
0
  int num_terms = 0;
751
0
  while (!s.empty()) {
752
0
    unsigned char c = s.front();
753
0
    s.remove_prefix(1);
754
    // whitespace and punctuations.
755
0
    if ((kPropertyBits[c] & 0x08) != 0 || (kPropertyBits[c] & 0x10) != 0) {
756
0
      in_term = false;
757
0
    } else if (!in_term) {
758
      // First character of a term
759
0
      ++num_terms;
760
0
      in_term = true;
761
0
    }
762
0
  }
763
0
  return num_terms;
764
0
}
765
766
namespace {
767
768
// Reads an entity like "&lt;" from b[src:] and writes the corresponding "<"
769
// to b[dst:], returning the incremented dst and src cursors.
770
// Precondition: b[src] == '&' && dst <= src.
771
// attribute should be true if passing an attribute value.
772
std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
773
571k
    bool attribute) {
774
571k
  std::string s = b->substr(src);
775
571k
  if (s.size() <= 1) {
776
401
    b->at(dst) = b->at(src);
777
401
    return std::pair<int, int>(dst + 1, src + 1);
778
401
  }
779
780
  // i starts at 1 because we already know that s[0] == '&'.
781
571k
  std::size_t i = 1;
782
571k
  if (s.at(i) == '#') {
783
503k
    if (s.size() <= 3) {  // We need to have at least  "&#.".
784
30.2k
      b->at(dst) = b->at(src);
785
30.2k
      return std::pair<int, int>(dst + 1, src + 1);
786
30.2k
    }
787
473k
    i++;
788
473k
    auto c = s.at(i);
789
473k
    bool hex = false;
790
473k
    if (c == 'x' || c == 'X') {
791
422k
      hex = true;
792
422k
      i++;
793
422k
    }
794
795
473k
    char32_t x = '\x00';
796
1.70M
    while (i < s.size()) {
797
1.70M
      auto c = s.at(i);
798
1.70M
      i++;
799
1.70M
      if (hex) {
800
1.60M
        if (Strings::IsDigit(c)) {
801
7.25k
          x = (16 * x) | (c - '0');
802
7.25k
          continue;
803
1.60M
        } else if ('a' <= c && c <= 'f') {
804
1.17M
          x = 16 * x + c - 'a' + 10;
805
1.17M
          continue;
806
1.17M
        } else if ('A' <= c && c <= 'F') {
807
407
          x = 16 * x + c - 'A' + 10;
808
407
          continue;
809
407
        }
810
1.60M
      } else if (Strings::IsDigit(c)) {
811
48.3k
        x = 10 * x + c - '0';
812
48.3k
        continue;
813
48.3k
      }
814
473k
      if (c != ';') {
815
454k
          i--;
816
454k
      }
817
473k
      break;
818
1.70M
    }
819
820
473k
    if (i <= 3) {  // No characters matched.
821
446k
      b->at(dst) = b->at(src);
822
446k
      return std::pair<int, int>(dst + 1, src + 1);
823
446k
    }
824
825
27.2k
    if (0x80 <= x && x <= 0x9F) {
826
      // Replace characters from Windows-1252 with UTF-8 equivalents.
827
201
      x = kReplacementTable[x - 0x80];
828
27.0k
    } else if (x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF) {
829
      // Replace invalid characters with the replacement chracter.
830
25.4k
      x = L'\uFFFD';
831
25.4k
    }
832
833
27.2k
    auto encoded_bytes = Strings::EncodeUtf8Symbol(x);
834
27.2k
    if (encoded_bytes.has_value()) {
835
27.2k
      std::transform(encoded_bytes.value().begin(),
836
27.2k
          encoded_bytes.value().end(), b->begin() + dst,
837
80.5k
          [](uint8_t c) -> char { return static_cast<char>(c); });
838
27.2k
      return std::pair<int, int>(dst + encoded_bytes.value().size(), src + i);
839
27.2k
    }
840
27.2k
  }
841
842
  // Consume the maximum number of chracters possible, with the consumed
843
  // characters matching one of the named references.
844
2.84M
  while (i < s.size()) {
845
2.84M
    auto c = s.at(i);
846
2.84M
    i++;
847
    // Lower-cased characters are more common in entities, so we check for
848
    // them first.
849
2.84M
   if (Strings::IsCharAlphabet(c) || Strings::IsDigit(c)) {
850
2.78M
     continue;
851
2.78M
   }
852
66.3k
   if (c != ';') {
853
50.3k
     i--;
854
50.3k
   }
855
66.3k
   break;
856
2.84M
  }
857
858
67.3k
  std::string entityName = s.substr(1, i - 1);
859
67.3k
  auto encoded_bytes = EntityLookup(entityName);
860
67.3k
  if (entityName.empty()) {
861
    // No-op.
862
51.0k
  } else if (attribute && entityName.at(entityName.size() - 1) != ';' &&
863
51.0k
      s.size() > i && s.at(i) == '=') {
864
    // No-op.
865
50.5k
  } else if (!encoded_bytes.empty()) {
866
667
    int overflow = encoded_bytes.size() - entityName.size() - 1 /* & */;
867
667
    if (overflow > 0) {
868
      // Insert some dummy chars which will get occupied by overflow entity
869
      // chars.
870
      // Suppose &xy; = \x1\x2\x3\x4\x5 (5 bytes char)
871
      // abc&xy;def (10 bytes) after this statement is:
872
      // abc&xy; def (11 bytes).
873
      // After unescape: abc\x1\x2\x3\x4\x5def (11 bytes).
874
274
      b->insert(src + encoded_bytes.size() - 1, " ", overflow);
875
274
    }
876
    // Copies the unescaped bytes to the destination,
877
667
    std::transform(encoded_bytes.begin(), encoded_bytes.end(), b->begin() + dst,
878
2.06k
        [](uint8_t c) -> char { return static_cast<char>(c); });
879
667
    return std::pair<int, int>(
880
667
        dst + encoded_bytes.size() - (overflow > 0 ? overflow : 0), src + i);
881
49.8k
  } else if (!attribute) {
882
11.4k
    int max_length = entityName.size() - 1;
883
11.4k
    if (max_length > kLongestEntityWithoutSemiColon) {
884
6.38k
      max_length = kLongestEntityWithoutSemiColon;
885
6.38k
    }
886
45.6k
    for (int j = max_length; j > 1; --j) {
887
34.8k
      auto encoded_bytes = EntityLookup(entityName.substr(0, j));
888
34.8k
      if (!encoded_bytes.empty()) {
889
552
        std::transform(encoded_bytes.begin(), encoded_bytes.end(),
890
834
                       b->begin() + dst, [](uint8_t c) -> char {
891
834
                         return static_cast<char>(c); });
892
552
        return std::pair<int, int>(dst + encoded_bytes.size(), src + j + 1);
893
552
      }
894
34.8k
    }
895
11.4k
  }
896
897
66.1k
  std::copy(b->begin() + src, b->begin() + src + i, b->begin() + dst);
898
66.1k
  return std::pair<int, int>(dst + i, src + i);
899
67.3k
}
900
901
14.0M
void CaseTransformInternal(bool to_upper, std::string* s) {
902
69.7M
  for (std::size_t i = 0; i < s->size(); ++i) {
903
904
55.7M
    uint8_t code_point = s->at(i) & 0xff;
905
906
    // ASCII characters first.
907
55.7M
    if (IsOneByteASCIIChar(code_point)) {
908
52.9M
      auto c = to_upper ? ToUpperChar(code_point) : ToLowerChar(code_point);
909
52.9M
      if (c != code_point) {
910
11.4M
        s->at(i) = static_cast<char>(c);
911
11.4M
      }
912
52.9M
      continue;
913
52.9M
    }
914
915
2.76M
    if (Strings::CodePointByteSequenceCount(code_point) > 1) {
916
2.09M
      std::string_view sv = *s;
917
2.09M
      sv.remove_prefix(i);
918
2.09M
      auto decoded = Strings::DecodeUtf8Symbol(&sv);
919
2.09M
      if (decoded.has_value()) {
920
1.38M
        char32_t decode_value = decoded.value();
921
1.38M
        auto c =
922
1.38M
            to_upper ? ToUpperChar(decode_value) : ToLowerChar(decode_value);
923
1.38M
        if (c != decode_value) {
924
7.86k
          auto char_encoded = Strings::EncodeUtf8Symbol(c);
925
7.86k
          if (char_encoded.has_value()) {
926
7.86k
            std::transform(char_encoded.value().begin(),
927
7.86k
                char_encoded.value().end(), s->begin() + i,
928
28.6k
                [](uint8_t c) -> char { return static_cast<char>(c); });
929
7.86k
          }
930
7.86k
        }
931
1.38M
      }
932
2.09M
    }
933
2.76M
  }
934
14.0M
}
935
936
2.14M
// If `byte` is a utf-8 continuation byte (0b10xxxxxx), stores its six payload
// bits in *out and returns true. Otherwise returns false and leaves *out
// untouched.
bool ReadContinuationByte(uint8_t byte, uint8_t* out) {
  // The two top bits of a continuation byte are exactly "10".
  const bool is_continuation = (byte >> 6) == 0x2;
  if (!is_continuation) return false;

  // Keep the low six bits: 0b00xxxxxx.
  *out = byte & 0x3f;
  return true;
}
947
948
57.9M
// True for bytes 0x00-0x7f, i.e. when the high bit is clear.
inline bool IsOneByteASCIIChar(uint8_t c) {
  return c < 0x80;
}
951
952
0
bool ExtractChars(std::string_view str, std::vector<char32_t>* chars) {
953
0
  while (!str.empty()) {
954
0
    uint8_t c = str.front() & 0xff;
955
956
    // ASCII chracters first.
957
0
    if (IsOneByteASCIIChar(c)) {
958
0
      chars->push_back(c);
959
0
      str.remove_prefix(1);
960
0
      continue;
961
    // Check if this character is member of codepoint sequence.
962
0
    } else if (Strings::CodePointByteSequenceCount(c) > 1) {
963
      // Decode moves the string view prefix so no need to remove prefix
964
      // manually.
965
0
      auto old_big_char = Strings::DecodeUtf8Symbol(&str);
966
0
      if (!old_big_char.has_value()) {
967
        // Error decoding string.
968
0
        chars->clear();
969
0
        return false;
970
0
      }
971
0
      chars->push_back(old_big_char.value());
972
0
    } else {
973
      // Unknown character type.
974
0
      chars->clear();
975
0
      return false;
976
0
    }
977
0
  }
978
0
  return true;
979
0
}
980
981
0
bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out) {
982
  // Will overflow.
983
0
  if (hex_code.size() > 2) return false;
984
0
  uint8_t x = 0;
985
0
  while (!hex_code.empty()) {
986
0
    auto h = hex_code.at(0);
987
0
    hex_code.remove_prefix(1);
988
0
    if (Strings::IsDigit(h)) {
989
0
      x = (16 * x) | (h - '0');
990
0
    } else if ('a' <= h && h <= 'f') {
991
0
      x = 16 * x + h - 'a' + 10;
992
0
    } else if ('A' <= h && h <= 'F') {
993
0
      x = 16 * x + h - 'A' + 10;
994
0
    } else {
995
      // Invalid hex code eg. %2x or %m8
996
0
      return false;
997
0
    }
998
0
  }
999
0
  *out = x;
1000
0
  return true;
1001
0
}
1002
1003
}  // namespace
1004
1005
}  // namespace htmlparser