Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/strings.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/strings.h"
2
3
#include <algorithm>
4
#include <array>
5
#include <functional>
6
#include <sstream>
7
#include <tuple>
8
#include "cpp/htmlparser/casetable.h"
9
#include "cpp/htmlparser/entity.h"
10
#include "cpp/htmlparser/whitespacetable.h"
11
12
namespace htmlparser {
13
14
// Windows-1252 compatibility table for legacy numeric character references.
// Entry k holds the code point that replaces the numeric reference 0x80 + k,
// so the 32 entries cover 0x80..0x9F inclusive.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
//
// The literals are char32_t (U'...') so that they match the element type
// instead of relying on an implicit wchar_t -> char32_t conversion.
//
// 0x00 -> U+FFFD is handled programmatically; 0x0D -> U+000D is a no-op.
constexpr std::array<char32_t, 32> kReplacementTable{
    U'\u20AC',  // 0x80
    U'\u0081',  // 0x81
    U'\u201A',  // 0x82
    U'\u0192',  // 0x83
    U'\u201E',  // 0x84
    U'\u2026',  // 0x85
    U'\u2020',  // 0x86
    U'\u2021',  // 0x87
    U'\u02C6',  // 0x88
    U'\u2030',  // 0x89
    U'\u0160',  // 0x8A
    U'\u2039',  // 0x8B
    U'\u0152',  // 0x8C
    U'\u008D',  // 0x8D
    U'\u017D',  // 0x8E
    U'\u008F',  // 0x8F
    U'\u0090',  // 0x90
    U'\u2018',  // 0x91
    U'\u2019',  // 0x92
    U'\u201C',  // 0x93
    U'\u201D',  // 0x94
    U'\u2022',  // 0x95
    U'\u2013',  // 0x96
    U'\u2014',  // 0x97
    U'\u02DC',  // 0x98
    U'\u2122',  // 0x99
    U'\u0161',  // 0x9A
    U'\u203A',  // 0x9B
    U'\u0153',  // 0x9C
    U'\u009D',  // 0x9D
    U'\u017E',  // 0x9E
    U'\u0178',  // 0x9F
};
53
54
// Per-byte character property bit masks, indexed by the unsigned byte value.
// Copied from
// https://github.com/abseil/abseil-cpp/blob/master/absl/strings/ascii.cc
//
// Only the 128 ASCII entries are spelled out; the remaining 128 elements of
// the array are value-initialized to 0. CountTerms() in this file treats
// bit 0x08 as whitespace and bit 0x10 as punctuation.
constexpr std::array<unsigned char, 256> kPropertyBits{
    // 0x00 - 0x07
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    // 0x08 - 0x0f
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
    // 0x10 - 0x17
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    // 0x18 - 0x1f
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    // 0x20 - 0x27
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x28 - 0x2f
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x30 - 0x37
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
    // 0x38 - 0x3f
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x40 - 0x47
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,
    // 0x48 - 0x4f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x50 - 0x57
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x58 - 0x5f
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x60 - 0x67
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,
    // 0x68 - 0x6f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x70 - 0x77
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x78 - 0x7f
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
};
73
74
75
// Internal functions forward
76
// ==========================
77
namespace {
78
79
// Unescapes the entity inline. &lt;html&gt; becomes <html>. The resulting
80
// string may be smaller than original string.
81
std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
82
    bool attribute = false);
83
84
// Converts the case of a string s according to the rules of character map in
85
// the case conversion table.
86
void CaseTransformInternal(bool to_upper, std::string* s);
87
88
// For multi-sequence utf-8 codepoints, reads the next valid byte as out
89
// parameter. Returns false if next byte in the sequence is not a valid byte.
90
bool ReadContinuationByte(uint8_t byte, uint8_t* out);
91
92
// Checks if the character is a one-byte ASCII char, i.e. in range 0-127.
93
inline bool IsOneByteASCIIChar(uint8_t c);
94
95
// For a given string extracts all its char (including big char).
96
// Extraction may fail if there is error decoding utf-8 bytes inside the str.
97
// Returns false in case of error.
98
bool ExtractChars(std::string_view str, std::vector<char32_t>* chars);
99
100
// Converts 0xFF to 255, 0x8d to 141 etc. Better and exception safe than
101
// std::stoi and others.
102
bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out);
103
104
}  // namespace.
105
106
std::optional<std::string> Strings::DecodePercentEncodedURL(
107
0
    std::string_view uri) {
108
0
  if (uri.empty()) return "";
109
110
0
  std::stringbuf uri_decoded;
111
0
  while (!uri.empty()) {
112
0
    if (uri.front() != '%') {
113
0
      uri_decoded.sputc(uri.front());
114
0
      uri.remove_prefix(1);
115
0
      continue;
116
0
    }
117
118
0
    uint8_t x1 = 0;
119
0
    if (uri.size() < 3 ||
120
0
        !OneByteHexCodeToInt(uri.substr(1, 2), &x1)) {
121
0
      return std::nullopt;
122
0
    }
123
124
    // Consumed the first three percent encoded chars. eg. %a8.
125
0
    uri.remove_prefix(3);
126
127
    // Sequence byte without initial byte.
128
0
    if ((x1 & 0xc0) == 0x80) return std::nullopt;
129
130
0
    auto num_bytes = Strings::CodePointByteSequenceCount(x1);
131
0
    uri_decoded.sputc(x1);
132
0
    if (num_bytes == 1) {
133
      // Single byte char must be signed char.
134
0
      if (x1 > 127) return std::nullopt;
135
0
      continue;
136
0
    }
137
138
    // 2 bytes sequence.
139
0
    if (num_bytes > 1) {
140
0
      uint8_t x2 = 0;
141
0
      if (uri.size() < 3 ||
142
0
          uri.front() != '%' ||
143
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x2) ||
144
0
          (x2 & 0xc0) != 0x80) {
145
0
        return std::nullopt;
146
0
      }
147
0
      uri.remove_prefix(3);
148
0
      uri_decoded.sputc(x2);
149
0
    }
150
151
    // 3 byte sequence.
152
0
    if (num_bytes > 2) {
153
0
      uint8_t x3 = 0;
154
0
      if (uri.size() < 3 ||
155
0
          uri.front() != '%' ||
156
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x3) ||
157
0
          (x3 & 0xc0) != 0x80) {
158
0
        return std::nullopt;
159
0
      }
160
0
      uri.remove_prefix(3);
161
0
      uri_decoded.sputc(x3);
162
0
    }
163
164
    // 4 byte sequence.
165
0
    if (num_bytes > 3) {
166
0
      uint8_t x4 = 0;
167
0
      if (uri.size() < 3 ||
168
0
          uri.front() != '%' ||
169
0
          !OneByteHexCodeToInt(uri.substr(1, 2), &x4) ||
170
0
          (x4 & 0xc0) != 0x80) {
171
0
        return std::nullopt;
172
0
      }
173
0
      uri.remove_prefix(3);
174
0
      uri_decoded.sputc(x4);
175
0
    }
176
0
  }
177
178
0
  return uri_decoded.str();
179
0
}
180
181
17.4M
// Returns true if |c| is an ASCII letter, a-z or A-Z.
bool Strings::IsCharAlphabet(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
184
185
// Returns true if character is char 0-9.
186
1.24M
bool Strings::IsDigit(char c) {
187
1.24M
  return '0' <= c && c <= '9';
188
1.24M
}
189
190
2.38M
// Normalizes line breaks in |*s| in place:
//   - "\r\n" becomes a single "\n",
//   - a lone "\r" becomes "\n",
//   - "\f" becomes "\n".
// Examples:
//   \r\rfoo       -> \n\nfoo
//   \r\r\nfoo     -> \n\nfoo
//   \r\f\r\nfoo   -> \n\n\nfoo
//
// Implemented as a single read/write pass. The write cursor never gets ahead
// of the read cursor, so the string is compacted in place and truncated once
// at the end. All indices are std::size_t (the previous implementation kept
// the write cursor in an int).
void Strings::ConvertNewLines(std::string* s) {
  std::string& text = *s;
  const std::size_t size = text.size();
  std::size_t write = 0;
  for (std::size_t read = 0; read < size; ++read) {
    const char c = text[read];
    if (c == '\r') {
      // Swallow the '\n' of a "\r\n" pair so the pair yields one newline.
      if (read + 1 < size && text[read + 1] == '\n') ++read;
      text[write++] = '\n';
    } else if (c == '\f') {
      text[write++] = '\n';
    } else {
      text[write++] = c;
    }
  }
  text.resize(write);
}
228
229
0
// Formats |c| as lowercase hexadecimal with a "0x" prefix, e.g. 255 -> "0xff".
std::string Strings::ToHexString(uint32_t c) {
  std::ostringstream out;
  out << std::hex;
  out << "0x" << c;
  return out.str();
}
234
235
175M
// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by lead
// byte |c|. Bytes that are not a valid lead byte (continuation bytes
// 0x80-0xBF and 0xF8-0xFF) also report 1.
int8_t Strings::CodePointByteSequenceCount(uint8_t c) {
  if (c < 0x80) return 1;               // 0xxxxxxx: ASCII.
  if (c >= 0xc0 && c <= 0xdf) return 2;  // 110xxxxx
  if (c >= 0xe0 && c <= 0xef) return 3;  // 1110xxxx
  if (c >= 0xf0 && c <= 0xf7) return 4;  // 11110xxx

  // Anything else defaults to a single byte.
  return 1;
}
245
246
0
// Returns the number of bytes needed to encode code point |c| in UTF-8:
//   U+0000  .. U+007F   -> 1
//   U+0080  .. U+07FF   -> 2
//   U+0800  .. U+FFFF   -> 3
//   U+10000 .. U+1FFFFF -> 4
// Values that do not fit in 21 bits fall back to 1.
//
// Each mask selects the bits that must be zero for the code point to fit in
// that many bytes. The previous implementation tested the masks for
// "non-zero", which made every input return 1.
int8_t Strings::CodePointNumBytes(char32_t c) {
  if ((c & 0xffffff80) == 0) return 1;
  if ((c & 0xfffff800) == 0) return 2;
  if ((c & 0xffff0000) == 0) return 3;
  if ((c & 0xffe00000) == 0) return 4;

  // Defaults to 1 byte ascii.
  return 1;
}
255
256
1.29M
std::optional<char32_t> Strings::DecodeUtf8Symbol(std::string_view* s) {
257
1.29M
  if (!s || s->empty()) {
258
0
    return std::nullopt;
259
0
  }
260
261
  // Checks first byte is valid.
262
1.29M
  uint8_t c = *(s->data()) & 0xff;
263
264
  // 1 byte sequence.
265
1.29M
  if (IsOneByteASCIIChar(c)) {
266
0
    s->remove_prefix(1);
267
0
    return c;
268
0
  }
269
270
1.29M
  if (!(CodePointByteSequenceCount(c) > 1)) {
271
0
    return std::nullopt;
272
0
  }
273
274
  // 2 byte sequence.
275
1.29M
  if ((c & 0xe0) == 0xc0) {
276
521k
    if (s->size() < 2) return std::nullopt;
277
519k
    s->remove_prefix(1);
278
519k
    uint8_t c2;
279
519k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
280
519k
    s->remove_prefix(1);
281
    // Invalid byte in the sequence.
282
519k
    if (!c2_ok) return L'\uFFFD';
283
12.3k
    char32_t code_point = ((c & 0x1f) << 6) | c2;
284
12.3k
    if (code_point < 0x80) {
285
8.88k
      return std::nullopt;
286
8.88k
    }
287
3.41k
    return code_point;
288
12.3k
  }
289
290
  // 3 byte sequence.
291
770k
  if ((c &  0xf0) == 0xe0) {
292
135k
    if (s->size() < 3) return std::nullopt;
293
133k
    s->remove_prefix(1);
294
133k
    uint8_t c2;
295
133k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
296
133k
    s->remove_prefix(1);
297
133k
    uint8_t c3;
298
133k
    bool c3_ok = ReadContinuationByte(*(s->data()), &c3);
299
133k
    s->remove_prefix(1);
300
    // Invalid bytes in the sequence.
301
133k
    if (!(c2_ok && c3_ok)) return L'\uFFFD';
302
129k
    char32_t code_point = ((c & 0x0f) << 12) | (c2 << 6) | c3;
303
129k
    if (code_point < 0x0800) {
304
78
      return std::nullopt;
305
78
    }
306
    // Check if this is codepoint is low surrgates.
307
129k
    if (code_point >= 0xd800 && code_point <= 0xdfff) {
308
1.19k
      return std::nullopt;
309
1.19k
    }
310
311
128k
    return code_point;
312
129k
  }
313
314
  // 4 byte sequence.
315
634k
  if ((c & 0xf8) == 0xf0) {
316
634k
    if (s->size() < 4) return std::nullopt;
317
252k
    s->remove_prefix(1);
318
252k
    uint8_t c2;
319
252k
    bool c2_ok = ReadContinuationByte(*(s->data()), &c2);
320
252k
    s->remove_prefix(1);
321
252k
    uint8_t c3;
322
252k
    bool c3_ok = ReadContinuationByte(*(s->data()), &c3);
323
252k
    s->remove_prefix(1);
324
252k
    uint8_t c4;
325
252k
    bool c4_ok = ReadContinuationByte(*(s->data()), &c4);
326
252k
    s->remove_prefix(1);
327
    // Invalid bytes in the sequence.
328
252k
    if (!(c2_ok && c3_ok && c4_ok)) return L'\uFFFD';
329
25.5k
    char32_t code_point =  ((c & 0x07) << 0x12) |
330
25.5k
                           (c2 << 0x0c) |
331
25.5k
                           (c3 << 0x06) | c4;
332
25.5k
    if (!(code_point >= 0x010000 && code_point <= 0x10ffff)) {
333
343
      return std::nullopt;
334
343
    }
335
25.2k
    return code_point;
336
25.5k
  }
337
338
0
  return std::nullopt;
339
634k
}
340
341
11.2k
// Encodes |code_point| as a UTF-8 byte sequence of 1 to 4 bytes.
//
// Returns std::nullopt for values that have no valid UTF-8 encoding:
// surrogates (U+D800..U+DFFF) and anything above U+10FFFF. This mirrors the
// values DecodeUtf8Symbol() refuses to produce, so encode/decode round-trip.
// (The previous implementation emitted bytes for surrogates and for values up
// to 0x1FFFFF.)
std::optional<std::string> Strings::EncodeUtf8Symbol(char32_t code_point) {
  const bool is_surrogate = code_point >= 0xd800 && code_point <= 0xdfff;
  if (is_surrogate || code_point > 0x10ffff) return std::nullopt;

  std::string bytes;
  if (code_point < 0x80) {  // 1 byte sequence.
    bytes.push_back(static_cast<char>(code_point));
  } else if (code_point < 0x800) {  // 2 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 6) | 0xc0));
    bytes.push_back(static_cast<char>((code_point & 0x3f) | 0x80));
  } else if (code_point < 0x10000) {  // 3 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 12) | 0xe0));
    bytes.push_back(static_cast<char>(((code_point >> 6) & 0x3f) | 0x80));
    bytes.push_back(static_cast<char>((code_point & 0x3f) | 0x80));
  } else {  // 4 byte sequence.
    bytes.push_back(static_cast<char>((code_point >> 18) | 0xf0));
    bytes.push_back(static_cast<char>(((code_point >> 12) & 0x3f) | 0x80));
    bytes.push_back(static_cast<char>(((code_point >> 6) & 0x3f) | 0x80));
    bytes.push_back(static_cast<char>((code_point & 0x3f) | 0x80));
  }
  return bytes;
}
366
367
0
// Returns a copy of |s| with the characters handled by Escape() replaced by
// their HTML entity references.
std::string Strings::EscapeString(std::string_view s) {
  std::stringbuf sink;
  Strings::Escape(s, &sink);
  return sink.str();
}
372
373
374
0
void Strings::Escape(std::string_view s, std::stringbuf* escaped) {
375
0
  for (auto c : s) {
376
0
    if (kEscapeChars.find(c) == std::string::npos) {
377
0
      escaped->sputc(c);
378
0
      continue;
379
0
    }
380
381
0
    std::string esc = "";
382
0
    switch (c) {
383
0
      case '"':
384
0
        esc = "&#34;";
385
0
        break;
386
0
      case '&':
387
0
        esc = "&amp;";
388
0
        break;
389
      // "&#39;" is shorter than "&apos;" and apos was not in HTML until
390
      // HTML5.
391
0
      case '\'':
392
0
        esc = "&#39;";
393
0
        break;
394
0
      case '<':
395
0
        esc = "&lt;";
396
0
        break;
397
0
      case '>':
398
0
        esc = "&gt;";
399
0
        break;
400
0
      default:
401
0
        continue;
402
0
    }
403
0
    escaped->sputn(esc.c_str(), esc.size());
404
0
  }
405
0
}
406
407
2.38M
void Strings::UnescapeString(std::string* s, bool attribute) {
408
2.38M
  if (s->empty()) return;
409
1.96M
  std::size_t src, dst = 0;
410
9.42M
  for (std::size_t i = 0; i < s->size() - 1; i++) {
411
7.46M
    if (s->at(i) == '&') {
412
2.93k
      std::tie(dst, src) = UnescapeEntity(s, i, i, attribute);
413
8.77M
      while (src < s->size()) {
414
8.77M
        auto c = s->at(src);
415
8.77M
        if (c == '&') {
416
521k
          std::tie(dst, src) = UnescapeEntity(s, dst, src, attribute);
417
8.25M
        } else {
418
8.25M
          s->at(dst) = c;
419
8.25M
          std::tie(dst, src) = std::tuple<int, int>(dst + 1, src + 1);
420
8.25M
        }
421
8.77M
      }
422
2.93k
      return s->resize(dst);
423
2.93k
    }
424
7.46M
  }
425
1.96M
}
426
427
13.7M
void Strings::ToLower(std::string* s) {
428
13.7M
  CaseTransformInternal(false, s);
429
13.7M
}
430
431
0
void Strings::ToUpper(std::string* s) {
432
0
  CaseTransformInternal(true, s);
433
0
}
434
435
// Returns the index of the first character of |s| that appears in |chars|,
// or std::string_view::npos if there is none.
std::size_t Strings::IndexAny(const std::string_view s,
                              std::string_view chars) {
  const std::size_t position = s.find_first_of(chars);
  return position;
}
439
440
14.7k
// Removes every leading character of |*s| that appears in |chars_to_trim|.
void Strings::TrimLeft(std::string* s, std::string_view chars_to_trim) {
  // npos (nothing to keep) makes erase() clear the whole string.
  const std::size_t keep_from = s->find_first_not_of(chars_to_trim);
  s->erase(0, keep_from);
}
443
444
0
// Removes every trailing character of |*s| that appears in |chars_to_trim|.
void Strings::TrimRight(std::string* s, std::string_view chars_to_trim) {
  // When nothing is kept, npos + 1 wraps to 0 and the whole string is erased.
  const std::size_t last_kept = s->find_last_not_of(chars_to_trim);
  s->erase(last_kept + 1);
}
447
448
0
// Removes the characters in |chars_to_trim| from both ends of |*s|.
void Strings::Trim(std::string* s, std::string_view chars_to_trim) {
  Strings::TrimLeft(s, chars_to_trim);
  Strings::TrimRight(s, chars_to_trim);
}
452
453
0
// Shrinks the view |*s| from the front past every leading character that
// appears in |chars_to_trim|. The underlying data is not modified.
void Strings::TrimLeft(std::string_view* s, std::string_view chars_to_trim) {
  const auto first_kept = s->find_first_not_of(chars_to_trim);
  // If every character is trimmable the view becomes empty.
  const std::size_t drop =
      first_kept == std::string_view::npos ? s->size() : first_kept;
  s->remove_prefix(drop);
}
462
463
0
// Shrinks the view |*s| from the back past every trailing character that
// appears in |chars_to_trim|. The underlying data is not modified.
void Strings::TrimRight(std::string_view* s, std::string_view chars_to_trim) {
  const auto last_kept = s->find_last_not_of(chars_to_trim);
  // If every character is trimmable the view becomes empty.
  const std::size_t drop = last_kept == std::string_view::npos
                               ? s->size()
                               : s->size() - last_kept - 1;
  s->remove_suffix(drop);
}
472
473
0
// Shrinks the view |*s| at both ends past the characters in |chars_to_trim|.
void Strings::Trim(std::string_view* s, std::string_view chars_to_trim) {
  Strings::TrimLeft(s, chars_to_trim);
  Strings::TrimRight(s, chars_to_trim);
}
477
478
0
// Removes one trailing "\n" or "\r\n" from |*s|.
// Returns true if a newline was removed, false if |*s| does not end in '\n'.
bool Strings::StripTrailingNewline(std::string* s) {
  if (s->empty() || s->back() != '\n') return false;

  s->pop_back();
  // Also drop the '\r' of a "\r\n" pair.
  if (!s->empty() && s->back() == '\r') s->pop_back();
  return true;
}
488
489
0
// Collapses every run of consecutive ' ' characters in |*s| into a single
// space, in place. Only the space character is considered; other whitespace
// is left alone.
void Strings::RemoveExtraSpaceChars(std::string* s) {
  std::size_t write = 0;
  bool previous_was_space = false;
  // The write cursor never passes the read position, so compacting while
  // iterating is safe.
  for (const char ch : *s) {
    const bool is_space = (ch == ' ');
    if (is_space && previous_was_space) continue;
    previous_was_space = is_space;
    (*s)[write++] = ch;
  }
  s->resize(write);
}
506
507
0
// Returns true if |s| begins with |prefix| (byte-wise, case-sensitive).
// An empty |prefix| matches every string.
bool Strings::StartsWith(std::string_view s, std::string_view prefix) {
  return s.size() >= prefix.size() &&
         s.compare(0, prefix.size(), prefix) == 0;
}
518
519
0
// Returns true if |s| ends with |suffix| (byte-wise, case-sensitive).
// An empty |suffix| matches every string.
//
// The previous hand-written loop stopped before index 0, so the first
// character of |suffix| was never compared (EndsWith("abc", "xbc") returned
// true), and an empty |suffix| indexed out of range.
bool Strings::EndsWith(std::string_view s, std::string_view suffix) {
  if (suffix.size() > s.size()) return false;
  return s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}
532
533
// Replaces the first occurrence of |from| in |*s| with |to|.
// Does nothing if |from| is empty or does not occur in |*s|. (Previously a
// missing |from| passed npos to std::string::replace, which throws
// std::out_of_range.)
void Strings::Replace(std::string* s, std::string_view from,
    std::string_view to) {
  if (from.empty()) return;

  const std::size_t pos = s->find(from);
  if (pos == std::string::npos) return;
  s->replace(pos, from.size(), to);
}
540
541
// Replaces every non-overlapping occurrence of |from| in |*s| with |to|,
// scanning left to right. Does nothing if |from| is empty.
//
// After each replacement the search resumes just past the inserted text, so
// the replacement itself is never rescanned. (Previously the search resumed
// at the start of the inserted text and never terminated when |to| contained
// |from|, e.g. replacing "a" with "aa".)
void Strings::ReplaceAll(std::string* s, std::string_view from,
                         std::string_view to) {
  if (from.empty()) return;
  std::size_t pos = s->find(from);
  while (pos != std::string::npos) {
    s->replace(pos, from.size(), to);
    pos = s->find(from, pos + to.size());
  }
}
550
551
// Replaces every character of |*s| that appears in |chars| with the string
// |to|, scanning left to right. Does nothing if |chars| is empty.
//
// The search resumes just past each inserted |to|. (Previously it restarted
// from index 0 after every replacement, which was quadratic and never
// terminated when |to| itself contained one of |chars|.)
void Strings::ReplaceAny(std::string* s, std::string_view chars,
                         std::string_view to) {
  if (chars.empty()) return;
  std::size_t pos = s->find_first_of(chars);
  while (pos != std::string::npos) {
    s->replace(pos, 1, to);
    pos = s->find_first_of(chars, pos + to.size());
  }
}
560
561
std::optional<std::string> Strings::Translate(std::string_view str,
562
                                              std::string_view abc,
563
0
                                              std::string_view xyz) {
564
  // Contains sequence of characters found in abc string.
565
0
  std::vector<char32_t> abc_bytes;
566
  // Contains sequence of characters founds in xyz string.
567
0
  std::vector<char32_t> xyz_bytes;
568
569
  // Captures the characters.
570
0
  if (!(ExtractChars(abc, &abc_bytes) &&
571
0
        ExtractChars(xyz, &xyz_bytes))) {
572
0
    return std::nullopt;
573
0
  }
574
575
  // Helper function to find out index of matching char in the abc string.
576
  // Returns -1 if char is not found.
577
0
  std::function<int(char32_t)> getCharIndex =
578
0
      [&](char32_t c) -> std::size_t {
579
0
    for (std::size_t i = 0; i < abc_bytes.size(); ++i) {
580
0
      if (abc_bytes.at(i) == c) return i;
581
0
    }
582
0
    return std::string::npos;
583
0
  };
584
585
  // Evaluate and translate.
586
0
  std::stringbuf buf;
587
0
  while (!str.empty()) {
588
0
    uint8_t new_char = str.front() & 0xff;
589
0
    if (IsOneByteASCIIChar(new_char)) {
590
0
      std::size_t i = getCharIndex(new_char);
591
0
      if (i == std::string::npos) {
592
0
        buf.sputc(new_char);
593
0
      } else if (i >= xyz_bytes.size()) {
594
        // Ignore the character. i.e. remove from translated string.
595
0
      } else {
596
        // Replacement byte can be utf-8 code.
597
0
        std::string s = EncodeUtf8Symbol(xyz_bytes.at(i)).value_or("");
598
0
        buf.sputn(s.c_str(), s.size());
599
0
      }
600
0
      str.remove_prefix(1);
601
0
      continue;
602
0
    }
603
604
0
    auto big_char_or = DecodeUtf8Symbol(&str);
605
0
    if (!big_char_or.has_value()) {
606
      // Error decoding string.
607
0
      return std::nullopt;
608
0
    }
609
0
    char32_t big_char = big_char_or.value();
610
0
    std::size_t i = getCharIndex(big_char);
611
0
    if (i == std::string::npos) {
612
0
      auto s_or = EncodeUtf8Symbol(big_char);
613
0
      if (!s_or.has_value()) return std::nullopt;
614
0
      buf.sputn(s_or.value().c_str(), s_or.value().size());
615
0
    } else if (i >= xyz_bytes.size()) {
616
      // Ignore the character. i.e. remove from translated string.
617
0
    } else {
618
0
      auto s_or = EncodeUtf8Symbol(xyz_bytes.at(i));
619
0
      if (!s_or.has_value()) return std::nullopt;
620
0
      buf.sputn(s_or.value().c_str(), s_or.value().size());
621
0
    }
622
0
  }
623
624
0
  return buf.str();
625
0
}
626
627
// Returns true if every character of |s| appears in |whitespace_chars|.
// An empty |s| yields true.
bool Strings::IsAllWhitespaceChars(std::string_view s,
      std::string_view whitespace_chars) {
  const auto first_other = s.find_first_not_of(whitespace_chars);
  return first_other == std::string_view::npos;
}
631
632
92.4k
bool Strings::EqualFold(std::string_view l, std::string_view r) {
633
119k
  while (!l.empty()) {
634
    // Reached the end of r, but more chars in l.
635
108k
    if (r.empty()) return false;
636
637
107k
    uint8_t l_char = l.front() & 0xff;
638
107k
    uint8_t r_char = r.front() & 0xff;
639
640
    // ASCII characters first.
641
107k
    if (IsOneByteASCIIChar(l_char)) {
642
101k
      if (('A' <= l_char && l_char <= 'Z') ||
643
101k
          ('a' <= l_char && l_char <= 'z')) {
644
        // Compare lower character for both the chars.
645
92.3k
        if ((l_char | 0x20) != (r_char | 0x20)) {
646
66.5k
          return false;
647
66.5k
        }
648
92.3k
      } else if (l_char != r_char) { // Compare other ascii character as-is.
649
8.29k
        return false;
650
8.29k
      }
651
652
26.4k
      l.remove_prefix(1);
653
26.4k
      r.remove_prefix(1);
654
26.4k
      continue;
655
101k
    }
656
657
6.34k
    if (!(CodePointByteSequenceCount(l_char) > 1 &&
658
6.34k
          CodePointByteSequenceCount(r_char) > 1)) {
659
2.46k
      return false;
660
2.46k
    }
661
662
3.87k
    auto l_char_opt = DecodeUtf8Symbol(&l);
663
3.87k
    auto r_char_opt = DecodeUtf8Symbol(&r);
664
665
    // Checks decoding succeeded.
666
3.87k
    if (!(l_char_opt.has_value() && r_char_opt.has_value())) return false;
667
668
1.51k
    char32_t l_char_wide = l_char_opt.value();
669
1.51k
    char32_t r_char_wide = r_char_opt.value();
670
671
    // Two characters matched. No case conversion needed.
672
1.51k
    if (l_char_wide == r_char_wide) {
673
700
      continue;
674
700
    }
675
676
    // Convert both to lowercase.
677
811
    l_char_wide = ToLowerChar(l_char_wide);
678
811
    r_char_wide = ToLowerChar(r_char_wide);
679
680
811
    if (l_char_wide != r_char_wide) return false;
681
811
  }
682
683
  // Checks all the bytes are processed in both the strings. If some bytes
684
  // left in either string, they are not equal.
685
11.4k
  return l.empty() && r.empty();
686
92.4k
}
687
688
// Splits |s| at every occurrence of |delimiter| and returns the pieces as
// owned strings. Empty pieces (adjacent, leading or trailing delimiters) are
// skipped, so an empty or all-delimiter input yields an empty vector.
std::vector<std::string> Strings::SplitStringAt(
      std::string_view s, char delimiter) {
  std::vector<std::string> columns;
  std::size_t begin = 0;

  while (begin < s.size()) {
    const std::size_t end = s.find_first_of(delimiter, begin);

    // substr() clamps the length, so end == npos takes the rest of |s|.
    if (end != begin) {
      columns.emplace_back(s.substr(begin, end - begin));
    }

    if (end == std::string_view::npos) break;

    begin = end + 1;
  }

  return columns;
}
707
708
std::vector<std::string_view> Strings::SplitStrAtUtf8Whitespace(
709
0
    std::string_view s) {
710
0
  std::vector<std::string_view> columns;
711
0
  std::size_t start = 0;
712
0
  std::size_t end = 0;
713
0
  while (end < s.size()) {
714
0
    auto num_ws = IsUtf8WhiteSpaceChar(s, end);
715
0
    if (num_ws > 0) {
716
0
      if (start < end) {
717
0
        columns.emplace_back(s.substr(start, end - start));
718
0
      }
719
0
      start = end + num_ws;
720
0
      end = start;
721
0
    } else {
722
0
      end++;
723
0
    }
724
0
  }
725
0
  columns.emplace_back(s.substr(start, s.size()));
726
0
  return columns;
727
0
}
728
729
0
int Strings::IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position) {
730
0
  std::size_t i = position;
731
0
  int state = 0;
732
0
  while (i < s.size()) {
733
0
    uint8_t c = s.at(i++);
734
0
    state = kWhitespaceTable[state][c];
735
736
0
    if (state == 0) {
737
0
      return 0;
738
0
    }
739
740
0
    if (state == 1) {
741
0
      return i - position;
742
0
    }
743
0
  }
744
745
0
  return 0;
746
0
}
747
748
0
int Strings::CountTerms(std::string_view s) {
749
0
  bool in_term = false;
750
0
  int num_terms = 0;
751
0
  while (!s.empty()) {
752
0
    unsigned char c = s.front();
753
0
    s.remove_prefix(1);
754
    // whitespace and punctuations.
755
0
    if ((kPropertyBits[c] & 0x08) != 0 || (kPropertyBits[c] & 0x10) != 0) {
756
0
      in_term = false;
757
0
    } else if (!in_term) {
758
      // First character of a term
759
0
      ++num_terms;
760
0
      in_term = true;
761
0
    }
762
0
  }
763
0
  return num_terms;
764
0
}
765
766
namespace {
767
768
// Reads an entity like "&lt;" from b[src:] and writes the corresponding "<"
769
// to b[dst:], returning the incremented dst and src cursors.
770
// Precondition: b[src] == '&' && dst <= src.
771
// attribute should be true if passing an attribute value.
772
std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
773
524k
    bool attribute) {
774
524k
  std::string s = b->substr(src);
775
524k
  if (s.size() <= 1) {
776
86
    b->at(dst) = b->at(src);
777
86
    return std::pair<int, int>(dst + 1, src + 1);
778
86
  }
779
780
  // i starts at 1 because we already know that s[0] == '&'.
781
524k
  std::size_t i = 1;
782
524k
  if (s.at(i) == '#') {
783
428k
    if (s.size() <= 3) {  // We need to have at least  "&#.".
784
90
      b->at(dst) = b->at(src);
785
90
      return std::pair<int, int>(dst + 1, src + 1);
786
90
    }
787
428k
    i++;
788
428k
    auto c = s.at(i);
789
428k
    bool hex = false;
790
428k
    if (c == 'x' || c == 'X') {
791
404k
      hex = true;
792
404k
      i++;
793
404k
    }
794
795
428k
    char32_t x = '\x00';
796
909k
    while (i < s.size()) {
797
909k
      auto c = s.at(i);
798
909k
      i++;
799
909k
      if (hex) {
800
866k
        if (Strings::IsDigit(c)) {
801
2.84k
          x = (16 * x) | (c - '0');
802
2.84k
          continue;
803
863k
        } else if ('a' <= c && c <= 'f') {
804
457k
          x = 16 * x + c - 'a' + 10;
805
457k
          continue;
806
457k
        } else if ('A' <= c && c <= 'F') {
807
2.50k
          x = 16 * x + c - 'A' + 10;
808
2.50k
          continue;
809
2.50k
        }
810
866k
      } else if (Strings::IsDigit(c)) {
811
18.5k
        x = 10 * x + c - '0';
812
18.5k
        continue;
813
18.5k
      }
814
427k
      if (c != ';') {
815
426k
          i--;
816
426k
      }
817
427k
      break;
818
909k
    }
819
820
428k
    if (i <= 3) {  // No characters matched.
821
420k
      b->at(dst) = b->at(src);
822
420k
      return std::pair<int, int>(dst + 1, src + 1);
823
420k
    }
824
825
7.82k
    if (0x80 <= x && x <= 0x9F) {
826
      // Replace characters from Windows-1252 with UTF-8 equivalents.
827
232
      x = kReplacementTable[x - 0x80];
828
7.59k
    } else if (x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF) {
829
      // Replace invalid characters with the replacement chracter.
830
3.00k
      x = L'\uFFFD';
831
3.00k
    }
832
833
7.82k
    auto encoded_bytes = Strings::EncodeUtf8Symbol(x);
834
7.82k
    if (encoded_bytes.has_value()) {
835
7.82k
      std::transform(encoded_bytes.value().begin(),
836
7.82k
          encoded_bytes.value().end(), b->begin() + dst,
837
16.6k
          [](uint8_t c) -> char { return static_cast<char>(c); });
838
7.82k
      return std::pair<int, int>(dst + encoded_bytes.value().size(), src + i);
839
7.82k
    }
840
7.82k
  }
841
842
  // Consume the maximum number of chracters possible, with the consumed
843
  // characters matching one of the named references.
844
1.61M
  while (i < s.size()) {
845
1.61M
    auto c = s.at(i);
846
1.61M
    i++;
847
    // Lower-cased characters are more common in entities, so we check for
848
    // them first.
849
1.61M
   if (Strings::IsCharAlphabet(c) || Strings::IsDigit(c)) {
850
1.52M
     continue;
851
1.52M
   }
852
95.4k
   if (c != ';') {
853
90.5k
     i--;
854
90.5k
   }
855
95.4k
   break;
856
1.61M
  }
857
858
96.6k
  std::string entityName = s.substr(1, i - 1);
859
96.6k
  auto encoded_bytes = EntityLookup(entityName);
860
96.6k
  if (entityName.empty()) {
861
    // No-op.
862
64.5k
  } else if (attribute && entityName.at(entityName.size() - 1) != ';' &&
863
32.0k
      s.size() > i && s.at(i) == '=') {
864
    // No-op.
865
31.6k
  } else if (!encoded_bytes.empty()) {
866
476
    int overflow = encoded_bytes.size() - entityName.size() - 1 /* & */;
867
476
    if (overflow > 0) {
868
      // Insert some dummy chars which will get occupied by overflow entity
869
      // chars.
870
      // Suppose &xy; = \x1\x2\x3\x4\x5 (5 bytes char)
871
      // abc&xy;def (10 bytes) after this statement is:
872
      // abc&xy; def (11 bytes).
873
      // After unescape: abc\x1\x2\x3\x4\x5def (11 bytes).
874
49
      b->insert(src + encoded_bytes.size() - 1, " ", overflow);
875
49
    }
876
    // Copies the unescaped bytes to the destination,
877
476
    std::transform(encoded_bytes.begin(), encoded_bytes.end(), b->begin() + dst,
878
749
        [](uint8_t c) -> char { return static_cast<char>(c); });
879
476
    return std::pair<int, int>(
880
476
        dst + encoded_bytes.size() - (overflow > 0 ? overflow : 0), src + i);
881
31.1k
  } else if (!attribute) {
882
10.0k
    int max_length = entityName.size() - 1;
883
10.0k
    if (max_length > kLongestEntityWithoutSemiColon) {
884
6.10k
      max_length = kLongestEntityWithoutSemiColon;
885
6.10k
    }
886
44.0k
    for (int j = max_length; j > 1; --j) {
887
34.4k
      auto encoded_bytes = EntityLookup(entityName.substr(0, j));
888
34.4k
      if (!encoded_bytes.empty()) {
889
470
        std::transform(encoded_bytes.begin(), encoded_bytes.end(),
890
855
                       b->begin() + dst, [](uint8_t c) -> char {
891
855
                         return static_cast<char>(c); });
892
470
        return std::pair<int, int>(dst + encoded_bytes.size(), src + j + 1);
893
470
      }
894
34.4k
    }
895
10.0k
  }
896
897
95.6k
  std::copy(b->begin() + src, b->begin() + src + i, b->begin() + dst);
898
95.6k
  return std::pair<int, int>(dst + i, src + i);
899
96.6k
}
900
901
13.7M
void CaseTransformInternal(bool to_upper, std::string* s) {
902
67.0M
  for (std::size_t i = 0; i < s->size(); ++i) {
903
904
53.2M
    uint8_t code_point = s->at(i) & 0xff;
905
906
    // ASCII characters first.
907
53.2M
    if (IsOneByteASCIIChar(code_point)) {
908
51.3M
      auto c = to_upper ? ToUpperChar(code_point) : ToLowerChar(code_point);
909
51.3M
      if (c != code_point) {
910
12.8M
        s->at(i) = static_cast<char>(c);
911
12.8M
      }
912
51.3M
      continue;
913
51.3M
    }
914
915
1.90M
    if (Strings::CodePointByteSequenceCount(code_point) > 1) {
916
1.28M
      std::string_view sv = *s;
917
1.28M
      sv.remove_prefix(i);
918
1.28M
      auto decoded = Strings::DecodeUtf8Symbol(&sv);
919
1.28M
      if (decoded.has_value()) {
920
890k
        char32_t decode_value = decoded.value();
921
890k
        auto c =
922
890k
            to_upper ? ToUpperChar(decode_value) : ToLowerChar(decode_value);
923
890k
        if (c != decode_value) {
924
3.45k
          auto char_encoded = Strings::EncodeUtf8Symbol(c);
925
3.45k
          if (char_encoded.has_value()) {
926
3.45k
            std::transform(char_encoded.value().begin(),
927
3.45k
                char_encoded.value().end(), s->begin() + i,
928
9.58k
                [](uint8_t c) -> char { return static_cast<char>(c); });
929
3.45k
          }
930
3.45k
        }
931
890k
      }
932
1.28M
    }
933
1.90M
  }
934
13.7M
}
935
936
1.54M
// Reads one UTF-8 continuation byte.
// If |byte| has the form 0b10xxxxxx, stores its six payload bits in |*out|
// and returns true. Otherwise returns false and leaves |*out| untouched.
bool ReadContinuationByte(uint8_t byte, uint8_t* out) {
  // The two most significant bits of a continuation byte are "10".
  const bool is_continuation = (byte >> 6) == 0x02;
  if (!is_continuation) return false;

  // Keep the last six bits, 0b00xxxxxx.
  *out = byte & 0x3f;
  return true;
}
947
948
54.6M
// Returns true if |c| is a one-byte ASCII character, i.e. its high bit is
// clear (values 0-127).
inline bool IsOneByteASCIIChar(uint8_t c) {
  return c < 0x80;
}
951
952
0
bool ExtractChars(std::string_view str, std::vector<char32_t>* chars) {
953
0
  while (!str.empty()) {
954
0
    uint8_t c = str.front() & 0xff;
955
956
    // ASCII chracters first.
957
0
    if (IsOneByteASCIIChar(c)) {
958
0
      chars->push_back(c);
959
0
      str.remove_prefix(1);
960
0
      continue;
961
    // Check if this character is member of codepoint sequence.
962
0
    } else if (Strings::CodePointByteSequenceCount(c) > 1) {
963
      // Decode moves the string view prefix so no need to remove prefix
964
      // manually.
965
0
      auto old_big_char = Strings::DecodeUtf8Symbol(&str);
966
0
      if (!old_big_char.has_value()) {
967
        // Error decoding string.
968
0
        chars->clear();
969
0
        return false;
970
0
      }
971
0
      chars->push_back(old_big_char.value());
972
0
    } else {
973
      // Unknown character type.
974
0
      chars->clear();
975
0
      return false;
976
0
    }
977
0
  }
978
0
  return true;
979
0
}
980
981
0
// Parses |hex_code|, one or two hexadecimal digits (either case), into
// |*out|. "ff" -> 255, "8d" -> 141.
//
// Returns false and leaves |*out| untouched if |hex_code| is empty, is
// longer than two characters (the value would not fit in one byte), or
// contains a character that is not a hex digit. (Previously an empty input
// was reported as success with value 0.)
bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out) {
  if (hex_code.empty() || hex_code.size() > 2) return false;

  uint8_t value = 0;
  for (const char h : hex_code) {
    uint8_t digit = 0;
    if ('0' <= h && h <= '9') {
      digit = static_cast<uint8_t>(h - '0');
    } else if ('a' <= h && h <= 'f') {
      digit = static_cast<uint8_t>(h - 'a' + 10);
    } else if ('A' <= h && h <= 'F') {
      digit = static_cast<uint8_t>(h - 'A' + 10);
    } else {
      // Invalid hex code eg. %2x or %m8
      return false;
    }
    value = static_cast<uint8_t>(16 * value + digit);
  }
  *out = value;
  return true;
}
1002
1003
}  // namespace
1004
1005
}  // namespace htmlparser