/proc/self/cwd/cpp/htmlparser/strings.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "cpp/htmlparser/strings.h" |
2 | | |
3 | | #include <algorithm> |
4 | | #include <array> |
5 | | #include <functional> |
6 | | #include <sstream> |
7 | | #include <tuple> |
8 | | #include "cpp/htmlparser/casetable.h" |
9 | | #include "cpp/htmlparser/entity.h" |
10 | | #include "cpp/htmlparser/whitespacetable.h" |
11 | | |
12 | | namespace htmlparser { |
13 | | |
// Numeric character references in the range 0x80-0x9F are remapped through
// this table for compatibility with legacy content that assumed Windows-1252.
// Entry k holds the code point that replaces the reference 0x80 + k.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
//
// Not listed here: 0x00 -> U+FFFD is handled programmatically, and
// 0x0D -> U+000D is a no-op.
constexpr std::array<char32_t, 32> kReplacementTable{
    // 0x80 - 0x87
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    // 0x88 - 0x8F
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    // 0x90 - 0x97
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    // 0x98 - 0x9F
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
};
53 | | |
// Per-byte ASCII property flags, copied from
// https://github.com/abseil/abseil-cpp/blob/master/absl/strings/ascii.cc
// Only the first 128 entries are spelled out; the remaining 128 are
// zero-initialized. Within this file the 0x08 and 0x10 bits are read by
// CountTerms() as the whitespace and punctuation flags.
constexpr std::array<unsigned char, 256> kPropertyBits{
    // 0x00 - 0x0F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
    // 0x10 - 0x1F
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    // 0x20 - 0x2F
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x30 - 0x3F
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x40 - 0x4F
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x50 - 0x5F
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
    // 0x60 - 0x6F
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    // 0x70 - 0x7F
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
};
73 | | |
74 | | |
// Forward declarations of the file-local helpers
// ==============================================
namespace {

// Reports whether c is a single-byte ASCII character, i.e. in range 1-127.
inline bool IsOneByteASCIIChar(uint8_t c);

// Reads the next byte of a multi-byte utf-8 sequence into *out. Returns false
// if the byte is not a valid next byte of the sequence.
bool ReadContinuationByte(uint8_t byte, uint8_t* out);

// Collects every character of str (multi-byte characters included) into
// *chars. Returns false if the utf-8 bytes inside str cannot be decoded.
bool ExtractChars(std::string_view str, std::vector<char32_t>* chars);

// Parses a one byte hex code into its integer value: 0xFF gives 255, 0x8d
// gives 141 etc. Exception safe alternative to std::stoi and friends.
bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out);

// Changes the case of *s following the character map in the case conversion
// table; upper-cases when to_upper is true, lower-cases otherwise.
void CaseTransformInternal(bool to_upper, std::string* s);

// Unescapes one entity in place: "&lt;html&gt;" becomes "<html>". The
// unescaped text may be shorter than the escaped original.
std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src,
                                   bool attribute = false);

}  // namespace.
105 | | |
106 | | std::optional<std::string> Strings::DecodePercentEncodedURL( |
107 | 0 | std::string_view uri) { |
108 | 0 | if (uri.empty()) return ""; |
109 | | |
110 | 0 | std::stringbuf uri_decoded; |
111 | 0 | while (!uri.empty()) { |
112 | 0 | if (uri.front() != '%') { |
113 | 0 | uri_decoded.sputc(uri.front()); |
114 | 0 | uri.remove_prefix(1); |
115 | 0 | continue; |
116 | 0 | } |
117 | | |
118 | 0 | uint8_t x1 = 0; |
119 | 0 | if (uri.size() < 3 || |
120 | 0 | !OneByteHexCodeToInt(uri.substr(1, 2), &x1)) { |
121 | 0 | return std::nullopt; |
122 | 0 | } |
123 | | |
124 | | // Consumed the first three percent encoded chars. eg. %a8. |
125 | 0 | uri.remove_prefix(3); |
126 | | |
127 | | // Sequence byte without initial byte. |
128 | 0 | if ((x1 & 0xc0) == 0x80) return std::nullopt; |
129 | | |
130 | 0 | auto num_bytes = Strings::CodePointByteSequenceCount(x1); |
131 | 0 | uri_decoded.sputc(x1); |
132 | 0 | if (num_bytes == 1) { |
133 | | // Single byte char must be signed char. |
134 | 0 | if (x1 > 127) return std::nullopt; |
135 | 0 | continue; |
136 | 0 | } |
137 | | |
138 | | // 2 bytes sequence. |
139 | 0 | if (num_bytes > 1) { |
140 | 0 | uint8_t x2 = 0; |
141 | 0 | if (uri.size() < 3 || |
142 | 0 | uri.front() != '%' || |
143 | 0 | !OneByteHexCodeToInt(uri.substr(1, 2), &x2) || |
144 | 0 | (x2 & 0xc0) != 0x80) { |
145 | 0 | return std::nullopt; |
146 | 0 | } |
147 | 0 | uri.remove_prefix(3); |
148 | 0 | uri_decoded.sputc(x2); |
149 | 0 | } |
150 | | |
151 | | // 3 byte sequence. |
152 | 0 | if (num_bytes > 2) { |
153 | 0 | uint8_t x3 = 0; |
154 | 0 | if (uri.size() < 3 || |
155 | 0 | uri.front() != '%' || |
156 | 0 | !OneByteHexCodeToInt(uri.substr(1, 2), &x3) || |
157 | 0 | (x3 & 0xc0) != 0x80) { |
158 | 0 | return std::nullopt; |
159 | 0 | } |
160 | 0 | uri.remove_prefix(3); |
161 | 0 | uri_decoded.sputc(x3); |
162 | 0 | } |
163 | | |
164 | | // 4 byte sequence. |
165 | 0 | if (num_bytes > 3) { |
166 | 0 | uint8_t x4 = 0; |
167 | 0 | if (uri.size() < 3 || |
168 | 0 | uri.front() != '%' || |
169 | 0 | !OneByteHexCodeToInt(uri.substr(1, 2), &x4) || |
170 | 0 | (x4 & 0xc0) != 0x80) { |
171 | 0 | return std::nullopt; |
172 | 0 | } |
173 | 0 | uri.remove_prefix(3); |
174 | 0 | uri_decoded.sputc(x4); |
175 | 0 | } |
176 | 0 | } |
177 | | |
178 | 0 | return uri_decoded.str(); |
179 | 0 | } |
180 | | |
181 | 17.4M | bool Strings::IsCharAlphabet(char c) { |
182 | 17.4M | return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); |
183 | 17.4M | } |
184 | | |
185 | | // Returns true if character is char 0-9. |
186 | 1.24M | bool Strings::IsDigit(char c) { |
187 | 1.24M | return '0' <= c && c <= '9'; |
188 | 1.24M | } |
189 | | |
190 | 2.38M | void Strings::ConvertNewLines(std::string* s) { |
191 | 32.1M | for (std::size_t i = 0; i < s->size(); i++) { |
192 | 29.7M | char c = s->at(i); |
193 | 29.7M | if (!(c == '\r' || c == '\f')) continue; |
194 | | |
195 | | // Converts any lone \r that is not followed by \n to \n. |
196 | | // \r\rfoo becomes \n\nfoo. |
197 | | // \r\r\nfoo becomes \n\nfoo. |
198 | | // \r\f\r\nfoo becomes \n\n\nfoo |
199 | 5.53M | std::size_t next = i + 1; |
200 | 5.53M | if (c == '\r') { |
201 | 5.49M | if (next >= s->size() || s->at(next) != '\n') { |
202 | 5.48M | (*s)[i] = '\n'; |
203 | 5.48M | continue; |
204 | 5.48M | } |
205 | 5.49M | } |
206 | | |
207 | 49.2k | if (c == '\f') { |
208 | 41.3k | (*s)[i] = '\n'; |
209 | 41.3k | continue; |
210 | 41.3k | } |
211 | | |
212 | 7.87k | int dest = i; |
213 | 1.79M | while (next < s->size()) { |
214 | 1.78M | if (s->at(next) == '\r') { |
215 | 212k | if ((next + 1) < s->size() && s->at(next + 1) == '\n') { |
216 | 1.75k | next++; |
217 | 1.75k | } |
218 | 212k | (*s)[dest] = '\n'; |
219 | 1.56M | } else { |
220 | 1.56M | (*s)[dest] = s->at(next); |
221 | 1.56M | } |
222 | 1.78M | next++; |
223 | 1.78M | dest++; |
224 | 1.78M | } |
225 | 7.87k | s->resize(dest); |
226 | 7.87k | } |
227 | 2.38M | } |
228 | | |
229 | 0 | std::string Strings::ToHexString(uint32_t c) { |
230 | 0 | std::stringstream ss; |
231 | 0 | ss << "0x" << std::hex << c; |
232 | 0 | return ss.str(); |
233 | 0 | } |
234 | | |
235 | 175M | int8_t Strings::CodePointByteSequenceCount(uint8_t c) { |
236 | 175M | if ((c & 0x80) == 0) return 1; // Ascii char. |
237 | 16.8M | if ((c & 0xe0) == 0xc0) return 2; // 2 bytes sequence. |
238 | 15.2M | if ((c & 0xf0) == 0xe0) return 3; // 3 bytes sequence. |
239 | 14.6M | if ((c & 0xf8) == 0xf0) return 4; // 4 bytes sequence. |
240 | | |
241 | | |
242 | | // Defaults to 1 byte ascii. |
243 | 1.83M | return 1; |
244 | 14.6M | } |
245 | | |
246 | 0 | int8_t Strings::CodePointNumBytes(char32_t c) { |
247 | 0 | if (c & 0xffffff80) return 1; |
248 | 0 | if (c & 0xfffff800) return 2; |
249 | 0 | if (c & 0xffff0000) return 3; |
250 | 0 | if (c & 0xffe00000) return 4; |
251 | | |
252 | | // Defaults to 1 byte ascii. |
253 | 0 | return 1; |
254 | 0 | } |
255 | | |
256 | 1.29M | std::optional<char32_t> Strings::DecodeUtf8Symbol(std::string_view* s) { |
257 | 1.29M | if (!s || s->empty()) { |
258 | 0 | return std::nullopt; |
259 | 0 | } |
260 | | |
261 | | // Checks first byte is valid. |
262 | 1.29M | uint8_t c = *(s->data()) & 0xff; |
263 | | |
264 | | // 1 byte sequence. |
265 | 1.29M | if (IsOneByteASCIIChar(c)) { |
266 | 0 | s->remove_prefix(1); |
267 | 0 | return c; |
268 | 0 | } |
269 | | |
270 | 1.29M | if (!(CodePointByteSequenceCount(c) > 1)) { |
271 | 0 | return std::nullopt; |
272 | 0 | } |
273 | | |
274 | | // 2 byte sequence. |
275 | 1.29M | if ((c & 0xe0) == 0xc0) { |
276 | 521k | if (s->size() < 2) return std::nullopt; |
277 | 519k | s->remove_prefix(1); |
278 | 519k | uint8_t c2; |
279 | 519k | bool c2_ok = ReadContinuationByte(*(s->data()), &c2); |
280 | 519k | s->remove_prefix(1); |
281 | | // Invalid byte in the sequence. |
282 | 519k | if (!c2_ok) return L'\uFFFD'; |
283 | 12.3k | char32_t code_point = ((c & 0x1f) << 6) | c2; |
284 | 12.3k | if (code_point < 0x80) { |
285 | 8.88k | return std::nullopt; |
286 | 8.88k | } |
287 | 3.41k | return code_point; |
288 | 12.3k | } |
289 | | |
290 | | // 3 byte sequence. |
291 | 770k | if ((c & 0xf0) == 0xe0) { |
292 | 135k | if (s->size() < 3) return std::nullopt; |
293 | 133k | s->remove_prefix(1); |
294 | 133k | uint8_t c2; |
295 | 133k | bool c2_ok = ReadContinuationByte(*(s->data()), &c2); |
296 | 133k | s->remove_prefix(1); |
297 | 133k | uint8_t c3; |
298 | 133k | bool c3_ok = ReadContinuationByte(*(s->data()), &c3); |
299 | 133k | s->remove_prefix(1); |
300 | | // Invalid bytes in the sequence. |
301 | 133k | if (!(c2_ok && c3_ok)) return L'\uFFFD'; |
302 | 129k | char32_t code_point = ((c & 0x0f) << 12) | (c2 << 6) | c3; |
303 | 129k | if (code_point < 0x0800) { |
304 | 78 | return std::nullopt; |
305 | 78 | } |
306 | | // Check if this is codepoint is low surrgates. |
307 | 129k | if (code_point >= 0xd800 && code_point <= 0xdfff) { |
308 | 1.19k | return std::nullopt; |
309 | 1.19k | } |
310 | | |
311 | 128k | return code_point; |
312 | 129k | } |
313 | | |
314 | | // 4 byte sequence. |
315 | 634k | if ((c & 0xf8) == 0xf0) { |
316 | 634k | if (s->size() < 4) return std::nullopt; |
317 | 252k | s->remove_prefix(1); |
318 | 252k | uint8_t c2; |
319 | 252k | bool c2_ok = ReadContinuationByte(*(s->data()), &c2); |
320 | 252k | s->remove_prefix(1); |
321 | 252k | uint8_t c3; |
322 | 252k | bool c3_ok = ReadContinuationByte(*(s->data()), &c3); |
323 | 252k | s->remove_prefix(1); |
324 | 252k | uint8_t c4; |
325 | 252k | bool c4_ok = ReadContinuationByte(*(s->data()), &c4); |
326 | 252k | s->remove_prefix(1); |
327 | | // Invalid bytes in the sequence. |
328 | 252k | if (!(c2_ok && c3_ok && c4_ok)) return L'\uFFFD'; |
329 | 25.5k | char32_t code_point = ((c & 0x07) << 0x12) | |
330 | 25.5k | (c2 << 0x0c) | |
331 | 25.5k | (c3 << 0x06) | c4; |
332 | 25.5k | if (!(code_point >= 0x010000 && code_point <= 0x10ffff)) { |
333 | 343 | return std::nullopt; |
334 | 343 | } |
335 | 25.2k | return code_point; |
336 | 25.5k | } |
337 | | |
338 | 0 | return std::nullopt; |
339 | 634k | } |
340 | | |
341 | 11.2k | std::optional<std::string> Strings::EncodeUtf8Symbol(char32_t code_point) { |
342 | 11.2k | if ((code_point & 0xffffff80) == 0) { // 1 byte sequence. |
343 | 3.59k | return std::string{static_cast<char>(code_point)}; |
344 | 7.69k | } else if ((code_point & 0xfffff800) == 0) { // 2 byte sequence. |
345 | 2.05k | return std::string{ |
346 | 2.05k | static_cast<char>((code_point >> 6) | 0xc0), |
347 | 2.05k | static_cast<char>((code_point & 0x3f) | 0x80) |
348 | 2.05k | }; |
349 | 5.64k | } else if ((code_point & 0xffff0000) == 0) { // 3 byte sequence. |
350 | 3.97k | return std::string{ |
351 | 3.97k | static_cast<char>((code_point >> 12) | 0xe0), |
352 | 3.97k | static_cast<char>(((code_point >> 6) & 0x3f) | 0x80), |
353 | 3.97k | static_cast<char>((code_point & 0x3f) | 0x80) |
354 | 3.97k | }; |
355 | 3.97k | } else if ((code_point & 0xffe00000) == 0) { // 4 byte sequence. |
356 | 1.66k | return std::string{ |
357 | 1.66k | static_cast<char>((code_point >> 18) | 0xf0), |
358 | 1.66k | static_cast<char>(((code_point >> 12) & 0x3f) | 0x80), |
359 | 1.66k | static_cast<char>(((code_point >> 6) & 0x3f) | 0x80), |
360 | 1.66k | static_cast<char>((code_point & 0x3f) | 0x80) |
361 | 1.66k | }; |
362 | 1.66k | } |
363 | | |
364 | 0 | return std::nullopt; |
365 | 11.2k | } |
366 | | |
367 | 0 | std::string Strings::EscapeString(std::string_view s) { |
368 | 0 | std::stringbuf buffer; |
369 | 0 | Escape(s, &buffer); |
370 | 0 | return buffer.str(); |
371 | 0 | } |
372 | | |
373 | | |
374 | 0 | void Strings::Escape(std::string_view s, std::stringbuf* escaped) { |
375 | 0 | for (auto c : s) { |
376 | 0 | if (kEscapeChars.find(c) == std::string::npos) { |
377 | 0 | escaped->sputc(c); |
378 | 0 | continue; |
379 | 0 | } |
380 | | |
381 | 0 | std::string esc = ""; |
382 | 0 | switch (c) { |
383 | 0 | case '"': |
384 | 0 | esc = """; |
385 | 0 | break; |
386 | 0 | case '&': |
387 | 0 | esc = "&"; |
388 | 0 | break; |
389 | | // "'" is shorter than "'" and apos was not in HTML until |
390 | | // HTML5. |
391 | 0 | case '\'': |
392 | 0 | esc = "'"; |
393 | 0 | break; |
394 | 0 | case '<': |
395 | 0 | esc = "<"; |
396 | 0 | break; |
397 | 0 | case '>': |
398 | 0 | esc = ">"; |
399 | 0 | break; |
400 | 0 | default: |
401 | 0 | continue; |
402 | 0 | } |
403 | 0 | escaped->sputn(esc.c_str(), esc.size()); |
404 | 0 | } |
405 | 0 | } |
406 | | |
407 | 2.38M | void Strings::UnescapeString(std::string* s, bool attribute) { |
408 | 2.38M | if (s->empty()) return; |
409 | 1.96M | std::size_t src, dst = 0; |
410 | 9.42M | for (std::size_t i = 0; i < s->size() - 1; i++) { |
411 | 7.46M | if (s->at(i) == '&') { |
412 | 2.93k | std::tie(dst, src) = UnescapeEntity(s, i, i, attribute); |
413 | 8.77M | while (src < s->size()) { |
414 | 8.77M | auto c = s->at(src); |
415 | 8.77M | if (c == '&') { |
416 | 521k | std::tie(dst, src) = UnescapeEntity(s, dst, src, attribute); |
417 | 8.25M | } else { |
418 | 8.25M | s->at(dst) = c; |
419 | 8.25M | std::tie(dst, src) = std::tuple<int, int>(dst + 1, src + 1); |
420 | 8.25M | } |
421 | 8.77M | } |
422 | 2.93k | return s->resize(dst); |
423 | 2.93k | } |
424 | 7.46M | } |
425 | 1.96M | } |
426 | | |
427 | 13.7M | void Strings::ToLower(std::string* s) { |
428 | 13.7M | CaseTransformInternal(false, s); |
429 | 13.7M | } |
430 | | |
431 | 0 | void Strings::ToUpper(std::string* s) { |
432 | 0 | CaseTransformInternal(true, s); |
433 | 0 | } |
434 | | |
435 | | std::size_t Strings::IndexAny(const std::string_view s, |
436 | 106 | std::string_view chars) { |
437 | 106 | return s.find_first_of(chars); |
438 | 106 | } |
439 | | |
440 | 14.7k | void Strings::TrimLeft(std::string* s, std::string_view chars_to_trim) { |
441 | 14.7k | s->erase(0, s->find_first_not_of(chars_to_trim)); |
442 | 14.7k | } |
443 | | |
444 | 0 | void Strings::TrimRight(std::string* s, std::string_view chars_to_trim) { |
445 | 0 | s->erase(s->find_last_not_of(chars_to_trim) + 1); |
446 | 0 | } |
447 | | |
448 | 0 | void Strings::Trim(std::string* s, std::string_view chars_to_trim) { |
449 | 0 | TrimLeft(s, chars_to_trim); |
450 | 0 | TrimRight(s, chars_to_trim); |
451 | 0 | } |
452 | | |
453 | 0 | void Strings::TrimLeft(std::string_view* s, std::string_view chars_to_trim) { |
454 | 0 | if (auto count = s->find_first_not_of(chars_to_trim); |
455 | 0 | count != std::string_view::npos) { |
456 | 0 | s->remove_prefix(count); |
457 | 0 | } else { |
458 | | // All whitespace. |
459 | 0 | s->remove_prefix(s->size()); |
460 | 0 | } |
461 | 0 | } |
462 | | |
463 | 0 | void Strings::TrimRight(std::string_view* s, std::string_view chars_to_trim) { |
464 | 0 | if (auto count = s->find_last_not_of(chars_to_trim); |
465 | 0 | count != std::string_view::npos) { |
466 | 0 | s->remove_suffix(s->size() - count - 1); |
467 | 0 | } else { |
468 | | // All whitespace. |
469 | 0 | s->remove_suffix(s->size()); |
470 | 0 | } |
471 | 0 | } |
472 | | |
473 | 0 | void Strings::Trim(std::string_view* s, std::string_view chars_to_trim) { |
474 | 0 | TrimLeft(s, chars_to_trim); |
475 | 0 | TrimRight(s, chars_to_trim); |
476 | 0 | } |
477 | | |
478 | 0 | bool Strings::StripTrailingNewline(std::string* s) { |
479 | 0 | if (!s->empty() && (*s)[s->size() - 1] == '\n') { |
480 | 0 | if (s->size() > 1 && (*s)[s->size() - 2] == '\r') |
481 | 0 | s->resize(s->size() - 2); |
482 | 0 | else |
483 | 0 | s->resize(s->size() - 1); |
484 | 0 | return true; |
485 | 0 | } |
486 | 0 | return false; |
487 | 0 | } |
488 | | |
489 | 0 | void Strings::RemoveExtraSpaceChars(std::string* s) { |
490 | 0 | int put_index = 0; |
491 | 0 | bool ignore_next_space_char = false; |
492 | 0 | for (std::size_t i = 0; i < s->size(); ++i) { |
493 | 0 | if (s->at(i) == ' ') { |
494 | | // Previous character was a space, so ignore this char. |
495 | 0 | if (ignore_next_space_char) { |
496 | 0 | continue; |
497 | 0 | } |
498 | 0 | ignore_next_space_char = true; |
499 | 0 | } else { |
500 | 0 | ignore_next_space_char = false; |
501 | 0 | } |
502 | 0 | s->at(put_index++) = s->at(i); |
503 | 0 | } |
504 | 0 | s->resize(put_index); |
505 | 0 | } |
506 | | |
507 | 0 | bool Strings::StartsWith(std::string_view s, std::string_view prefix) { |
508 | 0 | if (prefix.size() > s.size()) return false; |
509 | | |
510 | 0 | for (std::size_t i = 0; i < prefix.size(); ++i) { |
511 | 0 | uint8_t c1 = prefix.at(i) & 0xff; |
512 | 0 | uint8_t c2 = s.at(i) & 0xff; |
513 | 0 | if (c1 != c2) return false; |
514 | 0 | } |
515 | | |
516 | 0 | return true; |
517 | 0 | } |
518 | | |
519 | 0 | bool Strings::EndsWith(std::string_view s, std::string_view suffix) { |
520 | 0 | if (suffix.size() > s.size()) return false; |
521 | | |
522 | 0 | std::size_t i; |
523 | 0 | std::size_t j; |
524 | 0 | for (i = suffix.size() - 1, j = s.size() - 1; i > 0; --i) { |
525 | 0 | uint8_t c1 = suffix.at(i) & 0xff; |
526 | 0 | uint8_t c2 = s.at(j--) & 0xff; |
527 | 0 | if (c1 != c2) return false; |
528 | 0 | } |
529 | | |
530 | 0 | return true; |
531 | 0 | } |
532 | | |
533 | | void Strings::Replace(std::string* s, std::string_view from, |
534 | 0 | std::string_view to) { |
535 | 0 | if (from.empty()) return; |
536 | | |
537 | 0 | std::size_t i = s->find(from); |
538 | 0 | s->replace(i, from.size(), to); |
539 | 0 | } |
540 | | |
541 | | void Strings::ReplaceAll(std::string* s, std::string_view from, |
542 | 0 | std::string_view to) { |
543 | 0 | if (from.empty()) return; |
544 | 0 | std::size_t i = s->find(from); |
545 | 0 | while (i != std::string::npos) { |
546 | 0 | s->replace(i, from.size(), to); |
547 | 0 | i = s->find(from, i); |
548 | 0 | } |
549 | 0 | } |
550 | | |
551 | | void Strings::ReplaceAny(std::string* s, std::string_view chars, |
552 | 2.11M | std::string_view to) { |
553 | 2.11M | if (chars.empty()) return; |
554 | 2.11M | std::size_t i = s->find_first_of(chars); |
555 | 2.18M | while (i != std::string::npos) { |
556 | 70.7k | s->replace(i, 1, to); |
557 | 70.7k | i = s->find_first_of(chars); |
558 | 70.7k | } |
559 | 2.11M | } |
560 | | |
561 | | std::optional<std::string> Strings::Translate(std::string_view str, |
562 | | std::string_view abc, |
563 | 0 | std::string_view xyz) { |
564 | | // Contains sequence of characters found in abc string. |
565 | 0 | std::vector<char32_t> abc_bytes; |
566 | | // Contains sequence of characters founds in xyz string. |
567 | 0 | std::vector<char32_t> xyz_bytes; |
568 | | |
569 | | // Captures the characters. |
570 | 0 | if (!(ExtractChars(abc, &abc_bytes) && |
571 | 0 | ExtractChars(xyz, &xyz_bytes))) { |
572 | 0 | return std::nullopt; |
573 | 0 | } |
574 | | |
575 | | // Helper function to find out index of matching char in the abc string. |
576 | | // Returns -1 if char is not found. |
577 | 0 | std::function<int(char32_t)> getCharIndex = |
578 | 0 | [&](char32_t c) -> std::size_t { |
579 | 0 | for (std::size_t i = 0; i < abc_bytes.size(); ++i) { |
580 | 0 | if (abc_bytes.at(i) == c) return i; |
581 | 0 | } |
582 | 0 | return std::string::npos; |
583 | 0 | }; |
584 | | |
585 | | // Evaluate and translate. |
586 | 0 | std::stringbuf buf; |
587 | 0 | while (!str.empty()) { |
588 | 0 | uint8_t new_char = str.front() & 0xff; |
589 | 0 | if (IsOneByteASCIIChar(new_char)) { |
590 | 0 | std::size_t i = getCharIndex(new_char); |
591 | 0 | if (i == std::string::npos) { |
592 | 0 | buf.sputc(new_char); |
593 | 0 | } else if (i >= xyz_bytes.size()) { |
594 | | // Ignore the character. i.e. remove from translated string. |
595 | 0 | } else { |
596 | | // Replacement byte can be utf-8 code. |
597 | 0 | std::string s = EncodeUtf8Symbol(xyz_bytes.at(i)).value_or(""); |
598 | 0 | buf.sputn(s.c_str(), s.size()); |
599 | 0 | } |
600 | 0 | str.remove_prefix(1); |
601 | 0 | continue; |
602 | 0 | } |
603 | | |
604 | 0 | auto big_char_or = DecodeUtf8Symbol(&str); |
605 | 0 | if (!big_char_or.has_value()) { |
606 | | // Error decoding string. |
607 | 0 | return std::nullopt; |
608 | 0 | } |
609 | 0 | char32_t big_char = big_char_or.value(); |
610 | 0 | std::size_t i = getCharIndex(big_char); |
611 | 0 | if (i == std::string::npos) { |
612 | 0 | auto s_or = EncodeUtf8Symbol(big_char); |
613 | 0 | if (!s_or.has_value()) return std::nullopt; |
614 | 0 | buf.sputn(s_or.value().c_str(), s_or.value().size()); |
615 | 0 | } else if (i >= xyz_bytes.size()) { |
616 | | // Ignore the character. i.e. remove from translated string. |
617 | 0 | } else { |
618 | 0 | auto s_or = EncodeUtf8Symbol(xyz_bytes.at(i)); |
619 | 0 | if (!s_or.has_value()) return std::nullopt; |
620 | 0 | buf.sputn(s_or.value().c_str(), s_or.value().size()); |
621 | 0 | } |
622 | 0 | } |
623 | | |
624 | 0 | return buf.str(); |
625 | 0 | } |
626 | | |
627 | | bool Strings::IsAllWhitespaceChars(std::string_view s, |
628 | 39.2k | std::string_view whitespace_chars) { |
629 | 39.2k | return s.find_first_not_of(whitespace_chars) == std::string::npos; |
630 | 39.2k | } |
631 | | |
632 | 92.4k | bool Strings::EqualFold(std::string_view l, std::string_view r) { |
633 | 119k | while (!l.empty()) { |
634 | | // Reached the end of r, but more chars in l. |
635 | 108k | if (r.empty()) return false; |
636 | | |
637 | 107k | uint8_t l_char = l.front() & 0xff; |
638 | 107k | uint8_t r_char = r.front() & 0xff; |
639 | | |
640 | | // ASCII characters first. |
641 | 107k | if (IsOneByteASCIIChar(l_char)) { |
642 | 101k | if (('A' <= l_char && l_char <= 'Z') || |
643 | 101k | ('a' <= l_char && l_char <= 'z')) { |
644 | | // Compare lower character for both the chars. |
645 | 92.3k | if ((l_char | 0x20) != (r_char | 0x20)) { |
646 | 66.5k | return false; |
647 | 66.5k | } |
648 | 92.3k | } else if (l_char != r_char) { // Compare other ascii character as-is. |
649 | 8.29k | return false; |
650 | 8.29k | } |
651 | | |
652 | 26.4k | l.remove_prefix(1); |
653 | 26.4k | r.remove_prefix(1); |
654 | 26.4k | continue; |
655 | 101k | } |
656 | | |
657 | 6.34k | if (!(CodePointByteSequenceCount(l_char) > 1 && |
658 | 6.34k | CodePointByteSequenceCount(r_char) > 1)) { |
659 | 2.46k | return false; |
660 | 2.46k | } |
661 | | |
662 | 3.87k | auto l_char_opt = DecodeUtf8Symbol(&l); |
663 | 3.87k | auto r_char_opt = DecodeUtf8Symbol(&r); |
664 | | |
665 | | // Checks decoding succeeded. |
666 | 3.87k | if (!(l_char_opt.has_value() && r_char_opt.has_value())) return false; |
667 | | |
668 | 1.51k | char32_t l_char_wide = l_char_opt.value(); |
669 | 1.51k | char32_t r_char_wide = r_char_opt.value(); |
670 | | |
671 | | // Two characters matched. No case conversion needed. |
672 | 1.51k | if (l_char_wide == r_char_wide) { |
673 | 700 | continue; |
674 | 700 | } |
675 | | |
676 | | // Convert both to lowercase. |
677 | 811 | l_char_wide = ToLowerChar(l_char_wide); |
678 | 811 | r_char_wide = ToLowerChar(r_char_wide); |
679 | | |
680 | 811 | if (l_char_wide != r_char_wide) return false; |
681 | 811 | } |
682 | | |
683 | | // Checks all the bytes are processed in both the strings. If some bytes |
684 | | // left in either string, they are not equal. |
685 | 11.4k | return l.empty() && r.empty(); |
686 | 92.4k | } |
687 | | |
688 | | std::vector<std::string> Strings::SplitStringAt( |
689 | 0 | std::string_view s, char delimiter) { |
690 | 0 | std::vector<std::string> columns; |
691 | 0 | size_t first = 0; |
692 | |
|
693 | 0 | while (first < s.size()) { |
694 | 0 | auto second = s.find_first_of(delimiter, first); |
695 | |
|
696 | 0 | if (first != second) |
697 | 0 | columns.emplace_back(std::string(s.substr(first, second-first))); |
698 | |
|
699 | 0 | if (second == std::string_view::npos) |
700 | 0 | break; |
701 | | |
702 | 0 | first = second + 1; |
703 | 0 | } |
704 | |
|
705 | 0 | return columns; |
706 | 0 | } |
707 | | |
708 | | std::vector<std::string_view> Strings::SplitStrAtUtf8Whitespace( |
709 | 0 | std::string_view s) { |
710 | 0 | std::vector<std::string_view> columns; |
711 | 0 | std::size_t start = 0; |
712 | 0 | std::size_t end = 0; |
713 | 0 | while (end < s.size()) { |
714 | 0 | auto num_ws = IsUtf8WhiteSpaceChar(s, end); |
715 | 0 | if (num_ws > 0) { |
716 | 0 | if (start < end) { |
717 | 0 | columns.emplace_back(s.substr(start, end - start)); |
718 | 0 | } |
719 | 0 | start = end + num_ws; |
720 | 0 | end = start; |
721 | 0 | } else { |
722 | 0 | end++; |
723 | 0 | } |
724 | 0 | } |
725 | 0 | columns.emplace_back(s.substr(start, s.size())); |
726 | 0 | return columns; |
727 | 0 | } |
728 | | |
729 | 0 | int Strings::IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position) { |
730 | 0 | std::size_t i = position; |
731 | 0 | int state = 0; |
732 | 0 | while (i < s.size()) { |
733 | 0 | uint8_t c = s.at(i++); |
734 | 0 | state = kWhitespaceTable[state][c]; |
735 | |
|
736 | 0 | if (state == 0) { |
737 | 0 | return 0; |
738 | 0 | } |
739 | | |
740 | 0 | if (state == 1) { |
741 | 0 | return i - position; |
742 | 0 | } |
743 | 0 | } |
744 | | |
745 | 0 | return 0; |
746 | 0 | } |
747 | | |
748 | 0 | int Strings::CountTerms(std::string_view s) { |
749 | 0 | bool in_term = false; |
750 | 0 | int num_terms = 0; |
751 | 0 | while (!s.empty()) { |
752 | 0 | unsigned char c = s.front(); |
753 | 0 | s.remove_prefix(1); |
754 | | // whitespace and punctuations. |
755 | 0 | if ((kPropertyBits[c] & 0x08) != 0 || (kPropertyBits[c] & 0x10) != 0) { |
756 | 0 | in_term = false; |
757 | 0 | } else if (!in_term) { |
758 | | // First character of a term |
759 | 0 | ++num_terms; |
760 | 0 | in_term = true; |
761 | 0 | } |
762 | 0 | } |
763 | 0 | return num_terms; |
764 | 0 | } |
765 | | |
766 | | namespace { |
767 | | |
768 | | // Reads an entity like "<" from b[src:] and writes the corresponding "<" |
769 | | // to b[dst:], returning the incremented dst and src cursors. |
770 | | // Precondition: b[src] == '&' && dst <= src. |
771 | | // attribute should be true if passing an attribute value. |
772 | | std::pair<int, int> UnescapeEntity(std::string* b, int dst, int src, |
773 | 524k | bool attribute) { |
774 | 524k | std::string s = b->substr(src); |
775 | 524k | if (s.size() <= 1) { |
776 | 86 | b->at(dst) = b->at(src); |
777 | 86 | return std::pair<int, int>(dst + 1, src + 1); |
778 | 86 | } |
779 | | |
780 | | // i starts at 1 because we already know that s[0] == '&'. |
781 | 524k | std::size_t i = 1; |
782 | 524k | if (s.at(i) == '#') { |
783 | 428k | if (s.size() <= 3) { // We need to have at least "&#.". |
784 | 90 | b->at(dst) = b->at(src); |
785 | 90 | return std::pair<int, int>(dst + 1, src + 1); |
786 | 90 | } |
787 | 428k | i++; |
788 | 428k | auto c = s.at(i); |
789 | 428k | bool hex = false; |
790 | 428k | if (c == 'x' || c == 'X') { |
791 | 404k | hex = true; |
792 | 404k | i++; |
793 | 404k | } |
794 | | |
795 | 428k | char32_t x = '\x00'; |
796 | 909k | while (i < s.size()) { |
797 | 909k | auto c = s.at(i); |
798 | 909k | i++; |
799 | 909k | if (hex) { |
800 | 866k | if (Strings::IsDigit(c)) { |
801 | 2.84k | x = (16 * x) | (c - '0'); |
802 | 2.84k | continue; |
803 | 863k | } else if ('a' <= c && c <= 'f') { |
804 | 457k | x = 16 * x + c - 'a' + 10; |
805 | 457k | continue; |
806 | 457k | } else if ('A' <= c && c <= 'F') { |
807 | 2.50k | x = 16 * x + c - 'A' + 10; |
808 | 2.50k | continue; |
809 | 2.50k | } |
810 | 866k | } else if (Strings::IsDigit(c)) { |
811 | 18.5k | x = 10 * x + c - '0'; |
812 | 18.5k | continue; |
813 | 18.5k | } |
814 | 427k | if (c != ';') { |
815 | 426k | i--; |
816 | 426k | } |
817 | 427k | break; |
818 | 909k | } |
819 | | |
820 | 428k | if (i <= 3) { // No characters matched. |
821 | 420k | b->at(dst) = b->at(src); |
822 | 420k | return std::pair<int, int>(dst + 1, src + 1); |
823 | 420k | } |
824 | | |
825 | 7.82k | if (0x80 <= x && x <= 0x9F) { |
826 | | // Replace characters from Windows-1252 with UTF-8 equivalents. |
827 | 232 | x = kReplacementTable[x - 0x80]; |
828 | 7.59k | } else if (x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF) { |
829 | | // Replace invalid characters with the replacement chracter. |
830 | 3.00k | x = L'\uFFFD'; |
831 | 3.00k | } |
832 | | |
833 | 7.82k | auto encoded_bytes = Strings::EncodeUtf8Symbol(x); |
834 | 7.82k | if (encoded_bytes.has_value()) { |
835 | 7.82k | std::transform(encoded_bytes.value().begin(), |
836 | 7.82k | encoded_bytes.value().end(), b->begin() + dst, |
837 | 16.6k | [](uint8_t c) -> char { return static_cast<char>(c); }); |
838 | 7.82k | return std::pair<int, int>(dst + encoded_bytes.value().size(), src + i); |
839 | 7.82k | } |
840 | 7.82k | } |
841 | | |
842 | | // Consume the maximum number of chracters possible, with the consumed |
843 | | // characters matching one of the named references. |
844 | 1.61M | while (i < s.size()) { |
845 | 1.61M | auto c = s.at(i); |
846 | 1.61M | i++; |
847 | | // Lower-cased characters are more common in entities, so we check for |
848 | | // them first. |
849 | 1.61M | if (Strings::IsCharAlphabet(c) || Strings::IsDigit(c)) { |
850 | 1.52M | continue; |
851 | 1.52M | } |
852 | 95.4k | if (c != ';') { |
853 | 90.5k | i--; |
854 | 90.5k | } |
855 | 95.4k | break; |
856 | 1.61M | } |
857 | | |
858 | 96.6k | std::string entityName = s.substr(1, i - 1); |
859 | 96.6k | auto encoded_bytes = EntityLookup(entityName); |
860 | 96.6k | if (entityName.empty()) { |
861 | | // No-op. |
862 | 64.5k | } else if (attribute && entityName.at(entityName.size() - 1) != ';' && |
863 | 32.0k | s.size() > i && s.at(i) == '=') { |
864 | | // No-op. |
865 | 31.6k | } else if (!encoded_bytes.empty()) { |
866 | 476 | int overflow = encoded_bytes.size() - entityName.size() - 1 /* & */; |
867 | 476 | if (overflow > 0) { |
868 | | // Insert some dummy chars which will get occupied by overflow entity |
869 | | // chars. |
870 | | // Suppose &xy; = \x1\x2\x3\x4\x5 (5 bytes char) |
871 | | // abc&xy;def (10 bytes) after this statement is: |
872 | | // abc&xy; def (11 bytes). |
873 | | // After unescape: abc\x1\x2\x3\x4\x5def (11 bytes). |
874 | 49 | b->insert(src + encoded_bytes.size() - 1, " ", overflow); |
875 | 49 | } |
876 | | // Copies the unescaped bytes to the destination, |
877 | 476 | std::transform(encoded_bytes.begin(), encoded_bytes.end(), b->begin() + dst, |
878 | 749 | [](uint8_t c) -> char { return static_cast<char>(c); }); |
879 | 476 | return std::pair<int, int>( |
880 | 476 | dst + encoded_bytes.size() - (overflow > 0 ? overflow : 0), src + i); |
881 | 31.1k | } else if (!attribute) { |
882 | 10.0k | int max_length = entityName.size() - 1; |
883 | 10.0k | if (max_length > kLongestEntityWithoutSemiColon) { |
884 | 6.10k | max_length = kLongestEntityWithoutSemiColon; |
885 | 6.10k | } |
886 | 44.0k | for (int j = max_length; j > 1; --j) { |
887 | 34.4k | auto encoded_bytes = EntityLookup(entityName.substr(0, j)); |
888 | 34.4k | if (!encoded_bytes.empty()) { |
889 | 470 | std::transform(encoded_bytes.begin(), encoded_bytes.end(), |
890 | 855 | b->begin() + dst, [](uint8_t c) -> char { |
891 | 855 | return static_cast<char>(c); }); |
892 | 470 | return std::pair<int, int>(dst + encoded_bytes.size(), src + j + 1); |
893 | 470 | } |
894 | 34.4k | } |
895 | 10.0k | } |
896 | | |
897 | 95.6k | std::copy(b->begin() + src, b->begin() + src + i, b->begin() + dst); |
898 | 95.6k | return std::pair<int, int>(dst + i, src + i); |
899 | 96.6k | } |
900 | | |
901 | 13.7M | void CaseTransformInternal(bool to_upper, std::string* s) { |
902 | 67.0M | for (std::size_t i = 0; i < s->size(); ++i) { |
903 | | |
904 | 53.2M | uint8_t code_point = s->at(i) & 0xff; |
905 | | |
906 | | // ASCII characters first. |
907 | 53.2M | if (IsOneByteASCIIChar(code_point)) { |
908 | 51.3M | auto c = to_upper ? ToUpperChar(code_point) : ToLowerChar(code_point); |
909 | 51.3M | if (c != code_point) { |
910 | 12.8M | s->at(i) = static_cast<char>(c); |
911 | 12.8M | } |
912 | 51.3M | continue; |
913 | 51.3M | } |
914 | | |
915 | 1.90M | if (Strings::CodePointByteSequenceCount(code_point) > 1) { |
916 | 1.28M | std::string_view sv = *s; |
917 | 1.28M | sv.remove_prefix(i); |
918 | 1.28M | auto decoded = Strings::DecodeUtf8Symbol(&sv); |
919 | 1.28M | if (decoded.has_value()) { |
920 | 890k | char32_t decode_value = decoded.value(); |
921 | 890k | auto c = |
922 | 890k | to_upper ? ToUpperChar(decode_value) : ToLowerChar(decode_value); |
923 | 890k | if (c != decode_value) { |
924 | 3.45k | auto char_encoded = Strings::EncodeUtf8Symbol(c); |
925 | 3.45k | if (char_encoded.has_value()) { |
926 | 3.45k | std::transform(char_encoded.value().begin(), |
927 | 3.45k | char_encoded.value().end(), s->begin() + i, |
928 | 9.58k | [](uint8_t c) -> char { return static_cast<char>(c); }); |
929 | 3.45k | } |
930 | 3.45k | } |
931 | 890k | } |
932 | 1.28M | } |
933 | 1.90M | } |
934 | 13.7M | } |
935 | | |
// Extracts the payload of a UTF-8 continuation byte.
//
// A continuation byte has the bit pattern 0b10xxxxxx. When |byte| matches,
// the six payload bits (0b00xxxxxx) are stored in |*out| and true is returned.
// Otherwise |*out| is not written and false is returned.
bool ReadContinuationByte(uint8_t byte, uint8_t* out) {
  constexpr uint8_t kTagMask = 0xc0;      // Top two bits.
  constexpr uint8_t kTagValue = 0x80;     // 0b10...... marks a continuation.
  constexpr uint8_t kPayloadMask = 0x3f;  // Low six bits.

  const bool is_continuation = (byte & kTagMask) == kTagValue;
  if (!is_continuation) {
    return false;
  }

  *out = byte & kPayloadMask;
  return true;
}
947 | | |
// Returns true when |c| has its high bit clear, i.e. it is a single-byte
// (7-bit ASCII) character rather than part of a multi-byte sequence.
inline bool IsOneByteASCIIChar(uint8_t c) {
  return c < 0x80;
}
951 | | |
952 | 0 | bool ExtractChars(std::string_view str, std::vector<char32_t>* chars) { |
953 | 0 | while (!str.empty()) { |
954 | 0 | uint8_t c = str.front() & 0xff; |
955 | | |
956 | | // ASCII chracters first. |
957 | 0 | if (IsOneByteASCIIChar(c)) { |
958 | 0 | chars->push_back(c); |
959 | 0 | str.remove_prefix(1); |
960 | 0 | continue; |
961 | | // Check if this character is member of codepoint sequence. |
962 | 0 | } else if (Strings::CodePointByteSequenceCount(c) > 1) { |
963 | | // Decode moves the string view prefix so no need to remove prefix |
964 | | // manually. |
965 | 0 | auto old_big_char = Strings::DecodeUtf8Symbol(&str); |
966 | 0 | if (!old_big_char.has_value()) { |
967 | | // Error decoding string. |
968 | 0 | chars->clear(); |
969 | 0 | return false; |
970 | 0 | } |
971 | 0 | chars->push_back(old_big_char.value()); |
972 | 0 | } else { |
973 | | // Unknown character type. |
974 | 0 | chars->clear(); |
975 | 0 | return false; |
976 | 0 | } |
977 | 0 | } |
978 | 0 | return true; |
979 | 0 | } |
980 | | |
981 | 0 | bool OneByteHexCodeToInt(std::string_view hex_code, uint8_t* out) { |
982 | | // Will overflow. |
983 | 0 | if (hex_code.size() > 2) return false; |
984 | 0 | uint8_t x = 0; |
985 | 0 | while (!hex_code.empty()) { |
986 | 0 | auto h = hex_code.at(0); |
987 | 0 | hex_code.remove_prefix(1); |
988 | 0 | if (Strings::IsDigit(h)) { |
989 | 0 | x = (16 * x) | (h - '0'); |
990 | 0 | } else if ('a' <= h && h <= 'f') { |
991 | 0 | x = 16 * x + h - 'a' + 10; |
992 | 0 | } else if ('A' <= h && h <= 'F') { |
993 | 0 | x = 16 * x + h - 'A' + 10; |
994 | 0 | } else { |
995 | | // Invalid hex code eg. %2x or %m8 |
996 | 0 | return false; |
997 | 0 | } |
998 | 0 | } |
999 | 0 | *out = x; |
1000 | 0 | return true; |
1001 | 0 | } |
1002 | | |
1003 | | } // namespace |
1004 | | |
1005 | | } // namespace htmlparser |