Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/strings.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef CPP_HTMLPARSER_STRINGS_H_
2
#define CPP_HTMLPARSER_STRINGS_H_
3
4
#include <optional>
5
#include <sstream>
6
#include <string>
7
#include <string_view>
8
#include <vector>
9
10
namespace htmlparser {
11
12
class Strings {
13
 public:
14
  // One of (note: the constant below additionally contains U+000B LINE TABULATION, '\v'):
15
  // U+0009 CHARACTER TABULATION,
16
  // U+000A LINE FEED (LF),
17
  // U+000C FORM FEED (FF),
18
  // U+000D CARRIAGE RETURN (CR), or
19
  // U+0020 SPACE.
20
  inline static const std::string kWhitespace {
21
    // Do not sort or re-order.
22
    ' ',
23
    '\t',
24
    '\r',
25
    '\n',
26
    '\f',
27
    '\v'};
28
29
  // kWhitespace plus null char.
30
  inline static const std::string kWhitespaceOrNull {
31
    // Do not sort or re-order.
32
    ' ',
33
    '\t',
34
    '\r',
35
    '\n',
36
    '\f',
37
    '\0',
38
    '\v'};
39
40
  inline static const std::string kEscapeChars {
41
    // Do not sort or re-order.
42
    '&',
43
    '\'',
44
    '<',
45
    '>',
46
    '"'};
47
48
  inline static const std::string kNullChar = {'\0'};
49
50
  inline static const std::string kNullReplacementChar {
51
    '\xef', '\xbf', '\xbd'};  // encoded \ufffd (3 bytes).
52
53
  // Decodes a percent encoded string like "google.com%20%2F%20%3Fx%3Db" to
54
  // "google.com / ?x=b".
55
  static std::optional<std::string> DecodePercentEncodedURL(
56
    std::string_view uri);
57
58
  // Returns hex string representation of a 4 byte codepoint.
59
  static std::string ToHexString(uint32_t c);
60
61
  // byte is in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z).
62
  static bool IsCharAlphabet(char c);
63
64
  // byte is in the range 0x30-0x39 (chars: 0-9)
65
  static bool IsDigit(char c);
66
67
  // Converts "\r" and "\r\n" in s to "\n".
68
  // The conversion happens in place, but the resulting string may be shorter.
69
  static void ConvertNewLines(std::string* s);
70
71
  // UTF-8 Encoding/Decoding utility functions.
72
  // =================================
73
  //
74
  // Checks if the byte is a beginning of a unicode codepoint byte sequence.
75
  // First byte is masked as follows:
76
  // 0b110xxxxx - 2 byte sequence.
77
  // 0b1110xxxx - 3 byte sequence.
78
  // 0b11110xxx - 4 byte sequence.
79
  //
80
  // Returns the number of bytes in the sequence that encodes the codepoint.
81
  static int8_t CodePointByteSequenceCount(uint8_t c);
82
83
  // Similar to CodePointByteSequenceCount except that it accepts entire
84
  // codepoint and tells how many bytes the codepoint contains.
85
  static int8_t CodePointNumBytes(char32_t c);
86
87
  // Decodes a utf-8 byte sequence to a unicode codepoint.
88
  // The s points to the first byte in the sequence. Moves the cursor past
89
  // the byte sequence if decoding is successful.
90
  //
91
  // Returns 4 byte utf-8 codepoint value, or nullopt if:
92
  // - First byte is not valid (IsCodePoint),
93
  // - The three byte sequence includes unpaired surrogate which is not a scalar
94
  //   value.
95
  // - Invalid utf-8 data.
96
  static std::optional<char32_t> DecodeUtf8Symbol(std::string_view* s);
97
98
  // Same as DecodeUtf8Symbol(string_view*) except that the prefix is not
99
  // updated, meaning cursor is at the first byte of the current character
100
  // decoded in s.
101
  static std::optional<char32_t>
102
0
      DecodeUtf8Symbol(std::string_view s, std::size_t position = 0) {
103
0
    if (position < 0 || position > s.size()) return std::nullopt;
104
0
105
0
    if (position == 0) {
106
0
      return DecodeUtf8Symbol(&s);
107
0
    }
108
0
109
0
    std::string_view s_at_prefix = s.substr(position);
110
0
    return DecodeUtf8Symbol(&s_at_prefix);
111
0
  }
112
113
  // Encodes a utf-8 codepoint.
114
  // Fills the codepoint in the following sequence:
115
  //
116
  // 7 bits US ASCII characters.
117
  // 0b0xxxxxxx
118
  //
119
  // Codepoint up to 11 bits.
120
  // 0b110xxxxx 0b10xxxxxx
121
  //
122
  // Codepoint up to 16 bits.
123
  // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
124
  //
125
  // Codepoint up to 21 bits.
126
  // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
127
  //
128
  // Returns nullopt on error.
129
  static std::optional<std::string> EncodeUtf8Symbol(char32_t code_point);
130
131
0
  static std::vector<char32_t> Utf8ToCodepoints(std::string_view utf8) {
132
0
    std::vector<char32_t> out;
133
0
    out.reserve(utf8.size() / 2);
134
0
    // We use the UnicodeText abstraction because it handles
135
0
    // validation / coersion  under the hood, so what comes out of this is
136
0
    // surely valid UTF8.
137
0
    auto codepoint = DecodeUtf8Symbol(&utf8);
138
0
    while (codepoint) {
139
0
      out.push_back(*codepoint);
140
0
      codepoint = DecodeUtf8Symbol(&utf8);
141
0
    }
142
0
    return out;
143
0
  }
144
145
  static void AppendCodepointToUtf8String(char32_t code,
146
0
                                          std::string* utf8_str) {
147
0
    // The implementation is modified from UnicodeText::push_back to append
148
0
    // to an existing string, rather than allocate a new one.
149
0
    auto encoded = EncodeUtf8Symbol(code);
150
0
    if (encoded) {
151
0
      *utf8_str += *encoded;
152
0
    } else {
153
0
      utf8_str->push_back(' ');
154
0
    }
155
0
  }
156
157
0
  static std::string CodepointToUtf8String(char32_t code) {
158
0
    auto output = htmlparser::Strings::EncodeUtf8Symbol(code);
159
0
    return output ? std::move(output.value()) : "";
160
0
  }
161
162
  // Converts unicode code points to a string.
163
0
  static std::string CodepointsToUtf8String(std::vector<char32_t> codes) {
164
0
    std::stringbuf buf;
165
0
    for (auto c : codes) {
166
0
      if (auto encoded = htmlparser::Strings::EncodeUtf8Symbol(c);
167
0
          encoded) {
168
0
        buf.sputn(encoded->c_str(), encoded->size());
169
0
      }
170
0
    }
171
0
    return buf.str();
172
0
  }
173
174
  // Returns index of the first instance of any character in chars or
175
  // npos if no character found. For unicode character returns the index of
176
  // initial byte of the sequence of bytes.
177
  static std::size_t IndexAny(std::string_view s,
178
                              std::string_view chars);
179
180
  // Escapes special characters like "<" to become "&lt;". It escapes only
181
  // five such characters: <, >, &, ' and ".
182
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
183
  // always true.
184
  static std::string EscapeString(std::string_view s);
185
  static void Escape(std::string_view s, std::stringbuf* escaped);
186
187
  // Unescapes s's entities in-place, so that "a&lt;b" becomes "a<b".
188
  // attribute should be true if passing an attribute value.
189
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
190
  // always true.
191
  static void UnescapeString(std::string* s, bool attribute = false);
192
193
  // Converts case of string in-place.
194
  static void ToLower(std::string* s);
195
  static void ToUpper(std::string* s);
196
197
  // Checks if string contains only whitespace characters.
198
  static bool IsAllWhitespaceChars(std::string_view s,
199
      std::string_view whitespace_chars = kWhitespace);
200
201
  // Case insensitive equals.
202
  static bool EqualFold(std::string_view l, std::string_view r);
203
204
  // Search replace functions.
205
  // Replaces first occurrence of the f in s with t.
206
  static void Replace(std::string* s, std::string_view f,
207
      std::string_view t);
208
  // Replaces all occurrences of the f in s with t.
209
  static void ReplaceAll(std::string* s, std::string_view f,
210
                         std::string_view t);
211
  static void ReplaceAny(std::string* s, std::string_view chars,
212
                         std::string_view to);
213
214
  // Replaces the string of characters in abc with the string of characters
215
  // in xyz. The first character in xyz will replace every occurrence of the
216
  // first character in abc that appears in the str.
217
  //
218
  // Example:
219
  // Translate("The quick brown fox.",
220
  //           "abcdefghijklmnopqrstuvwxyz",
221
  //           "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
222
  // Returns:
223
  //   THE QUICK BROWN FOX.
224
  //
225
  // This works for utf-8 characters.
226
  // Strings::Translate("AmAltAs", "A", "서") outputs: 서m서lt서s.
227
  //
228
  // If abc contains duplicates, replacements are performed in the order they
229
  // appear in the target string. So Translate("amaltas", "atas", "ipox") will
230
  // replace all occurrences of 'a' with 'i'. The later 'a'->'o' mapping is
231
  // ignored.
232
  //
233
  // If abc is longer than xyz, then every occurrence of characters from str
234
  // that do not have a corresponding character in xyz will be removed.
235
  //
236
  // Example:
237
  // Strings::Translate("The quick brown fox.", "brown", "red");
238
  // returns: "The quick red fdx."
239
  //
240
  // The translated string may be smaller or larger than the evaluated string.
241
  //
242
  // Translation is guaranteed for ASCII characters. Translation may fail if
243
  // any of the strings str, abc, xyz contains utf-8 chars and decoding of those
244
  // chars failed, in which case this function returns std::nullopt.
245
  static std::optional<std::string> Translate(
246
      std::string_view str,   // String to evaluate.
247
      std::string_view abc,   // Chars that will be replaced.
248
      std::string_view xyz);  // Chars used for replacement.
249
250
  // Strips whitespace from a string in-place.
251
  static void TrimLeft(std::string* s,
252
      std::string_view chars_to_trim = kWhitespace);
253
  static void TrimRight(std::string* s,
254
      std::string_view chars_to_trim = kWhitespace);
255
  static void Trim(std::string* s,
256
      std::string_view chars_to_trim = kWhitespace);
257
  static void TrimLeft(std::string_view* s,
258
      std::string_view chars_to_trim = kWhitespace);
259
  static void TrimRight(std::string_view* s,
260
      std::string_view chars_to_trim = kWhitespace);
261
  static void Trim(std::string_view* s,
262
      std::string_view chars_to_trim = kWhitespace);
263
264
  static bool StripTrailingNewline(std::string* s);
265
266
  // Reduces all consecutive sequences of space characters to a single space
267
  // character.
268
  // Resulting string may be smaller (resized) than the original string.
269
  static void RemoveExtraSpaceChars(std::string* s);
270
271
  // Prefix and suffix matching functions.
272
  static bool StartsWith(std::string_view s, std::string_view prefix);
273
  static bool EndsWith(std::string_view s, std::string_view suffix);
274
275
  // Splits a string at delimiter character and returns the columns.
276
  static std::vector<std::string> SplitStringAt(
277
      std::string_view s, char delimiter);
278
279
  // Splits the string at any utf8 or ascii whitespace and returns the columns.
280
  // The returned values are the views of original string argument passed to
281
  // this method. If the original string goes out of scope after this method is
282
  // called, the contents of the columns is undefined.
283
  static std::vector<std::string_view> SplitStrAtUtf8Whitespace(
284
      std::string_view s);
285
286
  // Determines if a character at current position is a whitespace char.
287
  // Returns the number of bytes from current character that are part of the
288
  // whitespace. For multichar whitespace like ideographic space \u3000 it
289
  // returns 3 as ideographic space is encoded in 3 bytes.
290
  //
291
  // Returns 0, if the character at current position is not a whitespace.
292
  static int IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position = 0);
293
294
  // Counts number of terms in a text separated by whitespace and punctuations.
295
  static int CountTerms(std::string_view s);
296
297
 private:
298
  // No instance of this class.
299
  Strings() = default;
300
};
301
302
}  // namespace htmlparser
303
304
#endif  // CPP_HTMLPARSER_STRINGS_H_