/proc/self/cwd/cpp/htmlparser/strings.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef CPP_HTMLPARSER_STRINGS_H_ |
2 | | #define CPP_HTMLPARSER_STRINGS_H_ |
3 | | |
#include <cstddef>
#include <cstdint>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
9 | | |
10 | | namespace htmlparser { |
11 | | |
// Collection of static string helpers used by the HTML parser: character
// sets, UTF-8 encoding/decoding, escaping, case conversion, trimming,
// splitting and search/replace utilities. All members are static; the
// constructor is private so the class is not instantiated by outside code.
class Strings {
 public:
  // Whitespace characters. One of:
  //   U+0020 SPACE,
  //   U+0009 CHARACTER TABULATION,
  //   U+000D CARRIAGE RETURN (CR),
  //   U+000A LINE FEED (LF),
  //   U+000C FORM FEED (FF), or
  //   U+000B LINE TABULATION (VT).
  inline static const std::string kWhitespace {
      // Do not sort or re-order.
      ' ',
      '\t',
      '\r',
      '\n',
      '\f',
      '\v'};

  // kWhitespace plus null char.
  inline static const std::string kWhitespaceOrNull {
      // Do not sort or re-order.
      ' ',
      '\t',
      '\r',
      '\n',
      '\f',
      '\0',
      '\v'};

  // The five characters handled by EscapeString()/Escape().
  inline static const std::string kEscapeChars {
      // Do not sort or re-order.
      '&',
      '\'',
      '<',
      '>',
      '"'};

  // A string holding exactly one byte: the null character.
  inline static const std::string kNullChar = {'\0'};

  // UTF-8 encoding of U+FFFD (3 bytes).
  inline static const std::string kNullReplacementChar {
      '\xef', '\xbf', '\xbd'};

  // Decodes a percent encoded string like "google.com%20%2F%20%3Fx%3Db" to
  // "google.com / ?x=b".
  static std::optional<std::string> DecodePercentEncodedURL(
      std::string_view uri);

  // Returns hex string representation of a 4 byte codepoint.
  static std::string ToHexString(uint32_t c);

  // byte is in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z).
  static bool IsCharAlphabet(char c);

  // byte is in the range 0x30-0x39 (chars: 0-9)
  static bool IsDigit(char c);

  // Converts "\r" and "\r\n" in s to "\n".
  // The conversion happens in place, but the resulting string may be shorter.
  static void ConvertNewLines(std::string* s);

  // UTF-8 Encoding/Decoding utility functions.
  // =================================
  //
  // Checks if the byte is a beginning of a unicode codepoint byte sequence.
  // First byte is masked as follows:
  //   0b110xxxxx - 2 byte sequence.
  //   0b1110xxxx - 3 byte sequence.
  //   0b11110xxx - 4 byte sequence.
  //
  // Returns the number of bytes in the sequence needed to encode the
  // codepoint.
  static int8_t CodePointByteSequenceCount(uint8_t c);

  // Similar to CodePointByteSequenceCount except that it accepts entire
  // codepoint and tells how many bytes the codepoint contains.
  static int8_t CodePointNumBytes(char32_t c);

  // Decodes byte sequence to utf-8 codepoint.
  // The s points to the first byte in the sequence. Moves the cursor past
  // the byte sequence if decoding is successful.
  //
  // Returns 4 byte utf-8 codepoint value, or nullopt if:
  // - First byte is not valid (IsCodePoint),
  // - The three byte sequence includes unpaired surrogate which is not a scalar
  //   value.
  // - Invalid utf-8 data.
  static std::optional<char32_t> DecodeUtf8Symbol(std::string_view* s);

  // Same as DecodeUtf8Symbol(string_view*) except that the caller's view is
  // not advanced: s is taken by value and decoding starts at byte offset
  // position within it.
  //
  // Returns nullopt when position is greater than s.size(); otherwise returns
  // whatever DecodeUtf8Symbol(string_view*) returns for s.substr(position).
  static std::optional<char32_t> DecodeUtf8Symbol(std::string_view s,
                                                  std::size_t position = 0) {
    // position is unsigned, so only the upper bound needs checking.
    // position == s.size() is allowed through and decodes an empty view.
    if (position > s.size()) return std::nullopt;

    // s is a local copy of the view; trimming it does not affect the caller.
    s.remove_prefix(position);
    return DecodeUtf8Symbol(&s);
  }

  // Encodes a utf-8 codepoint.
  // Fills the codepoint in the following sequence:
  //
  // 7 bits US ASCII characters.
  // 0b0xxxxxxx
  //
  // Codepoint up to 11 bits.
  // 0b110xxxxx 0b10xxxxxx
  //
  // Codepoint up to 16 bits.
  // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
  //
  // Codepoint up to 21 bits.
  // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
  //
  // Returns nullopt on error.
  static std::optional<std::string> EncodeUtf8Symbol(char32_t code_point);

  // Decodes utf8 into a sequence of codepoints.
  // Symbols are decoded from the front of utf8 one at a time until
  // DecodeUtf8Symbol returns nullopt; the codepoints decoded up to that point
  // are returned.
  static std::vector<char32_t> Utf8ToCodepoints(std::string_view utf8) {
    std::vector<char32_t> out;
    // Capacity guess only; the vector still grows as needed.
    out.reserve(utf8.size() / 2);
    while (auto codepoint = DecodeUtf8Symbol(&utf8)) {
      out.push_back(*codepoint);
    }
    return out;
  }

  // Appends the utf-8 encoding of code to the existing string utf8_str.
  // If code cannot be encoded, a single space character is appended instead.
  static void AppendCodepointToUtf8String(char32_t code,
                                          std::string* utf8_str) {
    if (auto encoded = EncodeUtf8Symbol(code)) {
      utf8_str->append(*encoded);
    } else {
      utf8_str->push_back(' ');
    }
  }

  // Returns the utf-8 encoding of code, or an empty string if code cannot be
  // encoded.
  static std::string CodepointToUtf8String(char32_t code) {
    return EncodeUtf8Symbol(code).value_or("");
  }

  // Converts unicode code points to a string.
  // Code points that cannot be encoded are skipped.
  static std::string CodepointsToUtf8String(
      const std::vector<char32_t>& codes) {
    std::string out;
    for (const char32_t c : codes) {
      if (auto encoded = EncodeUtf8Symbol(c)) {
        out += *encoded;
      }
    }
    return out;
  }

  // Returns index of the first instance of any character in chars or
  // npos if no character found. For unicode character returns the index of
  // initial byte of the sequence of bytes.
  static std::size_t IndexAny(std::string_view s,
                              std::string_view chars);

  // Escapes special characters like "<" to become "&lt;". It escapes only
  // five such characters: <, >, &, ' and ".
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  static std::string EscapeString(std::string_view s);
  static void Escape(std::string_view s, std::stringbuf* escaped);

  // Unescapes s's entities in-place, so that "a&lt;b" becomes "a<b".
  // attribute should be true if passing an attribute value.
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  static void UnescapeString(std::string* s, bool attribute = false);

  // Converts case of string in-place.
  static void ToLower(std::string* s);
  static void ToUpper(std::string* s);

  // Checks if string contains whitespace only characters.
  static bool IsAllWhitespaceChars(std::string_view s,
      std::string_view whitespace_chars = kWhitespace);

  // Case insensitive equals.
  static bool EqualFold(std::string_view l, std::string_view r);

  // Search replace functions.
  // Replaces first occurrence of the f in s with t.
  static void Replace(std::string* s, std::string_view f,
                      std::string_view t);
  // Replaces all occurrences of the f in s with t.
  static void ReplaceAll(std::string* s, std::string_view f,
                         std::string_view t);
  static void ReplaceAny(std::string* s, std::string_view chars,
                         std::string_view to);

  // Replaces the string of characters in abc with the string of characters
  // in xyz. The first character in xyz will replace every occurrence of the
  // first character in abc that appears in the str.
  //
  // Example:
  // Translate("The quick brown fox.",
  //           "abcdefghijklmnopqrstuvwxyz",
  //           "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  // Returns:
  // THE QUICK BROWN FOX.
  //
  // This works for utf-8 characters.
  // Strings::Translate("AmAltAs", "A", "서") outputs: 서m서lt서s.
  //
  // If abc contains duplicates, replacements are performed in the order they
  // appear in the target string. So Translate("amaltas", "atas", "ipox") will
  // replace all occurrences of 'a' with 'i'. The later 'a'->'o' mapping is
  // ignored.
  //
  // If abc is longer than xyz, then every occurrence of characters from str
  // that do not have a corresponding character in xyz will be removed.
  //
  // Example:
  // Strings::Translate("The quick brown fox.", "brown", "red");
  // returns: "The quick red fdx."
  //
  // The translated string may be smaller or larger than the evaluated string.
  //
  // Translation is guaranteed for ASCII characters. Translation may fail if
  // any of the strings str, abc, xyz contains utf-8 chars and decoding of those
  // chars failed, in which case this function returns std::nullopt.
  static std::optional<std::string> Translate(
      std::string_view str,   // String to evaluate.
      std::string_view abc,   // Chars that will be replaced.
      std::string_view xyz);  // Chars used for replacement.

  // Strips whitespace from a string in-place.
  static void TrimLeft(std::string* s,
                       std::string_view chars_to_trim = kWhitespace);
  static void TrimRight(std::string* s,
                        std::string_view chars_to_trim = kWhitespace);
  static void Trim(std::string* s,
                   std::string_view chars_to_trim = kWhitespace);
  static void TrimLeft(std::string_view* s,
                       std::string_view chars_to_trim = kWhitespace);
  static void TrimRight(std::string_view* s,
                        std::string_view chars_to_trim = kWhitespace);
  static void Trim(std::string_view* s,
                   std::string_view chars_to_trim = kWhitespace);

  static bool StripTrailingNewline(std::string* s);

  // Reduces all consecutive sequences of space characters to a single space
  // character.
  // Resulting string may be smaller (resized) than the original string.
  static void RemoveExtraSpaceChars(std::string* s);

  // Prefix and suffix matching functions.
  static bool StartsWith(std::string_view s, std::string_view prefix);
  static bool EndsWith(std::string_view s, std::string_view suffix);

  // Splits a string at delimiter character and returns the columns.
  static std::vector<std::string> SplitStringAt(
      std::string_view s, char delimiter);

  // Splits the string at any utf8 or ascii whitespace and returns the columns.
  // The returned values are the views of original string argument passed to
  // this method. If the original string goes out of scope after this method is
  // called, the contents of the columns is undefined.
  static std::vector<std::string_view> SplitStrAtUtf8Whitespace(
      std::string_view s);

  // Determines if a character at current position is a whitespace char.
  // Returns the number of bytes from current character that are part of the
  // whitespace. For multi-byte whitespace like ideographic space \u3000 it
  // returns 3 as ideographic space is encoded in 3 bytes.
  //
  // Returns 0, if the character at current position is not a whitespace.
  static int IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position = 0);

  // Counts number of terms in a text separated by whitespace and punctuations.
  static int CountTerms(std::string_view s);

 private:
  // No instance of this class.
  Strings() = default;
};
301 | | |
302 | | } // namespace htmlparser |
303 | | |
304 | | #endif // CPP_HTMLPARSER_STRINGS_H_ |