/proc/self/cwd/cpp/htmlparser/strings.h

Source (jump to first uncovered line)
#ifndef CPP_HTMLPARSER_STRINGS_H_
#define CPP_HTMLPARSER_STRINGS_H_

#include <cstddef>
#include <cstdint>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace htmlparser {

// Collection of static string utilities: character classification, UTF-8
// encoding/decoding, escaping, case conversion, search/replace, trimming and
// splitting. The class only groups constants and functions; the constructor
// is private so it cannot be instantiated.
class Strings {
 public:
  // ASCII whitespace characters. One of:
  // U+0020 SPACE,
  // U+0009 CHARACTER TABULATION,
  // U+000D CARRIAGE RETURN (CR),
  // U+000A LINE FEED (LF),
  // U+000C FORM FEED (FF), or
  // U+000B LINE TABULATION (vertical tab).
  inline static const std::string kWhitespace {
    // Do not sort or re-order.
    ' ',
    '\t',
    '\r',
    '\n',
    '\f',
    '\v'};

  // kWhitespace plus null char.
  inline static const std::string kWhitespaceOrNull {
    // Do not sort or re-order.
    ' ',
    '\t',
    '\r',
    '\n',
    '\f',
    '\0',
    '\v'};

  // The five characters that EscapeString escapes: &, ', <, > and ".
  inline static const std::string kEscapeChars {
    // Do not sort or re-order.
    '&',
    '\'',
    '<',
    '>',
    '"'};

  // A string holding exactly one null character.
  inline static const std::string kNullChar = {'\0'};

  inline static const std::string kNullReplacementChar {
    '\xef', '\xbf', '\xbd'};  // encoded \ufffd (3 bytes).

  // Decodes a percent encoded string like "google.com%20%2F%20%3Fx%3Db" to
  // "google.com / ?x=b".
  static std::optional<std::string> DecodePercentEncodedURL(
    std::string_view uri);

  // Returns hex string representation of a 4 byte codepoint.
  static std::string ToHexString(uint32_t c);

  // byte is in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z).
  static bool IsCharAlphabet(char c);

  // byte is in the range 0x30-0x39 (chars: 0-9)
  static bool IsDigit(char c);

  // Converts "\r" and "\r\n" in s to "\n".
  // The conversion happens in place, but the resulting string may be shorter.
  static void ConvertNewLines(std::string* s);

  // UTF-8 Encoding/Decoding utility functions.
  // =================================
  //
  // Checks if the byte is a beginning of a unicode codepoint byte sequence.
  // First byte is masked as follows:
  // 0b110xxxxx - 2 byte sequence.
  // 0b1110xxxx - 3 byte sequence.
  // 0b11110xxx - 4 byte sequence.
  //
  // Returns number of byte sequence needed to encode the codepoint.
  static int8_t CodePointByteSequenceCount(uint8_t c);

  // Similar to CodePointByteSequenceCount except that it accepts entire
  // codepoint and tells how many bytes the codepoint contains.
  static int8_t CodePointNumBytes(char32_t c);

  // Decodes byte sequence to utf-8 codepoint.
  // The s points to the first byte in the sequence. Moves the cursor past
  // the byte sequence if decoding is successful.
  //
  // Returns 4 byte utf-8 codepoint value, or nullopt if:
  // - First byte is not a valid leading byte,
  // - The three byte sequence includes unpaired surrogate which is not a scalar
  //   value.
  // - Invalid utf-8 data.
  static std::optional<char32_t> DecodeUtf8Symbol(std::string_view* s);

  // Same as DecodeUtf8Symbol(string_view*) except that the caller's view is
  // not advanced: s is taken by value and decoding starts at the byte offset
  // position.
  //
  // Returns nullopt if position is past the end of s, or if decoding fails.
  static std::optional<char32_t>
      DecodeUtf8Symbol(std::string_view s, std::size_t position = 0) {
    // position is unsigned, so only the upper bound needs checking. The check
    // also keeps substr() below from throwing std::out_of_range.
    if (position > s.size()) return std::nullopt;

    std::string_view remaining = s.substr(position);
    return DecodeUtf8Symbol(&remaining);
  }

  // Encodes a utf-8 codepoint.
  // Fills the codepoint in the following sequence:
  //
  // 7 bits US ASCII characters.
  // 0b0xxxxxxx
  //
  // Codepoint up to 11 bits.
  // 0b110xxxxx 0b10xxxxxx
  //
  // Codepoint up to 16 bits.
  // 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
  //
  // Codepoint up to 21 bits.
  // 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
  //
  // Returns nullopt on error.
  static std::optional<std::string> EncodeUtf8Symbol(char32_t code_point);

  // Decodes utf8 into a sequence of code points.
  //
  // Symbols are decoded one after another with DecodeUtf8Symbol until it
  // returns nullopt. Decoding therefore stops at the first symbol that cannot
  // be decoded, and only the code points decoded up to that point are
  // returned; no error is reported for the remaining bytes.
  static std::vector<char32_t> Utf8ToCodepoints(std::string_view utf8) {
    std::vector<char32_t> out;
    // Capacity guess only; the vector still grows as needed.
    out.reserve(utf8.size() / 2);
    for (auto codepoint = DecodeUtf8Symbol(&utf8); codepoint;
         codepoint = DecodeUtf8Symbol(&utf8)) {
      out.push_back(*codepoint);
    }
    return out;
  }

  // Appends the utf-8 encoding of code to *utf8_str. If code cannot be
  // encoded, a single space character is appended instead.
  static void AppendCodepointToUtf8String(char32_t code,
                                          std::string* utf8_str) {
    if (auto encoded = EncodeUtf8Symbol(code); encoded) {
      *utf8_str += *encoded;
    } else {
      utf8_str->push_back(' ');
    }
  }

  // Returns the utf-8 encoding of code, or an empty string if code cannot be
  // encoded.
  static std::string CodepointToUtf8String(char32_t code) {
    auto encoded = EncodeUtf8Symbol(code);
    if (!encoded) return "";
    return std::move(*encoded);
  }

  // Converts unicode code points to a utf-8 string. Code points that cannot
  // be encoded are skipped.
  static std::string CodepointsToUtf8String(std::vector<char32_t> codes) {
    std::string out;
    for (char32_t c : codes) {
      if (auto encoded = EncodeUtf8Symbol(c); encoded) {
        out += *encoded;
      }
    }
    return out;
  }

  // Returns index of the first instance of any character in chars or
  // npos if no character found. For unicode character returns the index of
  // initial byte of the sequence of bytes.
  static std::size_t IndexAny(std::string_view s,
                              std::string_view chars);

  // Escapes special characters like "<" to become "&lt;". It escapes only
  // five such characters: <, >, &, ' and ".
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  static std::string EscapeString(std::string_view s);
  static void Escape(std::string_view s, std::stringbuf* escaped);

  // Unescapes s's entities in-place, so that "a&lt;b" becomes "a<b".
  // attribute should be true if passing an attribute value.
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  static void UnescapeString(std::string* s, bool attribute = false);

  // Converts case of string in-place.
  static void ToLower(std::string* s);
  static void ToUpper(std::string* s);

  // Checks if string contains whitespace only characters.
  static bool IsAllWhitespaceChars(std::string_view s,
      std::string_view whitespace_chars = kWhitespace);

  // Case insensitive equals.
  static bool EqualFold(std::string_view l, std::string_view r);

  // Search replace functions.
  // Replaces first occurrence of the f in s with t.
  static void Replace(std::string* s, std::string_view f,
      std::string_view t);
  // Replaces all occurrences of the f in s with t.
  static void ReplaceAll(std::string* s, std::string_view f,
                         std::string_view t);
  static void ReplaceAny(std::string* s, std::string_view chars,
                         std::string_view to);

  // Replaces the string of characters in abc with the string of characters
  // in xyz. The first character in xyz will replace every occurrence of the
  // first character in abc that appears in the str.
  //
  // Example:
  // Translate("The quick brown fox.",
  //           "abcdefghijklmnopqrstuvwxyz",
  //           "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  // Returns:
  //   THE QUICK BROWN FOX.
  //
  // This works for utf-8 characters.
  // Strings::Translate("AmAltAs", "A", "서") outputs: 서m서lt서s.
  //
  // If abc contains duplicates, replacements are performed in the order they
  // appear in the target string. So Translate("amaltas", "atas", "ipox") will
  // replace all occurrences of 'a' with 'i'. The later 'a'->'o' mapping is
  // ignored.
  //
  // If abc is longer than xyz, then every occurrence of characters from str
  // that do not have a corresponding character in xyz will be removed.
  //
  // Example:
  // Strings::Translate("The quick brown fox.", "brown", "red");
  // returns: "The quick red fdx."
  //
  // The translated string may be smaller or larger than the evaluated string.
  //
  // Translation is guaranteed for ASCII characters. Translation may fail if
  // any of the strings str, abc, xyz contains utf-8 chars and decoding of those
  // chars failed, in which case this function returns std::nullopt.
  static std::optional<std::string> Translate(
      std::string_view str,   // String to evaluate.
      std::string_view abc,   // Chars that will be replaced.
      std::string_view xyz);  // Chars used for replacement.

  // Strips whitespace from a string in-place.
  static void TrimLeft(std::string* s,
      std::string_view chars_to_trim = kWhitespace);
  static void TrimRight(std::string* s,
      std::string_view chars_to_trim = kWhitespace);
  static void Trim(std::string* s,
      std::string_view chars_to_trim = kWhitespace);
  static void TrimLeft(std::string_view* s,
      std::string_view chars_to_trim = kWhitespace);
  static void TrimRight(std::string_view* s,
      std::string_view chars_to_trim = kWhitespace);
  static void Trim(std::string_view* s,
      std::string_view chars_to_trim = kWhitespace);

  static bool StripTrailingNewline(std::string* s);

  // Reduces all consecutive sequences of space characters to a single space
  // character.
  // Resulting string may be smaller (resized) than the original string.
  static void RemoveExtraSpaceChars(std::string* s);

  // Prefix and suffix matching functions.
  static bool StartsWith(std::string_view s, std::string_view prefix);
  static bool EndsWith(std::string_view s, std::string_view suffix);

  // Splits a string at delimiter character and returns the columns.
  static std::vector<std::string> SplitStringAt(
      std::string_view s, char delimiter);

  // Splits the string at any utf8 or ascii whitespace and returns the columns.
  // The returned values are the views of original string argument passed to
  // this method. If the original string goes out of scope after this method is
  // called, the contents of the columns is undefined.
  static std::vector<std::string_view> SplitStrAtUtf8Whitespace(
      std::string_view s);

  // Determines if a character at current position is a whitespace char.
  // Returns the number of bytes from current character that are part of the
  // whitespace. For multi-byte whitespace like ideographic space \u3000 it
  // returns 3 as the utf-8 encoding of ideographic space is 3 bytes long.
  //
  // Returns 0, if the character at current position is not a whitespace.
  static int IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position = 0);

  // Counts number of terms in a text separated by whitespace and punctuations.
  static int CountTerms(std::string_view s);

 private:
  // No instance of this class.
  Strings() = default;
};

}  // namespace htmlparser

#endif  // CPP_HTMLPARSER_STRINGS_H_

Coverage Report

Created: 2025-07-23 06:45

Line	Count	Source (jump to first uncovered line)
1		#ifndef CPP_HTMLPARSER_STRINGS_H_
2		#define CPP_HTMLPARSER_STRINGS_H_
3
4		#include <optional>
5		#include <sstream>
6		#include <string>
7		#include <string_view>
8		#include <vector>
9
10		namespace htmlparser {
11
12		class Strings {
13		public:
14		// One of:
15		// U+0009 CHARACTER TABULATION,
16		// U+000A LINE FEED (LF),
17		// U+000C FORM FEED (FF),
18		// U+000D CARRIAGE RETURN (CR), or
19		// U+0020 SPACE.
20		inline static const std::string kWhitespace {
21		// Do not sort or re-order.
22		' ',
23		'\t',
24		'\r',
25		'\n',
26		'\f',
27		'\v'};
28
29		// kWhitespace plus null char.
30		inline static const std::string kWhitespaceOrNull {
31		// Do not sort or re-order.
32		' ',
33		'\t',
34		'\r',
35		'\n',
36		'\f',
37		'\0',
38		'\v'};
39
40		inline static const std::string kEscapeChars {
41		// Do not sort or re-order.
42		'&',
43		'\'',
44		'<',
45		'>',
46		'"'};
47
48		inline static const std::string kNullChar = {'\0'};
49
50		inline static const std::string kNullReplacementChar {
51		'\xef', '\xbf', '\xbd'}; // encoded \ufffd (3 bytes).
52
53		// Decodes a percent encoded string like "google.com%20%2F%20%3Fx%3Db" to
54		// "google.com / ?x=b".
55		static std::optional<std::string> DecodePercentEncodedURL(
56		std::string_view uri);
57
58		// Returns hex string representation of a 4 byte codepoint.
59		static std::string ToHexString(uint32_t c);
60
61		// byte is in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z).
62		static bool IsCharAlphabet(char c);
63
64		// byte is in the range 0x30-0x39 (chars: 0-9)
65		static bool IsDigit(char c);
66
67		// Converts "\r" and "\r\n" in s to "\n".
68		// The conversion happens in place, but the resulting string may be shorter.
69		static void ConvertNewLines(std::string* s);
70
71		// UTF-8 Encoding/Decoding utility functions.
72		// =================================
73		//
74		// Checks if the byte is a beginning of a unicode codepoint byte sequence.
75		// First byte is masked as follows:
76		// 0b110xxxxx - 2 byte sequence.
77		// 0b1110xxxx - 3 byte sequence.
78		// 0b11110xxx - 4 byte sequence.
79		//
80		// Returns number of byte sequence needed to encode the codepoint.
81		static int8_t CodePointByteSequenceCount(uint8_t c);
82
83		// Similar to CodePointByteSequenceCount except that it accepts entire
84		// codepoint and tells how many bytes the codepoint contains.
85		static int8_t CodePointNumBytes(char32_t c);
86
87		// Decodes byte sequence to utf-8 codepoint.
88		// The s points to the first byte in the sequence. Moves the cursor past
89		// the byte sequence if decoding is successful.
90		//
91		// Returns 4 byte utf-8 codepoint value, or nullopt if:
92		// - First byte is not valid (IsCodePoint),
93		// - The three byte sequence includes unpaired surrogate which is not a scalar
94		// value.
95		// - Invalid utf-8 data.
96		static std::optional<char32_t> DecodeUtf8Symbol(std::string_view* s);
97
98		// Same as DecodeUtf8Symbol(string_view*) except that the prefix is not
99		// updated, meaning cursor is at the first byte of the current character
100		// decoded in s.
101		static std::optional<char32_t>
102	0	DecodeUtf8Symbol(std::string_view s, std::size_t position = 0) {
103	0	if (position < 0 || position > s.size()) return std::nullopt;
104	0
105	0	if (position == 0) {
106	0	return DecodeUtf8Symbol(&s);
107	0	}
108	0
109	0	std::string_view s_at_prefix = s.substr(position);
110	0	return DecodeUtf8Symbol(&s_at_prefix);
111	0	}
112
113		// Encodes a utf-8 codepoint.
114		// Fills the codepoint in the following sequence:
115		//
116		// 7 bits US ASCII characters.
117		// 0b0xxxxxx
118		//
119		// Codepoint upto 11 bits.
120		// 0b110xxxxx 0b10xxxxxx
121		//
122		// Codepoint upto 16 bits.
123		// 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
124		//
125		// Codepoint upto 21 bits.
126		// 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
127		//
128		// Returns nullopt on error.
129		static std::optional<std::string> EncodeUtf8Symbol(char32_t code_point);
130
131	0	static std::vector<char32_t> Utf8ToCodepoints(std::string_view utf8) {
132	0	std::vector<char32_t> out;
133	0	out.reserve(utf8.size() / 2);
134	0	// We use the UnicodeText abstraction because it handles
135	0	// validation / coersion under the hood, so what comes out of this is
136	0	// surely valid UTF8.
137	0	auto codepoint = DecodeUtf8Symbol(&utf8);
138	0	while (codepoint) {
139	0	out.push_back(*codepoint);
140	0	codepoint = DecodeUtf8Symbol(&utf8);
141	0	}
142	0	return out;
143	0	}
144
145		static void AppendCodepointToUtf8String(char32_t code,
146	0	std::string* utf8_str) {
147	0	// The implementation is modified from UnicodeText::push_back to append
148	0	// to an existing string, rather than allocate a new one.
149	0	auto encoded = EncodeUtf8Symbol(code);
150	0	if (encoded) {
151	0	*utf8_str += *encoded;
152	0	} else {
153	0	utf8_str->push_back(' ');
154	0	}
155	0	}
156
157	0	static std::string CodepointToUtf8String(char32_t code) {
158	0	auto output = htmlparser::Strings::EncodeUtf8Symbol(code);
159	0	return output ? std::move(output.value()) : "";
160	0	}
161
162		// Converts unicode code points to a string.
163	0	static std::string CodepointsToUtf8String(std::vector<char32_t> codes) {
164	0	std::stringbuf buf;
165	0	for (auto c : codes) {
166	0	if (auto encoded = htmlparser::Strings::EncodeUtf8Symbol(c);
167	0	encoded) {
168	0	buf.sputn(encoded->c_str(), encoded->size());
169	0	}
170	0	}
171	0	return buf.str();
172	0	}
173
174		// Returns index of the first instance of any character in chars or
175		// npos if no character found. For unicode character returns the index of
176		// initial byte of the sequence of bytes.
177		static std::size_t IndexAny(std::string_view s,
178		std::string_view chars);
179
180		// Escapes special characters like "<" to become "&lt;". It escapes only
181		// five such characters: <, >, &, ' and ".
182		// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
183		// always true.
184		static std::string EscapeString(std::string_view s);
185		static void Escape(std::string_view s, std::stringbuf* escaped);
186
187		// Unescapes s's entities in-place, so that "a&lt;b" becomes "a<b".
188		// attribute should be true if passing an attribute value.
189		// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
190		// always true.
191		static void UnescapeString(std::string* s, bool attribute = false);
192
193		// Converts case of string in-place.
194		static void ToLower(std::string* s);
195		static void ToUpper(std::string* s);
196
197		// Checks if string contains whitespace only chracters.
198		static bool IsAllWhitespaceChars(std::string_view s,
199		std::string_view whitespace_chars = kWhitespace);
200
201		// Case insensitive equals.
202		static bool EqualFold(std::string_view l, std::string_view r);
203
204		// Search replace functions.
205		// Replaces first occurrence of the f in s with t.
206		static void Replace(std::string* s, std::string_view f,
207		std::string_view t);
208		// Replaces all occurrences of the f in s with t.
209		static void ReplaceAll(std::string* s, std::string_view f,
210		std::string_view t);
211		static void ReplaceAny(std::string* s, std::string_view chars,
212		std::string_view to);
213
214		// Replaces the string of characters in abc with the string of characters
215		// in xyz. The first character in xyz will replace every occurrence of the
216		// first character in abc that appears in the str.
217		//
218		// Example:
219		// Translate("The quick brown fox.",
220		// "abcdefghijklmnopqrstuvwxyz",
221		// "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
222		// Returns:
223		// THE QUICK BROWN FOX.
224		//
225		// This works for utf-8 characters.
226		// Strings::Translate("AmAltAs", "A", "서") outputs: 서m서lt서s.
227		//
228		// If abc contains duplicates, replacements are performed in the order they
229		// appear in the target string. So Translate("amaltas", "atas", "ipox") will
230		// replace all occurrences of 'a' with 'i'. The later 'a'->'o' mapping is
231		// ignored.
232		//
233		// If abc is longer than xyz, then every occurrence of characters from str
234		// that do not have a corresponding character in xyz will be removed.
235		//
236		// Example:
237		// Strings::Translate("The quick brown fox.", "brown", "red");
238		// returns: "The quick red fdx."
239		//
240		// The translated string may be smaller or larger than the evaluated string.
241		//
242		// Translation is guaranteed for ASCII characters. Translation may fail if
243		// any of the strings str, abc, xyz contains utf-8 chars and decoding of those
244		// chars failed, in which case this function returns std::nullopt.
245		static std::optional<std::string> Translate(
246		std::string_view str, // String to evaluate.
247		std::string_view abc, // Chars that will be replaced.
248		std::string_view xyz); // Chars used for replacement.
249
250		// Strips whitespace from a string in-place.
251		static void TrimLeft(std::string* s,
252		std::string_view chars_to_trim = kWhitespace);
253		static void TrimRight(std::string* s,
254		std::string_view chars_to_trim = kWhitespace);
255		static void Trim(std::string* s,
256		std::string_view chars_to_trim = kWhitespace);
257		static void TrimLeft(std::string_view* s,
258		std::string_view chars_to_trim = kWhitespace);
259		static void TrimRight(std::string_view* s,
260		std::string_view chars_to_trim = kWhitespace);
261		static void Trim(std::string_view* s,
262		std::string_view chars_to_trim = kWhitespace);
263
264		static bool StripTrailingNewline(std::string* s);
265
266		// Reduces all consecutive sequences of space characters to a single space
267		// character.
268		// Resulting string may be smaller (resized) than the original string.
269		static void RemoveExtraSpaceChars(std::string* s);
270
271		// Prefix and suffix matching functions.
272		static bool StartsWith(std::string_view s, std::string_view prefix);
273		static bool EndsWith(std::string_view s, std::string_view suffix);
274
275		// Splits a string at delimiter character and returns the columns.
276		static std::vector<std::string> SplitStringAt(
277		std::string_view s, char delimiter);
278
279		// Splits the string at any utf8 or ascii whitespace and returns the columns.
280		// The returned values are the views of original string argument passed to
281		// this method. If the original string goes out of scope after this method is
282		// called, the contents of the columns is undefined.
283		static std::vector<std::string_view> SplitStrAtUtf8Whitespace(
284		std::string_view s);
285
286		// Determines if a character at current position is a whitespace char.
287		// Returns the number of bytes from current character that are part of the
288		// whitespace. For multichar whitespace like ideographic space \u3000 it
289		// returns 3 as ideographic space has 3 codepoints.
290		//
291		// Returns 0, if the character at current position is not a whitespace.
292		static int IsUtf8WhiteSpaceChar(std::string_view s, std::size_t position = 0);
293
294		// Counts number of terms in a text separated by whitespace and punctuations.
295		static int CountTerms(std::string_view s);
296
297		private:
298		// No instance of this class.
299		Strings() = default;
300		};
301
302		} // namespace htmlparser
303
304		#endif // CPP_HTMLPARSER_STRINGS_H_