/proc/self/cwd/cpp/htmlparser/tokenizer.h

Source (jump to first uncovered line)
#ifndef CPP_HTMLPARSER_TOKENIZER_H_
#define CPP_HTMLPARSER_TOKENIZER_H_

#include <array>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <vector>

#include "cpp/htmlparser/token.h"

namespace htmlparser {

class Tokenizer {
 public:
  // Constructs a new HTML Tokenizer for the given html.
  // The input is assumed to be UTF-8 encoded.
  //
  // If tokenizing InnerHTML fragment, context_tag is that element's tag, such
  // as "div" or "iframe".
  explicit Tokenizer(std::string_view html, std::string context_tag = "");

  Tokenizer() = delete;

  // Span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
  // the end is exclusive.
  struct Span {
    int start = 0;
    int end = 0;
  };

  using RawAttribute = std::tuple<Span, Span, LineCol>;

  // Sets whether or not the tokenizer recognizes `<![CDATA[foo]]>` as
  // the text "foo". The default value is false, which means to recognize it as
  // a bogus comment `<!-- [CDATA[foo]] -->` instead.
  //
  // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
  // only if tokenizing foreign content, such as MathML and SVG. However,
  // tracking foreign-contentness is difficult to do purely in the tokenizer,
  // as opposed to the parser, due to HTML integration points: an <svg> element
  // can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
  // HTML. For strict compliance with the HTML5 tokenization algorithm, it is
  // the responsibility of the user of a tokenizer to call AllowCDATA as
  // appropriate.
  // In practice, if using the tokenizer without caring whether MathML or SVG
  // CDATA is text or comments, such as tokenizing HTML to find all the anchor
  // text, it is acceptable to ignore this responsibility.
  void SetAllowCDATA(bool allow_cdata);

  // NextIsNotRawText instructs the tokenizer that the next token should not be
  // considered as 'raw text'. Some elements, such as script and title elements,
  // normally require the next token after the opening tag to be 'raw text' that
  // has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
  // yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
  // an end tag token for "</title>". There are no distinct start tag or end tag
  // tokens for the "<b>" and "</b>".
  //
  // This tokenizer implementation will generally look for raw text at the right
  // times. Strictly speaking, an HTML5 compliant tokenizer should not look for
  // raw text if in foreign content: <title> generally needs raw text, but a
  // <title> inside an <svg> does not. Another example is that a <textarea>
  // generally needs raw text, but a <textarea> is not allowed as an immediate
  // child of a <select>; in normal parsing, a <textarea> implies </select>, but
  // one cannot close the implicit element when parsing a <select>'s InnerHTML.
  // Similarly to AllowCDATA, tracking the correct moment to override raw-text-
  // ness is difficult to do purely in the tokenizer, as opposed to the parser.
  // For strict compliance with the HTML5 tokenization algorithm, it is the
  // responsibility of the user of a tokenizer to call NextIsNotRawText as
  // appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
  // responsibility for basic usage.
  //
  // Note that this 'raw text' concept is different from the one offered by the
  // Tokenizer.Raw method.
  void NextIsNotRawText();

  // The following two states have special meaning.
  // EOF is when tokenizer reaches end of HTML input.
  // Error is the first time error encountered during tokenization.
  // It is possible for not Error && EOF. which means Next call will return an
  // error token. For example, if the HTML text was just "plain", or
  // `<!-- xml version="1.0"` (without closing), then the first Next call would
  // set EOF and return TextToken or CommentToken. Subsequent calls to Next
  // will return ErrorToken with Error set to true.
  bool IsEOF() const { return eof_; }
  bool Error() const { return err_; }

  // Returns the unmodified text of the current token. Calling Next, Token,
  // Text, TagName or TagAttr may change the contents of the returned slice.
  std::string_view Raw();

  // Scans the next token and returns its type.
  TokenType Next(bool template_mode = false);

  // Returns the unescaped text of a text, comment or doctype token. The
  // contents of the returned slice may change on the next call to Next.
  std::string Text();

  // Returns the lower-cased name of a tag token (the `img` out of
  // `<IMG SRC="foo">`) and whether the tag has attributes.
  // The contents of the returned slice may change on the next call to Next.
  std::optional<std::tuple<std::string, bool>> TagName();

  // Returns the lower-cased key and unescaped value of the next unparsed
  // attribute for the current tag token and whether there are more attributes.
  // The contents of the returned slices may change on the next call to Next.
  std::optional<std::tuple<Attribute, bool>> TagAttr();

  // Returns the current Token. The result's Data and Attr values remain
  // valid after subsequent Next calls.
  Token token();

  // Returns current position of the tokenizer in the html source.
  LineCol CurrentPosition() { return current_line_col_; }

  // Count of lines processed in html source.
  int LinesProcessed() { return lines_cols_.size(); }

 private:
  // Fragment tokenization is allowed from these parent elements only.
  inline static constexpr std::array<Atom, 10> kAllowedFragmentContainers{
      Atom::IFRAME,    Atom::NOEMBED, Atom::NOFRAMES, Atom::NOSCRIPT,
      Atom::PLAINTEXT, Atom::SCRIPT,  Atom::STYLE,    Atom::TEXTAREA,
      Atom::TITLE,     Atom::XMP,
  };

  // Returns the next byte from the input stream, doing a buffered read
  // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
  // contiguous byte slice that holds all the bytes read so far for the current
  // token.
  // It sets z.err if the underlying reader returns an error.
  // Pre-condition: z.err == nil.
  char ReadByte();

  // Moves cursor back past one byte.
  void UnreadByte();

  // Reads until next ">".
  void ReadUntilCloseAngle();

  // Reads the next start tag token. The opening "<a" has already
  // been consumed, where 'a' means anything in [A-Za-z].
  TokenType ReadStartTag(bool template_mode = false);

  // Attempts to read a CDATA section and returns true if
  // successful. The opening "<!" has already been consumed.
  bool ReadCDATA();

  // Reads until the next "</foo>", where "foo" is z.rawTag and
  // is typically something like "script" or "textarea".
  void ReadRawOrRCDATA();

  // Attempts to read a doctype declaration and returns true if
  // successful. The opening "<!" has already been consumed.
  bool ReadDoctype();

  // Reads the next tag token and its attributes. If saveAttr, those
  // attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
  // The opening "<a" or "</a" has already been consumed, where 'a' means
  // anything in [A-Za-z].
  void ReadTag(bool save_attr, bool template_mode = false);

  // Sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
  // is positioned such that the first byte of the tag name (the "d" in "<div")
  // has already been consumed.
  void ReadTagName();

  // Sets z.pendingAttr[0] to the "k" in "<div k=v>".
  // Precondition: z.err == nil.
  void ReadTagAttributeKey(bool template_mode = false);

  // Sets z.pendingAttr[1] to the "v" in "<div k=v>".
  void ReadTagAttributeValue();

  // Attempts to read a tag like "</foo>", where "foo" is z.rawTag.
  // If it succeeds, it backs up the input position to reconsume the tag and
  // returns true. Otherwise it returns false. The opening "</" has already been
  // consumed.
  bool ReadRawEndTag();

  // Reads until the next </script> tag, following the byzantine
  // rules for escaping/hiding the closing tag.
  void ReadScript();

  // Reads the next token starting with "<!". It might be
  // a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
  // "<!a bogus comment". The opening "<!" has already been consumed.
  TokenType ReadMarkupDeclaration();

  // Reads the next comment token starting with "<!--". The opening
  // "<!--" has already been consumed.
  void ReadComment();

  // Skips past any white space.
  void SkipWhiteSpace();

  // Returns whether the start tag in buffer[data.start:data.end]
  // case-insensitively matches any element of ss.
  template <typename... Args>
  bool StartTagIn(Args... ss);

  std::string_view buffer_;

  // buffer_[raw.start:raw.end] holds the raw bytes of the current token.
  // buf[raw.end:] is buffered input that will yield future tokens.
  Span raw_ = {0, 0};

  // buffer_[data.start:data.end] holds the raw bytes of the current token's
  // data: a text token's text, a tag token's tag name, etc.
  Span data_ = {0, 0};

  // TokenType of the current token.
  TokenType token_type_;

  // Attribute key and value currently being tokenized.
  RawAttribute pending_attribute_;

  std::vector<RawAttribute> attributes_{};

  int n_attributes_returned_ = 0;

  // raw_tag_ is the "script" in "</script>" that closes the next token. If
  // non-empty, the subsequent call to Next will return a raw or RCDATA text
  // token: one that treats "<p>" as text instead of an element.
  // raw_tag_'s contents are lower-cased.
  std::string raw_tag_;

  // text_is_raw_ is whether the current text token's data is not escaped.
  bool text_is_raw_ = false;

  // Whether NULL bytes in the current token's data should be converted
  // into \ufffd replacement characters.
  bool convert_null_ = false;

  // allow_cdata_ is whether CDATA sectiosn are allowed in the current context.
  bool allow_cdata_ = false;

  // Cursor reached the end of the buffer.
  bool eof_ = false;
  bool err_ = false;

  // Tells if the token is manufactured.
  // In a few cases, for example '<' followed by '?', is treated as comment and
  // a comment token is manufactured.
  // This is not same as manufactured html, head, body, tbody, thead etc,
  // these are manufactured during parsing, not tokenization.
  // This field accounts for only special cases where illegal characters leads
  // to  manufacturing of comments token.
  // Eg:
  // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name
  bool is_token_manufactured_ = false;

  // Keeps track of all the lines and columns in HTML source.
  std::vector<LineCol> lines_cols_;

  // Keeps track of current position of the cursor.
  LineCol current_line_col_;

  // Current token's line col record. One line can have several tokens.
  LineCol token_line_col_;
};

}  // namespace htmlparser

#endif  // CPP_HTMLPARSER_TOKENIZER_H_

Coverage Report

Created: 2025-07-23 06:45

Line	Count	Source (jump to first uncovered line)
1		#ifndef CPP_HTMLPARSER_TOKENIZER_H_
2		#define CPP_HTMLPARSER_TOKENIZER_H_
3
4		#include <memory>
5		#include <optional>
6		#include <tuple>
7		#include <vector>
8
9		#include "cpp/htmlparser/token.h"
10
11		namespace htmlparser {
12
13		class Tokenizer {
14		public:
15		// Constructs a new HTML Tokenizer for the given html.
16		// The input is assumed to be UTF-8 encoded.
17		//
18		// If tokenizing InnerHTML fragment, context_tag is that element's tag, such
19		// as "div" or "iframe".
20		explicit Tokenizer(std::string_view html, std::string context_tag = "");
21
22		Tokenizer() = delete;
23
24		// Span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
25		// the end is exclusive.
26		struct Span {
27		int start = 0;
28		int end = 0;
29		};
30
31		using RawAttribute = std::tuple<Span, Span, LineCol>;
32
33		// Sets whether or not the tokenizer recognizes `<![CDATA[foo]]>` as
34		// the text "foo". The default value is false, which means to recognize it as
35		// a bogus comment `<!-- [CDATA[foo]] -->` instead.
36		//
37		// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
38		// only if tokenizing foreign content, such as MathML and SVG. However,
39		// tracking foreign-contentness is difficult to do purely in the tokenizer,
40		// as opposed to the parser, due to HTML integration points: an <svg> element
41		// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
42		// HTML. For strict compliance with the HTML5 tokenization algorithm, it is
43		// the responsibility of the user of a tokenizer to call AllowCDATA as
44		// appropriate.
45		// In practice, if using the tokenizer without caring whether MathML or SVG
46		// CDATA is text or comments, such as tokenizing HTML to find all the anchor
47		// text, it is acceptable to ignore this responsibility.
48		void SetAllowCDATA(bool allow_cdata);
49
50		// NextIsNotRawText instructs the tokenizer that the next token should not be
51		// considered as 'raw text'. Some elements, such as script and title elements,
52		// normally require the next token after the opening tag to be 'raw text' that
53		// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
54		// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
55		// an end tag token for "</title>". There are no distinct start tag or end tag
56		// tokens for the "<b>" and "</b>".
57		//
58		// This tokenizer implementation will generally look for raw text at the right
59		// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
60		// raw text if in foreign content: <title> generally needs raw text, but a
61		// <title> inside an <svg> does not. Another example is that a <textarea>
62		// generally needs raw text, but a <textarea> is not allowed as an immediate
63		// child of a <select>; in normal parsing, a <textarea> implies </select>, but
64		// one cannot close the implicit element when parsing a <select>'s InnerHTML.
65		// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
66		// ness is difficult to do purely in the tokenizer, as opposed to the parser.
67		// For strict compliance with the HTML5 tokenization algorithm, it is the
68		// responsibility of the user of a tokenizer to call NextIsNotRawText as
69		// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
70		// responsibility for basic usage.
71		//
72		// Note that this 'raw text' concept is different from the one offered by the
73		// Tokenizer.Raw method.
74		void NextIsNotRawText();
75
76		// The following two states have special meaning.
77		// EOF is when tokenizer reaches end of HTML input.
78		// Error is the first time error encountered during tokenization.
79		// It is possible for not Error && EOF. which means Next call will return an
80		// error token. For example, if the HTML text was just "plain", or
81		// `<!-- xml version="1.0"` (without closing), then the first Next call would
82		// set EOF and return TextToken or CommentToken. Subsequent calls to Next
83		// will return ErrorToken with Error set to true.
84	23.0k	bool IsEOF() const { return eof_; }
85	0	bool Error() const { return err_; }
86
87		// Returns the unmodified text of the current token. Calling Next, Token,
88		// Text, TagName or TagAttr may change the contents of the returned slice.
89		std::string_view Raw();
90
91		// Scans the next token and returns its type.
92		TokenType Next(bool template_mode = false);
93
94		// Returns the unescaped text of a text, comment or doctype token. The
95		// contents of the returned slice may change on the next call to Next.
96		std::string Text();
97
98		// Returns the lower-cased name of a tag token (the `img` out of
99		// `<IMG SRC="foo">`) and whether the tag has attributes.
100		// The contents of the returned slice may change on the next call to Next.
101		std::optional<std::tuple<std::string, bool>> TagName();
102
103		// Returns the lower-cased key and unescaped value of the next unparsed
104		// attribute for the current tag token and whether there are more attributes.
105		// The contents of the returned slices may change on the next call to Next.
106		std::optional<std::tuple<Attribute, bool>> TagAttr();
107
108		// Returns the current Token. The result's Data and Attr values remain
109		// valid after subsequent Next calls.
110		Token token();
111
112		// Returns current position of the tokenizer in the html source.
113	11.5k	LineCol CurrentPosition() { return current_line_col_; }
114
115		// Count of lines processed in html source.
116	0	int LinesProcessed() { return lines_cols_.size(); }
117
118		private:
119		// Fragment tokenization is allowed from these parent elements only.
120		inline static constexpr std::array<Atom, 10> kAllowedFragmentContainers{
121		Atom::IFRAME, Atom::NOEMBED, Atom::NOFRAMES, Atom::NOSCRIPT,
122		Atom::PLAINTEXT, Atom::SCRIPT, Atom::STYLE, Atom::TEXTAREA,
123		Atom::TITLE, Atom::XMP,
124		};
125
126		// Returns the next byte from the input stream, doing a buffered read
127		// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
128		// contiguous byte slice that holds all the bytes read so far for the current
129		// token.
130		// It sets z.err if the underlying reader returns an error.
131		// Pre-condition: z.err == nil.
132		char ReadByte();
133
134		// Moves cursor back past one byte.
135		void UnreadByte();
136
137		// Reads until next ">".
138		void ReadUntilCloseAngle();
139
140		// Reads the next start tag token. The opening "<a" has already
141		// been consumed, where 'a' means anything in [A-Za-z].
142		TokenType ReadStartTag(bool template_mode = false);
143
144		// Attempts to read a CDATA section and returns true if
145		// successful. The opening "<!" has already been consumed.
146		bool ReadCDATA();
147
148		// Reads until the next "</foo>", where "foo" is z.rawTag and
149		// is typically something like "script" or "textarea".
150		void ReadRawOrRCDATA();
151
152		// Attempts to read a doctype declaration and returns true if
153		// successful. The opening "<!" has already been consumed.
154		bool ReadDoctype();
155
156		// Reads the next tag token and its attributes. If saveAttr, those
157		// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
158		// The opening "<a" or "</a" has already been consumed, where 'a' means
159		// anything in [A-Za-z].
160		void ReadTag(bool save_attr, bool template_mode = false);
161
162		// Sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
163		// is positioned such that the first byte of the tag name (the "d" in "<div")
164		// has already been consumed.
165		void ReadTagName();
166
167		// Sets z.pendingAttr[0] to the "k" in "<div k=v>".
168		// Precondition: z.err == nil.
169		void ReadTagAttributeKey(bool template_mode = false);
170
171		// Sets z.pendingAttr[1] to the "v" in "<div k=v>".
172		void ReadTagAttributeValue();
173
174		// Attempts to read a tag like "</foo>", where "foo" is z.rawTag.
175		// If it succeeds, it backs up the input position to reconsume the tag and
176		// returns true. Otherwise it returns false. The opening "</" has already been
177		// consumed.
178		bool ReadRawEndTag();
179
180		// Reads until the next </script> tag, following the byzantine
181		// rules for escaping/hiding the closing tag.
182		void ReadScript();
183
184		// Reads the next token starting with "<!". It might be
185		// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
186		// "<!a bogus comment". The opening "<!" has already been consumed.
187		TokenType ReadMarkupDeclaration();
188
189		// Reads the next comment token starting with "<!--". The opening
190		// "<!--" has already been consumed.
191		void ReadComment();
192
193		// Skips past any white space.
194		void SkipWhiteSpace();
195
196		// Returns whether the start tag in buffer[data.start:data.end]
197		// case-insensitively matches any element of ss.
198		template <typename... Args>
199		bool StartTagIn(Args... ss);
200
201		std::string_view buffer_;
202
203		// buffer_[raw.start:raw.end] holds the raw bytes of the current token.
204		// buf[raw.end:] is buffered input that will yield future tokens.
205		Span raw_ = {0, 0};
206
207		// buffer_[data.start:data.end] holds the raw bytes of the current token's
208		// data: a text token's text, a tag token's tag name, etc.
209		Span data_ = {0, 0};
210
211		// TokenType of the current token.
212		TokenType token_type_;
213
214		// Attribute key and value currently being tokenized.
215		RawAttribute pending_attribute_;
216
217		std::vector<RawAttribute> attributes_{};
218
219		int n_attributes_returned_ = 0;
220
221		// raw_tag_ is the "script" in "</script>" that closes the next token. If
222		// non-empty, the subsequent call to Next will return a raw or RCDATA text
223		// token: one that treats "<p>" as text instead of an element.
224		// raw_tag_'s contents are lower-cased.
225		std::string raw_tag_;
226
227		// text_is_raw_ is whether the current text token's data is not escaped.
228		bool text_is_raw_ = false;
229
230		// Whether NULL bytes in the current token's data should be converted
231		// into \ufffd replacement characters.
232		bool convert_null_ = false;
233
234		// allow_cdata_ is whether CDATA sections are allowed in the current context.
235		bool allow_cdata_ = false;
236
237		// Cursor reached the end of the buffer.
238		bool eof_ = false;
239		bool err_ = false;
240
241		// Tells if the token is manufactured.
242		// In a few cases, for example '<' followed by '?', is treated as comment and
243		// a comment token is manufactured.
244		// This is not same as manufactured html, head, body, tbody, thead etc,
245		// these are manufactured during parsing, not tokenization.
246		// This field accounts for only special cases where illegal characters leads
247		// to manufacturing of comments token.
248		// Eg:
249		// https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name
250		bool is_token_manufactured_ = false;
251
252		// Keeps track of all the lines and columns in HTML source.
253		std::vector<LineCol> lines_cols_;
254
255		// Keeps track of current position of the cursor.
256		LineCol current_line_col_;
257
258		// Current token's line col record. One line can have several tokens.
259		LineCol token_line_col_;
260		};
261
262		} // namespace htmlparser
263
264		#endif // CPP_HTMLPARSER_TOKENIZER_H_