/proc/self/cwd/cpp/htmlparser/document.h

Source (jump to first uncovered line)
#ifndef CPP_HTMLPARSER_DOCUMENT_H_
#define CPP_HTMLPARSER_DOCUMENT_H_

#include <memory>
#include <vector>

#include "absl/status/status.h"
#include "cpp/htmlparser/allocator.h"
#include "cpp/htmlparser/iterators.h"
#include "cpp/htmlparser/node.h"
#include "cpp/htmlparser/token.h"

namespace htmlparser {

class Parser;
struct ParseOptions;

// Contains pieces of information about a particular HTML parse operation.
// Clients are expected to treat all the fields as constants, but are given the
// flexibility of modifying for upstream error reporting.
struct DocumentMetadata {
 public:
  // Tells if any of the HTML, HEAD, and/or BODY elements are generated by
  // the Parser because they were missing or implicitly created before they
  // are parsed.
  // Example:
  // Original document: <html><div>foo</div></html>
  // Parsed document: <html><head></head><body><div>foo</div></body></html>
  //
  // The has_manufactured_* accounting applies only to missing
  // TokenType::START_TAG_TOKEN.
  // If any of the </html>, </head>, or </body> end tags are missing, parser
  // auto closes the elements but they are not treated as manufactured from
  // the clients perspective.
  bool has_manufactured_html = false;
  bool has_manufactured_head = false;
  bool has_manufactured_body = false;

  // HTML5 algorithm handles duplication of unique tags by merging them and
  // producing a valid HTML. However, if clients are interested in knowing if
  // the original HTML source contains duplicate elements, following bits are
  // set.
  bool duplicate_html_elements = false;
  bool duplicate_body_elements = false;
  // Set only if above duplicate bits are true.
  std::optional<LineCol> duplicate_html_element_location = std::nullopt;
  std::optional<LineCol> duplicate_body_element_location = std::nullopt;

  // If true, parsed src is missing required <!doctype html> declaration or is
  // invalid syntax or is XHTML 4 or legacy doctype.
  bool quirks_mode = false;

  // The line column position of the last element in the document. Useful for
  // error reporting at the end of the document.
  LineCol document_end_location {0, 0};

  // The actual size of html src in bytes.
  std::size_t html_src_bytes = 0;

  // The document's <base> url and target.
  // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
  std::pair<std::string, std::string> base_url;

  // The link rel=canonical url found in the html src. If multiple link
  // rel=canonical found the last one is recorded.
  std::string canonical_url;
};


// The Document class is a wrapper for the DOM tree exposed with RootNode().
// All the nodes inside the document are owned by the document. The nodes are
// destroyed when the Document object goes out of scope or is deleted.
//
// Usage:
// unique_ptr<Document> doc = parser.Parse(html);
// if (!doc->status().ok()) {
//   LOG(ERROR) << "Parsing failed. " << doc->status();
//   return;
// }
//
// Node* root_node = doc->RootNode();
// ...
//
class Document {
 public:
  Document();
  ~Document() = default;

  // A Document cannot be copied. The implicit copy operations are already
  // deleted because of the std::unique_ptr member; declaring them deleted
  // here makes that contract visible in the interface.
  Document(const Document&) = delete;
  Document& operator=(const Document&) = delete;

  // Returns the metadata collected for this document.
  const DocumentMetadata& Metadata() const { return metadata_; }

  // Creates a new node. The node is owned by Document and is destroyed when
  // document is destructed.
  Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);

  // Returns OK if Document is result of successful html parsing.
  // Accessing any fields/methods when status() != OK is undefined behavior.
  absl::Status status() const {
    return status_;
  }

  // Returns the root node of a DOM tree. Node* owned by document.
  Node* RootNode() const { return root_node_; }

  // Returns list of nodes parsed as a document fragment. All the Nodes are
  // owned by the document.
  //
  // Returned by const reference so that a call does not copy the vector. The
  // reference refers to a member of this Document and must not be used after
  // the Document is destroyed.
  const std::vector<Node*>& FragmentNodes() const { return fragment_nodes_; }

  using const_iterator = NodeIterator<true>;
  using iterator = NodeIterator<false>;

  // Iteration starts at the root node; the end iterator wraps nullptr.
  iterator begin() { return iterator{root_node_}; }
  iterator end() { return iterator{nullptr}; }
  const_iterator cbegin() const { return const_iterator{root_node_}; }
  const_iterator cend() const { return const_iterator{nullptr}; }

 private:
  // Returns a new node with the same type, data and attributes.
  // The clone has no parent, no siblings and no children.
  // The node is owned by the document and is destroyed when document is
  // destructed.
  Node* CloneNode(const Node* from);

  // The node allocator.
  std::unique_ptr<Allocator<Node>> node_allocator_;

  // Default member initializers keep these members from ever holding an
  // indeterminate value; a constructor's member initializer list still takes
  // precedence over them.
  Node* root_node_ = nullptr;
  std::vector<Node*> fragment_nodes_{};
  std::size_t html_src_bytes_ = 0;
  DocumentMetadata metadata_;
  // Document parsing status.
  absl::Status status_ = absl::OkStatus();

  // The parser and the free parse functions populate the private state above.
  friend class Parser;
  friend std::unique_ptr<Document> Parse(std::string_view html);
  friend std::unique_ptr<Document> ParseWithOptions(
      std::string_view html, const ParseOptions& options);
  friend std::unique_ptr<Document> ParseFragment(std::string_view html,
                                                 Node* fragment_parent);
  friend std::unique_ptr<Document> ParseFragmentWithOptions(
      const std::string_view html,
      const ParseOptions& options,
      Node* fragment_parent);
};

}  // namespace htmlparser


#endif  // CPP_HTMLPARSER_DOCUMENT_H_

Line	Count	Source (jump to first uncovered line)
1		#ifndef CPP_HTMLPARSER_DOCUMENT_H_
2		#define CPP_HTMLPARSER_DOCUMENT_H_
3
4		#include <memory>
5		#include <vector>
6
7		#include "absl/status/status.h"
8		#include "cpp/htmlparser/allocator.h"
9		#include "cpp/htmlparser/iterators.h"
10		#include "cpp/htmlparser/node.h"
11		#include "cpp/htmlparser/token.h"
12
13		namespace htmlparser {
14
15		class Parser;
16		struct ParseOptions;
17
18		// Contains pieces of information about a particular HTML parse operation.
19		// Clients are expected to treat all the fields as constants, but are given the
20		// flexibility of modifying for upstream error reporting.
21		struct DocumentMetadata {
22		public:
23		// Tells if any of the HTML, HEAD, and/or BODY elements are generated by
24		// the Parser because they were missing or implicitly created before they
25		// are parsed.
26		// Example:
27		// Original document: <html><div>foo</div></html>
28		// Parsed document: <html><head></head><body><div>foo</div></body></html>
29		//
30		// The has_manufactured_* accounting applies only to missing
31		// TokenType::START_TAG_TOKEN.
32		// If any of the </html>, </head>, or </body> end tags are missing, parser
33		// auto closes the elements but they are not treated as manufactured from
34		// the clients perspective.
35		bool has_manufactured_html = false;
36		bool has_manufactured_head = false;
37		bool has_manufactured_body = false;
38
39		// HTML5 algorithm handles duplication of unique tags by merging them and
40		// producing a valid HTML. However, if clients are interested in knowing if
41		// the original HTML source contains duplicate elements, following bits are
42		// set.
43		bool duplicate_html_elements = false;
44		bool duplicate_body_elements = false;
45		// Set only if above duplicate bits are true.
46		std::optional<LineCol> duplicate_html_element_location = std::nullopt;
47		std::optional<LineCol> duplicate_body_element_location = std::nullopt;
48
49		// If true, parsed src is missing required <!doctype html> declaration or is
50		// invalid syntax or is XHTML 4 or legacy doctype.
51		bool quirks_mode = false;
52
53		// The line column position of the last element in the document. Useful for
54		// error reporting at the end of the document.
55		LineCol document_end_location {0, 0};
56
57		// The actual size of html src in bytes.
58		std::size_t html_src_bytes = 0;
59
60		// The document's <base> url and target.
61		// See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
62		std::pair<std::string, std::string> base_url;
63
64		// The link rel=canonical url found in the html src. If multiple link
65		// rel=canonical found the last one is recorded.
66		std::string canonical_url;
67		};
68
69
70		// The document class is a wrapper for the DOM tree exposed with RootNode().
71		// All the nodes inside the document are owned by document. The nodes are
72		// destroyed when Document objects goes out of scope or deleted.
73		//
74		// Usage:
75		// unique_ptr<Document> doc = parser.Parse(html);
76		// if (!doc->status().ok()) {
77		// LOG(ERROR) << "Parsing failed. " << doc->status();
78		// return;
79		// }
80		//
81		// Node* root_node = doc.RootNode();
82		// ...
83		//
84		class Document {
85		public:
86		Document();
87	11.5k	~Document() = default;
88
89	0	const DocumentMetadata& Metadata() const { return metadata_; }
90
91		// Creates a new node. The node is owned by Document and is destroyed when
92		// document is destructed.
93		Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);
94
95		// Returns OK if Document is result of successful html parsing.
96		// Accessing any fields/methods when status() != OK is undefined behavior.
97	0	absl::Status status() const {
98	0	return status_;
99	0	}
100
101		// Returns the root node of a DOM tree. Node* owned by document.
102	0	Node* RootNode() const { return root_node_; }
103
104		// Returns list of nodes parsed as a document fragment. All the Nodes are
105		// owned by the document.
106	0	const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }
107
108		using const_iterator = NodeIterator<true>;
109		using iterator = NodeIterator<false>;
110
111	0	iterator begin() { return iterator{root_node_}; }
112	0	iterator end() { return iterator{nullptr}; }
113	0	const_iterator cbegin() const { return const_iterator{root_node_}; }
114	0	const_iterator cend() const { return const_iterator{nullptr}; }
115
116		private:
117		// Returns a new node with the same type, data and attributes.
118		// The clone has no parent, no siblings and no children.
119		// The node is owned by the document and is destroyed when document is
120		// destructed.
121		Node* CloneNode(const Node* from);
122
123		// The node allocator.
124		std::unique_ptr<Allocator<Node>> node_allocator_;
125
126		Node* root_node_;
127		std::vector<Node*> fragment_nodes_{};
128		std::size_t html_src_bytes_;
129		DocumentMetadata metadata_;
130		// Document parsing status.
131		absl::Status status_ = absl::OkStatus();
132
133		friend class Parser;
134		friend std::unique_ptr<Document> Parse(std::string_view html);
135		friend std::unique_ptr<Document> ParseWithOptions(
136		std::string_view html, const ParseOptions& options);
137		friend std::unique_ptr<Document> ParseFragment(std::string_view html,
138		Node* fragment_parent);
139		friend std::unique_ptr<Document> ParseFragmentWithOptions(
140		const std::string_view html,
141		const ParseOptions& options,
142		Node* fragment_parent);
143		};
144
145		} // namespace htmlparser
146
147
148		#endif // CPP_HTMLPARSER_DOCUMENT_H_

Coverage Report

Created: 2025-07-23 06:45