Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/document.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef CPP_HTMLPARSER_DOCUMENT_H_
2
#define CPP_HTMLPARSER_DOCUMENT_H_
3
4
#include <memory>
5
#include <vector>
6
7
#include "absl/status/status.h"
8
#include "cpp/htmlparser/allocator.h"
9
#include "cpp/htmlparser/iterators.h"
10
#include "cpp/htmlparser/node.h"
11
#include "cpp/htmlparser/token.h"
12
13
namespace htmlparser {
14
15
class Parser;
16
struct ParseOptions;
17
18
// Contains pieces of information about a particular HTML parse operation.
19
// Clients are expected to treat all the fields as constants, but are given the
20
// flexibility of modifying for upstream error reporting.
21
struct DocumentMetadata {
22
 public:
23
  // Tell whether the HTML, HEAD, and/or BODY elements were generated by the
24
  // Parser because they were missing, or were implicitly created before they
25
  // were parsed.
26
  // Example:
27
  // Original document: <html><div>foo</div></html>
28
  // Parsed document: <html><head></head><body><div>foo</div></body></html>
29
  //
30
  // The has_manufactured_* accounting applies only to missing
31
  // TokenType::START_TAG_TOKEN.
32
  // If any of the </html>, </head>, or </body> end tags are missing, the
33
  // parser auto-closes the elements, but they are not treated as manufactured
34
  // from the client's perspective.
35
  bool has_manufactured_html = false;
36
  bool has_manufactured_head = false;
37
  bool has_manufactured_body = false;
38
39
  // The HTML5 algorithm handles duplication of unique tags by merging them and
40
  // producing valid HTML. However, for clients interested in knowing whether
41
  // the original HTML source contains duplicate elements, the following bits
42
  // are set.
43
  bool duplicate_html_elements = false;
44
  bool duplicate_body_elements = false;
45
  // Set only if the corresponding duplicate bit above is true.
46
  std::optional<LineCol> duplicate_html_element_location = std::nullopt;
47
  std::optional<LineCol> duplicate_body_element_location = std::nullopt;
48
49
  // If true, parsed src is missing required <!doctype html> declaration or is
50
  // invalid syntax or is XHTML 4 or legacy doctype.
51
  bool quirks_mode = false;
52
53
  // The line/column position of the last element in the document. Useful for
54
  // error reporting at the end of the document.
55
  LineCol document_end_location {0, 0};
56
57
  // The actual size of the html src in bytes.
58
  std::size_t html_src_bytes = 0;
59
60
  // The document's <base> url and target.
61
  // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
62
  std::pair<std::string, std::string> base_url;
63
64
  // The link rel=canonical url found in the html src. If multiple link
65
  // rel=canonical elements are found, the last one is recorded.
66
  std::string canonical_url;
67
};
68
69
70
// The document class is a wrapper for the DOM tree exposed with RootNode().
71
// All the nodes inside the document are owned by document. The nodes are
72
// destroyed when the Document object goes out of scope or is deleted.
73
//
74
// Usage:
75
// unique_ptr<Document> doc = parser.Parse(html);
76
// if (!doc->status().ok()) {
77
//   LOG(ERROR) << "Parsing failed. " << doc->status();
78
//   return;
79
// }
80
//
81
// Node* root_node = doc->RootNode();
82
// ...
83
//
84
class Document {
85
 public:
86
  Document();
87
11.5k
  ~Document() = default;
88
89
0
  const DocumentMetadata& Metadata() const { return metadata_; }
90
91
  // Creates a new node. The node is owned by the Document and is destroyed when
92
  // the document is destructed.
93
  Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN);
94
95
  // Returns OK if the Document is the result of successful html parsing.
96
  // Accessing any fields/methods when status() != OK is undefined behavior.
97
0
  absl::Status status() const {
98
0
    return status_;
99
0
  }
100
101
  // Returns the root node of the DOM tree. The Node* is owned by the document.
102
0
  Node* RootNode() const { return root_node_; }
103
104
  // Returns a copy of the list of nodes parsed as a document fragment. The
105
  // Nodes themselves are owned by the document.
106
0
  std::vector<Node*> FragmentNodes() const { return fragment_nodes_; }
107
108
  using const_iterator = NodeIterator<true>;
109
  using iterator = NodeIterator<false>;
110
111
0
  iterator begin() { return iterator{root_node_}; }
112
0
  iterator end() { return iterator{nullptr}; }
113
0
  const_iterator cbegin() const { return const_iterator{root_node_}; }
114
0
  const_iterator cend() const { return const_iterator{nullptr}; }
115
116
 private:
117
  // Returns a new node with the same type, data and attributes.
118
  // The clone has no parent, no siblings and no children.
119
  // The node is owned by the document and is destroyed when the document is
120
  // destructed.
121
  Node* CloneNode(const Node* from);
122
123
  // The node allocator.
124
  std::unique_ptr<Allocator<Node>> node_allocator_;
125
126
  Node* root_node_ = nullptr;  // Null until a root node is assigned.
127
  std::vector<Node*> fragment_nodes_{};
128
  std::size_t html_src_bytes_ = 0;
129
  DocumentMetadata metadata_;
130
  // Document parsing status.
131
  absl::Status status_ = absl::OkStatus();
132
133
  friend class Parser;
134
  friend std::unique_ptr<Document> Parse(std::string_view html);
135
  friend std::unique_ptr<Document> ParseWithOptions(
136
      std::string_view html, const ParseOptions& options);
137
  friend std::unique_ptr<Document> ParseFragment(std::string_view html,
138
                                                 Node* fragment_parent);
139
  friend std::unique_ptr<Document> ParseFragmentWithOptions(
140
      const std::string_view html,
141
      const ParseOptions& options,
142
      Node* fragment_parent);
143
};
144
145
}  // namespace htmlparser
146
147
148
#endif  // CPP_HTMLPARSER_DOCUMENT_H_