/proc/self/cwd/cpp/htmlparser/document.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef CPP_HTMLPARSER_DOCUMENT_H_ |
2 | | #define CPP_HTMLPARSER_DOCUMENT_H_ |
3 | | |
#include <cstddef>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/status/status.h"
#include "cpp/htmlparser/allocator.h"
#include "cpp/htmlparser/iterators.h"
#include "cpp/htmlparser/node.h"
#include "cpp/htmlparser/token.h"
12 | | |
13 | | namespace htmlparser { |
14 | | |
15 | | class Parser; |
16 | | struct ParseOptions; |
17 | | |
18 | | // Contains pieces of information about a particular HTML parse operation. |
19 | | // Clients are expected to treat all the fields as constants, but are given the |
20 | | // flexibility of modifying for upstream error reporting. |
21 | | struct DocumentMetadata { |
22 | | public: |
23 | | // Tells if any of the HTML, HEAD, and/or BODY elements are generated by |
24 | | // the Parser because they were missing or implicitly created before they |
25 | | // are parsed. |
26 | | // Example: |
27 | | // Original document: <html><div>foo</div></html> |
28 | | // Parsed document: <html><head></head><body><div>foo</div></body></html> |
29 | | // |
30 | | // The has_manufactured_* accounting applies only to missing |
31 | | // TokenType::START_TAG_TOKEN. |
32 | | // If any of the </html>, </head>, or </body> end tags are missing, parser |
33 | | // auto closes the elements but they are not treated as manufactured from |
34 | | // the clients perspective. |
35 | | bool has_manufactured_html = false; |
36 | | bool has_manufactured_head = false; |
37 | | bool has_manufactured_body = false; |
38 | | |
39 | | // HTML5 algorithm handles duplication of unique tags by merging them and |
40 | | // producing a valid HTML. However, if clients are interested in knowing if |
41 | | // the original HTML source contains duplicate elements, following bits are |
42 | | // set. |
43 | | bool duplicate_html_elements = false; |
44 | | bool duplicate_body_elements = false; |
45 | | // Set only if above duplicate bits are true. |
46 | | std::optional<LineCol> duplicate_html_element_location = std::nullopt; |
47 | | std::optional<LineCol> duplicate_body_element_location = std::nullopt; |
48 | | |
49 | | // If true, parsed src is missing required <!doctype html> declaration or is |
50 | | // invalid syntax or is XHTML 4 or legacy doctype. |
51 | | bool quirks_mode = false; |
52 | | |
53 | | // The line column position of the last element in the document. Useful for |
54 | | // error reporting at the end of the document. |
55 | | LineCol document_end_location {0, 0}; |
56 | | |
57 | | // The actual size of html src in bytes. |
58 | | std::size_t html_src_bytes = 0; |
59 | | |
60 | | // The document's <base> url and target. |
61 | | // See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base |
62 | | std::pair<std::string, std::string> base_url; |
63 | | |
64 | | // The link rel=canonical url found in the html src. If multiple link |
65 | | // rel=canonical found the last one is recorded. |
66 | | std::string canonical_url; |
67 | | }; |
68 | | |
69 | | |
70 | | // The document class is a wrapper for the DOM tree exposed with RootNode(). |
71 | | // All the nodes inside the document are owned by document. The nodes are |
72 | | // destroyed when Document objects goes out of scope or deleted. |
73 | | // |
74 | | // Usage: |
75 | | // unique_ptr<Document> doc = parser.Parse(html); |
76 | | // if (!doc->status().ok()) { |
77 | | // LOG(ERROR) << "Parsing failed. " << doc->status(); |
78 | | // return; |
79 | | // } |
80 | | // |
81 | | // Node* root_node = doc.RootNode(); |
82 | | // ... |
83 | | // |
84 | | class Document { |
85 | | public: |
86 | | Document(); |
87 | 11.5k | ~Document() = default; |
88 | | |
89 | 0 | const DocumentMetadata& Metadata() const { return metadata_; } |
90 | | |
91 | | // Creates a new node. The node is owned by Document and is destroyed when |
92 | | // document is destructed. |
93 | | Node* NewNode(NodeType node_type, Atom atom = Atom::UNKNOWN); |
94 | | |
95 | | // Returns OK if Document is result of successful html parsing. |
96 | | // Accessing any fields/methods when status() != OK is undefined behavior. |
97 | 0 | absl::Status status() const { |
98 | 0 | return status_; |
99 | 0 | } |
100 | | |
101 | | // Returns the root node of a DOM tree. Node* owned by document. |
102 | 0 | Node* RootNode() const { return root_node_; } |
103 | | |
104 | | // Returns list of nodes parsed as a document fragment. All the Nodes are |
105 | | // owned by the document. |
106 | 0 | const std::vector<Node*> FragmentNodes() const { return fragment_nodes_; } |
107 | | |
108 | | using const_iterator = NodeIterator<true>; |
109 | | using iterator = NodeIterator<false>; |
110 | | |
111 | 0 | iterator begin() { return iterator{root_node_}; } |
112 | 0 | iterator end() { return iterator{nullptr}; } |
113 | 0 | const_iterator cbegin() const { return const_iterator{root_node_}; } |
114 | 0 | const_iterator cend() const { return const_iterator{nullptr}; } |
115 | | |
116 | | private: |
117 | | // Returns a new node with the same type, data and attributes. |
118 | | // The clone has no parent, no siblings and no children. |
119 | | // The node is owned by the document and is destroyed when document is |
120 | | // destructed. |
121 | | Node* CloneNode(const Node* from); |
122 | | |
123 | | // The node allocator. |
124 | | std::unique_ptr<Allocator<Node>> node_allocator_; |
125 | | |
126 | | Node* root_node_; |
127 | | std::vector<Node*> fragment_nodes_{}; |
128 | | std::size_t html_src_bytes_; |
129 | | DocumentMetadata metadata_; |
130 | | // Document parsing status. |
131 | | absl::Status status_ = absl::OkStatus(); |
132 | | |
133 | | friend class Parser; |
134 | | friend std::unique_ptr<Document> Parse(std::string_view html); |
135 | | friend std::unique_ptr<Document> ParseWithOptions( |
136 | | std::string_view html, const ParseOptions& options); |
137 | | friend std::unique_ptr<Document> ParseFragment(std::string_view html, |
138 | | Node* fragment_parent); |
139 | | friend std::unique_ptr<Document> ParseFragmentWithOptions( |
140 | | const std::string_view html, |
141 | | const ParseOptions& options, |
142 | | Node* fragment_parent); |
143 | | }; |
144 | | |
145 | | } // namespace htmlparser |
146 | | |
147 | | |
148 | | #endif // CPP_HTMLPARSER_DOCUMENT_H_ |