/proc/self/cwd/cpp/htmlparser/node.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef CPP_HTMLPARSER_NODE_H_ |
2 | | #define CPP_HTMLPARSER_NODE_H_ |
3 | | |
4 | | #include <deque> |
5 | | #include <memory> |
6 | | #include <optional> |
7 | | #include <string> |
8 | | #include <string_view> |
9 | | #include <tuple> |
10 | | #include <utility> |
11 | | #include <vector> |
12 | | |
13 | | #include "cpp/htmlparser/atom.h" |
14 | | #include "cpp/htmlparser/error.h" |
15 | | #include "cpp/htmlparser/token.h" |
16 | | |
17 | | namespace htmlparser { |
18 | | |
19 | | class Parser; |
20 | | |
21 | | enum class NodeType { |
22 | | ERROR_NODE, |
23 | | TEXT_NODE, |
24 | | DOCUMENT_NODE, |
25 | | ELEMENT_NODE, |
26 | | COMMENT_NODE, |
27 | | DOCTYPE_NODE, |
28 | | SCOPE_MARKER_NODE, |
29 | | }; |
30 | | |
31 | | // A Node consists of a NodeType and data (for text and comment node). |
32 | | // A node is a member of a tree of Nodes. Element nodes may also |
33 | | // have a Namespace and contain a slice of Attributes. Data is unescaped, so |
34 | | // that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom |
35 | | // is the atom for Data, or Atom::UNKNOWN if Data is not a known tag name. |
36 | | // |
37 | | // An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace. |
38 | | // Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and |
39 | | // "svg" is short for "http://www.w3.org/2000/svg". |
40 | | class Node { |
41 | | public: |
42 | | Node(NodeType node_type, Atom atom = Atom::UNKNOWN, |
43 | | std::string name_space = ""); |
44 | 24.3M | ~Node() = default; |
45 | | |
46 | | // Allows move. |
47 | | Node(Node&&) = default; |
48 | | Node& operator=(Node&&) = default; |
49 | | |
50 | | // Disallow copy and assign. |
51 | | Node(const Node&) = delete; |
52 | | void operator=(const Node&) = delete; |
53 | | |
54 | | void SetData(std::string_view data); |
55 | | void AddAttribute(const Attribute& attr); |
56 | | // Sorts the attributes of this node. |
57 | | void SortAttributes(bool remove_duplicates = false); |
58 | | void DropDuplicateAttributes(); |
59 | | |
60 | | // Updates child nodes line and column numbers relative to the given node. |
61 | | // This does not change order or parent/child relationship of this or child |
62 | | // nodes in the tree. |
63 | | // Generally, treat this as a private function. Part of public interface for |
64 | | // some specific scenarios: |
65 | | // A) Unit testing. |
66 | | // B) When parsing a fragment. |
67 | | // C) Custom error/warning reporting. |
68 | | void UpdateChildNodesPositions(Node* relative_node); |
69 | | |
70 | 7.85M | NodeType Type() const { return node_type_; } |
71 | 0 | std::string_view Data() const { return data_; } |
72 | 2.23M | Atom DataAtom() const { return atom_; } |
73 | 959k | std::string_view NameSpace() const { return name_space_; } |
74 | | // Returns nullopt if ParseOptions.store_node_offsets is not set. |
75 | 0 | std::optional<LineCol> LineColInHtmlSrc() const { |
76 | 0 | return line_col_in_html_src_; |
77 | 0 | } |
78 | 0 | int NumTerms() const { |
79 | 0 | return num_terms_; |
80 | 0 | } |
81 | | |
82 | 22.2M | const std::vector<Attribute>& Attributes() const { return attributes_; } |
83 | 25.1M | Node* Parent() const { return parent_; } |
84 | 430 | Node* FirstChild() const { return first_child_; } |
85 | 25.2M | Node* LastChild() const { return last_child_; } |
86 | 24.7M | Node* PrevSibling() const { return prev_sibling_; } |
87 | 24.7M | Node* NextSibling() const { return next_sibling_; } |
88 | | |
89 | | // Section 12.2.4.2 of the HTML5 specification says "The following elements |
90 | | // have varying levels of special parsing rules". |
91 | | // https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements |
92 | | // For the list of such elements see elements.h:kIsSpecialElement. |
93 | | bool IsSpecialElement() const; |
94 | | |
95 | | // InsertBefore inserts new_child as a child of this node, immediately before |
96 | | // old_child in the sequence of this node's children. |
97 | | // old_child may be null, in which case new_child is appended to the end of |
98 | | // this node's children. |
99 | | // |
100 | | // Returns false if new_child already has a parent or siblings. |
101 | | bool InsertBefore(Node* new_child, Node* old_child); |
102 | | |
103 | | // AppendChild adds new_child as a child of this node. |
104 | | // |
105 | | // Returns false if new_child already has a parent or siblings. |
106 | | bool AppendChild(Node* new_child); |
107 | | |
108 | | // RemoveChild removes child_node if it is a child of this node. |
109 | | // Afterwards, child_node will have no parent and no siblings. |
110 | | Node* RemoveChild(Node* child_node); |
111 | | |
112 | | // Reparents all the child nodes of this node to the destination node. |
113 | | void ReparentChildrenTo(Node* destination); |
114 | | |
115 | | // Returns true if node element is html block element. |
116 | | // This doesn't take into account CSS style which can override this behavior, |
117 | | // example <div style="display:inline"> |
118 | | bool IsBlockElementNode(); |
119 | | |
120 | | // Similar to javascript's innerText. (Strips HTML). |
121 | | // Except: All elements are treated as inline elements. No new lines are |
122 | | // inserted for block elements. <div>hello</div><div>world</div> returns |
123 | | // 'hello world' not hello\nworld. |
124 | | std::string InnerText() const; |
125 | | |
126 | | // True, if this node is manufactured by parser as per HTML5 specification. |
127 | | // Currently, this applies only to HTML, HEAD and BODY tags. |
128 | 0 | bool IsManufactured() const { return is_manufactured_; } |
129 | | |
130 | | // Debug/Logging utils. |
131 | | // Outputs node debug info. |
132 | | std::string DebugString(); |
133 | | |
134 | | private: |
135 | 1.43M | void SetManufactured(bool is_manufactured) { |
136 | 1.43M | is_manufactured_ = is_manufactured; |
137 | 1.43M | } |
138 | | |
139 | | NodeType node_type_; |
140 | | Atom atom_; |
141 | | std::string data_; |
142 | | std::string name_space_; |
143 | | // Position at which this node appears in HTML source. |
144 | | std::optional<LineCol> line_col_in_html_src_; |
145 | | // Records the number of terms for text contents. |
146 | | // Populated and meaningful only if node is of type TEXT_NODE. |
147 | | int num_terms_ = -1; |
148 | | std::vector<Attribute> attributes_{}; |
149 | | Node* first_child_ = nullptr; |
150 | | Node* next_sibling_ = nullptr; |
151 | | |
152 | | // Not owned. |
153 | | Node* parent_ = nullptr; |
154 | | Node* last_child_ = nullptr; |
155 | | Node* prev_sibling_ = nullptr; |
156 | | bool is_manufactured_{false}; |
157 | | |
158 | | #ifdef HTMLPARSER_NODE_DEBUG |
159 | | int64_t recursive_counter_ = 0; |
160 | | #endif |
161 | | |
162 | | friend class Document; |
163 | | friend class NodeStack; |
164 | | friend class Parser; |
165 | | }; |
166 | | |
167 | | class NodeStack { |
168 | | public: |
169 | | // Pops the stack. |
170 | | Node* Pop(); |
171 | | // Pops n (count) elements off the stack. |
172 | | // If count is greater than the number of elements in the stack, the entire |
173 | | // stack is cleared. |
174 | | void Pop(int count); |
175 | | |
176 | | // Returns the most recently pushed node, or nullptr if stack is empty. |
177 | | Node* Top(); |
178 | | |
179 | | // Allows iterator like access to elements in stack_. |
180 | | // Since this is a stack, it returns a reverse iterator. |
181 | 11.4k | std::deque<Node*>::const_reverse_iterator begin() { return stack_.rbegin(); } |
182 | | |
183 | 0 | std::deque<Node*>::const_reverse_iterator begin() const { |
184 | 0 | return stack_.rbegin(); |
185 | 0 | } |
186 | | |
187 | 11.4k | std::deque<Node*>::const_reverse_iterator end() { return stack_.rend(); } |
188 | | |
189 | 0 | std::deque<Node*>::const_reverse_iterator end() const { |
190 | 0 | return stack_.rend(); |
191 | 0 | } |
192 | | |
193 | | // Returns the index of the top-most occurrence of a node in the stack, or -1 |
194 | | // if node is not present. |
195 | | int Index(Node* node); |
196 | | |
197 | | // Whether stack contains any node representing atom. |
198 | | bool Contains(Atom atom); |
199 | | |
200 | | // Inserts a node at the given index. |
201 | | void Insert(int index, Node* node); |
202 | | |
203 | | // Replaces (old) node at the given index, with the given (new) node. |
204 | | // The index begins at the end of the deque (since it is a stack). |
205 | | void Replace(int index, Node* node); |
206 | | |
207 | | void Push(Node* node); |
208 | | |
209 | | // Removes a node from the stack. It is a no-op if node is not present. |
210 | | void Remove(Node* node); |
211 | | |
212 | 64.6M | int size() const { return stack_.size(); } |
213 | | |
214 | 6.76G | Node* at(int index) const { return stack_.at(index); } |
215 | | |
216 | | private: |
217 | | std::deque<Node*> stack_; |
218 | | }; |
219 | | |
220 | | // The following two functions can be used by clients if they want to |
221 | | // test consistency of the nodes built manually, or by the Parser. |
222 | | // However, consider these as private. Should be used only by tests and not |
223 | | // in production. |
224 | | // --------------------------------------------------------------------------- |
225 | | // |
226 | | // Checks that a node and its descendants are all consistent in their |
227 | | // parent/child/sibling relationships. |
228 | | std::optional<Error> CheckTreeConsistency(Node* node); |
229 | | |
230 | | // Checks that a node's parent/child/sibling relationships are consistent. |
231 | | std::optional<Error> CheckNodeConsistency(Node* node); |
232 | | |
233 | | } // namespace htmlparser |
234 | | |
235 | | #endif // CPP_HTMLPARSER_NODE_H_ |