Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/node.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef CPP_HTMLPARSER_NODE_H_
2
#define CPP_HTMLPARSER_NODE_H_
3
4
#include <deque>
5
#include <memory>
6
#include <optional>
7
#include <string>
8
#include <string_view>
9
#include <tuple>
10
#include <utility>
11
#include <vector>
12
13
#include "cpp/htmlparser/atom.h"
14
#include "cpp/htmlparser/error.h"
15
#include "cpp/htmlparser/token.h"
16
17
namespace htmlparser {
18
19
class Parser;
20
21
enum class NodeType {  // Kind of a Node; returned by Node::Type().
22
  ERROR_NODE,
23
  TEXT_NODE,  // Carries data (see Node::Data()).
24
  DOCUMENT_NODE,
25
  ELEMENT_NODE,  // May also have a namespace and attributes.
26
  COMMENT_NODE,  // Carries data (see Node::Data()).
27
  DOCTYPE_NODE,
28
  SCOPE_MARKER_NODE,
29
};
30
31
// A Node consists of a NodeType and data (for text and comment node).
32
// A node is a member of a tree of Nodes. Element nodes may also
33
// have a Namespace and contain a vector of Attributes. Data is unescaped, so
34
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
35
// is the atom for Data, or Atom::UNKNOWN if Data is not a known tag name.
36
//
37
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
38
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
39
// "svg" is short for "http://www.w3.org/2000/svg".
40
class Node {
41
 public:
42
  Node(NodeType node_type, Atom atom = Atom::UNKNOWN,
43
       std::string name_space = "");  // NOTE(review): not explicit, so a NodeType converts implicitly to Node.
44
24.3M
  ~Node() = default;
45
46
  // Movable: move construction and move assignment are defaulted.
47
  Node(Node&&) = default;
48
  Node& operator=(Node&&) = default;
49
50
  // Not copyable: copy construction and copy assignment are deleted.
51
  Node(const Node&) = delete;
52
  void operator=(const Node&) = delete;
53
54
  void SetData(std::string_view data);
55
  void AddAttribute(const Attribute& attr);
56
  // Sorts the attributes of this node (remove_duplicates is off by default).
57
  void SortAttributes(bool remove_duplicates = false);
58
  void DropDuplicateAttributes();
59
60
  // Updates child nodes line and column numbers relative to the given node.
61
  // This does not change order or parent/child relationship of this or child
62
  // nodes in the tree.
63
  // Generally, treat this as a private function. Part of public interface for
64
  // some specific scenarios:
65
  // A) Unit testing.
66
  // B) When parsing a fragment.
67
  // C) Custom error/warning reporting.
68
  void UpdateChildNodesPositions(Node* relative_node);
69
70
7.85M
  NodeType Type() const { return node_type_; }
71
0
  std::string_view Data() const { return data_; }  // Non-owning view of data_.
72
2.23M
  Atom DataAtom() const { return atom_; }
73
959k
  std::string_view NameSpace() const { return name_space_; }  // Non-owning view of name_space_.
74
  // Returns nullopt if ParseOptions.store_node_offsets is not set.
75
0
  std::optional<LineCol> LineColInHtmlSrc() const {
76
0
    return line_col_in_html_src_;
77
0
  }
78
0
  int NumTerms() const {  // -1 by default (see num_terms_).
79
0
    return num_terms_;
80
0
  }
81
82
22.2M
  const std::vector<Attribute>& Attributes() const { return attributes_; }
83
25.1M
  Node* Parent() const { return parent_; }
84
430
  Node* FirstChild() const { return first_child_; }
85
25.2M
  Node* LastChild() const { return last_child_; }
86
24.7M
  Node* PrevSibling() const { return prev_sibling_; }
87
24.7M
  Node* NextSibling() const { return next_sibling_; }
88
89
  // Section 12.2.4.2 of the HTML5 specification says "The following elements
90
  // have varying levels of special parsing rules".
91
  // https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
92
  // For the list of such elements see elements.h:kIsSpecialElement.
93
  bool IsSpecialElement() const;
94
95
  // InsertBefore inserts new_child as a child of this node, immediately before
96
  // old_child in the sequence of this node's children.
97
  // old_child may be null, in which case new_child is appended to the end of
98
  // this node's children.
99
  //
100
  // Returns false if new_child already has a parent or siblings.
101
  bool InsertBefore(Node* new_child, Node* old_child);
102
103
  // AppendChild adds new_child as a child of this node.
104
  //
105
  // Returns false if new_child already has a parent or siblings.
106
  bool AppendChild(Node* new_child);
107
108
  // RemoveChild removes child_node if it is a child of this node.
109
  // Afterwards, child_node will have no parent and no siblings.
110
  Node* RemoveChild(Node* child_node);
111
112
  // Reparents all the child nodes of this node to the destination node.
113
  void ReparentChildrenTo(Node* destination);
114
115
  // Returns true if this element node is an HTML block element.
116
  // This doesn't take into account CSS style which can override this behavior,
117
  // for example <div style="display:inline">.
118
  bool IsBlockElementNode();  // NOTE(review): not const, unlike IsSpecialElement().
119
120
  // Similar to javascript's innerText. (Strips HTML).
121
  // Except: All elements are treated as inline elements. No new lines are
122
  // inserted for block elements. <div>hello</div><div>world</div> returns
123
  // 'hello world' not hello\nworld.
124
  std::string InnerText() const;
125
126
  // True, if this node is manufactured by parser as per HTML5 specification.
127
  // Currently, this applies only to HTML, HEAD and BODY tags.
128
0
  bool IsManufactured() const { return is_manufactured_; }
129
130
  // Debug/Logging utils.
131
  // Outputs node debug info.
132
  std::string DebugString();
133
134
 private:
135
1.43M
  void SetManufactured(bool is_manufactured) {  // Private: callable only by the friend classes below.
136
1.43M
    is_manufactured_ = is_manufactured;
137
1.43M
  }
138
139
  NodeType node_type_;
140
  Atom atom_;
141
  std::string data_;
142
  std::string name_space_;
143
  // Position at which this node appears in HTML source.
144
  std::optional<LineCol> line_col_in_html_src_;
145
  // Records the number of terms for text contents.
146
  // Populated and meaningful only if node is of type TEXT_NODE.
147
  int num_terms_ = -1;
148
  std::vector<Attribute> attributes_{};
149
  Node* first_child_ = nullptr;  // NOTE(review): outside the "Not owned" group, yet ~Node() is defaulted and deletes nothing — confirm who owns child/sibling nodes.
150
  Node* next_sibling_ = nullptr;
151
152
  // Not owned.
153
  Node* parent_ = nullptr;
154
  Node* last_child_ = nullptr;
155
  Node* prev_sibling_ = nullptr;
156
  bool is_manufactured_{false};
157
158
#ifdef HTMLPARSER_NODE_DEBUG
159
  int64_t recursive_counter_ = 0;  // NOTE(review): int64_t needs <cstdint>, which this header does not include directly.
160
#endif
161
162
  friend class Document;
163
  friend class NodeStack;
164
  friend class Parser;
165
};
166
167
class NodeStack {  // Stack of Node* stored in a std::deque<Node*>.
168
 public:
169
  // Pops the stack.
170
  Node* Pop();
171
  // Pops n (count) elements off the stack.
172
  // If count is greater than the number of elements in the stack, entire stack
173
  // is cleared.
174
  void Pop(int count);
175
176
  // Returns the most recently pushed node, or nullptr if stack is empty.
177
  Node* Top();
178
179
  // Allows iterator-like access to the elements in stack_.
180
  // Since this is a stack, begin()/end() return reverse iterators over the deque.
181
11.4k
  std::deque<Node*>::const_reverse_iterator begin() { return stack_.rbegin(); }
182
183
0
  std::deque<Node*>::const_reverse_iterator begin() const {
184
0
    return stack_.rbegin();
185
0
  }
186
187
11.4k
  std::deque<Node*>::const_reverse_iterator end() { return stack_.rend(); }
188
189
0
  std::deque<Node*>::const_reverse_iterator end() const {
190
0
    return stack_.rend();
191
0
  }
192
193
  // Returns the index of the top-most occurrence of a node in the stack, or -1
194
  // if node is not present.
195
  int Index(Node* node);
196
197
  // Whether stack contains any node representing atom.
198
  bool Contains(Atom atom);
199
200
  // Inserts a node at the given index.
201
  void Insert(int index, Node* node);
202
203
  // Replaces (old) node at the given index, with the given (new) node.
204
  // The index begins at the end of the deque (since it is a stack).
205
  void Replace(int index, Node* node);
206
207
  void Push(Node* node);
208
209
  // Removes a node from the stack. It is a no-op if node is not present.
210
  void Remove(Node* node);
211
212
64.6M
  int size() const { return stack_.size(); }  // The deque's size_t count is narrowed to int.
213
214
6.76G
  Node* at(int index) const { return stack_.at(index); }  // Bounds-checked; index is a position in the underlying deque.
215
216
 private:
217
  std::deque<Node*> stack_;
218
};
219
220
// The following two functions can be used by clients if they want to
221
// test consistency of the nodes built manually, or by the Parser.
222
// However, consider these as private. Should be used only by tests and not
223
// in production.
224
// ---------------------------------------------------------------------------
225
//
226
// Checks that a node and its descendants are all consistent in their
227
// parent/child/sibling relationships.
228
std::optional<Error> CheckTreeConsistency(Node* node);
229
230
// Checks that a node's parent/child/sibling relationships are consistent.
231
std::optional<Error> CheckNodeConsistency(Node* node);
232
233
}  // namespace htmlparser
234
235
#endif  // CPP_HTMLPARSER_NODE_H_