/proc/self/cwd/cpp/htmlparser/node.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include "cpp/htmlparser/node.h" |
2 | | |
3 | | #include <algorithm> |
4 | | #include <functional> |
5 | | #include <sstream> |
6 | | |
7 | | #include "absl/strings/str_join.h" |
8 | | #include "absl/strings/string_view.h" |
9 | | #include "cpp/htmlparser/atomutil.h" |
10 | | #include "cpp/htmlparser/elements.h" |
11 | | #include "cpp/htmlparser/logging.h" |
12 | | |
13 | | namespace htmlparser { |
14 | | |
15 | | Node::Node(NodeType node_type, Atom atom, std::string name_space) : |
16 | 24.3M | node_type_(node_type), atom_(atom), name_space_(name_space) {} |
17 | | |
18 | 106 | void Node::SetData(std::string_view data) { |
19 | 106 | data_ = data; |
20 | 106 | } |
21 | | |
22 | 0 | void Node::AddAttribute(const Attribute& attr) { |
23 | 0 | attributes_.push_back(attr); |
24 | 0 | } |
25 | | |
26 | 0 | void Node::SortAttributes(bool remove_duplicates) { |
27 | 0 | std::stable_sort(attributes_.begin(), attributes_.end(), |
28 | 0 | [](const Attribute& left, const Attribute& right) -> bool { |
29 | 0 | return left.KeyPart() < right.KeyPart(); |
30 | 0 | }); |
31 | 0 | if (remove_duplicates) DropDuplicateAttributes(); |
32 | 0 | } |
33 | | |
34 | 0 | void Node::DropDuplicateAttributes() { |
35 | 0 | auto remove_attributes = [&](auto first, auto last) { |
36 | 0 | for (; first != last; ++first) { |
37 | 0 | last = std::remove_if(std::next(first), last, [first](const auto& attr) { |
38 | 0 | return first->KeyPart() == attr.KeyPart(); |
39 | 0 | }); |
40 | 0 | } |
41 | 0 | return last; |
42 | 0 | }; |
43 | 0 | attributes_.erase(remove_attributes(attributes_.begin(), attributes_.end()), |
44 | 0 | attributes_.end()); |
45 | 0 | } |
46 | | |
47 | 21.5M | bool Node::IsSpecialElement() const { |
48 | 21.5M | if (name_space_ == "" || name_space_ == "html") { |
49 | 21.4M | return std::find(kSpecialElements.begin(), |
50 | 21.4M | kSpecialElements.end(), |
51 | 21.4M | atom_) != kSpecialElements.end(); |
52 | 21.4M | } else if (name_space_ == "math") { |
53 | 15.0k | if (atom_ == Atom::MI || |
54 | 15.0k | atom_ == Atom::MO || |
55 | 15.0k | atom_ == Atom::MN || |
56 | 15.0k | atom_ == Atom::MS || |
57 | 15.0k | atom_ == Atom::MTEXT || |
58 | 15.0k | atom_ == Atom::ANNOTATION_XML) { |
59 | 14.7k | return true; |
60 | 14.7k | } |
61 | 23.1k | } else if (name_space_ == "svg") { |
62 | 23.1k | if (atom_ == Atom::FOREIGN_OBJECT || |
63 | 23.1k | atom_ == Atom::DESC || |
64 | 23.1k | atom_ == Atom::TITLE) { |
65 | 18 | return true; |
66 | 18 | } |
67 | 23.1k | } |
68 | 23.4k | return false; |
69 | 21.5M | } |
70 | | |
71 | 45.1k | bool Node::InsertBefore(Node* new_child, Node* old_child) { |
72 | | // Checks if it new_child is already attached. |
73 | 45.1k | if (new_child->Parent() || |
74 | 45.1k | new_child->PrevSibling() || |
75 | 45.1k | new_child->NextSibling()) { |
76 | 0 | return false; |
77 | 0 | } |
78 | | |
79 | 45.1k | Node* prev = nullptr; |
80 | 45.1k | Node* next = nullptr; |
81 | 45.1k | if (old_child) { |
82 | 45.1k | prev = old_child->PrevSibling(); |
83 | 45.1k | next = old_child; |
84 | 45.1k | } else { |
85 | 0 | prev = LastChild(); |
86 | 0 | } |
87 | 45.1k | if (prev) { |
88 | 15.5k | prev->next_sibling_ = new_child; |
89 | 29.5k | } else { |
90 | 29.5k | first_child_ = new_child; |
91 | 29.5k | } |
92 | | |
93 | 45.1k | if (next) { |
94 | 45.1k | next->prev_sibling_ = new_child; |
95 | 45.1k | } else { |
96 | 0 | last_child_ = new_child; |
97 | 0 | } |
98 | | |
99 | 45.1k | new_child->parent_ = this; |
100 | 45.1k | new_child->prev_sibling_ = prev; |
101 | 45.1k | new_child->next_sibling_ = next; |
102 | | |
103 | 45.1k | return true; |
104 | 45.1k | } |
105 | | |
106 | 24.6M | bool Node::AppendChild(Node* new_child) { |
107 | 24.6M | CHECK(!(new_child->Parent() || new_child->PrevSibling() || |
108 | 24.6M | new_child->NextSibling())) |
109 | 24.6M | << "html: AppendChild called for an attached child Node."; |
110 | | |
111 | 24.6M | Node* last = LastChild(); |
112 | 24.6M | if (last) { |
113 | 2.80M | last->next_sibling_ = new_child; |
114 | 21.8M | } else { |
115 | 21.8M | first_child_ = new_child; |
116 | 21.8M | } |
117 | 24.6M | last_child_ = new_child; |
118 | 24.6M | new_child->parent_ = this; |
119 | 24.6M | new_child->prev_sibling_ = last; |
120 | | |
121 | 24.6M | return true; |
122 | 24.6M | } |
123 | | |
124 | 376k | Node* Node::RemoveChild(Node* c) { |
125 | | // Remove child called for a non-child node. |
126 | 376k | CHECK(c->parent_ == this) << "html: RemoveChild called for a non-child Node"; |
127 | | |
128 | 376k | if (first_child_ == c) { |
129 | 322k | first_child_ = c->next_sibling_; |
130 | 322k | } |
131 | | |
132 | 376k | if (c->next_sibling_) { |
133 | 91.7k | c->NextSibling()->prev_sibling_ = c->prev_sibling_; |
134 | 91.7k | } |
135 | | |
136 | 376k | if (last_child_ == c) { |
137 | 285k | last_child_ = c->prev_sibling_; |
138 | 285k | } |
139 | | |
140 | 376k | if (c->prev_sibling_) { |
141 | 54.5k | c->prev_sibling_->next_sibling_ = c->next_sibling_; |
142 | 54.5k | } |
143 | | |
144 | 376k | c->parent_ = nullptr; |
145 | 376k | c->prev_sibling_ = nullptr; |
146 | 376k | c->next_sibling_ = nullptr; |
147 | 376k | return c; |
148 | 376k | } |
149 | | |
150 | 172k | void Node::ReparentChildrenTo(Node* destination) { |
151 | 376k | while (true) { |
152 | 376k | Node* child = first_child_; |
153 | 376k | if (!child) break; |
154 | 204k | destination->AppendChild(RemoveChild(child)); |
155 | 204k | } |
156 | 172k | } |
157 | | |
158 | 3.02M | Node* NodeStack::Pop() { |
159 | 3.02M | if (stack_.size() > 0) { |
160 | 3.02M | Node* node = stack_.back(); |
161 | 3.02M | stack_.pop_back(); |
162 | 3.02M | return node; |
163 | 3.02M | } |
164 | 0 | return nullptr; |
165 | 3.02M | } |
166 | | |
167 | 3.56M | void NodeStack::Pop(int count) { |
168 | 3.56M | if (stack_.empty()) return; |
169 | | |
170 | 3.56M | int sz = stack_.size(); |
171 | 3.56M | if (count >= sz) { |
172 | 0 | stack_.clear(); |
173 | 0 | } |
174 | 3.56M | stack_.erase(stack_.end() - count, stack_.end()); |
175 | 3.56M | } |
176 | | |
177 | 81.6M | Node* NodeStack::Top() { |
178 | 81.6M | if (stack_.size() > 0) return stack_.at(stack_.size() - 1); |
179 | 3.39M | return nullptr; |
180 | 81.6M | } |
181 | | |
182 | 17.3M | int NodeStack::Index(Node* node) { |
183 | 22.5G | for (int i = stack_.size() - 1; i >= 0; --i) { |
184 | 22.5G | Node* other = stack_[i]; |
185 | 22.5G | if (other == node) { |
186 | 8.94M | return i; |
187 | 8.94M | } |
188 | 22.5G | } |
189 | | |
190 | 8.42M | return -1; |
191 | 17.3M | } |
192 | | |
193 | 33.0k | bool NodeStack::Contains(Atom atom) { |
194 | 176M | for (Node* n : stack_) { |
195 | 176M | if (n->atom_ == atom && n->name_space_.empty()) return true; |
196 | 176M | } |
197 | 33.0k | return false; |
198 | 33.0k | } |
199 | | |
200 | 27.2M | void NodeStack::Push(Node* node) { |
201 | 27.2M | stack_.push_back(node); |
202 | 27.2M | } |
203 | | |
204 | 345k | void NodeStack::Insert(int index, Node* node) { |
205 | 345k | stack_.insert(stack_.begin() + index, node); |
206 | 345k | } |
207 | | |
208 | 7.32M | void NodeStack::Replace(int i, Node* node) { |
209 | 7.32M | if (i > stack_.size() - 1) return; |
210 | 7.32M | stack_[i] = node; |
211 | 7.32M | } |
212 | | |
213 | 1.98M | void NodeStack::Remove(Node* node) { |
214 | 9.47G | for (auto it = stack_.begin(); it != stack_.end(); ++it) { |
215 | 9.47G | if (*it == node) { |
216 | 1.64M | stack_.erase(it); |
217 | 1.64M | return; |
218 | 1.64M | } |
219 | 9.47G | } |
220 | 1.98M | } |
221 | | |
222 | | namespace { |
223 | | |
224 | 0 | std::optional<Error> CheckTreeConsistencyInternal(Node* node, int depth) { |
225 | 0 | if (depth == 0x1e4) { |
226 | 0 | return error("html: tree looks like it contains a cycle"); |
227 | 0 | } |
228 | | |
229 | 0 | auto err = CheckNodeConsistency(node); |
230 | 0 | if (err) { |
231 | 0 | return err; |
232 | 0 | } |
233 | | |
234 | 0 | for (Node* c = node->FirstChild(); c; c = c->NextSibling()) { |
235 | 0 | auto err = CheckTreeConsistencyInternal(c, depth+1); |
236 | 0 | if (err) { |
237 | 0 | return err; |
238 | 0 | } |
239 | 0 | } |
240 | | |
241 | 0 | return std::nullopt; |
242 | 0 | } |
243 | | |
244 | | } // namespace |
245 | | |
246 | 0 | std::optional<Error> CheckTreeConsistency(Node* node) { |
247 | 0 | return CheckTreeConsistencyInternal(node, 0); |
248 | 0 | } |
249 | | |
250 | 0 | std::optional<Error> CheckNodeConsistency(Node* node) { |
251 | 0 | if (!node) return std::nullopt; |
252 | | |
253 | 0 | int num_parents = 0; |
254 | |
|
255 | 0 | for (Node* parent = node->Parent(); parent; parent = parent->Parent()) { |
256 | 0 | num_parents++; |
257 | 0 | if (num_parents == 0x1e4) { |
258 | 0 | return error("html: parent list looks like an infinite loop"); |
259 | 0 | } |
260 | 0 | } |
261 | | |
262 | 0 | int num_forwards = 0; |
263 | 0 | for (Node* c = node->FirstChild(); c; c = c->NextSibling()) { |
264 | 0 | num_forwards++; |
265 | 0 | if (num_forwards == 0x1e6) { |
266 | 0 | return error("html: forward list of children looks like an infinite " |
267 | 0 | "loop"); |
268 | 0 | } |
269 | 0 | if (c->Parent() != node) { |
270 | 0 | return error("html: inconsistent child/parent relationship"); |
271 | 0 | } |
272 | 0 | } |
273 | | |
274 | 0 | int num_backwards = 0; |
275 | 0 | for (Node* c = node->LastChild(); c; c = c->PrevSibling()) { |
276 | 0 | num_backwards++; |
277 | 0 | if (num_backwards == 0x1e6) { |
278 | 0 | return error("html: backward list of children looks like an infinite " |
279 | 0 | "loop"); |
280 | 0 | } |
281 | 0 | if (c->Parent() != node) { |
282 | 0 | return error("html: inconsistent child/parent relationship"); |
283 | 0 | } |
284 | 0 | } |
285 | | |
286 | 0 | if (node->Parent()) { |
287 | 0 | if (node->Parent() == node) { |
288 | 0 | return error("html: inconsistent parent relationship"); |
289 | 0 | } |
290 | 0 | if (node->Parent() == node->FirstChild()) { |
291 | 0 | return error("html: inconsistent parent/first relationship"); |
292 | 0 | } |
293 | 0 | if (node->Parent() == node->LastChild()) { |
294 | 0 | return error("html: inconsistent parent/last relationship"); |
295 | 0 | } |
296 | 0 | if (node->Parent() == node->PrevSibling()) { |
297 | 0 | return error("html: inconsistent parent/prev relationship"); |
298 | 0 | } |
299 | 0 | if (node->Parent() == node->NextSibling()) { |
300 | 0 | return error("html: inconsistent parent/next relationship"); |
301 | 0 | } |
302 | | |
303 | 0 | bool parent_has_n_as_a_child = false; |
304 | 0 | for (Node* c = node->Parent()->FirstChild(); c; c = c->NextSibling()) { |
305 | 0 | if (c == node) { |
306 | 0 | parent_has_n_as_a_child = true; |
307 | 0 | break; |
308 | 0 | } |
309 | 0 | } |
310 | 0 | if (!parent_has_n_as_a_child) { |
311 | 0 | return error("html: inconsistent parent/child relationship"); |
312 | 0 | } |
313 | 0 | } |
314 | | |
315 | 0 | if (node->PrevSibling() && node->PrevSibling()->NextSibling() != node) { |
316 | 0 | return error("html: inconsistent prev/next relationship"); |
317 | 0 | } |
318 | 0 | if (node->NextSibling() && node->NextSibling()->PrevSibling() != node) { |
319 | 0 | return error("html: inconsistent next/prev relationship"); |
320 | 0 | } |
321 | 0 | if ((node->FirstChild() == nullptr) != (node->LastChild() == nullptr)) { |
322 | 0 | return error("html: inconsistent first/last relationship"); |
323 | 0 | } |
324 | 0 | if (node->FirstChild() && node->FirstChild() == node->LastChild()) { |
325 | | // We have a sole child. |
326 | 0 | if (node->FirstChild()->PrevSibling() || |
327 | 0 | node->FirstChild()->NextSibling()) { |
328 | 0 | return error("html: inconsistent sold child's sibling relationship"); |
329 | 0 | } |
330 | 0 | } |
331 | | |
332 | | // Sorted inserts and no duplicates. |
333 | 0 | std::vector<Node*> seen; |
334 | 0 | Node* last = nullptr; |
335 | 0 | for (Node* c = node->FirstChild(); c; c = c->NextSibling()) { |
336 | 0 | auto insert_position = std::lower_bound(seen.begin(), |
337 | 0 | seen.end(), |
338 | 0 | c); |
339 | 0 | if (insert_position != seen.end() && *insert_position == c) { |
340 | 0 | return error("html: inconsistent repeated child"); |
341 | 0 | } |
342 | | |
343 | 0 | seen.insert(insert_position, c); |
344 | 0 | last = c; |
345 | 0 | } |
346 | | |
347 | 0 | if (last != node->LastChild()) { |
348 | 0 | return error("html: inconsistent last relationship"); |
349 | 0 | } |
350 | | |
351 | 0 | Node* first = nullptr; |
352 | 0 | for (Node* c = node->LastChild(); c; c = c->PrevSibling()) { |
353 | 0 | auto iter = std::lower_bound(seen.begin(), |
354 | 0 | seen.end(), |
355 | 0 | c); |
356 | 0 | if (iter == seen.end() || *iter != c) { |
357 | 0 | return error("html: inconsistent missing child"); |
358 | 0 | } |
359 | 0 | seen.erase(iter); |
360 | 0 | first = c; |
361 | 0 | } |
362 | 0 | if (first != node->FirstChild()) { |
363 | 0 | return error("html: inconsistent first relationship"); |
364 | 0 | } |
365 | | |
366 | 0 | if (!seen.empty()) { |
367 | 0 | return error("html: inconsistent forwards/backwards child list"); |
368 | 0 | } |
369 | | |
370 | 0 | return std::nullopt; |
371 | 0 | } |
372 | | |
373 | 0 | bool Node::IsBlockElementNode() { |
374 | 0 | return std::find(kBlockElements.begin(), |
375 | 0 | kBlockElements.end(), |
376 | 0 | atom_) != kBlockElements.end(); |
377 | 0 | } |
378 | | |
379 | 0 | std::string Node::InnerText() const { |
380 | 0 | static std::function<void(const Node*, std::vector<absl::string_view>&)> |
381 | 0 | output = |
382 | 0 | [](const Node* node, std::vector<absl::string_view>& output_content) { |
383 | 0 | switch (node->Type()) { |
384 | 0 | case NodeType::TEXT_NODE: { |
385 | 0 | output_content.push_back(absl::string_view( |
386 | 0 | node->Data().data(), node->Data().size())); |
387 | 0 | return; |
388 | 0 | } |
389 | 0 | case NodeType::COMMENT_NODE: { |
390 | | // Ignore comments. |
391 | 0 | return; |
392 | 0 | } |
393 | 0 | default: |
394 | 0 | break; |
395 | 0 | } |
396 | | |
397 | 0 | for (Node* child = node->FirstChild(); child; |
398 | 0 | child = child->NextSibling()) { |
399 | 0 | output(child, output_content); |
400 | 0 | } |
401 | 0 | }; |
402 | |
|
403 | 0 | std::vector<absl::string_view> buffer; |
404 | 0 | output(this, buffer); |
405 | |
|
406 | 0 | return absl::StrJoin(buffer, " "); |
407 | 0 | } |
408 | | |
409 | 0 | void Node::UpdateChildNodesPositions(Node* relative_node) { |
410 | | // Cannot proceed if relative node has no positional information. |
411 | 0 | if (!relative_node->LineColInHtmlSrc().has_value()) return; |
412 | | |
413 | 0 | auto [r_line, r_col] = relative_node->LineColInHtmlSrc().value(); |
414 | | |
415 | | // Update the positions of this node. |
416 | 0 | if (line_col_in_html_src_.has_value()) { |
417 | 0 | auto [line, col] = line_col_in_html_src_.value(); |
418 | 0 | int effective_col = line == 1 ? |
419 | 0 | r_col + col + AtomUtil::ToString( |
420 | 0 | relative_node->DataAtom()).size() + 1 /* closing > */ : col; |
421 | 0 | line_col_in_html_src_ = LineCol({line + r_line - 1, effective_col}); |
422 | 0 | } |
423 | | |
424 | | // Update the positions of this node's children. |
425 | 0 | for (auto c = FirstChild(); c; c = c->NextSibling()) { |
426 | 0 | c->UpdateChildNodesPositions(relative_node); |
427 | 0 | } |
428 | 0 | } |
429 | | |
430 | 0 | std::string Node::DebugString() { |
431 | 0 | std::ostringstream ost; |
432 | 0 | switch (node_type_) { |
433 | 0 | case NodeType::ELEMENT_NODE: |
434 | 0 | ost << "<" << AtomUtil::ToString(atom_) << ">"; |
435 | 0 | break; |
436 | 0 | case NodeType::TEXT_NODE: |
437 | 0 | ost << "TEXT[" << data_.size() << "]"; |
438 | 0 | break; |
439 | 0 | case NodeType::COMMENT_NODE: |
440 | 0 | ost << "COMMENT[" << data_.size() << "]"; |
441 | 0 | break; |
442 | 0 | default: |
443 | | // Ignores doctype, error, document node types. |
444 | 0 | break; |
445 | 0 | } |
446 | | |
447 | 0 | if (line_col_in_html_src_.has_value()) { |
448 | 0 | ost << line_col_in_html_src_.value().first << ":" |
449 | 0 | << line_col_in_html_src_.value().second; |
450 | 0 | } |
451 | 0 | ost << "\n"; |
452 | |
|
453 | 0 | return ost.str(); |
454 | 0 | } |
455 | | |
456 | | } // namespace htmlparser |