Coverage Report

Created: 2025-09-08 06:20

/proc/self/cwd/cpp/htmlparser/node.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/node.h"
2
3
#include <algorithm>
4
#include <functional>
5
#include <sstream>
6
7
#include "absl/strings/str_join.h"
8
#include "absl/strings/string_view.h"
9
#include "cpp/htmlparser/atomutil.h"
10
#include "cpp/htmlparser/elements.h"
11
#include "cpp/htmlparser/logging.h"
12
13
namespace htmlparser {
14
15
Node::Node(NodeType node_type, Atom atom, std::string name_space) :
16
23.2M
    node_type_(node_type), atom_(atom), name_space_(name_space) {}
17
18
125
void Node::SetData(std::string_view data) {
19
125
  data_ = data;
20
125
}
21
22
0
void Node::AddAttribute(const Attribute& attr) {
23
0
  attributes_.push_back(attr);
24
0
}
25
26
0
void Node::SortAttributes(bool remove_duplicates) {
27
0
  std::stable_sort(attributes_.begin(), attributes_.end(),
28
0
            [](const Attribute& left, const Attribute& right) -> bool {
29
0
              return left.KeyPart() < right.KeyPart();
30
0
            });
31
0
  if (remove_duplicates) DropDuplicateAttributes();
32
0
}
33
34
0
void Node::DropDuplicateAttributes() {
35
0
  auto remove_attributes = [&](auto first, auto last) {
36
0
    for (; first != last; ++first) {
37
0
      last = std::remove_if(std::next(first), last, [first](const auto& attr) {
38
0
        return first->KeyPart() == attr.KeyPart();
39
0
      });
40
0
    }
41
0
    return last;
42
0
  };
43
0
  attributes_.erase(remove_attributes(attributes_.begin(), attributes_.end()),
44
0
                    attributes_.end());
45
0
}
46
47
25.8M
bool Node::IsSpecialElement() const {
48
25.8M
  if (name_space_ == "" || name_space_ == "html") {
49
25.8M
    return std::find(kSpecialElements.begin(),
50
25.8M
                     kSpecialElements.end(),
51
25.8M
                     atom_) != kSpecialElements.end();
52
25.8M
  } else if (name_space_ == "math") {
53
1.04k
    if (atom_ == Atom::MI ||
54
1.04k
        atom_ == Atom::MO ||
55
1.04k
        atom_ == Atom::MN ||
56
1.04k
        atom_ == Atom::MS ||
57
1.04k
        atom_ == Atom::MTEXT ||
58
1.04k
        atom_ == Atom::ANNOTATION_XML) {
59
807
      return true;
60
807
    }
61
16.4k
  } else if (name_space_ == "svg") {
62
16.4k
    if (atom_ == Atom::FOREIGN_OBJECT ||
63
16.4k
        atom_ == Atom::DESC ||
64
16.4k
        atom_ == Atom::TITLE) {
65
396
      return true;
66
396
    }
67
16.4k
  }
68
16.3k
  return false;
69
25.8M
}
70
71
39.5k
bool Node::InsertBefore(Node* new_child, Node* old_child) {
72
  // Checks if it new_child is already attached.
73
39.5k
  if (new_child->Parent() ||
74
39.5k
      new_child->PrevSibling() ||
75
39.5k
      new_child->NextSibling()) {
76
0
    return false;
77
0
  }
78
79
39.5k
  Node* prev = nullptr;
80
39.5k
  Node* next = nullptr;
81
39.5k
  if (old_child) {
82
39.5k
    prev = old_child->PrevSibling();
83
39.5k
    next = old_child;
84
39.5k
  } else {
85
0
    prev = LastChild();
86
0
  }
87
39.5k
  if (prev) {
88
15.5k
    prev->next_sibling_ = new_child;
89
23.9k
  } else {
90
23.9k
    first_child_ = new_child;
91
23.9k
  }
92
93
39.5k
  if (next) {
94
39.5k
    next->prev_sibling_ = new_child;
95
39.5k
  } else {
96
0
    last_child_ = new_child;
97
0
  }
98
99
39.5k
  new_child->parent_ = this;
100
39.5k
  new_child->prev_sibling_ = prev;
101
39.5k
  new_child->next_sibling_ = next;
102
103
39.5k
  return true;
104
39.5k
}
105
106
23.4M
bool Node::AppendChild(Node* new_child) {
107
23.4M
  CHECK(!(new_child->Parent() || new_child->PrevSibling() ||
108
23.4M
          new_child->NextSibling()))
109
23.4M
      << "html: AppendChild called for an attached child Node.";
110
111
23.4M
  Node* last = LastChild();
112
23.4M
  if (last) {
113
2.28M
    last->next_sibling_ = new_child;
114
21.1M
  } else {
115
21.1M
    first_child_ = new_child;
116
21.1M
  }
117
23.4M
  last_child_ = new_child;
118
23.4M
  new_child->parent_ = this;
119
23.4M
  new_child->prev_sibling_ = last;
120
121
23.4M
  return true;
122
23.4M
}
123
124
252k
Node* Node::RemoveChild(Node* c) {
125
  // Remove child called for a non-child node.
126
252k
  CHECK(c->parent_ == this) << "html: RemoveChild called for a non-child Node";
127
128
252k
  if (first_child_ == c) {
129
229k
    first_child_ = c->next_sibling_;
130
229k
  }
131
132
252k
  if (c->next_sibling_) {
133
54.3k
    c->NextSibling()->prev_sibling_ = c->prev_sibling_;
134
54.3k
  }
135
136
252k
  if (last_child_ == c) {
137
197k
    last_child_ = c->prev_sibling_;
138
197k
  }
139
140
252k
  if (c->prev_sibling_) {
141
22.3k
    c->prev_sibling_->next_sibling_ = c->next_sibling_;
142
22.3k
  }
143
144
252k
  c->parent_ = nullptr;
145
252k
  c->prev_sibling_ = nullptr;
146
252k
  c->next_sibling_ = nullptr;
147
252k
  return c;
148
252k
}
149
150
118k
void Node::ReparentChildrenTo(Node* destination) {
151
252k
  while (true) {
152
252k
    Node* child = first_child_;
153
252k
    if (!child) break;
154
133k
    destination->AppendChild(RemoveChild(child));
155
133k
  }
156
118k
}
157
158
2.31M
Node* NodeStack::Pop() {
159
2.31M
  if (stack_.size() > 0) {
160
2.31M
    Node* node = stack_.back();
161
2.31M
    stack_.pop_back();
162
2.31M
    return node;
163
2.31M
  }
164
0
  return nullptr;
165
2.31M
}
166
167
3.96M
void NodeStack::Pop(int count) {
168
3.96M
  if (stack_.empty()) return;
169
170
3.96M
  int sz = stack_.size();
171
3.96M
  if (count >= sz) {
172
0
    stack_.clear();
173
0
  }
174
3.96M
  stack_.erase(stack_.end() - count, stack_.end());
175
3.96M
}
176
177
79.8M
Node* NodeStack::Top() {
178
79.8M
  if (stack_.size() > 0) return stack_.at(stack_.size() - 1);
179
3.06M
  return nullptr;
180
79.8M
}
181
182
14.7M
int NodeStack::Index(Node* node) {
183
18.3G
  for (int i = stack_.size() - 1; i >= 0; --i) {
184
18.3G
    Node* other = stack_[i];
185
18.3G
    if (other == node) {
186
7.46M
      return i;
187
7.46M
    }
188
18.3G
  }
189
190
7.31M
  return -1;
191
14.7M
}
192
193
29.5k
bool NodeStack::Contains(Atom atom) {
194
128M
  for (Node* n : stack_) {
195
128M
    if (n->atom_ == atom && n->name_space_.empty()) return true;
196
128M
  }
197
29.5k
  return false;
198
29.5k
}
199
200
24.8M
void NodeStack::Push(Node* node) {
201
24.8M
  stack_.push_back(node);
202
24.8M
}
203
204
237k
void NodeStack::Insert(int index, Node* node) {
205
237k
  stack_.insert(stack_.begin() + index, node);
206
237k
}
207
208
5.95M
void NodeStack::Replace(int i, Node* node) {
209
5.95M
  if (i > stack_.size() - 1) return;
210
5.95M
  stack_[i] = node;
211
5.95M
}
212
213
1.98M
void NodeStack::Remove(Node* node) {
214
9.98G
  for (auto it = stack_.begin(); it != stack_.end(); ++it) {
215
9.98G
    if (*it == node) {
216
1.64M
      stack_.erase(it);
217
1.64M
      return;
218
1.64M
    }
219
9.98G
  }
220
1.98M
}
221
222
namespace {
223
224
0
std::optional<Error> CheckTreeConsistencyInternal(Node* node, int depth) {
225
0
  if (depth == 0x1e4) {
226
0
    return error("html: tree looks like it contains a cycle");
227
0
  }
228
229
0
  auto err = CheckNodeConsistency(node);
230
0
  if (err) {
231
0
    return err;
232
0
  }
233
234
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
235
0
    auto err = CheckTreeConsistencyInternal(c, depth+1);
236
0
    if (err) {
237
0
      return err;
238
0
    }
239
0
  }
240
241
0
  return std::nullopt;
242
0
}
243
244
}  // namespace
245
246
0
std::optional<Error> CheckTreeConsistency(Node* node) {
247
0
  return CheckTreeConsistencyInternal(node, 0);
248
0
}
249
250
0
std::optional<Error> CheckNodeConsistency(Node* node) {
251
0
  if (!node) return std::nullopt;
252
253
0
  int num_parents = 0;
254
255
0
  for (Node* parent = node->Parent(); parent; parent = parent->Parent()) {
256
0
    num_parents++;
257
0
    if (num_parents == 0x1e4) {
258
0
      return error("html: parent list looks like an infinite loop");
259
0
    }
260
0
  }
261
262
0
  int num_forwards = 0;
263
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
264
0
    num_forwards++;
265
0
    if (num_forwards == 0x1e6) {
266
0
      return error("html: forward list of children looks like an infinite "
267
0
                   "loop");
268
0
    }
269
0
    if (c->Parent() != node) {
270
0
      return error("html: inconsistent child/parent relationship");
271
0
    }
272
0
  }
273
274
0
  int num_backwards = 0;
275
0
  for (Node* c = node->LastChild(); c; c = c->PrevSibling()) {
276
0
    num_backwards++;
277
0
    if (num_backwards == 0x1e6) {
278
0
      return error("html: backward list of children looks like an infinite "
279
0
                   "loop");
280
0
    }
281
0
    if (c->Parent() != node) {
282
0
      return error("html: inconsistent child/parent relationship");
283
0
    }
284
0
  }
285
286
0
  if (node->Parent()) {
287
0
    if (node->Parent() == node) {
288
0
      return error("html: inconsistent parent relationship");
289
0
    }
290
0
    if (node->Parent() == node->FirstChild()) {
291
0
      return error("html: inconsistent parent/first relationship");
292
0
    }
293
0
    if (node->Parent() == node->LastChild()) {
294
0
      return error("html: inconsistent parent/last relationship");
295
0
    }
296
0
    if (node->Parent() == node->PrevSibling()) {
297
0
      return error("html: inconsistent parent/prev relationship");
298
0
    }
299
0
    if (node->Parent() == node->NextSibling()) {
300
0
      return error("html: inconsistent parent/next relationship");
301
0
    }
302
303
0
    bool parent_has_n_as_a_child = false;
304
0
    for (Node* c = node->Parent()->FirstChild(); c; c = c->NextSibling()) {
305
0
      if (c == node) {
306
0
        parent_has_n_as_a_child = true;
307
0
        break;
308
0
      }
309
0
    }
310
0
    if (!parent_has_n_as_a_child) {
311
0
      return error("html: inconsistent parent/child relationship");
312
0
    }
313
0
  }
314
315
0
  if (node->PrevSibling() && node->PrevSibling()->NextSibling() != node) {
316
0
    return error("html: inconsistent prev/next relationship");
317
0
  }
318
0
  if (node->NextSibling() && node->NextSibling()->PrevSibling() != node) {
319
0
    return error("html: inconsistent next/prev relationship");
320
0
  }
321
0
  if ((node->FirstChild() == nullptr) != (node->LastChild() == nullptr)) {
322
0
    return error("html: inconsistent first/last relationship");
323
0
  }
324
0
  if (node->FirstChild() && node->FirstChild() == node->LastChild()) {
325
    // We have a sole child.
326
0
    if (node->FirstChild()->PrevSibling() ||
327
0
        node->FirstChild()->NextSibling()) {
328
0
      return error("html: inconsistent sold child's sibling relationship");
329
0
    }
330
0
  }
331
332
  // Sorted inserts and no duplicates.
333
0
  std::vector<Node*> seen;
334
0
  Node* last = nullptr;
335
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
336
0
    auto insert_position = std::lower_bound(seen.begin(),
337
0
                                            seen.end(),
338
0
                                            c);
339
0
    if (insert_position != seen.end() && *insert_position == c) {
340
0
      return error("html: inconsistent repeated child");
341
0
    }
342
343
0
    seen.insert(insert_position, c);
344
0
    last = c;
345
0
  }
346
347
0
  if (last != node->LastChild()) {
348
0
    return error("html: inconsistent last relationship");
349
0
  }
350
351
0
  Node* first = nullptr;
352
0
  for (Node* c = node->LastChild(); c; c = c->PrevSibling()) {
353
0
    auto iter = std::lower_bound(seen.begin(),
354
0
                                 seen.end(),
355
0
                                 c);
356
0
    if (iter == seen.end() || *iter != c) {
357
0
      return error("html: inconsistent missing child");
358
0
    }
359
0
    seen.erase(iter);
360
0
    first = c;
361
0
  }
362
0
  if (first != node->FirstChild()) {
363
0
    return error("html: inconsistent first relationship");
364
0
  }
365
366
0
  if (!seen.empty()) {
367
0
    return error("html: inconsistent forwards/backwards child list");
368
0
  }
369
370
0
  return std::nullopt;
371
0
}
372
373
0
bool Node::IsBlockElementNode() {
374
0
  return std::find(kBlockElements.begin(),
375
0
                   kBlockElements.end(),
376
0
                   atom_) != kBlockElements.end();
377
0
}
378
379
0
std::string Node::InnerText() const {
380
0
  static std::function<void(const Node*, std::vector<absl::string_view>&)>
381
0
      output =
382
0
          [](const Node* node, std::vector<absl::string_view>& output_content) {
383
0
            switch (node->Type()) {
384
0
              case NodeType::TEXT_NODE: {
385
0
                output_content.push_back(absl::string_view(
386
0
                    node->Data().data(), node->Data().size()));
387
0
                return;
388
0
              }
389
0
              case NodeType::COMMENT_NODE: {
390
                // Ignore comments.
391
0
                return;
392
0
              }
393
0
              default:
394
0
                break;
395
0
            }
396
397
0
            for (Node* child = node->FirstChild(); child;
398
0
                 child = child->NextSibling()) {
399
0
              output(child, output_content);
400
0
            }
401
0
          };
402
403
0
  std::vector<absl::string_view> buffer;
404
0
  output(this, buffer);
405
406
0
  return absl::StrJoin(buffer, " ");
407
0
}
408
409
0
void Node::UpdateChildNodesPositions(Node* relative_node) {
410
  // Cannot proceed if relative node has no positional information.
411
0
  if (!relative_node->LineColInHtmlSrc().has_value()) return;
412
413
0
  auto [r_line, r_col] = relative_node->LineColInHtmlSrc().value();
414
415
  // Update the positions of this node.
416
0
  if (line_col_in_html_src_.has_value()) {
417
0
    auto [line, col] = line_col_in_html_src_.value();
418
0
    int effective_col = line == 1 ?
419
0
        r_col + col + AtomUtil::ToString(
420
0
            relative_node->DataAtom()).size() + 1 /* closing > */ : col;
421
0
    line_col_in_html_src_ = LineCol({line + r_line - 1, effective_col});
422
0
  }
423
424
  // Update the positions of this node's children.
425
0
  for (auto c = FirstChild(); c; c = c->NextSibling()) {
426
0
    c->UpdateChildNodesPositions(relative_node);
427
0
  }
428
0
}
429
430
0
std::string Node::DebugString() {
431
0
  std::ostringstream ost;
432
0
  switch (node_type_) {
433
0
    case NodeType::ELEMENT_NODE:
434
0
      ost << "<" << AtomUtil::ToString(atom_) << ">";
435
0
      break;
436
0
    case NodeType::TEXT_NODE:
437
0
      ost << "TEXT[" << data_.size() << "]";
438
0
      break;
439
0
    case NodeType::COMMENT_NODE:
440
0
      ost << "COMMENT[" << data_.size() << "]";
441
0
      break;
442
0
    default:
443
      // Ignores doctype, error, document node types.
444
0
      break;
445
0
  }
446
447
0
  if (line_col_in_html_src_.has_value()) {
448
0
    ost << line_col_in_html_src_.value().first << ":"
449
0
        << line_col_in_html_src_.value().second;
450
0
  }
451
0
  ost << "\n";
452
453
0
  return ost.str();
454
0
}
455
456
}  // namespace htmlparser