Coverage Report

Created: 2025-07-23 06:45

/proc/self/cwd/cpp/htmlparser/node.cc
Line
Count
Source (jump to first uncovered line)
1
#include "cpp/htmlparser/node.h"
2
3
#include <algorithm>
4
#include <functional>
5
#include <sstream>
6
7
#include "absl/strings/str_join.h"
8
#include "absl/strings/string_view.h"
9
#include "cpp/htmlparser/atomutil.h"
10
#include "cpp/htmlparser/elements.h"
11
#include "cpp/htmlparser/logging.h"
12
13
namespace htmlparser {
14
15
Node::Node(NodeType node_type, Atom atom, std::string name_space) :
16
24.3M
    node_type_(node_type), atom_(atom), name_space_(name_space) {}
17
18
106
void Node::SetData(std::string_view data) {
19
106
  data_ = data;
20
106
}
21
22
0
void Node::AddAttribute(const Attribute& attr) {
23
0
  attributes_.push_back(attr);
24
0
}
25
26
0
void Node::SortAttributes(bool remove_duplicates) {
27
0
  std::stable_sort(attributes_.begin(), attributes_.end(),
28
0
            [](const Attribute& left, const Attribute& right) -> bool {
29
0
              return left.KeyPart() < right.KeyPart();
30
0
            });
31
0
  if (remove_duplicates) DropDuplicateAttributes();
32
0
}
33
34
0
void Node::DropDuplicateAttributes() {
35
0
  auto remove_attributes = [&](auto first, auto last) {
36
0
    for (; first != last; ++first) {
37
0
      last = std::remove_if(std::next(first), last, [first](const auto& attr) {
38
0
        return first->KeyPart() == attr.KeyPart();
39
0
      });
40
0
    }
41
0
    return last;
42
0
  };
43
0
  attributes_.erase(remove_attributes(attributes_.begin(), attributes_.end()),
44
0
                    attributes_.end());
45
0
}
46
47
21.5M
bool Node::IsSpecialElement() const {
48
21.5M
  if (name_space_ == "" || name_space_ == "html") {
49
21.4M
    return std::find(kSpecialElements.begin(),
50
21.4M
                     kSpecialElements.end(),
51
21.4M
                     atom_) != kSpecialElements.end();
52
21.4M
  } else if (name_space_ == "math") {
53
15.0k
    if (atom_ == Atom::MI ||
54
15.0k
        atom_ == Atom::MO ||
55
15.0k
        atom_ == Atom::MN ||
56
15.0k
        atom_ == Atom::MS ||
57
15.0k
        atom_ == Atom::MTEXT ||
58
15.0k
        atom_ == Atom::ANNOTATION_XML) {
59
14.7k
      return true;
60
14.7k
    }
61
23.1k
  } else if (name_space_ == "svg") {
62
23.1k
    if (atom_ == Atom::FOREIGN_OBJECT ||
63
23.1k
        atom_ == Atom::DESC ||
64
23.1k
        atom_ == Atom::TITLE) {
65
18
      return true;
66
18
    }
67
23.1k
  }
68
23.4k
  return false;
69
21.5M
}
70
71
45.1k
bool Node::InsertBefore(Node* new_child, Node* old_child) {
72
  // Checks if it new_child is already attached.
73
45.1k
  if (new_child->Parent() ||
74
45.1k
      new_child->PrevSibling() ||
75
45.1k
      new_child->NextSibling()) {
76
0
    return false;
77
0
  }
78
79
45.1k
  Node* prev = nullptr;
80
45.1k
  Node* next = nullptr;
81
45.1k
  if (old_child) {
82
45.1k
    prev = old_child->PrevSibling();
83
45.1k
    next = old_child;
84
45.1k
  } else {
85
0
    prev = LastChild();
86
0
  }
87
45.1k
  if (prev) {
88
15.5k
    prev->next_sibling_ = new_child;
89
29.5k
  } else {
90
29.5k
    first_child_ = new_child;
91
29.5k
  }
92
93
45.1k
  if (next) {
94
45.1k
    next->prev_sibling_ = new_child;
95
45.1k
  } else {
96
0
    last_child_ = new_child;
97
0
  }
98
99
45.1k
  new_child->parent_ = this;
100
45.1k
  new_child->prev_sibling_ = prev;
101
45.1k
  new_child->next_sibling_ = next;
102
103
45.1k
  return true;
104
45.1k
}
105
106
24.6M
bool Node::AppendChild(Node* new_child) {
107
24.6M
  CHECK(!(new_child->Parent() || new_child->PrevSibling() ||
108
24.6M
          new_child->NextSibling()))
109
24.6M
      << "html: AppendChild called for an attached child Node.";
110
111
24.6M
  Node* last = LastChild();
112
24.6M
  if (last) {
113
2.80M
    last->next_sibling_ = new_child;
114
21.8M
  } else {
115
21.8M
    first_child_ = new_child;
116
21.8M
  }
117
24.6M
  last_child_ = new_child;
118
24.6M
  new_child->parent_ = this;
119
24.6M
  new_child->prev_sibling_ = last;
120
121
24.6M
  return true;
122
24.6M
}
123
124
376k
Node* Node::RemoveChild(Node* c) {
125
  // Remove child called for a non-child node.
126
376k
  CHECK(c->parent_ == this) << "html: RemoveChild called for a non-child Node";
127
128
376k
  if (first_child_ == c) {
129
322k
    first_child_ = c->next_sibling_;
130
322k
  }
131
132
376k
  if (c->next_sibling_) {
133
91.7k
    c->NextSibling()->prev_sibling_ = c->prev_sibling_;
134
91.7k
  }
135
136
376k
  if (last_child_ == c) {
137
285k
    last_child_ = c->prev_sibling_;
138
285k
  }
139
140
376k
  if (c->prev_sibling_) {
141
54.5k
    c->prev_sibling_->next_sibling_ = c->next_sibling_;
142
54.5k
  }
143
144
376k
  c->parent_ = nullptr;
145
376k
  c->prev_sibling_ = nullptr;
146
376k
  c->next_sibling_ = nullptr;
147
376k
  return c;
148
376k
}
149
150
172k
void Node::ReparentChildrenTo(Node* destination) {
151
376k
  while (true) {
152
376k
    Node* child = first_child_;
153
376k
    if (!child) break;
154
204k
    destination->AppendChild(RemoveChild(child));
155
204k
  }
156
172k
}
157
158
3.02M
Node* NodeStack::Pop() {
159
3.02M
  if (stack_.size() > 0) {
160
3.02M
    Node* node = stack_.back();
161
3.02M
    stack_.pop_back();
162
3.02M
    return node;
163
3.02M
  }
164
0
  return nullptr;
165
3.02M
}
166
167
3.56M
void NodeStack::Pop(int count) {
168
3.56M
  if (stack_.empty()) return;
169
170
3.56M
  int sz = stack_.size();
171
3.56M
  if (count >= sz) {
172
0
    stack_.clear();
173
0
  }
174
3.56M
  stack_.erase(stack_.end() - count, stack_.end());
175
3.56M
}
176
177
81.6M
Node* NodeStack::Top() {
178
81.6M
  if (stack_.size() > 0) return stack_.at(stack_.size() - 1);
179
3.39M
  return nullptr;
180
81.6M
}
181
182
17.3M
int NodeStack::Index(Node* node) {
183
22.5G
  for (int i = stack_.size() - 1; i >= 0; --i) {
184
22.5G
    Node* other = stack_[i];
185
22.5G
    if (other == node) {
186
8.94M
      return i;
187
8.94M
    }
188
22.5G
  }
189
190
8.42M
  return -1;
191
17.3M
}
192
193
33.0k
bool NodeStack::Contains(Atom atom) {
194
176M
  for (Node* n : stack_) {
195
176M
    if (n->atom_ == atom && n->name_space_.empty()) return true;
196
176M
  }
197
33.0k
  return false;
198
33.0k
}
199
200
27.2M
void NodeStack::Push(Node* node) {
201
27.2M
  stack_.push_back(node);
202
27.2M
}
203
204
345k
void NodeStack::Insert(int index, Node* node) {
205
345k
  stack_.insert(stack_.begin() + index, node);
206
345k
}
207
208
7.32M
void NodeStack::Replace(int i, Node* node) {
209
7.32M
  if (i > stack_.size() - 1) return;
210
7.32M
  stack_[i] = node;
211
7.32M
}
212
213
1.98M
void NodeStack::Remove(Node* node) {
214
9.47G
  for (auto it = stack_.begin(); it != stack_.end(); ++it) {
215
9.47G
    if (*it == node) {
216
1.64M
      stack_.erase(it);
217
1.64M
      return;
218
1.64M
    }
219
9.47G
  }
220
1.98M
}
221
222
namespace {
223
224
0
std::optional<Error> CheckTreeConsistencyInternal(Node* node, int depth) {
225
0
  if (depth == 0x1e4) {
226
0
    return error("html: tree looks like it contains a cycle");
227
0
  }
228
229
0
  auto err = CheckNodeConsistency(node);
230
0
  if (err) {
231
0
    return err;
232
0
  }
233
234
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
235
0
    auto err = CheckTreeConsistencyInternal(c, depth+1);
236
0
    if (err) {
237
0
      return err;
238
0
    }
239
0
  }
240
241
0
  return std::nullopt;
242
0
}
243
244
}  // namespace
245
246
0
std::optional<Error> CheckTreeConsistency(Node* node) {
247
0
  return CheckTreeConsistencyInternal(node, 0);
248
0
}
249
250
0
std::optional<Error> CheckNodeConsistency(Node* node) {
251
0
  if (!node) return std::nullopt;
252
253
0
  int num_parents = 0;
254
255
0
  for (Node* parent = node->Parent(); parent; parent = parent->Parent()) {
256
0
    num_parents++;
257
0
    if (num_parents == 0x1e4) {
258
0
      return error("html: parent list looks like an infinite loop");
259
0
    }
260
0
  }
261
262
0
  int num_forwards = 0;
263
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
264
0
    num_forwards++;
265
0
    if (num_forwards == 0x1e6) {
266
0
      return error("html: forward list of children looks like an infinite "
267
0
                   "loop");
268
0
    }
269
0
    if (c->Parent() != node) {
270
0
      return error("html: inconsistent child/parent relationship");
271
0
    }
272
0
  }
273
274
0
  int num_backwards = 0;
275
0
  for (Node* c = node->LastChild(); c; c = c->PrevSibling()) {
276
0
    num_backwards++;
277
0
    if (num_backwards == 0x1e6) {
278
0
      return error("html: backward list of children looks like an infinite "
279
0
                   "loop");
280
0
    }
281
0
    if (c->Parent() != node) {
282
0
      return error("html: inconsistent child/parent relationship");
283
0
    }
284
0
  }
285
286
0
  if (node->Parent()) {
287
0
    if (node->Parent() == node) {
288
0
      return error("html: inconsistent parent relationship");
289
0
    }
290
0
    if (node->Parent() == node->FirstChild()) {
291
0
      return error("html: inconsistent parent/first relationship");
292
0
    }
293
0
    if (node->Parent() == node->LastChild()) {
294
0
      return error("html: inconsistent parent/last relationship");
295
0
    }
296
0
    if (node->Parent() == node->PrevSibling()) {
297
0
      return error("html: inconsistent parent/prev relationship");
298
0
    }
299
0
    if (node->Parent() == node->NextSibling()) {
300
0
      return error("html: inconsistent parent/next relationship");
301
0
    }
302
303
0
    bool parent_has_n_as_a_child = false;
304
0
    for (Node* c = node->Parent()->FirstChild(); c; c = c->NextSibling()) {
305
0
      if (c == node) {
306
0
        parent_has_n_as_a_child = true;
307
0
        break;
308
0
      }
309
0
    }
310
0
    if (!parent_has_n_as_a_child) {
311
0
      return error("html: inconsistent parent/child relationship");
312
0
    }
313
0
  }
314
315
0
  if (node->PrevSibling() && node->PrevSibling()->NextSibling() != node) {
316
0
    return error("html: inconsistent prev/next relationship");
317
0
  }
318
0
  if (node->NextSibling() && node->NextSibling()->PrevSibling() != node) {
319
0
    return error("html: inconsistent next/prev relationship");
320
0
  }
321
0
  if ((node->FirstChild() == nullptr) != (node->LastChild() == nullptr)) {
322
0
    return error("html: inconsistent first/last relationship");
323
0
  }
324
0
  if (node->FirstChild() && node->FirstChild() == node->LastChild()) {
325
    // We have a sole child.
326
0
    if (node->FirstChild()->PrevSibling() ||
327
0
        node->FirstChild()->NextSibling()) {
328
0
      return error("html: inconsistent sold child's sibling relationship");
329
0
    }
330
0
  }
331
332
  // Sorted inserts and no duplicates.
333
0
  std::vector<Node*> seen;
334
0
  Node* last = nullptr;
335
0
  for (Node* c = node->FirstChild(); c; c = c->NextSibling()) {
336
0
    auto insert_position = std::lower_bound(seen.begin(),
337
0
                                            seen.end(),
338
0
                                            c);
339
0
    if (insert_position != seen.end() && *insert_position == c) {
340
0
      return error("html: inconsistent repeated child");
341
0
    }
342
343
0
    seen.insert(insert_position, c);
344
0
    last = c;
345
0
  }
346
347
0
  if (last != node->LastChild()) {
348
0
    return error("html: inconsistent last relationship");
349
0
  }
350
351
0
  Node* first = nullptr;
352
0
  for (Node* c = node->LastChild(); c; c = c->PrevSibling()) {
353
0
    auto iter = std::lower_bound(seen.begin(),
354
0
                                 seen.end(),
355
0
                                 c);
356
0
    if (iter == seen.end() || *iter != c) {
357
0
      return error("html: inconsistent missing child");
358
0
    }
359
0
    seen.erase(iter);
360
0
    first = c;
361
0
  }
362
0
  if (first != node->FirstChild()) {
363
0
    return error("html: inconsistent first relationship");
364
0
  }
365
366
0
  if (!seen.empty()) {
367
0
    return error("html: inconsistent forwards/backwards child list");
368
0
  }
369
370
0
  return std::nullopt;
371
0
}
372
373
0
bool Node::IsBlockElementNode() {
374
0
  return std::find(kBlockElements.begin(),
375
0
                   kBlockElements.end(),
376
0
                   atom_) != kBlockElements.end();
377
0
}
378
379
0
std::string Node::InnerText() const {
380
0
  static std::function<void(const Node*, std::vector<absl::string_view>&)>
381
0
      output =
382
0
          [](const Node* node, std::vector<absl::string_view>& output_content) {
383
0
            switch (node->Type()) {
384
0
              case NodeType::TEXT_NODE: {
385
0
                output_content.push_back(absl::string_view(
386
0
                    node->Data().data(), node->Data().size()));
387
0
                return;
388
0
              }
389
0
              case NodeType::COMMENT_NODE: {
390
                // Ignore comments.
391
0
                return;
392
0
              }
393
0
              default:
394
0
                break;
395
0
            }
396
397
0
            for (Node* child = node->FirstChild(); child;
398
0
                 child = child->NextSibling()) {
399
0
              output(child, output_content);
400
0
            }
401
0
          };
402
403
0
  std::vector<absl::string_view> buffer;
404
0
  output(this, buffer);
405
406
0
  return absl::StrJoin(buffer, " ");
407
0
}
408
409
0
void Node::UpdateChildNodesPositions(Node* relative_node) {
410
  // Cannot proceed if relative node has no positional information.
411
0
  if (!relative_node->LineColInHtmlSrc().has_value()) return;
412
413
0
  auto [r_line, r_col] = relative_node->LineColInHtmlSrc().value();
414
415
  // Update the positions of this node.
416
0
  if (line_col_in_html_src_.has_value()) {
417
0
    auto [line, col] = line_col_in_html_src_.value();
418
0
    int effective_col = line == 1 ?
419
0
        r_col + col + AtomUtil::ToString(
420
0
            relative_node->DataAtom()).size() + 1 /* closing > */ : col;
421
0
    line_col_in_html_src_ = LineCol({line + r_line - 1, effective_col});
422
0
  }
423
424
  // Update the positions of this node's children.
425
0
  for (auto c = FirstChild(); c; c = c->NextSibling()) {
426
0
    c->UpdateChildNodesPositions(relative_node);
427
0
  }
428
0
}
429
430
0
std::string Node::DebugString() {
431
0
  std::ostringstream ost;
432
0
  switch (node_type_) {
433
0
    case NodeType::ELEMENT_NODE:
434
0
      ost << "<" << AtomUtil::ToString(atom_) << ">";
435
0
      break;
436
0
    case NodeType::TEXT_NODE:
437
0
      ost << "TEXT[" << data_.size() << "]";
438
0
      break;
439
0
    case NodeType::COMMENT_NODE:
440
0
      ost << "COMMENT[" << data_.size() << "]";
441
0
      break;
442
0
    default:
443
      // Ignores doctype, error, document node types.
444
0
      break;
445
0
  }
446
447
0
  if (line_col_in_html_src_.has_value()) {
448
0
    ost << line_col_in_html_src_.value().first << ":"
449
0
        << line_col_in_html_src_.value().second;
450
0
  }
451
0
  ost << "\n";
452
453
0
  return ost.str();
454
0
}
455
456
}  // namespace htmlparser