Coverage Report

Created: 2023-11-19 06:28

/src/cmark/src/blocks.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * Block parsing implementation.
3
 *
4
 * For a high-level overview of the block parsing process,
5
 * see http://spec.commonmark.org/0.24/#phase-1-block-structure
6
 */
7
8
#include <stdlib.h>
9
#include <assert.h>
10
#include <stdio.h>
11
#include <limits.h>
12
13
#include "cmark_ctype.h"
14
#include "config.h"
15
#include "parser.h"
16
#include "cmark.h"
17
#include "node.h"
18
#include "references.h"
19
#include "utf8.h"
20
#include "scanners.h"
21
#include "inlines.h"
22
#include "houdini.h"
23
#include "buffer.h"
24
#include "chunk.h"
25
26
14.0M
// Indentation (in columns) that starts an indented code block.
#define CODE_INDENT 4
// Width of a tab stop, in columns.
#define TAB_STOP 4

// Smaller of two values. Every argument use is parenthesized so that
// arguments containing low-precedence operators expand as intended.
// Each argument may still be evaluated more than once.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif

// Element n of the data array of the object that i points to.
#define peek_at(i, n) ((i)->data[(n)])
34
35
1.58M
static bool S_last_line_blank(const cmark_node *node) {
36
1.58M
  return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != 0;
37
1.58M
}
38
39
144k
static bool S_last_line_checked(const cmark_node *node) {
40
144k
  return (node->flags & CMARK_NODE__LAST_LINE_CHECKED) != 0;
41
144k
}
42
43
126M
// The node's stored type field, converted to cmark_node_type.
static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) {
  const cmark_node_type kind = (cmark_node_type)node->type;
  return kind;
}
46
47
24.0M
// Set or clear the node's CMARK_NODE__LAST_LINE_BLANK flag according to
// is_blank.
static void S_set_last_line_blank(cmark_node *node, bool is_blank) {
  if (!is_blank) {
    node->flags &= ~CMARK_NODE__LAST_LINE_BLANK;
    return;
  }
  node->flags |= CMARK_NODE__LAST_LINE_BLANK;
}
53
54
142k
// Mark the node by setting its CMARK_NODE__LAST_LINE_CHECKED flag.
static void S_set_last_line_checked(cmark_node *node) {
  node->flags = node->flags | CMARK_NODE__LAST_LINE_CHECKED;
}
57
58
385M
// True for a line feed or a carriage return.
static CMARK_INLINE bool S_is_line_end_char(char c) {
  switch (c) {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
61
62
7.60M
// True for a space or a horizontal tab.
static CMARK_INLINE bool S_is_space_or_tab(char c) {
  switch (c) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
65
66
static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
67
                          size_t len, bool eof);
68
69
static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
70
                           bufsize_t bytes);
71
72
// Allocate a zero-filled node through mem, give it the requested type,
// mark it open, and record its start position. The end line is
// initialised to the start line.
static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag,
                              int start_line, int start_column) {
  cmark_node *node = (cmark_node *)mem->calloc(1, sizeof(cmark_node));

  node->mem = mem;
  node->type = (uint16_t)tag;
  node->flags = CMARK_NODE__OPEN;
  node->start_column = start_column;
  node->start_line = start_line;
  node->end_line = start_line;

  return node;
}
86
87
// Create a root document node.
88
39.9k
static cmark_node *make_document(cmark_mem *mem) {
89
39.9k
  cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, 1, 1);
90
39.9k
  return e;
91
39.9k
}
92
93
39.9k
// Allocate a parser through mem, create its root document node, set up
// its three string buffers and reference map, and reset all per-line
// scanning state. The returned parser's current block is the root.
cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
  cmark_parser *p = (cmark_parser *)mem->calloc(1, sizeof(*p));
  cmark_node *root;

  p->mem = mem;
  root = make_document(mem);

  cmark_strbuf_init(mem, &p->curline, 256);
  cmark_strbuf_init(mem, &p->linebuf, 0);
  cmark_strbuf_init(mem, &p->content, 0);
  p->refmap = cmark_reference_map_new(mem);

  // tree position and configuration
  p->root = root;
  p->current = root;
  p->options = options;

  // line bookkeeping
  p->line_number = 0;
  p->last_line_length = 0;
  p->last_buffer_ended_with_cr = false;

  // scanning state within the current line
  p->offset = 0;
  p->column = 0;
  p->first_nonspace = 0;
  p->first_nonspace_column = 0;
  p->indent = 0;
  p->blank = false;
  p->partially_consumed_tab = false;
  p->thematic_break_kill_pos = 0;

  return p;
}
121
122
39.9k
cmark_parser *cmark_parser_new(int options) {
123
39.9k
  extern cmark_mem DEFAULT_MEM_ALLOCATOR;
124
39.9k
  return cmark_parser_new_with_mem(options, &DEFAULT_MEM_ALLOCATOR);
125
39.9k
}
126
127
39.9k
// Release the parser's buffers and reference map, then the parser
// itself. The document tree (parser->root) is not freed here.
// Passing NULL is a no-op.
void cmark_parser_free(cmark_parser *parser) {
  cmark_mem *mem;

  if (parser == NULL)
    return;

  mem = parser->mem;
  cmark_strbuf_free(&parser->curline);
  cmark_strbuf_free(&parser->linebuf);
  // finalize_document() releases this buffer on the normal path; free it
  // here too so a parser destroyed before finishing does not leak any
  // pending block content.
  // NOTE(review): assumes cmark_strbuf_free is safe to call on a buffer
  // that was already freed - confirm against the strbuf implementation.
  cmark_strbuf_free(&parser->content);
  cmark_reference_map_free(parser->refmap);
  mem->free(parser);
}
134
135
static cmark_node *finalize(cmark_parser *parser, cmark_node *b);
136
137
// Returns true if line has only space characters, else false.
138
1.13M
static bool is_blank(cmark_strbuf *s, bufsize_t offset) {
139
1.13M
  while (offset < s->size) {
140
1.12M
    switch (s->ptr[offset]) {
141
0
    case '\r':
142
0
    case '\n':
143
0
      return true;
144
549
    case ' ':
145
549
      offset++;
146
549
      break;
147
1.08k
    case '\t':
148
1.08k
      offset++;
149
1.08k
      break;
150
1.12M
    default:
151
1.12M
      return false;
152
1.12M
    }
153
1.12M
  }
154
155
10.7k
  return true;
156
1.13M
}
157
158
// Whether a block of parent_type may hold a child of child_type:
// documents, block quotes and items accept any child; a list accepts
// only items; nothing else accepts children.
static CMARK_INLINE bool can_contain(cmark_node_type parent_type,
                                     cmark_node_type child_type) {
  switch (parent_type) {
  case CMARK_NODE_DOCUMENT:
  case CMARK_NODE_BLOCK_QUOTE:
  case CMARK_NODE_ITEM:
    return true;
  case CMARK_NODE_LIST:
    return child_type == CMARK_NODE_ITEM;
  default:
    return false;
  }
}
165
166
10.9M
// True for paragraph, heading and code-block types.
static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) {
  switch (block_type) {
  case CMARK_NODE_PARAGRAPH:
  case CMARK_NODE_HEADING:
  case CMARK_NODE_CODE_BLOCK:
    return true;
  default:
    return false;
  }
}
171
172
8.94M
// True for paragraph and heading types.
static CMARK_INLINE bool contains_inlines(cmark_node_type block_type) {
  switch (block_type) {
  case CMARK_NODE_PARAGRAPH:
  case CMARK_NODE_HEADING:
    return true;
  default:
    return false;
  }
}
176
177
6.20M
// Append the rest of the line (from parser->offset to the end of ch) to
// parser->content. If a tab was only partially consumed, step past it
// first and emit spaces up to the next tab stop in its place.
static void add_line(cmark_chunk *ch, cmark_parser *parser) {
  if (parser->partially_consumed_tab) {
    int pad;

    parser->offset += 1; // skip over tab
    // add space characters:
    pad = TAB_STOP - (parser->column % TAB_STOP);
    while (pad-- > 0) {
      cmark_strbuf_putc(&parser->content, ' ');
    }
  }

  cmark_strbuf_put(&parser->content, ch->data + parser->offset,
                   ch->len - parser->offset);
}
191
192
76.2k
// Find the last byte of ln that is not a space, tab or line ending.
// If there is none, clear the buffer. Otherwise truncate the buffer at
// the first line-ending character found at or after that byte.
static void remove_trailing_blank_lines(cmark_strbuf *ln) {
  bufsize_t idx = ln->size - 1;

  // walk backwards over trailing whitespace and line endings
  while (idx >= 0) {
    const unsigned char c = ln->ptr[idx];

    if (c != ' ' && c != '\t' && !S_is_line_end_char(c))
      break;
    idx--;
  }

  if (idx < 0) {
    cmark_strbuf_clear(ln);
    return;
  }

  // walk forwards to the line ending that terminates the last kept line
  while (idx < ln->size) {
    const unsigned char c = ln->ptr[idx];

    if (S_is_line_end_char(c)) {
      cmark_strbuf_truncate(ln, idx);
      break;
    }
    idx++;
  }
}
218
219
// Check to see if a node ends with a blank line, descending
220
// if needed into lists and sublists.
221
144k
static bool S_ends_with_blank_line(cmark_node *node) {
222
144k
  if (S_last_line_checked(node)) {
223
1.71k
    return(S_last_line_blank(node));
224
142k
  } else if ((S_type(node) == CMARK_NODE_LIST ||
225
142k
              S_type(node) == CMARK_NODE_ITEM) && node->last_child) {
226
56.7k
    S_set_last_line_checked(node);
227
56.7k
    return(S_ends_with_blank_line(node->last_child));
228
85.5k
  } else {
229
85.5k
    S_set_last_line_checked(node);
230
85.5k
    return (S_last_line_blank(node));
231
85.5k
  }
232
144k
}
233
234
// returns true if content remains after link defs are resolved.
235
1.13M
static bool resolve_reference_link_definitions(cmark_parser *parser) {
236
1.13M
  bufsize_t pos;
237
1.13M
  cmark_strbuf *node_content = &parser->content;
238
1.13M
  cmark_chunk chunk = {node_content->ptr, node_content->size};
239
1.22M
  while (chunk.len && chunk.data[0] == '[' &&
240
1.22M
         (pos = cmark_parse_reference_inline(parser->mem, &chunk,
241
177k
                                             parser->refmap))) {
242
243
89.3k
    chunk.data += pos;
244
89.3k
    chunk.len -= pos;
245
89.3k
  }
246
1.13M
  cmark_strbuf_drop(node_content, (node_content->size - chunk.len));
247
1.13M
  return !is_blank(node_content, 0);
248
1.13M
}
249
250
8.95M
// Close block b: clear its open flag, record its end position, and hand
// the text accumulated in parser->content over to the node in the way
// its type requires. A paragraph with no content left after reference
// definitions are resolved is freed. For lists, the tight/loose status
// is computed. Returns b's parent as it was on entry.
static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
  cmark_node *const parent = b->parent;
  cmark_strbuf *const content = &parser->content;
  const cmark_node_type kind = S_type(b);

  assert(b->flags &
         CMARK_NODE__OPEN); // shouldn't call finalize on closed blocks
  b->flags &= ~CMARK_NODE__OPEN;

  if (parser->curline.size == 0) {
    // end of input - line number has not been incremented
    b->end_line = parser->line_number;
    b->end_column = parser->last_line_length;
  } else if (kind == CMARK_NODE_DOCUMENT ||
             (kind == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
             (kind == CMARK_NODE_HEADING && b->as.heading.setext)) {
    // these blocks end on the current line; exclude its line ending
    b->end_line = parser->line_number;
    b->end_column = parser->curline.size;
    if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\n')
      b->end_column -= 1;
    if (b->end_column && parser->curline.ptr[b->end_column - 1] == '\r')
      b->end_column -= 1;
  } else {
    // everything else ended on the previous line
    b->end_line = parser->line_number - 1;
    b->end_column = parser->last_line_length;
  }

  switch (kind) {
  case CMARK_NODE_PARAGRAPH:
    if (resolve_reference_link_definitions(parser)) {
      b->len = content->size;
      b->data = cmark_strbuf_detach(content);
    } else {
      // remove blank node (former reference def)
      cmark_node_free(b);
    }
    break;

  case CMARK_NODE_CODE_BLOCK:
    if (b->as.code.fenced) {
      // first line of contents becomes info
      bufsize_t eol = 0;

      while (eol < content->size && !S_is_line_end_char(content->ptr[eol]))
        eol++;
      assert(eol < content->size);

      if (eol == 0) {
        b->as.code.info = NULL;
      } else {
        cmark_strbuf info = CMARK_BUF_INIT(parser->mem);

        houdini_unescape_html_f(&info, content->ptr, eol);
        cmark_strbuf_trim(&info);
        cmark_strbuf_unescape(&info);
        b->as.code.info = cmark_strbuf_detach(&info);
      }

      // step over the line ending that terminated the info line
      if (content->ptr[eol] == '\r')
        eol += 1;
      if (content->ptr[eol] == '\n')
        eol += 1;
      cmark_strbuf_drop(content, eol);
    } else {
      // indented code
      remove_trailing_blank_lines(content);
      cmark_strbuf_putc(content, '\n');
    }
    b->len = content->size;
    b->data = cmark_strbuf_detach(content);
    break;

  case CMARK_NODE_HEADING:
  case CMARK_NODE_HTML_BLOCK:
    b->len = content->size;
    b->data = cmark_strbuf_detach(content);
    break;

  case CMARK_NODE_LIST: { // determine tight/loose status
    bool tight = true;    // tight by default
    cmark_node *item;
    cmark_node *child;

    for (item = b->first_child; tight && item; item = item->next) {
      // check for non-final non-empty list item ending with blank line:
      if (S_last_line_blank(item) && item->next) {
        tight = false;
        break;
      }
      // look through the children of the list item, to see if there
      // are spaces between them:
      for (child = item->first_child; child; child = child->next) {
        if ((item->next || child->next) && S_ends_with_blank_line(child)) {
          tight = false;
          break;
        }
      }
    }

    b->as.list.tight = tight;
    break;
  }

  default:
    break;
  }

  return parent;
}
369
370
// Add a node as child of another.  Return pointer to child.
371
static cmark_node *add_child(cmark_parser *parser, cmark_node *parent,
372
8.91M
                             cmark_node_type block_type, int start_column) {
373
8.91M
  assert(parent);
374
375
  // if 'parent' isn't the kind of node that can accept this child,
376
  // then back up til we hit a node that can.
377
9.99M
  while (!can_contain(S_type(parent), block_type)) {
378
1.08M
    parent = finalize(parser, parent);
379
1.08M
  }
380
381
8.91M
  cmark_node *child =
382
8.91M
      make_block(parser->mem, block_type, parser->line_number, start_column);
383
8.91M
  child->parent = parent;
384
385
8.91M
  if (parent->last_child) {
386
2.37M
    parent->last_child->next = child;
387
2.37M
    child->prev = parent->last_child;
388
6.53M
  } else {
389
6.53M
    parent->first_child = child;
390
6.53M
    child->prev = NULL;
391
6.53M
  }
392
8.91M
  parent->last_child = child;
393
8.91M
  return child;
394
8.91M
}
395
396
// Walk through node and all children, recursively, parsing
397
// string content into inline content where appropriate.
398
static void process_inlines(cmark_mem *mem, cmark_node *root,
399
39.9k
                            cmark_reference_map *refmap, int options) {
400
39.9k
  cmark_iter *iter = cmark_iter_new(root);
401
39.9k
  cmark_node *cur;
402
39.9k
  cmark_event_type ev_type;
403
404
17.6M
  while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
405
17.6M
    cur = cmark_iter_get_node(iter);
406
17.6M
    if (ev_type == CMARK_EVENT_ENTER) {
407
8.94M
      if (contains_inlines(S_type(cur))) {
408
1.16M
        cmark_parse_inlines(mem, cur, refmap, options);
409
1.16M
        mem->free(cur->data);
410
1.16M
        cur->data = NULL;
411
1.16M
        cur->len = 0;
412
1.16M
      }
413
8.94M
    }
414
17.6M
  }
415
416
39.9k
  cmark_iter_free(iter);
417
39.9k
}
418
419
// Attempts to parse a list item marker (bullet or enumerated).
420
// On success, returns length of the marker, and populates
421
// data with the details.  On failure, returns 0.
422
static bufsize_t parse_list_marker(cmark_mem *mem, cmark_chunk *input,
423
                                   bufsize_t pos, bool interrupts_paragraph,
424
8.89M
                                   cmark_list **dataptr) {
425
8.89M
  unsigned char c;
426
8.89M
  bufsize_t startpos;
427
8.89M
  cmark_list *data;
428
8.89M
  bufsize_t i;
429
430
8.89M
  startpos = pos;
431
8.89M
  c = peek_at(input, pos);
432
433
8.89M
  if (c == '*' || c == '-' || c == '+') {
434
1.95M
    pos++;
435
1.95M
    if (!cmark_isspace(peek_at(input, pos))) {
436
364k
      return 0;
437
364k
    }
438
439
1.59M
    if (interrupts_paragraph) {
440
23.6k
      i = pos;
441
      // require non-blank content after list marker:
442
41.0k
      while (S_is_space_or_tab(peek_at(input, i))) {
443
17.3k
        i++;
444
17.3k
      }
445
23.6k
      if (peek_at(input, i) == '\n') {
446
8.73k
        return 0;
447
8.73k
      }
448
23.6k
    }
449
450
1.58M
    data = (cmark_list *)mem->calloc(1, sizeof(*data));
451
1.58M
    data->marker_offset = 0; // will be adjusted later
452
1.58M
    data->list_type = CMARK_BULLET_LIST;
453
1.58M
    data->bullet_char = c;
454
1.58M
    data->start = 0;
455
1.58M
    data->delimiter = CMARK_NO_DELIM;
456
1.58M
    data->tight = false;
457
6.94M
  } else if (cmark_isdigit(c)) {
458
63.8k
    int start = 0;
459
63.8k
    int digits = 0;
460
461
132k
    do {
462
132k
      start = (10 * start) + (peek_at(input, pos) - '0');
463
132k
      pos++;
464
132k
      digits++;
465
      // We limit to 9 digits to avoid overflow,
466
      // assuming max int is 2^31 - 1
467
      // This also seems to be the limit for 'start' in some browsers.
468
132k
    } while (digits < 9 && cmark_isdigit(peek_at(input, pos)));
469
470
63.8k
    if (interrupts_paragraph && start != 1) {
471
34.1k
      return 0;
472
34.1k
    }
473
29.7k
    c = peek_at(input, pos);
474
29.7k
    if (c == '.' || c == ')') {
475
10.1k
      pos++;
476
10.1k
      if (!cmark_isspace(peek_at(input, pos))) {
477
2.49k
        return 0;
478
2.49k
      }
479
7.66k
      if (interrupts_paragraph) {
480
        // require non-blank content after list marker:
481
1.62k
        i = pos;
482
2.67k
        while (S_is_space_or_tab(peek_at(input, i))) {
483
1.05k
          i++;
484
1.05k
        }
485
1.62k
        if (S_is_line_end_char(peek_at(input, i))) {
486
1.14k
          return 0;
487
1.14k
        }
488
1.62k
      }
489
490
6.52k
      data = (cmark_list *)mem->calloc(1, sizeof(*data));
491
6.52k
      data->marker_offset = 0; // will be adjusted later
492
6.52k
      data->list_type = CMARK_ORDERED_LIST;
493
6.52k
      data->bullet_char = 0;
494
6.52k
      data->start = start;
495
6.52k
      data->delimiter = (c == '.' ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM);
496
6.52k
      data->tight = false;
497
19.5k
    } else {
498
19.5k
      return 0;
499
19.5k
    }
500
6.88M
  } else {
501
6.88M
    return 0;
502
6.88M
  }
503
504
1.58M
  *dataptr = data;
505
1.58M
  return (pos - startpos);
506
8.89M
}
507
508
// Return 1 if list item belongs in list, else 0.
509
567k
static int lists_match(cmark_list *list_data, cmark_list *item_data) {
510
567k
  return (list_data->list_type == item_data->list_type &&
511
567k
          list_data->delimiter == item_data->delimiter &&
512
          // list_data->marker_offset == item_data.marker_offset &&
513
567k
          list_data->bullet_char == item_data->bullet_char);
514
567k
}
515
516
39.9k
// Finalize every block from the current one up to and including the
// root, set the reference-expansion budget, run inline parsing over the
// whole tree, and release the content buffer. Returns the root node.
static cmark_node *finalize_document(cmark_parser *parser) {
  for (; parser->current != parser->root;) {
    parser->current = finalize(parser, parser->current);
  }

  finalize(parser, parser->root);

  // Limit total size of extra content created from reference links to
  // document size to avoid superlinear growth. Always allow 100KB.
  parser->refmap->max_ref_size =
      (parser->total_size > 100000) ? parser->total_size : 100000;

  process_inlines(parser->mem, parser->root, parser->refmap, parser->options);

  cmark_strbuf_free(&parser->content);

  return parser->root;
}
536
537
0
cmark_node *cmark_parse_file(FILE *f, int options) {
538
0
  unsigned char buffer[4096];
539
0
  cmark_parser *parser = cmark_parser_new(options);
540
0
  size_t bytes;
541
0
  cmark_node *document;
542
543
0
  while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0) {
544
0
    bool eof = bytes < sizeof(buffer);
545
0
    S_parser_feed(parser, buffer, bytes, eof);
546
0
    if (eof) {
547
0
      break;
548
0
    }
549
0
  }
550
551
0
  document = cmark_parser_finish(parser);
552
0
  cmark_parser_free(parser);
553
0
  return document;
554
0
}
555
556
39.9k
cmark_node *cmark_parse_document(const char *buffer, size_t len, int options) {
557
39.9k
  cmark_parser *parser = cmark_parser_new(options);
558
39.9k
  cmark_node *document;
559
560
39.9k
  S_parser_feed(parser, (const unsigned char *)buffer, len, true);
561
562
39.9k
  document = cmark_parser_finish(parser);
563
39.9k
  cmark_parser_free(parser);
564
39.9k
  return document;
565
39.9k
}
566
567
0
// Feed len bytes at buffer to the parser as a non-final chunk of input.
void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) {
  const unsigned char *bytes = (const unsigned char *)buffer;

  S_parser_feed(parser, bytes, len, false);
}
570
571
// Split len bytes at buffer into lines and pass each complete line
// (without its line ending) to S_process_line.
//
// - parser->total_size accumulates the bytes fed, saturating at UINT_MAX.
// - A UTF-8 BOM at the very start of input is skipped.
// - A '\n' that starts this buffer is skipped when the previous buffer
//   ended with '\r', so a CRLF pair split across buffers is one ending.
// - NUL bytes are replaced by U+FFFD (bytes EF BF BD); the text so far is
//   collected in parser->linebuf until the line is complete.
// - When eof is false, a trailing partial line is kept in parser->linebuf;
//   when eof is true it is processed as a line.
//
// An empty buffer (len == 0) is ignored entirely, leaving all parser
// state, including the pending-CR flag, unchanged.
static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
                          size_t len, bool eof) {
  static const uint8_t repl[] = {239, 191, 189};
  const unsigned char *end;

  // Nothing to scan. Returning here avoids reading *buffer when no bytes
  // are available and keeps last_buffer_ended_with_cr for the next
  // non-empty buffer.
  if (len == 0)
    return;

  end = buffer + len;

  if (len > UINT_MAX - parser->total_size)
    parser->total_size = UINT_MAX;
  else
    parser->total_size += len;

  // Skip UTF-8 BOM if present; see #334
  if (parser->line_number == 0 && parser->column == 0 && len >= 3 &&
      *buffer == 0xEF && *(buffer + 1) == 0xBB &&
      *(buffer + 2) == 0xBF) {
    buffer += 3;
  } else if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
    // skip NL if last buffer ended with CR ; see #117
    buffer++;
  }

  parser->last_buffer_ended_with_cr = false;
  while (buffer < end) {
    const unsigned char *eol;
    bufsize_t chunk_len;
    bool process = false;

    // find the end of this piece: a line ending, a NUL, or end of buffer
    for (eol = buffer; eol < end; ++eol) {
      if (S_is_line_end_char(*eol)) {
        process = true;
        break;
      }
      if (*eol == '\0') {
        break;
      }
    }
    if (eol >= end && eof) {
      process = true;
    }

    chunk_len = (eol - buffer);
    if (process) {
      if (parser->linebuf.size > 0) {
        // complete a line that was started earlier
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
        S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
        cmark_strbuf_clear(&parser->linebuf);
      } else {
        S_process_line(parser, buffer, chunk_len);
      }
    } else {
      if (eol < end && *eol == '\0') {
        // omit NULL byte
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
        // add replacement character
        cmark_strbuf_put(&parser->linebuf, repl, 3);
      } else {
        // partial line at end of a non-final buffer: keep it for later
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
      }
    }

    buffer += chunk_len;
    if (buffer < end) {
      if (*buffer == '\0') {
        // skip over NULL
        buffer++;
      } else {
        // skip over line ending characters
        if (*buffer == '\r') {
          buffer++;
          if (buffer == end)
            parser->last_buffer_ended_with_cr = true;
        }
        if (buffer < end && *buffer == '\n')
          buffer++;
      }
    }
  }
}
647
648
47.5k
// Right-trim ch; then, if it ends in a run of '#' characters that is
// preceded by a space or tab, cut the chunk just before that space or
// tab and right-trim again. Otherwise leave the trimmed chunk as is.
static void chop_trailing_hashtags(cmark_chunk *ch) {
  bufsize_t last;
  bufsize_t cut;

  cmark_chunk_rtrim(ch);
  last = ch->len - 1;

  // step back over the closing run of #s
  for (cut = last; cut >= 0 && peek_at(ch, cut) == '#'; cut--) {
  }

  if (cut == last || cut < 0)
    return; // no trailing #s, or nothing but #s

  // the run only counts when a space or tab comes before it
  if (!S_is_space_or_tab(peek_at(ch, cut)))
    return;

  ch->len = cut;
  cmark_chunk_rtrim(ch);
}
664
665
// Check for thematic break.  On failure, return 0 and update
666
// thematic_break_kill_pos with the index at which the
667
// parse fails.  On success, return length of match.
668
// "...three or more hyphens, asterisks,
669
// or underscores on a line by themselves. If you wish, you may use
670
// spaces between the hyphens or asterisks."
671
static int S_scan_thematic_break(cmark_parser *parser, cmark_chunk *input,
672
8.50M
                                 bufsize_t offset) {
673
8.50M
  bufsize_t i;
674
8.50M
  char c;
675
8.50M
  char nextc = '\0';
676
8.50M
  int count;
677
8.50M
  i = offset;
678
8.50M
  c = peek_at(input, i);
679
8.50M
  if (!(c == '*' || c == '_' || c == '-')) {
680
6.94M
    parser->thematic_break_kill_pos = i;
681
6.94M
    return 0;
682
6.94M
  }
683
1.55M
  count = 1;
684
5.50M
  while ((nextc = peek_at(input, ++i))) {
685
5.50M
    if (nextc == c) {
686
2.70M
      count++;
687
2.79M
    } else if (nextc != ' ' && nextc != '\t') {
688
1.55M
      break;
689
1.55M
    }
690
5.50M
  }
691
1.55M
  if (count >= 3 && (nextc == '\r' || nextc == '\n')) {
692
105k
    return (i - offset) + 1;
693
1.45M
  } else {
694
1.45M
    parser->thematic_break_kill_pos = i;
695
1.45M
    return 0;
696
1.45M
  }
697
1.55M
}
698
699
// Find first nonspace character from current offset, setting
700
// parser->first_nonspace, parser->first_nonspace_column,
701
// parser->indent, and parser->blank. Does not advance parser->offset.
702
32.0M
static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) {
703
32.0M
  char c;
704
32.0M
  int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
705
706
32.0M
  if (parser->first_nonspace <= parser->offset) {
707
31.4M
    parser->first_nonspace = parser->offset;
708
31.4M
    parser->first_nonspace_column = parser->column;
709
35.7M
    while ((c = peek_at(input, parser->first_nonspace))) {
710
35.6M
      if (c == ' ') {
711
3.45M
        parser->first_nonspace += 1;
712
3.45M
        parser->first_nonspace_column += 1;
713
3.45M
        chars_to_tab = chars_to_tab - 1;
714
3.45M
        if (chars_to_tab == 0) {
715
834k
          chars_to_tab = TAB_STOP;
716
834k
        }
717
32.2M
      } else if (c == '\t') {
718
830k
        parser->first_nonspace += 1;
719
830k
        parser->first_nonspace_column += chars_to_tab;
720
830k
        chars_to_tab = TAB_STOP;
721
31.4M
      } else {
722
31.4M
        break;
723
31.4M
      }
724
35.6M
    }
725
31.4M
  }
726
727
32.0M
  parser->indent = parser->first_nonspace_column - parser->column;
728
32.0M
  parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
729
32.0M
}
730
731
// Advance parser->offset and parser->column.  parser->offset is the
732
// byte position in input; parser->column is a virtual column number
733
// that takes into account tabs. (Multibyte characters are not taken
734
// into account, because the Markdown line prefixes we are interested in
735
// analyzing are entirely ASCII.)  The count parameter indicates
736
// how far to advance the offset.  If columns is true, then count
737
// indicates a number of columns; otherwise, a number of bytes.
738
// If advancing a certain number of columns partially consumes
739
// a tab character, parser->partially_consumed_tab is set to true.
740
static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
741
13.0M
                             bufsize_t count, bool columns) {
742
13.0M
  char c;
743
13.0M
  int chars_to_tab;
744
13.0M
  int chars_to_advance;
745
23.4M
  while (count > 0 && (c = peek_at(input, parser->offset))) {
746
10.4M
    if (c == '\t') {
747
1.59M
      chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
748
1.59M
      if (columns) {
749
1.39M
        parser->partially_consumed_tab = chars_to_tab > count;
750
1.39M
        chars_to_advance = MIN(count, chars_to_tab);
751
1.39M
        parser->column += chars_to_advance;
752
1.39M
        parser->offset += (parser->partially_consumed_tab ? 0 : 1);
753
1.39M
        count -= chars_to_advance;
754
1.39M
      } else {
755
196k
        parser->partially_consumed_tab = false;
756
196k
        parser->column += chars_to_tab;
757
196k
        parser->offset += 1;
758
196k
        count -= 1;
759
196k
      }
760
8.85M
    } else {
761
8.85M
      parser->partially_consumed_tab = false;
762
8.85M
      parser->offset += 1;
763
8.85M
      parser->column += 1; // assume ascii; block starts are ascii
764
8.85M
      count -= 1;
765
8.85M
    }
766
10.4M
  }
767
13.0M
}
768
769
15.7M
// True when container has a last child and that child's open flag is set.
static bool S_last_child_is_open(cmark_node *container) {
  cmark_node *tail = container->last_child;

  if (tail == NULL)
    return false;
  return (tail->flags & CMARK_NODE__OPEN) != 0;
}
773
774
430k
// If the line is indented at most 3 columns and its first non-space
// byte is '>', advance past the indent and the '>', plus one following
// space or tab if present, and return true. Otherwise return false
// without advancing.
static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
  if (parser->indent > 3 || peek_at(input, parser->first_nonspace) != '>')
    return false;

  S_advance_offset(parser, input, parser->indent + 1, true);

  if (S_is_space_or_tab(peek_at(input, parser->offset))) {
    S_advance_offset(parser, input, 1, true);
  }

  return true;
}
792
793
static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
794
1.58M
                                   cmark_node *container) {
795
1.58M
  bool res = false;
796
797
1.58M
  if (parser->indent >=
798
1.58M
      container->as.list.marker_offset + container->as.list.padding) {
799
107k
    S_advance_offset(parser, input, container->as.list.marker_offset +
800
107k
                                        container->as.list.padding,
801
107k
                     true);
802
107k
    res = true;
803
1.47M
  } else if (parser->blank && container->first_child != NULL) {
804
    // if container->first_child is NULL, then the opening line
805
    // of the list item was blank after the list marker; in this
806
    // case, we are done with the list item.
807
272k
    S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
808
272k
                     false);
809
272k
    res = true;
810
272k
  }
811
1.58M
  return res;
812
1.58M
}
813
814
static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
815
                                    cmark_node *container,
816
234k
                                    bool *should_continue) {
817
234k
  bool res = false;
818
819
234k
  if (!container->as.code.fenced) { // indented
820
152k
    if (parser->indent >= CODE_INDENT) {
821
8.64k
      S_advance_offset(parser, input, CODE_INDENT, true);
822
8.64k
      res = true;
823
144k
    } else if (parser->blank) {
824
72.0k
      S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
825
72.0k
                       false);
826
72.0k
      res = true;
827
72.0k
    }
828
152k
  } else { // fenced
829
81.3k
    bufsize_t matched = 0;
830
831
81.3k
    if (parser->indent <= 3 && (peek_at(input, parser->first_nonspace) ==
832
80.1k
                                container->as.code.fence_char)) {
833
20.3k
      matched = scan_close_code_fence(input, parser->first_nonspace);
834
20.3k
    }
835
836
81.3k
    if (matched >= container->as.code.fence_length) {
837
      // closing fence - and since we're at
838
      // the end of a line, we can stop processing it:
839
5.74k
      *should_continue = false;
840
5.74k
      S_advance_offset(parser, input, matched, false);
841
5.74k
      parser->current = finalize(parser, container);
842
75.6k
    } else {
843
      // skip opt. spaces of fence parser->offset
844
75.6k
      int i = container->as.code.fence_offset;
845
846
77.5k
      while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) {
847
1.94k
        S_advance_offset(parser, input, 1, true);
848
1.94k
        i--;
849
1.94k
      }
850
75.6k
      res = true;
851
75.6k
    }
852
81.3k
  }
853
854
234k
  return res;
855
234k
}
856
857
static bool parse_html_block_prefix(cmark_parser *parser,
858
917k
                                    cmark_node *container) {
859
917k
  bool res = false;
860
917k
  int html_block_type = container->as.html_block_type;
861
862
917k
  assert(html_block_type >= 1 && html_block_type <= 7);
863
917k
  switch (html_block_type) {
864
560k
  case 1:
865
620k
  case 2:
866
697k
  case 3:
867
771k
  case 4:
868
863k
  case 5:
869
    // these types of blocks can accept blanks
870
863k
    res = true;
871
863k
    break;
872
47.0k
  case 6:
873
53.9k
  case 7:
874
53.9k
    res = !parser->blank;
875
53.9k
    break;
876
917k
  }
877
878
917k
  return res;
879
917k
}
880
881
/**
882
 * For each containing node, try to parse the associated line start.
883
 *
884
 * Will not close unmatched blocks, as we may have a lazy continuation
885
 * line -> http://spec.commonmark.org/0.24/#lazy-continuation-line
886
 *
887
 * Returns: The last matching node, or NULL
888
 */
889
static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input,
890
8.74M
                                     bool *all_matched) {
891
8.74M
  bool should_continue = true;
892
8.74M
  *all_matched = false;
893
8.74M
  cmark_node *container = parser->root;
894
8.74M
  cmark_node_type cont_type;
895
896
15.7M
  while (S_last_child_is_open(container)) {
897
9.55M
    container = container->last_child;
898
9.55M
    cont_type = S_type(container);
899
900
9.55M
    S_find_first_nonspace(parser, input);
901
902
9.55M
    switch (cont_type) {
903
430k
    case CMARK_NODE_BLOCK_QUOTE:
904
430k
      if (!parse_block_quote_prefix(parser, input))
905
395k
        goto done;
906
34.8k
      break;
907
1.58M
    case CMARK_NODE_ITEM:
908
1.58M
      if (!parse_node_item_prefix(parser, input, container))
909
1.20M
        goto done;
910
379k
      break;
911
379k
    case CMARK_NODE_CODE_BLOCK:
912
234k
      if (!parse_code_block_prefix(parser, input, container, &should_continue))
913
77.8k
        goto done;
914
156k
      break;
915
156k
    case CMARK_NODE_HEADING:
916
      // a heading can never contain more than one line
917
80.9k
      goto done;
918
917k
    case CMARK_NODE_HTML_BLOCK:
919
917k
      if (!parse_html_block_prefix(parser, container))
920
31.7k
        goto done;
921
885k
      break;
922
4.34M
    case CMARK_NODE_PARAGRAPH:
923
4.34M
      if (parser->blank)
924
789k
        goto done;
925
3.56M
      break;
926
3.56M
    default:
927
1.96M
      break;
928
9.55M
    }
929
9.55M
  }
930
931
6.17M
  *all_matched = true;
932
933
8.74M
done:
934
8.74M
  if (!*all_matched) {
935
2.57M
    container = container->parent; // back up to last matching node
936
2.57M
  }
937
938
8.74M
  if (!should_continue) {
939
5.74k
    container = NULL;
940
5.74k
  }
941
942
8.74M
  return container;
943
6.17M
}
944
945
static void open_new_blocks(cmark_parser *parser, cmark_node **container,
946
8.74M
                            cmark_chunk *input, bool all_matched) {
947
8.74M
  bool indented;
948
8.74M
  cmark_list *data = NULL;
949
8.74M
  bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH;
950
8.74M
  cmark_node_type cont_type = S_type(*container);
951
8.74M
  bufsize_t matched = 0;
952
8.74M
  int lev = 0;
953
8.74M
  bool save_partially_consumed_tab;
954
8.74M
  bool has_content;
955
8.74M
  int save_offset;
956
8.74M
  int save_column;
957
958
14.9M
  while (cont_type != CMARK_NODE_CODE_BLOCK &&
959
14.9M
         cont_type != CMARK_NODE_HTML_BLOCK) {
960
961
13.7M
    S_find_first_nonspace(parser, input);
962
13.7M
    indented = parser->indent >= CODE_INDENT;
963
964
13.7M
    if (!indented && peek_at(input, parser->first_nonspace) == '>') {
965
966
4.40M
      bufsize_t blockquote_startpos = parser->first_nonspace;
967
968
4.40M
      S_advance_offset(parser, input,
969
4.40M
                       parser->first_nonspace + 1 - parser->offset, false);
970
      // optional following character
971
4.40M
      if (S_is_space_or_tab(peek_at(input, parser->offset))) {
972
21.9k
        S_advance_offset(parser, input, 1, true);
973
21.9k
      }
974
4.40M
      *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
975
4.40M
                             blockquote_startpos + 1);
976
977
9.39M
    } else if (!indented && (matched = scan_atx_heading_start(
978
9.22M
                                 input, parser->first_nonspace))) {
979
48.0k
      bufsize_t hashpos;
980
48.0k
      int level = 0;
981
48.0k
      bufsize_t heading_startpos = parser->first_nonspace;
982
983
48.0k
      S_advance_offset(parser, input,
984
48.0k
                       parser->first_nonspace + matched - parser->offset,
985
48.0k
                       false);
986
48.0k
      *container = add_child(parser, *container, CMARK_NODE_HEADING,
987
48.0k
                             heading_startpos + 1);
988
989
48.0k
      hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace);
990
991
140k
      while (peek_at(input, hashpos) == '#') {
992
92.3k
        level++;
993
92.3k
        hashpos++;
994
92.3k
      }
995
996
48.0k
      (*container)->as.heading.level = level;
997
48.0k
      (*container)->as.heading.setext = false;
998
48.0k
      (*container)->as.heading.internal_offset = matched;
999
1000
9.34M
    } else if (!indented && (matched = scan_open_code_fence(
1001
9.17M
                                 input, parser->first_nonspace))) {
1002
12.4k
      *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1003
12.4k
                             parser->first_nonspace + 1);
1004
12.4k
      (*container)->as.code.fenced = true;
1005
12.4k
      (*container)->as.code.fence_char = peek_at(input, parser->first_nonspace);
1006
12.4k
      (*container)->as.code.fence_length = (matched > 255) ? 255 : matched;
1007
12.4k
      (*container)->as.code.fence_offset =
1008
12.4k
          (int8_t)(parser->first_nonspace - parser->offset);
1009
12.4k
      (*container)->as.code.info = NULL;
1010
12.4k
      S_advance_offset(parser, input,
1011
12.4k
                       parser->first_nonspace + matched - parser->offset,
1012
12.4k
                       false);
1013
1014
9.33M
    } else if (!indented && ((matched = scan_html_block_start(
1015
9.16M
                                  input, parser->first_nonspace)) ||
1016
9.16M
                             (cont_type != CMARK_NODE_PARAGRAPH &&
1017
9.08M
                              !maybe_lazy &&
1018
9.08M
                              (matched = scan_html_block_start_7(
1019
4.35M
                                   input, parser->first_nonspace))))) {
1020
86.3k
      *container = add_child(parser, *container, CMARK_NODE_HTML_BLOCK,
1021
86.3k
                             parser->first_nonspace + 1);
1022
86.3k
      (*container)->as.html_block_type = matched;
1023
      // note, we don't adjust parser->offset because the tag is part of the
1024
      // text
1025
9.24M
    } else if (!indented && cont_type == CMARK_NODE_PARAGRAPH &&
1026
9.24M
               (lev =
1027
3.40M
                    scan_setext_heading_line(input, parser->first_nonspace))) {
1028
      // finalize paragraph, resolving reference links
1029
73.5k
      has_content = resolve_reference_link_definitions(parser);
1030
1031
73.5k
      if (has_content) {
1032
1033
73.1k
        (*container)->type = (uint16_t)CMARK_NODE_HEADING;
1034
73.1k
        (*container)->as.heading.level = lev;
1035
73.1k
        (*container)->as.heading.setext = true;
1036
73.1k
        S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1037
73.1k
      }
1038
9.17M
    } else if (!indented &&
1039
9.17M
               !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) &&
1040
9.17M
               (parser->thematic_break_kill_pos <= parser->first_nonspace) &&
1041
9.17M
               S_scan_thematic_break(parser, input, parser->first_nonspace)) {
1042
      // it's only now that we know the line is not part of a setext heading:
1043
105k
      *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
1044
105k
                             parser->first_nonspace + 1);
1045
105k
      S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
1046
9.06M
    } else if ((!indented || cont_type == CMARK_NODE_LIST) &&
1047
9.06M
               parser->indent < 4 &&
1048
9.06M
               (matched = parse_list_marker(
1049
8.89M
                    parser->mem, input, parser->first_nonspace,
1050
8.89M
                    (*container)->type == CMARK_NODE_PARAGRAPH, &data))) {
1051
1052
      // Note that we can have new list items starting with >= 4
1053
      // spaces indent, as long as the list container is still open.
1054
1.58M
      int i = 0;
1055
1056
      // compute padding:
1057
1.58M
      S_advance_offset(parser, input,
1058
1.58M
                       parser->first_nonspace + matched - parser->offset,
1059
1.58M
                       false);
1060
1061
1.58M
      save_partially_consumed_tab = parser->partially_consumed_tab;
1062
1.58M
      save_offset = parser->offset;
1063
1.58M
      save_column = parser->column;
1064
1065
3.09M
      while (parser->column - save_column <= 5 &&
1066
3.09M
             S_is_space_or_tab(peek_at(input, parser->offset))) {
1067
1.50M
        S_advance_offset(parser, input, 1, true);
1068
1.50M
      }
1069
1070
1.58M
      i = parser->column - save_column;
1071
1.58M
      if (i >= 5 || i < 1 ||
1072
          // only spaces after list marker:
1073
1.58M
          S_is_line_end_char(peek_at(input, parser->offset))) {
1074
917k
        data->padding = matched + 1;
1075
917k
        parser->offset = save_offset;
1076
917k
        parser->column = save_column;
1077
917k
        parser->partially_consumed_tab = save_partially_consumed_tab;
1078
917k
        if (i > 0) {
1079
118k
          S_advance_offset(parser, input, 1, true);
1080
118k
        }
1081
917k
      } else {
1082
671k
        data->padding = matched + i;
1083
671k
      }
1084
1085
      // check container; if it's a list, see if this list item
1086
      // can continue the list; otherwise, create a list container.
1087
1088
1.58M
      data->marker_offset = parser->indent;
1089
1090
1.58M
      if (cont_type != CMARK_NODE_LIST ||
1091
1.58M
          !lists_match(&((*container)->as.list), data)) {
1092
1.45M
        *container = add_child(parser, *container, CMARK_NODE_LIST,
1093
1.45M
                               parser->first_nonspace + 1);
1094
1095
1.45M
        memcpy(&((*container)->as.list), data, sizeof(*data));
1096
1.45M
      }
1097
1098
      // add the list item
1099
1.58M
      *container = add_child(parser, *container, CMARK_NODE_ITEM,
1100
1.58M
                             parser->first_nonspace + 1);
1101
      /* TODO: static */
1102
1.58M
      memcpy(&((*container)->as.list), data, sizeof(*data));
1103
1.58M
      parser->mem->free(data);
1104
7.47M
    } else if (indented && !maybe_lazy && !parser->blank) {
1105
76.2k
      S_advance_offset(parser, input, CODE_INDENT, true);
1106
76.2k
      *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
1107
76.2k
                             parser->offset + 1);
1108
76.2k
      (*container)->as.code.fenced = false;
1109
76.2k
      (*container)->as.code.fence_char = 0;
1110
76.2k
      (*container)->as.code.fence_length = 0;
1111
76.2k
      (*container)->as.code.fence_offset = 0;
1112
76.2k
      (*container)->as.code.info = NULL;
1113
1114
7.40M
    } else {
1115
7.40M
      break;
1116
7.40M
    }
1117
1118
6.39M
    if (accepts_lines(S_type(*container))) {
1119
      // if it's a line container, it can't contain other containers
1120
210k
      break;
1121
210k
    }
1122
1123
6.18M
    cont_type = S_type(*container);
1124
6.18M
    maybe_lazy = false;
1125
6.18M
  }
1126
8.74M
}
1127
1128
static void add_text_to_container(cmark_parser *parser, cmark_node *container,
1129
                                  cmark_node *last_matched_container,
1130
8.74M
                                  cmark_chunk *input) {
1131
8.74M
  cmark_node *tmp;
1132
  // what remains at parser->offset is a text line.  add the text to the
1133
  // appropriate container.
1134
1135
8.74M
  S_find_first_nonspace(parser, input);
1136
1137
8.74M
  if (parser->blank && container->last_child)
1138
1.30M
    S_set_last_line_blank(container->last_child, true);
1139
1140
  // block quote lines are never blank as they start with >
1141
  // and we don't count blanks in fenced code for purposes of tight/loose
1142
  // lists or breaking out of lists.  we also don't set last_line_blank
1143
  // on an empty list item.
1144
8.74M
  const cmark_node_type ctype = S_type(container);
1145
8.74M
  const bool last_line_blank =
1146
8.74M
      (parser->blank && ctype != CMARK_NODE_BLOCK_QUOTE &&
1147
8.74M
       ctype != CMARK_NODE_HEADING && ctype != CMARK_NODE_THEMATIC_BREAK &&
1148
8.74M
       !(ctype == CMARK_NODE_CODE_BLOCK && container->as.code.fenced) &&
1149
8.74M
       !(ctype == CMARK_NODE_ITEM && container->first_child == NULL &&
1150
2.38M
         container->start_line == parser->line_number));
1151
1152
8.74M
  S_set_last_line_blank(container, last_line_blank);
1153
1154
8.74M
  tmp = container;
1155
22.6M
  while (tmp->parent) {
1156
13.9M
    S_set_last_line_blank(tmp->parent, false);
1157
13.9M
    tmp = tmp->parent;
1158
13.9M
  }
1159
1160
  // If the last line processed belonged to a paragraph node,
1161
  // and we didn't match all of the line prefixes for the open containers,
1162
  // and we didn't start any new containers,
1163
  // and the line isn't blank,
1164
  // then treat this as a "lazy continuation line" and add it to
1165
  // the open paragraph.
1166
8.74M
  if (parser->current != last_matched_container &&
1167
8.74M
      container == last_matched_container && !parser->blank &&
1168
8.74M
      S_type(parser->current) == CMARK_NODE_PARAGRAPH) {
1169
422k
    add_line(input, parser);
1170
8.31M
  } else { // not a lazy continuation
1171
    // Finalize any blocks that were not matched and set cur to container:
1172
15.6M
    while (parser->current != last_matched_container) {
1173
7.33M
      parser->current = finalize(parser, parser->current);
1174
7.33M
      assert(parser->current != NULL);
1175
7.33M
    }
1176
1177
8.31M
    if (S_type(container) == CMARK_NODE_CODE_BLOCK) {
1178
245k
      add_line(input, parser);
1179
8.07M
    } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) {
1180
972k
      add_line(input, parser);
1181
1182
972k
      int matches_end_condition;
1183
972k
      switch (container->as.html_block_type) {
1184
580k
      case 1:
1185
        // </script>, </style>, </textarea>, </pre>
1186
580k
        matches_end_condition =
1187
580k
            scan_html_block_end_1(input, parser->first_nonspace);
1188
580k
        break;
1189
68.0k
      case 2:
1190
        // -->
1191
68.0k
        matches_end_condition =
1192
68.0k
            scan_html_block_end_2(input, parser->first_nonspace);
1193
68.0k
        break;
1194
86.6k
      case 3:
1195
        // ?>
1196
86.6k
        matches_end_condition =
1197
86.6k
            scan_html_block_end_3(input, parser->first_nonspace);
1198
86.6k
        break;
1199
82.1k
      case 4:
1200
        // >
1201
82.1k
        matches_end_condition =
1202
82.1k
            scan_html_block_end_4(input, parser->first_nonspace);
1203
82.1k
        break;
1204
99.7k
      case 5:
1205
        // ]]>
1206
99.7k
        matches_end_condition =
1207
99.7k
            scan_html_block_end_5(input, parser->first_nonspace);
1208
99.7k
        break;
1209
54.6k
      default:
1210
54.6k
        matches_end_condition = 0;
1211
54.6k
        break;
1212
972k
      }
1213
1214
972k
      if (matches_end_condition) {
1215
48.2k
        container = finalize(parser, container);
1216
48.2k
        assert(parser->current != NULL);
1217
48.2k
      }
1218
7.10M
    } else if (parser->blank) {
1219
      // ??? do nothing
1220
4.56M
    } else if (accepts_lines(S_type(container))) {
1221
3.42M
      if (S_type(container) == CMARK_NODE_HEADING &&
1222
3.42M
          container->as.heading.setext == false) {
1223
47.5k
        chop_trailing_hashtags(input);
1224
47.5k
      }
1225
3.42M
      S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1226
3.42M
                       false);
1227
3.42M
      add_line(input, parser);
1228
3.42M
    } else {
1229
      // create paragraph container for line
1230
1.13M
      container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
1231
1.13M
                            parser->first_nonspace + 1);
1232
1.13M
      S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
1233
1.13M
                       false);
1234
1.13M
      add_line(input, parser);
1235
1.13M
    }
1236
1237
8.31M
    parser->current = container;
1238
8.31M
  }
1239
8.74M
}
1240
1241
/* See http://spec.commonmark.org/0.24/#phase-1-block-structure */
1242
static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
1243
8.74M
                           bufsize_t bytes) {
1244
8.74M
  cmark_node *last_matched_container;
1245
8.74M
  bool all_matched = true;
1246
8.74M
  cmark_node *container;
1247
8.74M
  cmark_chunk input;
1248
1249
8.74M
  if (parser->options & CMARK_OPT_VALIDATE_UTF8)
1250
2.21M
    cmark_utf8proc_check(&parser->curline, buffer, bytes);
1251
6.53M
  else
1252
6.53M
    cmark_strbuf_put(&parser->curline, buffer, bytes);
1253
1254
8.74M
  bytes = parser->curline.size;
1255
1256
  // ensure line ends with a newline:
1257
8.74M
  if (bytes == 0 || !S_is_line_end_char(parser->curline.ptr[bytes - 1]))
1258
8.74M
    cmark_strbuf_putc(&parser->curline, '\n');
1259
1260
8.74M
  parser->offset = 0;
1261
8.74M
  parser->column = 0;
1262
8.74M
  parser->first_nonspace = 0;
1263
8.74M
  parser->first_nonspace_column = 0;
1264
8.74M
  parser->thematic_break_kill_pos = 0;
1265
8.74M
  parser->indent = 0;
1266
8.74M
  parser->blank = false;
1267
8.74M
  parser->partially_consumed_tab = false;
1268
1269
8.74M
  input.data = parser->curline.ptr;
1270
8.74M
  input.len = parser->curline.size;
1271
1272
8.74M
  parser->line_number++;
1273
1274
8.74M
  last_matched_container = check_open_blocks(parser, &input, &all_matched);
1275
1276
8.74M
  if (!last_matched_container)
1277
5.74k
    goto finished;
1278
1279
8.74M
  container = last_matched_container;
1280
1281
8.74M
  open_new_blocks(parser, &container, &input, all_matched);
1282
1283
8.74M
  add_text_to_container(parser, container, last_matched_container, &input);
1284
1285
8.74M
finished:
1286
8.74M
  parser->last_line_length = input.len;
1287
8.74M
  if (parser->last_line_length &&
1288
8.74M
      input.data[parser->last_line_length - 1] == '\n')
1289
8.70M
    parser->last_line_length -= 1;
1290
8.74M
  if (parser->last_line_length &&
1291
8.74M
      input.data[parser->last_line_length - 1] == '\r')
1292
0
    parser->last_line_length -= 1;
1293
1294
8.74M
  cmark_strbuf_clear(&parser->curline);
1295
8.74M
}
1296
1297
39.9k
/**
 * Finish parsing and return the document root.
 *
 * Any bytes still held in parser->linebuf are processed as a final line
 * and the buffer is cleared. The document is then finalized, text nodes
 * under the root are consolidated, and parser->curline is freed. When
 * CMARK_DEBUG_NODES is enabled, the process aborts if cmark_node_check()
 * returns non-zero for the tree.
 */
cmark_node *cmark_parser_finish(cmark_parser *parser) {
  cmark_node *document;

  // flush a trailing line that was not terminated by a newline
  if (parser->linebuf.size) {
    S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
    cmark_strbuf_clear(&parser->linebuf);
  }

  finalize_document(parser);

  document = parser->root;
  cmark_consolidate_text_nodes(document);

  cmark_strbuf_free(&parser->curline);

#if CMARK_DEBUG_NODES
  if (cmark_node_check(document, stderr)) {
    abort();
  }
#endif
  return document;
}