Coverage Report

Created: 2023-03-17 06:19

/src/cmark/src/inlines.c
Line
Count
Source (jump to first uncovered line)
1
#include <stdlib.h>
2
#include <string.h>
3
#include <stdio.h>
4
5
#include "cmark_ctype.h"
6
#include "config.h"
7
#include "node.h"
8
#include "parser.h"
9
#include "references.h"
10
#include "cmark.h"
11
#include "houdini.h"
12
#include "utf8.h"
13
#include "scanners.h"
14
#include "inlines.h"
15
16
static const char *EMDASH = "\xE2\x80\x94";
17
static const char *ENDASH = "\xE2\x80\x93";
18
static const char *ELLIPSES = "\xE2\x80\xA6";
19
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";
20
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D";
21
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";
22
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99";
23
24
// Macros for creating various kinds of simple.
25
23.8k
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
26
2.17M
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
27
285k
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
28
559k
#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)
29
30
1.00G
#define MAXBACKTICKS 1000
31
32
typedef struct delimiter {
33
  struct delimiter *previous;
34
  struct delimiter *next;
35
  cmark_node *inl_text;
36
  bufsize_t position;
37
  bufsize_t length;
38
  unsigned char delim_char;
39
  bool can_open;
40
  bool can_close;
41
} delimiter;
42
43
typedef struct bracket {
44
  struct bracket *previous;
45
  cmark_node *inl_text;
46
  bufsize_t position;
47
  bool image;
48
  bool active;
49
  bool bracket_after;
50
} bracket;
51
52
43.5k
#define FLAG_SKIP_HTML_CDATA        (1u << 0)
53
254k
#define FLAG_SKIP_HTML_DECLARATION  (1u << 1)
54
108k
#define FLAG_SKIP_HTML_PI           (1u << 2)
55
543k
#define FLAG_SKIP_HTML_COMMENT      (1u << 3)
56
57
typedef struct {
58
  cmark_mem *mem;
59
  cmark_chunk input;
60
  unsigned flags;
61
  int line;
62
  bufsize_t pos;
63
  int block_offset;
64
  int column_offset;
65
  cmark_reference_map *refmap;
66
  delimiter *last_delim;
67
  bracket *last_bracket;
68
  bufsize_t backticks[MAXBACKTICKS + 1];
69
  bool scanned_for_backticks;
70
  bool no_link_openers;
71
} subject;
72
73
13.4M
static CMARK_INLINE bool S_is_line_end_char(char c) {
74
13.4M
  return (c == '\n' || c == '\r');
75
13.4M
}
76
77
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
78
                                delimiter *closer);
79
80
static int parse_inline(subject *subj, cmark_node *parent, int options);
81
82
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
83
                             cmark_chunk *chunk, cmark_reference_map *refmap);
84
static bufsize_t subject_find_special_char(subject *subj, int options);
85
86
// Create an inline with a literal string value.
87
static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
88
30.8M
                                             int start_column, int end_column) {
89
30.8M
  cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
90
30.8M
  e->mem = subj->mem;
91
30.8M
  e->type = (uint16_t)t;
92
30.8M
  e->start_line = e->end_line = subj->line;
93
  // columns are 1 based.
94
30.8M
  e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
95
30.8M
  e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
96
30.8M
  return e;
97
30.8M
}
98
99
// Create an inline with no value.
100
2.92M
static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
101
2.92M
  cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));
102
2.92M
  e->mem = mem;
103
2.92M
  e->type = t;
104
2.92M
  return e;
105
2.92M
}
106
107
29.3M
static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) {
108
29.3M
  cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
109
29.3M
  e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
110
29.3M
  if (s.data != NULL) {
111
29.3M
    memcpy(e->data, s.data, s.len);
112
29.3M
  }
113
29.3M
  e->data[s.len] = 0;
114
29.3M
  e->len = s.len;
115
29.3M
  return e;
116
29.3M
}
117
118
static cmark_node *make_str_from_buf(subject *subj, int sc, int ec,
119
1.10M
                                     cmark_strbuf *buf) {
120
1.10M
  cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
121
1.10M
  e->len = buf->size;
122
1.10M
  e->data = cmark_strbuf_detach(buf);
123
1.10M
  return e;
124
1.10M
}
125
126
// Like make_str, but parses entities.
127
static cmark_node *make_str_with_entities(subject *subj,
128
                                          int start_column, int end_column,
129
90.3k
                                          cmark_chunk *content) {
130
90.3k
  cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
131
132
90.3k
  if (houdini_unescape_html(&unescaped, content->data, content->len)) {
133
2.06k
    return make_str_from_buf(subj, start_column, end_column, &unescaped);
134
88.2k
  } else {
135
88.2k
    return make_str(subj, start_column, end_column, *content);
136
88.2k
  }
137
90.3k
}
138
139
// Like cmark_node_append_child but without costly sanity checks.
140
// Assumes that child was newly created.
141
37.1M
static void append_child(cmark_node *node, cmark_node *child) {
142
37.1M
  cmark_node *old_last_child = node->last_child;
143
144
37.1M
  child->next = NULL;
145
37.1M
  child->prev = old_last_child;
146
37.1M
  child->parent = node;
147
37.1M
  node->last_child = child;
148
149
37.1M
  if (old_last_child) {
150
35.6M
    old_last_child->next = child;
151
35.6M
  } else {
152
    // Also set first_child if node previously had no children.
153
1.52M
    node->first_child = child;
154
1.52M
  }
155
37.1M
}
156
157
// Duplicate a chunk by creating a copy of the buffer not by reusing the
158
// buffer like cmark_chunk_dup does.
159
387k
static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) {
160
387k
  if (src == NULL) {
161
190k
    return NULL;
162
190k
  }
163
197k
  size_t len = strlen((char *)src);
164
197k
  unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1);
165
197k
  memcpy(data, src, len + 1);
166
197k
  return data;
167
387k
}
168
169
static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
170
90.3k
                                           int is_email) {
171
90.3k
  cmark_strbuf buf = CMARK_BUF_INIT(mem);
172
173
90.3k
  cmark_chunk_trim(url);
174
175
90.3k
  if (is_email)
176
72.6k
    cmark_strbuf_puts(&buf, "mailto:");
177
178
90.3k
  houdini_unescape_html_f(&buf, url->data, url->len);
179
90.3k
  return cmark_strbuf_detach(&buf);
180
90.3k
}
181
182
static CMARK_INLINE cmark_node *make_autolink(subject *subj,
183
                                              int start_column, int end_column,
184
90.3k
                                              cmark_chunk url, int is_email) {
185
90.3k
  cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);
186
90.3k
  link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
187
90.3k
  link->as.link.title = NULL;
188
90.3k
  link->start_line = link->end_line = subj->line;
189
90.3k
  link->start_column = start_column + 1;
190
90.3k
  link->end_column = end_column + 1;
191
90.3k
  append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
192
90.3k
  return link;
193
90.3k
}
194
195
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
196
1.00M
                             cmark_chunk *chunk, cmark_reference_map *refmap) {
197
1.00M
  int i;
198
1.00M
  e->mem = mem;
199
1.00M
  e->input = *chunk;
200
1.00M
  e->flags = 0;
201
1.00M
  e->line = line_number;
202
1.00M
  e->pos = 0;
203
1.00M
  e->block_offset = block_offset;
204
1.00M
  e->column_offset = 0;
205
1.00M
  e->refmap = refmap;
206
1.00M
  e->last_delim = NULL;
207
1.00M
  e->last_bracket = NULL;
208
1.00G
  for (i = 0; i <= MAXBACKTICKS; i++) {
209
1.00G
    e->backticks[i] = 0;
210
1.00G
  }
211
1.00M
  e->scanned_for_backticks = false;
212
1.00M
  e->no_link_openers = true;
213
1.00M
}
214
215
2.26M
static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
216
217
255M
static CMARK_INLINE unsigned char peek_char(subject *subj) {
218
  // NULL bytes should have been stripped out by now.  If they're
219
  // present, it's a programming error:
220
255M
  assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0));
221
255M
  return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0;
222
255M
}
223
224
10.3M
static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) {
225
10.3M
  return subj->input.data[pos];
226
10.3M
}
227
228
// Return true if there are more characters in the subject.
229
35.8M
static CMARK_INLINE int is_eof(subject *subj) {
230
35.8M
  return (subj->pos >= subj->input.len);
231
35.8M
}
232
233
// Advance the subject.  Doesn't check for eof.
234
208M
#define advance(subj) (subj)->pos += 1
235
236
2.48M
static CMARK_INLINE bool skip_spaces(subject *subj) {
237
2.48M
  bool skipped = false;
238
3.19M
  while (peek_char(subj) == ' ' || peek_char(subj) == '\t') {
239
711k
    advance(subj);
240
711k
    skipped = true;
241
711k
  }
242
2.48M
  return skipped;
243
2.48M
}
244
245
580k
static CMARK_INLINE bool skip_line_end(subject *subj) {
246
580k
  bool seen_line_end_char = false;
247
580k
  if (peek_char(subj) == '\r') {
248
0
    advance(subj);
249
0
    seen_line_end_char = true;
250
0
  }
251
580k
  if (peek_char(subj) == '\n') {
252
168k
    advance(subj);
253
168k
    seen_line_end_char = true;
254
168k
  }
255
580k
  return seen_line_end_char || is_eof(subj);
256
580k
}
257
258
// Take characters while a predicate holds, and return a string.
259
315k
static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) {
260
315k
  unsigned char c;
261
315k
  bufsize_t startpos = subj->pos;
262
315k
  bufsize_t len = 0;
263
264
2.27M
  while ((c = peek_char(subj)) && (*f)(c)) {
265
1.96M
    advance(subj);
266
1.96M
    len++;
267
1.96M
  }
268
269
315k
  return cmark_chunk_dup(&subj->input, startpos, len);
270
315k
}
271
272
// Return the number of newlines in a given span of text in a subject.  If
273
// the number is greater than zero, also return the number of characters
274
// between the last newline and the end of the span in `since_newline`.
275
211k
static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) {
276
211k
  int nls = 0;
277
211k
  int since_nl = 0;
278
279
93.9M
  while (len--) {
280
93.6M
    if (subj->input.data[from++] == '\n') {
281
182k
      ++nls;
282
182k
      since_nl = 0;
283
93.5M
    } else {
284
93.5M
      ++since_nl;
285
93.5M
    }
286
93.6M
  }
287
288
211k
  if (!nls)
289
175k
    return 0;
290
291
35.2k
  *since_newline = since_nl;
292
35.2k
  return nls;
293
211k
}
294
295
// Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and
296
// `column_offset` according to the number of newlines in a just-matched span
297
// of text in `subj`.
298
443k
static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) {
299
443k
  if (!(options & CMARK_OPT_SOURCEPOS)) {
300
232k
    return;
301
232k
  }
302
303
211k
  int since_newline;
304
211k
  int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline);
305
211k
  if (newlines) {
306
35.2k
    subj->line += newlines;
307
35.2k
    node->end_line += newlines;
308
35.2k
    node->end_column = since_newline;
309
35.2k
    subj->column_offset = -subj->pos + since_newline + extra;
310
35.2k
  }
311
211k
}
312
313
// Try to process a backtick code span that began with a
314
// span of ticks of length openticklength length (already
315
// parsed).  Return 0 if you don't find matching closing
316
// backticks, otherwise return the position in the subject
317
// after the closing backticks.
318
static bufsize_t scan_to_closing_backticks(subject *subj,
319
315k
                                           bufsize_t openticklength) {
320
321
315k
  bool found = false;
322
315k
  if (openticklength > MAXBACKTICKS) {
323
    // we limit backtick string length because of the array subj->backticks:
324
254
    return 0;
325
254
  }
326
315k
  if (subj->scanned_for_backticks &&
327
315k
      subj->backticks[openticklength] <= subj->pos) {
328
    // return if we already know there's no closer
329
70.7k
    return 0;
330
70.7k
  }
331
902k
  while (!found) {
332
    // read non backticks
333
902k
    unsigned char c;
334
146M
    while ((c = peek_char(subj)) && c != '`') {
335
145M
      advance(subj);
336
145M
    }
337
902k
    if (is_eof(subj)) {
338
97.2k
      break;
339
97.2k
    }
340
805k
    bufsize_t numticks = 0;
341
7.34M
    while (peek_char(subj) == '`') {
342
6.53M
      advance(subj);
343
6.53M
      numticks++;
344
6.53M
    }
345
    // store position of ender
346
805k
    if (numticks <= MAXBACKTICKS) {
347
805k
      subj->backticks[numticks] = subj->pos - numticks;
348
805k
    }
349
805k
    if (numticks == openticklength) {
350
147k
      return (subj->pos);
351
147k
    }
352
805k
  }
353
  // got through whole input without finding closer
354
97.2k
  subj->scanned_for_backticks = true;
355
97.2k
  return 0;
356
244k
}
357
358
// Destructively modify string, converting newlines to
359
// spaces, then removing a single leading + trailing space,
360
// unless the code span consists entirely of space characters.
361
147k
static void S_normalize_code(cmark_strbuf *s) {
362
147k
  bufsize_t r, w;
363
147k
  bool contains_nonspace = false;
364
365
100M
  for (r = 0, w = 0; r < s->size; ++r) {
366
100M
    switch (s->ptr[r]) {
367
0
    case '\r':
368
0
      if (s->ptr[r + 1] != '\n') {
369
0
        s->ptr[w++] = ' ';
370
0
      }
371
0
      break;
372
374k
    case '\n':
373
374k
      s->ptr[w++] = ' ';
374
374k
      break;
375
99.6M
    default:
376
99.6M
      s->ptr[w++] = s->ptr[r];
377
100M
    }
378
100M
    if (s->ptr[r] != ' ') {
379
99.2M
      contains_nonspace = true;
380
99.2M
    }
381
100M
  }
382
383
  // begins and ends with space?
384
147k
  if (contains_nonspace &&
385
147k
      s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
386
4.71k
    cmark_strbuf_drop(s, 1);
387
4.71k
    cmark_strbuf_truncate(s, w - 2);
388
142k
  } else {
389
142k
    cmark_strbuf_truncate(s, w);
390
142k
  }
391
392
147k
}
393
394
395
// Parse backtick code section or raw backticks, return an inline.
396
// Assumes that the subject has a backtick at the current position.
397
315k
static cmark_node *handle_backticks(subject *subj, int options) {
398
315k
  bufsize_t initpos = subj->pos;
399
315k
  cmark_chunk openticks = take_while(subj, isbacktick);
400
315k
  bufsize_t startpos = subj->pos;
401
315k
  bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
402
403
315k
  if (endpos == 0) {      // not found
404
168k
    subj->pos = startpos; // rewind
405
168k
    return make_str(subj, initpos, initpos + openticks.len - 1, openticks);
406
168k
  } else {
407
147k
    cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
408
409
147k
    cmark_strbuf_set(&buf, subj->input.data + startpos,
410
147k
                     endpos - startpos - openticks.len);
411
147k
    S_normalize_code(&buf);
412
413
147k
    cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos,
414
147k
                                    endpos - openticks.len - 1);
415
147k
    node->len = buf.size;
416
147k
    node->data = cmark_strbuf_detach(&buf);
417
147k
    adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
418
147k
    return node;
419
147k
  }
420
315k
}
421
422
423
// Scan ***, **, or * and return number scanned, or 0.
424
// Advances position.
425
static int scan_delims(subject *subj, unsigned char c, bool *can_open,
426
2.10M
                       bool *can_close) {
427
2.10M
  int numdelims = 0;
428
2.10M
  bufsize_t before_char_pos;
429
2.10M
  int32_t after_char = 0;
430
2.10M
  int32_t before_char = 0;
431
2.10M
  int len;
432
2.10M
  bool left_flanking, right_flanking;
433
434
2.10M
  if (subj->pos == 0) {
435
12.8k
    before_char = 10;
436
2.09M
  } else {
437
2.09M
    before_char_pos = subj->pos - 1;
438
    // walk back to the beginning of the UTF_8 sequence:
439
3.53M
    while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) {
440
1.43M
      before_char_pos -= 1;
441
1.43M
    }
442
2.09M
    len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
443
2.09M
                                 subj->pos - before_char_pos, &before_char);
444
2.09M
    if (len == -1) {
445
88.0k
      before_char = 10;
446
88.0k
    }
447
2.09M
  }
448
449
2.10M
  if (c == '\'' || c == '"') {
450
1.06M
    numdelims++;
451
1.06M
    advance(subj); // limit to 1 delim for quotes
452
1.06M
  } else {
453
3.02M
    while (peek_char(subj) == c) {
454
1.97M
      numdelims++;
455
1.97M
      advance(subj);
456
1.97M
    }
457
1.04M
  }
458
459
2.10M
  len = cmark_utf8proc_iterate(subj->input.data + subj->pos,
460
2.10M
                               subj->input.len - subj->pos, &after_char);
461
2.10M
  if (len == -1) {
462
246k
    after_char = 10;
463
246k
  }
464
2.10M
  left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
465
2.10M
                  (!cmark_utf8proc_is_punctuation(after_char) ||
466
1.79M
                   cmark_utf8proc_is_space(before_char) ||
467
1.79M
                   cmark_utf8proc_is_punctuation(before_char));
468
2.10M
  right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
469
2.10M
                   (!cmark_utf8proc_is_punctuation(before_char) ||
470
1.74M
                    cmark_utf8proc_is_space(after_char) ||
471
1.74M
                    cmark_utf8proc_is_punctuation(after_char));
472
2.10M
  if (c == '_') {
473
442k
    *can_open = left_flanking &&
474
442k
                (!right_flanking || cmark_utf8proc_is_punctuation(before_char));
475
442k
    *can_close = right_flanking &&
476
442k
                 (!left_flanking || cmark_utf8proc_is_punctuation(after_char));
477
1.66M
  } else if (c == '\'' || c == '"') {
478
1.06M
    *can_open = left_flanking &&
479
1.06M
         (!right_flanking || before_char == '(' || before_char == '[') &&
480
1.06M
         before_char != ']' && before_char != ')';
481
1.06M
    *can_close = right_flanking;
482
1.06M
  } else {
483
600k
    *can_open = left_flanking;
484
600k
    *can_close = right_flanking;
485
600k
  }
486
2.10M
  return numdelims;
487
2.10M
}
488
489
/*
490
static void print_delimiters(subject *subj)
491
{
492
        delimiter *delim;
493
        delim = subj->last_delim;
494
        while (delim != NULL) {
495
                printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n",
496
                       (void*)delim, delim->delim_char,
497
                       delim->can_open, delim->can_close,
498
                       (void*)delim->next, (void*)delim->previous);
499
                delim = delim->previous;
500
        }
501
}
502
*/
503
504
1.78M
static void remove_delimiter(subject *subj, delimiter *delim) {
505
1.78M
  if (delim == NULL)
506
0
    return;
507
1.78M
  if (delim->next == NULL) {
508
    // end of list:
509
424k
    assert(delim == subj->last_delim);
510
424k
    subj->last_delim = delim->previous;
511
1.36M
  } else {
512
1.36M
    delim->next->previous = delim->previous;
513
1.36M
  }
514
1.78M
  if (delim->previous != NULL) {
515
828k
    delim->previous->next = delim->next;
516
828k
  }
517
1.78M
  subj->mem->free(delim);
518
1.78M
}
519
520
2.26M
static void pop_bracket(subject *subj) {
521
2.26M
  bracket *b;
522
2.26M
  if (subj->last_bracket == NULL)
523
0
    return;
524
2.26M
  b = subj->last_bracket;
525
2.26M
  subj->last_bracket = subj->last_bracket->previous;
526
2.26M
  subj->mem->free(b);
527
2.26M
}
528
529
static void push_delimiter(subject *subj, unsigned char c, bool can_open,
530
1.78M
                           bool can_close, cmark_node *inl_text) {
531
1.78M
  delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter));
532
1.78M
  delim->delim_char = c;
533
1.78M
  delim->can_open = can_open;
534
1.78M
  delim->can_close = can_close;
535
1.78M
  delim->inl_text = inl_text;
536
1.78M
  delim->position = subj->pos;
537
1.78M
  delim->length = inl_text->len;
538
1.78M
  delim->previous = subj->last_delim;
539
1.78M
  delim->next = NULL;
540
1.78M
  if (delim->previous != NULL) {
541
1.58M
    delim->previous->next = delim;
542
1.58M
  }
543
1.78M
  subj->last_delim = delim;
544
1.78M
}
545
546
2.26M
static void push_bracket(subject *subj, bool image, cmark_node *inl_text) {
547
2.26M
  bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket));
548
2.26M
  if (subj->last_bracket != NULL) {
549
1.38M
    subj->last_bracket->bracket_after = true;
550
1.38M
  }
551
2.26M
  b->image = image;
552
2.26M
  b->active = true;
553
2.26M
  b->inl_text = inl_text;
554
2.26M
  b->previous = subj->last_bracket;
555
2.26M
  b->position = subj->pos;
556
2.26M
  b->bracket_after = false;
557
2.26M
  subj->last_bracket = b;
558
2.26M
  if (!image) {
559
2.20M
    subj->no_link_openers = false;
560
2.20M
  }
561
2.26M
}
562
563
// Assumes the subject has a c at the current position.
564
2.10M
static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
565
2.10M
  bufsize_t numdelims;
566
2.10M
  cmark_node *inl_text;
567
2.10M
  bool can_open, can_close;
568
2.10M
  cmark_chunk contents;
569
570
2.10M
  numdelims = scan_delims(subj, c, &can_open, &can_close);
571
572
2.10M
  if (c == '\'' && smart) {
573
233k
    contents = cmark_chunk_literal(RIGHTSINGLEQUOTE);
574
1.87M
  } else if (c == '"' && smart) {
575
686k
    contents =
576
686k
        cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);
577
1.18M
  } else {
578
1.18M
    contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
579
1.18M
  }
580
581
2.10M
  inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);
582
583
2.10M
  if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
584
1.78M
    push_delimiter(subj, c, can_open, can_close, inl_text);
585
1.78M
  }
586
587
2.10M
  return inl_text;
588
2.10M
}
589
590
// Assumes we have a hyphen at the current position.
591
4.44M
static cmark_node *handle_hyphen(subject *subj, bool smart) {
592
4.44M
  int startpos = subj->pos;
593
594
4.44M
  advance(subj);
595
596
4.44M
  if (!smart || peek_char(subj) != '-') {
597
3.38M
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-"));
598
3.38M
  }
599
600
4.81M
  while (smart && peek_char(subj) == '-') {
601
3.75M
    advance(subj);
602
3.75M
  }
603
604
1.06M
  int numhyphens = subj->pos - startpos;
605
1.06M
  int en_count = 0;
606
1.06M
  int em_count = 0;
607
1.06M
  int i;
608
1.06M
  cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
609
610
1.06M
  if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes
611
320k
    em_count = numhyphens / 3;
612
745k
  } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes
613
658k
    en_count = numhyphens / 2;
614
658k
  } else if (numhyphens % 3 == 2) { // use one en dash at end
615
65.1k
    en_count = 1;
616
65.1k
    em_count = (numhyphens - 2) / 3;
617
65.1k
  } else { // use two en dashes at the end
618
21.9k
    en_count = 2;
619
21.9k
    em_count = (numhyphens - 4) / 3;
620
21.9k
  }
621
622
1.83M
  for (i = em_count; i > 0; i--) {
623
766k
    cmark_strbuf_puts(&buf, EMDASH);
624
766k
  }
625
626
2.32M
  for (i = en_count; i > 0; i--) {
627
1.25M
    cmark_strbuf_puts(&buf, ENDASH);
628
1.25M
  }
629
630
1.06M
  return make_str_from_buf(subj, startpos, subj->pos - 1, &buf);
631
4.44M
}
632
633
// Assumes we have a period at the current position.
634
253k
static cmark_node *handle_period(subject *subj, bool smart) {
635
253k
  advance(subj);
636
253k
  if (smart && peek_char(subj) == '.') {
637
58.4k
    advance(subj);
638
58.4k
    if (peek_char(subj) == '.') {
639
54.8k
      advance(subj);
640
54.8k
      return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));
641
54.8k
    } else {
642
3.60k
      return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".."));
643
3.60k
    }
644
195k
  } else {
645
195k
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("."));
646
195k
  }
647
253k
}
648
649
1.06M
static void process_emphasis(subject *subj, bufsize_t stack_bottom) {
650
1.06M
  delimiter *candidate;
651
1.06M
  delimiter *closer = NULL;
652
1.06M
  delimiter *opener;
653
1.06M
  delimiter *old_closer;
654
1.06M
  bool opener_found;
655
1.06M
  int openers_bottom_index = 0;
656
1.06M
  bufsize_t openers_bottom[9] = {stack_bottom, stack_bottom, stack_bottom,
657
1.06M
                                 stack_bottom, stack_bottom, stack_bottom,
658
1.06M
                                 stack_bottom, stack_bottom, stack_bottom};
659
660
  // move back to first relevant delim.
661
1.06M
  candidate = subj->last_delim;
662
2.84M
  while (candidate != NULL && candidate->position >= stack_bottom) {
663
1.78M
    closer = candidate;
664
1.78M
    candidate = candidate->previous;
665
1.78M
  }
666
667
  // now move forward, looking for closers, and handling each
668
2.98M
  while (closer != NULL) {
669
1.92M
    if (closer->can_close) {
670
1.50M
      switch (closer->delim_char) {
671
468k
      case '"':
672
468k
        openers_bottom_index = 0;
673
468k
        break;
674
163k
      case '\'':
675
163k
        openers_bottom_index = 1;
676
163k
        break;
677
367k
      case '_':
678
367k
        openers_bottom_index = 2;
679
367k
        break;
680
501k
      case '*':
681
501k
        openers_bottom_index = 3 +
682
501k
                (closer->can_open ? 3 : 0) + (closer->length % 3);
683
501k
        break;
684
0
      default:
685
0
        assert(false);
686
1.50M
      }
687
688
      // Now look backwards for first matching opener:
689
1.50M
      opener = closer->previous;
690
1.50M
      opener_found = false;
691
1.98M
      while (opener != NULL &&
692
1.98M
             opener->position >= openers_bottom[openers_bottom_index]) {
693
972k
        if (opener->can_open && opener->delim_char == closer->delim_char) {
694
          // interior closer of size 2 can't match opener of size 1
695
          // or of size 1 can't match 2
696
501k
          if (!(closer->can_open || opener->can_close) ||
697
501k
              closer->length % 3 == 0 ||
698
501k
              (opener->length + closer->length) % 3 != 0) {
699
491k
            opener_found = true;
700
491k
            break;
701
491k
          }
702
501k
        }
703
480k
        opener = opener->previous;
704
480k
      }
705
1.50M
      old_closer = closer;
706
1.50M
      if (closer->delim_char == '*' || closer->delim_char == '_') {
707
868k
        if (opener_found) {
708
422k
          closer = S_insert_emph(subj, opener, closer);
709
446k
        } else {
710
446k
          closer = closer->next;
711
446k
        }
712
868k
      } else if (closer->delim_char == '\'' || closer->delim_char == '"') {
713
631k
        if (closer->delim_char == '\'') {
714
163k
          cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);
715
468k
        } else {
716
468k
          cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);
717
468k
        }
718
631k
        closer = closer->next;
719
631k
        if (opener_found) {
720
69.2k
          if (old_closer->delim_char == '\'') {
721
20.2k
            cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);
722
49.0k
          } else {
723
49.0k
            cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);
724
49.0k
          }
725
69.2k
          remove_delimiter(subj, opener);
726
69.2k
          remove_delimiter(subj, old_closer);
727
69.2k
        }
728
631k
      }
729
1.50M
      if (!opener_found) {
730
        // set lower bound for future searches for openers
731
1.00M
        openers_bottom[openers_bottom_index] = old_closer->position;
732
1.00M
        if (!old_closer->can_open) {
733
          // we can remove a closer that can't be an
734
          // opener, once we've seen there's no
735
          // matching opener:
736
735k
          remove_delimiter(subj, old_closer);
737
735k
        }
738
1.00M
      }
739
1.50M
    } else {
740
423k
      closer = closer->next;
741
423k
    }
742
1.92M
  }
743
  // free all delimiters in list until stack_bottom:
744
1.36M
  while (subj->last_delim != NULL &&
745
1.36M
         subj->last_delim->position >= stack_bottom) {
746
305k
    remove_delimiter(subj, subj->last_delim);
747
305k
  }
748
1.06M
}
749
750
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
751
422k
                                delimiter *closer) {
752
422k
  delimiter *delim, *tmp_delim;
753
422k
  bufsize_t use_delims;
754
422k
  cmark_node *opener_inl = opener->inl_text;
755
422k
  cmark_node *closer_inl = closer->inl_text;
756
422k
  bufsize_t opener_num_chars = opener_inl->len;
757
422k
  bufsize_t closer_num_chars = closer_inl->len;
758
422k
  cmark_node *tmp, *tmpnext, *emph;
759
760
  // calculate the actual number of characters used from this closer
761
422k
  use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;
762
763
  // remove used characters from associated inlines.
764
422k
  opener_num_chars -= use_delims;
765
422k
  closer_num_chars -= use_delims;
766
422k
  opener_inl->len = opener_num_chars;
767
422k
  opener_inl->data[opener_num_chars] = 0;
768
422k
  closer_inl->len = closer_num_chars;
769
422k
  closer_inl->data[closer_num_chars] = 0;
770
771
  // free delimiters between opener and closer
772
422k
  delim = closer->previous;
773
460k
  while (delim != NULL && delim != opener) {
774
38.0k
    tmp_delim = delim->previous;
775
38.0k
    remove_delimiter(subj, delim);
776
38.0k
    delim = tmp_delim;
777
38.0k
  }
778
779
  // create new emph or strong, and splice it in to our inlines
780
  // between the opener and closer
781
422k
  emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);
782
783
422k
  tmp = opener_inl->next;
784
3.98M
  while (tmp && tmp != closer_inl) {
785
3.55M
    tmpnext = tmp->next;
786
3.55M
    cmark_node_unlink(tmp);
787
3.55M
    append_child(emph, tmp);
788
3.55M
    tmp = tmpnext;
789
3.55M
  }
790
422k
  cmark_node_insert_after(opener_inl, emph);
791
792
422k
  emph->start_line = opener_inl->start_line;
793
422k
  emph->end_line = closer_inl->end_line;
794
422k
  emph->start_column = opener_inl->start_column;
795
422k
  emph->end_column = closer_inl->end_column;
796
797
  // if opener has 0 characters, remove it and its associated inline
798
422k
  if (opener_num_chars == 0) {
799
284k
    cmark_node_free(opener_inl);
800
284k
    remove_delimiter(subj, opener);
801
284k
  }
802
803
  // if closer has 0 characters, remove it and its associated inline
804
422k
  if (closer_num_chars == 0) {
805
    // remove empty closer inline
806
282k
    cmark_node_free(closer_inl);
807
    // remove closer from list
808
282k
    tmp_delim = closer->next;
809
282k
    remove_delimiter(subj, closer);
810
282k
    closer = tmp_delim;
811
282k
  }
812
813
422k
  return closer;
814
422k
}
815
816
// Parse backslash-escape or just a backslash, returning an inline.
817
628k
static cmark_node *handle_backslash(subject *subj) {
818
628k
  advance(subj);
819
628k
  unsigned char nextchar = peek_char(subj);
820
628k
  if (cmark_ispunct(
821
628k
          nextchar)) { // only ascii symbols and newline can be escaped
822
278k
    advance(subj);
823
278k
    return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
824
350k
  } else if (!is_eof(subj) && skip_line_end(subj)) {
825
21.6k
    return make_linebreak(subj->mem);
826
329k
  } else {
827
329k
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
828
329k
  }
829
628k
}
830
831
// Parse an entity or a regular "&" string.
832
// Assumes the subject has an '&' character at the current position.
833
514k
static cmark_node *handle_entity(subject *subj) {
834
514k
  cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);
835
514k
  bufsize_t len;
836
837
514k
  advance(subj);
838
839
514k
  len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,
840
514k
                             subj->input.len - subj->pos);
841
842
514k
  if (len <= 0)
843
473k
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
844
845
41.0k
  subj->pos += len;
846
41.0k
  return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent);
847
514k
}
848
849
// Clean a URL: remove surrounding whitespace, and remove \ that escape
850
// punctuation.
851
88.0k
unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
852
88.0k
  cmark_strbuf buf = CMARK_BUF_INIT(mem);
853
854
88.0k
  cmark_chunk_trim(url);
855
856
88.0k
  houdini_unescape_html_f(&buf, url->data, url->len);
857
858
88.0k
  cmark_strbuf_unescape(&buf);
859
88.0k
  return cmark_strbuf_detach(&buf);
860
88.0k
}
861
862
88.0k
unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) {
863
88.0k
  cmark_strbuf buf = CMARK_BUF_INIT(mem);
864
88.0k
  unsigned char first, last;
865
866
88.0k
  if (title->len == 0) {
867
80.7k
    return NULL;
868
80.7k
  }
869
870
7.28k
  first = title->data[0];
871
7.28k
  last = title->data[title->len - 1];
872
873
  // remove surrounding quotes if any:
874
7.28k
  if ((first == '\'' && last == '\'') || (first == '(' && last == ')') ||
875
7.28k
      (first == '"' && last == '"')) {
876
7.28k
    houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
877
7.28k
  } else {
878
0
    houdini_unescape_html_f(&buf, title->data, title->len);
879
0
  }
880
881
7.28k
  cmark_strbuf_unescape(&buf);
882
7.28k
  return cmark_strbuf_detach(&buf);
883
88.0k
}
884
885
// Parse an autolink or HTML tag.
886
// Assumes the subject has a '<' character at the current position.
887
3.95M
static cmark_node *handle_pointy_brace(subject *subj, int options) {
888
3.95M
  bufsize_t matchlen = 0;
889
3.95M
  cmark_chunk contents;
890
891
3.95M
  advance(subj); // advance past first <
892
893
  // first try to match a URL autolink
894
3.95M
  matchlen = scan_autolink_uri(&subj->input, subj->pos);
895
3.95M
  if (matchlen > 0) {
896
17.6k
    contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
897
17.6k
    subj->pos += matchlen;
898
899
17.6k
    return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);
900
17.6k
  }
901
902
  // next try to match an email autolink
903
3.93M
  matchlen = scan_autolink_email(&subj->input, subj->pos);
904
3.93M
  if (matchlen > 0) {
905
72.6k
    contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
906
72.6k
    subj->pos += matchlen;
907
908
72.6k
    return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);
909
72.6k
  }
910
911
  // finally, try to match an html tag
912
3.86M
  if (subj->pos + 2 <= subj->input.len) {
913
3.84M
    int c = subj->input.data[subj->pos];
914
3.84M
    if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) {
915
321k
      c = subj->input.data[subj->pos+1];
916
321k
      if (c == '-' && subj->input.data[subj->pos+2] == '-') {
917
37.9k
  if (subj->input.data[subj->pos+3] == '>') {
918
693
    matchlen = 4;
919
37.2k
  } else if (subj->input.data[subj->pos+3] == '-' &&
920
37.2k
                   subj->input.data[subj->pos+4] == '>') {
921
273
          matchlen = 5;
922
37.0k
        } else {
923
37.0k
          matchlen = scan_html_comment(&subj->input, subj->pos + 1);
924
37.0k
          if (matchlen > 0) {
925
286
            matchlen += 1; // prefix "<"
926
36.7k
    } else { // no match through end of input: set a flag so
927
       // we don't reparse looking for -->:
928
36.7k
      subj->flags |= FLAG_SKIP_HTML_COMMENT;
929
36.7k
    }
930
37.0k
  }
931
283k
      } else if (c == '[') {
932
40.6k
        if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) {
933
36.8k
          matchlen = scan_html_cdata(&subj->input, subj->pos + 2);
934
36.8k
          if (matchlen > 0) {
935
            // The regex doesn't require the final "]]>". But if we're not at
936
            // the end of input, it must come after the match. Otherwise,
937
            // disable subsequent scans to avoid quadratic behavior.
938
17.9k
            matchlen += 5; // prefix "![", suffix "]]>"
939
17.9k
            if (subj->pos + matchlen > subj->input.len) {
940
2.80k
              subj->flags |= FLAG_SKIP_HTML_CDATA;
941
2.80k
              matchlen = 0;
942
2.80k
            }
943
17.9k
          }
944
36.8k
        }
945
242k
      } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) {
946
230k
        matchlen = scan_html_declaration(&subj->input, subj->pos + 1);
947
230k
        if (matchlen > 0) {
948
48.1k
          matchlen += 2; // prefix "!", suffix ">"
949
48.1k
          if (subj->pos + matchlen > subj->input.len) {
950
12.3k
            subj->flags |= FLAG_SKIP_HTML_DECLARATION;
951
12.3k
            matchlen = 0;
952
12.3k
          }
953
48.1k
        }
954
230k
      }
955
3.51M
    } else if (c == '?') {
956
98.3k
      if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) {
957
        // Note that we allow an empty match.
958
76.2k
        matchlen = scan_html_pi(&subj->input, subj->pos + 1);
959
76.2k
        matchlen += 3; // prefix "?", suffix "?>"
960
76.2k
        if (subj->pos + matchlen > subj->input.len) {
961
10.0k
          subj->flags |= FLAG_SKIP_HTML_PI;
962
10.0k
          matchlen = 0;
963
10.0k
        }
964
76.2k
      }
965
3.42M
    } else {
966
3.42M
      matchlen = scan_html_tag(&subj->input, subj->pos);
967
3.42M
    }
968
3.84M
  }
969
3.86M
  if (matchlen > 0) {
970
296k
    const unsigned char *src = subj->input.data + subj->pos - 1;
971
296k
    bufsize_t len = matchlen + 1;
972
296k
    subj->pos += matchlen;
973
296k
    cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE,
974
296k
                                    subj->pos - matchlen - 1, subj->pos - 1);
975
296k
    node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1);
976
296k
    memcpy(node->data, src, len);
977
296k
    node->data[len] = 0;
978
296k
    node->len = len;
979
296k
    adjust_subj_node_newlines(subj, node, matchlen, 1, options);
980
296k
    return node;
981
296k
  }
982
983
  // if nothing matches, just return the opening <:
984
3.56M
  return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<"));
985
3.86M
}
986
987
// Parse a link label.  Returns 1 if successful.
988
// Note:  unescaped brackets are not allowed in labels.
989
// The label begins with `[` and ends with the first `]` character
990
// encountered.  Backticks in labels do not start code spans.
991
1.90M
static int link_label(subject *subj, cmark_chunk *raw_label) {
992
1.90M
  bufsize_t startpos = subj->pos;
993
1.90M
  int length = 0;
994
1.90M
  unsigned char c;
995
996
  // advance past [
997
1.90M
  if (peek_char(subj) == '[') {
998
661k
    advance(subj);
999
1.24M
  } else {
1000
1.24M
    return 0;
1001
1.24M
  }
1002
1003
28.2M
  while ((c = peek_char(subj)) && c != '[' && c != ']') {
1004
27.5M
    if (c == '\\') {
1005
41.9k
      advance(subj);
1006
41.9k
      length++;
1007
41.9k
      if (cmark_ispunct(peek_char(subj))) {
1008
19.9k
        advance(subj);
1009
19.9k
        length++;
1010
19.9k
      }
1011
27.5M
    } else {
1012
27.5M
      advance(subj);
1013
27.5M
      length++;
1014
27.5M
    }
1015
27.5M
    if (length > MAX_LINK_LABEL_LENGTH) {
1016
1.91k
      goto noMatch;
1017
1.91k
    }
1018
27.5M
  }
1019
1020
659k
  if (c == ']') { // match found
1021
487k
    *raw_label =
1022
487k
        cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
1023
487k
    cmark_chunk_trim(raw_label);
1024
487k
    advance(subj); // advance past ]
1025
487k
    return 1;
1026
487k
  }
1027
1028
173k
noMatch:
1029
173k
  subj->pos = startpos; // rewind
1030
173k
  return 0;
1031
659k
}
1032
1033
static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset,
1034
354k
                                        cmark_chunk *output) {
1035
354k
  bufsize_t i = offset;
1036
354k
  size_t nb_p = 0;
1037
1038
39.3M
  while (i < input->len) {
1039
39.3M
    if (input->data[i] == '\\' &&
1040
39.3M
        i + 1 < input-> len &&
1041
39.3M
        cmark_ispunct(input->data[i+1]))
1042
6.01k
      i += 2;
1043
39.3M
    else if (input->data[i] == '(') {
1044
66.5k
      ++nb_p;
1045
66.5k
      ++i;
1046
66.5k
      if (nb_p > 32)
1047
364
        return -1;
1048
39.2M
    } else if (input->data[i] == ')') {
1049
26.5k
      if (nb_p == 0)
1050
16.8k
        break;
1051
9.71k
      --nb_p;
1052
9.71k
      ++i;
1053
39.2M
    } else if (cmark_isspace(input->data[i])) {
1054
332k
      if (i == offset) {
1055
633
        return -1;
1056
633
      }
1057
331k
      break;
1058
38.8M
    } else {
1059
38.8M
      ++i;
1060
38.8M
    }
1061
39.3M
  }
1062
1063
353k
  if (i >= input->len || nb_p != 0)
1064
20.8k
    return -1;
1065
1066
332k
  {
1067
332k
    cmark_chunk result = {input->data + offset, i - offset};
1068
332k
    *output = result;
1069
332k
  }
1070
332k
  return i - offset;
1071
353k
}
1072
1073
static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset,
1074
413k
                                      cmark_chunk *output) {
1075
413k
  bufsize_t i = offset;
1076
1077
413k
  if (i < input->len && input->data[i] == '<') {
1078
59.3k
    ++i;
1079
592k
    while (i < input->len) {
1080
590k
      if (input->data[i] == '>') {
1081
46.2k
        ++i;
1082
46.2k
        break;
1083
544k
      } else if (input->data[i] == '\\')
1084
5.33k
        i += 2;
1085
539k
      else if (input->data[i] == '\n' || input->data[i] == '<')
1086
11.0k
        return -1;
1087
528k
      else
1088
528k
        ++i;
1089
590k
    }
1090
354k
  } else {
1091
354k
    return manual_scan_link_url_2(input, offset, output);
1092
354k
  }
1093
1094
48.3k
  if (i >= input->len)
1095
2.12k
    return -1;
1096
1097
46.1k
  {
1098
46.1k
    cmark_chunk result = {input->data + offset + 1, i - 2 - offset};
1099
46.1k
    *output = result;
1100
46.1k
  }
1101
46.1k
  return i - offset;
1102
48.3k
}
1103
1104
// Return a link, an image, or a literal close bracket.
1105
2.30M
static cmark_node *handle_close_bracket(subject *subj) {
1106
2.30M
  bufsize_t initial_pos, after_link_text_pos;
1107
2.30M
  bufsize_t endurl, starttitle, endtitle, endall;
1108
2.30M
  bufsize_t sps, n;
1109
2.30M
  cmark_reference *ref = NULL;
1110
2.30M
  cmark_chunk url_chunk, title_chunk;
1111
2.30M
  unsigned char *url, *title;
1112
2.30M
  bracket *opener;
1113
2.30M
  cmark_node *inl;
1114
2.30M
  cmark_chunk raw_label;
1115
2.30M
  int found_label;
1116
2.30M
  cmark_node *tmp, *tmpnext;
1117
2.30M
  bool is_image;
1118
1119
2.30M
  advance(subj); // advance past ]
1120
2.30M
  initial_pos = subj->pos;
1121
1122
  // get last [ or ![
1123
2.30M
  opener = subj->last_bracket;
1124
1125
2.30M
  if (opener == NULL) {
1126
519k
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1127
519k
  }
1128
1129
  // If we got here, we matched a potential link/image text.
1130
  // Now we check to see if it's a link/image.
1131
1.78M
  is_image = opener->image;
1132
1133
1.78M
  if (!is_image && subj->no_link_openers) {
1134
    // take delimiter off stack
1135
17.8k
    pop_bracket(subj);
1136
17.8k
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1137
17.8k
  }
1138
1139
1.77M
  after_link_text_pos = subj->pos;
1140
1141
  // First, look for an inline link.
1142
1.77M
  if (peek_char(subj) == '(' &&
1143
1.77M
      ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
1144
1.77M
      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
1145
335k
                                 &url_chunk)) > -1)) {
1146
1147
    // try to parse an explicit link:
1148
303k
    endurl = subj->pos + 1 + sps + n;
1149
303k
    starttitle = endurl + scan_spacechars(&subj->input, endurl);
1150
1151
    // ensure there are spaces btw url and title
1152
303k
    endtitle = (starttitle == endurl)
1153
303k
                   ? starttitle
1154
303k
                   : starttitle + scan_link_title(&subj->input, starttitle);
1155
1156
303k
    endall = endtitle + scan_spacechars(&subj->input, endtitle);
1157
1158
303k
    if (peek_at(subj, endall) == ')') {
1159
20.3k
      subj->pos = endall + 1;
1160
1161
20.3k
      title_chunk =
1162
20.3k
          cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
1163
20.3k
      url = cmark_clean_url(subj->mem, &url_chunk);
1164
20.3k
      title = cmark_clean_title(subj->mem, &title_chunk);
1165
20.3k
      cmark_chunk_free(&url_chunk);
1166
20.3k
      cmark_chunk_free(&title_chunk);
1167
20.3k
      goto match;
1168
1169
282k
    } else {
1170
      // it could still be a shortcut reference link
1171
282k
      subj->pos = after_link_text_pos;
1172
282k
    }
1173
303k
  }
1174
1175
  // Next, look for a following [link label] that matches in refmap.
1176
  // skip spaces
1177
1.74M
  raw_label = cmark_chunk_literal("");
1178
1.74M
  found_label = link_label(subj, &raw_label);
1179
1.74M
  if (!found_label) {
1180
    // If we have a shortcut reference link, back up
1181
    // to before the spaces we skipped.
1182
1.38M
    subj->pos = initial_pos;
1183
1.38M
  }
1184
1185
1.74M
  if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
1186
1.30M
    cmark_chunk_free(&raw_label);
1187
1.30M
    raw_label = cmark_chunk_dup(&subj->input, opener->position,
1188
1.30M
                                initial_pos - opener->position - 1);
1189
1.30M
    found_label = true;
1190
1.30M
  }
1191
1192
1.74M
  if (found_label) {
1193
1.63M
    ref = cmark_reference_lookup(subj->refmap, &raw_label);
1194
1.63M
    cmark_chunk_free(&raw_label);
1195
1.63M
  }
1196
1197
1.74M
  if (ref != NULL) { // found
1198
193k
    url = cmark_strdup(subj->mem, ref->url);
1199
193k
    title = cmark_strdup(subj->mem, ref->title);
1200
193k
    goto match;
1201
1.55M
  } else {
1202
1.55M
    goto noMatch;
1203
1.55M
  }
1204
1205
1.55M
noMatch:
1206
  // If we fall through to here, it means we didn't match a link:
1207
1.55M
  pop_bracket(subj); // remove this opener from delimiter list
1208
1.55M
  subj->pos = initial_pos;
1209
1.55M
  return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1210
1211
213k
match:
1212
213k
  inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
1213
213k
  inl->as.link.url = url;
1214
213k
  inl->as.link.title = title;
1215
213k
  inl->start_line = inl->end_line = subj->line;
1216
213k
  inl->start_column = opener->inl_text->start_column;
1217
213k
  inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
1218
213k
  cmark_node_insert_before(opener->inl_text, inl);
1219
  // Add link text:
1220
213k
  tmp = opener->inl_text->next;
1221
630k
  while (tmp) {
1222
416k
    tmpnext = tmp->next;
1223
416k
    cmark_node_unlink(tmp);
1224
416k
    append_child(inl, tmp);
1225
416k
    tmp = tmpnext;
1226
416k
  }
1227
1228
  // Free the bracket [:
1229
213k
  cmark_node_free(opener->inl_text);
1230
1231
213k
  process_emphasis(subj, opener->position);
1232
213k
  pop_bracket(subj);
1233
1234
  // Now, if we have a link, we also want to deactivate links until
1235
  // we get a new opener. (This code can be removed if we decide to allow links
1236
  // inside links.)
1237
213k
  if (!is_image) {
1238
205k
    subj->no_link_openers = true;
1239
205k
  }
1240
1241
213k
  return NULL;
1242
1.74M
}
1243
1244
// Parse a hard or soft linebreak, returning an inline.
1245
// Assumes the subject has a cr or newline at the current position.
1246
2.17M
static cmark_node *handle_newline(subject *subj) {
1247
2.17M
  bufsize_t nlpos = subj->pos;
1248
  // skip over cr, crlf, or lf:
1249
2.17M
  if (peek_at(subj, subj->pos) == '\r') {
1250
0
    advance(subj);
1251
0
  }
1252
2.17M
  if (peek_at(subj, subj->pos) == '\n') {
1253
2.17M
    advance(subj);
1254
2.17M
  }
1255
2.17M
  ++subj->line;
1256
2.17M
  subj->column_offset = -subj->pos;
1257
  // skip spaces at beginning of line
1258
2.17M
  skip_spaces(subj);
1259
2.17M
  if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&
1260
2.17M
      peek_at(subj, nlpos - 2) == ' ') {
1261
2.22k
    return make_linebreak(subj->mem);
1262
2.17M
  } else {
1263
2.17M
    return make_softbreak(subj->mem);
1264
2.17M
  }
1265
2.17M
}
1266
1267
13.4M
static bufsize_t subject_find_special_char(subject *subj, int options) {
1268
  // "\r\n\\`&_*[]<!"
1269
13.4M
  static const int8_t SPECIAL_CHARS[256] = {
1270
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1271
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1272
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1273
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
1274
13.4M
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1275
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1276
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1277
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1278
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1279
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1280
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1281
1282
  // " ' . -
1283
13.4M
  static const char SMART_PUNCT_CHARS[] = {
1284
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1285
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
1286
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1287
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1288
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1289
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1290
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1291
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1292
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1293
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1294
13.4M
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1295
13.4M
  };
1296
1297
13.4M
  bufsize_t n = subj->pos + 1;
1298
1299
283M
  while (n < subj->input.len) {
1300
282M
    if (SPECIAL_CHARS[subj->input.data[n]])
1301
8.50M
      return n;
1302
274M
    if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]])
1303
4.36M
      return n;
1304
269M
    n++;
1305
269M
  }
1306
1307
593k
  return subj->input.len;
1308
13.4M
}
1309
1310
// Parse an inline, advancing subject, and add it as a child of parent.
1311
// Return 0 if no inline can be parsed, 1 otherwise.
1312
33.3M
static int parse_inline(subject *subj, cmark_node *parent, int options) {
1313
33.3M
  cmark_node *new_inl = NULL;
1314
33.3M
  cmark_chunk contents;
1315
33.3M
  unsigned char c;
1316
33.3M
  bufsize_t startpos, endpos;
1317
33.3M
  c = peek_char(subj);
1318
33.3M
  if (c == 0) {
1319
0
    return 0;
1320
0
  }
1321
33.3M
  switch (c) {
1322
0
  case '\r':
1323
2.17M
  case '\n':
1324
2.17M
    new_inl = handle_newline(subj);
1325
2.17M
    break;
1326
315k
  case '`':
1327
315k
    new_inl = handle_backticks(subj, options);
1328
315k
    break;
1329
628k
  case '\\':
1330
628k
    new_inl = handle_backslash(subj);
1331
628k
    break;
1332
514k
  case '&':
1333
514k
    new_inl = handle_entity(subj);
1334
514k
    break;
1335
3.95M
  case '<':
1336
3.95M
    new_inl = handle_pointy_brace(subj, options);
1337
3.95M
    break;
1338
600k
  case '*':
1339
1.04M
  case '_':
1340
1.38M
  case '\'':
1341
2.10M
  case '"':
1342
2.10M
    new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);
1343
2.10M
    break;
1344
4.44M
  case '-':
1345
4.44M
    new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);
1346
4.44M
    break;
1347
253k
  case '.':
1348
253k
    new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0);
1349
253k
    break;
1350
2.20M
  case '[':
1351
2.20M
    advance(subj);
1352
2.20M
    new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("["));
1353
2.20M
    push_bracket(subj, false, new_inl);
1354
2.20M
    break;
1355
2.30M
  case ']':
1356
2.30M
    new_inl = handle_close_bracket(subj);
1357
2.30M
    break;
1358
935k
  case '!':
1359
935k
    advance(subj);
1360
935k
    if (peek_char(subj) == '[') {
1361
60.6k
      advance(subj);
1362
60.6k
      new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("!["));
1363
60.6k
      push_bracket(subj, true, new_inl);
1364
874k
    } else {
1365
874k
      new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!"));
1366
874k
    }
1367
935k
    break;
1368
13.4M
  default:
1369
13.4M
    endpos = subject_find_special_char(subj, options);
1370
13.4M
    contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
1371
13.4M
    startpos = subj->pos;
1372
13.4M
    subj->pos = endpos;
1373
1374
    // if we're at a newline, strip trailing spaces.
1375
13.4M
    if (S_is_line_end_char(peek_char(subj))) {
1376
1.67M
      cmark_chunk_rtrim(&contents);
1377
1.67M
    }
1378
1379
13.4M
    new_inl = make_str(subj, startpos, endpos - 1, contents);
1380
33.3M
  }
1381
33.3M
  if (new_inl != NULL) {
1382
33.0M
    append_child(parent, new_inl);
1383
33.0M
  }
1384
1385
33.3M
  return 1;
1386
33.3M
}
1387
1388
// Parse inlines from parent's string_content, adding as children of parent.
1389
void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
1390
850k
                         cmark_reference_map *refmap, int options) {
1391
850k
  int internal_offset = parent->type == CMARK_NODE_HEADING ?
1392
766k
    parent->as.heading.internal_offset : 0;
1393
850k
  subject subj;
1394
850k
  cmark_chunk content = {parent->data, parent->len};
1395
850k
  subject_from_buf(mem, parent->start_line, parent->start_column - 1 + internal_offset, &subj, &content, refmap);
1396
850k
  cmark_chunk_rtrim(&subj.input);
1397
1398
34.1M
  while (!is_eof(&subj) && parse_inline(&subj, parent, options))
1399
33.3M
    ;
1400
1401
850k
  process_emphasis(&subj, 0);
1402
  // free bracket and delim stack
1403
850k
  while (subj.last_delim) {
1404
0
    remove_delimiter(&subj, subj.last_delim);
1405
0
  }
1406
1.32M
  while (subj.last_bracket) {
1407
479k
    pop_bracket(&subj);
1408
479k
  }
1409
850k
}
1410
1411
// Parse zero or more space characters, including at most one newline.
1412
153k
static void spnl(subject *subj) {
1413
153k
  skip_spaces(subj);
1414
153k
  if (skip_line_end(subj)) {
1415
78.7k
    skip_spaces(subj);
1416
78.7k
  }
1417
153k
}
1418
1419
// Parse reference.  Assumes string begins with '[' character.
1420
// Modify refmap if a reference is encountered.
1421
// Return 0 if no reference found, otherwise position of subject
1422
// after reference is parsed.
1423
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input,
1424
152k
                                       cmark_reference_map *refmap) {
1425
152k
  subject subj;
1426
1427
152k
  cmark_chunk lab;
1428
152k
  cmark_chunk url;
1429
152k
  cmark_chunk title;
1430
1431
152k
  bufsize_t matchlen = 0;
1432
152k
  bufsize_t beforetitle;
1433
1434
152k
  subject_from_buf(mem, -1, 0, &subj, input, NULL);
1435
1436
  // parse label:
1437
152k
  if (!link_label(&subj, &lab) || lab.len == 0)
1438
37.5k
    return 0;
1439
1440
  // colon:
1441
114k
  if (peek_char(&subj) == ':') {
1442
78.1k
    advance(&subj);
1443
78.1k
  } else {
1444
36.5k
    return 0;
1445
36.5k
  }
1446
1447
  // parse link url:
1448
78.1k
  spnl(&subj);
1449
78.1k
  if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) {
1450
75.5k
    subj.pos += matchlen;
1451
75.5k
  } else {
1452
2.57k
    return 0;
1453
2.57k
  }
1454
1455
  // parse optional link_title
1456
75.5k
  beforetitle = subj.pos;
1457
75.5k
  spnl(&subj);
1458
75.5k
  matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos);
1459
75.5k
  if (matchlen) {
1460
2.77k
    title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
1461
2.77k
    subj.pos += matchlen;
1462
72.7k
  } else {
1463
72.7k
    subj.pos = beforetitle;
1464
72.7k
    title = cmark_chunk_literal("");
1465
72.7k
  }
1466
1467
  // parse final spaces and newline:
1468
75.5k
  skip_spaces(&subj);
1469
75.5k
  if (!skip_line_end(&subj)) {
1470
10.1k
    if (matchlen) { // try rewinding before title
1471
2.74k
      subj.pos = beforetitle;
1472
2.74k
      skip_spaces(&subj);
1473
2.74k
      if (!skip_line_end(&subj)) {
1474
420
        return 0;
1475
420
      }
1476
7.38k
    } else {
1477
7.38k
      return 0;
1478
7.38k
    }
1479
10.1k
  }
1480
  // insert reference into refmap
1481
67.7k
  cmark_reference_create(refmap, &lab, &url, &title);
1482
67.7k
  return subj.pos;
1483
75.5k
}