Line | Count | Source (jump to first uncovered line) |
1 | | #include <stdlib.h> |
2 | | #include <string.h> |
3 | | #include <stdio.h> |
4 | | |
5 | | #include "cmark_ctype.h" |
6 | | #include "config.h" |
7 | | #include "node.h" |
8 | | #include "parser.h" |
9 | | #include "references.h" |
10 | | #include "cmark.h" |
11 | | #include "houdini.h" |
12 | | #include "utf8.h" |
13 | | #include "scanners.h" |
14 | | #include "inlines.h" |
15 | | |
// UTF-8 encodings of the typographic characters that the "smart" handlers in
// this file (handle_delim, handle_hyphen, handle_period, process_emphasis)
// substitute for plain ASCII punctuation.
static const char *EMDASH = "\xE2\x80\x94";           // U+2014
static const char *ENDASH = "\xE2\x80\x93";           // U+2013
static const char *ELLIPSES = "\xE2\x80\xA6";         // U+2026
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";  // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";  // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019
23 | | |
// Macros for creating the various kinds of simple (value-less) inline nodes.
// Each one forwards to make_simple with the matching node type.
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)

// Longest backtick run that is tracked in subject.backticks. An opening run
// longer than this is never matched (see scan_to_closing_backticks).
#define MAXBACKTICKS 1000
31 | | |
// One entry of the delimiter stack: a doubly linked list whose most recent
// entry is subject.last_delim.
typedef struct delimiter {
  struct delimiter *previous; // older entry, or NULL at the bottom
  struct delimiter *next;     // newer entry, or NULL for the newest
  cmark_node *inl_text;       // text node holding the delimiter characters
  bufsize_t position;         // subject position recorded by push_delimiter
  bufsize_t length;           // inl_text->len at the time of the push
  unsigned char delim_char;   // the delimiter character itself
  bool can_open;              // as computed by scan_delims
  bool can_close;             // as computed by scan_delims
} delimiter;
42 | | |
// One entry of the bracket stack: a singly linked list whose most recent
// entry is subject.last_bracket.
typedef struct bracket {
  struct bracket *previous; // older entry, or NULL at the bottom
  cmark_node *inl_text;     // text node created for the bracket
  bufsize_t position;       // subject position recorded by push_bracket
  bool image;               // value passed to push_bracket by the caller
  bool active;              // set to true when pushed
  bool bracket_after;       // set once a later bracket is pushed on top
} bracket;
51 | | |
// Bit flags kept in subject.flags (cleared by subject_from_buf).
// NOTE(review): the code that sets and tests these is outside this section;
// the names suggest one flag per kind of raw-HTML construct whose scan can be
// skipped -- confirm against the HTML handling code.
#define FLAG_SKIP_HTML_CDATA (1u << 0)
#define FLAG_SKIP_HTML_DECLARATION (1u << 1)
#define FLAG_SKIP_HTML_PI (1u << 2)
#define FLAG_SKIP_HTML_COMMENT (1u << 3)
56 | | |
// Parser state for the inline content being scanned.
typedef struct {
  cmark_mem *mem;              // allocator used for nodes and stack entries
  cmark_chunk input;           // the text being parsed
  unsigned flags;              // FLAG_SKIP_HTML_* bits
  int line;                    // source line assigned to newly created nodes
  bufsize_t pos;               // current read position within input
  int block_offset;            // added to node columns by make_literal
  int column_offset;           // added to node columns by make_literal;
                               // rewritten after spans containing newlines
  cmark_reference_map *refmap; // reference map supplied to subject_from_buf
  delimiter *last_delim;       // top of the delimiter stack, or NULL
  bracket *last_bracket;       // top of the bracket stack, or NULL
  // backticks[n] is the start position of the last run of exactly n
  // backticks seen by scan_to_closing_backticks (0 if none was seen).
  bufsize_t backticks[MAXBACKTICKS + 1];
  // Set once a scan has reached the end of input without finding a closer.
  bool scanned_for_backticks;
  // Starts true; cleared when a non-image bracket is pushed.
  bool no_link_openers;
} subject;
72 | | |
// True when c is a line feed or a carriage return.
static CMARK_INLINE bool S_is_line_end_char(char c) {
  switch (c) {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
76 | | |
77 | | static delimiter *S_insert_emph(subject *subj, delimiter *opener, |
78 | | delimiter *closer); |
79 | | |
80 | | static int parse_inline(subject *subj, cmark_node *parent, int options); |
81 | | |
82 | | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
83 | | cmark_chunk *chunk, cmark_reference_map *refmap); |
84 | | static bufsize_t subject_find_special_char(subject *subj, int options); |
85 | | |
86 | | // Create an inline with a literal string value. |
87 | | static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, |
88 | 30.8M | int start_column, int end_column) { |
89 | 30.8M | cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); |
90 | 30.8M | e->mem = subj->mem; |
91 | 30.8M | e->type = (uint16_t)t; |
92 | 30.8M | e->start_line = e->end_line = subj->line; |
93 | | // columns are 1 based. |
94 | 30.8M | e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; |
95 | 30.8M | e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; |
96 | 30.8M | return e; |
97 | 30.8M | } |
98 | | |
99 | | // Create an inline with no value. |
100 | 2.92M | static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { |
101 | 2.92M | cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); |
102 | 2.92M | e->mem = mem; |
103 | 2.92M | e->type = t; |
104 | 2.92M | return e; |
105 | 2.92M | } |
106 | | |
107 | 29.3M | static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { |
108 | 29.3M | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
109 | 29.3M | e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1); |
110 | 29.3M | if (s.data != NULL) { |
111 | 29.3M | memcpy(e->data, s.data, s.len); |
112 | 29.3M | } |
113 | 29.3M | e->data[s.len] = 0; |
114 | 29.3M | e->len = s.len; |
115 | 29.3M | return e; |
116 | 29.3M | } |
117 | | |
118 | | static cmark_node *make_str_from_buf(subject *subj, int sc, int ec, |
119 | 1.10M | cmark_strbuf *buf) { |
120 | 1.10M | cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec); |
121 | 1.10M | e->len = buf->size; |
122 | 1.10M | e->data = cmark_strbuf_detach(buf); |
123 | 1.10M | return e; |
124 | 1.10M | } |
125 | | |
126 | | // Like make_str, but parses entities. |
127 | | static cmark_node *make_str_with_entities(subject *subj, |
128 | | int start_column, int end_column, |
129 | 90.3k | cmark_chunk *content) { |
130 | 90.3k | cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); |
131 | | |
132 | 90.3k | if (houdini_unescape_html(&unescaped, content->data, content->len)) { |
133 | 2.06k | return make_str_from_buf(subj, start_column, end_column, &unescaped); |
134 | 88.2k | } else { |
135 | 88.2k | return make_str(subj, start_column, end_column, *content); |
136 | 88.2k | } |
137 | 90.3k | } |
138 | | |
139 | | // Like cmark_node_append_child but without costly sanity checks. |
140 | | // Assumes that child was newly created. |
141 | 37.1M | static void append_child(cmark_node *node, cmark_node *child) { |
142 | 37.1M | cmark_node *old_last_child = node->last_child; |
143 | | |
144 | 37.1M | child->next = NULL; |
145 | 37.1M | child->prev = old_last_child; |
146 | 37.1M | child->parent = node; |
147 | 37.1M | node->last_child = child; |
148 | | |
149 | 37.1M | if (old_last_child) { |
150 | 35.6M | old_last_child->next = child; |
151 | 35.6M | } else { |
152 | | // Also set first_child if node previously had no children. |
153 | 1.52M | node->first_child = child; |
154 | 1.52M | } |
155 | 37.1M | } |
156 | | |
157 | | // Duplicate a chunk by creating a copy of the buffer not by reusing the |
158 | | // buffer like cmark_chunk_dup does. |
159 | 387k | static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) { |
160 | 387k | if (src == NULL) { |
161 | 190k | return NULL; |
162 | 190k | } |
163 | 197k | size_t len = strlen((char *)src); |
164 | 197k | unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1); |
165 | 197k | memcpy(data, src, len + 1); |
166 | 197k | return data; |
167 | 387k | } |
168 | | |
169 | | static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, |
170 | 90.3k | int is_email) { |
171 | 90.3k | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
172 | | |
173 | 90.3k | cmark_chunk_trim(url); |
174 | | |
175 | 90.3k | if (is_email) |
176 | 72.6k | cmark_strbuf_puts(&buf, "mailto:"); |
177 | | |
178 | 90.3k | houdini_unescape_html_f(&buf, url->data, url->len); |
179 | 90.3k | return cmark_strbuf_detach(&buf); |
180 | 90.3k | } |
181 | | |
182 | | static CMARK_INLINE cmark_node *make_autolink(subject *subj, |
183 | | int start_column, int end_column, |
184 | 90.3k | cmark_chunk url, int is_email) { |
185 | 90.3k | cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); |
186 | 90.3k | link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); |
187 | 90.3k | link->as.link.title = NULL; |
188 | 90.3k | link->start_line = link->end_line = subj->line; |
189 | 90.3k | link->start_column = start_column + 1; |
190 | 90.3k | link->end_column = end_column + 1; |
191 | 90.3k | append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); |
192 | 90.3k | return link; |
193 | 90.3k | } |
194 | | |
195 | | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, |
196 | 1.00M | cmark_chunk *chunk, cmark_reference_map *refmap) { |
197 | 1.00M | int i; |
198 | 1.00M | e->mem = mem; |
199 | 1.00M | e->input = *chunk; |
200 | 1.00M | e->flags = 0; |
201 | 1.00M | e->line = line_number; |
202 | 1.00M | e->pos = 0; |
203 | 1.00M | e->block_offset = block_offset; |
204 | 1.00M | e->column_offset = 0; |
205 | 1.00M | e->refmap = refmap; |
206 | 1.00M | e->last_delim = NULL; |
207 | 1.00M | e->last_bracket = NULL; |
208 | 1.00G | for (i = 0; i <= MAXBACKTICKS; i++) { |
209 | 1.00G | e->backticks[i] = 0; |
210 | 1.00G | } |
211 | 1.00M | e->scanned_for_backticks = false; |
212 | 1.00M | e->no_link_openers = true; |
213 | 1.00M | } |
214 | | |
// Predicate for take_while: 1 when c is a backtick, 0 otherwise.
static CMARK_INLINE int isbacktick(int c) {
  return c == '`' ? 1 : 0;
}
216 | | |
217 | 255M | static CMARK_INLINE unsigned char peek_char(subject *subj) { |
218 | | // NULL bytes should have been stripped out by now. If they're |
219 | | // present, it's a programming error: |
220 | 255M | assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0)); |
221 | 255M | return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0; |
222 | 255M | } |
223 | | |
224 | 10.3M | static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) { |
225 | 10.3M | return subj->input.data[pos]; |
226 | 10.3M | } |
227 | | |
228 | | // Return true if there are more characters in the subject. |
229 | 35.8M | static CMARK_INLINE int is_eof(subject *subj) { |
230 | 35.8M | return (subj->pos >= subj->input.len); |
231 | 35.8M | } |
232 | | |
// Advance the subject by one byte. Doesn't check for eof.
// The expansion is fully parenthesised so that it behaves as a single
// expression wherever it is used.
#define advance(subj) ((subj)->pos += 1)
235 | | |
236 | 2.48M | static CMARK_INLINE bool skip_spaces(subject *subj) { |
237 | 2.48M | bool skipped = false; |
238 | 3.19M | while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { |
239 | 711k | advance(subj); |
240 | 711k | skipped = true; |
241 | 711k | } |
242 | 2.48M | return skipped; |
243 | 2.48M | } |
244 | | |
245 | 580k | static CMARK_INLINE bool skip_line_end(subject *subj) { |
246 | 580k | bool seen_line_end_char = false; |
247 | 580k | if (peek_char(subj) == '\r') { |
248 | 0 | advance(subj); |
249 | 0 | seen_line_end_char = true; |
250 | 0 | } |
251 | 580k | if (peek_char(subj) == '\n') { |
252 | 168k | advance(subj); |
253 | 168k | seen_line_end_char = true; |
254 | 168k | } |
255 | 580k | return seen_line_end_char || is_eof(subj); |
256 | 580k | } |
257 | | |
258 | | // Take characters while a predicate holds, and return a string. |
259 | 315k | static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { |
260 | 315k | unsigned char c; |
261 | 315k | bufsize_t startpos = subj->pos; |
262 | 315k | bufsize_t len = 0; |
263 | | |
264 | 2.27M | while ((c = peek_char(subj)) && (*f)(c)) { |
265 | 1.96M | advance(subj); |
266 | 1.96M | len++; |
267 | 1.96M | } |
268 | | |
269 | 315k | return cmark_chunk_dup(&subj->input, startpos, len); |
270 | 315k | } |
271 | | |
272 | | // Return the number of newlines in a given span of text in a subject. If |
273 | | // the number is greater than zero, also return the number of characters |
274 | | // between the last newline and the end of the span in `since_newline`. |
275 | 211k | static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { |
276 | 211k | int nls = 0; |
277 | 211k | int since_nl = 0; |
278 | | |
279 | 93.9M | while (len--) { |
280 | 93.6M | if (subj->input.data[from++] == '\n') { |
281 | 182k | ++nls; |
282 | 182k | since_nl = 0; |
283 | 93.5M | } else { |
284 | 93.5M | ++since_nl; |
285 | 93.5M | } |
286 | 93.6M | } |
287 | | |
288 | 211k | if (!nls) |
289 | 175k | return 0; |
290 | | |
291 | 35.2k | *since_newline = since_nl; |
292 | 35.2k | return nls; |
293 | 211k | } |
294 | | |
295 | | // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and |
296 | | // `column_offset` according to the number of newlines in a just-matched span |
297 | | // of text in `subj`. |
298 | 443k | static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) { |
299 | 443k | if (!(options & CMARK_OPT_SOURCEPOS)) { |
300 | 232k | return; |
301 | 232k | } |
302 | | |
303 | 211k | int since_newline; |
304 | 211k | int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline); |
305 | 211k | if (newlines) { |
306 | 35.2k | subj->line += newlines; |
307 | 35.2k | node->end_line += newlines; |
308 | 35.2k | node->end_column = since_newline; |
309 | 35.2k | subj->column_offset = -subj->pos + since_newline + extra; |
310 | 35.2k | } |
311 | 211k | } |
312 | | |
313 | | // Try to process a backtick code span that began with a |
314 | | // span of ticks of length openticklength length (already |
315 | | // parsed). Return 0 if you don't find matching closing |
316 | | // backticks, otherwise return the position in the subject |
317 | | // after the closing backticks. |
318 | | static bufsize_t scan_to_closing_backticks(subject *subj, |
319 | 315k | bufsize_t openticklength) { |
320 | | |
321 | 315k | bool found = false; |
322 | 315k | if (openticklength > MAXBACKTICKS) { |
323 | | // we limit backtick string length because of the array subj->backticks: |
324 | 254 | return 0; |
325 | 254 | } |
326 | 315k | if (subj->scanned_for_backticks && |
327 | 315k | subj->backticks[openticklength] <= subj->pos) { |
328 | | // return if we already know there's no closer |
329 | 70.7k | return 0; |
330 | 70.7k | } |
331 | 902k | while (!found) { |
332 | | // read non backticks |
333 | 902k | unsigned char c; |
334 | 146M | while ((c = peek_char(subj)) && c != '`') { |
335 | 145M | advance(subj); |
336 | 145M | } |
337 | 902k | if (is_eof(subj)) { |
338 | 97.2k | break; |
339 | 97.2k | } |
340 | 805k | bufsize_t numticks = 0; |
341 | 7.34M | while (peek_char(subj) == '`') { |
342 | 6.53M | advance(subj); |
343 | 6.53M | numticks++; |
344 | 6.53M | } |
345 | | // store position of ender |
346 | 805k | if (numticks <= MAXBACKTICKS) { |
347 | 805k | subj->backticks[numticks] = subj->pos - numticks; |
348 | 805k | } |
349 | 805k | if (numticks == openticklength) { |
350 | 147k | return (subj->pos); |
351 | 147k | } |
352 | 805k | } |
353 | | // got through whole input without finding closer |
354 | 97.2k | subj->scanned_for_backticks = true; |
355 | 97.2k | return 0; |
356 | 244k | } |
357 | | |
358 | | // Destructively modify string, converting newlines to |
359 | | // spaces, then removing a single leading + trailing space, |
360 | | // unless the code span consists entirely of space characters. |
361 | 147k | static void S_normalize_code(cmark_strbuf *s) { |
362 | 147k | bufsize_t r, w; |
363 | 147k | bool contains_nonspace = false; |
364 | | |
365 | 100M | for (r = 0, w = 0; r < s->size; ++r) { |
366 | 100M | switch (s->ptr[r]) { |
367 | 0 | case '\r': |
368 | 0 | if (s->ptr[r + 1] != '\n') { |
369 | 0 | s->ptr[w++] = ' '; |
370 | 0 | } |
371 | 0 | break; |
372 | 374k | case '\n': |
373 | 374k | s->ptr[w++] = ' '; |
374 | 374k | break; |
375 | 99.6M | default: |
376 | 99.6M | s->ptr[w++] = s->ptr[r]; |
377 | 100M | } |
378 | 100M | if (s->ptr[r] != ' ') { |
379 | 99.2M | contains_nonspace = true; |
380 | 99.2M | } |
381 | 100M | } |
382 | | |
383 | | // begins and ends with space? |
384 | 147k | if (contains_nonspace && |
385 | 147k | s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') { |
386 | 4.71k | cmark_strbuf_drop(s, 1); |
387 | 4.71k | cmark_strbuf_truncate(s, w - 2); |
388 | 142k | } else { |
389 | 142k | cmark_strbuf_truncate(s, w); |
390 | 142k | } |
391 | | |
392 | 147k | } |
393 | | |
394 | | |
395 | | // Parse backtick code section or raw backticks, return an inline. |
396 | | // Assumes that the subject has a backtick at the current position. |
397 | 315k | static cmark_node *handle_backticks(subject *subj, int options) { |
398 | 315k | bufsize_t initpos = subj->pos; |
399 | 315k | cmark_chunk openticks = take_while(subj, isbacktick); |
400 | 315k | bufsize_t startpos = subj->pos; |
401 | 315k | bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len); |
402 | | |
403 | 315k | if (endpos == 0) { // not found |
404 | 168k | subj->pos = startpos; // rewind |
405 | 168k | return make_str(subj, initpos, initpos + openticks.len - 1, openticks); |
406 | 168k | } else { |
407 | 147k | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
408 | | |
409 | 147k | cmark_strbuf_set(&buf, subj->input.data + startpos, |
410 | 147k | endpos - startpos - openticks.len); |
411 | 147k | S_normalize_code(&buf); |
412 | | |
413 | 147k | cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos, |
414 | 147k | endpos - openticks.len - 1); |
415 | 147k | node->len = buf.size; |
416 | 147k | node->data = cmark_strbuf_detach(&buf); |
417 | 147k | adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options); |
418 | 147k | return node; |
419 | 147k | } |
420 | 315k | } |
421 | | |
422 | | |
// Scan a run of the delimiter character c at the current position and
// return the number of characters scanned (at most 1 for quote characters).
// Advances position past the run.
// *can_open and *can_close receive whether the run may open / close, based
// on the code points immediately before and after it.
static int scan_delims(subject *subj, unsigned char c, bool *can_open,
                       bool *can_close) {
  int numdelims = 0;
  bufsize_t before_char_pos;
  int32_t after_char = 0;
  int32_t before_char = 0;
  int len;
  bool left_flanking, right_flanking;

  // Start of input is treated as if preceded by a newline (10).
  if (subj->pos == 0) {
    before_char = 10;
  } else {
    before_char_pos = subj->pos - 1;
    // walk back to the beginning of the UTF_8 sequence:
    while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) {
      before_char_pos -= 1;
    }
    len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
                                 subj->pos - before_char_pos, &before_char);
    // Undecodable bytes are also treated as a newline.
    if (len == -1) {
      before_char = 10;
    }
  }

  if (c == '\'' || c == '"') {
    numdelims++;
    advance(subj); // limit to 1 delim for quotes
  } else {
    while (peek_char(subj) == c) {
      numdelims++;
      advance(subj);
    }
  }

  len = cmark_utf8proc_iterate(subj->input.data + subj->pos,
                               subj->input.len - subj->pos, &after_char);
  // A failed decode after the run is treated as a newline as well.
  if (len == -1) {
    after_char = 10;
  }
  // Left-flanking: not followed by whitespace, and either not followed by
  // punctuation or preceded by whitespace/punctuation.
  left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
                  (!cmark_utf8proc_is_punctuation(after_char) ||
                   cmark_utf8proc_is_space(before_char) ||
                   cmark_utf8proc_is_punctuation(before_char));
  // Right-flanking: the mirror image of the test above.
  right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
                   (!cmark_utf8proc_is_punctuation(before_char) ||
                    cmark_utf8proc_is_space(after_char) ||
                    cmark_utf8proc_is_punctuation(after_char));
  if (c == '_') {
    // A run flanking on both sides needs adjacent punctuation on the
    // relevant side to open or close.
    *can_open = left_flanking &&
                (!right_flanking || cmark_utf8proc_is_punctuation(before_char));
    *can_close = right_flanking &&
                 (!left_flanking || cmark_utf8proc_is_punctuation(after_char));
  } else if (c == '\'' || c == '"') {
    // Quotes: a both-flanking quote opens only directly after '(' or '[',
    // and never directly after ']' or ')'.
    *can_open = left_flanking &&
         (!right_flanking || before_char == '(' || before_char == '[') &&
         before_char != ']' && before_char != ')';
    *can_close = right_flanking;
  } else {
    *can_open = left_flanking;
    *can_close = right_flanking;
  }
  return numdelims;
}
488 | | |
489 | | /* |
490 | | static void print_delimiters(subject *subj) |
491 | | { |
492 | | delimiter *delim; |
493 | | delim = subj->last_delim; |
494 | | while (delim != NULL) { |
495 | | printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n", |
496 | | (void*)delim, delim->delim_char, |
497 | | delim->can_open, delim->can_close, |
498 | | (void*)delim->next, (void*)delim->previous); |
499 | | delim = delim->previous; |
500 | | } |
501 | | } |
502 | | */ |
503 | | |
504 | 1.78M | static void remove_delimiter(subject *subj, delimiter *delim) { |
505 | 1.78M | if (delim == NULL) |
506 | 0 | return; |
507 | 1.78M | if (delim->next == NULL) { |
508 | | // end of list: |
509 | 424k | assert(delim == subj->last_delim); |
510 | 424k | subj->last_delim = delim->previous; |
511 | 1.36M | } else { |
512 | 1.36M | delim->next->previous = delim->previous; |
513 | 1.36M | } |
514 | 1.78M | if (delim->previous != NULL) { |
515 | 828k | delim->previous->next = delim->next; |
516 | 828k | } |
517 | 1.78M | subj->mem->free(delim); |
518 | 1.78M | } |
519 | | |
520 | 2.26M | static void pop_bracket(subject *subj) { |
521 | 2.26M | bracket *b; |
522 | 2.26M | if (subj->last_bracket == NULL) |
523 | 0 | return; |
524 | 2.26M | b = subj->last_bracket; |
525 | 2.26M | subj->last_bracket = subj->last_bracket->previous; |
526 | 2.26M | subj->mem->free(b); |
527 | 2.26M | } |
528 | | |
529 | | static void push_delimiter(subject *subj, unsigned char c, bool can_open, |
530 | 1.78M | bool can_close, cmark_node *inl_text) { |
531 | 1.78M | delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter)); |
532 | 1.78M | delim->delim_char = c; |
533 | 1.78M | delim->can_open = can_open; |
534 | 1.78M | delim->can_close = can_close; |
535 | 1.78M | delim->inl_text = inl_text; |
536 | 1.78M | delim->position = subj->pos; |
537 | 1.78M | delim->length = inl_text->len; |
538 | 1.78M | delim->previous = subj->last_delim; |
539 | 1.78M | delim->next = NULL; |
540 | 1.78M | if (delim->previous != NULL) { |
541 | 1.58M | delim->previous->next = delim; |
542 | 1.58M | } |
543 | 1.78M | subj->last_delim = delim; |
544 | 1.78M | } |
545 | | |
546 | 2.26M | static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { |
547 | 2.26M | bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket)); |
548 | 2.26M | if (subj->last_bracket != NULL) { |
549 | 1.38M | subj->last_bracket->bracket_after = true; |
550 | 1.38M | } |
551 | 2.26M | b->image = image; |
552 | 2.26M | b->active = true; |
553 | 2.26M | b->inl_text = inl_text; |
554 | 2.26M | b->previous = subj->last_bracket; |
555 | 2.26M | b->position = subj->pos; |
556 | 2.26M | b->bracket_after = false; |
557 | 2.26M | subj->last_bracket = b; |
558 | 2.26M | if (!image) { |
559 | 2.20M | subj->no_link_openers = false; |
560 | 2.20M | } |
561 | 2.26M | } |
562 | | |
563 | | // Assumes the subject has a c at the current position. |
564 | 2.10M | static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { |
565 | 2.10M | bufsize_t numdelims; |
566 | 2.10M | cmark_node *inl_text; |
567 | 2.10M | bool can_open, can_close; |
568 | 2.10M | cmark_chunk contents; |
569 | | |
570 | 2.10M | numdelims = scan_delims(subj, c, &can_open, &can_close); |
571 | | |
572 | 2.10M | if (c == '\'' && smart) { |
573 | 233k | contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); |
574 | 1.87M | } else if (c == '"' && smart) { |
575 | 686k | contents = |
576 | 686k | cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); |
577 | 1.18M | } else { |
578 | 1.18M | contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); |
579 | 1.18M | } |
580 | | |
581 | 2.10M | inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); |
582 | | |
583 | 2.10M | if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { |
584 | 1.78M | push_delimiter(subj, c, can_open, can_close, inl_text); |
585 | 1.78M | } |
586 | | |
587 | 2.10M | return inl_text; |
588 | 2.10M | } |
589 | | |
590 | | // Assumes we have a hyphen at the current position. |
591 | 4.44M | static cmark_node *handle_hyphen(subject *subj, bool smart) { |
592 | 4.44M | int startpos = subj->pos; |
593 | | |
594 | 4.44M | advance(subj); |
595 | | |
596 | 4.44M | if (!smart || peek_char(subj) != '-') { |
597 | 3.38M | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-")); |
598 | 3.38M | } |
599 | | |
600 | 4.81M | while (smart && peek_char(subj) == '-') { |
601 | 3.75M | advance(subj); |
602 | 3.75M | } |
603 | | |
604 | 1.06M | int numhyphens = subj->pos - startpos; |
605 | 1.06M | int en_count = 0; |
606 | 1.06M | int em_count = 0; |
607 | 1.06M | int i; |
608 | 1.06M | cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); |
609 | | |
610 | 1.06M | if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes |
611 | 320k | em_count = numhyphens / 3; |
612 | 745k | } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes |
613 | 658k | en_count = numhyphens / 2; |
614 | 658k | } else if (numhyphens % 3 == 2) { // use one en dash at end |
615 | 65.1k | en_count = 1; |
616 | 65.1k | em_count = (numhyphens - 2) / 3; |
617 | 65.1k | } else { // use two en dashes at the end |
618 | 21.9k | en_count = 2; |
619 | 21.9k | em_count = (numhyphens - 4) / 3; |
620 | 21.9k | } |
621 | | |
622 | 1.83M | for (i = em_count; i > 0; i--) { |
623 | 766k | cmark_strbuf_puts(&buf, EMDASH); |
624 | 766k | } |
625 | | |
626 | 2.32M | for (i = en_count; i > 0; i--) { |
627 | 1.25M | cmark_strbuf_puts(&buf, ENDASH); |
628 | 1.25M | } |
629 | | |
630 | 1.06M | return make_str_from_buf(subj, startpos, subj->pos - 1, &buf); |
631 | 4.44M | } |
632 | | |
633 | | // Assumes we have a period at the current position. |
634 | 253k | static cmark_node *handle_period(subject *subj, bool smart) { |
635 | 253k | advance(subj); |
636 | 253k | if (smart && peek_char(subj) == '.') { |
637 | 58.4k | advance(subj); |
638 | 58.4k | if (peek_char(subj) == '.') { |
639 | 54.8k | advance(subj); |
640 | 54.8k | return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); |
641 | 54.8k | } else { |
642 | 3.60k | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("..")); |
643 | 3.60k | } |
644 | 195k | } else { |
645 | 195k | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal(".")); |
646 | 195k | } |
647 | 253k | } |
648 | | |
// Resolve the delimiters on the stack whose position is >= stack_bottom:
// pair closers with openers to build emphasis/strong nodes, turn quote
// delimiters into typographic quotes, and finally free every delimiter at
// or above stack_bottom.
static void process_emphasis(subject *subj, bufsize_t stack_bottom) {
  delimiter *candidate;
  delimiter *closer = NULL;
  delimiter *opener;
  delimiter *old_closer;
  bool opener_found;
  int openers_bottom_index = 0;
  // Per-category lower bound for opener searches. Slots: 0 = '"',
  // 1 = '\'', 2 = '_', 3..8 = '*' split by whether the closer can also
  // open and by its length modulo 3.
  bufsize_t openers_bottom[9] = {stack_bottom, stack_bottom, stack_bottom,
                                 stack_bottom, stack_bottom, stack_bottom,
                                 stack_bottom, stack_bottom, stack_bottom};

  // move back to first relevant delim.
  candidate = subj->last_delim;
  while (candidate != NULL && candidate->position >= stack_bottom) {
    closer = candidate;
    candidate = candidate->previous;
  }

  // now move forward, looking for closers, and handling each
  while (closer != NULL) {
    if (closer->can_close) {
      // NOTE(review): the multiple-of-3 rejection in the search below is not
      // restricted to '*', yet '_' uses a single slot. Confirm that a failed
      // '_' search cannot hide a valid opener from a later '_' closer of a
      // different length.
      switch (closer->delim_char) {
      case '"':
        openers_bottom_index = 0;
        break;
      case '\'':
        openers_bottom_index = 1;
        break;
      case '_':
        openers_bottom_index = 2;
        break;
      case '*':
        openers_bottom_index = 3 +
                (closer->can_open ? 3 : 0) + (closer->length % 3);
        break;
      default:
        assert(false);
      }

      // Now look backwards for first matching opener:
      opener = closer->previous;
      opener_found = false;
      while (opener != NULL &&
             opener->position >= openers_bottom[openers_bottom_index]) {
        if (opener->can_open && opener->delim_char == closer->delim_char) {
          // interior closer of size 2 can't match opener of size 1
          // or of size 1 can't match 2
          if (!(closer->can_open || opener->can_close) ||
              closer->length % 3 == 0 ||
              (opener->length + closer->length) % 3 != 0) {
            opener_found = true;
            break;
          }
        }
        opener = opener->previous;
      }
      // Remember this closer: `closer` itself is advanced below.
      old_closer = closer;
      if (closer->delim_char == '*' || closer->delim_char == '_') {
        if (opener_found) {
          // S_insert_emph returns the delimiter to continue from.
          closer = S_insert_emph(subj, opener, closer);
        } else {
          closer = closer->next;
        }
      } else if (closer->delim_char == '\'' || closer->delim_char == '"') {
        // A closing quote always becomes a right quote, matched or not.
        if (closer->delim_char == '\'') {
          cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);
        } else {
          cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);
        }
        closer = closer->next;
        if (opener_found) {
          if (old_closer->delim_char == '\'') {
            cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);
          } else {
            cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);
          }
          remove_delimiter(subj, opener);
          remove_delimiter(subj, old_closer);
        }
      }
      if (!opener_found) {
        // set lower bound for future searches for openers
        openers_bottom[openers_bottom_index] = old_closer->position;
        if (!old_closer->can_open) {
          // we can remove a closer that can't be an
          // opener, once we've seen there's no
          // matching opener:
          remove_delimiter(subj, old_closer);
        }
      }
    } else {
      closer = closer->next;
    }
  }
  // free all delimiters in list until stack_bottom:
  while (subj->last_delim != NULL &&
         subj->last_delim->position >= stack_bottom) {
    remove_delimiter(subj, subj->last_delim);
  }
}
749 | | |
// Wrap the inlines between opener and closer in a new emph node (one
// delimiter character used on each side) or strong node (two used), and
// shorten both delimiter text nodes accordingly.
// Returns the delimiter to continue processing from: closer itself when it
// still has characters left, otherwise the delimiter that followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer) {
  delimiter *delim, *tmp_delim;
  bufsize_t use_delims;
  cmark_node *opener_inl = opener->inl_text;
  cmark_node *closer_inl = closer->inl_text;
  bufsize_t opener_num_chars = opener_inl->len;
  bufsize_t closer_num_chars = closer_inl->len;
  cmark_node *tmp, *tmpnext, *emph;

  // calculate the actual number of characters used from this closer
  use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;

  // remove used characters from associated inlines.
  opener_num_chars -= use_delims;
  closer_num_chars -= use_delims;
  opener_inl->len = opener_num_chars;
  opener_inl->data[opener_num_chars] = 0;
  closer_inl->len = closer_num_chars;
  closer_inl->data[closer_num_chars] = 0;

  // free delimiters between opener and closer
  delim = closer->previous;
  while (delim != NULL && delim != opener) {
    tmp_delim = delim->previous;
    remove_delimiter(subj, delim);
    delim = tmp_delim;
  }

  // create new emph or strong, and splice it in to our inlines
  // between the opener and closer
  emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);

  // Move every sibling between the two delimiter nodes under the new node.
  tmp = opener_inl->next;
  while (tmp && tmp != closer_inl) {
    tmpnext = tmp->next;
    cmark_node_unlink(tmp);
    append_child(emph, tmp);
    tmp = tmpnext;
  }
  cmark_node_insert_after(opener_inl, emph);

  // The new node's source range is taken from the two delimiter nodes.
  emph->start_line = opener_inl->start_line;
  emph->end_line = closer_inl->end_line;
  emph->start_column = opener_inl->start_column;
  emph->end_column = closer_inl->end_column;

  // if opener has 0 characters, remove it and its associated inline
  if (opener_num_chars == 0) {
    cmark_node_free(opener_inl);
    remove_delimiter(subj, opener);
  }

  // if closer has 0 characters, remove it and its associated inline
  if (closer_num_chars == 0) {
    // remove empty closer inline
    cmark_node_free(closer_inl);
    // remove closer from list
    tmp_delim = closer->next;
    remove_delimiter(subj, closer);
    closer = tmp_delim;
  }

  return closer;
}
815 | | |
816 | | // Parse backslash-escape or just a backslash, returning an inline. |
817 | 628k | static cmark_node *handle_backslash(subject *subj) { |
818 | 628k | advance(subj); |
819 | 628k | unsigned char nextchar = peek_char(subj); |
820 | 628k | if (cmark_ispunct( |
821 | 628k | nextchar)) { // only ascii symbols and newline can be escaped |
822 | 278k | advance(subj); |
823 | 278k | return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); |
824 | 350k | } else if (!is_eof(subj) && skip_line_end(subj)) { |
825 | 21.6k | return make_linebreak(subj->mem); |
826 | 329k | } else { |
827 | 329k | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\")); |
828 | 329k | } |
829 | 628k | } |
830 | | |
831 | | // Parse an entity or a regular "&" string. |
832 | | // Assumes the subject has an '&' character at the current position. |
833 | 514k | static cmark_node *handle_entity(subject *subj) { |
834 | 514k | cmark_strbuf ent = CMARK_BUF_INIT(subj->mem); |
835 | 514k | bufsize_t len; |
836 | | |
837 | 514k | advance(subj); |
838 | | |
839 | 514k | len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, |
840 | 514k | subj->input.len - subj->pos); |
841 | | |
842 | 514k | if (len <= 0) |
843 | 473k | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); |
844 | | |
845 | 41.0k | subj->pos += len; |
846 | 41.0k | return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent); |
847 | 514k | } |
848 | | |
849 | | // Clean a URL: remove surrounding whitespace, and remove \ that escape |
850 | | // punctuation. |
851 | 88.0k | unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { |
852 | 88.0k | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
853 | | |
854 | 88.0k | cmark_chunk_trim(url); |
855 | | |
856 | 88.0k | houdini_unescape_html_f(&buf, url->data, url->len); |
857 | | |
858 | 88.0k | cmark_strbuf_unescape(&buf); |
859 | 88.0k | return cmark_strbuf_detach(&buf); |
860 | 88.0k | } |
861 | | |
862 | 88.0k | unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { |
863 | 88.0k | cmark_strbuf buf = CMARK_BUF_INIT(mem); |
864 | 88.0k | unsigned char first, last; |
865 | | |
866 | 88.0k | if (title->len == 0) { |
867 | 80.7k | return NULL; |
868 | 80.7k | } |
869 | | |
870 | 7.28k | first = title->data[0]; |
871 | 7.28k | last = title->data[title->len - 1]; |
872 | | |
873 | | // remove surrounding quotes if any: |
874 | 7.28k | if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || |
875 | 7.28k | (first == '"' && last == '"')) { |
876 | 7.28k | houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); |
877 | 7.28k | } else { |
878 | 0 | houdini_unescape_html_f(&buf, title->data, title->len); |
879 | 0 | } |
880 | | |
881 | 7.28k | cmark_strbuf_unescape(&buf); |
882 | 7.28k | return cmark_strbuf_detach(&buf); |
883 | 88.0k | } |
884 | | |
885 | | // Parse an autolink or HTML tag. |
886 | | // Assumes the subject has a '<' character at the current position. |
887 | 3.95M | static cmark_node *handle_pointy_brace(subject *subj, int options) { |
888 | 3.95M | bufsize_t matchlen = 0; |
889 | 3.95M | cmark_chunk contents; |
890 | | |
891 | 3.95M | advance(subj); // advance past first < |
892 | | |
893 | | // first try to match a URL autolink |
894 | 3.95M | matchlen = scan_autolink_uri(&subj->input, subj->pos); |
895 | 3.95M | if (matchlen > 0) { |
896 | 17.6k | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
897 | 17.6k | subj->pos += matchlen; |
898 | | |
899 | 17.6k | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); |
900 | 17.6k | } |
901 | | |
902 | | // next try to match an email autolink |
903 | 3.93M | matchlen = scan_autolink_email(&subj->input, subj->pos); |
904 | 3.93M | if (matchlen > 0) { |
905 | 72.6k | contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); |
906 | 72.6k | subj->pos += matchlen; |
907 | | |
908 | 72.6k | return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); |
909 | 72.6k | } |
910 | | |
911 | | // finally, try to match an html tag |
912 | 3.86M | if (subj->pos + 2 <= subj->input.len) { |
913 | 3.84M | int c = subj->input.data[subj->pos]; |
914 | 3.84M | if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) { |
915 | 321k | c = subj->input.data[subj->pos+1]; |
916 | 321k | if (c == '-' && subj->input.data[subj->pos+2] == '-') { |
917 | 37.9k | if (subj->input.data[subj->pos+3] == '>') { |
918 | 693 | matchlen = 4; |
919 | 37.2k | } else if (subj->input.data[subj->pos+3] == '-' && |
920 | 37.2k | subj->input.data[subj->pos+4] == '>') { |
921 | 273 | matchlen = 5; |
922 | 37.0k | } else { |
923 | 37.0k | matchlen = scan_html_comment(&subj->input, subj->pos + 1); |
924 | 37.0k | if (matchlen > 0) { |
925 | 286 | matchlen += 1; // prefix "<" |
926 | 36.7k | } else { // no match through end of input: set a flag so |
927 | | // we don't reparse looking for -->: |
928 | 36.7k | subj->flags |= FLAG_SKIP_HTML_COMMENT; |
929 | 36.7k | } |
930 | 37.0k | } |
931 | 283k | } else if (c == '[') { |
932 | 40.6k | if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) { |
933 | 36.8k | matchlen = scan_html_cdata(&subj->input, subj->pos + 2); |
934 | 36.8k | if (matchlen > 0) { |
935 | | // The regex doesn't require the final "]]>". But if we're not at |
936 | | // the end of input, it must come after the match. Otherwise, |
937 | | // disable subsequent scans to avoid quadratic behavior. |
938 | 17.9k | matchlen += 5; // prefix "![", suffix "]]>" |
939 | 17.9k | if (subj->pos + matchlen > subj->input.len) { |
940 | 2.80k | subj->flags |= FLAG_SKIP_HTML_CDATA; |
941 | 2.80k | matchlen = 0; |
942 | 2.80k | } |
943 | 17.9k | } |
944 | 36.8k | } |
945 | 242k | } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) { |
946 | 230k | matchlen = scan_html_declaration(&subj->input, subj->pos + 1); |
947 | 230k | if (matchlen > 0) { |
948 | 48.1k | matchlen += 2; // prefix "!", suffix ">" |
949 | 48.1k | if (subj->pos + matchlen > subj->input.len) { |
950 | 12.3k | subj->flags |= FLAG_SKIP_HTML_DECLARATION; |
951 | 12.3k | matchlen = 0; |
952 | 12.3k | } |
953 | 48.1k | } |
954 | 230k | } |
955 | 3.51M | } else if (c == '?') { |
956 | 98.3k | if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) { |
957 | | // Note that we allow an empty match. |
958 | 76.2k | matchlen = scan_html_pi(&subj->input, subj->pos + 1); |
959 | 76.2k | matchlen += 3; // prefix "?", suffix "?>" |
960 | 76.2k | if (subj->pos + matchlen > subj->input.len) { |
961 | 10.0k | subj->flags |= FLAG_SKIP_HTML_PI; |
962 | 10.0k | matchlen = 0; |
963 | 10.0k | } |
964 | 76.2k | } |
965 | 3.42M | } else { |
966 | 3.42M | matchlen = scan_html_tag(&subj->input, subj->pos); |
967 | 3.42M | } |
968 | 3.84M | } |
969 | 3.86M | if (matchlen > 0) { |
970 | 296k | const unsigned char *src = subj->input.data + subj->pos - 1; |
971 | 296k | bufsize_t len = matchlen + 1; |
972 | 296k | subj->pos += matchlen; |
973 | 296k | cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE, |
974 | 296k | subj->pos - matchlen - 1, subj->pos - 1); |
975 | 296k | node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1); |
976 | 296k | memcpy(node->data, src, len); |
977 | 296k | node->data[len] = 0; |
978 | 296k | node->len = len; |
979 | 296k | adjust_subj_node_newlines(subj, node, matchlen, 1, options); |
980 | 296k | return node; |
981 | 296k | } |
982 | | |
983 | | // if nothing matches, just return the opening <: |
984 | 3.56M | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<")); |
985 | 3.86M | } |
986 | | |
987 | | // Parse a link label. Returns 1 if successful. |
988 | | // Note: unescaped brackets are not allowed in labels. |
989 | | // The label begins with `[` and ends with the first `]` character |
990 | | // encountered. Backticks in labels do not start code spans. |
991 | 1.90M | static int link_label(subject *subj, cmark_chunk *raw_label) { |
992 | 1.90M | bufsize_t startpos = subj->pos; |
993 | 1.90M | int length = 0; |
994 | 1.90M | unsigned char c; |
995 | | |
996 | | // advance past [ |
997 | 1.90M | if (peek_char(subj) == '[') { |
998 | 661k | advance(subj); |
999 | 1.24M | } else { |
1000 | 1.24M | return 0; |
1001 | 1.24M | } |
1002 | | |
1003 | 28.2M | while ((c = peek_char(subj)) && c != '[' && c != ']') { |
1004 | 27.5M | if (c == '\\') { |
1005 | 41.9k | advance(subj); |
1006 | 41.9k | length++; |
1007 | 41.9k | if (cmark_ispunct(peek_char(subj))) { |
1008 | 19.9k | advance(subj); |
1009 | 19.9k | length++; |
1010 | 19.9k | } |
1011 | 27.5M | } else { |
1012 | 27.5M | advance(subj); |
1013 | 27.5M | length++; |
1014 | 27.5M | } |
1015 | 27.5M | if (length > MAX_LINK_LABEL_LENGTH) { |
1016 | 1.91k | goto noMatch; |
1017 | 1.91k | } |
1018 | 27.5M | } |
1019 | | |
1020 | 659k | if (c == ']') { // match found |
1021 | 487k | *raw_label = |
1022 | 487k | cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); |
1023 | 487k | cmark_chunk_trim(raw_label); |
1024 | 487k | advance(subj); // advance past ] |
1025 | 487k | return 1; |
1026 | 487k | } |
1027 | | |
1028 | 173k | noMatch: |
1029 | 173k | subj->pos = startpos; // rewind |
1030 | 173k | return 0; |
1031 | 659k | } |
1032 | | |
1033 | | static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, |
1034 | 354k | cmark_chunk *output) { |
1035 | 354k | bufsize_t i = offset; |
1036 | 354k | size_t nb_p = 0; |
1037 | | |
1038 | 39.3M | while (i < input->len) { |
1039 | 39.3M | if (input->data[i] == '\\' && |
1040 | 39.3M | i + 1 < input-> len && |
1041 | 39.3M | cmark_ispunct(input->data[i+1])) |
1042 | 6.01k | i += 2; |
1043 | 39.3M | else if (input->data[i] == '(') { |
1044 | 66.5k | ++nb_p; |
1045 | 66.5k | ++i; |
1046 | 66.5k | if (nb_p > 32) |
1047 | 364 | return -1; |
1048 | 39.2M | } else if (input->data[i] == ')') { |
1049 | 26.5k | if (nb_p == 0) |
1050 | 16.8k | break; |
1051 | 9.71k | --nb_p; |
1052 | 9.71k | ++i; |
1053 | 39.2M | } else if (cmark_isspace(input->data[i])) { |
1054 | 332k | if (i == offset) { |
1055 | 633 | return -1; |
1056 | 633 | } |
1057 | 331k | break; |
1058 | 38.8M | } else { |
1059 | 38.8M | ++i; |
1060 | 38.8M | } |
1061 | 39.3M | } |
1062 | | |
1063 | 353k | if (i >= input->len || nb_p != 0) |
1064 | 20.8k | return -1; |
1065 | | |
1066 | 332k | { |
1067 | 332k | cmark_chunk result = {input->data + offset, i - offset}; |
1068 | 332k | *output = result; |
1069 | 332k | } |
1070 | 332k | return i - offset; |
1071 | 353k | } |
1072 | | |
1073 | | static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, |
1074 | 413k | cmark_chunk *output) { |
1075 | 413k | bufsize_t i = offset; |
1076 | | |
1077 | 413k | if (i < input->len && input->data[i] == '<') { |
1078 | 59.3k | ++i; |
1079 | 592k | while (i < input->len) { |
1080 | 590k | if (input->data[i] == '>') { |
1081 | 46.2k | ++i; |
1082 | 46.2k | break; |
1083 | 544k | } else if (input->data[i] == '\\') |
1084 | 5.33k | i += 2; |
1085 | 539k | else if (input->data[i] == '\n' || input->data[i] == '<') |
1086 | 11.0k | return -1; |
1087 | 528k | else |
1088 | 528k | ++i; |
1089 | 590k | } |
1090 | 354k | } else { |
1091 | 354k | return manual_scan_link_url_2(input, offset, output); |
1092 | 354k | } |
1093 | | |
1094 | 48.3k | if (i >= input->len) |
1095 | 2.12k | return -1; |
1096 | | |
1097 | 46.1k | { |
1098 | 46.1k | cmark_chunk result = {input->data + offset + 1, i - 2 - offset}; |
1099 | 46.1k | *output = result; |
1100 | 46.1k | } |
1101 | 46.1k | return i - offset; |
1102 | 48.3k | } |
1103 | | |
1104 | | // Return a link, an image, or a literal close bracket. |
1105 | 2.30M | static cmark_node *handle_close_bracket(subject *subj) { |
1106 | 2.30M | bufsize_t initial_pos, after_link_text_pos; |
1107 | 2.30M | bufsize_t endurl, starttitle, endtitle, endall; |
1108 | 2.30M | bufsize_t sps, n; |
1109 | 2.30M | cmark_reference *ref = NULL; |
1110 | 2.30M | cmark_chunk url_chunk, title_chunk; |
1111 | 2.30M | unsigned char *url, *title; |
1112 | 2.30M | bracket *opener; |
1113 | 2.30M | cmark_node *inl; |
1114 | 2.30M | cmark_chunk raw_label; |
1115 | 2.30M | int found_label; |
1116 | 2.30M | cmark_node *tmp, *tmpnext; |
1117 | 2.30M | bool is_image; |
1118 | | |
1119 | 2.30M | advance(subj); // advance past ] |
1120 | 2.30M | initial_pos = subj->pos; |
1121 | | |
1122 | | // get last [ or ![ |
1123 | 2.30M | opener = subj->last_bracket; |
1124 | | |
1125 | 2.30M | if (opener == NULL) { |
1126 | 519k | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); |
1127 | 519k | } |
1128 | | |
1129 | | // If we got here, we matched a potential link/image text. |
1130 | | // Now we check to see if it's a link/image. |
1131 | 1.78M | is_image = opener->image; |
1132 | | |
1133 | 1.78M | if (!is_image && subj->no_link_openers) { |
1134 | | // take delimiter off stack |
1135 | 17.8k | pop_bracket(subj); |
1136 | 17.8k | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); |
1137 | 17.8k | } |
1138 | | |
1139 | 1.77M | after_link_text_pos = subj->pos; |
1140 | | |
1141 | | // First, look for an inline link. |
1142 | 1.77M | if (peek_char(subj) == '(' && |
1143 | 1.77M | ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && |
1144 | 1.77M | ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, |
1145 | 335k | &url_chunk)) > -1)) { |
1146 | | |
1147 | | // try to parse an explicit link: |
1148 | 303k | endurl = subj->pos + 1 + sps + n; |
1149 | 303k | starttitle = endurl + scan_spacechars(&subj->input, endurl); |
1150 | | |
1151 | | // ensure there are spaces btw url and title |
1152 | 303k | endtitle = (starttitle == endurl) |
1153 | 303k | ? starttitle |
1154 | 303k | : starttitle + scan_link_title(&subj->input, starttitle); |
1155 | | |
1156 | 303k | endall = endtitle + scan_spacechars(&subj->input, endtitle); |
1157 | | |
1158 | 303k | if (peek_at(subj, endall) == ')') { |
1159 | 20.3k | subj->pos = endall + 1; |
1160 | | |
1161 | 20.3k | title_chunk = |
1162 | 20.3k | cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); |
1163 | 20.3k | url = cmark_clean_url(subj->mem, &url_chunk); |
1164 | 20.3k | title = cmark_clean_title(subj->mem, &title_chunk); |
1165 | 20.3k | cmark_chunk_free(&url_chunk); |
1166 | 20.3k | cmark_chunk_free(&title_chunk); |
1167 | 20.3k | goto match; |
1168 | | |
1169 | 282k | } else { |
1170 | | // it could still be a shortcut reference link |
1171 | 282k | subj->pos = after_link_text_pos; |
1172 | 282k | } |
1173 | 303k | } |
1174 | | |
1175 | | // Next, look for a following [link label] that matches in refmap. |
1176 | | // skip spaces |
1177 | 1.74M | raw_label = cmark_chunk_literal(""); |
1178 | 1.74M | found_label = link_label(subj, &raw_label); |
1179 | 1.74M | if (!found_label) { |
1180 | | // If we have a shortcut reference link, back up |
1181 | | // to before the spaces we skipped. |
1182 | 1.38M | subj->pos = initial_pos; |
1183 | 1.38M | } |
1184 | | |
1185 | 1.74M | if ((!found_label || raw_label.len == 0) && !opener->bracket_after) { |
1186 | 1.30M | cmark_chunk_free(&raw_label); |
1187 | 1.30M | raw_label = cmark_chunk_dup(&subj->input, opener->position, |
1188 | 1.30M | initial_pos - opener->position - 1); |
1189 | 1.30M | found_label = true; |
1190 | 1.30M | } |
1191 | | |
1192 | 1.74M | if (found_label) { |
1193 | 1.63M | ref = cmark_reference_lookup(subj->refmap, &raw_label); |
1194 | 1.63M | cmark_chunk_free(&raw_label); |
1195 | 1.63M | } |
1196 | | |
1197 | 1.74M | if (ref != NULL) { // found |
1198 | 193k | url = cmark_strdup(subj->mem, ref->url); |
1199 | 193k | title = cmark_strdup(subj->mem, ref->title); |
1200 | 193k | goto match; |
1201 | 1.55M | } else { |
1202 | 1.55M | goto noMatch; |
1203 | 1.55M | } |
1204 | | |
1205 | 1.55M | noMatch: |
1206 | | // If we fall through to here, it means we didn't match a link: |
1207 | 1.55M | pop_bracket(subj); // remove this opener from delimiter list |
1208 | 1.55M | subj->pos = initial_pos; |
1209 | 1.55M | return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); |
1210 | | |
1211 | 213k | match: |
1212 | 213k | inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK); |
1213 | 213k | inl->as.link.url = url; |
1214 | 213k | inl->as.link.title = title; |
1215 | 213k | inl->start_line = inl->end_line = subj->line; |
1216 | 213k | inl->start_column = opener->inl_text->start_column; |
1217 | 213k | inl->end_column = subj->pos + subj->column_offset + subj->block_offset; |
1218 | 213k | cmark_node_insert_before(opener->inl_text, inl); |
1219 | | // Add link text: |
1220 | 213k | tmp = opener->inl_text->next; |
1221 | 630k | while (tmp) { |
1222 | 416k | tmpnext = tmp->next; |
1223 | 416k | cmark_node_unlink(tmp); |
1224 | 416k | append_child(inl, tmp); |
1225 | 416k | tmp = tmpnext; |
1226 | 416k | } |
1227 | | |
1228 | | // Free the bracket [: |
1229 | 213k | cmark_node_free(opener->inl_text); |
1230 | | |
1231 | 213k | process_emphasis(subj, opener->position); |
1232 | 213k | pop_bracket(subj); |
1233 | | |
1234 | | // Now, if we have a link, we also want to deactivate links until |
1235 | | // we get a new opener. (This code can be removed if we decide to allow links |
1236 | | // inside links.) |
1237 | 213k | if (!is_image) { |
1238 | 205k | subj->no_link_openers = true; |
1239 | 205k | } |
1240 | | |
1241 | 213k | return NULL; |
1242 | 1.74M | } |
1243 | | |
1244 | | // Parse a hard or soft linebreak, returning an inline. |
1245 | | // Assumes the subject has a cr or newline at the current position. |
1246 | 2.17M | static cmark_node *handle_newline(subject *subj) { |
1247 | 2.17M | bufsize_t nlpos = subj->pos; |
1248 | | // skip over cr, crlf, or lf: |
1249 | 2.17M | if (peek_at(subj, subj->pos) == '\r') { |
1250 | 0 | advance(subj); |
1251 | 0 | } |
1252 | 2.17M | if (peek_at(subj, subj->pos) == '\n') { |
1253 | 2.17M | advance(subj); |
1254 | 2.17M | } |
1255 | 2.17M | ++subj->line; |
1256 | 2.17M | subj->column_offset = -subj->pos; |
1257 | | // skip spaces at beginning of line |
1258 | 2.17M | skip_spaces(subj); |
1259 | 2.17M | if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && |
1260 | 2.17M | peek_at(subj, nlpos - 2) == ' ') { |
1261 | 2.22k | return make_linebreak(subj->mem); |
1262 | 2.17M | } else { |
1263 | 2.17M | return make_softbreak(subj->mem); |
1264 | 2.17M | } |
1265 | 2.17M | } |
1266 | | |
1267 | 13.4M | static bufsize_t subject_find_special_char(subject *subj, int options) { |
1268 | | // "\r\n\\`&_*[]<!" |
1269 | 13.4M | static const int8_t SPECIAL_CHARS[256] = { |
1270 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1271 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, |
1272 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1273 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, |
1274 | 13.4M | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1275 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1276 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1277 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1278 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1279 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1280 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
1281 | | |
1282 | | // " ' . - |
1283 | 13.4M | static const char SMART_PUNCT_CHARS[] = { |
1284 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1285 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, |
1286 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1287 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1288 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1289 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1290 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1291 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1292 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1293 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1294 | 13.4M | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
1295 | 13.4M | }; |
1296 | | |
1297 | 13.4M | bufsize_t n = subj->pos + 1; |
1298 | | |
1299 | 283M | while (n < subj->input.len) { |
1300 | 282M | if (SPECIAL_CHARS[subj->input.data[n]]) |
1301 | 8.50M | return n; |
1302 | 274M | if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]]) |
1303 | 4.36M | return n; |
1304 | 269M | n++; |
1305 | 269M | } |
1306 | | |
1307 | 593k | return subj->input.len; |
1308 | 13.4M | } |
1309 | | |
1310 | | // Parse an inline, advancing subject, and add it as a child of parent. |
1311 | | // Return 0 if no inline can be parsed, 1 otherwise. |
1312 | 33.3M | static int parse_inline(subject *subj, cmark_node *parent, int options) { |
1313 | 33.3M | cmark_node *new_inl = NULL; |
1314 | 33.3M | cmark_chunk contents; |
1315 | 33.3M | unsigned char c; |
1316 | 33.3M | bufsize_t startpos, endpos; |
1317 | 33.3M | c = peek_char(subj); |
1318 | 33.3M | if (c == 0) { |
1319 | 0 | return 0; |
1320 | 0 | } |
1321 | 33.3M | switch (c) { |
1322 | 0 | case '\r': |
1323 | 2.17M | case '\n': |
1324 | 2.17M | new_inl = handle_newline(subj); |
1325 | 2.17M | break; |
1326 | 315k | case '`': |
1327 | 315k | new_inl = handle_backticks(subj, options); |
1328 | 315k | break; |
1329 | 628k | case '\\': |
1330 | 628k | new_inl = handle_backslash(subj); |
1331 | 628k | break; |
1332 | 514k | case '&': |
1333 | 514k | new_inl = handle_entity(subj); |
1334 | 514k | break; |
1335 | 3.95M | case '<': |
1336 | 3.95M | new_inl = handle_pointy_brace(subj, options); |
1337 | 3.95M | break; |
1338 | 600k | case '*': |
1339 | 1.04M | case '_': |
1340 | 1.38M | case '\'': |
1341 | 2.10M | case '"': |
1342 | 2.10M | new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); |
1343 | 2.10M | break; |
1344 | 4.44M | case '-': |
1345 | 4.44M | new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); |
1346 | 4.44M | break; |
1347 | 253k | case '.': |
1348 | 253k | new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0); |
1349 | 253k | break; |
1350 | 2.20M | case '[': |
1351 | 2.20M | advance(subj); |
1352 | 2.20M | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[")); |
1353 | 2.20M | push_bracket(subj, false, new_inl); |
1354 | 2.20M | break; |
1355 | 2.30M | case ']': |
1356 | 2.30M | new_inl = handle_close_bracket(subj); |
1357 | 2.30M | break; |
1358 | 935k | case '!': |
1359 | 935k | advance(subj); |
1360 | 935k | if (peek_char(subj) == '[') { |
1361 | 60.6k | advance(subj); |
1362 | 60.6k | new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![")); |
1363 | 60.6k | push_bracket(subj, true, new_inl); |
1364 | 874k | } else { |
1365 | 874k | new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!")); |
1366 | 874k | } |
1367 | 935k | break; |
1368 | 13.4M | default: |
1369 | 13.4M | endpos = subject_find_special_char(subj, options); |
1370 | 13.4M | contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); |
1371 | 13.4M | startpos = subj->pos; |
1372 | 13.4M | subj->pos = endpos; |
1373 | | |
1374 | | // if we're at a newline, strip trailing spaces. |
1375 | 13.4M | if (S_is_line_end_char(peek_char(subj))) { |
1376 | 1.67M | cmark_chunk_rtrim(&contents); |
1377 | 1.67M | } |
1378 | | |
1379 | 13.4M | new_inl = make_str(subj, startpos, endpos - 1, contents); |
1380 | 33.3M | } |
1381 | 33.3M | if (new_inl != NULL) { |
1382 | 33.0M | append_child(parent, new_inl); |
1383 | 33.0M | } |
1384 | | |
1385 | 33.3M | return 1; |
1386 | 33.3M | } |
1387 | | |
1388 | | // Parse inlines from parent's string_content, adding as children of parent. |
1389 | | void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, |
1390 | 850k | cmark_reference_map *refmap, int options) { |
1391 | 850k | int internal_offset = parent->type == CMARK_NODE_HEADING ? |
1392 | 766k | parent->as.heading.internal_offset : 0; |
1393 | 850k | subject subj; |
1394 | 850k | cmark_chunk content = {parent->data, parent->len}; |
1395 | 850k | subject_from_buf(mem, parent->start_line, parent->start_column - 1 + internal_offset, &subj, &content, refmap); |
1396 | 850k | cmark_chunk_rtrim(&subj.input); |
1397 | | |
1398 | 34.1M | while (!is_eof(&subj) && parse_inline(&subj, parent, options)) |
1399 | 33.3M | ; |
1400 | | |
1401 | 850k | process_emphasis(&subj, 0); |
1402 | | // free bracket and delim stack |
1403 | 850k | while (subj.last_delim) { |
1404 | 0 | remove_delimiter(&subj, subj.last_delim); |
1405 | 0 | } |
1406 | 1.32M | while (subj.last_bracket) { |
1407 | 479k | pop_bracket(&subj); |
1408 | 479k | } |
1409 | 850k | } |
1410 | | |
1411 | | // Parse zero or more space characters, including at most one newline. |
1412 | 153k | static void spnl(subject *subj) { |
1413 | 153k | skip_spaces(subj); |
1414 | 153k | if (skip_line_end(subj)) { |
1415 | 78.7k | skip_spaces(subj); |
1416 | 78.7k | } |
1417 | 153k | } |
1418 | | |
1419 | | // Parse reference. Assumes string begins with '[' character. |
1420 | | // Modify refmap if a reference is encountered. |
1421 | | // Return 0 if no reference found, otherwise position of subject |
1422 | | // after reference is parsed. |
1423 | | bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input, |
1424 | 152k | cmark_reference_map *refmap) { |
1425 | 152k | subject subj; |
1426 | | |
1427 | 152k | cmark_chunk lab; |
1428 | 152k | cmark_chunk url; |
1429 | 152k | cmark_chunk title; |
1430 | | |
1431 | 152k | bufsize_t matchlen = 0; |
1432 | 152k | bufsize_t beforetitle; |
1433 | | |
1434 | 152k | subject_from_buf(mem, -1, 0, &subj, input, NULL); |
1435 | | |
1436 | | // parse label: |
1437 | 152k | if (!link_label(&subj, &lab) || lab.len == 0) |
1438 | 37.5k | return 0; |
1439 | | |
1440 | | // colon: |
1441 | 114k | if (peek_char(&subj) == ':') { |
1442 | 78.1k | advance(&subj); |
1443 | 78.1k | } else { |
1444 | 36.5k | return 0; |
1445 | 36.5k | } |
1446 | | |
1447 | | // parse link url: |
1448 | 78.1k | spnl(&subj); |
1449 | 78.1k | if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) { |
1450 | 75.5k | subj.pos += matchlen; |
1451 | 75.5k | } else { |
1452 | 2.57k | return 0; |
1453 | 2.57k | } |
1454 | | |
1455 | | // parse optional link_title |
1456 | 75.5k | beforetitle = subj.pos; |
1457 | 75.5k | spnl(&subj); |
1458 | 75.5k | matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos); |
1459 | 75.5k | if (matchlen) { |
1460 | 2.77k | title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); |
1461 | 2.77k | subj.pos += matchlen; |
1462 | 72.7k | } else { |
1463 | 72.7k | subj.pos = beforetitle; |
1464 | 72.7k | title = cmark_chunk_literal(""); |
1465 | 72.7k | } |
1466 | | |
1467 | | // parse final spaces and newline: |
1468 | 75.5k | skip_spaces(&subj); |
1469 | 75.5k | if (!skip_line_end(&subj)) { |
1470 | 10.1k | if (matchlen) { // try rewinding before title |
1471 | 2.74k | subj.pos = beforetitle; |
1472 | 2.74k | skip_spaces(&subj); |
1473 | 2.74k | if (!skip_line_end(&subj)) { |
1474 | 420 | return 0; |
1475 | 420 | } |
1476 | 7.38k | } else { |
1477 | 7.38k | return 0; |
1478 | 7.38k | } |
1479 | 10.1k | } |
1480 | | // insert reference into refmap |
1481 | 67.7k | cmark_reference_create(refmap, &lab, &url, &title); |
1482 | 67.7k | return subj.pos; |
1483 | 75.5k | } |