Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | #include <stdlib.h>  | 
2  |  | #include <string.h>  | 
3  |  | #include <stdio.h>  | 
4  |  |  | 
5  |  | #include "cmark_ctype.h"  | 
6  |  | #include "config.h"  | 
7  |  | #include "node.h"  | 
8  |  | #include "parser.h"  | 
9  |  | #include "references.h"  | 
10  |  | #include "cmark.h"  | 
11  |  | #include "houdini.h"  | 
12  |  | #include "utf8.h"  | 
13  |  | #include "scanners.h"  | 
14  |  | #include "inlines.h"  | 
15  |  |  | 
16  |  | static const char *EMDASH = "\xE2\x80\x94"; /* UTF-8 for U+2014 EM DASH */  | 
17  |  | static const char *ENDASH = "\xE2\x80\x93"; /* UTF-8 for U+2013 EN DASH */  | 
18  |  | static const char *ELLIPSES = "\xE2\x80\xA6"; /* UTF-8 for U+2026 HORIZONTAL ELLIPSIS */  | 
19  |  | static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C"; /* UTF-8 for U+201C */  | 
20  |  | static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; /* UTF-8 for U+201D */  | 
21  |  | static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; /* UTF-8 for U+2018 */  | 
22  |  | static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; /* UTF-8 for U+2019 */  | 
23  |  |  | 
24  |  | // Macros for creating various kinds of simple (value-less) inline nodes via make_simple.  | 
25  | 23.8k  | #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)  | 
26  | 2.17M  | #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)  | 
27  | 285k  | #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)  | 
28  | 559k  | #define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)  | 
29  |  |  | 
30  | 1.00G  | #define MAXBACKTICKS 1000 /* longest backtick run whose position is cached in subject.backticks; longer openers never match */  | 
31  |  |  | 
32  |  | typedef struct delimiter { /* one entry of the subject's doubly linked delimiter list */ | 
33  |  |   struct delimiter *previous; /* entry pushed before this one, or NULL */  | 
34  |  |   struct delimiter *next; /* entry pushed after this one; NULL for the newest */  | 
35  |  |   cmark_node *inl_text; /* text node created for this delimiter run */  | 
36  |  |   bufsize_t position; /* subj->pos when pushed, i.e. just past the scanned run */  | 
37  |  |   bufsize_t length; /* inl_text->len at push time */  | 
38  |  |   unsigned char delim_char; /* the delimiter character passed to push_delimiter */  | 
39  |  |   bool can_open; /* may open a span, as computed by scan_delims */  | 
40  |  |   bool can_close; /* may close a span, as computed by scan_delims */  | 
41  |  | } delimiter;  | 
42  |  |  | 
43  |  | typedef struct bracket { /* one entry of the subject's singly linked bracket stack */ | 
44  |  |   struct bracket *previous; /* bracket pushed before this one, or NULL */  | 
45  |  |   cmark_node *inl_text; /* node supplied by the caller of push_bracket */  | 
46  |  |   bufsize_t position; /* subj->pos at the time of the push */  | 
47  |  |   bool image; /* flag passed to push_bracket; presumably true for an image opener - confirm at call sites */  | 
48  |  |   bool active; /* set to true on push */  | 
49  |  |   bool bracket_after; /* becomes true once a later bracket is pushed on top of this one */  | 
50  |  | } bracket;  | 
51  |  |  | 
52  | 43.5k  | #define FLAG_SKIP_HTML_CDATA        (1u << 0) /* NOTE(review): four distinct bit masks; presumably OR-ed into subject.flags - confirm at use sites */  | 
53  | 254k  | #define FLAG_SKIP_HTML_DECLARATION  (1u << 1)  | 
54  | 108k  | #define FLAG_SKIP_HTML_PI           (1u << 2)  | 
55  | 543k  | #define FLAG_SKIP_HTML_COMMENT      (1u << 3)  | 
56  |  |  | 
57  |  | typedef struct { /* state of one inline-parsing pass over a chunk of text */ | 
58  |  |   cmark_mem *mem; /* allocator used for nodes, delimiters and brackets */  | 
59  |  |   cmark_chunk input; /* the text being parsed */  | 
60  |  |   unsigned flags; /* starts at 0; NOTE(review): presumably FLAG_SKIP_HTML_* bits - confirm */  | 
61  |  |   int line; /* line number stamped on newly created nodes */  | 
62  |  |   bufsize_t pos; /* current byte offset into input */  | 
63  |  |   int block_offset; /* added to node columns by make_literal */  | 
64  |  |   int column_offset; /* added to node columns; rewritten after multi-line matches */  | 
65  |  |   cmark_reference_map *refmap; /* reference map handed to subject_from_buf */  | 
66  |  |   delimiter *last_delim; /* newest entry of the delimiter list, or NULL */  | 
67  |  |   bracket *last_bracket; /* newest entry of the bracket stack, or NULL */  | 
68  |  |   bufsize_t backticks[MAXBACKTICKS + 1]; /* start position of the latest scanned backtick run, indexed by run length */  | 
69  |  |   bool scanned_for_backticks; /* set once a scan reached end of input without finding a closer */  | 
70  |  |   bool no_link_openers; /* starts true; cleared when a non-image bracket is pushed */  | 
71  |  | } subject;  | 
72  |  |  | 
73  | 13.4M  | static CMARK_INLINE bool S_is_line_end_char(char c) { | 
74  | 13.4M  |   return (c == '\n' || c == '\r');  | 
75  | 13.4M  | }  | 
76  |  |  | 
77  |  | static delimiter *S_insert_emph(subject *subj, delimiter *opener,  | 
78  |  |                                 delimiter *closer);  | 
79  |  |  | 
80  |  | static int parse_inline(subject *subj, cmark_node *parent, int options);  | 
81  |  |  | 
82  |  | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,  | 
83  |  |                              cmark_chunk *chunk, cmark_reference_map *refmap);  | 
84  |  | static bufsize_t subject_find_special_char(subject *subj, int options);  | 
85  |  |  | 
86  |  | // Create an inline with a literal string value.  | 
87  |  | static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,  | 
88  | 30.8M  |                                              int start_column, int end_column) { | 
89  | 30.8M  |   cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));  | 
90  | 30.8M  |   e->mem = subj->mem;  | 
91  | 30.8M  |   e->type = (uint16_t)t;  | 
92  | 30.8M  |   e->start_line = e->end_line = subj->line;  | 
93  |  |   // columns are 1 based.  | 
94  | 30.8M  |   e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;  | 
95  | 30.8M  |   e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;  | 
96  | 30.8M  |   return e;  | 
97  | 30.8M  | }  | 
98  |  |  | 
99  |  | // Create an inline with no value.  | 
100  | 2.92M  | static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { | 
101  | 2.92M  |   cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));  | 
102  | 2.92M  |   e->mem = mem;  | 
103  | 2.92M  |   e->type = t;  | 
104  | 2.92M  |   return e;  | 
105  | 2.92M  | }  | 
106  |  |  | 
107  | 29.3M  | static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) { | 
108  | 29.3M  |   cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);  | 
109  | 29.3M  |   e->data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);  | 
110  | 29.3M  |   if (s.data != NULL) { | 
111  | 29.3M  |     memcpy(e->data, s.data, s.len);  | 
112  | 29.3M  |   }  | 
113  | 29.3M  |   e->data[s.len] = 0;  | 
114  | 29.3M  |   e->len = s.len;  | 
115  | 29.3M  |   return e;  | 
116  | 29.3M  | }  | 
117  |  |  | 
118  |  | static cmark_node *make_str_from_buf(subject *subj, int sc, int ec,  | 
119  | 1.10M  |                                      cmark_strbuf *buf) { | 
120  | 1.10M  |   cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);  | 
121  | 1.10M  |   e->len = buf->size;  | 
122  | 1.10M  |   e->data = cmark_strbuf_detach(buf);  | 
123  | 1.10M  |   return e;  | 
124  | 1.10M  | }  | 
125  |  |  | 
126  |  | // Like make_str, but parses entities.  | 
127  |  | static cmark_node *make_str_with_entities(subject *subj,  | 
128  |  |                                           int start_column, int end_column,  | 
129  | 90.3k  |                                           cmark_chunk *content) { | 
130  | 90.3k  |   cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);  | 
131  |  |  | 
132  | 90.3k  |   if (houdini_unescape_html(&unescaped, content->data, content->len)) { | 
133  | 2.06k  |     return make_str_from_buf(subj, start_column, end_column, &unescaped);  | 
134  | 88.2k  |   } else { | 
135  | 88.2k  |     return make_str(subj, start_column, end_column, *content);  | 
136  | 88.2k  |   }  | 
137  | 90.3k  | }  | 
138  |  |  | 
139  |  | // Like cmark_node_append_child but without costly sanity checks.  | 
140  |  | // Assumes that child was newly created.  | 
141  | 37.1M  | static void append_child(cmark_node *node, cmark_node *child) { | 
142  | 37.1M  |   cmark_node *old_last_child = node->last_child;  | 
143  |  |  | 
144  | 37.1M  |   child->next = NULL;  | 
145  | 37.1M  |   child->prev = old_last_child;  | 
146  | 37.1M  |   child->parent = node;  | 
147  | 37.1M  |   node->last_child = child;  | 
148  |  |  | 
149  | 37.1M  |   if (old_last_child) { | 
150  | 35.6M  |     old_last_child->next = child;  | 
151  | 35.6M  |   } else { | 
152  |  |     // Also set first_child if node previously had no children.  | 
153  | 1.52M  |     node->first_child = child;  | 
154  | 1.52M  |   }  | 
155  | 37.1M  | }  | 
156  |  |  | 
157  |  | // Duplicate a chunk by creating a copy of the buffer not by reusing the  | 
158  |  | // buffer like cmark_chunk_dup does.  | 
159  | 387k  | static unsigned char *cmark_strdup(cmark_mem *mem, unsigned char *src) { | 
160  | 387k  |   if (src == NULL) { | 
161  | 190k  |     return NULL;  | 
162  | 190k  |   }  | 
163  | 197k  |   size_t len = strlen((char *)src);  | 
164  | 197k  |   unsigned char *data = (unsigned char *)mem->realloc(NULL, len + 1);  | 
165  | 197k  |   memcpy(data, src, len + 1);  | 
166  | 197k  |   return data;  | 
167  | 387k  | }  | 
168  |  |  | 
169  |  | static unsigned char *cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,  | 
170  | 90.3k  |                                            int is_email) { | 
171  | 90.3k  |   cmark_strbuf buf = CMARK_BUF_INIT(mem);  | 
172  |  |  | 
173  | 90.3k  |   cmark_chunk_trim(url);  | 
174  |  |  | 
175  | 90.3k  |   if (is_email)  | 
176  | 72.6k  |     cmark_strbuf_puts(&buf, "mailto:");  | 
177  |  |  | 
178  | 90.3k  |   houdini_unescape_html_f(&buf, url->data, url->len);  | 
179  | 90.3k  |   return cmark_strbuf_detach(&buf);  | 
180  | 90.3k  | }  | 
181  |  |  | 
182  |  | static CMARK_INLINE cmark_node *make_autolink(subject *subj,  | 
183  |  |                                               int start_column, int end_column,  | 
184  | 90.3k  |                                               cmark_chunk url, int is_email) { | 
185  | 90.3k  |   cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);  | 
186  | 90.3k  |   link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);  | 
187  | 90.3k  |   link->as.link.title = NULL;  | 
188  | 90.3k  |   link->start_line = link->end_line = subj->line;  | 
189  | 90.3k  |   link->start_column = start_column + 1;  | 
190  | 90.3k  |   link->end_column = end_column + 1;  | 
191  | 90.3k  |   append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));  | 
192  | 90.3k  |   return link;  | 
193  | 90.3k  | }  | 
194  |  |  | 
195  |  | static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,  | 
196  | 1.00M  |                              cmark_chunk *chunk, cmark_reference_map *refmap) { | 
197  | 1.00M  |   int i;  | 
198  | 1.00M  |   e->mem = mem;  | 
199  | 1.00M  |   e->input = *chunk;  | 
200  | 1.00M  |   e->flags = 0;  | 
201  | 1.00M  |   e->line = line_number;  | 
202  | 1.00M  |   e->pos = 0;  | 
203  | 1.00M  |   e->block_offset = block_offset;  | 
204  | 1.00M  |   e->column_offset = 0;  | 
205  | 1.00M  |   e->refmap = refmap;  | 
206  | 1.00M  |   e->last_delim = NULL;  | 
207  | 1.00M  |   e->last_bracket = NULL;  | 
208  | 1.00G  |   for (i = 0; i <= MAXBACKTICKS; i++) { | 
209  | 1.00G  |     e->backticks[i] = 0;  | 
210  | 1.00G  |   }  | 
211  | 1.00M  |   e->scanned_for_backticks = false;  | 
212  | 1.00M  |   e->no_link_openers = true;  | 
213  | 1.00M  | }  | 
214  |  |  | 
215  | 2.26M  | static CMARK_INLINE int isbacktick(int c) { return (c == '`'); } | 
216  |  |  | 
217  | 255M  | static CMARK_INLINE unsigned char peek_char(subject *subj) { | 
218  |  |   // NULL bytes should have been stripped out by now.  If they're  | 
219  |  |   // present, it's a programming error:  | 
220  | 255M  |   assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0));  | 
221  | 255M  |   return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0;  | 
222  | 255M  | }  | 
223  |  |  | 
224  | 10.3M  | static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) { | 
225  | 10.3M  |   return subj->input.data[pos];  | 
226  | 10.3M  | }  | 
227  |  |  | 
228  |  | // Return true if there are more characters in the subject.  | 
229  | 35.8M  | static CMARK_INLINE int is_eof(subject *subj) { | 
230  | 35.8M  |   return (subj->pos >= subj->input.len);  | 
231  | 35.8M  | }  | 
232  |  |  | 
// Advance the subject by one byte.  Doesn't check for eof.
// The expansion is fully parenthesized so it stays a single expression
// wherever it is used.
#define advance(subj) ((subj)->pos += 1)
235  |  |  | 
236  | 2.48M  | static CMARK_INLINE bool skip_spaces(subject *subj) { | 
237  | 2.48M  |   bool skipped = false;  | 
238  | 3.19M  |   while (peek_char(subj) == ' ' || peek_char(subj) == '\t') { | 
239  | 711k  |     advance(subj);  | 
240  | 711k  |     skipped = true;  | 
241  | 711k  |   }  | 
242  | 2.48M  |   return skipped;  | 
243  | 2.48M  | }  | 
244  |  |  | 
245  | 580k  | static CMARK_INLINE bool skip_line_end(subject *subj) { | 
246  | 580k  |   bool seen_line_end_char = false;  | 
247  | 580k  |   if (peek_char(subj) == '\r') { | 
248  | 0  |     advance(subj);  | 
249  | 0  |     seen_line_end_char = true;  | 
250  | 0  |   }  | 
251  | 580k  |   if (peek_char(subj) == '\n') { | 
252  | 168k  |     advance(subj);  | 
253  | 168k  |     seen_line_end_char = true;  | 
254  | 168k  |   }  | 
255  | 580k  |   return seen_line_end_char || is_eof(subj);  | 
256  | 580k  | }  | 
257  |  |  | 
258  |  | // Take characters while a predicate holds, and return a string.  | 
259  | 315k  | static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { | 
260  | 315k  |   unsigned char c;  | 
261  | 315k  |   bufsize_t startpos = subj->pos;  | 
262  | 315k  |   bufsize_t len = 0;  | 
263  |  |  | 
264  | 2.27M  |   while ((c = peek_char(subj)) && (*f)(c)) { | 
265  | 1.96M  |     advance(subj);  | 
266  | 1.96M  |     len++;  | 
267  | 1.96M  |   }  | 
268  |  |  | 
269  | 315k  |   return cmark_chunk_dup(&subj->input, startpos, len);  | 
270  | 315k  | }  | 
271  |  |  | 
272  |  | // Return the number of newlines in a given span of text in a subject.  If  | 
273  |  | // the number is greater than zero, also return the number of characters  | 
274  |  | // between the last newline and the end of the span in `since_newline`.  | 
275  | 211k  | static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) { | 
276  | 211k  |   int nls = 0;  | 
277  | 211k  |   int since_nl = 0;  | 
278  |  |  | 
279  | 93.9M  |   while (len--) { | 
280  | 93.6M  |     if (subj->input.data[from++] == '\n') { | 
281  | 182k  |       ++nls;  | 
282  | 182k  |       since_nl = 0;  | 
283  | 93.5M  |     } else { | 
284  | 93.5M  |       ++since_nl;  | 
285  | 93.5M  |     }  | 
286  | 93.6M  |   }  | 
287  |  |  | 
288  | 211k  |   if (!nls)  | 
289  | 175k  |     return 0;  | 
290  |  |  | 
291  | 35.2k  |   *since_newline = since_nl;  | 
292  | 35.2k  |   return nls;  | 
293  | 211k  | }  | 
294  |  |  | 
295  |  | // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and  | 
296  |  | // `column_offset` according to the number of newlines in a just-matched span  | 
297  |  | // of text in `subj`.  | 
298  | 443k  | static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) { | 
299  | 443k  |   if (!(options & CMARK_OPT_SOURCEPOS)) { | 
300  | 232k  |     return;  | 
301  | 232k  |   }  | 
302  |  |  | 
303  | 211k  |   int since_newline;  | 
304  | 211k  |   int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline);  | 
305  | 211k  |   if (newlines) { | 
306  | 35.2k  |     subj->line += newlines;  | 
307  | 35.2k  |     node->end_line += newlines;  | 
308  | 35.2k  |     node->end_column = since_newline;  | 
309  | 35.2k  |     subj->column_offset = -subj->pos + since_newline + extra;  | 
310  | 35.2k  |   }  | 
311  | 211k  | }  | 
312  |  |  | 
313  |  | // Try to process a backtick code span that began with a  | 
314  |  | // span of ticks of length openticklength length (already  | 
315  |  | // parsed).  Return 0 if you don't find matching closing  | 
316  |  | // backticks, otherwise return the position in the subject  | 
317  |  | // after the closing backticks.  | 
318  |  | static bufsize_t scan_to_closing_backticks(subject *subj,  | 
319  | 315k  |                                            bufsize_t openticklength) { | 
320  |  |  | 
321  | 315k  |   bool found = false;  | 
322  | 315k  |   if (openticklength > MAXBACKTICKS) { | 
323  |  |     // we limit backtick string length because of the array subj->backticks:  | 
324  | 254  |     return 0;  | 
325  | 254  |   }  | 
326  | 315k  |   if (subj->scanned_for_backticks &&  | 
327  | 315k  |       subj->backticks[openticklength] <= subj->pos) { | 
328  |  |     // return if we already know there's no closer  | 
329  | 70.7k  |     return 0;  | 
330  | 70.7k  |   }  | 
331  | 902k  |   while (!found) { | 
332  |  |     // read non backticks  | 
333  | 902k  |     unsigned char c;  | 
334  | 146M  |     while ((c = peek_char(subj)) && c != '`') { | 
335  | 145M  |       advance(subj);  | 
336  | 145M  |     }  | 
337  | 902k  |     if (is_eof(subj)) { | 
338  | 97.2k  |       break;  | 
339  | 97.2k  |     }  | 
340  | 805k  |     bufsize_t numticks = 0;  | 
341  | 7.34M  |     while (peek_char(subj) == '`') { | 
342  | 6.53M  |       advance(subj);  | 
343  | 6.53M  |       numticks++;  | 
344  | 6.53M  |     }  | 
345  |  |     // store position of ender  | 
346  | 805k  |     if (numticks <= MAXBACKTICKS) { | 
347  | 805k  |       subj->backticks[numticks] = subj->pos - numticks;  | 
348  | 805k  |     }  | 
349  | 805k  |     if (numticks == openticklength) { | 
350  | 147k  |       return (subj->pos);  | 
351  | 147k  |     }  | 
352  | 805k  |   }  | 
353  |  |   // got through whole input without finding closer  | 
354  | 97.2k  |   subj->scanned_for_backticks = true;  | 
355  | 97.2k  |   return 0;  | 
356  | 244k  | }  | 
357  |  |  | 
358  |  | // Destructively modify string, converting newlines to  | 
359  |  | // spaces, then removing a single leading + trailing space,  | 
360  |  | // unless the code span consists entirely of space characters.  | 
361  | 147k  | static void S_normalize_code(cmark_strbuf *s) { | 
362  | 147k  |   bufsize_t r, w;  | 
363  | 147k  |   bool contains_nonspace = false;  | 
364  |  |  | 
365  | 100M  |   for (r = 0, w = 0; r < s->size; ++r) { | 
366  | 100M  |     switch (s->ptr[r]) { | 
367  | 0  |     case '\r':  | 
368  | 0  |       if (s->ptr[r + 1] != '\n') { | 
369  | 0  |         s->ptr[w++] = ' ';  | 
370  | 0  |       }  | 
371  | 0  |       break;  | 
372  | 374k  |     case '\n':  | 
373  | 374k  |       s->ptr[w++] = ' ';  | 
374  | 374k  |       break;  | 
375  | 99.6M  |     default:  | 
376  | 99.6M  |       s->ptr[w++] = s->ptr[r];  | 
377  | 100M  |     }  | 
378  | 100M  |     if (s->ptr[r] != ' ') { | 
379  | 99.2M  |       contains_nonspace = true;  | 
380  | 99.2M  |     }  | 
381  | 100M  |   }  | 
382  |  |  | 
383  |  |   // begins and ends with space?  | 
384  | 147k  |   if (contains_nonspace &&  | 
385  | 147k  |       s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') { | 
386  | 4.71k  |     cmark_strbuf_drop(s, 1);  | 
387  | 4.71k  |     cmark_strbuf_truncate(s, w - 2);  | 
388  | 142k  |   } else { | 
389  | 142k  |     cmark_strbuf_truncate(s, w);  | 
390  | 142k  |   }  | 
391  |  |  | 
392  | 147k  | }  | 
393  |  |  | 
394  |  |  | 
395  |  | // Parse backtick code section or raw backticks, return an inline.  | 
396  |  | // Assumes that the subject has a backtick at the current position.  | 
397  | 315k  | static cmark_node *handle_backticks(subject *subj, int options) { | 
398  | 315k  |   bufsize_t initpos = subj->pos;  | 
399  | 315k  |   cmark_chunk openticks = take_while(subj, isbacktick);  | 
400  | 315k  |   bufsize_t startpos = subj->pos;  | 
401  | 315k  |   bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);  | 
402  |  |  | 
403  | 315k  |   if (endpos == 0) {      // not found | 
404  | 168k  |     subj->pos = startpos; // rewind  | 
405  | 168k  |     return make_str(subj, initpos, initpos + openticks.len - 1, openticks);  | 
406  | 168k  |   } else { | 
407  | 147k  |     cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);  | 
408  |  |  | 
409  | 147k  |     cmark_strbuf_set(&buf, subj->input.data + startpos,  | 
410  | 147k  |                      endpos - startpos - openticks.len);  | 
411  | 147k  |     S_normalize_code(&buf);  | 
412  |  |  | 
413  | 147k  |     cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos,  | 
414  | 147k  |                                     endpos - openticks.len - 1);  | 
415  | 147k  |     node->len = buf.size;  | 
416  | 147k  |     node->data = cmark_strbuf_detach(&buf);  | 
417  | 147k  |     adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);  | 
418  | 147k  |     return node;  | 
419  | 147k  |   }  | 
420  | 315k  | }  | 
421  |  |  | 
422  |  |  | 
423  |  | // Scan ***, **, or * and return number scanned, or 0.  | 
424  |  | // Advances position.  | 
425  |  | static int scan_delims(subject *subj, unsigned char c, bool *can_open,  | 
426  | 2.10M  |                        bool *can_close) { | 
427  | 2.10M  |   int numdelims = 0;  | 
428  | 2.10M  |   bufsize_t before_char_pos;  | 
429  | 2.10M  |   int32_t after_char = 0;  | 
430  | 2.10M  |   int32_t before_char = 0;  | 
431  | 2.10M  |   int len;  | 
432  | 2.10M  |   bool left_flanking, right_flanking;  | 
433  |  |  | 
434  | 2.10M  |   if (subj->pos == 0) { | 
435  | 12.8k  |     before_char = 10;  | 
436  | 2.09M  |   } else { | 
437  | 2.09M  |     before_char_pos = subj->pos - 1;  | 
438  |  |     // walk back to the beginning of the UTF_8 sequence:  | 
439  | 3.53M  |     while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) { | 
440  | 1.43M  |       before_char_pos -= 1;  | 
441  | 1.43M  |     }  | 
442  | 2.09M  |     len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,  | 
443  | 2.09M  |                                  subj->pos - before_char_pos, &before_char);  | 
444  | 2.09M  |     if (len == -1) { | 
445  | 88.0k  |       before_char = 10;  | 
446  | 88.0k  |     }  | 
447  | 2.09M  |   }  | 
448  |  |  | 
449  | 2.10M  |   if (c == '\'' || c == '"') { | 
450  | 1.06M  |     numdelims++;  | 
451  | 1.06M  |     advance(subj); // limit to 1 delim for quotes  | 
452  | 1.06M  |   } else { | 
453  | 3.02M  |     while (peek_char(subj) == c) { | 
454  | 1.97M  |       numdelims++;  | 
455  | 1.97M  |       advance(subj);  | 
456  | 1.97M  |     }  | 
457  | 1.04M  |   }  | 
458  |  |  | 
459  | 2.10M  |   len = cmark_utf8proc_iterate(subj->input.data + subj->pos,  | 
460  | 2.10M  |                                subj->input.len - subj->pos, &after_char);  | 
461  | 2.10M  |   if (len == -1) { | 
462  | 246k  |     after_char = 10;  | 
463  | 246k  |   }  | 
464  | 2.10M  |   left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&  | 
465  | 2.10M  |                   (!cmark_utf8proc_is_punctuation(after_char) ||  | 
466  | 1.79M  |                    cmark_utf8proc_is_space(before_char) ||  | 
467  | 1.79M  |                    cmark_utf8proc_is_punctuation(before_char));  | 
468  | 2.10M  |   right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&  | 
469  | 2.10M  |                    (!cmark_utf8proc_is_punctuation(before_char) ||  | 
470  | 1.74M  |                     cmark_utf8proc_is_space(after_char) ||  | 
471  | 1.74M  |                     cmark_utf8proc_is_punctuation(after_char));  | 
472  | 2.10M  |   if (c == '_') { | 
473  | 442k  |     *can_open = left_flanking &&  | 
474  | 442k  |                 (!right_flanking || cmark_utf8proc_is_punctuation(before_char));  | 
475  | 442k  |     *can_close = right_flanking &&  | 
476  | 442k  |                  (!left_flanking || cmark_utf8proc_is_punctuation(after_char));  | 
477  | 1.66M  |   } else if (c == '\'' || c == '"') { | 
478  | 1.06M  |     *can_open = left_flanking &&  | 
479  | 1.06M  |          (!right_flanking || before_char == '(' || before_char == '[') && | 
480  | 1.06M  |          before_char != ']' && before_char != ')';  | 
481  | 1.06M  |     *can_close = right_flanking;  | 
482  | 1.06M  |   } else { | 
483  | 600k  |     *can_open = left_flanking;  | 
484  | 600k  |     *can_close = right_flanking;  | 
485  | 600k  |   }  | 
486  | 2.10M  |   return numdelims;  | 
487  | 2.10M  | }  | 
488  |  |  | 
489  |  | /*  | 
490  |  | static void print_delimiters(subject *subj)  | 
491  |  | { | 
492  |  |         delimiter *delim;  | 
493  |  |         delim = subj->last_delim;  | 
494  |  |         while (delim != NULL) { | 
495  |  |                 printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n", | 
496  |  |                        (void*)delim, delim->delim_char,  | 
497  |  |                        delim->can_open, delim->can_close,  | 
498  |  |                        (void*)delim->next, (void*)delim->previous);  | 
499  |  |                 delim = delim->previous;  | 
500  |  |         }  | 
501  |  | }  | 
502  |  | */  | 
503  |  |  | 
504  | 1.78M  | static void remove_delimiter(subject *subj, delimiter *delim) { | 
505  | 1.78M  |   if (delim == NULL)  | 
506  | 0  |     return;  | 
507  | 1.78M  |   if (delim->next == NULL) { | 
508  |  |     // end of list:  | 
509  | 424k  |     assert(delim == subj->last_delim);  | 
510  | 424k  |     subj->last_delim = delim->previous;  | 
511  | 1.36M  |   } else { | 
512  | 1.36M  |     delim->next->previous = delim->previous;  | 
513  | 1.36M  |   }  | 
514  | 1.78M  |   if (delim->previous != NULL) { | 
515  | 828k  |     delim->previous->next = delim->next;  | 
516  | 828k  |   }  | 
517  | 1.78M  |   subj->mem->free(delim);  | 
518  | 1.78M  | }  | 
519  |  |  | 
520  | 2.26M  | static void pop_bracket(subject *subj) { | 
521  | 2.26M  |   bracket *b;  | 
522  | 2.26M  |   if (subj->last_bracket == NULL)  | 
523  | 0  |     return;  | 
524  | 2.26M  |   b = subj->last_bracket;  | 
525  | 2.26M  |   subj->last_bracket = subj->last_bracket->previous;  | 
526  | 2.26M  |   subj->mem->free(b);  | 
527  | 2.26M  | }  | 
528  |  |  | 
529  |  | static void push_delimiter(subject *subj, unsigned char c, bool can_open,  | 
530  | 1.78M  |                            bool can_close, cmark_node *inl_text) { | 
531  | 1.78M  |   delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter));  | 
532  | 1.78M  |   delim->delim_char = c;  | 
533  | 1.78M  |   delim->can_open = can_open;  | 
534  | 1.78M  |   delim->can_close = can_close;  | 
535  | 1.78M  |   delim->inl_text = inl_text;  | 
536  | 1.78M  |   delim->position = subj->pos;  | 
537  | 1.78M  |   delim->length = inl_text->len;  | 
538  | 1.78M  |   delim->previous = subj->last_delim;  | 
539  | 1.78M  |   delim->next = NULL;  | 
540  | 1.78M  |   if (delim->previous != NULL) { | 
541  | 1.58M  |     delim->previous->next = delim;  | 
542  | 1.58M  |   }  | 
543  | 1.78M  |   subj->last_delim = delim;  | 
544  | 1.78M  | }  | 
545  |  |  | 
546  | 2.26M  | static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { | 
547  | 2.26M  |   bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket));  | 
548  | 2.26M  |   if (subj->last_bracket != NULL) { | 
549  | 1.38M  |     subj->last_bracket->bracket_after = true;  | 
550  | 1.38M  |   }  | 
551  | 2.26M  |   b->image = image;  | 
552  | 2.26M  |   b->active = true;  | 
553  | 2.26M  |   b->inl_text = inl_text;  | 
554  | 2.26M  |   b->previous = subj->last_bracket;  | 
555  | 2.26M  |   b->position = subj->pos;  | 
556  | 2.26M  |   b->bracket_after = false;  | 
557  | 2.26M  |   subj->last_bracket = b;  | 
558  | 2.26M  |   if (!image) { | 
559  | 2.20M  |     subj->no_link_openers = false;  | 
560  | 2.20M  |   }  | 
561  | 2.26M  | }  | 
562  |  |  | 
563  |  | // Assumes the subject has a c at the current position.  | 
564  | 2.10M  | static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { | 
565  | 2.10M  |   bufsize_t numdelims;  | 
566  | 2.10M  |   cmark_node *inl_text;  | 
567  | 2.10M  |   bool can_open, can_close;  | 
568  | 2.10M  |   cmark_chunk contents;  | 
569  |  |  | 
570  | 2.10M  |   numdelims = scan_delims(subj, c, &can_open, &can_close);  | 
571  |  |  | 
572  | 2.10M  |   if (c == '\'' && smart) { | 
573  | 233k  |     contents = cmark_chunk_literal(RIGHTSINGLEQUOTE);  | 
574  | 1.87M  |   } else if (c == '"' && smart) { | 
575  | 686k  |     contents =  | 
576  | 686k  |         cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);  | 
577  | 1.18M  |   } else { | 
578  | 1.18M  |     contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);  | 
579  | 1.18M  |   }  | 
580  |  |  | 
581  | 2.10M  |   inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);  | 
582  |  |  | 
583  | 2.10M  |   if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { | 
584  | 1.78M  |     push_delimiter(subj, c, can_open, can_close, inl_text);  | 
585  | 1.78M  |   }  | 
586  |  |  | 
587  | 2.10M  |   return inl_text;  | 
588  | 2.10M  | }  | 
589  |  |  | 
590  |  | // Assumes we have a hyphen at the current position.  | 
591  | 4.44M  | static cmark_node *handle_hyphen(subject *subj, bool smart) { | 
592  | 4.44M  |   int startpos = subj->pos;  | 
593  |  |  | 
594  | 4.44M  |   advance(subj);  | 
595  |  |  | 
596  | 4.44M  |   if (!smart || peek_char(subj) != '-') { | 
597  | 3.38M  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-")); | 
598  | 3.38M  |   }  | 
599  |  |  | 
600  | 4.81M  |   while (smart && peek_char(subj) == '-') { | 
601  | 3.75M  |     advance(subj);  | 
602  | 3.75M  |   }  | 
603  |  |  | 
604  | 1.06M  |   int numhyphens = subj->pos - startpos;  | 
605  | 1.06M  |   int en_count = 0;  | 
606  | 1.06M  |   int em_count = 0;  | 
607  | 1.06M  |   int i;  | 
608  | 1.06M  |   cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);  | 
609  |  |  | 
610  | 1.06M  |   if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes | 
611  | 320k  |     em_count = numhyphens / 3;  | 
612  | 745k  |   } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes | 
613  | 658k  |     en_count = numhyphens / 2;  | 
614  | 658k  |   } else if (numhyphens % 3 == 2) { // use one en dash at end | 
615  | 65.1k  |     en_count = 1;  | 
616  | 65.1k  |     em_count = (numhyphens - 2) / 3;  | 
617  | 65.1k  |   } else { // use two en dashes at the end | 
618  | 21.9k  |     en_count = 2;  | 
619  | 21.9k  |     em_count = (numhyphens - 4) / 3;  | 
620  | 21.9k  |   }  | 
621  |  |  | 
622  | 1.83M  |   for (i = em_count; i > 0; i--) { | 
623  | 766k  |     cmark_strbuf_puts(&buf, EMDASH);  | 
624  | 766k  |   }  | 
625  |  |  | 
626  | 2.32M  |   for (i = en_count; i > 0; i--) { | 
627  | 1.25M  |     cmark_strbuf_puts(&buf, ENDASH);  | 
628  | 1.25M  |   }  | 
629  |  |  | 
630  | 1.06M  |   return make_str_from_buf(subj, startpos, subj->pos - 1, &buf);  | 
631  | 4.44M  | }  | 
632  |  |  | 
633  |  | // Assumes we have a period at the current position.  | 
634  | 253k  | static cmark_node *handle_period(subject *subj, bool smart) { | 
635  | 253k  |   advance(subj);  | 
636  | 253k  |   if (smart && peek_char(subj) == '.') { | 
637  | 58.4k  |     advance(subj);  | 
638  | 58.4k  |     if (peek_char(subj) == '.') { | 
639  | 54.8k  |       advance(subj);  | 
640  | 54.8k  |       return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));  | 
641  | 54.8k  |     } else { | 
642  | 3.60k  |       return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("..")); | 
643  | 3.60k  |     }  | 
644  | 195k  |   } else { | 
645  | 195k  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal(".")); | 
646  | 195k  |   }  | 
647  | 253k  | }  | 
648  |  |  | 
649  | 1.06M  | static void process_emphasis(subject *subj, bufsize_t stack_bottom) { | 
650  | 1.06M  |   delimiter *candidate;  | 
651  | 1.06M  |   delimiter *closer = NULL;  | 
652  | 1.06M  |   delimiter *opener;  | 
653  | 1.06M  |   delimiter *old_closer;  | 
654  | 1.06M  |   bool opener_found;  | 
655  | 1.06M  |   int openers_bottom_index = 0;  | 
656  | 1.06M  |   bufsize_t openers_bottom[9] = {stack_bottom, stack_bottom, stack_bottom, | 
657  | 1.06M  |                                  stack_bottom, stack_bottom, stack_bottom,  | 
658  | 1.06M  |                                  stack_bottom, stack_bottom, stack_bottom};  | 
659  |  |  | 
660  |  |   // move back to first relevant delim.  | 
661  | 1.06M  |   candidate = subj->last_delim;  | 
662  | 2.84M  |   while (candidate != NULL && candidate->position >= stack_bottom) { | 
663  | 1.78M  |     closer = candidate;  | 
664  | 1.78M  |     candidate = candidate->previous;  | 
665  | 1.78M  |   }  | 
666  |  |  | 
667  |  |   // now move forward, looking for closers, and handling each  | 
668  | 2.98M  |   while (closer != NULL) { | 
669  | 1.92M  |     if (closer->can_close) { | 
670  | 1.50M  |       switch (closer->delim_char) { | 
671  | 468k  |       case '"':  | 
672  | 468k  |         openers_bottom_index = 0;  | 
673  | 468k  |         break;  | 
674  | 163k  |       case '\'':  | 
675  | 163k  |         openers_bottom_index = 1;  | 
676  | 163k  |         break;  | 
677  | 367k  |       case '_':  | 
678  | 367k  |         openers_bottom_index = 2;  | 
679  | 367k  |         break;  | 
680  | 501k  |       case '*':  | 
681  | 501k  |         openers_bottom_index = 3 +  | 
682  | 501k  |                 (closer->can_open ? 3 : 0) + (closer->length % 3);  | 
683  | 501k  |         break;  | 
684  | 0  |       default:  | 
685  | 0  |         assert(false);  | 
686  | 1.50M  |       }  | 
687  |  |  | 
688  |  |       // Now look backwards for first matching opener:  | 
689  | 1.50M  |       opener = closer->previous;  | 
690  | 1.50M  |       opener_found = false;  | 
691  | 1.98M  |       while (opener != NULL &&  | 
692  | 1.98M  |              opener->position >= openers_bottom[openers_bottom_index]) { | 
693  | 972k  |         if (opener->can_open && opener->delim_char == closer->delim_char) { | 
694  |  |           // interior closer of size 2 can't match opener of size 1  | 
695  |  |           // or of size 1 can't match 2  | 
696  | 501k  |           if (!(closer->can_open || opener->can_close) ||  | 
697  | 501k  |               closer->length % 3 == 0 ||  | 
698  | 501k  |               (opener->length + closer->length) % 3 != 0) { | 
699  | 491k  |             opener_found = true;  | 
700  | 491k  |             break;  | 
701  | 491k  |           }  | 
702  | 501k  |         }  | 
703  | 480k  |         opener = opener->previous;  | 
704  | 480k  |       }  | 
705  | 1.50M  |       old_closer = closer;  | 
706  | 1.50M  |       if (closer->delim_char == '*' || closer->delim_char == '_') { | 
707  | 868k  |         if (opener_found) { | 
708  | 422k  |           closer = S_insert_emph(subj, opener, closer);  | 
709  | 446k  |         } else { | 
710  | 446k  |           closer = closer->next;  | 
711  | 446k  |         }  | 
712  | 868k  |       } else if (closer->delim_char == '\'' || closer->delim_char == '"') { | 
713  | 631k  |         if (closer->delim_char == '\'') { | 
714  | 163k  |           cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);  | 
715  | 468k  |         } else { | 
716  | 468k  |           cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);  | 
717  | 468k  |         }  | 
718  | 631k  |         closer = closer->next;  | 
719  | 631k  |         if (opener_found) { | 
720  | 69.2k  |           if (old_closer->delim_char == '\'') { | 
721  | 20.2k  |             cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);  | 
722  | 49.0k  |           } else { | 
723  | 49.0k  |             cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);  | 
724  | 49.0k  |           }  | 
725  | 69.2k  |           remove_delimiter(subj, opener);  | 
726  | 69.2k  |           remove_delimiter(subj, old_closer);  | 
727  | 69.2k  |         }  | 
728  | 631k  |       }  | 
729  | 1.50M  |       if (!opener_found) { | 
730  |  |         // set lower bound for future searches for openers  | 
731  | 1.00M  |         openers_bottom[openers_bottom_index] = old_closer->position;  | 
732  | 1.00M  |         if (!old_closer->can_open) { | 
733  |  |           // we can remove a closer that can't be an  | 
734  |  |           // opener, once we've seen there's no  | 
735  |  |           // matching opener:  | 
736  | 735k  |           remove_delimiter(subj, old_closer);  | 
737  | 735k  |         }  | 
738  | 1.00M  |       }  | 
739  | 1.50M  |     } else { | 
740  | 423k  |       closer = closer->next;  | 
741  | 423k  |     }  | 
742  | 1.92M  |   }  | 
743  |  |   // free all delimiters in list until stack_bottom:  | 
744  | 1.36M  |   while (subj->last_delim != NULL &&  | 
745  | 1.36M  |          subj->last_delim->position >= stack_bottom) { | 
746  | 305k  |     remove_delimiter(subj, subj->last_delim);  | 
747  | 305k  |   }  | 
748  | 1.06M  | }  | 
749  |  |  | 
750  |  | static delimiter *S_insert_emph(subject *subj, delimiter *opener,  | 
751  | 422k  |                                 delimiter *closer) { | 
752  | 422k  |   delimiter *delim, *tmp_delim;  | 
753  | 422k  |   bufsize_t use_delims;  | 
754  | 422k  |   cmark_node *opener_inl = opener->inl_text;  | 
755  | 422k  |   cmark_node *closer_inl = closer->inl_text;  | 
756  | 422k  |   bufsize_t opener_num_chars = opener_inl->len;  | 
757  | 422k  |   bufsize_t closer_num_chars = closer_inl->len;  | 
758  | 422k  |   cmark_node *tmp, *tmpnext, *emph;  | 
759  |  |  | 
760  |  |   // calculate the actual number of characters used from this closer  | 
761  | 422k  |   use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;  | 
762  |  |  | 
763  |  |   // remove used characters from associated inlines.  | 
764  | 422k  |   opener_num_chars -= use_delims;  | 
765  | 422k  |   closer_num_chars -= use_delims;  | 
766  | 422k  |   opener_inl->len = opener_num_chars;  | 
767  | 422k  |   opener_inl->data[opener_num_chars] = 0;  | 
768  | 422k  |   closer_inl->len = closer_num_chars;  | 
769  | 422k  |   closer_inl->data[closer_num_chars] = 0;  | 
770  |  |  | 
771  |  |   // free delimiters between opener and closer  | 
772  | 422k  |   delim = closer->previous;  | 
773  | 460k  |   while (delim != NULL && delim != opener) { | 
774  | 38.0k  |     tmp_delim = delim->previous;  | 
775  | 38.0k  |     remove_delimiter(subj, delim);  | 
776  | 38.0k  |     delim = tmp_delim;  | 
777  | 38.0k  |   }  | 
778  |  |  | 
779  |  |   // create new emph or strong, and splice it in to our inlines  | 
780  |  |   // between the opener and closer  | 
781  | 422k  |   emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);  | 
782  |  |  | 
783  | 422k  |   tmp = opener_inl->next;  | 
784  | 3.98M  |   while (tmp && tmp != closer_inl) { | 
785  | 3.55M  |     tmpnext = tmp->next;  | 
786  | 3.55M  |     cmark_node_unlink(tmp);  | 
787  | 3.55M  |     append_child(emph, tmp);  | 
788  | 3.55M  |     tmp = tmpnext;  | 
789  | 3.55M  |   }  | 
790  | 422k  |   cmark_node_insert_after(opener_inl, emph);  | 
791  |  |  | 
792  | 422k  |   emph->start_line = opener_inl->start_line;  | 
793  | 422k  |   emph->end_line = closer_inl->end_line;  | 
794  | 422k  |   emph->start_column = opener_inl->start_column;  | 
795  | 422k  |   emph->end_column = closer_inl->end_column;  | 
796  |  |  | 
797  |  |   // if opener has 0 characters, remove it and its associated inline  | 
798  | 422k  |   if (opener_num_chars == 0) { | 
799  | 284k  |     cmark_node_free(opener_inl);  | 
800  | 284k  |     remove_delimiter(subj, opener);  | 
801  | 284k  |   }  | 
802  |  |  | 
803  |  |   // if closer has 0 characters, remove it and its associated inline  | 
804  | 422k  |   if (closer_num_chars == 0) { | 
805  |  |     // remove empty closer inline  | 
806  | 282k  |     cmark_node_free(closer_inl);  | 
807  |  |     // remove closer from list  | 
808  | 282k  |     tmp_delim = closer->next;  | 
809  | 282k  |     remove_delimiter(subj, closer);  | 
810  | 282k  |     closer = tmp_delim;  | 
811  | 282k  |   }  | 
812  |  |  | 
813  | 422k  |   return closer;  | 
814  | 422k  | }  | 
815  |  |  | 
816  |  | // Parse backslash-escape or just a backslash, returning an inline.  | 
817  | 628k  | static cmark_node *handle_backslash(subject *subj) { | 
818  | 628k  |   advance(subj);  | 
819  | 628k  |   unsigned char nextchar = peek_char(subj);  | 
820  | 628k  |   if (cmark_ispunct(  | 
821  | 628k  |           nextchar)) { // only ascii symbols and newline can be escaped | 
822  | 278k  |     advance(subj);  | 
823  | 278k  |     return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));  | 
824  | 350k  |   } else if (!is_eof(subj) && skip_line_end(subj)) { | 
825  | 21.6k  |     return make_linebreak(subj->mem);  | 
826  | 329k  |   } else { | 
827  | 329k  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\")); | 
828  | 329k  |   }  | 
829  | 628k  | }  | 
830  |  |  | 
831  |  | // Parse an entity or a regular "&" string.  | 
832  |  | // Assumes the subject has an '&' character at the current position.  | 
833  | 514k  | static cmark_node *handle_entity(subject *subj) { | 
834  | 514k  |   cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);  | 
835  | 514k  |   bufsize_t len;  | 
836  |  |  | 
837  | 514k  |   advance(subj);  | 
838  |  |  | 
839  | 514k  |   len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,  | 
840  | 514k  |                              subj->input.len - subj->pos);  | 
841  |  |  | 
842  | 514k  |   if (len <= 0)  | 
843  | 473k  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); | 
844  |  |  | 
845  | 41.0k  |   subj->pos += len;  | 
846  | 41.0k  |   return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent);  | 
847  | 514k  | }  | 
848  |  |  | 
849  |  | // Clean a URL: remove surrounding whitespace, and remove \ that escape  | 
850  |  | // punctuation.  | 
851  | 88.0k  | unsigned char *cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { | 
852  | 88.0k  |   cmark_strbuf buf = CMARK_BUF_INIT(mem);  | 
853  |  |  | 
854  | 88.0k  |   cmark_chunk_trim(url);  | 
855  |  |  | 
856  | 88.0k  |   houdini_unescape_html_f(&buf, url->data, url->len);  | 
857  |  |  | 
858  | 88.0k  |   cmark_strbuf_unescape(&buf);  | 
859  | 88.0k  |   return cmark_strbuf_detach(&buf);  | 
860  | 88.0k  | }  | 
861  |  |  | 
862  | 88.0k  | unsigned char *cmark_clean_title(cmark_mem *mem, cmark_chunk *title) { | 
863  | 88.0k  |   cmark_strbuf buf = CMARK_BUF_INIT(mem);  | 
864  | 88.0k  |   unsigned char first, last;  | 
865  |  |  | 
866  | 88.0k  |   if (title->len == 0) { | 
867  | 80.7k  |     return NULL;  | 
868  | 80.7k  |   }  | 
869  |  |  | 
870  | 7.28k  |   first = title->data[0];  | 
871  | 7.28k  |   last = title->data[title->len - 1];  | 
872  |  |  | 
873  |  |   // remove surrounding quotes if any:  | 
874  | 7.28k  |   if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || | 
875  | 7.28k  |       (first == '"' && last == '"')) { | 
876  | 7.28k  |     houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);  | 
877  | 7.28k  |   } else { | 
878  | 0  |     houdini_unescape_html_f(&buf, title->data, title->len);  | 
879  | 0  |   }  | 
880  |  |  | 
881  | 7.28k  |   cmark_strbuf_unescape(&buf);  | 
882  | 7.28k  |   return cmark_strbuf_detach(&buf);  | 
883  | 88.0k  | }  | 
884  |  |  | 
885  |  | // Parse an autolink or HTML tag.  | 
886  |  | // Assumes the subject has a '<' character at the current position.  | 
887  | 3.95M  | static cmark_node *handle_pointy_brace(subject *subj, int options) { | 
888  | 3.95M  |   bufsize_t matchlen = 0;  | 
889  | 3.95M  |   cmark_chunk contents;  | 
890  |  |  | 
891  | 3.95M  |   advance(subj); // advance past first <  | 
892  |  |  | 
893  |  |   // first try to match a URL autolink  | 
894  | 3.95M  |   matchlen = scan_autolink_uri(&subj->input, subj->pos);  | 
895  | 3.95M  |   if (matchlen > 0) { | 
896  | 17.6k  |     contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);  | 
897  | 17.6k  |     subj->pos += matchlen;  | 
898  |  |  | 
899  | 17.6k  |     return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);  | 
900  | 17.6k  |   }  | 
901  |  |  | 
902  |  |   // next try to match an email autolink  | 
903  | 3.93M  |   matchlen = scan_autolink_email(&subj->input, subj->pos);  | 
904  | 3.93M  |   if (matchlen > 0) { | 
905  | 72.6k  |     contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);  | 
906  | 72.6k  |     subj->pos += matchlen;  | 
907  |  |  | 
908  | 72.6k  |     return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);  | 
909  | 72.6k  |   }  | 
910  |  |  | 
911  |  |   // finally, try to match an html tag  | 
912  | 3.86M  |   if (subj->pos + 2 <= subj->input.len) { | 
913  | 3.84M  |     int c = subj->input.data[subj->pos];  | 
914  | 3.84M  |     if (c == '!' && (subj->flags & FLAG_SKIP_HTML_COMMENT) == 0) { | 
915  | 321k  |       c = subj->input.data[subj->pos+1];  | 
916  | 321k  |       if (c == '-' && subj->input.data[subj->pos+2] == '-') { | 
917  | 37.9k  |   if (subj->input.data[subj->pos+3] == '>') { | 
918  | 693  |     matchlen = 4;  | 
919  | 37.2k  |   } else if (subj->input.data[subj->pos+3] == '-' &&  | 
920  | 37.2k  |                    subj->input.data[subj->pos+4] == '>') { | 
921  | 273  |           matchlen = 5;  | 
922  | 37.0k  |         } else { | 
923  | 37.0k  |           matchlen = scan_html_comment(&subj->input, subj->pos + 1);  | 
924  | 37.0k  |           if (matchlen > 0) { | 
925  | 286  |             matchlen += 1; // prefix "<"  | 
926  | 36.7k  |     } else { // no match through end of input: set a flag so | 
927  |  |        // we don't reparse looking for -->:  | 
928  | 36.7k  |       subj->flags |= FLAG_SKIP_HTML_COMMENT;  | 
929  | 36.7k  |     }  | 
930  | 37.0k  |   }  | 
931  | 283k  |       } else if (c == '[') { | 
932  | 40.6k  |         if ((subj->flags & FLAG_SKIP_HTML_CDATA) == 0) { | 
933  | 36.8k  |           matchlen = scan_html_cdata(&subj->input, subj->pos + 2);  | 
934  | 36.8k  |           if (matchlen > 0) { | 
935  |  |             // The regex doesn't require the final "]]>". But if we're not at  | 
936  |  |             // the end of input, it must come after the match. Otherwise,  | 
937  |  |             // disable subsequent scans to avoid quadratic behavior.  | 
938  | 17.9k  |             matchlen += 5; // prefix "![", suffix "]]>"  | 
939  | 17.9k  |             if (subj->pos + matchlen > subj->input.len) { | 
940  | 2.80k  |               subj->flags |= FLAG_SKIP_HTML_CDATA;  | 
941  | 2.80k  |               matchlen = 0;  | 
942  | 2.80k  |             }  | 
943  | 17.9k  |           }  | 
944  | 36.8k  |         }  | 
945  | 242k  |       } else if ((subj->flags & FLAG_SKIP_HTML_DECLARATION) == 0) { | 
946  | 230k  |         matchlen = scan_html_declaration(&subj->input, subj->pos + 1);  | 
947  | 230k  |         if (matchlen > 0) { | 
948  | 48.1k  |           matchlen += 2; // prefix "!", suffix ">"  | 
949  | 48.1k  |           if (subj->pos + matchlen > subj->input.len) { | 
950  | 12.3k  |             subj->flags |= FLAG_SKIP_HTML_DECLARATION;  | 
951  | 12.3k  |             matchlen = 0;  | 
952  | 12.3k  |           }  | 
953  | 48.1k  |         }  | 
954  | 230k  |       }  | 
955  | 3.51M  |     } else if (c == '?') { | 
956  | 98.3k  |       if ((subj->flags & FLAG_SKIP_HTML_PI) == 0) { | 
957  |  |         // Note that we allow an empty match.  | 
958  | 76.2k  |         matchlen = scan_html_pi(&subj->input, subj->pos + 1);  | 
959  | 76.2k  |         matchlen += 3; // prefix "?", suffix "?>"  | 
960  | 76.2k  |         if (subj->pos + matchlen > subj->input.len) { | 
961  | 10.0k  |           subj->flags |= FLAG_SKIP_HTML_PI;  | 
962  | 10.0k  |           matchlen = 0;  | 
963  | 10.0k  |         }  | 
964  | 76.2k  |       }  | 
965  | 3.42M  |     } else { | 
966  | 3.42M  |       matchlen = scan_html_tag(&subj->input, subj->pos);  | 
967  | 3.42M  |     }  | 
968  | 3.84M  |   }  | 
969  | 3.86M  |   if (matchlen > 0) { | 
970  | 296k  |     const unsigned char *src = subj->input.data + subj->pos - 1;  | 
971  | 296k  |     bufsize_t len = matchlen + 1;  | 
972  | 296k  |     subj->pos += matchlen;  | 
973  | 296k  |     cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE,  | 
974  | 296k  |                                     subj->pos - matchlen - 1, subj->pos - 1);  | 
975  | 296k  |     node->data = (unsigned char *)subj->mem->realloc(NULL, len + 1);  | 
976  | 296k  |     memcpy(node->data, src, len);  | 
977  | 296k  |     node->data[len] = 0;  | 
978  | 296k  |     node->len = len;  | 
979  | 296k  |     adjust_subj_node_newlines(subj, node, matchlen, 1, options);  | 
980  | 296k  |     return node;  | 
981  | 296k  |   }  | 
982  |  |  | 
983  |  |   // if nothing matches, just return the opening <:  | 
984  | 3.56M  |   return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<")); | 
985  | 3.86M  | }  | 
986  |  |  | 
987  |  | // Parse a link label.  Returns 1 if successful.  | 
988  |  | // Note:  unescaped brackets are not allowed in labels.  | 
989  |  | // The label begins with `[` and ends with the first `]` character  | 
990  |  | // encountered.  Backticks in labels do not start code spans.  | 
991  | 1.90M  | static int link_label(subject *subj, cmark_chunk *raw_label) { | 
992  | 1.90M  |   bufsize_t startpos = subj->pos;  | 
993  | 1.90M  |   int length = 0;  | 
994  | 1.90M  |   unsigned char c;  | 
995  |  |  | 
996  |  |   // advance past [  | 
997  | 1.90M  |   if (peek_char(subj) == '[') { | 
998  | 661k  |     advance(subj);  | 
999  | 1.24M  |   } else { | 
1000  | 1.24M  |     return 0;  | 
1001  | 1.24M  |   }  | 
1002  |  |  | 
1003  | 28.2M  |   while ((c = peek_char(subj)) && c != '[' && c != ']') { | 
1004  | 27.5M  |     if (c == '\\') { | 
1005  | 41.9k  |       advance(subj);  | 
1006  | 41.9k  |       length++;  | 
1007  | 41.9k  |       if (cmark_ispunct(peek_char(subj))) { | 
1008  | 19.9k  |         advance(subj);  | 
1009  | 19.9k  |         length++;  | 
1010  | 19.9k  |       }  | 
1011  | 27.5M  |     } else { | 
1012  | 27.5M  |       advance(subj);  | 
1013  | 27.5M  |       length++;  | 
1014  | 27.5M  |     }  | 
1015  | 27.5M  |     if (length > MAX_LINK_LABEL_LENGTH) { | 
1016  | 1.91k  |       goto noMatch;  | 
1017  | 1.91k  |     }  | 
1018  | 27.5M  |   }  | 
1019  |  |  | 
1020  | 659k  |   if (c == ']') { // match found | 
1021  | 487k  |     *raw_label =  | 
1022  | 487k  |         cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));  | 
1023  | 487k  |     cmark_chunk_trim(raw_label);  | 
1024  | 487k  |     advance(subj); // advance past ]  | 
1025  | 487k  |     return 1;  | 
1026  | 487k  |   }  | 
1027  |  |  | 
1028  | 173k  | noMatch:  | 
1029  | 173k  |   subj->pos = startpos; // rewind  | 
1030  | 173k  |   return 0;  | 
1031  | 659k  | }  | 
1032  |  |  | 
1033  |  | static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset,  | 
1034  | 354k  |                                         cmark_chunk *output) { | 
1035  | 354k  |   bufsize_t i = offset;  | 
1036  | 354k  |   size_t nb_p = 0;  | 
1037  |  |  | 
1038  | 39.3M  |   while (i < input->len) { | 
1039  | 39.3M  |     if (input->data[i] == '\\' &&  | 
1040  | 39.3M  |         i + 1 < input-> len &&  | 
1041  | 39.3M  |         cmark_ispunct(input->data[i+1]))  | 
1042  | 6.01k  |       i += 2;  | 
1043  | 39.3M  |     else if (input->data[i] == '(') { | 
1044  | 66.5k  |       ++nb_p;  | 
1045  | 66.5k  |       ++i;  | 
1046  | 66.5k  |       if (nb_p > 32)  | 
1047  | 364  |         return -1;  | 
1048  | 39.2M  |     } else if (input->data[i] == ')') { | 
1049  | 26.5k  |       if (nb_p == 0)  | 
1050  | 16.8k  |         break;  | 
1051  | 9.71k  |       --nb_p;  | 
1052  | 9.71k  |       ++i;  | 
1053  | 39.2M  |     } else if (cmark_isspace(input->data[i])) { | 
1054  | 332k  |       if (i == offset) { | 
1055  | 633  |         return -1;  | 
1056  | 633  |       }  | 
1057  | 331k  |       break;  | 
1058  | 38.8M  |     } else { | 
1059  | 38.8M  |       ++i;  | 
1060  | 38.8M  |     }  | 
1061  | 39.3M  |   }  | 
1062  |  |  | 
1063  | 353k  |   if (i >= input->len || nb_p != 0)  | 
1064  | 20.8k  |     return -1;  | 
1065  |  |  | 
1066  | 332k  |   { | 
1067  | 332k  |     cmark_chunk result = {input->data + offset, i - offset}; | 
1068  | 332k  |     *output = result;  | 
1069  | 332k  |   }  | 
1070  | 332k  |   return i - offset;  | 
1071  | 353k  | }  | 
1072  |  |  | 
1073  |  | static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset,  | 
1074  | 413k  |                                       cmark_chunk *output) { | 
1075  | 413k  |   bufsize_t i = offset;  | 
1076  |  |  | 
1077  | 413k  |   if (i < input->len && input->data[i] == '<') { | 
1078  | 59.3k  |     ++i;  | 
1079  | 592k  |     while (i < input->len) { | 
1080  | 590k  |       if (input->data[i] == '>') { | 
1081  | 46.2k  |         ++i;  | 
1082  | 46.2k  |         break;  | 
1083  | 544k  |       } else if (input->data[i] == '\\')  | 
1084  | 5.33k  |         i += 2;  | 
1085  | 539k  |       else if (input->data[i] == '\n' || input->data[i] == '<')  | 
1086  | 11.0k  |         return -1;  | 
1087  | 528k  |       else  | 
1088  | 528k  |         ++i;  | 
1089  | 590k  |     }  | 
1090  | 354k  |   } else { | 
1091  | 354k  |     return manual_scan_link_url_2(input, offset, output);  | 
1092  | 354k  |   }  | 
1093  |  |  | 
1094  | 48.3k  |   if (i >= input->len)  | 
1095  | 2.12k  |     return -1;  | 
1096  |  |  | 
1097  | 46.1k  |   { | 
1098  | 46.1k  |     cmark_chunk result = {input->data + offset + 1, i - 2 - offset}; | 
1099  | 46.1k  |     *output = result;  | 
1100  | 46.1k  |   }  | 
1101  | 46.1k  |   return i - offset;  | 
1102  | 48.3k  | }  | 
1103  |  |  | 
1104  |  | // Return a link, an image, or a literal close bracket.  | 
1105  | 2.30M  | static cmark_node *handle_close_bracket(subject *subj) { | 
1106  | 2.30M  |   bufsize_t initial_pos, after_link_text_pos;  | 
1107  | 2.30M  |   bufsize_t endurl, starttitle, endtitle, endall;  | 
1108  | 2.30M  |   bufsize_t sps, n;  | 
1109  | 2.30M  |   cmark_reference *ref = NULL;  | 
1110  | 2.30M  |   cmark_chunk url_chunk, title_chunk;  | 
1111  | 2.30M  |   unsigned char *url, *title;  | 
1112  | 2.30M  |   bracket *opener;  | 
1113  | 2.30M  |   cmark_node *inl;  | 
1114  | 2.30M  |   cmark_chunk raw_label;  | 
1115  | 2.30M  |   int found_label;  | 
1116  | 2.30M  |   cmark_node *tmp, *tmpnext;  | 
1117  | 2.30M  |   bool is_image;  | 
1118  |  |  | 
1119  | 2.30M  |   advance(subj); // advance past ]  | 
1120  | 2.30M  |   initial_pos = subj->pos;  | 
1121  |  |  | 
1122  |  |   // get last [ or ![  | 
1123  | 2.30M  |   opener = subj->last_bracket;  | 
1124  |  |  | 
1125  | 2.30M  |   if (opener == NULL) { | 
1126  | 519k  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); | 
1127  | 519k  |   }  | 
1128  |  |  | 
1129  |  |   // If we got here, we matched a potential link/image text.  | 
1130  |  |   // Now we check to see if it's a link/image.  | 
1131  | 1.78M  |   is_image = opener->image;  | 
1132  |  |  | 
1133  | 1.78M  |   if (!is_image && subj->no_link_openers) { | 
1134  |  |     // take delimiter off stack  | 
1135  | 17.8k  |     pop_bracket(subj);  | 
1136  | 17.8k  |     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); | 
1137  | 17.8k  |   }  | 
1138  |  |  | 
1139  | 1.77M  |   after_link_text_pos = subj->pos;  | 
1140  |  |  | 
1141  |  |   // First, look for an inline link.  | 
1142  | 1.77M  |   if (peek_char(subj) == '(' && | 
1143  | 1.77M  |       ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&  | 
1144  | 1.77M  |       ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,  | 
1145  | 335k  |                                  &url_chunk)) > -1)) { | 
1146  |  |  | 
1147  |  |     // try to parse an explicit link:  | 
1148  | 303k  |     endurl = subj->pos + 1 + sps + n;  | 
1149  | 303k  |     starttitle = endurl + scan_spacechars(&subj->input, endurl);  | 
1150  |  |  | 
1151  |  |     // ensure there are spaces btw url and title  | 
1152  | 303k  |     endtitle = (starttitle == endurl)  | 
1153  | 303k  |                    ? starttitle  | 
1154  | 303k  |                    : starttitle + scan_link_title(&subj->input, starttitle);  | 
1155  |  |  | 
1156  | 303k  |     endall = endtitle + scan_spacechars(&subj->input, endtitle);  | 
1157  |  |  | 
1158  | 303k  |     if (peek_at(subj, endall) == ')') { | 
1159  | 20.3k  |       subj->pos = endall + 1;  | 
1160  |  |  | 
1161  | 20.3k  |       title_chunk =  | 
1162  | 20.3k  |           cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);  | 
1163  | 20.3k  |       url = cmark_clean_url(subj->mem, &url_chunk);  | 
1164  | 20.3k  |       title = cmark_clean_title(subj->mem, &title_chunk);  | 
1165  | 20.3k  |       cmark_chunk_free(&url_chunk);  | 
1166  | 20.3k  |       cmark_chunk_free(&title_chunk);  | 
1167  | 20.3k  |       goto match;  | 
1168  |  |  | 
1169  | 282k  |     } else { | 
1170  |  |       // it could still be a shortcut reference link  | 
1171  | 282k  |       subj->pos = after_link_text_pos;  | 
1172  | 282k  |     }  | 
1173  | 303k  |   }  | 
1174  |  |  | 
1175  |  |   // Next, look for a following [link label] that matches in refmap.  | 
1176  |  |   // skip spaces  | 
1177  | 1.74M  |   raw_label = cmark_chunk_literal(""); | 
1178  | 1.74M  |   found_label = link_label(subj, &raw_label);  | 
1179  | 1.74M  |   if (!found_label) { | 
1180  |  |     // If we have a shortcut reference link, back up  | 
1181  |  |     // to before the spaces we skipped.  | 
1182  | 1.38M  |     subj->pos = initial_pos;  | 
1183  | 1.38M  |   }  | 
1184  |  |  | 
1185  | 1.74M  |   if ((!found_label || raw_label.len == 0) && !opener->bracket_after) { | 
1186  | 1.30M  |     cmark_chunk_free(&raw_label);  | 
1187  | 1.30M  |     raw_label = cmark_chunk_dup(&subj->input, opener->position,  | 
1188  | 1.30M  |                                 initial_pos - opener->position - 1);  | 
1189  | 1.30M  |     found_label = true;  | 
1190  | 1.30M  |   }  | 
1191  |  |  | 
1192  | 1.74M  |   if (found_label) { | 
1193  | 1.63M  |     ref = cmark_reference_lookup(subj->refmap, &raw_label);  | 
1194  | 1.63M  |     cmark_chunk_free(&raw_label);  | 
1195  | 1.63M  |   }  | 
1196  |  |  | 
1197  | 1.74M  |   if (ref != NULL) { // found | 
1198  | 193k  |     url = cmark_strdup(subj->mem, ref->url);  | 
1199  | 193k  |     title = cmark_strdup(subj->mem, ref->title);  | 
1200  | 193k  |     goto match;  | 
1201  | 1.55M  |   } else { | 
1202  | 1.55M  |     goto noMatch;  | 
1203  | 1.55M  |   }  | 
1204  |  |  | 
1205  | 1.55M  | noMatch:  | 
1206  |  |   // If we fall through to here, it means we didn't match a link:  | 
1207  | 1.55M  |   pop_bracket(subj); // remove this opener from delimiter list  | 
1208  | 1.55M  |   subj->pos = initial_pos;  | 
1209  | 1.55M  |   return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); | 
1210  |  |  | 
1211  | 213k  | match:  | 
1212  | 213k  |   inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);  | 
1213  | 213k  |   inl->as.link.url = url;  | 
1214  | 213k  |   inl->as.link.title = title;  | 
1215  | 213k  |   inl->start_line = inl->end_line = subj->line;  | 
1216  | 213k  |   inl->start_column = opener->inl_text->start_column;  | 
1217  | 213k  |   inl->end_column = subj->pos + subj->column_offset + subj->block_offset;  | 
1218  | 213k  |   cmark_node_insert_before(opener->inl_text, inl);  | 
1219  |  |   // Add link text:  | 
1220  | 213k  |   tmp = opener->inl_text->next;  | 
1221  | 630k  |   while (tmp) { | 
1222  | 416k  |     tmpnext = tmp->next;  | 
1223  | 416k  |     cmark_node_unlink(tmp);  | 
1224  | 416k  |     append_child(inl, tmp);  | 
1225  | 416k  |     tmp = tmpnext;  | 
1226  | 416k  |   }  | 
1227  |  |  | 
1228  |  |   // Free the bracket [:  | 
1229  | 213k  |   cmark_node_free(opener->inl_text);  | 
1230  |  |  | 
1231  | 213k  |   process_emphasis(subj, opener->position);  | 
1232  | 213k  |   pop_bracket(subj);  | 
1233  |  |  | 
1234  |  |   // Now, if we have a link, we also want to deactivate links until  | 
1235  |  |   // we get a new opener. (This code can be removed if we decide to allow links  | 
1236  |  |   // inside links.)  | 
1237  | 213k  |   if (!is_image) { | 
1238  | 205k  |     subj->no_link_openers = true;  | 
1239  | 205k  |   }  | 
1240  |  |  | 
1241  | 213k  |   return NULL;  | 
1242  | 1.74M  | }  | 
1243  |  |  | 
1244  |  | // Parse a hard or soft linebreak, returning an inline.  | 
1245  |  | // Assumes the subject has a cr or newline at the current position.  | 
1246  | 2.17M  | static cmark_node *handle_newline(subject *subj) { | 
1247  | 2.17M  |   bufsize_t nlpos = subj->pos;  | 
1248  |  |   // skip over cr, crlf, or lf:  | 
1249  | 2.17M  |   if (peek_at(subj, subj->pos) == '\r') { | 
1250  | 0  |     advance(subj);  | 
1251  | 0  |   }  | 
1252  | 2.17M  |   if (peek_at(subj, subj->pos) == '\n') { | 
1253  | 2.17M  |     advance(subj);  | 
1254  | 2.17M  |   }  | 
1255  | 2.17M  |   ++subj->line;  | 
1256  | 2.17M  |   subj->column_offset = -subj->pos;  | 
1257  |  |   // skip spaces at beginning of line  | 
1258  | 2.17M  |   skip_spaces(subj);  | 
1259  | 2.17M  |   if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&  | 
1260  | 2.17M  |       peek_at(subj, nlpos - 2) == ' ') { | 
1261  | 2.22k  |     return make_linebreak(subj->mem);  | 
1262  | 2.17M  |   } else { | 
1263  | 2.17M  |     return make_softbreak(subj->mem);  | 
1264  | 2.17M  |   }  | 
1265  | 2.17M  | }  | 
1266  |  |  | 
1267  | 13.4M  | static bufsize_t subject_find_special_char(subject *subj, int options) { | 
1268  |  |   // "\r\n\\`&_*[]<!"  | 
1269  | 13.4M  |   static const int8_t SPECIAL_CHARS[256] = { | 
1270  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1271  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,  | 
1272  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1273  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,  | 
1274  | 13.4M  |       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1275  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1276  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1277  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1278  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1279  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1280  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};  | 
1281  |  |  | 
1282  |  |   // " ' . -  | 
1283  | 13.4M  |   static const char SMART_PUNCT_CHARS[] = { | 
1284  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1285  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,  | 
1286  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1287  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1288  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1289  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1290  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1291  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1292  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1293  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1294  | 13.4M  |       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
1295  | 13.4M  |   };  | 
1296  |  |  | 
1297  | 13.4M  |   bufsize_t n = subj->pos + 1;  | 
1298  |  |  | 
1299  | 283M  |   while (n < subj->input.len) { | 
1300  | 282M  |     if (SPECIAL_CHARS[subj->input.data[n]])  | 
1301  | 8.50M  |       return n;  | 
1302  | 274M  |     if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]])  | 
1303  | 4.36M  |       return n;  | 
1304  | 269M  |     n++;  | 
1305  | 269M  |   }  | 
1306  |  |  | 
1307  | 593k  |   return subj->input.len;  | 
1308  | 13.4M  | }  | 
1309  |  |  | 
1310  |  | // Parse an inline, advancing subject, and add it as a child of parent.  | 
1311  |  | // Return 0 if no inline can be parsed, 1 otherwise.  | 
1312  | 33.3M  | static int parse_inline(subject *subj, cmark_node *parent, int options) { | 
1313  | 33.3M  |   cmark_node *new_inl = NULL;  | 
1314  | 33.3M  |   cmark_chunk contents;  | 
1315  | 33.3M  |   unsigned char c;  | 
1316  | 33.3M  |   bufsize_t startpos, endpos;  | 
1317  | 33.3M  |   c = peek_char(subj);  | 
1318  | 33.3M  |   if (c == 0) { | 
1319  | 0  |     return 0;  | 
1320  | 0  |   }  | 
1321  | 33.3M  |   switch (c) { | 
1322  | 0  |   case '\r':  | 
1323  | 2.17M  |   case '\n':  | 
1324  | 2.17M  |     new_inl = handle_newline(subj);  | 
1325  | 2.17M  |     break;  | 
1326  | 315k  |   case '`':  | 
1327  | 315k  |     new_inl = handle_backticks(subj, options);  | 
1328  | 315k  |     break;  | 
1329  | 628k  |   case '\\':  | 
1330  | 628k  |     new_inl = handle_backslash(subj);  | 
1331  | 628k  |     break;  | 
1332  | 514k  |   case '&':  | 
1333  | 514k  |     new_inl = handle_entity(subj);  | 
1334  | 514k  |     break;  | 
1335  | 3.95M  |   case '<':  | 
1336  | 3.95M  |     new_inl = handle_pointy_brace(subj, options);  | 
1337  | 3.95M  |     break;  | 
1338  | 600k  |   case '*':  | 
1339  | 1.04M  |   case '_':  | 
1340  | 1.38M  |   case '\'':  | 
1341  | 2.10M  |   case '"':  | 
1342  | 2.10M  |     new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);  | 
1343  | 2.10M  |     break;  | 
1344  | 4.44M  |   case '-':  | 
1345  | 4.44M  |     new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);  | 
1346  | 4.44M  |     break;  | 
1347  | 253k  |   case '.':  | 
1348  | 253k  |     new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0);  | 
1349  | 253k  |     break;  | 
1350  | 2.20M  |   case '[':  | 
1351  | 2.20M  |     advance(subj);  | 
1352  | 2.20M  |     new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[")); | 
1353  | 2.20M  |     push_bracket(subj, false, new_inl);  | 
1354  | 2.20M  |     break;  | 
1355  | 2.30M  |   case ']':  | 
1356  | 2.30M  |     new_inl = handle_close_bracket(subj);  | 
1357  | 2.30M  |     break;  | 
1358  | 935k  |   case '!':  | 
1359  | 935k  |     advance(subj);  | 
1360  | 935k  |     if (peek_char(subj) == '[') { | 
1361  | 60.6k  |       advance(subj);  | 
1362  | 60.6k  |       new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![")); | 
1363  | 60.6k  |       push_bracket(subj, true, new_inl);  | 
1364  | 874k  |     } else { | 
1365  | 874k  |       new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!")); | 
1366  | 874k  |     }  | 
1367  | 935k  |     break;  | 
1368  | 13.4M  |   default:  | 
1369  | 13.4M  |     endpos = subject_find_special_char(subj, options);  | 
1370  | 13.4M  |     contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);  | 
1371  | 13.4M  |     startpos = subj->pos;  | 
1372  | 13.4M  |     subj->pos = endpos;  | 
1373  |  |  | 
1374  |  |     // if we're at a newline, strip trailing spaces.  | 
1375  | 13.4M  |     if (S_is_line_end_char(peek_char(subj))) { | 
1376  | 1.67M  |       cmark_chunk_rtrim(&contents);  | 
1377  | 1.67M  |     }  | 
1378  |  |  | 
1379  | 13.4M  |     new_inl = make_str(subj, startpos, endpos - 1, contents);  | 
1380  | 33.3M  |   }  | 
1381  | 33.3M  |   if (new_inl != NULL) { | 
1382  | 33.0M  |     append_child(parent, new_inl);  | 
1383  | 33.0M  |   }  | 
1384  |  |  | 
1385  | 33.3M  |   return 1;  | 
1386  | 33.3M  | }  | 
1387  |  |  | 
1388  |  | // Parse inlines from parent's string_content, adding as children of parent.  | 
1389  |  | void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,  | 
1390  | 850k  |                          cmark_reference_map *refmap, int options) { | 
1391  | 850k  |   int internal_offset = parent->type == CMARK_NODE_HEADING ?  | 
1392  | 766k  |     parent->as.heading.internal_offset : 0;  | 
1393  | 850k  |   subject subj;  | 
1394  | 850k  |   cmark_chunk content = {parent->data, parent->len}; | 
1395  | 850k  |   subject_from_buf(mem, parent->start_line, parent->start_column - 1 + internal_offset, &subj, &content, refmap);  | 
1396  | 850k  |   cmark_chunk_rtrim(&subj.input);  | 
1397  |  |  | 
1398  | 34.1M  |   while (!is_eof(&subj) && parse_inline(&subj, parent, options))  | 
1399  | 33.3M  |     ;  | 
1400  |  |  | 
1401  | 850k  |   process_emphasis(&subj, 0);  | 
1402  |  |   // free bracket and delim stack  | 
1403  | 850k  |   while (subj.last_delim) { | 
1404  | 0  |     remove_delimiter(&subj, subj.last_delim);  | 
1405  | 0  |   }  | 
1406  | 1.32M  |   while (subj.last_bracket) { | 
1407  | 479k  |     pop_bracket(&subj);  | 
1408  | 479k  |   }  | 
1409  | 850k  | }  | 
1410  |  |  | 
1411  |  | // Parse zero or more space characters, including at most one newline.  | 
1412  | 153k  | static void spnl(subject *subj) { | 
1413  | 153k  |   skip_spaces(subj);  | 
1414  | 153k  |   if (skip_line_end(subj)) { | 
1415  | 78.7k  |     skip_spaces(subj);  | 
1416  | 78.7k  |   }  | 
1417  | 153k  | }  | 
1418  |  |  | 
1419  |  | // Parse reference.  Assumes string begins with '[' character.  | 
1420  |  | // Modify refmap if a reference is encountered.  | 
1421  |  | // Return 0 if no reference found, otherwise position of subject  | 
1422  |  | // after reference is parsed.  | 
1423  |  | bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input,  | 
1424  | 152k  |                                        cmark_reference_map *refmap) { | 
1425  | 152k  |   subject subj;  | 
1426  |  |  | 
1427  | 152k  |   cmark_chunk lab;  | 
1428  | 152k  |   cmark_chunk url;  | 
1429  | 152k  |   cmark_chunk title;  | 
1430  |  |  | 
1431  | 152k  |   bufsize_t matchlen = 0;  | 
1432  | 152k  |   bufsize_t beforetitle;  | 
1433  |  |  | 
1434  | 152k  |   subject_from_buf(mem, -1, 0, &subj, input, NULL);  | 
1435  |  |  | 
1436  |  |   // parse label:  | 
1437  | 152k  |   if (!link_label(&subj, &lab) || lab.len == 0)  | 
1438  | 37.5k  |     return 0;  | 
1439  |  |  | 
1440  |  |   // colon:  | 
1441  | 114k  |   if (peek_char(&subj) == ':') { | 
1442  | 78.1k  |     advance(&subj);  | 
1443  | 78.1k  |   } else { | 
1444  | 36.5k  |     return 0;  | 
1445  | 36.5k  |   }  | 
1446  |  |  | 
1447  |  |   // parse link url:  | 
1448  | 78.1k  |   spnl(&subj);  | 
1449  | 78.1k  |   if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) { | 
1450  | 75.5k  |     subj.pos += matchlen;  | 
1451  | 75.5k  |   } else { | 
1452  | 2.57k  |     return 0;  | 
1453  | 2.57k  |   }  | 
1454  |  |  | 
1455  |  |   // parse optional link_title  | 
1456  | 75.5k  |   beforetitle = subj.pos;  | 
1457  | 75.5k  |   spnl(&subj);  | 
1458  | 75.5k  |   matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos);  | 
1459  | 75.5k  |   if (matchlen) { | 
1460  | 2.77k  |     title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);  | 
1461  | 2.77k  |     subj.pos += matchlen;  | 
1462  | 72.7k  |   } else { | 
1463  | 72.7k  |     subj.pos = beforetitle;  | 
1464  | 72.7k  |     title = cmark_chunk_literal(""); | 
1465  | 72.7k  |   }  | 
1466  |  |  | 
1467  |  |   // parse final spaces and newline:  | 
1468  | 75.5k  |   skip_spaces(&subj);  | 
1469  | 75.5k  |   if (!skip_line_end(&subj)) { | 
1470  | 10.1k  |     if (matchlen) { // try rewinding before title | 
1471  | 2.74k  |       subj.pos = beforetitle;  | 
1472  | 2.74k  |       skip_spaces(&subj);  | 
1473  | 2.74k  |       if (!skip_line_end(&subj)) { | 
1474  | 420  |         return 0;  | 
1475  | 420  |       }  | 
1476  | 7.38k  |     } else { | 
1477  | 7.38k  |       return 0;  | 
1478  | 7.38k  |     }  | 
1479  | 10.1k  |   }  | 
1480  |  |   // insert reference into refmap  | 
1481  | 67.7k  |   cmark_reference_create(refmap, &lab, &url, &title);  | 
1482  | 67.7k  |   return subj.pos;  | 
1483  | 75.5k  | }  |