/src/aspell/modules/filter/markdown.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "settings.h" |
2 | | |
3 | | #include "config.hpp" |
4 | | #include "indiv_filter.hpp" |
5 | | #include "iostream.hpp" |
6 | | #include "string_map.hpp" |
7 | | #include "asc_ctype.hpp" |
8 | | |
9 | | #include <typeinfo> |
10 | | |
11 | | //#define DEBUG_FILTER |
12 | | |
13 | | namespace { |
14 | | |
15 | | using namespace acommon; |
16 | | |
17 | | struct Iterator; |
18 | | |
19 | | struct Block { |
20 | | Block * next; |
21 | 9.85k | Block() : next() {} |
22 | | enum KeepOpenState {NEVER, MAYBE, YES}; |
23 | | virtual KeepOpenState proc_line(Iterator &) = 0; |
24 | | virtual void dump() const = 0; |
25 | | virtual bool leaf() const = 0; |
26 | 9.85k | virtual ~Block() {} |
27 | | }; |
28 | | |
29 | | struct DocRoot : Block { |
30 | 1.98M | KeepOpenState proc_line(Iterator &) {return YES;} |
31 | 0 | void dump() const {CERR.printf("DocRoot\n");} |
32 | 25.1k | bool leaf() const {return false;} |
33 | | }; |
34 | | |
35 | | struct MultilineInlineState; |
36 | | |
37 | | class MarkdownFilter : public IndividualFilter { |
38 | | public: |
39 | 172 | MarkdownFilter() : root(), back(&root), prev_blank(true), inline_state() { |
40 | 172 | name_ = "markdown-filter"; |
41 | 172 | order_num_ = 0.30; // need to be before SGML filter |
42 | 172 | } |
43 | | PosibErr<bool> setup(Config *); |
44 | | void reset(); |
45 | | ~MarkdownFilter(); |
46 | | |
47 | | void process(FilterChar * & start, FilterChar * & stop); |
48 | | |
49 | | private: |
50 | 0 | void dump() { |
51 | 0 | CERR.printf(">>>blocks\n"); |
52 | 0 | for (Block * cur = &root; cur; cur = cur->next) { |
53 | 0 | cur->dump(); |
54 | 0 | } |
55 | 0 | CERR.printf("<<<blocks\n"); |
56 | 0 | } |
57 | | |
58 | | StringMap block_start_tags; |
59 | | StringMap raw_start_tags; |
60 | | |
61 | | DocRoot root; |
62 | | Block * back; |
63 | | bool prev_blank; |
64 | | MultilineInlineState * inline_state; |
65 | | |
66 | 21.0k | void kill(Block * blk) { |
67 | 21.0k | Block * cur = &root; |
68 | 21.9k | while (cur->next && cur->next != blk) |
69 | 930 | cur = cur->next; |
70 | 21.0k | back = cur; |
71 | 21.0k | Block * next = cur->next; |
72 | 21.0k | cur->next = NULL; |
73 | 21.0k | cur = next; |
74 | 30.6k | while (cur) { |
75 | 9.55k | next = cur->next; |
76 | 9.55k | delete cur; |
77 | 9.55k | cur = next; |
78 | 9.55k | } |
79 | 21.0k | } |
80 | | |
81 | 9.55k | void add(Block * blk) { |
82 | 9.55k | back->next = blk; |
83 | 9.55k | back = blk; |
84 | 9.55k | } |
85 | | |
86 | | Block * start_block(Iterator & itr); |
87 | | }; |
88 | | |
89 | | // |
90 | | // Iterator class |
91 | | // |
92 | | |
93 | 684k | inline void blank(FilterChar & chr) { |
94 | 684k | if (!asc_isspace(chr)) |
95 | 678k | chr = ' '; |
96 | 684k | } |
97 | | |
98 | | struct Iterator { |
99 | | FilterChar * line_start; |
100 | | FilterChar * i; |
101 | | FilterChar * end; |
102 | | int line_pos; |
103 | | int indent; |
104 | | Iterator(FilterChar * start, FilterChar * stop) |
105 | 907 | : line_start(start), i(start), end(stop), line_pos(), indent() {} |
106 | 5.09M | void * pos() {return i;} |
107 | 35.0M | unsigned int operator[](int x) const { |
108 | 35.0M | if (x < 0) { |
109 | 1.55M | if (i + x >= line_start) return i[x]; |
110 | 50.7k | else return '\0'; |
111 | 33.5M | } else { |
112 | 33.5M | if (i + x >= end) return '\0'; |
113 | 33.5M | if (*i == '\r' || *i == '\n') return '\0'; |
114 | 33.3M | else return i[x]; |
115 | 33.5M | } |
116 | 35.0M | } |
117 | 10.5k | bool prev_isspace() const {return i == line_start || asc_isspace(i[-1]);} |
118 | 1.55M | bool escaped() const {return operator[](-1) == '\\';} |
119 | 21.2M | unsigned int operator *() const {return operator[](0); } |
120 | 14.2M | bool eol() const {return operator*() == '\0';} |
121 | 5.99M | bool at_end() const {return i >= end;} |
122 | 2.16M | int width() const { |
123 | 2.16M | if (i == end) return 0; |
124 | 2.16M | if (*i == '\t') return 4 - (line_pos % 4); |
125 | 2.13M | return 1; |
126 | 2.16M | } |
127 | | // u_eq = not escaped and equal |
128 | 1.55M | bool u_eq(char chr) { |
129 | 1.55M | return !escaped() && operator*() == chr; |
130 | 1.55M | } |
131 | 978k | bool eq(const char * str) { |
132 | 978k | int i = 0; |
133 | 995k | while (str[i] != '\0' && operator[](i) == str[i]) |
134 | 16.4k | ++i; |
135 | 978k | return str[i] == '\0'; |
136 | 978k | } |
137 | 2.14M | void inc() { |
138 | 2.14M | indent = 0; |
139 | 2.14M | if (eol()) return; |
140 | 2.14M | line_pos += width(); |
141 | 2.14M | ++i; |
142 | 2.14M | } |
143 | 807k | void adv(int width = 1) { |
144 | 1.62M | for (; width > 0; --width) |
145 | 817k | inc(); |
146 | 807k | eat_space(); |
147 | 807k | } |
148 | 71.3k | void blank_adv(int width = 1) { |
149 | 684k | for (; !eol() && width > 0; --width) { |
150 | 612k | blank(*i); |
151 | 612k | inc(); |
152 | 612k | } |
153 | 71.3k | eat_space(); |
154 | 71.3k | } |
155 | 1.02M | void blank_rest() { |
156 | 1.07M | while (!eol()) { |
157 | 51.4k | blank(*i); |
158 | 51.4k | inc(); |
159 | 51.4k | } |
160 | 1.02M | } |
161 | | int eat_space(); |
162 | | void next_line(); |
163 | | }; |
164 | | |
165 | 2.92M | int Iterator::eat_space() { |
166 | 2.92M | indent = 0; |
167 | 2.98M | while (!eol()) { |
168 | 989k | if (*i == ' ') { |
169 | 41.3k | ++i; |
170 | 41.3k | indent++; |
171 | 41.3k | line_pos++; |
172 | 948k | } else if (*i == '\t') { |
173 | 20.2k | int w = width(); |
174 | 20.2k | ++i; |
175 | 20.2k | indent += w; |
176 | 20.2k | line_pos += w; |
177 | 928k | } else { |
178 | 928k | break; |
179 | 928k | } |
180 | 989k | } |
181 | 2.92M | return indent; |
182 | 2.92M | } |
183 | | |
184 | 1.99M | void Iterator::next_line() { |
185 | 1.99M | while (!eol()) |
186 | 0 | inc(); |
187 | 1.99M | if (!at_end() && *i == '\r') { |
188 | 22.2k | ++i; |
189 | 22.2k | if (!at_end() && *i == '\n') { |
190 | 640 | ++i; |
191 | 640 | } |
192 | 1.97M | } else if (!at_end()) { |
193 | 1.97M | ++i; |
194 | 1.97M | } |
195 | 1.99M | line_pos = 0; |
196 | 1.99M | line_start = i; |
197 | 1.99M | } |
198 | | |
199 | | // |
200 | | // Markdown blocks |
201 | | // |
202 | | |
203 | | struct BlockQuote : Block { |
204 | 33.6k | static BlockQuote * start_block(Iterator & itr) { |
205 | 33.6k | if (*itr == '>') { |
206 | 1.15k | itr.blank_adv(); |
207 | 1.15k | return new BlockQuote(); |
208 | 1.15k | } |
209 | 32.4k | return NULL; |
210 | 33.6k | } |
211 | 4.03k | KeepOpenState proc_line(Iterator & itr) { |
212 | 4.03k | if (*itr == '>') { |
213 | 1.52k | itr.blank_adv(); |
214 | 1.52k | return YES; |
215 | 2.50k | } else if (itr.eol()) { |
216 | 695 | return NEVER; |
217 | 695 | } |
218 | 1.81k | return MAYBE; |
219 | 4.03k | } |
220 | 0 | void dump() const {CERR.printf("BlockQuote\n");} |
221 | 1.75k | bool leaf() const {return false;} |
222 | | }; |
223 | | |
224 | | struct ListItem : Block { |
225 | | char marker; // '-' '+' or '*' for bullet lists; '.' or ')' for ordered lists |
226 | | int indent; // indention required in order to be considered part of |
227 | | // the same list item |
228 | | ListItem(char m, int i) |
229 | 2.42k | : marker(m), indent(i) {} |
230 | 32.4k | static ListItem * start_block(Iterator & itr) { |
231 | 32.4k | char marker = '\0'; |
232 | 32.4k | int width = 0; |
233 | 32.4k | if (*itr == '-' || *itr == '+' || *itr == '*') { |
234 | 2.42k | marker = *itr; |
235 | 2.42k | width = 1; |
236 | 30.0k | } else if (asc_isdigit(*itr)) { |
237 | 405 | width = 1; |
238 | 501 | while (asc_isdigit(itr[width])) |
239 | 96 | width += 1; |
240 | 405 | if (itr[width] == '.' || itr[width] == ')') { |
241 | 0 | width += 1; |
242 | 0 | marker = *itr; |
243 | 0 | } |
244 | 405 | } |
245 | 32.4k | if (marker != '\0') { |
246 | 2.42k | itr.adv(width); |
247 | 2.42k | if (itr.indent <= 4) { |
248 | 2.09k | int indent = width + itr.indent; |
249 | 2.09k | itr.indent = 0; |
250 | 2.09k | return new ListItem(marker, indent); |
251 | 2.09k | } else { |
252 | 333 | int indent = 1 + itr.indent; |
253 | 333 | itr.indent -= 1; |
254 | 333 | return new ListItem(marker, indent); |
255 | 333 | } |
256 | 2.42k | } |
257 | 30.0k | return NULL; |
258 | 32.4k | } |
259 | 8.76k | KeepOpenState proc_line(Iterator & itr) { |
260 | 8.76k | if (!itr.eol() && itr.indent >= indent) { |
261 | 59 | itr.indent -= indent; |
262 | 59 | return YES; |
263 | 59 | } |
264 | 8.70k | return MAYBE; |
265 | 8.76k | } |
266 | 0 | void dump() const {CERR.printf("ListItem: '%c' %d\n", marker, indent);} |
267 | 2.45k | bool leaf() const {return false;} |
268 | | }; |
269 | | |
270 | | struct IndentedCodeBlock : Block { |
271 | 34.5k | static IndentedCodeBlock * start_block(bool prev_blank, Iterator & itr) { |
272 | 34.5k | if (prev_blank && !itr.eol() && itr.indent >= 4) { |
273 | 747 | itr.blank_rest(); |
274 | 747 | return new IndentedCodeBlock(); |
275 | 747 | } |
276 | 33.8k | return NULL; |
277 | 34.5k | } |
278 | 1.85k | KeepOpenState proc_line(Iterator & itr) { |
279 | 1.85k | if (itr.indent >= 4) { |
280 | 447 | itr.blank_rest(); |
281 | 447 | return YES; |
282 | 1.40k | } else if (itr.eol()) { |
283 | 895 | return YES; |
284 | 895 | } |
285 | 514 | return NEVER; |
286 | 1.85k | } |
287 | 0 | void dump() const {CERR.printf("IndentedCodeBlock\n");} |
288 | 747 | bool leaf() const {return true;} |
289 | | }; |
290 | | |
291 | | struct FencedCodeBlock : Block { |
292 | | char delem; |
293 | | int delem_len; |
294 | 162 | FencedCodeBlock(char d, int l) : delem(d), delem_len(l) {} |
295 | 33.8k | static FencedCodeBlock * start_block(Iterator & itr) { |
296 | 33.8k | if (*itr == '`' || *itr == '~') { |
297 | 257 | char delem = *itr; |
298 | 257 | int i = 1; |
299 | 174k | while (itr[i] == delem) |
300 | 174k | ++i; |
301 | 257 | if (i < 3) return NULL; |
302 | 162 | itr.blank_adv(i); |
303 | 162 | itr.blank_rest(); // blank info string |
304 | 162 | return new FencedCodeBlock(delem, i); |
305 | 257 | } |
306 | 33.5k | return NULL; |
307 | 33.8k | } |
308 | 1.02M | KeepOpenState proc_line(Iterator & itr) { |
309 | 1.02M | if (*itr == '`' || *itr == '~') { |
310 | 164 | char delem = *itr; |
311 | 164 | int i = 1; |
312 | 342k | while (itr[i] == delem) |
313 | 342k | ++i; |
314 | 164 | itr.blank_adv(i); |
315 | 164 | if (i >= delem_len && itr.eol()) { |
316 | 149 | return NEVER; |
317 | 149 | } |
318 | 164 | } |
319 | 1.02M | itr.blank_rest(); |
320 | 1.02M | return YES; |
321 | 1.02M | } |
322 | 0 | bool blank_rest() const { |
323 | 0 | return true; |
324 | 0 | } |
325 | 0 | void dump() const {CERR.printf("FencedCodeBlock: `%c` %d\n", delem, delem_len);} |
326 | 162 | bool leaf() const {return true;} |
327 | | }; |
328 | | |
329 | | struct SingleLineBlock : Block { |
330 | 28.5k | static SingleLineBlock * start_block(Iterator & itr) { |
331 | 28.5k | unsigned int chr = *itr; |
332 | 28.5k | switch (chr) { |
333 | 1.01k | case '-': case '_': case '*': { |
334 | 1.01k | Iterator i = itr; |
335 | 1.01k | i.adv(); |
336 | 2.43k | while (*i == *itr) |
337 | 1.41k | i.adv(); |
338 | 1.01k | if (i.eol()) { |
339 | 358 | itr = i; |
340 | 358 | return new SingleLineBlock(); |
341 | 358 | } |
342 | 661 | if (chr != '-') // fall though on '-' case |
343 | 661 | break; |
344 | 661 | } |
345 | 1.51k | case '=': { |
346 | 1.51k | Iterator i = itr; |
347 | 1.51k | i.inc(); |
348 | 1.51k | while (*i == *itr) |
349 | 0 | i.inc(); |
350 | 1.51k | i.eat_space(); |
351 | 1.51k | if (i.eol()) { |
352 | 599 | itr = i; |
353 | 599 | return new SingleLineBlock(); |
354 | 599 | } |
355 | 914 | break; |
356 | 1.51k | } |
357 | 914 | case '#': |
358 | 170 | return new SingleLineBlock(); |
359 | 0 | break; |
360 | | |
361 | 28.5k | } |
362 | 27.4k | return NULL; |
363 | 28.5k | } |
364 | 1.12k | KeepOpenState proc_line(Iterator & itr) {return NEVER;} |
365 | 1.12k | bool leaf() const {return true;} |
366 | 0 | void dump() const {CERR.printf("SingleLineBlock\n");} |
367 | | }; |
368 | | |
369 | | // |
370 | | // MultilineInline |
371 | | // |
372 | | |
373 | | struct MultilineInline { |
374 | | virtual MultilineInline * close(Iterator & itr) = 0; |
375 | 2.86k | virtual ~MultilineInline() {} |
376 | | }; |
377 | | |
378 | | struct InlineCode : MultilineInline { |
379 | | int marker_len; |
380 | 711k | MultilineInline * open(Iterator & itr) { |
381 | 711k | if (itr.u_eq('`')) { |
382 | 464 | int i = 1; |
383 | 23.1k | while (itr[i] == '`') |
384 | 22.7k | ++i; |
385 | 464 | itr.blank_adv(i); |
386 | 464 | marker_len = i; |
387 | 464 | return close(itr); |
388 | 464 | } |
389 | 710k | return NULL; |
390 | 711k | } |
391 | 1.02k | MultilineInline * close(Iterator & itr) { |
392 | 61.1k | while (!itr.eol()) { |
393 | 60.5k | if (*itr == '`') { |
394 | 14.8k | int i = 1; |
395 | 10.7M | while (i < marker_len && itr[i] == '`') |
396 | 10.7M | ++i; |
397 | 14.8k | if (i == marker_len) { |
398 | 376 | itr.blank_adv(i); |
399 | 376 | return NULL; |
400 | 376 | } |
401 | 14.8k | } |
402 | 60.1k | itr.blank_adv(); |
403 | 60.1k | } |
404 | 653 | return this; |
405 | 1.02k | } |
406 | | }; |
407 | | |
408 | | // |
409 | | // Html handling |
410 | | // |
411 | | |
412 | | struct HtmlComment : MultilineInline { |
413 | 710k | MultilineInline * open(Iterator & itr) { |
414 | 710k | if (itr.eq("<!--")) { |
415 | 745 | itr.adv(4); |
416 | 745 | return close(itr); |
417 | 745 | } |
418 | 710k | return NULL; |
419 | 710k | } |
420 | 1.37k | MultilineInline * close(Iterator & itr) { |
421 | 268k | while (!itr.eol()) { |
422 | 267k | if (itr.eq("-->")) { |
423 | 311 | itr.adv(3); |
424 | 311 | return NULL; |
425 | 311 | } |
426 | 267k | itr.inc(); |
427 | 267k | } |
428 | 1.06k | return this; |
429 | 1.37k | } |
430 | | }; |
431 | | |
432 | 19.5k | bool parse_tag_close(Iterator & itr) { |
433 | 19.5k | if (*itr == '>') { |
434 | 2.43k | itr.adv(); |
435 | 2.43k | return true; |
436 | 17.1k | } else if (*itr == '/' && itr[1] == '>') { |
437 | 290 | itr.adv(2); |
438 | 290 | return true; |
439 | 290 | } |
440 | 16.8k | return false; |
441 | 19.5k | } |
442 | | |
443 | | // note: does _not_ eat trailing whitespace |
444 | 17.0k | bool parse_tag_name(Iterator & itr, String & tag) { |
445 | 17.0k | if (asc_isalpha(*itr)) { |
446 | 11.8k | tag += asc_tolower(*itr); |
447 | 11.8k | itr.inc(); |
448 | 263k | while (asc_isalpha(*itr) || asc_isdigit(*itr) || *itr == '-') { |
449 | 251k | tag += asc_tolower(*itr); |
450 | 251k | itr.inc(); |
451 | 251k | } |
452 | 11.8k | return true; |
453 | 11.8k | } |
454 | 5.19k | return false; |
455 | 17.0k | } |
456 | | |
// Parser for a single HTML tag (opening or closing) that may extend
// over several lines when `multi_line` is set.  The outcome of the
// last open() is remembered together with the position it was started
// at, so that a second open() at the same position just replays it.
457 | | struct HtmlTag : MultilineInline { |
458 | 735 | HtmlTag(bool mlt) : start_pos(NULL), last(NULL,NULL), multi_line(mlt) {} |
459 | | void * start_pos; // used for caching |
460 | | Iterator last; // ditto |
// lower-cased tag name collected by open()
461 | | String name; |
// true when the tag started with "</"
462 | | bool closing; |
463 | | enum State {Invalid,Between,AfterName,AfterEq,InSingleQ,InDoubleQ,BeforeClose,Valid}; |
464 | | State state; |
// when false an unfinished tag at the end of a line is rejected
465 | | bool multi_line; |
// Forgets the cached start position so the next open() parses afresh.
466 | 798k | void clear_cache() { |
467 | 798k | start_pos = NULL; |
468 | 798k | } |
// Drops the cache and returns to the initial state: empty name, not a
// closing tag, state Invalid.
469 | 798k | void reset() { |
470 | 798k | clear_cache(); |
471 | 798k | name.clear(); |
472 | 798k | closing = false; |
473 | 798k | state = Invalid; |
474 | 798k | } |
// Tries to parse a tag starting at `itr`; `itr0` is the position that
// is restored on failure.  Returns `this` when the tag is unfinished
// at the end of the line, null otherwise; the verdict is in `state`.
475 | 783k | MultilineInline * open(const Iterator & itr0, Iterator & itr) { |
// same position as last time: replay the remembered result
476 | 783k | if (itr.pos() == start_pos) { |
477 | 23.1k | itr = last; |
478 | 23.1k | if (state != Invalid && state != Valid) |
479 | 1.51k | return this; |
480 | 21.6k | return NULL; |
481 | 23.1k | } |
482 | 760k | reset(); |
483 | 760k | start_pos = itr.pos(); |
484 | 760k | if (*itr == '<') { |
485 | 17.0k | itr.inc(); |
486 | 17.0k | if (*itr == '/') { |
487 | 2.23k | itr.inc(); |
488 | 2.23k | closing = true; |
489 | 2.23k | } |
490 | 17.0k | if (!parse_tag_name(itr, name)) |
491 | 5.19k | return invalid(itr0, itr); |
492 | 11.8k | state = Between; |
493 | 11.8k | if (itr.eol()) { |
494 | 2.83k | return incomplete(itr0, itr); |
495 | 9.01k | } else if (parse_tag_close(itr)) { |
496 | 2.41k | return valid(itr0, itr); |
497 | 6.60k | } else if (asc_isspace(*itr)) { |
498 | 4.18k | return close(itr0, itr); |
499 | 4.18k | } else { |
500 | 2.42k | return invalid(itr0, itr); |
501 | 2.42k | } |
502 | 11.8k | } |
503 | 743k | return invalid(itr0, itr); |
504 | 760k | } |
// Convenience overload: the current position is the restore point.
505 | 756k | MultilineInline * open(Iterator & itr) { |
506 | 756k | Iterator itr0 = itr; |
507 | 756k | return open(itr0, itr); |
508 | 756k | } |
// Parses attributes up to the end of the tag or the end of the line,
// resuming from `state`.
509 | 11.5k | MultilineInline * close(const Iterator & itr0, Iterator & itr) { |
510 | 21.3k | while (!itr.eol()) { |
511 | 14.8k | if (state == Between || state == BeforeClose) { |
512 | 10.5k | itr.eat_space(); |
513 | 10.5k | bool leading_space = itr.prev_isspace(); |
514 | | |
515 | 10.5k | if (parse_tag_close(itr)) |
516 | 317 | return valid(itr0, itr); |
517 | | |
// reject anything after BeforeClose, and an attribute that is not
// separated by whitespace from what precedes it on the same line
518 | 10.2k | if ((state == BeforeClose && !itr.eol()) |
519 | 10.2k | || (itr.line_pos != 0 && !leading_space)) |
520 | 2.68k | return invalid(itr0, itr); |
521 | 10.2k | } |
522 | | |
523 | 11.8k | state = parse_attribute(itr, state); |
524 | 11.8k | if (state == Invalid) |
525 | 2.09k | return invalid(itr0, itr); |
526 | 11.8k | } |
527 | 6.50k | return incomplete(itr0, itr); |
528 | 11.5k | } |
// MultilineInline entry point used on continuation lines.
529 | 7.41k | MultilineInline * close(Iterator & itr) { |
530 | 7.41k | Iterator itr0 = itr; |
531 | 7.41k | return close(itr0, itr); |
532 | 7.41k | } |
533 | | |
// Complete tag: remember where it ended.
534 | 2.72k | MultilineInline * valid(const Iterator & itr0, Iterator & itr) { |
535 | 2.72k | state = Valid; |
536 | 2.72k | last = itr; |
537 | 2.72k | return NULL; |
538 | 2.72k | } |
// Not a tag: put the iterator back to `itr0`.
539 | 756k | MultilineInline * invalid(const Iterator & itr0, Iterator & itr) { |
540 | 756k | state = Invalid; |
541 | 756k | itr = itr0; |
542 | 756k | last = itr; |
543 | 756k | return NULL; |
544 | 756k | } |
// Line ended inside the tag: carry on with the next line when
// multi-line tags are enabled, otherwise treat it as invalid.
545 | 9.33k | MultilineInline * incomplete(const Iterator & itr0, Iterator & itr) { |
546 | 9.33k | last = itr; |
547 | 9.33k | if (multi_line) |
548 | 9.00k | return this; |
549 | 332 | return invalid(itr0, itr); |
550 | 9.33k | } |
551 | | |
// Parses one attribute (name, optional "=" and value), resuming in the
// middle when `state` says a previous line stopped there.  Returns the
// state reached: Between after a complete attribute, one of the
// intermediate states when the line ended first, Invalid on error.
552 | | // note: does _not_ eat trailing whitespace |
553 | 11.8k | static State parse_attribute(Iterator & itr, State state) { |
554 | 11.8k | switch (state) { |
555 | | // note: this switch is being used as a computed goto to make |
556 | | // restoring state straightforward without restructuring the code |
557 | 7.57k | case Between: |
558 | 7.57k | if (asc_isalpha(*itr) || *itr == '_' || *itr == ':') { |
559 | 5.97k | itr.inc(); |
560 | 10.3k | while (asc_isalpha(*itr) || asc_isdigit(*itr) |
561 | 10.3k | || *itr == '_' || *itr == ':' || *itr == '.' || *itr == '-') |
562 | 4.38k | itr.inc(); |
563 | 7.70k | case AfterName: |
564 | 7.70k | itr.eat_space(); |
565 | 7.70k | if (itr.eol()) return AfterName; |
566 | 5.48k | if (*itr != '=') return Invalid; |
567 | 4.63k | itr.inc(); |
568 | 5.96k | case AfterEq: |
569 | 5.96k | itr.eat_space(); |
570 | 5.96k | if (itr.eol()) return AfterEq; |
571 | 4.36k | if (*itr == '\'') { |
572 | 135 | itr.inc(); |
573 | 1.12k | case InSingleQ: |
574 | 5.26k | while (!itr.eol() && *itr != '\'') |
575 | 4.14k | itr.inc(); |
576 | 1.12k | if (itr.eol()) return InSingleQ; |
577 | 129 | if (*itr != '\'') return Invalid; |
578 | 129 | itr.inc(); |
579 | 4.22k | } else if (*itr == '"') { |
580 | 425 | itr.inc(); |
581 | 640 | case InDoubleQ: |
582 | 3.62k | while (!itr.eol() && *itr != '"') |
583 | 2.98k | itr.inc(); |
584 | 640 | if (itr.eol()) return InDoubleQ; |
585 | 174 | if (*itr != '"') return Invalid; |
586 | 174 | itr.inc(); |
587 | 3.80k | } else { |
// unquoted value: must contain at least one character
588 | 3.80k | void * pos = itr.pos(); |
589 | 20.8k | while (!itr.eol() && !asc_isspace(*itr) |
590 | 20.8k | && *itr != '"' && *itr != '\'' && *itr != '=' |
591 | 20.8k | && *itr != '<' && *itr != '>' && *itr != '`') |
592 | 17.0k | itr.inc(); |
593 | 3.80k | if (pos == itr.pos()) return Invalid; |
594 | 3.80k | } |
595 | 2.85k | return Between; |
596 | 4.36k | } |
597 | 1.60k | case BeforeClose: |
598 | 1.60k | return BeforeClose; |
599 | 0 | default: //case Valid: case Invalid: |
600 | | // should not happen |
601 | 0 | break; |
602 | 11.8k | } |
603 | | // should not be here |
604 | 0 | abort(); |
605 | 11.8k | } |
606 | | }; |
607 | | |
608 | | struct HtmlBlock : Block { |
609 | 1.88k | HtmlBlock(Iterator & itr) { |
610 | 1.88k | proc_line(itr); |
611 | 1.88k | } |
612 | 5.16k | KeepOpenState proc_line(Iterator & itr) { |
613 | 5.16k | if (itr.eol()) return NEVER; |
614 | 8.65k | while (!itr.eol()) itr.inc(); |
615 | 1.96k | return YES; |
616 | 5.16k | } |
617 | 0 | void dump() const {CERR.printf("HtmlBlock\n");} |
618 | 1.88k | bool leaf() const {return true;} |
619 | | }; |
620 | | |
621 | | struct RawHtmlBlock : Block { |
622 | 563 | RawHtmlBlock(Iterator & itr, ParmStr tn) : done(false), tag(false), tag_name(tn) { |
623 | 563 | proc_line(itr); |
624 | 563 | } |
625 | | bool done; |
626 | | HtmlTag tag; |
627 | | String tag_name; |
628 | 3.35k | KeepOpenState proc_line(Iterator & itr) { |
629 | 3.35k | tag.reset(); |
630 | 3.35k | if (done) return NEVER; |
631 | 49.0k | while (!itr.eol()) { |
632 | 46.3k | tag.open(itr); |
633 | 46.3k | if (tag.state == HtmlTag::Valid && tag.closing && tag.name == tag_name) { |
634 | 349 | done = true; |
635 | 4.18k | while (!itr.eol()) itr.inc(); |
636 | 349 | return NEVER; |
637 | 349 | } |
638 | 45.9k | itr.adv(); |
639 | 45.9k | } |
640 | 2.71k | return YES; |
641 | 3.06k | } |
642 | 0 | void dump() const {CERR.printf("RawHtmlBlock: %s\n", tag_name.c_str());} |
643 | 563 | bool leaf() const {return true;} |
644 | | }; |
645 | | |
646 | | Block * start_html_block(Iterator & itr, HtmlTag & tag, |
647 | | const StringMap & start_tags, |
648 | 27.4k | const StringMap & raw_tags) { |
649 | 27.4k | Iterator itr0 = itr; |
650 | 27.4k | tag.open(itr0, itr); |
651 | 27.4k | if (!tag.closing && raw_tags.have(tag.name)) |
652 | 563 | return new RawHtmlBlock(itr,tag.name); |
653 | 26.8k | if ((tag.state == HtmlTag::Valid && itr.eol()) |
654 | 26.8k | || start_tags.have(tag.name)) { |
655 | 1.88k | return new HtmlBlock(itr); |
656 | 1.88k | } |
657 | 24.9k | itr = itr0; |
658 | 24.9k | return NULL; |
659 | 26.8k | } |
660 | | |
661 | | // |
662 | | // Link handling |
663 | | // |
664 | | |
665 | | struct Link : MultilineInline { |
666 | 1.78k | Link(bool s) : skip_ref_labels(s) {reset();} |
667 | | enum State {Invalid, BeforeUrl, AfterUrl, InSingleQ, InDoubleQ, InParanQ, AfterQuote, Valid}; |
668 | | State state; |
669 | | bool skip_ref_labels; |
670 | | struct LineState { |
671 | | Iterator itr0; |
672 | | FilterChar * blank_start; |
673 | | FilterChar * blank_stop; |
674 | | LineState(const Iterator & itr0) |
675 | 46.3k | : itr0(itr0), blank_start(NULL), blank_stop(NULL) {} |
676 | | }; |
677 | 706k | void reset() { |
678 | 706k | state = Invalid; |
679 | 706k | } |
680 | 704k | MultilineInline * open(Iterator & itr) { |
681 | 704k | reset(); |
682 | 704k | if (itr.u_eq(']')) { |
683 | | // no space allowed between ']' and '(' or '['; |
684 | 9.70k | if (itr[1] == '(') { |
685 | 5.19k | itr.adv(2); |
686 | 5.19k | return close(itr); |
687 | 5.19k | } else if (skip_ref_labels && itr[1] == '[') { |
688 | 2.29k | LineState st(itr); |
689 | 2.29k | itr.adv(2); |
690 | 2.29k | st.blank_start = itr.i; |
691 | 9.00k | while (!itr.eol() && !itr.u_eq(']')) |
692 | 6.71k | itr.adv(); |
693 | 2.29k | st.blank_stop = itr.i; |
694 | 2.29k | if (!itr.eol()) |
695 | 1.01k | return valid(st,itr); |
696 | 1.28k | else |
697 | 1.28k | return invalid(st, itr); |
698 | 2.29k | } |
699 | 9.70k | } |
700 | 697k | state = Invalid; |
701 | 697k | return NULL; |
702 | 704k | } |
703 | | |
704 | 7.62k | static State parse_url(LineState & st, Iterator & itr, char close) { |
705 | 7.62k | if (itr.eol()) |
706 | 872 | return BeforeUrl; |
707 | 6.75k | if (itr.u_eq('<')) { |
708 | 534 | st.blank_start = itr.i; |
709 | 534 | itr.adv(); |
710 | 18.4k | while (!itr.eol() && !itr.u_eq('>')) |
711 | 17.8k | itr.adv(); |
712 | 534 | if (itr.eol()) |
713 | 237 | return Invalid; |
714 | 297 | itr.adv(); |
715 | 297 | st.blank_stop = itr.i; |
716 | 6.21k | } else { |
717 | 6.21k | st.blank_start = itr.i; |
718 | 35.7k | while (!itr.eol() && !itr.u_eq(close) && !asc_isspace(*itr)) |
719 | 29.4k | itr.inc(); |
720 | 6.21k | st.blank_stop = itr.i; |
721 | 6.21k | itr.eat_space(); |
722 | 6.21k | } |
723 | 6.51k | return AfterUrl; |
724 | 6.75k | } |
725 | 6.64k | static State parse_label(State state, Iterator & itr) { |
726 | 6.64k | switch (state) { |
727 | 5.38k | default: |
728 | 5.38k | if (itr.u_eq('\'')) { |
729 | 191 | itr.inc(); |
730 | 191 | state = InSingleQ; |
731 | 191 | case InSingleQ: |
732 | 961 | while (!itr.eol() && !itr.u_eq('\'')) |
733 | 770 | itr.inc(); |
734 | 191 | if (itr.eol()) |
735 | 2 | return state; |
736 | 189 | itr.adv(); |
737 | 189 | state = AfterQuote; |
738 | 5.19k | } else if (itr.u_eq('\"')) { |
739 | 1.06k | itr.inc(); |
740 | 1.06k | state = InDoubleQ; |
741 | 2.32k | case InDoubleQ: |
742 | 21.3k | while (!itr.eol() && !itr.u_eq('"')) |
743 | 19.0k | itr.inc(); |
744 | 2.32k | if (itr.eol()) |
745 | 1.27k | return state; |
746 | 1.05k | itr.adv(); |
747 | 1.05k | state = AfterQuote; |
748 | 4.12k | } else if (itr.u_eq('(')) { |
749 | 3.20k | state = InParanQ; |
750 | 3.20k | case InParanQ: |
751 | 9.86k | while (!itr.eol() && !itr.u_eq(')')) |
752 | 6.65k | itr.inc(); |
753 | 3.20k | if (itr.eol()) |
754 | 216 | return state; |
755 | 2.98k | itr.adv(); |
756 | 2.98k | state = AfterQuote; |
757 | 2.98k | } |
758 | 6.64k | } |
759 | 5.15k | return state; |
760 | 6.64k | } |
761 | 13.9k | Link * parse_url_label(Iterator & itr, char close) { |
762 | 13.9k | LineState st(itr); |
763 | 13.9k | itr.eat_space(); |
764 | 13.9k | switch (state) { |
765 | 7.62k | default: |
766 | 7.62k | state = parse_url(st,itr,close); |
767 | 7.62k | if (state == Invalid) return invalid(st, itr); |
768 | 7.38k | if (itr.eol()) return incomplete(st, itr); |
769 | 6.33k | case AfterUrl: |
770 | 6.33k | if (close != '\0' ? itr.u_eq(close) : itr.eol()) |
771 | 951 | return valid(st, itr); |
772 | 6.64k | case InSingleQ: case InDoubleQ: |
773 | 6.64k | state = parse_label(state, itr); |
774 | 6.64k | if (state == Invalid) return invalid(st, itr); |
775 | 6.64k | if (itr.eol()) return incomplete(st, itr); |
776 | 5.15k | case AfterQuote: |
777 | 5.15k | if (close != '\0' ? itr.u_eq(close) : itr.eol()) |
778 | 3.05k | return valid(st, itr); |
779 | 2.09k | return invalid(st, itr); |
780 | 13.9k | } |
781 | 13.9k | } |
782 | 8.48k | MultilineInline * close(Iterator & itr) { |
783 | 8.48k | return parse_url_label(itr, ')'); |
784 | 8.48k | } |
785 | 14.1k | static void blank(const LineState & st) { |
786 | 34.0k | for (FilterChar * i = st.blank_start; i != st.blank_stop; ++i) { |
787 | 19.8k | ::blank(*i); |
788 | 19.8k | } |
789 | 14.1k | } |
790 | 5.02k | Link * valid(const LineState & st, Iterator & itr) { |
791 | 5.02k | itr.adv(); // skip over closing tag |
792 | 5.02k | blank(st); |
793 | 5.02k | state = Valid; |
794 | 5.02k | return NULL; |
795 | 5.02k | } |
796 | 3.61k | Link * invalid(const LineState & st, Iterator & itr) { |
797 | 3.61k | state = Invalid; |
798 | 3.61k | itr = st.itr0; |
799 | 3.61k | return NULL; |
800 | 3.61k | } |
801 | 7.61k | Link * incomplete(const LineState & st, Iterator & itr) { |
802 | 7.61k | blank(st); |
803 | 7.61k | return this; |
804 | 7.61k | } |
805 | | }; |
806 | | |
807 | | struct LinkRefDefinition : Block { |
808 | | Link link; |
809 | | Link * multiline; |
810 | 1.61k | LinkRefDefinition() : link(false) {} |
811 | 30.0k | static LinkRefDefinition * start_block(Iterator & itr, bool skip_ref_labels) { |
812 | 30.0k | Link::LineState st(itr); |
813 | 30.0k | if (*itr == '[') { |
814 | 3.19k | itr.adv(); |
815 | 3.19k | st.blank_start = itr.i; |
816 | 3.19k | if (*itr == ']') goto fail; |
817 | 17.1k | while (!itr.eol() && !itr.u_eq(']')) { |
818 | 14.4k | itr.adv(); |
819 | 14.4k | } |
820 | 2.74k | st.blank_stop = itr.i; |
821 | 2.74k | itr.inc(); |
822 | 2.74k | if (*itr != ':') goto fail; |
823 | 1.61k | itr.adv(); |
824 | 1.61k | LinkRefDefinition * obj = new LinkRefDefinition(); |
825 | 1.61k | obj->multiline = obj->link.parse_url_label(itr, '\0'); |
826 | 1.61k | if (obj->link.state == Link::Invalid) { |
827 | 130 | delete obj; |
828 | 130 | goto fail; |
829 | 130 | } |
830 | 1.48k | if (skip_ref_labels) |
831 | 1.48k | Link::blank(st); |
832 | 1.48k | return obj; |
833 | 1.61k | } |
834 | 28.5k | fail: |
835 | 28.5k | itr = st.itr0; |
836 | 28.5k | return NULL; |
837 | 30.0k | } |
838 | 3.85k | KeepOpenState proc_line(Iterator & itr) { |
839 | 3.85k | if (!multiline) |
840 | 0 | return NEVER; |
841 | 3.85k | void * pos = itr.pos(); |
842 | 3.85k | multiline = multiline->parse_url_label(itr, '\0'); |
843 | 3.85k | if (multiline) |
844 | 2.37k | return MAYBE; |
845 | 1.47k | return NEVER; |
846 | 3.85k | } |
847 | 0 | void dump() const {CERR.printf("LinkRefDefination\n");} |
848 | 1.48k | bool leaf() const {return true;} |
849 | | }; |
850 | | |
851 | | // |
852 | | // |
853 | | // |
854 | | |
855 | | struct MultilineInlineState { |
856 | 172 | MultilineInlineState(bool mlt, bool s) : ptr(), tag(mlt), link(s) {} |
857 | | MultilineInline * ptr; |
858 | | InlineCode inline_code; |
859 | | HtmlComment comment; |
860 | | HtmlTag tag; |
861 | | Link link; |
862 | 172 | void clear_cache() { |
863 | 172 | tag.clear_cache(); |
864 | 172 | } |
865 | 0 | void reset() { |
866 | 0 | tag.reset(); |
867 | 0 | link.reset(); |
868 | 0 | } |
869 | | }; |
870 | | |
871 | | // |
872 | | // MarkdownFilter implementation |
873 | | // |
874 | | |
875 | 172 | PosibErr<bool> MarkdownFilter::setup(Config * cfg) { |
876 | 172 | bool skip_ref_labels = cfg->retrieve_bool("f-markdown-skip-ref-labels"); |
877 | 172 | bool multiline_tags = cfg->retrieve_bool("f-markdown-multiline-tags"); |
878 | 172 | delete inline_state; |
879 | 172 | inline_state = new MultilineInlineState(multiline_tags, skip_ref_labels); |
880 | 172 | raw_start_tags.clear(); |
881 | 172 | cfg->retrieve_list("f-markdown-raw-start-tags", &raw_start_tags); |
882 | 172 | block_start_tags.clear(); |
883 | 172 | cfg->retrieve_list("f-markdown-block-start-tags", &block_start_tags); |
884 | | |
885 | 172 | return true; |
886 | 172 | } |
887 | | |
888 | 0 | void MarkdownFilter::reset() { |
889 | 0 | kill(root.next); |
890 | 0 | prev_blank = true; |
891 | 0 | inline_state->reset(); |
892 | 0 | } |
893 | | |
894 | | |
895 | 172 | MarkdownFilter::~MarkdownFilter() { |
896 | 172 | kill(root.next); |
897 | 172 | delete inline_state; |
898 | 172 | } |
899 | | |
// Processes the characters in [start, stop) one line at a time.  For
// each line: either resume the inline construct left open by the
// previous line, or update the chain of open blocks (let each block
// look at the line, possibly start new blocks, kill the ones that
// ended); then scan the rest of the line for inline constructs.
900 | 172 | void MarkdownFilter::process(FilterChar * & start, FilterChar * & stop) { |
901 | 172 | inline_state->clear_cache(); |
902 | 172 | Iterator itr(start,stop); |
903 | 172 | bool blank_line = false; |
904 | 1.99M | while (!itr.at_end()) { |
905 | 1.99M | itr.eat_space(); |
// an inline construct is still open from the previous line: an empty
// line drops it, otherwise let it continue
906 | 1.99M | if (inline_state->ptr) { |
907 | 13.4k | if (itr.eol()) |
908 | 1.53k | inline_state->ptr = NULL; |
909 | 11.9k | else |
910 | 11.9k | inline_state->ptr = inline_state->ptr->close(itr); |
911 | 1.98M | } else { |
// walk the chain from the root for as long as blocks answer YES
912 | 1.98M | Block * blk = &root; |
913 | 1.98M | Block::KeepOpenState keep_open; |
914 | 4.99M | for (; blk; blk = blk->next) { |
915 | 3.02M | keep_open = blk->proc_line(itr); |
916 | 3.02M | if (keep_open != Block::YES) |
917 | 11.4k | break; |
918 | 3.02M | } |
919 | | |
// no new block is attempted on an empty line, nor when every block
// answered YES and the last one is a leaf
920 | 1.98M | blank_line = itr.eol(); |
921 | 1.98M | Block * nblk = blank_line || (keep_open == Block::YES && back->leaf()) |
922 | 1.98M | ? NULL |
923 | 1.98M | : start_block(itr); |
924 | | |
// `blk` is the first block that did not answer YES (null if all did)
925 | 1.98M | if (nblk || keep_open == Block::NEVER || (prev_blank && !blank_line)) { |
926 | | #ifdef DEBUG_FILTER |
927 | | CERR.printf("*** kill\n"); |
928 | | #endif |
929 | 19.4k | kill(blk); |
930 | 1.96M | } else { |
// otherwise give the remaining blocks a look at the line and cut the
// chain at the first one that answers NEVER
931 | 1.97M | for (; blk; blk = blk->next) { |
932 | 8.58k | keep_open = blk->proc_line(itr); |
933 | 8.58k | if (keep_open == Block::NEVER) { |
934 | | #ifdef DEBUG_FILTER |
935 | | CERR.printf("***** kill\n"); |
936 | | #endif |
937 | 1.44k | kill(blk); |
938 | 1.44k | break; |
939 | 1.44k | } |
940 | 8.58k | } |
941 | 1.96M | } |
942 | | |
943 | 1.98M | if (nblk) { |
944 | | #ifdef DEBUG_FILTER |
945 | | CERR.printf("*** new block\n"); |
946 | | #endif |
947 | 7.34k | add(nblk); |
948 | 7.34k | prev_blank = true; |
949 | 7.34k | } |
950 | | |
// keep starting blocks on the same line until a leaf is reached or
// nothing more starts
951 | 1.98M | while (nblk && !nblk->leaf()) { |
952 | 3.58k | nblk = start_block(itr); |
953 | 3.58k | if (nblk) { |
954 | | #ifdef DEBUG_FILTER |
955 | | CERR.printf("*** new block\n"); |
956 | | #endif |
957 | 2.21k | add(nblk); |
958 | 2.21k | } |
959 | 3.58k | } |
960 | | |
961 | | #ifdef DEBUG_FILTER |
962 | | dump(); |
963 | | #endif |
964 | 1.98M | } |
965 | | // now process line, mainly blank inline code and handle html tags |
966 | | |
// TRY(what) runs one inline parser at the current position: it leaves
// the loop when the parser needs the next line, restarts the loop when
// the parser consumed something, and otherwise falls through
967 | 2.70M | while (!itr.eol()) { |
968 | 711k | void * pos = itr.pos(); |
969 | 711k | #define TRY(what) \ |
970 | 2.83M | inline_state->ptr = inline_state->what.open(itr); \ |
971 | 2.83M | if (inline_state->ptr) break; \ |
972 | 2.83M | if (itr.pos() != pos) continue |
973 | 711k | TRY(inline_code); |
974 | 1.42M | TRY(comment); |
975 | 1.41M | TRY(tag); |
976 | 1.40M | TRY(link); |
977 | 698k | #undef TRY |
// no parser matched here: a stray '<' or '>' is blanked, any other
// character is just stepped over
978 | 698k | if (*itr == '<' || *itr == '>') |
979 | 7.38k | itr.blank_adv(); |
980 | 691k | else |
981 | 691k | itr.adv(); |
982 | 698k | } |
983 | | |
984 | 1.99M | itr.next_line(); |
985 | | |
986 | 1.99M | prev_blank = blank_line; |
987 | 1.99M | } |
988 | 172 | } |
989 | | |
990 | 34.5k | Block * MarkdownFilter::start_block(Iterator & itr) { |
991 | 34.5k | inline_state->tag.reset(); |
992 | 34.5k | Block * nblk = NULL; |
993 | 34.5k | (nblk = IndentedCodeBlock::start_block(prev_blank, itr)) |
994 | 34.5k | || (nblk = FencedCodeBlock::start_block(itr)) |
995 | 34.5k | || (nblk = BlockQuote::start_block(itr)) |
996 | 34.5k | || (nblk = ListItem::start_block(itr)) |
997 | 34.5k | || (nblk = LinkRefDefinition::start_block(itr, inline_state->link.skip_ref_labels)) |
998 | 34.5k | || (nblk = SingleLineBlock::start_block(itr)) |
999 | 34.5k | || (nblk = start_html_block(itr, inline_state->tag, block_start_tags, raw_start_tags)); |
1000 | 34.5k | return nblk; |
1001 | 34.5k | } |
1002 | | |
1003 | | } // anon namespace |
1004 | | |
1005 | | C_EXPORT IndividualFilter * new_aspell_markdown_filter() |
1006 | 172 | { |
1007 | 172 | return new MarkdownFilter(); |
1008 | 172 | } |
1009 | | |