Coverage Report

Created: 2025-02-15 06:15

/src/md4c/src/md4c.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * MD4C: Markdown parser for C
3
 * (http://github.com/mity/md4c)
4
 *
5
 * Copyright (c) 2016-2024 Martin Mitáš
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining a
8
 * copy of this software and associated documentation files (the "Software"),
9
 * to deal in the Software without restriction, including without limitation
10
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
 * and/or sell copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included in
15
 * all copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23
 * IN THE SOFTWARE.
24
 */
25
26
#include "md4c.h"
27
28
#include <limits.h>
29
#include <stdint.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
34
35
/*****************************
36
 ***  Miscellaneous Stuff  ***
37
 *****************************/
38
39
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L
40
    /* C89/90 or old compilers in general may not understand "inline". */
41
    #if defined __GNUC__
42
        #define inline __inline__
43
    #elif defined _MSC_VER
44
        #define inline __inline
45
    #else
46
        #define inline
47
    #endif
48
#endif
49
50
/* Make the UTF-8 support the default. */
51
#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16
52
    #define MD4C_USE_UTF8
53
#endif
54
55
/* Magic for making wide literals with MD4C_USE_UTF16. */
56
#ifdef _T
57
    #undef _T
58
#endif
59
#if defined MD4C_USE_UTF16
60
    #define _T(x)           L##x
61
#else
62
4.93G
    #define _T(x)           x
63
#endif
64
65
/* Misc. macros. */
66
290M
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof(a[0]))
67
68
#define STRINGIZE_(x)       #x
69
#define STRINGIZE(x)        STRINGIZE_(x)
70
71
#define MAX(a,b)            ((a) > (b) ? (a) : (b))
72
33.2k
#define MIN(a,b)            ((a) < (b) ? (a) : (b))
73
74
#ifndef TRUE
75
283M
    #define TRUE            1
76
301M
    #define FALSE           0
77
#endif
78
79
#define MD_LOG(msg)                                                     \
80
1.12k
    do {                                                                \
81
1.12k
        if(ctx->parser.debug_log != NULL)                               \
82
1.12k
            ctx->parser.debug_log((msg), ctx->userdata);                \
83
1.12k
    } while(0)
84
85
#ifdef DEBUG
86
    #define MD_ASSERT(cond)                                             \
87
            do {                                                        \
88
                if(!(cond)) {                                           \
89
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
90
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
91
                    exit(1);                                            \
92
                }                                                       \
93
            } while(0)
94
95
    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
96
#else
97
    #ifdef __GNUC__
98
63.7M
        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
99
0
        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
100
    #elif defined _MSC_VER  &&  _MSC_VER > 120
101
        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
102
        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
103
    #else
104
        #define MD_ASSERT(cond)     do {} while(0)
105
        #define MD_UNREACHABLE()    do {} while(0)
106
    #endif
107
#endif
108
109
/* For falling through case labels in switch statements. */
110
#if defined __clang__ && __clang_major__ >= 12
111
168k
    #define MD_FALLTHROUGH()        __attribute__((fallthrough))
112
#elif defined __GNUC__ && __GNUC__ >= 7
113
    #define MD_FALLTHROUGH()        __attribute__((fallthrough))
114
#else
115
    #define MD_FALLTHROUGH()        ((void)0)
116
#endif
117
118
/* Suppress "unused parameter" warnings. */
119
36.5M
#define MD_UNUSED(x)                ((void)x)
120
121
122
/******************************
123
 ***  Some internal limits  ***
124
 ******************************/
125
126
/* We limit code span marks to lower than 32 backticks. This solves the
127
 * pathologic case of too many openers, each of different length: Their
128
 * resolving would be then O(n^2). */
129
694k
#define CODESPAN_MARK_MAXLEN    32
130
131
/* We limit column count of tables to prevent quadratic explosion of output
132
 * from pathological input of a table thousands of columns and thousands
133
 * of rows where rows are requested with as little as single character
134
 * per-line, relying on us to "helpfully" fill all the missing "<td></td>". */
135
173k
#define TABLE_MAXCOLCOUNT       128
136
137
138
/************************
139
 ***  Internal Types  ***
140
 ************************/
141
142
/* These are omnipresent so let's save some typing. */
143
162M
#define CHAR    MD_CHAR
144
101M
#define SZ      MD_SIZE
145
422M
#define OFF     MD_OFFSET
146
147
#define SZ_MAX      (sizeof(SZ) == 8 ? UINT64_MAX : UINT32_MAX)
148
#define OFF_MAX     (sizeof(OFF) == 8 ? UINT64_MAX : UINT32_MAX)
149
150
typedef struct MD_MARK_tag MD_MARK;
151
typedef struct MD_BLOCK_tag MD_BLOCK;
152
typedef struct MD_CONTAINER_tag MD_CONTAINER;
153
typedef struct MD_REF_DEF_tag MD_REF_DEF;
154
155
156
/* During analysis of inline marks, we need to manage stacks of unresolved
157
 * openers of the given type.
158
 * The stack connects the marks via MD_MARK::next;
159
 */
160
typedef struct MD_MARKSTACK_tag MD_MARKSTACK;
161
struct MD_MARKSTACK_tag {
162
    int top;        /* -1 if empty. */
163
};
164
165
/* Context propagated through all the parsing. */
166
typedef struct MD_CTX_tag MD_CTX;
167
struct MD_CTX_tag {
168
    /* Immutable stuff (parameters of md_parse()). */
169
    const CHAR* text;
170
    SZ size;
171
    MD_PARSER parser;
172
    void* userdata;
173
174
    /* When this is true, it allows some optimizations. */
175
    int doc_ends_with_newline;
176
177
    /* Helper temporary growing buffer. */
178
    CHAR* buffer;
179
    unsigned alloc_buffer;
180
181
    /* Reference definitions. */
182
    MD_REF_DEF* ref_defs;
183
    int n_ref_defs;
184
    int alloc_ref_defs;
185
    void** ref_def_hashtable;
186
    int ref_def_hashtable_size;
187
    SZ max_ref_def_output;
188
189
    /* Stack of inline/span markers.
190
     * This is only used for parsing a single block contents but by storing it
191
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
192
     * (re)allocations. */
193
    MD_MARK* marks;
194
    int n_marks;
195
    int alloc_marks;
196
197
#if defined MD4C_USE_UTF16
198
    char mark_char_map[128];
199
#else
200
    char mark_char_map[256];
201
#endif
202
203
    /* For resolving of inline spans. */
204
    MD_MARKSTACK opener_stacks[16];
205
2.74M
#define ASTERISK_OPENERS_oo_mod3_0      (ctx->opener_stacks[0])     /* Opener-only */
206
#define ASTERISK_OPENERS_oo_mod3_1      (ctx->opener_stacks[1])
207
#define ASTERISK_OPENERS_oo_mod3_2      (ctx->opener_stacks[2])
208
#define ASTERISK_OPENERS_oc_mod3_0      (ctx->opener_stacks[3])     /* Both opener and closer candidate */
209
#define ASTERISK_OPENERS_oc_mod3_1      (ctx->opener_stacks[4])
210
#define ASTERISK_OPENERS_oc_mod3_2      (ctx->opener_stacks[5])
211
1.60M
#define UNDERSCORE_OPENERS_oo_mod3_0    (ctx->opener_stacks[6])     /* Opener-only */
212
#define UNDERSCORE_OPENERS_oo_mod3_1    (ctx->opener_stacks[7])
213
#define UNDERSCORE_OPENERS_oo_mod3_2    (ctx->opener_stacks[8])
214
#define UNDERSCORE_OPENERS_oc_mod3_0    (ctx->opener_stacks[9])     /* Both opener and closer candidate */
215
#define UNDERSCORE_OPENERS_oc_mod3_1    (ctx->opener_stacks[10])
216
#define UNDERSCORE_OPENERS_oc_mod3_2    (ctx->opener_stacks[11])
217
188k
#define TILDE_OPENERS_1                 (ctx->opener_stacks[12])
218
26.7k
#define TILDE_OPENERS_2                 (ctx->opener_stacks[13])
219
237M
#define BRACKET_OPENERS                 (ctx->opener_stacks[14])
220
344k
#define DOLLAR_OPENERS                  (ctx->opener_stacks[15])
221
222
    /* Stack of dummies which need to call free() for pointers stored in them.
223
     * These are constructed during inline parsing and freed after all the block
224
     * is processed (i.e. all callbacks referring those strings are called). */
225
    MD_MARKSTACK ptr_stack;
226
227
    /* For resolving table rows. */
228
    int n_table_cell_boundaries;
229
    int table_cell_boundaries_head;
230
    int table_cell_boundaries_tail;
231
232
    /* For resolving links. */
233
    int unresolved_link_head;
234
    int unresolved_link_tail;
235
236
    /* For resolving raw HTML. */
237
    OFF html_comment_horizon;
238
    OFF html_proc_instr_horizon;
239
    OFF html_decl_horizon;
240
    OFF html_cdata_horizon;
241
242
    /* For block analysis.
243
     * Notes:
244
     *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
245
     *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
246
     *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
247
     *      instead of MD_LINE(s).
248
     */
249
    void* block_bytes;
250
    MD_BLOCK* current_block;
251
    int n_block_bytes;
252
    int alloc_block_bytes;
253
254
    /* For container block analysis. */
255
    MD_CONTAINER* containers;
256
    int n_containers;
257
    int alloc_containers;
258
259
    /* Minimal indentation to call the block "indented code block". */
260
    unsigned code_indent_offset;
261
262
    /* Contextual info for line analysis. */
263
    SZ code_fence_length;   /* For checking closing fence length. */
264
    int html_block_type;    /* For checking closing raw HTML condition. */
265
    int last_line_has_list_loosening_effect;
266
    int last_list_item_starts_with_two_blank_lines;
267
};
268
269
enum MD_LINETYPE_tag {
270
    MD_LINE_BLANK,
271
    MD_LINE_HR,
272
    MD_LINE_ATXHEADER,
273
    MD_LINE_SETEXTHEADER,
274
    MD_LINE_SETEXTUNDERLINE,
275
    MD_LINE_INDENTEDCODE,
276
    MD_LINE_FENCEDCODE,
277
    MD_LINE_HTML,
278
    MD_LINE_TEXT,
279
    MD_LINE_TABLE,
280
    MD_LINE_TABLEUNDERLINE
281
};
282
typedef enum MD_LINETYPE_tag MD_LINETYPE;
283
284
typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
285
struct MD_LINE_ANALYSIS_tag {
286
    MD_LINETYPE type;
287
    unsigned data;
288
    int enforce_new_block;
289
    OFF beg;
290
    OFF end;
291
    unsigned indent;        /* Indentation level. */
292
};
293
294
typedef struct MD_LINE_tag MD_LINE;
295
struct MD_LINE_tag {
296
    OFF beg;
297
    OFF end;
298
};
299
300
typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
301
struct MD_VERBATIMLINE_tag {
302
    OFF beg;
303
    OFF end;
304
    OFF indent;
305
};
306
307
308
/*****************
309
 ***  Helpers  ***
310
 *****************/
311
312
/* Character accessors. */
313
1.25G
#define CH(off)                 (ctx->text[(off)])
314
69.1M
#define STR(off)                (ctx->text + (off))
315
316
/* Character classification.
317
 * Note we assume ASCII compatibility of code points < 128 here. */
318
312M
#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
319
995M
#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
320
976M
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
321
33.5M
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
322
1.66M
#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
323
453M
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
324
731M
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
325
218M
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
326
43.5M
#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
327
41.3M
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
328
125M
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
329
35.6M
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
330
59.1M
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
331
35.5M
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
332
6.51M
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
333
43.5M
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))
334
335
59.1M
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
336
20.1M
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
337
6.56M
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
338
1.66M
#define ISASCII(off)                    ISASCII_(CH(off))
339
83.2M
#define ISBLANK(off)                    ISBLANK_(CH(off))
340
420M
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
341
58.8M
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
342
38.7M
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
343
8.88M
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
344
192k
#define ISUPPER(off)                    ISUPPER_(CH(off))
345
#define ISLOWER(off)                    ISLOWER_(CH(off))
346
3.34M
#define ISALPHA(off)                    ISALPHA_(CH(off))
347
12.3M
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
348
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
349
25.7M
#define ISALNUM(off)                    ISALNUM_(CH(off))
350
351
352
#if defined MD4C_USE_UTF16
353
    #define md_strchr wcschr
354
#else
355
968M
    #define md_strchr strchr
356
#endif
357
358
359
/* Case insensitive check of string equality. */
360
static inline int
361
md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
362
3.18M
{
363
3.18M
    OFF i;
364
4.65M
    for(i = 0; i < n; i++) {
365
4.56M
        CHAR ch1 = s1[i];
366
4.56M
        CHAR ch2 = s2[i];
367
368
4.56M
        if(ISLOWER_(ch1))
369
1.59M
            ch1 += ('A'-'a');
370
4.56M
        if(ISLOWER_(ch2))
371
4.56M
            ch2 += ('A'-'a');
372
4.56M
        if(ch1 != ch2)
373
3.10M
            return FALSE;
374
4.56M
    }
375
81.2k
    return TRUE;
376
3.18M
}
377
378
static inline int
379
md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
380
48.4M
{
381
48.4M
    return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
382
48.4M
}
383
384
static int
385
md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
386
618k
{
387
618k
    OFF off = 0;
388
618k
    int ret = 0;
389
390
1.45M
    while(1) {
391
11.6M
        while(off < size  &&  str[off] != _T('\0'))
392
10.1M
            off++;
393
394
1.45M
        if(off > 0) {
395
1.45M
            ret = ctx->parser.text(type, str, off, ctx->userdata);
396
1.45M
            if(ret != 0)
397
0
                return ret;
398
399
1.45M
            str += off;
400
1.45M
            size -= off;
401
1.45M
            off = 0;
402
1.45M
        }
403
404
1.45M
        if(off >= size)
405
618k
            return 0;
406
407
839k
        ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
408
839k
        if(ret != 0)
409
0
            return ret;
410
839k
        off++;
411
839k
    }
412
618k
}
413
414
415
#define MD_CHECK(func)                                                      \
416
234M
    do {                                                                    \
417
234M
        ret = (func);                                                       \
418
234M
        if(ret < 0)                                                         \
419
234M
            goto abort;                                                     \
420
234M
    } while(0)
421
422
423
/* Make sure the helper buffer ctx->buffer is at least (sz) bytes large.
 *
 * The size is handed to realloc() as is, i.e. it is a count of bytes. When
 * growing, some extra space is added (half of the request plus 128, masked
 * down to a multiple of 128) to lower the count of future reallocations.
 *
 * The macro expects `ctx`, `ret` and the label `abort` in the scope where
 * it is used. On allocation failure it logs a message, sets ret to -1 and
 * jumps to the label; ctx->buffer is left as it was.
 */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
440
441
442
#define MD_ENTER_BLOCK(type, arg)                                           \
443
29.0M
    do {                                                                    \
444
29.0M
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
445
29.0M
        if(ret != 0) {                                                      \
446
0
            MD_LOG("Aborted from enter_block() callback.");                 \
447
0
            goto abort;                                                     \
448
0
        }                                                                   \
449
29.0M
    } while(0)
450
451
#define MD_LEAVE_BLOCK(type, arg)                                           \
452
29.0M
    do {                                                                    \
453
29.0M
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
454
29.0M
        if(ret != 0) {                                                      \
455
0
            MD_LOG("Aborted from leave_block() callback.");                 \
456
0
            goto abort;                                                     \
457
0
        }                                                                   \
458
29.0M
    } while(0)
459
460
#define MD_ENTER_SPAN(type, arg)                                            \
461
3.40M
    do {                                                                    \
462
3.40M
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);         \
463
3.40M
        if(ret != 0) {                                                      \
464
0
            MD_LOG("Aborted from enter_span() callback.");                  \
465
0
            goto abort;                                                     \
466
0
        }                                                                   \
467
3.40M
    } while(0)
468
469
#define MD_LEAVE_SPAN(type, arg)                                            \
470
4.83M
    do {                                                                    \
471
4.83M
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);         \
472
4.83M
        if(ret != 0) {                                                      \
473
0
            MD_LOG("Aborted from leave_span() callback.");                  \
474
0
            goto abort;                                                     \
475
0
        }                                                                   \
476
4.83M
    } while(0)
477
478
#define MD_TEXT(type, str, size)                                            \
479
31.6M
    do {                                                                    \
480
31.6M
        if(size > 0) {                                                      \
481
31.6M
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
482
31.6M
            if(ret != 0) {                                                  \
483
0
                MD_LOG("Aborted from text() callback.");                    \
484
0
                goto abort;                                                 \
485
0
            }                                                               \
486
31.6M
        }                                                                   \
487
31.6M
    } while(0)
488
489
#define MD_TEXT_INSECURE(type, str, size)                                   \
490
781k
    do {                                                                    \
491
781k
        if(size > 0) {                                                      \
492
618k
            ret = md_text_with_null_replacement(ctx, type, str, size);      \
493
618k
            if(ret != 0) {                                                  \
494
0
                MD_LOG("Aborted from text() callback.");                    \
495
0
                goto abort;                                                 \
496
0
            }                                                               \
497
618k
        }                                                                   \
498
781k
    } while(0)
499
500
501
/* If the offset falls into a gap between line, we return the following
502
 * line. */
503
static const MD_LINE*
504
md_lookup_line(OFF off, const MD_LINE* lines, MD_SIZE n_lines, MD_SIZE* p_line_index)
505
3.65M
{
506
3.65M
    MD_SIZE lo, hi;
507
3.65M
    MD_SIZE pivot;
508
3.65M
    const MD_LINE* line;
509
510
3.65M
    lo = 0;
511
3.65M
    hi = n_lines - 1;
512
14.7M
    while(lo <= hi) {
513
14.7M
        pivot = (lo + hi) / 2;
514
14.7M
        line = &lines[pivot];
515
516
14.7M
        if(off < line->beg) {
517
5.36M
            if(hi == 0  ||  lines[hi-1].end < off) {
518
0
                if(p_line_index != NULL)
519
0
                    *p_line_index = pivot;
520
0
                return line;
521
0
            }
522
5.36M
            hi = pivot - 1;
523
9.35M
        } else if(off > line->end) {
524
5.70M
            lo = pivot + 1;
525
5.70M
        } else {
526
3.65M
            if(p_line_index != NULL)
527
2.23M
                *p_line_index = pivot;
528
3.65M
            return line;
529
3.65M
        }
530
14.7M
    }
531
532
0
    return NULL;
533
3.65M
}
534
535
536
/*************************
537
 ***  Unicode Support  ***
538
 *************************/
539
540
typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO;
541
struct MD_UNICODE_FOLD_INFO_tag {
542
    unsigned codepoints[3];
543
    unsigned n_codepoints;
544
};
545
546
547
#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
548
    /* Binary search over sorted "map" of codepoints. Consecutive sequences
549
     * of codepoints may be encoded in the map by just using the
550
     * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000).
551
     *
552
     * Returns index of the found record in the map (in the case of ranges,
553
     * the minimal value is used); or -1 on failure. */
554
    static int
    md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
    {
        int lo = 0;
        int hi = (int) map_size-1;

        while(lo <= hi) {
            const int mid = (lo + hi) / 2;
            int first = mid;    /* Index of the record's minimal value. */
            int last = mid;     /* Index of the record's maximal value. */

            /* The probed slot may be one half of a two-slot range record;
             * if so, widen [first, last] to cover the whole record. */
            if(map[mid] & 0x40000000)
                last = mid + 1;
            if(map[mid] & 0x80000000)
                first = mid - 1;

            if(codepoint < (map[first] & 0x00ffffff)) {
                hi = first - 1;
                continue;
            }
            if(codepoint > (map[last] & 0x00ffffff)) {
                lo = last + 1;
                continue;
            }

            /* The codepoint lies within the record. */
            return first;
        }

        return -1;
    }
580
581
    static int
    md_is_unicode_whitespace__(unsigned codepoint)
    {
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
        /* Unicode "Zs" category.
         * (generated by scripts/build_whitespace_map.py) */
        static const unsigned WHITESPACE_MAP[] = {
            S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
        };
#undef R
#undef S
        int map_index;

        /* Codepoints below 0x80 are by far the most frequent ones and they
         * are classified directly by the ASCII macro (which accepts a few
         * more characters than the map above does). */
        if(codepoint < 0x80)
            return ISWHITESPACE_(codepoint);

        /* Everything else is looked up in the sorted map. */
        map_index = md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP));
        return (map_index >= 0);
    }
601
602
    static int
603
    md_is_unicode_punct__(unsigned codepoint)
604
3.59M
    {
605
851M
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
606
366M
#define S(cp)               (cp)
607
        /* Unicode general "P" and "S" categories.
608
         * (generated by scripts/build_punct_map.py) */
609
3.59M
        static const unsigned PUNCT_MAP[] = {
610
3.59M
            R(0x0021,0x002f), R(0x003a,0x0040), R(0x005b,0x0060), R(0x007b,0x007e), R(0x00a1,0x00a9),
611
3.59M
            R(0x00ab,0x00ac), R(0x00ae,0x00b1), S(0x00b4), R(0x00b6,0x00b8), S(0x00bb), S(0x00bf), S(0x00d7),
612
3.59M
            S(0x00f7), R(0x02c2,0x02c5), R(0x02d2,0x02df), R(0x02e5,0x02eb), S(0x02ed), R(0x02ef,0x02ff), S(0x0375),
613
3.59M
            S(0x037e), R(0x0384,0x0385), S(0x0387), S(0x03f6), S(0x0482), R(0x055a,0x055f), R(0x0589,0x058a),
614
3.59M
            R(0x058d,0x058f), S(0x05be), S(0x05c0), S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0606,0x060f),
615
3.59M
            S(0x061b), R(0x061d,0x061f), R(0x066a,0x066d), S(0x06d4), S(0x06de), S(0x06e9), R(0x06fd,0x06fe),
616
3.59M
            R(0x0700,0x070d), R(0x07f6,0x07f9), R(0x07fe,0x07ff), R(0x0830,0x083e), S(0x085e), S(0x0888),
617
3.59M
            R(0x0964,0x0965), S(0x0970), R(0x09f2,0x09f3), R(0x09fa,0x09fb), S(0x09fd), S(0x0a76), R(0x0af0,0x0af1),
618
3.59M
            S(0x0b70), R(0x0bf3,0x0bfa), S(0x0c77), S(0x0c7f), S(0x0c84), S(0x0d4f), S(0x0d79), S(0x0df4), S(0x0e3f),
619
3.59M
            S(0x0e4f), R(0x0e5a,0x0e5b), R(0x0f01,0x0f17), R(0x0f1a,0x0f1f), S(0x0f34), S(0x0f36), S(0x0f38),
620
3.59M
            R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fbe,0x0fc5), R(0x0fc7,0x0fcc), R(0x0fce,0x0fda), R(0x104a,0x104f),
621
3.59M
            R(0x109e,0x109f), S(0x10fb), R(0x1360,0x1368), R(0x1390,0x1399), S(0x1400), R(0x166d,0x166e),
622
3.59M
            R(0x169b,0x169c), R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17db),
623
3.59M
            R(0x1800,0x180a), S(0x1940), R(0x1944,0x1945), R(0x19de,0x19ff), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6),
624
3.59M
            R(0x1aa8,0x1aad), R(0x1b5a,0x1b6a), R(0x1b74,0x1b7e), R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f),
625
3.59M
            R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), S(0x1fbd), R(0x1fbf,0x1fc1), R(0x1fcd,0x1fcf),
626
3.59M
            R(0x1fdd,0x1fdf), R(0x1fed,0x1fef), R(0x1ffd,0x1ffe), R(0x2010,0x2027), R(0x2030,0x205e),
627
3.59M
            R(0x207a,0x207e), R(0x208a,0x208e), R(0x20a0,0x20c0), R(0x2100,0x2101), R(0x2103,0x2106),
628
3.59M
            R(0x2108,0x2109), S(0x2114), R(0x2116,0x2118), R(0x211e,0x2123), S(0x2125), S(0x2127), S(0x2129),
629
3.59M
            S(0x212e), R(0x213a,0x213b), R(0x2140,0x2144), R(0x214a,0x214d), S(0x214f), R(0x218a,0x218b),
630
3.59M
            R(0x2190,0x2426), R(0x2440,0x244a), R(0x249c,0x24e9), R(0x2500,0x2775), R(0x2794,0x2b73),
631
3.59M
            R(0x2b76,0x2b95), R(0x2b97,0x2bff), R(0x2ce5,0x2cea), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
632
3.59M
            R(0x2e00,0x2e2e), R(0x2e30,0x2e5d), R(0x2e80,0x2e99), R(0x2e9b,0x2ef3), R(0x2f00,0x2fd5),
633
3.59M
            R(0x2ff0,0x2fff), R(0x3001,0x3004), R(0x3008,0x3020), S(0x3030), R(0x3036,0x3037), R(0x303d,0x303f),
634
3.59M
            R(0x309b,0x309c), S(0x30a0), S(0x30fb), R(0x3190,0x3191), R(0x3196,0x319f), R(0x31c0,0x31e3), S(0x31ef),
635
3.59M
            R(0x3200,0x321e), R(0x322a,0x3247), S(0x3250), R(0x3260,0x327f), R(0x328a,0x32b0), R(0x32c0,0x33ff),
636
3.59M
            R(0x4dc0,0x4dff), R(0xa490,0xa4c6), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
637
3.59M
            R(0xa6f2,0xa6f7), R(0xa700,0xa716), R(0xa720,0xa721), R(0xa789,0xa78a), R(0xa828,0xa82b),
638
3.59M
            R(0xa836,0xa839), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
639
3.59M
            S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaa77,0xaa79), R(0xaade,0xaadf),
640
3.59M
            R(0xaaf0,0xaaf1), S(0xab5b), R(0xab6a,0xab6b), S(0xabeb), S(0xfb29), R(0xfbb2,0xfbc2), R(0xfd3e,0xfd4f),
641
3.59M
            S(0xfdcf), R(0xfdfc,0xfdff), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe66), R(0xfe68,0xfe6b),
642
3.59M
            R(0xff01,0xff0f), R(0xff1a,0xff20), R(0xff3b,0xff40), R(0xff5b,0xff65), R(0xffe0,0xffe6),
643
3.59M
            R(0xffe8,0xffee), R(0xfffc,0xfffd), R(0x10100,0x10102), R(0x10137,0x1013f), R(0x10179,0x10189),
644
3.59M
            R(0x1018c,0x1018e), R(0x10190,0x1019c), S(0x101a0), R(0x101d0,0x101fc), S(0x1039f), S(0x103d0),
645
3.59M
            S(0x1056f), S(0x10857), R(0x10877,0x10878), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
646
3.59M
            S(0x10ac8), R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
647
3.59M
            R(0x10f86,0x10f89), R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143),
648
3.59M
            R(0x11174,0x11175), R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d),
649
3.59M
            S(0x112a9), R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7),
650
3.59M
            R(0x11641,0x11643), R(0x11660,0x1166c), S(0x116b9), R(0x1173c,0x1173f), S(0x1183b), R(0x11944,0x11946),
651
3.59M
            S(0x119e2), R(0x11a3f,0x11a46), R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11b00,0x11b09),
652
3.59M
            R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8), R(0x11f43,0x11f4f), R(0x11fd5,0x11ff1),
653
3.59M
            S(0x11fff), R(0x12470,0x12474), R(0x12ff1,0x12ff2), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3f),
654
3.59M
            R(0x16b44,0x16b45), R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9c), S(0x1bc9f), R(0x1cf50,0x1cfc3),
655
3.59M
            R(0x1d000,0x1d0f5), R(0x1d100,0x1d126), R(0x1d129,0x1d164), R(0x1d16a,0x1d16c), R(0x1d183,0x1d184),
656
3.59M
            R(0x1d18c,0x1d1a9), R(0x1d1ae,0x1d1ea), R(0x1d200,0x1d241), S(0x1d245), R(0x1d300,0x1d356), S(0x1d6c1),
657
3.59M
            S(0x1d6db), S(0x1d6fb), S(0x1d715), S(0x1d735), S(0x1d74f), S(0x1d76f), S(0x1d789), S(0x1d7a9),
658
3.59M
            S(0x1d7c3), R(0x1d800,0x1d9ff), R(0x1da37,0x1da3a), R(0x1da6d,0x1da74), R(0x1da76,0x1da83),
659
3.59M
            R(0x1da85,0x1da8b), S(0x1e14f), S(0x1e2ff), R(0x1e95e,0x1e95f), S(0x1ecac), S(0x1ecb0), S(0x1ed2e),
660
3.59M
            R(0x1eef0,0x1eef1), R(0x1f000,0x1f02b), R(0x1f030,0x1f093), R(0x1f0a0,0x1f0ae), R(0x1f0b1,0x1f0bf),
661
3.59M
            R(0x1f0c1,0x1f0cf), R(0x1f0d1,0x1f0f5), R(0x1f10d,0x1f1ad), R(0x1f1e6,0x1f202), R(0x1f210,0x1f23b),
662
3.59M
            R(0x1f240,0x1f248), R(0x1f250,0x1f251), R(0x1f260,0x1f265), R(0x1f300,0x1f6d7), R(0x1f6dc,0x1f6ec),
663
3.59M
            R(0x1f6f0,0x1f6fc), R(0x1f700,0x1f776), R(0x1f77b,0x1f7d9), R(0x1f7e0,0x1f7eb), S(0x1f7f0),
664
3.59M
            R(0x1f800,0x1f80b), R(0x1f810,0x1f847), R(0x1f850,0x1f859), R(0x1f860,0x1f887), R(0x1f890,0x1f8ad),
665
3.59M
            R(0x1f8b0,0x1f8b1), R(0x1f900,0x1fa53), R(0x1fa60,0x1fa6d), R(0x1fa70,0x1fa7c), R(0x1fa80,0x1fa88),
666
3.59M
            R(0x1fa90,0x1fabd), R(0x1fabf,0x1fac5), R(0x1face,0x1fadb), R(0x1fae0,0x1fae8), R(0x1faf0,0x1faf8),
667
3.59M
            R(0x1fb00,0x1fb92), R(0x1fb94,0x1fbca)
668
3.59M
        };
669
3.59M
#undef R
670
3.59M
#undef S
671
672
        /* The ASCII ones are the most frequently used ones, also CommonMark
673
         * specification requests few more in this range. */
674
3.59M
        if(codepoint <= 0x7f)
675
3.01M
            return ISPUNCT_(codepoint);
676
677
579k
        return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
678
3.59M
    }
679
680
    static void
681
    md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
682
65.0M
    {
683
4.42G
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
684
13.9G
#define S(cp)               (cp)
685
        /* Unicode case folding maps: codepoints and ranges which have a case-folded form.
686
         * (generated by scripts/build_folding_map.py) */
687
65.0M
        static const unsigned FOLD_MAP_1[] = {
688
65.0M
            R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
689
65.0M
            R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
690
65.0M
            S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
691
65.0M
            S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
692
65.0M
            R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
693
65.0M
            S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
694
65.0M
            S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
695
65.0M
            R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
696
65.0M
            S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
697
65.0M
            S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
698
65.0M
            S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
699
65.0M
            S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
700
65.0M
            R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
701
65.0M
            R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
702
65.0M
            S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
703
65.0M
            R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
704
65.0M
            R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
705
65.0M
            R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
706
65.0M
            S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
707
65.0M
            S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
708
65.0M
            R(0x24b6,0x24cf), R(0x2c00,0x2c2f), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
709
65.0M
            S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
710
65.0M
            S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
711
65.0M
            S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
712
65.0M
            R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
713
65.0M
            S(0xa7b3), R(0xa7b4,0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7d0), S(0xa7d6),
714
65.0M
            S(0xa7d8), S(0xa7f5), R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3),
715
65.0M
            R(0x10570,0x1057a), R(0x1057c,0x1058a), R(0x1058c,0x10592), S(0x10594), S(0x10595), R(0x10c80,0x10cb2),
716
65.0M
            R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
717
65.0M
        };
718
65.0M
        static const unsigned FOLD_MAP_1_DATA[] = {
719
65.0M
            0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
720
65.0M
            0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
721
65.0M
            0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
722
65.0M
            0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
723
65.0M
            0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
724
65.0M
            0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
725
65.0M
            0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
726
65.0M
            0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
727
65.0M
            0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
728
65.0M
            0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
729
65.0M
            0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
730
65.0M
            0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
731
65.0M
            0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
732
65.0M
            0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
733
65.0M
            0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
734
65.0M
            0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5f, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
735
65.0M
            0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
736
65.0M
            0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
737
65.0M
            0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
738
65.0M
            0xab53, 0xa7b5, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7d1, 0xa7d7, 0xa7d9, 0xa7f6, 0x13a0,
739
65.0M
            0x13ef, 0xff41, 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10597, 0x105a1, 0x105a3, 0x105b1, 0x105b3,
740
65.0M
            0x105b9, 0x105bb, 0x105bc, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922, 0x1e943
741
65.0M
        };
742
65.0M
        static const unsigned FOLD_MAP_2[] = {
743
65.0M
            S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
744
65.0M
            S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
745
65.0M
            R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
746
65.0M
            S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
747
65.0M
            S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
748
65.0M
            S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
749
65.0M
        };
750
65.0M
        static const unsigned FOLD_MAP_2_DATA[] = {
751
65.0M
            0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
752
65.0M
            0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
753
65.0M
            0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
754
65.0M
            0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
755
65.0M
            0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
756
65.0M
            0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
757
65.0M
            0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
758
65.0M
            0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
759
65.0M
        };
760
65.0M
        static const unsigned FOLD_MAP_3[] = {
761
65.0M
            S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
762
65.0M
            S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
763
65.0M
        };
764
65.0M
        static const unsigned FOLD_MAP_3_DATA[] = {
765
65.0M
            0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
766
65.0M
            0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
767
65.0M
            0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
768
65.0M
            0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
769
65.0M
        };
770
65.0M
#undef R
771
65.0M
#undef S
772
65.0M
        static const struct {
773
65.0M
            const unsigned* map;
774
65.0M
            const unsigned* data;
775
65.0M
            size_t map_size;
776
65.0M
            unsigned n_codepoints;
777
65.0M
        } FOLD_MAP_LIST[] = {
778
65.0M
            { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
779
65.0M
            { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
780
65.0M
            { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
781
65.0M
        };
782
783
65.0M
        int i;
784
785
        /* Fast path for ASCII characters. */
786
65.0M
        if(codepoint <= 0x7f) {
787
61.4M
            info->codepoints[0] = codepoint;
788
61.4M
            if(ISUPPER_(codepoint))
789
763k
                info->codepoints[0] += 'a' - 'A';
790
61.4M
            info->n_codepoints = 1;
791
61.4M
            return;
792
61.4M
        }
793
794
        /* Try to locate the codepoint in any of the maps. */
795
14.5M
        for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
796
10.9M
            int index;
797
798
10.9M
            index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
799
10.9M
            if(index >= 0) {
800
                /* Found the mapping. */
801
43.9k
                unsigned n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
802
43.9k
                const unsigned* map = FOLD_MAP_LIST[i].map;
803
43.9k
                const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
804
805
43.9k
                memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
806
43.9k
                info->n_codepoints = n_codepoints;
807
808
43.9k
                if(FOLD_MAP_LIST[i].map[index] != codepoint) {
809
                    /* The found mapping maps whole range of codepoints,
810
                     * i.e. we have to offset info->codepoints[0] accordingly. */
811
32.2k
                    if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
812
                        /* Alternating type of the range. */
813
23.8k
                        info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
814
23.8k
                    } else {
815
                        /* Range to range kind of mapping. */
816
8.44k
                        info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
817
8.44k
                    }
818
32.2k
                }
819
820
43.9k
                return;
821
43.9k
            }
822
10.9M
        }
823
824
        /* No mapping found. Map the codepoint to itself. */
825
3.61M
        info->codepoints[0] = codepoint;
826
3.61M
        info->n_codepoints = 1;
827
3.61M
    }
828
#endif
829
830
831
#if defined MD4C_USE_UTF16
832
    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
833
    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
834
    #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
835
836
    static unsigned
837
    md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
838
    {
839
        if(IS_UTF16_SURROGATE_HI(str[0])) {
840
            if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
841
                if(p_size != NULL)
842
                    *p_size = 2;
843
                return UTF16_DECODE_SURROGATE(str[0], str[1]);
844
            }
845
        }
846
847
        if(p_size != NULL)
848
            *p_size = 1;
849
        return str[0];
850
    }
851
852
    static unsigned
853
    md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
854
    {
855
        if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
856
            return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
857
858
        return CH(off);
859
    }
860
861
    /* No whitespace uses surrogates, so no decoding needed here. */
862
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
863
    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
864
    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))
865
866
    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
867
    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
868
869
    static inline int
870
    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
871
    {
872
        return md_decode_utf16le__(str+off, str_size-off, p_char_size);
873
    }
874
#elif defined MD4C_USE_UTF8
875
107M
    #define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
876
7.08M
    #define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
877
5.99M
    #define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
878
5.33M
    #define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
879
6.67M
    #define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
880
881
    static unsigned
882
    md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
883
102M
    {
884
102M
        if(!IS_UTF8_LEAD1(str[0])) {
885
5.85M
            if(IS_UTF8_LEAD2(str[0])) {
886
997k
                if(1 < str_size && IS_UTF8_TAIL(str[1])) {
887
205k
                    if(p_size != NULL)
888
154k
                        *p_size = 2;
889
890
205k
                    return (((unsigned int)str[0] & 0x1f) << 6) |
891
205k
                           (((unsigned int)str[1] & 0x3f) << 0);
892
205k
                }
893
4.85M
            } else if(IS_UTF8_LEAD3(str[0])) {
894
628k
                if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
895
87.7k
                    if(p_size != NULL)
896
69.9k
                        *p_size = 3;
897
898
87.7k
                    return (((unsigned int)str[0] & 0x0f) << 12) |
899
87.7k
                           (((unsigned int)str[1] & 0x3f) << 6) |
900
87.7k
                           (((unsigned int)str[2] & 0x3f) << 0);
901
87.7k
                }
902
4.22M
            } else if(IS_UTF8_LEAD4(str[0])) {
903
831k
                if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
904
94.8k
                    if(p_size != NULL)
905
76.7k
                        *p_size = 4;
906
907
94.8k
                    return (((unsigned int)str[0] & 0x07) << 18) |
908
94.8k
                           (((unsigned int)str[1] & 0x3f) << 12) |
909
94.8k
                           (((unsigned int)str[2] & 0x3f) << 6) |
910
94.8k
                           (((unsigned int)str[3] & 0x3f) << 0);
911
94.8k
                }
912
831k
            }
913
5.85M
        }
914
915
102M
        if(p_size != NULL)
916
98.9M
            *p_size = 1;
917
102M
        return (unsigned) str[0];
918
102M
    }
919
920
    static unsigned
921
    md_decode_utf8_before__(MD_CTX* ctx, OFF off)
922
4.75M
    {
923
4.75M
        if(!IS_UTF8_LEAD1(CH(off-1))) {
924
614k
            if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
925
47.4k
                return (((unsigned int)CH(off-2) & 0x1f) << 6) |
926
47.4k
                       (((unsigned int)CH(off-1) & 0x3f) << 0);
927
928
567k
            if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
929
14.8k
                return (((unsigned int)CH(off-3) & 0x0f) << 12) |
930
14.8k
                       (((unsigned int)CH(off-2) & 0x3f) << 6) |
931
14.8k
                       (((unsigned int)CH(off-1) & 0x3f) << 0);
932
933
552k
            if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
934
18.3k
                return (((unsigned int)CH(off-4) & 0x07) << 18) |
935
18.3k
                       (((unsigned int)CH(off-3) & 0x3f) << 12) |
936
18.3k
                       (((unsigned int)CH(off-2) & 0x3f) << 6) |
937
18.3k
                       (((unsigned int)CH(off-1) & 0x3f) << 0);
938
552k
        }
939
940
4.67M
        return (unsigned) CH(off-1);
941
4.75M
    }
942
943
146M
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
944
2.28M
    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
945
4.21M
    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))
946
947
1.54M
    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
948
2.04M
    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
949
950
    static inline unsigned
951
    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
952
99.2M
    {
953
99.2M
        return md_decode_utf8__(str+off, str_size-off, p_char_size);
954
99.2M
    }
955
#else
956
    #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
957
    #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
958
    #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)
959
960
    #define ISUNICODEPUNCT(off)             ISPUNCT(off)
961
    #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
962
963
    static inline void
964
    md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
965
    {
966
        info->codepoints[0] = codepoint;
967
        if(ISUPPER_(codepoint))
968
            info->codepoints[0] += 'a' - 'A';
969
        info->n_codepoints = 1;
970
    }
971
972
    static inline unsigned
973
    md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
974
    {
975
        *p_size = 1;
976
        return (unsigned) str[off];
977
    }
978
#endif
979
980
981
/*************************************
982
 ***  Helper string manipulations  ***
983
 *************************************/
984
985
/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
986
 * line breaks with given replacement character.
987
 *
988
 * NOTE: Caller is responsible to make sure the buffer is large enough.
989
 * (Given the output is always shorter than input, (end - beg) is good idea
990
 * what the caller should allocate.)
991
 */
992
static void
993
md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, MD_SIZE n_lines,
994
               CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
995
122k
{
996
122k
    CHAR* ptr = buffer;
997
122k
    int line_index = 0;
998
122k
    OFF off = beg;
999
1000
122k
    MD_UNUSED(n_lines);
1001
1002
2.59M
    while(1) {
1003
2.59M
        const MD_LINE* line = &lines[line_index];
1004
2.59M
        OFF line_end = line->end;
1005
2.59M
        if(end < line_end)
1006
120k
            line_end = end;
1007
1008
38.0M
        while(off < line_end) {
1009
35.4M
            *ptr = CH(off);
1010
35.4M
            ptr++;
1011
35.4M
            off++;
1012
35.4M
        }
1013
1014
2.59M
        if(off >= end) {
1015
122k
            *p_size = (MD_SIZE)(ptr - buffer);
1016
122k
            return;
1017
122k
        }
1018
1019
2.47M
        *ptr = line_break_replacement_char;
1020
2.47M
        ptr++;
1021
1022
2.47M
        line_index++;
1023
2.47M
        off = lines[line_index].beg;
1024
2.47M
    }
1025
122k
}
1026
1027
/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
1028
 */
1029
static int
1030
md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, MD_SIZE n_lines,
1031
                    CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
1032
122k
{
1033
122k
    CHAR* buffer;
1034
1035
122k
    buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
1036
122k
    if(buffer == NULL) {
1037
0
        MD_LOG("malloc() failed.");
1038
0
        return -1;
1039
0
    }
1040
1041
122k
    md_merge_lines(ctx, beg, end, lines, n_lines,
1042
122k
                line_break_replacement_char, buffer, p_size);
1043
1044
122k
    *p_str = buffer;
1045
122k
    return 0;
1046
122k
}
1047
1048
static OFF
1049
md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
1050
26.3M
{
1051
26.3M
    SZ char_size;
1052
26.3M
    unsigned codepoint;
1053
1054
27.8M
    while(off < size) {
1055
27.3M
        codepoint = md_decode_unicode(label, off, size, &char_size);
1056
27.3M
        if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
1057
25.8M
            break;
1058
1.54M
        off += char_size;
1059
1.54M
    }
1060
1061
26.3M
    return off;
1062
26.3M
}
1063
1064
1065
/******************************
1066
 ***  Recognizing raw HTML  ***
1067
 ******************************/
1068
1069
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
1070
 * or when breaking document to blocks (checking for start of HTML block type 7).
1071
 *
1072
 * When breaking document to blocks, we do not yet know line boundaries, but
1073
 * in that case the whole tag has to live on a single line. We distinguish this
1074
 * by n_lines == 0.
1075
 */
1076
static int
1077
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1078
2.52M
{
1079
2.52M
    int attr_state;
1080
2.52M
    OFF off = beg;
1081
2.52M
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
1082
2.52M
    MD_SIZE line_index = 0;
1083
1084
2.52M
    MD_ASSERT(CH(beg) == _T('<'));
1085
1086
2.52M
    if(off + 1 >= line_end)
1087
53.8k
        return FALSE;
1088
2.47M
    off++;
1089
1090
    /* For parsing attributes, we need a little state automaton below.
1091
     * State -1: no attributes are allowed.
1092
     * State 0: attribute could follow after some whitespace.
1093
     * State 1: after a whitespace (attribute name may follow).
1094
     * State 2: after attribute name ('=' MAY follow).
1095
     * State 3: after '=' (value specification MUST follow).
1096
     * State 41: in middle of unquoted attribute value.
1097
     * State 42: in middle of single-quoted attribute value.
1098
     * State 43: in middle of double-quoted attribute value.
1099
     */
1100
2.47M
    attr_state = 0;
1101
1102
2.47M
    if(CH(off) == _T('/')) {
1103
        /* Closer tag "</ ... >". No attributes may be present. */
1104
322k
        attr_state = -1;
1105
322k
        off++;
1106
322k
    }
1107
1108
    /* Tag name */
1109
2.47M
    if(off >= line_end  ||  !ISALPHA(off))
1110
940k
        return FALSE;
1111
1.52M
    off++;
1112
3.88M
    while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
1113
2.35M
        off++;
1114
1115
    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
1116
     * and final '>'. */
1117
2.00M
    while(1) {
1118
2.72M
        while(off < line_end  &&  !ISNEWLINE(off)) {
1119
2.13M
            if(attr_state > 40) {
1120
436k
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
1121
4.12k
                    attr_state = 0;
1122
4.12k
                    off--;  /* Put the char back for re-inspection in the new state. */
1123
432k
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
1124
383
                    attr_state = 0;
1125
431k
                } else if(attr_state == 43 && CH(off) == _T('"')) {
1126
919
                    attr_state = 0;
1127
919
                }
1128
436k
                off++;
1129
1.70M
            } else if(ISWHITESPACE(off)) {
1130
192k
                if(attr_state == 0)
1131
16.8k
                    attr_state = 1;
1132
192k
                off++;
1133
1.50M
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
1134
                /* End. */
1135
810k
                goto done;
1136
810k
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
1137
                /* End with digraph '/>' */
1138
1.60k
                off++;
1139
1.60k
                goto done;
1140
696k
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
1141
76.8k
                off++;
1142
                /* Attribute name */
1143
105k
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
1144
28.2k
                    off++;
1145
76.8k
                attr_state = 2;
1146
620k
            } else if(attr_state == 2 && CH(off) == _T('=')) {
1147
                /* Attribute assignment sign */
1148
7.99k
                off++;
1149
7.99k
                attr_state = 3;
1150
612k
            } else if(attr_state == 3) {
1151
                /* Expecting start of attribute value. */
1152
7.98k
                if(CH(off) == _T('"'))
1153
1.35k
                    attr_state = 43;
1154
6.63k
                else if(CH(off) == _T('\''))
1155
647
                    attr_state = 42;
1156
5.98k
                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
1157
5.30k
                    attr_state = 41;
1158
687
                else
1159
687
                    return FALSE;
1160
7.30k
                off++;
1161
604k
            } else {
1162
                /* Anything unexpected. */
1163
604k
                return FALSE;
1164
604k
            }
1165
2.13M
        }
1166
1167
        /* We have to be on a single line. See definition of start condition
1168
         * of HTML block, type 7. */
1169
590k
        if(n_lines == 0)
1170
67.9k
            return FALSE;
1171
1172
522k
        line_index++;
1173
522k
        if(line_index >= n_lines)
1174
44.4k
            return FALSE;
1175
1176
478k
        off = lines[line_index].beg;
1177
478k
        line_end = lines[line_index].end;
1178
1179
478k
        if(attr_state == 0  ||  attr_state == 41)
1180
129k
            attr_state = 1;
1181
1182
478k
        if(off >= max_end)
1183
0
            return FALSE;
1184
478k
    }
1185
1186
812k
done:
1187
812k
    if(off >= max_end)
1188
0
        return FALSE;
1189
1190
812k
    *p_end = off+1;
1191
812k
    return TRUE;
1192
812k
}
1193
1194
static int
1195
md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1196
                        const MD_LINE* lines, MD_SIZE n_lines,
1197
                        OFF beg, OFF max_end, OFF* p_end,
1198
                        OFF* p_scan_horizon)
1199
112k
{
1200
112k
    OFF off = beg;
1201
112k
    MD_SIZE line_index = 0;
1202
1203
112k
    if(off < *p_scan_horizon  &&  *p_scan_horizon >= max_end - len) {
1204
        /* We have already scanned the range up to the max_end so we know
1205
         * there is nothing to see. */
1206
37.4k
        return FALSE;
1207
37.4k
    }
1208
1209
1.90M
    while(TRUE) {
1210
47.1M
        while(off + len <= lines[line_index].end  &&  off + len <= max_end) {
1211
45.3M
            if(md_ascii_eq(STR(off), str, len)) {
1212
                /* Success. */
1213
45.6k
                *p_end = off + len;
1214
45.6k
                return TRUE;
1215
45.6k
            }
1216
45.2M
            off++;
1217
45.2M
        }
1218
1219
1.86M
        line_index++;
1220
1.86M
        if(off >= max_end  ||  line_index >= n_lines) {
1221
            /* Failure. */
1222
29.1k
            *p_scan_horizon = off;
1223
29.1k
            return FALSE;
1224
29.1k
        }
1225
1226
1.83M
        off = lines[line_index].beg;
1227
1.83M
    }
1228
74.8k
}
1229
1230
static int
1231
md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1232
1.25M
{
1233
1.25M
    OFF off = beg;
1234
1235
1.25M
    MD_ASSERT(CH(beg) == _T('<'));
1236
1237
1.25M
    if(off + 4 >= lines[0].end)
1238
372k
        return FALSE;
1239
879k
    if(CH(off+1) != _T('!')  ||  CH(off+2) != _T('-')  ||  CH(off+3) != _T('-'))
1240
864k
        return FALSE;
1241
1242
    /* Skip only "<!" so that we accept also "<!-->" or "<!--->" */
1243
15.2k
    off += 2;
1244
1245
    /* Scan for ordinary comment closer "-->". */
1246
15.2k
    return md_scan_for_html_closer(ctx, _T("-->"), 3,
1247
15.2k
                lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon);
1248
879k
}
1249
1250
static int
1251
md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1252
1.24M
{
1253
1.24M
    OFF off = beg;
1254
1255
1.24M
    if(off + 2 >= lines[0].end)
1256
258k
        return FALSE;
1257
990k
    if(CH(off+1) != _T('?'))
1258
979k
        return FALSE;
1259
11.0k
    off += 2;
1260
1261
11.0k
    return md_scan_for_html_closer(ctx, _T("?>"), 2,
1262
11.0k
                lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1263
990k
}
1264
1265
static int
1266
md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1267
1.24M
{
1268
1.24M
    OFF off = beg;
1269
1270
1.24M
    if(off + 2 >= lines[0].end)
1271
258k
        return FALSE;
1272
986k
    if(CH(off+1) != _T('!'))
1273
854k
        return FALSE;
1274
131k
    off += 2;
1275
1276
    /* Declaration name. */
1277
131k
    if(off >= lines[0].end  ||  !ISALPHA(off))
1278
93.7k
        return FALSE;
1279
37.9k
    off++;
1280
70.2k
    while(off < lines[0].end  &&  ISALPHA(off))
1281
32.2k
        off++;
1282
1283
37.9k
    return md_scan_for_html_closer(ctx, _T(">"), 1,
1284
37.9k
                lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1285
131k
}
1286
1287
static int
1288
md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1289
1.24M
{
1290
1.24M
    static const CHAR open_str[] = _T("<![CDATA[");
1291
1.24M
    static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1292
1293
1.24M
    OFF off = beg;
1294
1295
1.24M
    if(off + open_size >= lines[0].end)
1296
514k
        return FALSE;
1297
727k
    if(memcmp(STR(off), open_str, open_size) != 0)
1298
679k
        return FALSE;
1299
47.9k
    off += open_size;
1300
1301
47.9k
    return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1302
47.9k
                lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1303
727k
}
1304
1305
static int
1306
md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg, OFF max_end, OFF* p_end)
1307
2.05M
{
1308
2.05M
    MD_ASSERT(CH(beg) == _T('<'));
1309
2.05M
    return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end)  ||
1310
2.05M
            md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end)  ||
1311
2.05M
            md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end)  ||
1312
2.05M
            md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end)  ||
1313
2.05M
            md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1314
2.05M
}
1315
1316
1317
/****************************
1318
 ***  Recognizing Entity  ***
1319
 ****************************/
1320
1321
static int
1322
md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1323
499k
{
1324
499k
    OFF off = beg;
1325
499k
    MD_UNUSED(ctx);
1326
1327
3.26M
    while(off < max_end  &&  ISXDIGIT_(text[off])  &&  off - beg <= 8)
1328
2.76M
        off++;
1329
1330
499k
    if(1 <= off - beg  &&  off - beg <= 6) {
1331
249k
        *p_end = off;
1332
249k
        return TRUE;
1333
249k
    } else {
1334
249k
        return FALSE;
1335
249k
    }
1336
499k
}
1337
1338
static int
1339
md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1340
1.00M
{
1341
1.00M
    OFF off = beg;
1342
1.00M
    MD_UNUSED(ctx);
1343
1344
4.15M
    while(off < max_end  &&  ISDIGIT_(text[off])  &&  off - beg <= 8)
1345
3.14M
        off++;
1346
1347
1.00M
    if(1 <= off - beg  &&  off - beg <= 7) {
1348
647k
        *p_end = off;
1349
647k
        return TRUE;
1350
647k
    } else {
1351
359k
        return FALSE;
1352
359k
    }
1353
1.00M
}
1354
1355
static int
1356
md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1357
2.49M
{
1358
2.49M
    OFF off = beg;
1359
2.49M
    MD_UNUSED(ctx);
1360
1361
2.49M
    if(off < max_end  &&  ISALPHA_(text[off]))
1362
245k
        off++;
1363
2.24M
    else
1364
2.24M
        return FALSE;
1365
1366
560k
    while(off < max_end  &&  ISALNUM_(text[off])  &&  off - beg <= 48)
1367
315k
        off++;
1368
1369
245k
    if(2 <= off - beg  &&  off - beg <= 48) {
1370
133k
        *p_end = off;
1371
133k
        return TRUE;
1372
133k
    } else {
1373
111k
        return FALSE;
1374
111k
    }
1375
245k
}
1376
1377
static int
1378
md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1379
3.99M
{
1380
3.99M
    int is_contents;
1381
3.99M
    OFF off = beg;
1382
1383
3.99M
    MD_ASSERT(text[off] == _T('&'));
1384
3.99M
    off++;
1385
1386
3.99M
    if(off+2 < max_end  &&  text[off] == _T('#')  &&  (text[off+1] == _T('x') || text[off+1] == _T('X')))
1387
499k
        is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1388
3.50M
    else if(off+1 < max_end  &&  text[off] == _T('#'))
1389
1.00M
        is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1390
2.49M
    else
1391
2.49M
        is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1392
1393
3.99M
    if(is_contents  &&  off < max_end  &&  text[off] == _T(';')) {
1394
431k
        *p_end = off+1;
1395
431k
        return TRUE;
1396
3.56M
    } else {
1397
3.56M
        return FALSE;
1398
3.56M
    }
1399
3.99M
}
1400
1401
static inline int
1402
md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1403
2.20M
{
1404
2.20M
    return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1405
2.20M
}
1406
1407
1408
/******************************
1409
 ***  Attribute Management  ***
1410
 ******************************/
1411
1412
/* Working storage used by md_build_attribute() to assemble an MD_ATTRIBUTE
 * and released again with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
1413
struct MD_ATTRIBUTE_BUILD_tag {
1414
    CHAR* text;                     /* Attribute text: the raw input itself in the trivial case, a malloc()ed copy otherwise. */
1415
    MD_TEXTTYPE* substr_types;      /* Type of each substring; points to trivial_types[] or to a heap array. */
1416
    OFF* substr_offsets;            /* Start offset of each substring plus one final entry holding the total size. */
1417
    int substr_count;               /* Number of substrings recorded so far. */
1418
    int substr_alloc;               /* Capacity of the heap arrays; 0 in the trivial case, where nothing gets freed. */
1419
    MD_TEXTTYPE trivial_types[1];   /* In-place storage used by the trivial (single substring) case. */
1420
    OFF trivial_offsets[2];         /* Ditto: start offset and the final offset. */
1421
};
1422
1423
1424
64.4M
/* Flag for md_build_attribute(): keep backslashes as they are, i.e. do not
 * resolve backslash escapes when copying the raw text. */
#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1425
1426
static int
1427
md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1428
                            MD_TEXTTYPE type, OFF off)
1429
13.1M
{
1430
13.1M
    if(build->substr_count >= build->substr_alloc) {
1431
159k
        MD_TEXTTYPE* new_substr_types;
1432
159k
        OFF* new_substr_offsets;
1433
1434
159k
        build->substr_alloc = (build->substr_alloc > 0
1435
159k
                ? build->substr_alloc + build->substr_alloc / 2
1436
159k
                : 8);
1437
159k
        new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1438
159k
                                    build->substr_alloc * sizeof(MD_TEXTTYPE));
1439
159k
        if(new_substr_types == NULL) {
1440
0
            MD_LOG("realloc() failed.");
1441
0
            return -1;
1442
0
        }
1443
        /* Note +1 to reserve space for final offset (== raw_size). */
1444
159k
        new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1445
159k
                                    (build->substr_alloc+1) * sizeof(OFF));
1446
159k
        if(new_substr_offsets == NULL) {
1447
0
            MD_LOG("realloc() failed.");
1448
0
            free(new_substr_types);
1449
0
            return -1;
1450
0
        }
1451
1452
159k
        build->substr_types = new_substr_types;
1453
159k
        build->substr_offsets = new_substr_offsets;
1454
159k
    }
1455
1456
13.1M
    build->substr_types[build->substr_count] = type;
1457
13.1M
    build->substr_offsets[build->substr_count] = off;
1458
13.1M
    build->substr_count++;
1459
13.1M
    return 0;
1460
13.1M
}
1461
1462
static void
1463
md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1464
5.19M
{
1465
5.19M
    MD_UNUSED(ctx);
1466
1467
5.19M
    if(build->substr_alloc > 0) {
1468
98.3k
        free(build->text);
1469
98.3k
        free(build->substr_types);
1470
98.3k
        free(build->substr_offsets);
1471
98.3k
    }
1472
5.19M
}
1473
1474
static int
1475
md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1476
                   unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1477
5.19M
{
1478
5.19M
    OFF raw_off, off;
1479
5.19M
    int is_trivial;
1480
5.19M
    int ret = 0;
1481
1482
5.19M
    memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1483
1484
    /* If there is no backslash and no ampersand, build trivial attribute
1485
     * without any malloc(). */
1486
5.19M
    is_trivial = TRUE;
1487
21.9M
    for(raw_off = 0; raw_off < raw_size; raw_off++) {
1488
16.8M
        if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1489
98.3k
            is_trivial = FALSE;
1490
98.3k
            break;
1491
98.3k
        }
1492
16.8M
    }
1493
1494
5.19M
    if(is_trivial) {
1495
5.09M
        build->text = (CHAR*) (raw_size ? raw_text : NULL);
1496
5.09M
        build->substr_types = build->trivial_types;
1497
5.09M
        build->substr_offsets = build->trivial_offsets;
1498
5.09M
        build->substr_count = 1;
1499
5.09M
        build->substr_alloc = 0;
1500
5.09M
        build->trivial_types[0] = MD_TEXT_NORMAL;
1501
5.09M
        build->trivial_offsets[0] = 0;
1502
5.09M
        build->trivial_offsets[1] = raw_size;
1503
5.09M
        off = raw_size;
1504
5.09M
    } else {
1505
98.3k
        build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1506
98.3k
        if(build->text == NULL) {
1507
0
            MD_LOG("malloc() failed.");
1508
0
            goto abort;
1509
0
        }
1510
1511
98.3k
        raw_off = 0;
1512
98.3k
        off = 0;
1513
1514
76.5M
        while(raw_off < raw_size) {
1515
76.4M
            if(raw_text[raw_off] == _T('\0')) {
1516
11.6M
                MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1517
11.6M
                memcpy(build->text + off, raw_text + raw_off, 1);
1518
11.6M
                off++;
1519
11.6M
                raw_off++;
1520
11.6M
                continue;
1521
11.6M
            }
1522
1523
64.7M
            if(raw_text[raw_off] == _T('&')) {
1524
1.79M
                OFF ent_end;
1525
1526
1.79M
                if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1527
289k
                    MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1528
289k
                    memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1529
289k
                    off += ent_end - raw_off;
1530
289k
                    raw_off = ent_end;
1531
289k
                    continue;
1532
289k
                }
1533
1.79M
            }
1534
1535
64.4M
            if(build->substr_count == 0  ||  build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1536
1.18M
                MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1537
1538
64.4M
            if(!(flags & MD_BUILD_ATTR_NO_ESCAPES)  &&
1539
64.4M
               raw_text[raw_off] == _T('\\')  &&  raw_off+1 < raw_size  &&
1540
64.4M
               (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1541
10.2M
                raw_off++;
1542
1543
64.4M
            build->text[off++] = raw_text[raw_off++];
1544
64.4M
        }
1545
98.3k
        build->substr_offsets[build->substr_count] = off;
1546
98.3k
    }
1547
1548
5.19M
    attr->text = build->text;
1549
5.19M
    attr->size = off;
1550
5.19M
    attr->substr_offsets = build->substr_offsets;
1551
5.19M
    attr->substr_types = build->substr_types;
1552
5.19M
    return 0;
1553
1554
0
abort:
1555
0
    md_free_attribute(ctx, build);
1556
0
    return -1;
1557
5.19M
}
1558
1559
1560
/*********************************************
1561
 ***  Dictionary of Reference Definitions  ***
1562
 *********************************************/
1563
1564
2.65M
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Feed n bytes at data into an FNV-1a hash and return the updated hash.
 *
 * Pass MD_FNV1A_BASE as base to start a new hash, or the result of a
 * previous call to continue hashing more data.
 */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* byte = (const unsigned char*) data;
    unsigned h = base;

    for( ; n > 0; n--, byte++)
        h = (h ^ *byte) * MD_FNV1A_PRIME;

    return h;
}
1581
1582
1583
/* One link reference definition. */
struct MD_REF_DEF_tag {
1584
    CHAR* label;            /* Label text; compared via md_link_label_cmp(). */
1585
    CHAR* title;            /* NOTE(review): presumably the link title -- confirm against the code filling it. */
1586
    unsigned hash;          /* md_link_label_hash() of the label; set in md_build_ref_def_hashtable(). */
1587
    SZ label_size;          /* Length of label. */
1588
    SZ title_size;          /* NOTE(review): presumably the length of title -- confirm. */
1589
    OFF dest_beg;           /* NOTE(review): presumably the begin offset of the link destination -- confirm. */
1590
    OFF dest_end;           /* NOTE(review): presumably the end offset of the link destination -- confirm. */
1591
    unsigned char label_needs_free : 1;     /* NOTE(review): presumably set when label is heap-allocated -- confirm. */
1592
    unsigned char title_needs_free : 1;     /* NOTE(review): presumably set when title is heap-allocated -- confirm. */
1593
};
1594
1595
/* Label equivalence is quite complicated with regards to whitespace and case
1596
 * folding. This complicates computing a hash of it as well as direct comparison
1597
 * of two labels. */
1598
1599
static unsigned
1600
md_link_label_hash(const CHAR* label, SZ size)
1601
2.65M
{
1602
2.65M
    unsigned hash = MD_FNV1A_BASE;
1603
2.65M
    OFF off;
1604
2.65M
    unsigned codepoint;
1605
2.65M
    int is_whitespace = FALSE;
1606
1607
2.65M
    off = md_skip_unicode_whitespace(label, 0, size);
1608
22.5M
    while(off < size) {
1609
19.8M
        SZ char_size;
1610
1611
19.8M
        codepoint = md_decode_unicode(label, off, size, &char_size);
1612
19.8M
        is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1613
1614
19.8M
        if(is_whitespace) {
1615
808k
            codepoint = ' ';
1616
808k
            hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1617
808k
            off = md_skip_unicode_whitespace(label, off, size);
1618
19.0M
        } else {
1619
19.0M
            MD_UNICODE_FOLD_INFO fold_info;
1620
1621
19.0M
            md_get_unicode_fold_info(codepoint, &fold_info);
1622
19.0M
            hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1623
19.0M
            off += char_size;
1624
19.0M
        }
1625
19.8M
    }
1626
1627
2.65M
    return hash;
1628
2.65M
}
1629
1630
static OFF
1631
md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1632
                                 MD_UNICODE_FOLD_INFO* fold_info)
1633
46.1M
{
1634
46.1M
    unsigned codepoint;
1635
46.1M
    SZ char_size;
1636
1637
46.1M
    if(off >= size) {
1638
        /* Treat end of a link label as a whitespace. */
1639
62.5k
        goto whitespace;
1640
62.5k
    }
1641
1642
46.1M
    codepoint = md_decode_unicode(label, off, size, &char_size);
1643
46.1M
    off += char_size;
1644
46.1M
    if(ISUNICODEWHITESPACE_(codepoint)) {
1645
        /* Treat all whitespace as equivalent */
1646
103k
        goto whitespace;
1647
103k
    }
1648
1649
    /* Get real folding info. */
1650
46.0M
    md_get_unicode_fold_info(codepoint, fold_info);
1651
46.0M
    return off;
1652
1653
166k
whitespace:
1654
166k
    fold_info->codepoints[0] = _T(' ');
1655
166k
    fold_info->n_codepoints = 1;
1656
166k
    return md_skip_unicode_whitespace(label, off, size);
1657
46.1M
}
1658
1659
static int
1660
md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1661
11.3M
{
1662
11.3M
    OFF a_off;
1663
11.3M
    OFF b_off;
1664
11.3M
    MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1665
11.3M
    MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1666
11.3M
    OFF a_fi_off = 0;
1667
11.3M
    OFF b_fi_off = 0;
1668
11.3M
    int cmp;
1669
1670
11.3M
    a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1671
11.3M
    b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1672
33.2M
    while(a_off < a_size || a_fi_off < a_fi.n_codepoints ||
1673
33.2M
          b_off < b_size || b_fi_off < b_fi.n_codepoints)
1674
23.0M
    {
1675
        /* If needed, load fold info for next char. */
1676
23.0M
        if(a_fi_off >= a_fi.n_codepoints) {
1677
23.0M
            a_fi_off = 0;
1678
23.0M
            a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1679
23.0M
        }
1680
23.0M
        if(b_fi_off >= b_fi.n_codepoints) {
1681
23.0M
            b_fi_off = 0;
1682
23.0M
            b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1683
23.0M
        }
1684
1685
23.0M
        cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1686
23.0M
        if(cmp != 0)
1687
1.21M
            return cmp;
1688
1689
21.8M
        a_fi_off++;
1690
21.8M
        b_fi_off++;
1691
21.8M
    }
1692
1693
10.1M
    return 0;
1694
11.3M
}
1695
1696
/* Contents of a "complex" hash table bucket, i.e. of a bucket shared by
 * several reference definitions. md_build_ref_def_hashtable() sorts the list
 * so that md_lookup_ref_def() can use bsearch() on it. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
1697
struct MD_REF_DEF_LIST_tag {
1698
    int n_ref_defs;         /* Count of used items in ref_defs[]. */
1699
    int alloc_ref_defs;     /* Count of items ref_defs[] has room for. */
1700
    MD_REF_DEF* ref_defs[];  /* Valid items always point into ctx->ref_defs[] */
1701
};
1702
1703
static int
1704
md_ref_def_cmp(const void* a, const void* b)
1705
10.8M
{
1706
10.8M
    const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1707
10.8M
    const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1708
1709
10.8M
    if(a_ref->hash < b_ref->hash)
1710
19.9k
        return -1;
1711
10.8M
    else if(a_ref->hash > b_ref->hash)
1712
16.6k
        return +1;
1713
10.8M
    else
1714
10.8M
        return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1715
10.8M
}
1716
1717
static int
1718
md_ref_def_cmp_for_sort(const void* a, const void* b)
1719
9.53M
{
1720
9.53M
    int cmp;
1721
1722
9.53M
    cmp = md_ref_def_cmp(a, b);
1723
1724
    /* Ensure stability of the sorting. */
1725
9.53M
    if(cmp == 0) {
1726
8.41M
        const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1727
8.41M
        const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1728
1729
8.41M
        if(a_ref < b_ref)
1730
8.41M
            cmp = -1;
1731
0
        else if(a_ref > b_ref)
1732
0
            cmp = +1;
1733
0
        else
1734
0
            cmp = 0;
1735
8.41M
    }
1736
1737
9.53M
    return cmp;
1738
9.53M
}
1739
1740
static int
1741
md_build_ref_def_hashtable(MD_CTX* ctx)
1742
16.6k
{
1743
16.6k
    int i, j;
1744
1745
16.6k
    if(ctx->n_ref_defs == 0)
1746
14.1k
        return 0;
1747
1748
2.49k
    ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1749
2.49k
    ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
1750
2.49k
    if(ctx->ref_def_hashtable == NULL) {
1751
0
        MD_LOG("malloc() failed.");
1752
0
        goto abort;
1753
0
    }
1754
2.49k
    memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
1755
1756
    /* Each member of ctx->ref_def_hashtable[] can be:
1757
     *  -- NULL,
1758
     *  -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1759
     *  -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1760
     *     such MD_REF_DEFs.
1761
     */
1762
1.39M
    for(i = 0; i < ctx->n_ref_defs; i++) {
1763
1.38M
        MD_REF_DEF* def = &ctx->ref_defs[i];
1764
1.38M
        void* bucket;
1765
1.38M
        MD_REF_DEF_LIST* list;
1766
1767
1.38M
        def->hash = md_link_label_hash(def->label, def->label_size);
1768
1.38M
        bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1769
1770
1.38M
        if(bucket == NULL) {
1771
            /* The bucket is empty. Make it just point to the def. */
1772
14.7k
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1773
14.7k
            continue;
1774
14.7k
        }
1775
1776
1.37M
        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1777
            /* The bucket already contains one ref. def. Lets see whether it
1778
             * is the same label (ref. def. duplicate) or different one
1779
             * (hash conflict). */
1780
259k
            MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1781
1782
259k
            if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
1783
                /* Duplicate label: Ignore this ref. def. */
1784
257k
                continue;
1785
257k
            }
1786
1787
            /* Make the bucket complex, i.e. able to hold more ref. defs. */
1788
2.11k
            list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1789
2.11k
            if(list == NULL) {
1790
0
                MD_LOG("malloc() failed.");
1791
0
                goto abort;
1792
0
            }
1793
2.11k
            list->ref_defs[0] = old_def;
1794
2.11k
            list->ref_defs[1] = def;
1795
2.11k
            list->n_ref_defs = 2;
1796
2.11k
            list->alloc_ref_defs = 2;
1797
2.11k
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1798
2.11k
            continue;
1799
2.11k
        }
1800
1801
        /* Append the def to the complex bucket list.
1802
         *
1803
         * Note in this case we ignore potential duplicates to avoid expensive
1804
         * iterating over the complex bucket. Below, we revisit all the complex
1805
         * buckets and handle it more cheaply after the complex bucket contents
1806
         * is sorted. */
1807
1.11M
        list = (MD_REF_DEF_LIST*) bucket;
1808
1.11M
        if(list->n_ref_defs >= list->alloc_ref_defs) {
1809
4.69k
            int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1810
4.69k
            MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
1811
4.69k
                        sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1812
4.69k
            if(list_tmp == NULL) {
1813
0
                MD_LOG("realloc() failed.");
1814
0
                goto abort;
1815
0
            }
1816
4.69k
            list = list_tmp;
1817
4.69k
            list->alloc_ref_defs = alloc_ref_defs;
1818
4.69k
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1819
4.69k
        }
1820
1821
1.11M
        list->ref_defs[list->n_ref_defs] = def;
1822
1.11M
        list->n_ref_defs++;
1823
1.11M
    }
1824
1825
    /* Sort the complex buckets so we can use bsearch() with them. */
1826
1.73M
    for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1827
1.73M
        void* bucket = ctx->ref_def_hashtable[i];
1828
1.73M
        MD_REF_DEF_LIST* list;
1829
1830
1.73M
        if(bucket == NULL)
1831
1.72M
            continue;
1832
14.7k
        if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1833
12.6k
            continue;
1834
1835
2.11k
        list = (MD_REF_DEF_LIST*) bucket;
1836
2.11k
        qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
1837
1838
        /* Disable all duplicates in the complex bucket by forcing all such
1839
         * records to point to the 1st such ref. def. I.e. no matter which
1840
         * record is found during the lookup, it will always point to the right
1841
         * ref. def. in ctx->ref_defs[]. */
1842
1.11M
        for(j = 1; j < list->n_ref_defs; j++) {
1843
1.11M
            if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
1844
1.11M
                list->ref_defs[j] = list->ref_defs[j-1];
1845
1.11M
        }
1846
2.11k
    }
1847
1848
2.49k
    return 0;
1849
1850
0
abort:
1851
0
    return -1;
1852
2.49k
}
1853
1854
static void
1855
md_free_ref_def_hashtable(MD_CTX* ctx)
1856
16.6k
{
1857
16.6k
    if(ctx->ref_def_hashtable != NULL) {
1858
2.49k
        int i;
1859
1860
1.73M
        for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1861
1.73M
            void* bucket = ctx->ref_def_hashtable[i];
1862
1.73M
            if(bucket == NULL)
1863
1.72M
                continue;
1864
14.7k
            if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1865
12.6k
                continue;
1866
2.11k
            free(bucket);
1867
2.11k
        }
1868
1869
2.49k
        free(ctx->ref_def_hashtable);
1870
2.49k
    }
1871
16.6k
}
1872
1873
static const MD_REF_DEF*
1874
md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1875
1.41M
{
1876
1.41M
    unsigned hash;
1877
1.41M
    void* bucket;
1878
1879
1.41M
    if(ctx->ref_def_hashtable_size == 0)
1880
331k
        return NULL;
1881
1882
1.08M
    hash = md_link_label_hash(label, label_size);
1883
1.08M
    bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1884
1885
1.08M
    if(bucket == NULL) {
1886
622k
        return NULL;
1887
622k
    } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1888
280k
        const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1889
1890
280k
        if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1891
180k
            return def;
1892
100k
        else
1893
100k
            return NULL;
1894
280k
    } else {
1895
182k
        MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1896
182k
        MD_REF_DEF key_buf;
1897
182k
        const MD_REF_DEF* key = &key_buf;
1898
182k
        const MD_REF_DEF** ret;
1899
1900
182k
        key_buf.label = (CHAR*) label;
1901
182k
        key_buf.label_size = label_size;
1902
182k
        key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1903
1904
182k
        ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1905
182k
                    list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1906
182k
        if(ret != NULL)
1907
174k
            return *ret;
1908
8.29k
        else
1909
8.29k
            return NULL;
1910
182k
    }
1911
1.08M
}
1912
1913
1914
/***************************
1915
 ***  Recognizing Links  ***
1916
 ***************************/
1917
1918
/* Note this code is partially shared between processing inlines and blocks
1919
 * as reference definitions and links share some helper parser functions.
1920
 */
1921
1922
/* Attributes of a link. NOTE(review): the code using this structure is not
 * visible here; the member descriptions below are inferred from their names
 * and need to be confirmed. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
1923
struct MD_LINK_ATTR_tag {
1924
    OFF dest_beg;           /* Presumably the begin offset of the link destination. */
1925
    OFF dest_end;           /* Presumably the end offset of the link destination. */
1926
1927
    CHAR* title;            /* Presumably the link title. */
1928
    SZ title_size;          /* Presumably the length of title. */
1929
    int title_needs_free;   /* Presumably non-zero when title has to be freed. */
1930
};
1931
1932
1933
static int
1934
md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
1935
                 OFF* p_end, MD_SIZE* p_beg_line_index, MD_SIZE* p_end_line_index,
1936
                 OFF* p_contents_beg, OFF* p_contents_end)
1937
1.60M
{
1938
1.60M
    OFF off = beg;
1939
1.60M
    OFF contents_beg = 0;
1940
1.60M
    OFF contents_end = 0;
1941
1.60M
    MD_SIZE line_index = 0;
1942
1.60M
    int len = 0;
1943
1944
1.60M
    *p_beg_line_index = 0;
1945
1946
1.60M
    if(CH(off) != _T('['))
1947
11.1k
        return FALSE;
1948
1.59M
    off++;
1949
1950
1.72M
    while(1) {
1951
1.72M
        OFF line_end = lines[line_index].end;
1952
1953
7.68M
        while(off < line_end) {
1954
7.48M
            if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1955
85.0k
                if(contents_end == 0) {
1956
12.9k
                    contents_beg = off;
1957
12.9k
                    *p_beg_line_index = line_index;
1958
12.9k
                }
1959
85.0k
                contents_end = off + 2;
1960
85.0k
                off += 2;
1961
7.39M
            } else if(CH(off) == _T('[')) {
1962
39.7k
                return FALSE;
1963
7.35M
            } else if(CH(off) == _T(']')) {
1964
1.48M
                if(contents_beg < contents_end) {
1965
                    /* Success. */
1966
1.46M
                    *p_contents_beg = contents_beg;
1967
1.46M
                    *p_contents_end = contents_end;
1968
1.46M
                    *p_end = off+1;
1969
1.46M
                    *p_end_line_index = line_index;
1970
1.46M
                    return TRUE;
1971
1.46M
                } else {
1972
                    /* Link label must have some non-whitespace contents. */
1973
17.3k
                    return FALSE;
1974
17.3k
                }
1975
5.87M
            } else {
1976
5.87M
                unsigned codepoint;
1977
5.87M
                SZ char_size;
1978
1979
5.87M
                codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1980
5.87M
                if(!ISUNICODEWHITESPACE_(codepoint)) {
1981
5.32M
                    if(contents_end == 0) {
1982
1.51M
                        contents_beg = off;
1983
1.51M
                        *p_beg_line_index = line_index;
1984
1.51M
                    }
1985
5.32M
                    contents_end = off + char_size;
1986
5.32M
                }
1987
1988
5.87M
                off += char_size;
1989
5.87M
            }
1990
1991
5.96M
            len++;
1992
5.96M
            if(len > 999)
1993
579
                return FALSE;
1994
5.96M
        }
1995
1996
195k
        line_index++;
1997
195k
        len++;
1998
195k
        if(line_index < n_lines)
1999
125k
            off = lines[line_index].beg;
2000
70.3k
        else
2001
70.3k
            break;
2002
195k
    }
2003
2004
70.3k
    return FALSE;
2005
1.59M
}
2006
2007
static int
2008
md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2009
                         OFF* p_contents_beg, OFF* p_contents_end)
2010
56.8k
{
2011
56.8k
    OFF off = beg;
2012
2013
56.8k
    if(off >= max_end  ||  CH(off) != _T('<'))
2014
0
        return FALSE;
2015
56.8k
    off++;
2016
2017
1.86M
    while(off < max_end) {
2018
1.85M
        if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
2019
55.5k
            off += 2;
2020
55.5k
            continue;
2021
55.5k
        }
2022
2023
1.79M
        if(ISNEWLINE(off)  ||  CH(off) == _T('<'))
2024
6.58k
            return FALSE;
2025
2026
1.79M
        if(CH(off) == _T('>')) {
2027
            /* Success. */
2028
39.9k
            *p_contents_beg = beg+1;
2029
39.9k
            *p_contents_end = off;
2030
39.9k
            *p_end = off+1;
2031
39.9k
            return TRUE;
2032
39.9k
        }
2033
2034
1.75M
        off++;
2035
1.75M
    }
2036
2037
10.3k
    return FALSE;
2038
56.8k
}
2039
2040
static int
2041
md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2042
                         OFF* p_contents_beg, OFF* p_contents_end)
2043
3.21M
{
2044
3.21M
    OFF off = beg;
2045
3.21M
    int parenthesis_level = 0;
2046
2047
35.6M
    while(off < max_end) {
2048
34.1M
        if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
2049
102k
            off += 2;
2050
102k
            continue;
2051
102k
        }
2052
2053
34.0M
        if(ISWHITESPACE(off) || ISCNTRL(off))
2054
232k
            break;
2055
2056
        /* Link destination may include balanced pairs of unescaped '(' ')'.
2057
         * Note we limit the maximal nesting level by 32 to protect us from
2058
         * https://github.com/jgm/cmark/issues/214 */
2059
33.8M
        if(CH(off) == _T('(')) {
2060
2.12M
            parenthesis_level++;
2061
2.12M
            if(parenthesis_level > 32)
2062
1.04k
                return FALSE;
2063
31.7M
        } else if(CH(off) == _T(')')) {
2064
3.23M
            if(parenthesis_level == 0)
2065
1.50M
                break;
2066
1.72M
            parenthesis_level--;
2067
1.72M
        }
2068
2069
32.3M
        off++;
2070
32.3M
    }
2071
2072
3.21M
    if(parenthesis_level != 0  ||  off == beg)
2073
166k
        return FALSE;
2074
2075
    /* Success. */
2076
3.04M
    *p_contents_beg = beg;
2077
3.04M
    *p_contents_end = off;
2078
3.04M
    *p_end = off;
2079
3.04M
    return TRUE;
2080
3.21M
}
2081
2082
static inline int
2083
md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
2084
                       OFF* p_contents_beg, OFF* p_contents_end)
2085
3.26M
{
2086
3.26M
    if(CH(beg) == _T('<'))
2087
56.8k
        return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2088
3.21M
    else
2089
3.21M
        return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2090
3.26M
}
2091
2092
static int
2093
md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
2094
                 OFF* p_end, MD_SIZE* p_beg_line_index, MD_SIZE* p_end_line_index,
2095
                 OFF* p_contents_beg, OFF* p_contents_end)
2096
3.08M
{
2097
3.08M
    OFF off = beg;
2098
3.08M
    CHAR closer_char;
2099
3.08M
    MD_SIZE line_index = 0;
2100
2101
    /* White space with up to one line break. */
2102
3.31M
    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2103
233k
        off++;
2104
3.08M
    if(off >= lines[line_index].end) {
2105
1.43M
        line_index++;
2106
1.43M
        if(line_index >= n_lines)
2107
29.2k
            return FALSE;
2108
1.40M
        off = lines[line_index].beg;
2109
1.40M
    }
2110
3.05M
    if(off == beg)
2111
1.54M
        return FALSE;
2112
2113
1.50M
    *p_beg_line_index = line_index;
2114
2115
    /* First char determines how to detect end of it. */
2116
1.50M
    switch(CH(off)) {
2117
3.89k
        case _T('"'):   closer_char = _T('"'); break;
2118
30.1k
        case _T('\''):  closer_char = _T('\''); break;
2119
50.6k
        case _T('('):   closer_char = _T(')'); break;
2120
1.42M
        default:        return FALSE;
2121
1.50M
    }
2122
84.6k
    off++;
2123
2124
84.6k
    *p_contents_beg = off;
2125
2126
4.11M
    while(line_index < n_lines) {
2127
4.09M
        OFF line_end = lines[line_index].end;
2128
2129
40.9M
        while(off < line_end) {
2130
36.9M
            if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2131
3.85M
                off++;
2132
33.1M
            } else if(CH(off) == closer_char) {
2133
                /* Success. */
2134
49.9k
                *p_contents_end = off;
2135
49.9k
                *p_end = off+1;
2136
49.9k
                *p_end_line_index = line_index;
2137
49.9k
                return TRUE;
2138
33.0M
            } else if(closer_char == _T(')')  &&  CH(off) == _T('(')) {
2139
                /* ()-style title cannot contain (unescaped '(')) */
2140
15.1k
                return FALSE;
2141
15.1k
            }
2142
2143
36.8M
            off++;
2144
36.8M
        }
2145
2146
4.03M
        line_index++;
2147
4.03M
    }
2148
2149
19.6k
    return FALSE;
2150
84.6k
}
2151
2152
/* Returns 0 if it is not a reference definition.
2153
 *
2154
 * Returns N > 0 if it is a reference definition. N then corresponds to the
2155
 * number of lines forming it. In this case the definition is stored for
2156
 * resolving any links referring to it.
2157
 *
2158
 * Returns -1 in case of an error (out of memory).
2159
 */
2160
static int
2161
md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
2162
1.60M
{
2163
1.60M
    OFF label_contents_beg;
2164
1.60M
    OFF label_contents_end;
2165
1.60M
    MD_SIZE label_contents_line_index;
2166
1.60M
    int label_is_multiline = FALSE;
2167
1.60M
    OFF dest_contents_beg;
2168
1.60M
    OFF dest_contents_end;
2169
1.60M
    OFF title_contents_beg;
2170
1.60M
    OFF title_contents_end;
2171
1.60M
    MD_SIZE title_contents_line_index;
2172
1.60M
    int title_is_multiline = FALSE;
2173
1.60M
    OFF off;
2174
1.60M
    MD_SIZE line_index = 0;
2175
1.60M
    MD_SIZE tmp_line_index;
2176
1.60M
    MD_REF_DEF* def = NULL;
2177
1.60M
    int ret = 0;
2178
2179
    /* Link label. */
2180
1.60M
    if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2181
1.60M
                &off, &label_contents_line_index, &line_index,
2182
1.60M
                &label_contents_beg, &label_contents_end))
2183
139k
        return FALSE;
2184
1.46M
    label_is_multiline = (label_contents_line_index != line_index);
2185
2186
    /* Colon. */
2187
1.46M
    if(off >= lines[line_index].end  ||  CH(off) != _T(':'))
2188
48.0k
        return FALSE;
2189
1.41M
    off++;
2190
2191
    /* Optional white space with up to one line break. */
2192
1.43M
    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2193
15.6k
        off++;
2194
1.41M
    if(off >= lines[line_index].end) {
2195
9.08k
        line_index++;
2196
9.08k
        if(line_index >= n_lines)
2197
6.59k
            return FALSE;
2198
2.48k
        off = lines[line_index].beg;
2199
2.48k
    }
2200
2201
    /* Link destination. */
2202
1.41M
    if(!md_is_link_destination(ctx, off, lines[line_index].end,
2203
1.41M
                &off, &dest_contents_beg, &dest_contents_end))
2204
4.97k
        return FALSE;
2205
2206
    /* (Optional) title. Note we interpret it as an title only if nothing
2207
     * more follows on its last line. */
2208
1.40M
    if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2209
1.40M
                &off, &title_contents_line_index, &tmp_line_index,
2210
1.40M
                &title_contents_beg, &title_contents_end)
2211
1.40M
        &&  off >= lines[line_index + tmp_line_index].end)
2212
5.34k
    {
2213
5.34k
        title_is_multiline = (tmp_line_index != title_contents_line_index);
2214
5.34k
        title_contents_line_index += line_index;
2215
5.34k
        line_index += tmp_line_index;
2216
1.40M
    } else {
2217
        /* Not a title. */
2218
1.40M
        title_is_multiline = FALSE;
2219
1.40M
        title_contents_beg = off;
2220
1.40M
        title_contents_end = off;
2221
1.40M
        title_contents_line_index = 0;
2222
1.40M
    }
2223
2224
    /* Nothing more can follow on the last line. */
2225
1.40M
    if(off < lines[line_index].end)
2226
17.6k
        return FALSE;
2227
2228
    /* So, it _is_ a reference definition. Remember it. */
2229
1.38M
    if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2230
6.01k
        MD_REF_DEF* new_defs;
2231
2232
6.01k
        ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2233
6.01k
                ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2234
6.01k
                : 16);
2235
6.01k
        new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2236
6.01k
        if(new_defs == NULL) {
2237
0
            MD_LOG("realloc() failed.");
2238
0
            goto abort;
2239
0
        }
2240
2241
6.01k
        ctx->ref_defs = new_defs;
2242
6.01k
    }
2243
1.38M
    def = &ctx->ref_defs[ctx->n_ref_defs];
2244
1.38M
    memset(def, 0, sizeof(MD_REF_DEF));
2245
2246
1.38M
    if(label_is_multiline) {
2247
18.4k
        MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2248
18.4k
                    lines + label_contents_line_index, n_lines - label_contents_line_index,
2249
18.4k
                    _T(' '), &def->label, &def->label_size));
2250
18.4k
        def->label_needs_free = TRUE;
2251
1.37M
    } else {
2252
1.37M
        def->label = (CHAR*) STR(label_contents_beg);
2253
1.37M
        def->label_size = label_contents_end - label_contents_beg;
2254
1.37M
    }
2255
2256
1.38M
    if(title_is_multiline) {
2257
5.08k
        MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2258
5.08k
                    lines + title_contents_line_index, n_lines - title_contents_line_index,
2259
5.08k
                    _T('\n'), &def->title, &def->title_size));
2260
5.08k
        def->title_needs_free = TRUE;
2261
1.38M
    } else {
2262
1.38M
        def->title = (CHAR*) STR(title_contents_beg);
2263
1.38M
        def->title_size = title_contents_end - title_contents_beg;
2264
1.38M
    }
2265
2266
1.38M
    def->dest_beg = dest_contents_beg;
2267
1.38M
    def->dest_end = dest_contents_end;
2268
2269
    /* Success. */
2270
1.38M
    ctx->n_ref_defs++;
2271
1.38M
    return line_index + 1;
2272
2273
0
abort:
2274
    /* Failure. */
2275
0
    if(def != NULL  &&  def->label_needs_free)
2276
0
        free(def->label);
2277
0
    if(def != NULL  &&  def->title_needs_free)
2278
0
        free(def->title);
2279
0
    return ret;
2280
1.38M
}
2281
2282
static int
2283
md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
2284
                     OFF beg, OFF end, MD_LINK_ATTR* attr)
2285
1.42M
{
2286
1.42M
    const MD_REF_DEF* def;
2287
1.42M
    const MD_LINE* beg_line;
2288
1.42M
    int is_multiline;
2289
1.42M
    CHAR* label;
2290
1.42M
    SZ label_size;
2291
1.42M
    int ret = FALSE;
2292
2293
1.42M
    MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2294
1.42M
    MD_ASSERT(CH(end-1) == _T(']'));
2295
2296
1.42M
    if(ctx->max_ref_def_output == 0)
2297
6.48k
        return FALSE;
2298
2299
1.41M
    beg += (CH(beg) == _T('!') ? 2 : 1);
2300
1.41M
    end--;
2301
2302
    /* Find lines corresponding to the beg and end positions. */
2303
1.41M
    beg_line = md_lookup_line(beg, lines, n_lines, NULL);
2304
1.41M
    is_multiline = (end > beg_line->end);
2305
2306
1.41M
    if(is_multiline) {
2307
70.8k
        MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2308
70.8k
                 (int)(n_lines - (beg_line - lines)), _T(' '), &label, &label_size));
2309
1.34M
    } else {
2310
1.34M
        label = (CHAR*) STR(beg);
2311
1.34M
        label_size = end - beg;
2312
1.34M
    }
2313
2314
1.41M
    def = md_lookup_ref_def(ctx, label, label_size);
2315
1.41M
    if(def != NULL) {
2316
355k
        attr->dest_beg = def->dest_beg;
2317
355k
        attr->dest_end = def->dest_end;
2318
355k
        attr->title = def->title;
2319
355k
        attr->title_size = def->title_size;
2320
355k
        attr->title_needs_free = FALSE;
2321
355k
    }
2322
2323
1.41M
    if(is_multiline)
2324
70.8k
        free(label);
2325
2326
1.41M
    if(def != NULL) {
2327
        /* See https://github.com/mity/md4c/issues/238 */
2328
355k
        MD_SIZE output_size_estimation = def->label_size + def->title_size + def->dest_end - def->dest_beg;
2329
355k
        if(output_size_estimation < ctx->max_ref_def_output) {
2330
355k
            ctx->max_ref_def_output -= output_size_estimation;
2331
355k
            ret = TRUE;
2332
355k
        } else {
2333
29
            MD_LOG("Too many link reference definition instantiations.");
2334
29
            ctx->max_ref_def_output = 0;
2335
29
        }
2336
355k
    }
2337
2338
1.41M
abort:
2339
1.41M
    return ret;
2340
1.41M
}
2341
2342
static int
2343
md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
2344
                       OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2345
1.91M
{
2346
1.91M
    MD_SIZE line_index = 0;
2347
1.91M
    MD_SIZE tmp_line_index;
2348
1.91M
    OFF title_contents_beg;
2349
1.91M
    OFF title_contents_end;
2350
1.91M
    MD_SIZE title_contents_line_index;
2351
1.91M
    int title_is_multiline;
2352
1.91M
    OFF off = beg;
2353
1.91M
    int ret = FALSE;
2354
2355
1.91M
    md_lookup_line(off, lines, n_lines, &line_index);
2356
2357
1.91M
    MD_ASSERT(CH(off) == _T('('));
2358
1.91M
    off++;
2359
2360
    /* Optional white space with up to one line break. */
2361
2.08M
    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2362
163k
        off++;
2363
1.91M
    if(off >= lines[line_index].end  &&  (off >= ctx->size  ||  ISNEWLINE(off))) {
2364
118k
        line_index++;
2365
118k
        if(line_index >= n_lines)
2366
4.15k
            return FALSE;
2367
114k
        off = lines[line_index].beg;
2368
114k
    }
2369
2370
    /* Link destination may be omitted, but only when not also having a title. */
2371
1.91M
    if(off < ctx->size  &&  CH(off) == _T(')')) {
2372
57.6k
        attr->dest_beg = off;
2373
57.6k
        attr->dest_end = off;
2374
57.6k
        attr->title = NULL;
2375
57.6k
        attr->title_size = 0;
2376
57.6k
        attr->title_needs_free = FALSE;
2377
57.6k
        off++;
2378
57.6k
        *p_end = off;
2379
57.6k
        return TRUE;
2380
57.6k
    }
2381
2382
    /* Link destination. */
2383
1.85M
    if(!md_is_link_destination(ctx, off, lines[line_index].end,
2384
1.85M
                        &off, &attr->dest_beg, &attr->dest_end))
2385
179k
        return FALSE;
2386
2387
    /* (Optional) title. */
2388
1.67M
    if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2389
1.67M
                &off, &title_contents_line_index, &tmp_line_index,
2390
1.67M
                &title_contents_beg, &title_contents_end))
2391
42.9k
    {
2392
42.9k
        title_is_multiline = (tmp_line_index != title_contents_line_index);
2393
42.9k
        title_contents_line_index += line_index;
2394
42.9k
        line_index += tmp_line_index;
2395
1.63M
    } else {
2396
        /* Not a title. */
2397
1.63M
        title_is_multiline = FALSE;
2398
1.63M
        title_contents_beg = off;
2399
1.63M
        title_contents_end = off;
2400
1.63M
        title_contents_line_index = 0;
2401
1.63M
    }
2402
2403
    /* Optional whitespace followed with final ')'. */
2404
1.84M
    while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2405
166k
        off++;
2406
1.67M
    if(off >= lines[line_index].end) {
2407
38.5k
        line_index++;
2408
38.5k
        if(line_index >= n_lines)
2409
7.96k
            return FALSE;
2410
30.6k
        off = lines[line_index].beg;
2411
30.6k
    }
2412
1.66M
    if(CH(off) != _T(')'))
2413
124k
        goto abort;
2414
1.54M
    off++;
2415
2416
1.54M
    if(title_contents_beg >= title_contents_end) {
2417
1.50M
        attr->title = NULL;
2418
1.50M
        attr->title_size = 0;
2419
1.50M
        attr->title_needs_free = FALSE;
2420
1.50M
    } else if(!title_is_multiline) {
2421
6.85k
        attr->title = (CHAR*) STR(title_contents_beg);
2422
6.85k
        attr->title_size = title_contents_end - title_contents_beg;
2423
6.85k
        attr->title_needs_free = FALSE;
2424
27.6k
    } else {
2425
27.6k
        MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2426
27.6k
                    lines + title_contents_line_index, n_lines - title_contents_line_index,
2427
27.6k
                    _T('\n'), &attr->title, &attr->title_size));
2428
27.6k
        attr->title_needs_free = TRUE;
2429
27.6k
    }
2430
2431
1.54M
    *p_end = off;
2432
1.54M
    ret = TRUE;
2433
2434
1.66M
abort:
2435
1.66M
    return ret;
2436
1.54M
}
2437
2438
static void
2439
md_free_ref_defs(MD_CTX* ctx)
2440
16.6k
{
2441
16.6k
    int i;
2442
2443
1.40M
    for(i = 0; i < ctx->n_ref_defs; i++) {
2444
1.38M
        MD_REF_DEF* def = &ctx->ref_defs[i];
2445
2446
1.38M
        if(def->label_needs_free)
2447
18.4k
            free(def->label);
2448
1.38M
        if(def->title_needs_free)
2449
5.08k
            free(def->title);
2450
1.38M
    }
2451
2452
16.6k
    free(ctx->ref_defs);
2453
16.6k
}
2454
2455
2456
/******************************************
2457
 ***  Processing Inlines (a.k.a Spans)  ***
2458
 ******************************************/
2459
2460
/* We process inlines in a few phases:
2461
 *
2462
 * (1) We go through the block text and collect all significant characters
2463
 *     which may start/end a span or some other significant position into
2464
 *     ctx->marks[]. Core of this is what md_collect_marks() does.
2465
 *
2466
 *     We also do some very brief preliminary context-less analysis, whether
2467
 *     it might be opener or closer (e.g. of an emphasis span).
2468
 *
2469
 *     This speeds the other steps as we do not need to re-iterate over all
2470
 *     characters anymore.
2471
 *
2472
 * (2) We analyze each potential mark type, in order of precedence.
2473
 *
2474
 *     In each md_analyze_XXX() function, we re-iterate over the list of marks,
2475
 *     skipping already resolved regions (in preceding precedences) and try to
2476
 *     resolve them.
2477
 *
2478
 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2479
 *       them as resolved.
2480
 *
2481
 * (2.2) For range-type marks, we analyze whether the mark could be closer
2482
 *       and, if yes, whether there is some preceding opener it could satisfy.
2483
 *
2484
 *       If not we check whether it could be really an opener and if yes, we
2485
 *       remember it so subsequent closers may resolve it.
2486
 *
2487
 * (3) Finally, when all marks were analyzed, we render the block contents
2488
 *     by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2489
 *     or ::close_span() whenever we reach a resolved mark.
2490
 */
2491
2492
2493
/* The mark structure.
 *
 * Each mark records one potentially significant character (or run of
 * characters) of the processed text. The member 'ch' identifies its kind:
 *
 * '\\': Maybe escape sequence.
 * '\0': NUL character.
 *  '*': Maybe (strong) emphasis start/end.
 *  '_': Maybe (strong) emphasis start/end.
 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 *  '`': Maybe code span start/end.
 *  '&': Maybe start of entity.
 *  ';': Maybe end of entity.
 *  '<': Maybe start of raw HTML or autolink.
 *  '>': Maybe end of raw HTML or autolink.
 *  '[': Maybe start of link label or link text.
 *  '!': Equivalent of '[' for image.
 *  ']': Maybe end of link label or link text.
 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 *  'D': Dummy mark, it reserves a space for splitting a previous mark
 *       (e.g. emphasis) or to make more space for storing some special data
 *       related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible to fit as many of them as possible
 * into a CPU cache line.)
 */
2522
struct MD_MARK_tag {
2523
    /* Offsets delimiting the mark in the text: 'beg' is its first character,
     * 'end' is just past its last one. For dummy ('D') marks these two members
     * may instead hold a raw pointer, see md_mark_store_ptr(). */
    OFF beg;
2524
    OFF end;
2525
2526
    /* For unresolved openers, 'next' may be used to form a stack of
     * unresolved open openers.
     *
     * When resolved with MD_MARK_OPENER/CLOSER flag, next/prev is index of the
     * respective closer/opener.
     *
     * ADD_MARK() initializes both to -1.
     */
2532
    int prev;
2533
    int next;
2534
    /* Kind of the mark; one of the characters listed above. */
    CHAR ch;
2535
    /* Bit mask of the MD_MARK_xxxx flags. */
    unsigned char flags;
2536
};
2537
2538
/* Mark flags (these apply to ALL mark types). They are stored in
 * MD_MARK::flags, which is an unsigned char, so only bits 0x01 ... 0x80 are
 * available. */
2539
84.1M
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
2540
4.15M
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
2541
116M
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
2542
3.03M
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
2543
1.35G
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */
2544
2545
/* Mark flags specific for various mark types (so they can share bits).
 * Because the values below overlap (e.g. four different flags use 0x20),
 * each of them is meaningful only together with the mark type it is meant
 * for. */
2546
7.97M
#define MD_MARK_EMPH_OC                     0x20  /* Opener/closer mixed candidate. Helper for the "rule of 3". */
2547
3.28M
/* The three MD_MARK_EMPH_MOD3_x values are encoded in the two bits of
 * MD_MARK_EMPH_MOD3_MASK. Note that no encoding uses the value zero, and that
 * MD_MARK_EMPH_MOD3_2 is numerically equal to the mask itself. */
#define MD_MARK_EMPH_MOD3_0                 0x40
2548
5.23M
#define MD_MARK_EMPH_MOD3_1                 0x80
2549
2.85M
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
2550
6.90M
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
2551
1.01M
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
2552
66.7k
#define MD_MARK_AUTOLINK_MISSING_MAILTO     0x40
2553
279k
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2554
76.7M
#define MD_MARK_HASNESTEDBRACKETS           0x20  /* For '[' to rule out invalid link labels early */
2555
2556
static MD_MARKSTACK*
2557
md_emph_stack(MD_CTX* ctx, MD_CHAR ch, unsigned flags)
2558
4.34M
{
2559
4.34M
    MD_MARKSTACK* stack;
2560
2561
4.34M
    switch(ch) {
2562
2.74M
        case '*':   stack = &ASTERISK_OPENERS_oo_mod3_0; break;
2563
1.60M
        case '_':   stack = &UNDERSCORE_OPENERS_oo_mod3_0; break;
2564
0
        default:    MD_UNREACHABLE();
2565
4.34M
    }
2566
2567
4.34M
    if(flags & MD_MARK_EMPH_OC)
2568
2.06M
        stack += 3;
2569
2570
4.34M
    switch(flags & MD_MARK_EMPH_MOD3_MASK) {
2571
1.65M
        case MD_MARK_EMPH_MOD3_0:   stack += 0; break;
2572
1.89M
        case MD_MARK_EMPH_MOD3_1:   stack += 1; break;
2573
790k
        case MD_MARK_EMPH_MOD3_2:   stack += 2; break;
2574
0
        default:                    MD_UNREACHABLE();
2575
4.34M
    }
2576
2577
4.34M
    return stack;
2578
4.34M
}
2579
2580
static MD_MARKSTACK*
2581
md_opener_stack(MD_CTX* ctx, int mark_index)
2582
563k
{
2583
563k
    MD_MARK* mark = &ctx->marks[mark_index];
2584
2585
563k
    switch(mark->ch) {
2586
234k
        case _T('*'):
2587
348k
        case _T('_'):   return md_emph_stack(ctx, mark->ch, mark->flags);
2588
2589
215k
        case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2590
2591
0
        case _T('!'):
2592
0
        case _T('['):   return &BRACKET_OPENERS;
2593
2594
0
        default:        MD_UNREACHABLE();
2595
563k
    }
2596
563k
}
2597
2598
static MD_MARK*
2599
md_add_mark(MD_CTX* ctx)
2600
283M
{
2601
283M
    if(ctx->n_marks >= ctx->alloc_marks) {
2602
42.4k
        MD_MARK* new_marks;
2603
2604
42.4k
        ctx->alloc_marks = (ctx->alloc_marks > 0
2605
42.4k
                ? ctx->alloc_marks + ctx->alloc_marks / 2
2606
42.4k
                : 64);
2607
42.4k
        new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2608
42.4k
        if(new_marks == NULL) {
2609
0
            MD_LOG("realloc() failed.");
2610
0
            return NULL;
2611
0
        }
2612
2613
42.4k
        ctx->marks = new_marks;
2614
42.4k
    }
2615
2616
283M
    return &ctx->marks[ctx->n_marks++];
2617
283M
}
2618
2619
/* Append a new, uninitialized mark and store its address into the variable
 * 'mark'. If the allocation fails, set 'ret' to -1 and jump to the label
 * 'abort'. The function using the macro has to provide 'ctx', 'mark', 'ret'
 * and 'abort'. */
#define ADD_MARK_()                                                     \
        do {                                                            \
            mark = md_add_mark(ctx);                                    \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* Append a new mark (see ADD_MARK_() for the requirements and the error
 * handling) and initialize it: the given character, range and flags are
 * stored, and prev/next are set to -1.
 *
 * The character is cast to CHAR, the type of MD_MARK::ch, so the value is
 * not narrowed when CHAR is wider than plain char. */
#define ADD_MARK(ch_, beg_, end_, flags_)                               \
        do {                                                            \
            ADD_MARK_();                                                \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (CHAR)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2638
2639
2640
static inline void
2641
md_mark_stack_push(MD_CTX* ctx, MD_MARKSTACK* stack, int mark_index)
2642
77.3M
{
2643
77.3M
    ctx->marks[mark_index].next = stack->top;
2644
77.3M
    stack->top = mark_index;
2645
77.3M
}
2646
2647
static inline int
2648
md_mark_stack_pop(MD_CTX* ctx, MD_MARKSTACK* stack)
2649
6.76M
{
2650
6.76M
    int top = stack->top;
2651
6.76M
    if(top >= 0)
2652
6.76M
        stack->top = ctx->marks[top].next;
2653
6.76M
    return top;
2654
6.76M
}
2655
2656
/* Sometimes, we need to store a pointer into the mark. It is quite rare
2657
 * so we do not bother to make MD_MARK use union, and it can only happen
2658
 * for dummy marks. */
2659
static inline void
2660
md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2661
1.95M
{
2662
1.95M
    MD_MARK* mark = &ctx->marks[mark_index];
2663
1.95M
    MD_ASSERT(mark->ch == 'D');
2664
2665
    /* Check only members beg and end are misused for this. */
2666
1.95M
    MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2667
1.95M
    memcpy(mark, &ptr, sizeof(void*));
2668
1.95M
}
2669
2670
static inline void*
2671
md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2672
2.34M
{
2673
2.34M
    void* ptr;
2674
2.34M
    MD_MARK* mark = &ctx->marks[mark_index];
2675
2.34M
    MD_ASSERT(mark->ch == 'D');
2676
2.34M
    memcpy(&ptr, mark, sizeof(void*));
2677
2.34M
    return ptr;
2678
2.34M
}
2679
2680
static inline void
2681
md_resolve_range(MD_CTX* ctx, int opener_index, int closer_index)
2682
1.05M
{
2683
1.05M
    MD_MARK* opener = &ctx->marks[opener_index];
2684
1.05M
    MD_MARK* closer = &ctx->marks[closer_index];
2685
2686
    /* Interconnect opener and closer and mark both as resolved. */
2687
1.05M
    opener->next = closer_index;
2688
1.05M
    closer->prev = opener_index;
2689
2690
1.05M
    opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2691
1.05M
    closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2692
1.05M
}
2693
2694
2695
430k
#define MD_ROLLBACK_CROSSING    0
2696
549k
#define MD_ROLLBACK_ALL         1
2697
2698
/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2699
 * resolvings accordingly to these rules:
2700
 *
2701
 * (1) All stacks of openers are cut so that any pending potential openers
2702
 *     are discarded from future consideration.
2703
 *
2704
 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2705
 *     are thrown away and turned into dummy marks ('D').
2706
 *
2707
 * WARNING: Do not call for arbitrary range of opener and closer.
2708
 * This must form (potentially) valid range not crossing nesting boundaries
2709
 * of already resolved ranges.
2710
 */
2711
static void
2712
md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2713
490k
{
2714
490k
    int i;
2715
2716
8.33M
    for(i = 0; i < (int) SIZEOF_ARRAY(ctx->opener_stacks); i++) {
2717
7.84M
        MD_MARKSTACK* stack = &ctx->opener_stacks[i];
2718
10.5M
        while(stack->top >= opener_index)
2719
2.73M
            md_mark_stack_pop(ctx, stack);
2720
7.84M
    }
2721
2722
490k
    if(how == MD_ROLLBACK_ALL) {
2723
21.5M
        for(i = opener_index + 1; i < closer_index; i++) {
2724
21.5M
            ctx->marks[i].ch = 'D';
2725
21.5M
            ctx->marks[i].flags = 0;
2726
21.5M
        }
2727
59.4k
    }
2728
490k
}
2729
2730
static void
2731
md_build_mark_char_map(MD_CTX* ctx)
2732
16.6k
{
2733
16.6k
    memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2734
2735
16.6k
    ctx->mark_char_map['\\'] = 1;
2736
16.6k
    ctx->mark_char_map['*'] = 1;
2737
16.6k
    ctx->mark_char_map['_'] = 1;
2738
16.6k
    ctx->mark_char_map['`'] = 1;
2739
16.6k
    ctx->mark_char_map['&'] = 1;
2740
16.6k
    ctx->mark_char_map[';'] = 1;
2741
16.6k
    ctx->mark_char_map['<'] = 1;
2742
16.6k
    ctx->mark_char_map['>'] = 1;
2743
16.6k
    ctx->mark_char_map['['] = 1;
2744
16.6k
    ctx->mark_char_map['!'] = 1;
2745
16.6k
    ctx->mark_char_map[']'] = 1;
2746
16.6k
    ctx->mark_char_map['\0'] = 1;
2747
2748
16.6k
    if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2749
8.88k
        ctx->mark_char_map['~'] = 1;
2750
2751
16.6k
    if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2752
8.61k
        ctx->mark_char_map['$'] = 1;
2753
2754
16.6k
    if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2755
10.4k
        ctx->mark_char_map['@'] = 1;
2756
2757
16.6k
    if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2758
8.56k
        ctx->mark_char_map[':'] = 1;
2759
2760
16.6k
    if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2761
8.44k
        ctx->mark_char_map['.'] = 1;
2762
2763
16.6k
    if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2764
14.0k
        ctx->mark_char_map['|'] = 1;
2765
2766
16.6k
    if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2767
9.30k
        int i;
2768
2769
2.39M
        for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2770
2.38M
            if(ISWHITESPACE_(i))
2771
37.2k
                ctx->mark_char_map[i] = 1;
2772
2.38M
        }
2773
9.30k
    }
2774
16.6k
}
2775
2776
static int
2777
md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, OFF beg,
2778
                MD_MARK* opener, MD_MARK* closer,
2779
                OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2780
                int* p_reached_paragraph_end)
2781
503k
{
2782
503k
    OFF opener_beg = beg;
2783
503k
    OFF opener_end;
2784
503k
    OFF closer_beg;
2785
503k
    OFF closer_end;
2786
503k
    SZ mark_len;
2787
503k
    OFF line_end;
2788
503k
    int has_space_after_opener = FALSE;
2789
503k
    int has_eol_after_opener = FALSE;
2790
503k
    int has_space_before_closer = FALSE;
2791
503k
    int has_eol_before_closer = FALSE;
2792
503k
    int has_only_space = TRUE;
2793
503k
    MD_SIZE line_index = 0;
2794
2795
503k
    line_end = lines[0].end;
2796
503k
    opener_end = opener_beg;
2797
4.56M
    while(opener_end < line_end  &&  CH(opener_end) == _T('`'))
2798
4.05M
        opener_end++;
2799
503k
    has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2800
503k
    has_eol_after_opener = (opener_end == line_end);
2801
2802
    /* The caller needs to know end of the opening mark even if we fail. */
2803
503k
    opener->end = opener_end;
2804
2805
503k
    mark_len = opener_end - opener_beg;
2806
503k
    if(mark_len > CODESPAN_MARK_MAXLEN)
2807
4.00k
        return FALSE;
2808
2809
    /* Check whether we already know there is no closer of this length.
2810
     * If so, re-scan does no sense. This fixes issue #59. */
2811
498k
    if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end  ||
2812
498k
       (*p_reached_paragraph_end  &&  last_potential_closers[mark_len-1] < opener_end))
2813
14.5k
        return FALSE;
2814
2815
484k
    closer_beg = opener_end;
2816
484k
    closer_end = opener_end;
2817
2818
    /* Find closer mark. */
2819
2.91M
    while(TRUE) {
2820
85.2M
        while(closer_beg < line_end  &&  CH(closer_beg) != _T('`')) {
2821
82.3M
            if(CH(closer_beg) != _T(' '))
2822
81.5M
                has_only_space = FALSE;
2823
82.3M
            closer_beg++;
2824
82.3M
        }
2825
2.91M
        closer_end = closer_beg;
2826
6.72M
        while(closer_end < line_end  &&  CH(closer_end) == _T('`'))
2827
3.81M
            closer_end++;
2828
2829
2.91M
        if(closer_end - closer_beg == mark_len) {
2830
            /* Success. */
2831
377k
            has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2832
377k
            has_eol_before_closer = (closer_beg == lines[line_index].beg);
2833
377k
            break;
2834
377k
        }
2835
2836
2.53M
        if(closer_end - closer_beg > 0) {
2837
            /* We have found a back-tick which is not part of the closer. */
2838
191k
            has_only_space = FALSE;
2839
2840
            /* But if we eventually fail, remember it as a potential closer
2841
             * of its own length for future attempts. This mitigates needs for
2842
             * rescans. */
2843
191k
            if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2844
187k
                if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2845
178k
                    last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2846
187k
            }
2847
191k
        }
2848
2849
2.53M
        if(closer_end >= line_end) {
2850
2.43M
            line_index++;
2851
2.43M
            if(line_index >= n_lines) {
2852
                /* Reached end of the paragraph and still nothing. */
2853
107k
                *p_reached_paragraph_end = TRUE;
2854
107k
                return FALSE;
2855
107k
            }
2856
            /* Try on the next line. */
2857
2.32M
            line_end = lines[line_index].end;
2858
2.32M
            closer_beg = lines[line_index].beg;
2859
2.32M
        } else {
2860
99.2k
            closer_beg = closer_end;
2861
99.2k
        }
2862
2.53M
    }
2863
2864
    /* If there is a space or a new line both after and before the opener
2865
     * (and if the code span is not made of spaces only), consume one initial
2866
     * and one trailing space as part of the marks. */
2867
377k
    if(!has_only_space  &&
2868
377k
       (has_space_after_opener || has_eol_after_opener)  &&
2869
377k
       (has_space_before_closer || has_eol_before_closer))
2870
47.0k
    {
2871
47.0k
        if(has_space_after_opener)
2872
1.70k
            opener_end++;
2873
45.3k
        else
2874
45.3k
            opener_end = lines[1].beg;
2875
2876
47.0k
        if(has_space_before_closer)
2877
1.02k
            closer_beg--;
2878
46.0k
        else {
2879
            /* Go back to the end of prev line */
2880
46.0k
            closer_beg = lines[line_index-1].end;
2881
            /* But restore any trailing whitespace */
2882
86.8k
            while(closer_beg < ctx->size  &&  ISBLANK(closer_beg))
2883
40.8k
                closer_beg++;
2884
46.0k
        }
2885
47.0k
    }
2886
2887
377k
    opener->ch = _T('`');
2888
377k
    opener->beg = opener_beg;
2889
377k
    opener->end = opener_end;
2890
377k
    opener->flags = MD_MARK_POTENTIAL_OPENER;
2891
377k
    closer->ch = _T('`');
2892
377k
    closer->beg = closer_beg;
2893
377k
    closer->end = closer_end;
2894
377k
    closer->flags = MD_MARK_POTENTIAL_CLOSER;
2895
377k
    return TRUE;
2896
484k
}
2897
2898
static int
2899
md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2900
1.67M
{
2901
1.67M
    OFF off = beg+1;
2902
2903
1.67M
    MD_ASSERT(CH(beg) == _T('<'));
2904
2905
    /* Check for scheme. */
2906
1.67M
    if(off >= max_end  ||  !ISASCII(off))
2907
91.1k
        return FALSE;
2908
1.58M
    off++;
2909
3.76M
    while(1) {
2910
3.76M
        if(off >= max_end)
2911
112k
            return FALSE;
2912
3.65M
        if(off - beg > 32)
2913
8.18k
            return FALSE;
2914
3.64M
        if(CH(off) == _T(':')  &&  off - beg >= 3)
2915
24.9k
            break;
2916
3.62M
        if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2917
1.43M
            return FALSE;
2918
2.18M
        off++;
2919
2.18M
    }
2920
2921
    /* Check the path after the scheme. */
2922
4.85M
    while(off < max_end  &&  CH(off) != _T('>')) {
2923
4.84M
        if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2924
20.2k
            return FALSE;
2925
4.82M
        off++;
2926
4.82M
    }
2927
2928
4.72k
    if(off >= max_end)
2929
1.33k
        return FALSE;
2930
2931
3.39k
    MD_ASSERT(CH(off) == _T('>'));
2932
3.39k
    *p_end = off+1;
2933
3.39k
    return TRUE;
2934
3.39k
}
2935
2936
static int
2937
md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2938
1.67M
{
2939
1.67M
    OFF off = beg + 1;
2940
1.67M
    int label_len;
2941
2942
1.67M
    MD_ASSERT(CH(beg) == _T('<'));
2943
2944
    /* The code should correspond to this regexp:
2945
            /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2946
            @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2947
            (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2948
     */
2949
2950
    /* Username (before '@'). */
2951
6.85M
    while(off < max_end  &&  (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2952
5.18M
        off++;
2953
1.67M
    if(off <= beg+1)
2954
376k
        return FALSE;
2955
2956
    /* '@' */
2957
1.29M
    if(off >= max_end  ||  CH(off) != _T('@'))
2958
1.21M
        return FALSE;
2959
79.8k
    off++;
2960
2961
    /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2962
     * characters or '-', but '-' is not allowed as first or last char. */
2963
79.8k
    label_len = 0;
2964
457k
    while(off < max_end) {
2965
455k
        if(ISALNUM(off))
2966
343k
            label_len++;
2967
112k
        else if(CH(off) == _T('-')  &&  label_len > 0)
2968
33.0k
            label_len++;
2969
79.2k
        else if(CH(off) == _T('.')  &&  label_len > 0  &&  CH(off-1) != _T('-'))
2970
3.21k
            label_len = 0;
2971
76.0k
        else
2972
76.0k
            break;
2973
2974
379k
        if(label_len > 63)
2975
1.91k
            return FALSE;
2976
2977
377k
        off++;
2978
377k
    }
2979
2980
77.8k
    if(label_len <= 0  || off >= max_end  ||  CH(off) != _T('>') ||  CH(off-1) == _T('-'))
2981
55.5k
        return FALSE;
2982
2983
22.3k
    *p_end = off+1;
2984
22.3k
    return TRUE;
2985
77.8k
}
2986
2987
static int
2988
md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2989
1.67M
{
2990
1.67M
    if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2991
3.39k
        *p_missing_mailto = FALSE;
2992
3.39k
        return TRUE;
2993
3.39k
    }
2994
2995
1.67M
    if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2996
22.3k
        *p_missing_mailto = TRUE;
2997
22.3k
        return TRUE;
2998
22.3k
    }
2999
3000
1.64M
    return FALSE;
3001
1.67M
}
3002
3003
static int
3004
md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, int table_mode)
3005
2.38M
{
3006
2.38M
    MD_SIZE line_index;
3007
2.38M
    int ret = 0;
3008
2.38M
    MD_MARK* mark;
3009
2.38M
    OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
3010
2.38M
    int codespan_scanned_till_paragraph_end = FALSE;
3011
3012
9.73M
    for(line_index = 0; line_index < n_lines; line_index++) {
3013
7.35M
        const MD_LINE* line = &lines[line_index];
3014
7.35M
        OFF off = line->beg;
3015
3016
148M
        while(TRUE) {
3017
148M
            CHAR ch;
3018
3019
#ifdef MD4C_USE_UTF16
3020
    /* For UTF-16, mark_char_map[] covers only ASCII. */
3021
    #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
3022
                                (ctx->mark_char_map[(unsigned char) CH(off)]))
3023
#else
3024
    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
3025
848M
    #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
3026
148M
#endif
3027
3028
            /* Optimization: Use some loop unrolling. */
3029
161M
            while(off + 3 < line->end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
3030
161M
                                       &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
3031
12.6M
                off += 4;
3032
169M
            while(off < line->end  &&  !IS_MARK_CHAR(off+0))
3033
20.5M
                off++;
3034
3035
148M
            if(off >= line->end)
3036
7.35M
                break;
3037
3038
141M
            ch = CH(off);
3039
3040
            /* A backslash escape.
3041
             * It can go beyond line->end as it may involve escaped new
3042
             * line to form a hard break. */
3043
141M
            if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
3044
                /* Hard-break cannot be on the last line of the block. */
3045
4.38M
                if(!ISNEWLINE(off+1)  ||  line_index+1 < n_lines)
3046
3.24M
                    ADD_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3047
4.38M
                off += 2;
3048
4.38M
                continue;
3049
4.38M
            }
3050
3051
            /* A potential (string) emphasis start/end. */
3052
136M
            if(ch == _T('*')  ||  ch == _T('_')) {
3053
1.31M
                OFF tmp = off+1;
3054
1.31M
                int left_level;     /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3055
1.31M
                int right_level;    /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3056
3057
17.4M
                while(tmp < line->end  &&  CH(tmp) == ch)
3058
16.1M
                    tmp++;
3059
3060
1.31M
                if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
3061
165k
                    left_level = 0;
3062
1.15M
                else if(ISUNICODEPUNCTBEFORE(off))
3063
604k
                    left_level = 1;
3064
548k
                else
3065
548k
                    left_level = 2;
3066
3067
1.31M
                if(tmp == line->end  ||  ISUNICODEWHITESPACE(tmp))
3068
140k
                    right_level = 0;
3069
1.17M
                else if(ISUNICODEPUNCT(tmp))
3070
541k
                    right_level = 1;
3071
636k
                else
3072
636k
                    right_level = 2;
3073
3074
                /* Intra-word underscore doesn't have special meaning. */
3075
1.31M
                if(ch == _T('_')  &&  left_level == 2  &&  right_level == 2) {
3076
179k
                    left_level = 0;
3077
179k
                    right_level = 0;
3078
179k
                }
3079
3080
1.31M
                if(left_level != 0  ||  right_level != 0) {
3081
1.08M
                    unsigned flags = 0;
3082
3083
1.08M
                    if(left_level > 0  &&  left_level >= right_level)
3084
747k
                        flags |= MD_MARK_POTENTIAL_CLOSER;
3085
1.08M
                    if(right_level > 0  &&  right_level >= left_level)
3086
855k
                        flags |= MD_MARK_POTENTIAL_OPENER;
3087
1.08M
                    if(flags == (MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER))
3088
522k
                        flags |= MD_MARK_EMPH_OC;
3089
3090
                    /* For "the rule of three" we need to remember the original
3091
                     * size of the mark (modulo three), before we potentially
3092
                     * split the mark when being later resolved partially by some
3093
                     * shorter closer. */
3094
1.08M
                    switch((tmp - off) % 3) {
3095
120k
                        case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3096
781k
                        case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3097
179k
                        case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3098
1.08M
                    }
3099
3100
1.08M
                    ADD_MARK(ch, off, tmp, flags);
3101
3102
                    /* During resolving, multiple asterisks may have to be
3103
                     * split into independent span start/ends. Consider e.g.
3104
                     * "**foo* bar*". Therefore we push also some empty dummy
3105
                     * marks to have enough space for that. */
3106
1.08M
                    off++;
3107
17.0M
                    while(off < tmp) {
3108
15.9M
                        ADD_MARK('D', off, off, 0);
3109
15.9M
                        off++;
3110
15.9M
                    }
3111
1.08M
                    continue;
3112
1.08M
                }
3113
3114
238k
                off = tmp;
3115
238k
                continue;
3116
1.31M
            }
3117
3118
            /* A potential code span start/end. */
3119
135M
            if(ch == _T('`')) {
3120
503k
                MD_MARK opener;
3121
503k
                MD_MARK closer;
3122
503k
                int is_code_span;
3123
3124
503k
                is_code_span = md_is_code_span(ctx, line, n_lines - line_index, off,
3125
503k
                            &opener, &closer, codespan_last_potential_closers,
3126
503k
                            &codespan_scanned_till_paragraph_end);
3127
503k
                if(is_code_span) {
3128
377k
                    ADD_MARK(opener.ch, opener.beg, opener.end, opener.flags);
3129
377k
                    ADD_MARK(closer.ch, closer.beg, closer.end, closer.flags);
3130
377k
                    md_resolve_range(ctx, ctx->n_marks-2, ctx->n_marks-1);
3131
377k
                    off = closer.end;
3132
3133
                    /* Advance the current line accordingly. */
3134
377k
                    if(off > line->end)
3135
260k
                        line = md_lookup_line(off, lines, n_lines, &line_index);
3136
377k
                    continue;
3137
377k
                }
3138
3139
125k
                off = opener.end;
3140
125k
                continue;
3141
503k
            }
3142
3143
            /* A potential entity start. */
3144
135M
            if(ch == _T('&')) {
3145
2.94M
                ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3146
2.94M
                off++;
3147
2.94M
                continue;
3148
2.94M
            }
3149
3150
            /* A potential entity end. */
3151
132M
            if(ch == _T(';')) {
3152
                /* We surely cannot be entity unless the previous mark is '&'. */
3153
24.9M
                if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].ch == _T('&'))
3154
2.37M
                    ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3155
3156
24.9M
                off++;
3157
24.9M
                continue;
3158
24.9M
            }
3159
3160
            /* A potential autolink or raw HTML start/end. */
3161
107M
            if(ch == _T('<')) {
3162
2.52M
                int is_autolink;
3163
2.52M
                OFF autolink_end;
3164
2.52M
                int missing_mailto;
3165
3166
2.52M
                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3167
2.05M
                    int is_html;
3168
2.05M
                    OFF html_end;
3169
3170
                    /* Given the nature of the raw HTML, we have to recognize
3171
                     * it here. Doing so later in md_analyze_lt_gt() could
3172
                     * open can of worms of quadratic complexity. */
3173
2.05M
                    is_html = md_is_html_any(ctx, line, n_lines - line_index, off,
3174
2.05M
                                    lines[n_lines-1].end, &html_end);
3175
2.05M
                    if(is_html) {
3176
848k
                        ADD_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3177
848k
                        ADD_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3178
848k
                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3179
848k
                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3180
848k
                        off = html_end;
3181
3182
                        /* Advance the current line accordingly. */
3183
848k
                        if(off > line->end)
3184
56.8k
                            line = md_lookup_line(off, lines, n_lines, &line_index);
3185
848k
                        continue;
3186
848k
                    }
3187
2.05M
                }
3188
3189
1.67M
                is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
3190
1.67M
                                    &autolink_end, &missing_mailto);
3191
1.67M
                if(is_autolink) {
3192
25.7k
                    unsigned flags = MD_MARK_RESOLVED | MD_MARK_AUTOLINK;
3193
25.7k
                    if(missing_mailto)
3194
22.3k
                        flags |= MD_MARK_AUTOLINK_MISSING_MAILTO;
3195
3196
25.7k
                    ADD_MARK(_T('<'), off, off+1, MD_MARK_OPENER | flags);
3197
25.7k
                    ADD_MARK(_T('>'), autolink_end-1, autolink_end, MD_MARK_CLOSER | flags);
3198
25.7k
                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3199
25.7k
                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3200
25.7k
                    off = autolink_end;
3201
25.7k
                    continue;
3202
25.7k
                }
3203
3204
1.64M
                off++;
3205
1.64M
                continue;
3206
1.67M
            }
3207
3208
            /* A potential link or its part. */
3209
104M
            if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line->end && CH(off+1) == _T('['))) {
3210
76.4M
                OFF tmp = (ch == _T('[') ? off+1 : off+2);
3211
76.4M
                ADD_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3212
76.4M
                off = tmp;
3213
                /* Two dummies to make enough place for data we need if it is
3214
                 * a link. */
3215
76.4M
                ADD_MARK('D', off, off, 0);
3216
76.4M
                ADD_MARK('D', off, off, 0);
3217
76.4M
                continue;
3218
76.4M
            }
3219
28.1M
            if(ch == _T(']')) {
3220
3.84M
                ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3221
3.84M
                off++;
3222
3.84M
                continue;
3223
3.84M
            }
3224
3225
            /* A potential permissive e-mail autolink. */
3226
24.3M
            if(ch == _T('@')) {
3227
409k
                if(line->beg + 1 <= off  &&  ISALNUM(off-1)  &&
3228
409k
                    off + 3 < line->end  &&  ISALNUM(off+1))
3229
240k
                {
3230
240k
                    ADD_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3231
                    /* Push a dummy as a reserve for a closer. */
3232
240k
                    ADD_MARK('D', line->beg, line->end, 0);
3233
240k
                }
3234
3235
409k
                off++;
3236
409k
                continue;
3237
409k
            }
3238
3239
            /* A potential permissive URL autolink. */
3240
23.9M
            if(ch == _T(':')) {
3241
573k
                static struct {
3242
573k
                    const CHAR* scheme;
3243
573k
                    SZ scheme_size;
3244
573k
                    const CHAR* suffix;
3245
573k
                    SZ suffix_size;
3246
573k
                } scheme_map[] = {
3247
                    /* In the order from the most frequently used, arguably. */
3248
573k
                    { _T("http"), 4,    _T("//"), 2 },
3249
573k
                    { _T("https"), 5,   _T("//"), 2 },
3250
573k
                    { _T("ftp"), 3,     _T("//"), 2 }
3251
573k
                };
3252
573k
                int scheme_index;
3253
3254
2.16M
                for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3255
1.66M
                    const CHAR* scheme = scheme_map[scheme_index].scheme;
3256
1.66M
                    const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3257
1.66M
                    const CHAR* suffix = scheme_map[scheme_index].suffix;
3258
1.66M
                    const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3259
3260
1.66M
                    if(line->beg + scheme_size <= off  &&  md_ascii_eq(STR(off-scheme_size), scheme, scheme_size)  &&
3261
1.66M
                        off + 1 + suffix_size < line->end  &&  md_ascii_eq(STR(off+1), suffix, suffix_size))
3262
67.6k
                    {
3263
67.6k
                        ADD_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3264
                        /* Push a dummy as a reserve for a closer. */
3265
67.6k
                        ADD_MARK('D', line->beg, line->end, 0);
3266
67.6k
                        off += 1 + suffix_size;
3267
67.6k
                        break;
3268
67.6k
                    }
3269
1.66M
                }
3270
3271
573k
                off++;
3272
573k
                continue;
3273
573k
            }
3274
3275
            /* A potential permissive WWW autolink. */
3276
23.3M
            if(ch == _T('.')) {
3277
1.73M
                if(line->beg + 3 <= off  &&  md_ascii_eq(STR(off-3), _T("www"), 3)  &&
3278
1.73M
                   (off-3 == line->beg || ISUNICODEWHITESPACEBEFORE(off-3) || ISUNICODEPUNCTBEFORE(off-3)))
3279
515k
                {
3280
515k
                    ADD_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3281
                    /* Push a dummy as a reserve for a closer. */
3282
515k
                    ADD_MARK('D', line->beg, line->end, 0);
3283
515k
                    off++;
3284
515k
                    continue;
3285
515k
                }
3286
3287
1.22M
                off++;
3288
1.22M
                continue;
3289
1.73M
            }
3290
3291
            /* A potential table cell boundary or wiki link label delimiter. */
3292
21.6M
            if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3293
970k
                ADD_MARK(ch, off, off+1, 0);
3294
970k
                off++;
3295
970k
                continue;
3296
970k
            }
3297
3298
            /* A potential strikethrough/equation start/end. */
3299
20.6M
            if(ch == _T('$') || ch == _T('~')) {
3300
451k
                OFF tmp = off+1;
3301
3302
825k
                while(tmp < line->end && CH(tmp) == ch)
3303
373k
                    tmp++;
3304
3305
451k
                if(tmp - off <= 2) {
3306
425k
                    unsigned flags = MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER;
3307
3308
425k
                    if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off)  &&  !ISUNICODEPUNCTBEFORE(off))
3309
124k
                        flags &= ~MD_MARK_POTENTIAL_OPENER;
3310
425k
                    if(tmp < line->end  &&  !ISUNICODEWHITESPACE(tmp)  &&  !ISUNICODEPUNCT(tmp))
3311
116k
                        flags &= ~MD_MARK_POTENTIAL_CLOSER;
3312
425k
                    if(flags != 0)
3313
358k
                        ADD_MARK(ch, off, tmp, flags);
3314
425k
                }
3315
3316
451k
                off = tmp;
3317
451k
                continue;
3318
451k
            }
3319
3320
            /* Turn non-trivial whitespace into single space. */
3321
20.1M
            if(ISWHITESPACE_(ch)) {
3322
1.11M
                OFF tmp = off+1;
3323
3324
8.19M
                while(tmp < line->end  &&  ISWHITESPACE(tmp))
3325
7.08M
                    tmp++;
3326
3327
1.11M
                if(tmp - off > 1  ||  ch != _T(' '))
3328
655k
                    ADD_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3329
3330
1.11M
                off = tmp;
3331
1.11M
                continue;
3332
1.11M
            }
3333
3334
            /* NULL character. */
3335
19.0M
            if(ch == _T('\0')) {
3336
15.9M
                ADD_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3337
15.9M
                off++;
3338
15.9M
                continue;
3339
15.9M
            }
3340
3341
3.15M
            off++;
3342
3.15M
        }
3343
7.35M
    }
3344
3345
    /* Add a dummy mark at the end of the mark vector to simplify
3346
     * process_inlines(). */
3347
2.38M
    ADD_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3348
3349
2.38M
abort:
3350
2.38M
    return ret;
3351
2.38M
}
3352
3353
static void
3354
md_analyze_bracket(MD_CTX* ctx, int mark_index)
3355
80.3M
{
3356
    /* We cannot really resolve links here as for that we would need
3357
     * more context. E.g. a following pair of brackets (reference link),
3358
     * or enclosing pair of brackets (if the inner is the link, the outer
3359
     * one cannot be.)
3360
     *
3361
     * Therefore we here only construct a list of '[' ']' pairs ordered by
3362
     * position of the closer. This allows us to analyze what is or is not
3363
     * link in the right order, from inside to outside in case of nested
3364
     * brackets.
3365
     *
3366
     * The resolving itself is deferred to md_resolve_links().
3367
     */
3368
3369
80.3M
    MD_MARK* mark = &ctx->marks[mark_index];
3370
3371
80.3M
    if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3372
76.4M
        if(BRACKET_OPENERS.top >= 0)
3373
74.9M
            ctx->marks[BRACKET_OPENERS.top].flags |= MD_MARK_HASNESTEDBRACKETS;
3374
3375
76.4M
        md_mark_stack_push(ctx, &BRACKET_OPENERS, mark_index);
3376
76.4M
        return;
3377
76.4M
    }
3378
3379
3.84M
    if(BRACKET_OPENERS.top >= 0) {
3380
3.58M
        int opener_index = md_mark_stack_pop(ctx, &BRACKET_OPENERS);
3381
3.58M
        MD_MARK* opener = &ctx->marks[opener_index];
3382
3383
        /* Interconnect the opener and closer. */
3384
3.58M
        opener->next = mark_index;
3385
3.58M
        mark->prev = opener_index;
3386
3387
        /* Add the pair into a list of potential links for md_resolve_links().
3388
         * Note we misuse opener->prev for this as opener->next points to its
3389
         * closer. */
3390
3.58M
        if(ctx->unresolved_link_tail >= 0)
3391
3.31M
            ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3392
265k
        else
3393
265k
            ctx->unresolved_link_head = opener_index;
3394
3.58M
        ctx->unresolved_link_tail = opener_index;
3395
3.58M
        opener->prev = -1;
3396
3.58M
    }
3397
3.84M
}
3398
3399
/* Forward declaration. */
3400
static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
3401
                                     int mark_beg, int mark_end);
3402
3403
static int
3404
md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
3405
2.38M
{
3406
2.38M
    int opener_index = ctx->unresolved_link_head;
3407
2.38M
    OFF last_link_beg = 0;
3408
2.38M
    OFF last_link_end = 0;
3409
2.38M
    OFF last_img_beg = 0;
3410
2.38M
    OFF last_img_end = 0;
3411
3412
5.93M
    while(opener_index >= 0) {
3413
3.55M
        MD_MARK* opener = &ctx->marks[opener_index];
3414
3.55M
        int closer_index = opener->next;
3415
3.55M
        MD_MARK* closer = &ctx->marks[closer_index];
3416
3.55M
        int next_index = opener->prev;
3417
3.55M
        MD_MARK* next_opener;
3418
3.55M
        MD_MARK* next_closer;
3419
3.55M
        MD_LINK_ATTR attr;
3420
3.55M
        int is_link = FALSE;
3421
3422
3.55M
        if(next_index >= 0) {
3423
3.28M
            next_opener = &ctx->marks[next_index];
3424
3.28M
            next_closer = &ctx->marks[next_opener->next];
3425
3.28M
        } else {
3426
264k
            next_opener = NULL;
3427
264k
            next_closer = NULL;
3428
264k
        }
3429
3430
        /* If nested ("[ [ ] ]"), we need to make sure that:
3431
         *   - The outer does not end inside of (...) belonging to the inner.
3432
         *   - The outer cannot be link if the inner is link (i.e. not image).
3433
         *
3434
         * (Note we here analyze from inner to outer as the marks are ordered
3435
         * by closer->beg.)
3436
         */
3437
3.55M
        if((opener->beg < last_link_beg  &&  closer->end < last_link_end)  ||
3438
3.55M
           (opener->beg < last_img_beg  &&  closer->end < last_img_end)  ||
3439
3.55M
           (opener->beg < last_link_end  &&  opener->ch == '['))
3440
125k
        {
3441
125k
            opener_index = next_index;
3442
125k
            continue;
3443
125k
        }
3444
3445
        /* Recognize and resolve wiki links.
3446
         * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3447
         */
3448
3.42M
        if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3449
3.42M
            (opener->end - opener->beg == 1) &&         /* not image */
3450
3.42M
            next_opener != NULL &&                      /* double '[' opener */
3451
3.42M
            next_opener->ch == '[' &&
3452
3.42M
            (next_opener->beg == opener->beg - 1) &&
3453
3.42M
            (next_opener->end - next_opener->beg == 1) &&
3454
3.42M
            next_closer != NULL &&                      /* double ']' closer */
3455
3.42M
            next_closer->ch == ']' &&
3456
3.42M
            (next_closer->beg == closer->beg + 1) &&
3457
3.42M
            (next_closer->end - next_closer->beg == 1))
3458
51.4k
        {
3459
51.4k
            MD_MARK* delim = NULL;
3460
51.4k
            int delim_index;
3461
51.4k
            OFF dest_beg, dest_end;
3462
3463
51.4k
            is_link = TRUE;
3464
3465
            /* We don't allow destination to be longer than 100 characters.
3466
             * Lets scan to see whether there is '|'. (If not then the whole
3467
             * wiki-link has to be below the 100 characters.) */
3468
51.4k
            delim_index = opener_index + 1;
3469
1.92M
            while(delim_index < closer_index) {
3470
1.90M
                MD_MARK* m = &ctx->marks[delim_index];
3471
1.90M
                if(m->ch == '|') {
3472
21.4k
                    delim = m;
3473
21.4k
                    break;
3474
21.4k
                }
3475
1.88M
                if(m->ch != 'D') {
3476
251k
                    if(m->beg - opener->end > 100)
3477
4.02k
                        break;
3478
247k
                    if(m->ch != 'D'  &&  (m->flags & MD_MARK_OPENER))
3479
5.79k
                        delim_index = m->next;
3480
247k
                }
3481
1.87M
                delim_index++;
3482
1.87M
            }
3483
3484
51.4k
            dest_beg = opener->end;
3485
51.4k
            dest_end = (delim != NULL) ? delim->beg : closer->beg;
3486
51.4k
            if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3487
21.4k
                is_link = FALSE;
3488
3489
            /* There may not be any new line in the destination. */
3490
51.4k
            if(is_link) {
3491
30.0k
                OFF off;
3492
192k
                for(off = dest_beg; off < dest_end; off++) {
3493
165k
                    if(ISNEWLINE(off)) {
3494
3.70k
                        is_link = FALSE;
3495
3.70k
                        break;
3496
3.70k
                    }
3497
165k
                }
3498
30.0k
            }
3499
3500
51.4k
            if(is_link) {
3501
26.3k
                if(delim != NULL) {
3502
19.2k
                    if(delim->end < closer->beg) {
3503
17.8k
                        md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
3504
17.8k
                        md_rollback(ctx, delim_index, closer_index, MD_ROLLBACK_CROSSING);
3505
17.8k
                        delim->flags |= MD_MARK_RESOLVED;
3506
17.8k
                        opener->end = delim->beg;
3507
17.8k
                    } else {
3508
                        /* The pipe is just before the closer: [[foo|]] */
3509
1.36k
                        md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3510
1.36k
                        closer->beg = delim->beg;
3511
1.36k
                        delim = NULL;
3512
1.36k
                    }
3513
19.2k
                }
3514
3515
26.3k
                opener->beg = next_opener->beg;
3516
26.3k
                opener->next = closer_index;
3517
26.3k
                opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3518
3519
26.3k
                closer->end = next_closer->end;
3520
26.3k
                closer->prev = opener_index;
3521
26.3k
                closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3522
3523
26.3k
                last_link_beg = opener->beg;
3524
26.3k
                last_link_end = closer->end;
3525
3526
26.3k
                if(delim != NULL)
3527
17.8k
                    md_analyze_link_contents(ctx, lines, n_lines, delim_index+1, closer_index);
3528
3529
26.3k
                opener_index = next_opener->prev;
3530
26.3k
                continue;
3531
26.3k
            }
3532
51.4k
        }
3533
3534
3.39M
        if(next_opener != NULL  &&  next_opener->beg == closer->end) {
3535
238k
            if(next_closer->beg > closer->end + 1) {
3536
                /* Might be full reference link. */
3537
44.2k
                if(!(next_opener->flags & MD_MARK_HASNESTEDBRACKETS))
3538
44.2k
                    is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
3539
194k
            } else {
3540
                /* Might be shortcut reference link. */
3541
194k
                if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
3542
181k
                    is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3543
194k
            }
3544
3545
238k
            if(is_link < 0)
3546
0
                return -1;
3547
3548
238k
            if(is_link) {
3549
                /* Eat the 2nd "[...]". */
3550
2.85k
                closer->end = next_closer->end;
3551
3552
                /* Do not analyze the label as a standalone link in the next
3553
                 * iteration. */
3554
2.85k
                next_index = ctx->marks[next_index].prev;
3555
2.85k
            }
3556
3.16M
        } else {
3557
3.16M
            if(closer->end < ctx->size  &&  CH(closer->end) == _T('(')) {
3558
                /* Might be inline link. */
3559
1.91M
                OFF inline_link_end = UINT_MAX;
3560
3561
1.91M
                is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
3562
1.91M
                if(is_link < 0)
3563
0
                    return -1;
3564
3565
                /* Check the closing ')' is not inside an already resolved range
3566
                 * (i.e. a range with a higher priority), e.g. a code span. */
3567
1.91M
                if(is_link) {
3568
1.60M
                    int i = closer_index + 1;
3569
3570
9.68M
                    while(i < ctx->n_marks) {
3571
9.68M
                        MD_MARK* mark = &ctx->marks[i];
3572
3573
9.68M
                        if(mark->beg >= inline_link_end)
3574
1.59M
                            break;
3575
8.08M
                        if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3576
100k
                            if(ctx->marks[mark->next].beg >= inline_link_end) {
3577
                                /* Cancel the link status. */
3578
3.36k
                                if(attr.title_needs_free)
3579
2.28k
                                    free(attr.title);
3580
3.36k
                                is_link = FALSE;
3581
3.36k
                                break;
3582
3.36k
                            }
3583
3584
97.5k
                            i = mark->next + 1;
3585
7.98M
                        } else {
3586
7.98M
                            i++;
3587
7.98M
                        }
3588
8.08M
                    }
3589
1.60M
                }
3590
3591
1.91M
                if(is_link) {
3592
                    /* Eat the "(...)" */
3593
1.59M
                    closer->end = inline_link_end;
3594
1.59M
                }
3595
1.91M
            }
3596
3597
3.16M
            if(!is_link) {
3598
                /* Might be collapsed reference link. */
3599
1.56M
                if(!(opener->flags & MD_MARK_HASNESTEDBRACKETS))
3600
1.19M
                    is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3601
1.56M
                if(is_link < 0)
3602
0
                    return -1;
3603
1.56M
            }
3604
3.16M
        }
3605
3606
3.39M
        if(is_link) {
3607
            /* Resolve the brackets as a link. */
3608
1.95M
            opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3609
1.95M
            closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3610
3611
            /* If it is a link, we store the destination and title in the two
3612
             * dummy marks after the opener. */
3613
1.95M
            MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3614
1.95M
            ctx->marks[opener_index+1].beg = attr.dest_beg;
3615
1.95M
            ctx->marks[opener_index+1].end = attr.dest_end;
3616
3617
1.95M
            MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3618
1.95M
            md_mark_store_ptr(ctx, opener_index+2, attr.title);
3619
            /* The title might or might not have been allocated for us. */
3620
1.95M
            if(attr.title_needs_free)
3621
25.3k
                md_mark_stack_push(ctx, &ctx->ptr_stack, opener_index+2);
3622
1.95M
            ctx->marks[opener_index+2].prev = attr.title_size;
3623
3624
1.95M
            if(opener->ch == '[') {
3625
439k
                last_link_beg = opener->beg;
3626
439k
                last_link_end = closer->end;
3627
1.51M
            } else {
3628
1.51M
                last_img_beg = opener->beg;
3629
1.51M
                last_img_end = closer->end;
3630
1.51M
            }
3631
3632
1.95M
            md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3633
3634
            /* If the link text is formed by nothing but permissive autolink,
3635
             * suppress the autolink.
3636
             * See https://github.com/mity/md4c/issues/152 for more info. */
3637
1.95M
            if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) {
3638
1.66M
                MD_MARK* first_nested;
3639
1.66M
                MD_MARK* last_nested;
3640
3641
1.66M
                first_nested = opener + 1;
3642
4.99M
                while(first_nested->ch == _T('D')  &&  first_nested < closer)
3643
3.33M
                    first_nested++;
3644
3645
1.66M
                last_nested = closer - 1;
3646
1.66M
                while(first_nested->ch == _T('D')  &&  last_nested > opener)
3647
0
                    last_nested--;
3648
3649
1.66M
                if((first_nested->flags & MD_MARK_RESOLVED)  &&
3650
1.66M
                   first_nested->beg == opener->end  &&
3651
1.66M
                   ISANYOF_(first_nested->ch, _T("@:."))  &&
3652
1.66M
                   first_nested->next == (last_nested - ctx->marks)  &&
3653
1.66M
                   last_nested->end == closer->beg)
3654
932
                {
3655
932
                    first_nested->ch = _T('D');
3656
932
                    first_nested->flags &= ~MD_MARK_RESOLVED;
3657
932
                    last_nested->ch = _T('D');
3658
932
                    last_nested->flags &= ~MD_MARK_RESOLVED;
3659
932
                }
3660
1.66M
            }
3661
1.95M
        }
3662
3663
3.39M
        opener_index = next_index;
3664
3.39M
    }
3665
3666
2.38M
    return 0;
3667
2.38M
}
3668
3669
/* Analyze whether the mark '&' starts a HTML entity.
3670
 * If so, update its flags as well as flags of corresponding closer ';'. */
3671
static void
3672
md_analyze_entity(MD_CTX* ctx, int mark_index)
3673
2.73M
{
3674
2.73M
    MD_MARK* opener = &ctx->marks[mark_index];
3675
2.73M
    MD_MARK* closer;
3676
2.73M
    OFF off;
3677
3678
    /* Cannot be entity if there is no closer as the next mark.
3679
     * (Any other mark between would mean strange character which cannot be
3680
     * part of the entity.
3681
     *
3682
     * So we can do all the work on '&' and do not call this later for the
3683
     * closing mark ';'.
3684
     */
3685
2.73M
    if(mark_index + 1 >= ctx->n_marks)
3686
0
        return;
3687
2.73M
    closer = &ctx->marks[mark_index+1];
3688
2.73M
    if(closer->ch != ';')
3689
536k
        return;
3690
3691
2.20M
    if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3692
142k
        MD_ASSERT(off == closer->end);
3693
3694
142k
        md_resolve_range(ctx, mark_index, mark_index+1);
3695
142k
        opener->end = closer->end;
3696
142k
    }
3697
2.20M
}
3698
3699
static void
3700
md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3701
136k
{
3702
136k
    MD_MARK* mark = &ctx->marks[mark_index];
3703
136k
    mark->flags |= MD_MARK_RESOLVED;
3704
136k
    mark->next = -1;
3705
3706
136k
    if(ctx->table_cell_boundaries_head < 0)
3707
17.6k
        ctx->table_cell_boundaries_head = mark_index;
3708
119k
    else
3709
119k
        ctx->marks[ctx->table_cell_boundaries_tail].next = mark_index;
3710
136k
    ctx->table_cell_boundaries_tail = mark_index;
3711
136k
    ctx->n_table_cell_boundaries++;
3712
136k
}
3713
3714
/* Split a longer mark into two. The new mark takes the given count of
3715
 * characters. May only be called if an adequate number of dummy 'D' marks
3716
 * follows.
3717
 */
3718
static int
3719
md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3720
143k
{
3721
143k
    MD_MARK* mark = &ctx->marks[mark_index];
3722
143k
    int new_mark_index = mark_index + (mark->end - mark->beg - n);
3723
143k
    MD_MARK* dummy = &ctx->marks[new_mark_index];
3724
3725
143k
    MD_ASSERT(mark->end - mark->beg > n);
3726
143k
    MD_ASSERT(dummy->ch == 'D');
3727
3728
143k
    memcpy(dummy, mark, sizeof(MD_MARK));
3729
143k
    mark->end -= n;
3730
143k
    dummy->beg = mark->end;
3731
3732
143k
    return new_mark_index;
3733
143k
}
3734
3735
static void
3736
md_analyze_emph(MD_CTX* ctx, int mark_index)
3737
1.06M
{
3738
1.06M
    MD_MARK* mark = &ctx->marks[mark_index];
3739
3740
    /* If we can be a closer, try to resolve with the preceding opener. */
3741
1.06M
    if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3742
754k
        MD_MARK* opener = NULL;
3743
754k
        int opener_index = 0;
3744
754k
        MD_MARKSTACK* opener_stacks[6];
3745
754k
        int i, n_opener_stacks;
3746
754k
        unsigned flags = mark->flags;
3747
3748
754k
        n_opener_stacks = 0;
3749
3750
        /* Apply the rule of 3 */
3751
754k
        opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_OC);
3752
754k
        if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3753
619k
            opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_1 | MD_MARK_EMPH_OC);
3754
754k
        if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3755
222k
            opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_2 | MD_MARK_EMPH_OC);
3756
754k
        opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_0);
3757
754k
        if(!(flags & MD_MARK_EMPH_OC)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3758
658k
            opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_1);
3759
754k
        if(!(flags & MD_MARK_EMPH_OC)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3760
384k
            opener_stacks[n_opener_stacks++] = md_emph_stack(ctx, mark->ch, MD_MARK_EMPH_MOD3_2);
3761
3762
        /* Opener is the most recent mark from the allowed stacks. */
3763
4.14M
        for(i = 0; i < n_opener_stacks; i++) {
3764
3.39M
            if(opener_stacks[i]->top >= 0) {
3765
486k
                int m_index = opener_stacks[i]->top;
3766
486k
                MD_MARK* m = &ctx->marks[m_index];
3767
3768
486k
                if(opener == NULL  ||  m->end > opener->end) {
3769
439k
                    opener_index = m_index;
3770
439k
                    opener = m;
3771
439k
                }
3772
486k
            }
3773
3.39M
        }
3774
3775
        /* Resolve, if we have found matching opener. */
3776
754k
        if(opener != NULL) {
3777
348k
            SZ opener_size = opener->end - opener->beg;
3778
348k
            SZ closer_size = mark->end - mark->beg;
3779
348k
            MD_MARKSTACK* stack = md_opener_stack(ctx, opener_index);
3780
3781
348k
            if(opener_size > closer_size) {
3782
80.7k
                opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3783
80.7k
                md_mark_stack_push(ctx, stack, opener_index);
3784
267k
            } else if(opener_size < closer_size) {
3785
62.3k
                md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3786
62.3k
            }
3787
3788
            /* Above we were only peeking. */
3789
348k
            md_mark_stack_pop(ctx, stack);
3790
3791
348k
            md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3792
348k
            md_resolve_range(ctx, opener_index, mark_index);
3793
348k
            return;
3794
348k
        }
3795
754k
    }
3796
3797
    /* If we could not resolve as closer, we may be yet be an opener. */
3798
719k
    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3799
602k
        md_mark_stack_push(ctx, md_emph_stack(ctx, mark->ch, mark->flags), mark_index);
3800
719k
}
3801
3802
static void
3803
md_analyze_tilde(MD_CTX* ctx, int mark_index)
3804
215k
{
3805
215k
    MD_MARK* mark = &ctx->marks[mark_index];
3806
215k
    MD_MARKSTACK* stack = md_opener_stack(ctx, mark_index);
3807
3808
    /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3809
     * only tildes sequences of length 1 and 2, and the length of the opener
3810
     * and closer has to match. */
3811
3812
215k
    if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  stack->top >= 0) {
3813
64.5k
        int opener_index = stack->top;
3814
3815
64.5k
        md_mark_stack_pop(ctx, stack);
3816
64.5k
        md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3817
64.5k
        md_resolve_range(ctx, opener_index, mark_index);
3818
64.5k
        return;
3819
64.5k
    }
3820
3821
150k
    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3822
123k
        md_mark_stack_push(ctx, stack, mark_index);
3823
150k
}
3824
3825
static void
3826
md_analyze_dollar(MD_CTX* ctx, int mark_index)
3827
121k
{
3828
121k
    MD_MARK* mark = &ctx->marks[mark_index];
3829
3830
121k
    if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  DOLLAR_OPENERS.top >= 0) {
3831
        /* If the potential closer has a non-matching number of $, discard */
3832
43.9k
        MD_MARK* opener = &ctx->marks[DOLLAR_OPENERS.top];
3833
43.9k
        int opener_index = DOLLAR_OPENERS.top;
3834
43.9k
        MD_MARK* closer = mark;
3835
43.9k
        int closer_index = mark_index;
3836
3837
43.9k
        if(opener->end - opener->beg == closer->end - closer->beg) {
3838
            /* We are the matching closer */
3839
40.1k
            md_mark_stack_pop(ctx, &DOLLAR_OPENERS);
3840
40.1k
            md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3841
40.1k
            md_resolve_range(ctx, opener_index, closer_index);
3842
3843
            /* Discard all pending openers: Latex math span do not allow
3844
             * nesting. */
3845
40.1k
            DOLLAR_OPENERS.top = -1;
3846
40.1k
            return;
3847
40.1k
        }
3848
43.9k
    }
3849
3850
81.2k
    if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3851
69.6k
        md_mark_stack_push(ctx, &DOLLAR_OPENERS, mark_index);
3852
81.2k
}
3853
3854
static MD_MARK*
3855
md_scan_left_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor)
3856
76.7k
{
3857
76.7k
    MD_MARK* mark;
3858
3859
319k
    for(mark = mark_from; mark >= ctx->marks; mark--) {
3860
315k
        if(mark->ch == 'D'  ||  mark->beg > off)
3861
212k
            continue;
3862
102k
        if(mark->beg <= off  &&  off < mark->end  &&  (mark->flags & MD_MARK_RESOLVED)) {
3863
15.9k
            if(p_cursor != NULL)
3864
15.9k
                *p_cursor = mark;
3865
15.9k
            return mark;
3866
15.9k
        }
3867
86.5k
        if(mark->end <= off)
3868
56.1k
            break;
3869
86.5k
    }
3870
3871
60.7k
    if(p_cursor != NULL)
3872
60.7k
        *p_cursor = mark;
3873
60.7k
    return NULL;
3874
76.7k
}
3875
3876
static MD_MARK*
3877
md_scan_right_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor)
3878
989k
{
3879
989k
    MD_MARK* mark;
3880
3881
1.81M
    for(mark = mark_from; mark < ctx->marks + ctx->n_marks; mark++) {
3882
1.81M
        if(mark->ch == 'D'  ||  mark->end <= off)
3883
574k
            continue;
3884
1.23M
        if(mark->beg <= off  &&  off < mark->end  &&  (mark->flags & MD_MARK_RESOLVED)) {
3885
9.42k
            if(p_cursor != NULL)
3886
9.42k
                *p_cursor = mark;
3887
9.42k
            return mark;
3888
9.42k
        }
3889
1.22M
        if(mark->beg > off)
3890
980k
            break;
3891
1.22M
    }
3892
3893
980k
    if(p_cursor != NULL)
3894
980k
        *p_cursor = mark;
3895
980k
    return NULL;
3896
989k
}
3897
3898
static void
3899
md_analyze_permissive_autolink(MD_CTX* ctx, int mark_index)
3900
544k
{
3901
544k
    static const struct {
3902
544k
        const MD_CHAR start_char;
3903
544k
        const MD_CHAR delim_char;
3904
544k
        const MD_CHAR* allowed_nonalnum_chars;
3905
544k
        int min_components;
3906
544k
        const MD_CHAR optional_end_char;
3907
544k
    } URL_MAP[] = {
3908
544k
        { _T('\0'), _T('.'),  _T(".-_"),      2, _T('\0') },    /* host, mandatory */
3909
544k
        { _T('/'),  _T('/'),  _T("/.-_"),     0, _T('/') },     /* path */
3910
544k
        { _T('?'),  _T('&'),  _T("&.-+_=()"), 1, _T('\0') },    /* query */
3911
544k
        { _T('#'),  _T('\0'), _T(".-+_") ,    1, _T('\0') }     /* fragment */
3912
544k
    };
3913
3914
544k
    MD_MARK* opener = &ctx->marks[mark_index];
3915
544k
    MD_MARK* closer = &ctx->marks[mark_index + 1];  /* The dummy. */
3916
544k
    OFF line_beg = closer->beg;     /* md_collect_mark() set this for us */
3917
544k
    OFF line_end = closer->end;     /* ditto */
3918
544k
    OFF beg = opener->beg;
3919
544k
    OFF end = opener->end;
3920
544k
    MD_MARK* left_cursor = opener;
3921
544k
    int left_boundary_ok = FALSE;
3922
544k
    MD_MARK* right_cursor = opener;
3923
544k
    int right_boundary_ok = FALSE;
3924
544k
    unsigned i;
3925
3926
544k
    MD_ASSERT(closer->ch == 'D');
3927
3928
544k
    if(opener->ch == '@') {
3929
214k
        MD_ASSERT(CH(opener->beg) == _T('@'));
3930
3931
        /* Scan backwards for the user name (before '@'). */
3932
5.51M
        while(beg > line_beg) {
3933
5.49M
            if(ISALNUM(beg-1))
3934
5.28M
                beg--;
3935
214k
            else if(beg >= line_beg+2  &&  ISALNUM(beg-2)  &&
3936
214k
                        ISANYOF(beg-1, _T(".-_+"))  &&
3937
214k
                        md_scan_left_for_resolved_mark(ctx, left_cursor, beg-1, &left_cursor) == NULL  &&
3938
214k
                        ISALNUM(beg))
3939
19.7k
                beg--;
3940
194k
            else
3941
194k
                break;
3942
5.49M
        }
3943
214k
        if(beg == opener->beg)      /* empty user name */
3944
0
            return;
3945
214k
    }
3946
3947
    /* Verify there's line boundary, whitespace, allowed punctuation or
3948
     * resolved emphasis mark just before the suspected autolink. */
3949
544k
    if(beg == line_beg  ||  ISUNICODEWHITESPACEBEFORE(beg)  ||  ISANYOF(beg-1, _T("({["))) {
3950
189k
        left_boundary_ok = TRUE;
3951
355k
    } else if(ISANYOF(beg-1, _T("*_~"))) {
3952
55.9k
        MD_MARK* left_mark;
3953
3954
55.9k
        left_mark = md_scan_left_for_resolved_mark(ctx, left_cursor, beg-1, &left_cursor);
3955
55.9k
        if(left_mark != NULL  &&  (left_mark->flags & MD_MARK_OPENER))
3956
12.8k
            left_boundary_ok = TRUE;
3957
55.9k
    }
3958
544k
    if(!left_boundary_ok)
3959
342k
        return;
3960
3961
325k
    for(i = 0; i < SIZEOF_ARRAY(URL_MAP); i++) {
3962
294k
        int n_components = 0;
3963
294k
        int n_open_brackets = 0;
3964
3965
294k
        if(URL_MAP[i].start_char != _T('\0')) {
3966
92.7k
            if(end >= line_end  ||  CH(end) != URL_MAP[i].start_char)
3967
70.9k
                continue;
3968
21.7k
            if(URL_MAP[i].min_components > 0  &&  (end+1 >= line_end  ||  !ISALNUM(end+1)))
3969
10.7k
                continue;
3970
10.9k
            end++;
3971
10.9k
        }
3972
3973
2.88M
        while(end < line_end) {
3974
2.86M
            if(ISALNUM(end)) {
3975
1.77M
                if(n_components == 0)
3976
162k
                    n_components++;
3977
1.77M
                end++;
3978
1.77M
            } else if(end < line_end  &&
3979
1.09M
                        ISANYOF(end, URL_MAP[i].allowed_nonalnum_chars)  &&
3980
1.09M
                        md_scan_right_for_resolved_mark(ctx, right_cursor, end, &right_cursor) == NULL  &&
3981
1.09M
                        ((end > line_beg && (ISALNUM(end-1) || CH(end-1) == _T(')')))  ||  CH(end) == _T('('))  &&
3982
1.09M
                        ((end+1 < line_end && (ISALNUM(end+1) || CH(end+1) == _T('(')))  ||  CH(end) == _T(')')))
3983
898k
            {
3984
898k
                if(CH(end) == URL_MAP[i].delim_char)
3985
855k
                    n_components++;
3986
3987
                /* brackets have to be balanced. */
3988
898k
                if(CH(end) == _T('(')) {
3989
16.6k
                    n_open_brackets++;
3990
881k
                } else if(CH(end) == _T(')')) {
3991
8.32k
                    if(n_open_brackets <= 0)
3992
2.02k
                        break;
3993
6.29k
                    n_open_brackets--;
3994
6.29k
                }
3995
3996
896k
                end++;
3997
896k
            } else {
3998
194k
                break;
3999
194k
            }
4000
2.86M
        }
4001
4002
213k
        if(end < line_end  &&  URL_MAP[i].optional_end_char != _T('\0')  &&
4003
213k
                CH(end) == URL_MAP[i].optional_end_char)
4004
2.21k
            end++;
4005
4006
213k
        if(n_components < URL_MAP[i].min_components  ||  n_open_brackets != 0)
4007
90.8k
            return;
4008
4009
122k
        if(opener->ch == '@')   /* E-mail autolinks wants only the host. */
4010
80.8k
            break;
4011
122k
    }
4012
4013
    /* Verify there's line boundary, whitespace, allowed punctuation or
4014
     * resolved emphasis mark just after the suspected autolink. */
4015
111k
    if(end == line_end  ||  ISUNICODEWHITESPACE(end)  ||  ISANYOF(end, _T(")}].!?,;"))) {
4016
79.9k
        right_boundary_ok = TRUE;
4017
79.9k
    } else {
4018
31.2k
        MD_MARK* right_mark;
4019
4020
31.2k
        right_mark = md_scan_right_for_resolved_mark(ctx, right_cursor, end, &right_cursor);
4021
31.2k
        if(right_mark != NULL  &&  (right_mark->flags & MD_MARK_CLOSER))
4022
2.48k
            right_boundary_ok = TRUE;
4023
31.2k
    }
4024
111k
    if(!right_boundary_ok)
4025
28.7k
        return;
4026
4027
    /* Success, we are an autolink. */
4028
82.4k
    opener->beg = beg;
4029
82.4k
    opener->end = beg;
4030
82.4k
    closer->beg = end;
4031
82.4k
    closer->end = end;
4032
82.4k
    closer->ch = opener->ch;
4033
82.4k
    md_resolve_range(ctx, mark_index, mark_index + 1);
4034
82.4k
}
4035
4036
13.8M
#define MD_ANALYZE_NOSKIP_EMPH  0x01
4037
4038
static inline void
4039
md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
4040
                 int mark_beg, int mark_end, const CHAR* mark_chars, unsigned flags)
4041
13.6M
{
4042
13.6M
    int i = mark_beg;
4043
13.6M
    OFF last_end = lines[0].beg;
4044
4045
13.6M
    MD_UNUSED(lines);
4046
13.6M
    MD_UNUSED(n_lines);
4047
4048
1.01G
    while(i < mark_end) {
4049
1.00G
        MD_MARK* mark = &ctx->marks[i];
4050
4051
        /* Skip resolved spans. */
4052
1.00G
        if(mark->flags & MD_MARK_RESOLVED) {
4053
93.1M
            if((mark->flags & MD_MARK_OPENER)  &&
4054
93.1M
               !((flags & MD_ANALYZE_NOSKIP_EMPH) && ISANYOF_(mark->ch, "*_~")))
4055
10.1M
            {
4056
10.1M
                MD_ASSERT(i < mark->next);
4057
10.1M
                i = mark->next + 1;
4058
82.9M
            } else {
4059
82.9M
                i++;
4060
82.9M
            }
4061
93.1M
            continue;
4062
93.1M
        }
4063
4064
        /* Skip marks we do not want to deal with. */
4065
907M
        if(!ISANYOF_(mark->ch, mark_chars)) {
4066
821M
            i++;
4067
821M
            continue;
4068
821M
        }
4069
4070
        /* The resolving in previous step could have expanded a mark. */
4071
85.3M
        if(mark->beg < last_end) {
4072
231k
            i++;
4073
231k
            continue;
4074
231k
        }
4075
4076
        /* Analyze the mark. */
4077
85.1M
        switch(mark->ch) {
4078
74.8M
            case '[':   /* Pass through. */
4079
76.4M
            case '!':   /* Pass through. */
4080
80.3M
            case ']':   md_analyze_bracket(ctx, i); break;
4081
2.73M
            case '&':   md_analyze_entity(ctx, i); break;
4082
136k
            case '|':   md_analyze_table_cell_boundary(ctx, i); break;
4083
434k
            case '_':   /* Pass through. */
4084
1.06M
            case '*':   md_analyze_emph(ctx, i); break;
4085
215k
            case '~':   md_analyze_tilde(ctx, i); break;
4086
121k
            case '$':   md_analyze_dollar(ctx, i); break;
4087
265k
            case '.':   /* Pass through. */
4088
330k
            case ':':   /* Pass through. */
4089
544k
            case '@':   md_analyze_permissive_autolink(ctx, i); break;
4090
85.1M
        }
4091
4092
85.1M
        if(mark->flags & MD_MARK_RESOLVED) {
4093
815k
            if(mark->flags & MD_MARK_OPENER)
4094
225k
                last_end = ctx->marks[mark->next].end;
4095
590k
            else
4096
590k
                last_end = mark->end;
4097
815k
        }
4098
4099
85.1M
        i++;
4100
85.1M
    }
4101
13.6M
}
4102
4103
/* Analyze marks (build ctx->marks). */
4104
static int
4105
md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines, int table_mode)
4106
2.38M
{
4107
2.38M
    int ret;
4108
4109
    /* Reset the previously collected stack of marks. */
4110
2.38M
    ctx->n_marks = 0;
4111
4112
    /* Collect all marks. */
4113
2.38M
    MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
4114
4115
    /* (1) Links. */
4116
2.38M
    md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"), 0);
4117
2.38M
    MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4118
2.38M
    BRACKET_OPENERS.top = -1;
4119
2.38M
    ctx->unresolved_link_head = -1;
4120
2.38M
    ctx->unresolved_link_tail = -1;
4121
4122
2.38M
    if(table_mode) {
4123
        /* (2) Analyze table cell boundaries. */
4124
734k
        MD_ASSERT(n_lines == 1);
4125
734k
        ctx->n_table_cell_boundaries = 0;
4126
734k
        md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"), 0);
4127
734k
        return ret;
4128
734k
    }
4129
4130
    /* (3) Emphasis and strong emphasis; permissive autolinks. */
4131
1.64M
    md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4132
4133
1.64M
abort:
4134
1.64M
    return ret;
4135
1.64M
}
4136
4137
static void
4138
md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines,
4139
                         int mark_beg, int mark_end)
4140
3.61M
{
4141
3.61M
    int i;
4142
4143
3.61M
    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&"), 0);
4144
3.61M
    md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$"), 0);
4145
4146
3.61M
    if((ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) != 0) {
4147
        /* These have to be processed last, as they may be greedy and expand
4148
         * from their original mark. Also their implementation must be careful
4149
         * not to cross any (previously) resolved marks when doing so. */
4150
3.25M
        md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("@:."), MD_ANALYZE_NOSKIP_EMPH);
4151
3.25M
    }
4152
4153
61.4M
    for(i = 0; i < (int) SIZEOF_ARRAY(ctx->opener_stacks); i++)
4154
57.8M
        ctx->opener_stacks[i].top = -1;
4155
3.61M
}
4156
4157
static int
4158
md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4159
                      const CHAR* dest, SZ dest_size, int is_autolink,
4160
                      const CHAR* title, SZ title_size)
4161
2.50M
{
4162
2.50M
    MD_ATTRIBUTE_BUILD href_build = { 0 };
4163
2.50M
    MD_ATTRIBUTE_BUILD title_build = { 0 };
4164
2.50M
    MD_SPAN_A_DETAIL det;
4165
2.50M
    int ret = 0;
4166
4167
    /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4168
     * MD_SPAN_IMG_DETAIL are binary-compatible. */
4169
2.50M
    memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4170
2.50M
    MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4171
2.50M
                    (is_autolink ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4172
2.50M
                    &det.href, &href_build));
4173
2.50M
    MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4174
2.50M
    det.is_autolink = is_autolink;
4175
2.50M
    if(enter)
4176
535k
        MD_ENTER_SPAN(type, &det);
4177
1.97M
    else
4178
1.97M
        MD_LEAVE_SPAN(type, &det);
4179
4180
2.50M
abort:
4181
2.50M
    md_free_attribute(ctx, &href_build);
4182
2.50M
    md_free_attribute(ctx, &title_build);
4183
2.50M
    return ret;
4184
2.50M
}
4185
4186
static int
4187
md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4188
39.4k
{
4189
39.4k
    MD_ATTRIBUTE_BUILD target_build = { 0 };
4190
39.4k
    MD_SPAN_WIKILINK_DETAIL det;
4191
39.4k
    int ret = 0;
4192
4193
39.4k
    memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4194
39.4k
    MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4195
4196
39.4k
    if (enter)
4197
19.7k
        MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4198
19.7k
    else
4199
19.7k
        MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4200
4201
39.4k
abort:
4202
39.4k
    md_free_attribute(ctx, &target_build);
4203
39.4k
    return ret;
4204
39.4k
}
4205
4206
4207
/* Render the output, accordingly to the analyzed ctx->marks. */
4208
static int
4209
md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
4210
1.64M
{
4211
1.64M
    MD_TEXTTYPE text_type;
4212
1.64M
    const MD_LINE* line = lines;
4213
1.64M
    MD_MARK* prev_mark = NULL;
4214
1.64M
    MD_MARK* mark;
4215
1.64M
    OFF off = lines[0].beg;
4216
1.64M
    OFF end = lines[n_lines-1].end;
4217
1.64M
    OFF tmp;
4218
1.64M
    int enforce_hardbreak = 0;
4219
1.64M
    int ret = 0;
4220
4221
    /* Find first resolved mark. Note there is always at least one resolved
4222
     * mark,  the dummy last one after the end of the latest line we actually
4223
     * never really reach. This saves us of a lot of special checks and cases
4224
     * in this function. */
4225
1.64M
    mark = ctx->marks;
4226
6.56M
    while(!(mark->flags & MD_MARK_RESOLVED))
4227
4.92M
        mark++;
4228
4229
1.64M
    text_type = MD_TEXT_NORMAL;
4230
4231
27.2M
    while(1) {
4232
        /* Process the text up to the next mark or end-of-line. */
4233
27.2M
        tmp = (line->end < mark->beg ? line->end : mark->beg);
4234
27.2M
        if(tmp > off) {
4235
8.64M
            MD_TEXT(text_type, STR(off), tmp - off);
4236
8.64M
            off = tmp;
4237
8.64M
        }
4238
4239
        /* If reached the mark, process it and move to next one. */
4240
27.2M
        if(off >= mark->beg) {
4241
22.6M
            switch(mark->ch) {
4242
3.15M
                case '\\':      /* Backslash escape. */
4243
3.15M
                    if(ISNEWLINE(mark->beg+1))
4244
2.79M
                        enforce_hardbreak = 1;
4245
361k
                    else
4246
361k
                        MD_TEXT(text_type, STR(mark->beg+1), 1);
4247
3.15M
                    break;
4248
4249
3.15M
                case ' ':       /* Non-trivial space. */
4250
23.3k
                    MD_TEXT(text_type, _T(" "), 1);
4251
23.3k
                    break;
4252
4253
696k
                case '`':       /* Code span. */
4254
696k
                    if(mark->flags & MD_MARK_OPENER) {
4255
348k
                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
4256
348k
                        text_type = MD_TEXT_CODE;
4257
348k
                    } else {
4258
348k
                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
4259
348k
                        text_type = MD_TEXT_NORMAL;
4260
348k
                    }
4261
696k
                    break;
4262
4263
696k
                case '_':       /* Underline (or emphasis if we fall through). */
4264
211k
                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
4265
129k
                        if(mark->flags & MD_MARK_OPENER) {
4266
162k
                            while(off < mark->end) {
4267
97.9k
                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
4268
97.9k
                                off++;
4269
97.9k
                            }
4270
64.9k
                        } else {
4271
162k
                            while(off < mark->end) {
4272
97.8k
                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
4273
97.8k
                                off++;
4274
97.8k
                            }
4275
64.9k
                        }
4276
129k
                        break;
4277
129k
                    }
4278
81.5k
                    MD_FALLTHROUGH();
4279
4280
530k
                case '*':       /* Emphasis, strong emphasis. */
4281
530k
                    if(mark->flags & MD_MARK_OPENER) {
4282
265k
                        if((mark->end - off) % 2) {
4283
215k
                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
4284
215k
                            off++;
4285
215k
                        }
4286
2.35M
                        while(off + 1 < mark->end) {
4287
2.08M
                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
4288
2.08M
                            off += 2;
4289
2.08M
                        }
4290
265k
                    } else {
4291
2.35M
                        while(off + 1 < mark->end) {
4292
2.08M
                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
4293
2.08M
                            off += 2;
4294
2.08M
                        }
4295
265k
                        if((mark->end - off) % 2) {
4296
215k
                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
4297
215k
                            off++;
4298
215k
                        }
4299
265k
                    }
4300
530k
                    break;
4301
4302
530k
                case '~':
4303
121k
                    if(mark->flags & MD_MARK_OPENER)
4304
61.1k
                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
4305
60.6k
                    else
4306
60.6k
                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
4307
121k
                    break;
4308
4309
121k
                case '$':
4310
77.8k
                    if(mark->flags & MD_MARK_OPENER) {
4311
38.9k
                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4312
38.9k
                        text_type = MD_TEXT_LATEXMATH;
4313
38.9k
                    } else {
4314
38.9k
                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4315
38.9k
                        text_type = MD_TEXT_NORMAL;
4316
38.9k
                    }
4317
77.8k
                    break;
4318
4319
418k
                case '[':       /* Link, wiki link, image. */
4320
462k
                case '!':
4321
2.36M
                case ']':
4322
2.36M
                {
4323
2.36M
                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
4324
2.36M
                    const MD_MARK* closer = &ctx->marks[opener->next];
4325
2.36M
                    const MD_MARK* dest_mark;
4326
2.36M
                    const MD_MARK* title_mark;
4327
4328
2.36M
                    if ((opener->ch == '[' && closer->ch == ']') &&
4329
2.36M
                        opener->end - opener->beg >= 2 &&
4330
2.36M
                        closer->end - closer->beg >= 2)
4331
39.4k
                    {
4332
39.4k
                        int has_label = (opener->end - opener->beg > 2);
4333
39.4k
                        SZ target_sz;
4334
4335
39.4k
                        if(has_label)
4336
26.2k
                            target_sz = opener->end - (opener->beg+2);
4337
13.2k
                        else
4338
13.2k
                            target_sz = closer->beg - opener->end;
4339
4340
39.4k
                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
4341
39.4k
                                 has_label ? STR(opener->beg+2) : STR(opener->end),
4342
39.4k
                                 target_sz));
4343
4344
39.4k
                        break;
4345
39.4k
                    }
4346
4347
2.32M
                    dest_mark = opener+1;
4348
2.32M
                    MD_ASSERT(dest_mark->ch == 'D');
4349
2.32M
                    title_mark = opener+2;
4350
2.32M
                    MD_ASSERT(title_mark->ch == 'D');
4351
4352
2.32M
                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
4353
2.32M
                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
4354
2.32M
                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
4355
2.32M
                                md_mark_get_ptr(ctx, (int)(title_mark - ctx->marks)),
4356
2.32M
                title_mark->prev));
4357
4358
                    /* link/image closer may span multiple lines. */
4359
2.32M
                    if(mark->ch == ']') {
4360
1.97M
                        while(mark->end > line->end)
4361
94.8k
                            line++;
4362
1.88M
                    }
4363
4364
2.32M
                    break;
4365
2.32M
                }
4366
4367
495k
                case '<':
4368
991k
                case '>':       /* Autolink or raw HTML. */
4369
991k
                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
4370
                        /* Raw HTML. */
4371
946k
                        if(mark->flags & MD_MARK_OPENER)
4372
473k
                            text_type = MD_TEXT_HTML;
4373
473k
                        else
4374
473k
                            text_type = MD_TEXT_NORMAL;
4375
946k
                        break;
4376
946k
                    }
4377
                    /* Pass through, if auto-link. */
4378
44.3k
                    MD_FALLTHROUGH();
4379
4380
161k
                case '@':       /* Permissive e-mail autolink. */
4381
175k
                case ':':       /* Permissive URL autolink. */
4382
186k
                case '.':       /* Permissive WWW autolink. */
4383
186k
                {
4384
186k
                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
4385
186k
                    MD_MARK* closer = &ctx->marks[opener->next];
4386
186k
                    const CHAR* dest = STR(opener->end);
4387
186k
                    SZ dest_size = closer->beg - opener->end;
4388
4389
                    /* For permissive auto-links we do not know closer mark
4390
                     * position at the time of md_collect_marks(), therefore
4391
                     * it can be out-of-order in ctx->marks[].
4392
                     *
4393
                     * With this flag, we make sure that we output the closer
4394
                     * only if we processed the opener. */
4395
186k
                    if(mark->flags & MD_MARK_OPENER)
4396
93.1k
                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
4397
4398
186k
                    if(opener->ch == '@' || opener->ch == '.' ||
4399
186k
                        (opener->ch == '<' && (opener->flags & MD_MARK_AUTOLINK_MISSING_MAILTO)))
4400
166k
                    {
4401
166k
                        dest_size += 7;
4402
166k
                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
4403
166k
                        memcpy(ctx->buffer,
4404
166k
                                (opener->ch == '.' ? _T("http://") : _T("mailto:")),
4405
166k
                                7 * sizeof(CHAR));
4406
166k
                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
4407
166k
                        dest = ctx->buffer;
4408
166k
                    }
4409
4410
186k
                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
4411
186k
                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
4412
186k
                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
4413
186k
                    break;
4414
186k
                }
4415
4416
186k
                case '&':       /* Entity. */
4417
130k
                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
4418
130k
                    break;
4419
4420
13.7M
                case '\0':
4421
13.7M
                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
4422
13.7M
                    break;
4423
4424
13.7M
                case 127:
4425
7.94k
                    goto abort;
4426
22.6M
            }
4427
4428
22.6M
            off = mark->end;
4429
4430
            /* Move to next resolved mark. */
4431
22.6M
            prev_mark = mark;
4432
22.6M
            mark++;
4433
234M
            while(!(mark->flags & MD_MARK_RESOLVED)  ||  mark->beg < off)
4434
211M
                mark++;
4435
22.6M
        }
4436
4437
        /* If reached end of line, move to next one. */
4438
27.2M
        if(off >= line->end) {
4439
            /* If it is the last line, we are done. */
4440
8.18M
            if(off >= end)
4441
1.63M
                break;
4442
4443
6.55M
            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
4444
1.54M
                MD_ASSERT(prev_mark != NULL);
4445
1.54M
                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$')  &&  (prev_mark->flags & MD_MARK_OPENER));
4446
1.54M
                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$')  &&  (mark->flags & MD_MARK_CLOSER));
4447
4448
                /* Inside a code span, trailing line whitespace has to be
4449
                 * outputted. */
4450
1.54M
                tmp = off;
4451
1.77M
                while(off < ctx->size  &&  ISBLANK(off))
4452
227k
                    off++;
4453
1.54M
                if(off > tmp)
4454
49.7k
                    MD_TEXT(text_type, STR(tmp), off-tmp);
4455
4456
                /* and new lines are transformed into single spaces. */
4457
1.54M
                if(off == line->end)
4458
1.45M
                    MD_TEXT(text_type, _T(" "), 1);
4459
5.00M
            } else if(text_type == MD_TEXT_HTML) {
4460
                /* Inside raw HTML, we output the new line verbatim, including
4461
                 * any trailing spaces. */
4462
142k
                tmp = off;
4463
239k
                while(tmp < end  &&  ISBLANK(tmp))
4464
96.7k
                    tmp++;
4465
142k
                if(tmp > off)
4466
8.62k
                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
4467
142k
                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
4468
4.85M
            } else {
4469
                /* Output soft or hard line break. */
4470
4.85M
                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
4471
4472
4.85M
                if(text_type == MD_TEXT_NORMAL) {
4473
4.85M
                    if(enforce_hardbreak  ||  (ctx->parser.flags & MD_FLAG_HARD_SOFT_BREAKS)) {
4474
3.25M
                        break_type = MD_TEXT_BR;
4475
3.25M
                    } else {
4476
2.89M
                        while(off < ctx->size  &&  ISBLANK(off))
4477
1.29M
                            off++;
4478
1.60M
                        if(off >= line->end + 2  &&  CH(off-2) == _T(' ')  &&  CH(off-1) == _T(' ')  &&  ISNEWLINE(off))
4479
1.40k
                            break_type = MD_TEXT_BR;
4480
1.60M
                    }
4481
4.85M
                }
4482
4483
4.85M
                MD_TEXT(break_type, _T("\n"), 1);
4484
4.85M
            }
4485
4486
            /* Move to the next line. */
4487
6.55M
            line++;
4488
6.55M
            off = line->beg;
4489
4490
6.55M
            enforce_hardbreak = 0;
4491
6.55M
        }
4492
27.2M
    }
4493
4494
1.64M
abort:
4495
1.64M
    return ret;
4496
1.64M
}
4497
4498
4499
/***************************
4500
 ***  Processing Tables  ***
4501
 ***************************/
4502
4503
static void
4504
md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4505
14.3k
{
4506
14.3k
    static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4507
14.3k
    OFF off = beg;
4508
4509
33.7k
    while(n_align > 0) {
4510
19.4k
        int index = 0;  /* index into align_map[] */
4511
4512
642k
        while(CH(off) != _T('-'))
4513
622k
            off++;
4514
19.4k
        if(off > beg  &&  CH(off-1) == _T(':'))
4515
2.36k
            index |= 1;
4516
42.0k
        while(off < end  &&  CH(off) == _T('-'))
4517
22.5k
            off++;
4518
19.4k
        if(off < end  &&  CH(off) == _T(':'))
4519
2.04k
            index |= 2;
4520
4521
19.4k
        *align = align_map[index];
4522
19.4k
        align++;
4523
19.4k
        n_align--;
4524
19.4k
    }
4525
4526
14.3k
}
4527
4528
/* Forward declaration. */
4529
static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines);
4530
4531
static int
4532
md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4533
936k
{
4534
936k
    MD_LINE line;
4535
936k
    MD_BLOCK_TD_DETAIL det;
4536
936k
    int ret = 0;
4537
4538
980k
    while(beg < end  &&  ISWHITESPACE(beg))
4539
44.1k
        beg++;
4540
948k
    while(end > beg  &&  ISWHITESPACE(end-1))
4541
12.6k
        end--;
4542
4543
936k
    det.align = align;
4544
936k
    line.beg = beg;
4545
936k
    line.end = end;
4546
4547
936k
    MD_ENTER_BLOCK(cell_type, &det);
4548
936k
    MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4549
936k
    MD_LEAVE_BLOCK(cell_type, &det);
4550
4551
936k
abort:
4552
936k
    return ret;
4553
936k
}
4554
4555
static int
4556
md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4557
                     const MD_ALIGN* align, int col_count)
4558
734k
{
4559
734k
    MD_LINE line;
4560
734k
    OFF* pipe_offs = NULL;
4561
734k
    int i, j, k, n;
4562
734k
    int ret = 0;
4563
4564
734k
    line.beg = beg;
4565
734k
    line.end = end;
4566
4567
    /* Break the line into table cells by identifying pipe characters who
4568
     * form the cell boundary. */
4569
734k
    MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4570
4571
    /* We have to remember the cell boundaries in local buffer because
4572
     * ctx->marks[] shall be reused during cell contents processing. */
4573
734k
    n = ctx->n_table_cell_boundaries + 2;
4574
734k
    pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4575
734k
    if(pipe_offs == NULL) {
4576
0
        MD_LOG("malloc() failed.");
4577
0
        ret = -1;
4578
0
        goto abort;
4579
0
    }
4580
734k
    j = 0;
4581
734k
    pipe_offs[j++] = beg;
4582
871k
    for(i = ctx->table_cell_boundaries_head; i >= 0; i = ctx->marks[i].next) {
4583
136k
        MD_MARK* mark = &ctx->marks[i];
4584
136k
        pipe_offs[j++] = mark->end;
4585
136k
    }
4586
734k
    pipe_offs[j++] = end+1;
4587
4588
    /* Process cells. */
4589
734k
    MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4590
734k
    k = 0;
4591
1.48M
    for(i = 0; i < j-1  &&  k < col_count; i++) {
4592
752k
        if(pipe_offs[i] < pipe_offs[i+1]-1)
4593
741k
            MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4594
752k
    }
4595
    /* Make sure we call enough table cells even if the current table contains
4596
     * too few of them. */
4597
929k
    while(k < col_count)
4598
194k
        MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4599
734k
    MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4600
4601
734k
abort:
4602
734k
    free(pipe_offs);
4603
4604
734k
    ctx->table_cell_boundaries_head = -1;
4605
734k
    ctx->table_cell_boundaries_tail = -1;
4606
4607
734k
    return ret;
4608
734k
}
4609
4610
static int
4611
md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, MD_SIZE n_lines)
4612
14.3k
{
4613
14.3k
    MD_ALIGN* align;
4614
14.3k
    MD_SIZE line_index;
4615
14.3k
    int ret = 0;
4616
4617
    /* At least two lines have to be present: The column headers and the line
4618
     * with the underlines. */
4619
14.3k
    MD_ASSERT(n_lines >= 2);
4620
4621
14.3k
    align = malloc(col_count * sizeof(MD_ALIGN));
4622
14.3k
    if(align == NULL) {
4623
0
        MD_LOG("malloc() failed.");
4624
0
        ret = -1;
4625
0
        goto abort;
4626
0
    }
4627
4628
14.3k
    md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4629
4630
14.3k
    MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4631
14.3k
    MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4632
14.3k
                        lines[0].beg, lines[0].end, align, col_count));
4633
14.3k
    MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4634
4635
14.3k
    if(n_lines > 2) {
4636
8.47k
        MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4637
728k
        for(line_index = 2; line_index < n_lines; line_index++) {
4638
720k
            MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4639
720k
                     lines[line_index].beg, lines[line_index].end, align, col_count));
4640
720k
        }
4641
8.47k
        MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4642
8.47k
    }
4643
4644
14.3k
abort:
4645
14.3k
    free(align);
4646
14.3k
    return ret;
4647
14.3k
}
4648
4649
4650
/**************************
4651
 ***  Processing Block  ***
4652
 **************************/
4653
4654
107M
/* Bits stored in MD_BLOCK::flags. */

/* The record opens a container block (the enter-block callback is fired). */
#define MD_BLOCK_CONTAINER_OPENER   0x01
/* The record closes a container block (the leave-block callback is fired). */
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask telling container records apart from leaf blocks. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list (MD_BLOCK_UL or MD_BLOCK_OL) is loose, i.e. not tight. */
#define MD_BLOCK_LOOSE_LIST         0x04
/* The MD_BLOCK_H block is a setext header; its last line is the underline. */
#define MD_BLOCK_SETEXT_HEADER      0x08
4659
4660
struct MD_BLOCK_tag {
4661
    MD_BLOCKTYPE type  :  8;
4662
    unsigned flags     :  8;
4663
4664
    /* MD_BLOCK_H:      Header level (1 - 6)
4665
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
4666
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
4667
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
4668
     */
4669
    unsigned data      : 16;
4670
4671
    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
4672
     * MD_BLOCK_LI:     Task mark offset in the input doc.
4673
     * MD_BLOCK_OL:     Start item number.
4674
     */
4675
    MD_SIZE n_lines;
4676
};
4677
4678
struct MD_CONTAINER_tag {
4679
    CHAR ch;
4680
    unsigned is_loose    : 8;
4681
    unsigned is_task     : 8;
4682
    unsigned start;
4683
    unsigned mark_indent;
4684
    unsigned contents_indent;
4685
    OFF block_byte_off;
4686
    OFF task_mark_off;
4687
};
4688
4689
4690
static int
4691
md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, MD_SIZE n_lines)
4692
1.64M
{
4693
1.64M
    int i;
4694
1.64M
    int ret;
4695
4696
1.64M
    MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4697
1.64M
    MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4698
4699
1.64M
abort:
4700
    /* Free any temporary memory blocks stored within some dummy marks. */
4701
1.67M
    for(i = ctx->ptr_stack.top; i >= 0; i = ctx->marks[i].next)
4702
25.3k
        free(md_mark_get_ptr(ctx, i));
4703
1.64M
    ctx->ptr_stack.top = -1;
4704
4705
1.64M
    return ret;
4706
1.64M
}
4707
4708
static int
4709
md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, MD_SIZE n_lines)
4710
105k
{
4711
105k
    static const CHAR indent_chunk_str[] = _T("                ");
4712
105k
    static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4713
4714
105k
    MD_SIZE line_index;
4715
105k
    int ret = 0;
4716
4717
887k
    for(line_index = 0; line_index < n_lines; line_index++) {
4718
781k
        const MD_VERBATIMLINE* line = &lines[line_index];
4719
781k
        int indent = line->indent;
4720
4721
781k
        MD_ASSERT(indent >= 0);
4722
4723
        /* Output code indentation. */
4724
2.24M
        while(indent > (int) indent_chunk_size) {
4725
1.46M
            MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4726
1.46M
            indent -= indent_chunk_size;
4727
1.46M
        }
4728
781k
        if(indent > 0)
4729
19.6k
            MD_TEXT(text_type, indent_chunk_str, indent);
4730
4731
        /* Output the code line itself. */
4732
781k
        MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4733
4734
        /* Enforce end-of-line. */
4735
781k
        MD_TEXT(text_type, _T("\n"), 1);
4736
781k
    }
4737
4738
105k
abort:
4739
105k
    return ret;
4740
105k
}
4741
4742
static int
4743
md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, MD_SIZE n_lines)
4744
81.7k
{
4745
81.7k
    if(is_fenced) {
4746
        /* Skip the first line in case of fenced code: It is the fence.
4747
         * (Only the starting fence is present due to logic in md_analyze_line().) */
4748
68.7k
        lines++;
4749
68.7k
        n_lines--;
4750
68.7k
    } else {
4751
        /* Ignore blank lines at start/end of indented code block. */
4752
12.9k
        while(n_lines > 0  &&  lines[0].beg == lines[0].end) {
4753
0
            lines++;
4754
0
            n_lines--;
4755
0
        }
4756
24.1k
        while(n_lines > 0  &&  lines[n_lines-1].beg == lines[n_lines-1].end) {
4757
11.1k
            n_lines--;
4758
11.1k
        }
4759
12.9k
    }
4760
4761
81.7k
    if(n_lines == 0)
4762
46.0k
        return 0;
4763
4764
35.6k
    return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4765
81.7k
}
4766
4767
static int
4768
md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4769
                            MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4770
68.7k
{
4771
68.7k
    const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4772
68.7k
    OFF beg = fence_line->beg;
4773
68.7k
    OFF end = fence_line->end;
4774
68.7k
    OFF lang_end;
4775
68.7k
    CHAR fence_ch = CH(fence_line->beg);
4776
68.7k
    int ret = 0;
4777
4778
    /* Skip the fence itself. */
4779
376k
    while(beg < ctx->size  &&  CH(beg) == fence_ch)
4780
307k
        beg++;
4781
    /* Trim initial spaces. */
4782
78.5k
    while(beg < ctx->size  &&  CH(beg) == _T(' '))
4783
9.76k
        beg++;
4784
4785
    /* Trim trailing spaces. */
4786
74.6k
    while(end > beg  &&  CH(end-1) == _T(' '))
4787
5.90k
        end--;
4788
4789
    /* Build info string attribute. */
4790
68.7k
    MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4791
4792
    /* Build info string attribute. */
4793
68.7k
    lang_end = beg;
4794
454k
    while(lang_end < end  &&  !ISWHITESPACE(lang_end))
4795
385k
        lang_end++;
4796
68.7k
    MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4797
4798
68.7k
    det->fence_char = fence_ch;
4799
4800
68.7k
abort:
4801
68.7k
    return ret;
4802
68.7k
}
4803
4804
static int
4805
md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4806
882k
{
4807
882k
    union {
4808
882k
        MD_BLOCK_H_DETAIL header;
4809
882k
        MD_BLOCK_CODE_DETAIL code;
4810
882k
        MD_BLOCK_TABLE_DETAIL table;
4811
882k
    } det;
4812
882k
    MD_ATTRIBUTE_BUILD info_build;
4813
882k
    MD_ATTRIBUTE_BUILD lang_build;
4814
882k
    int is_in_tight_list;
4815
882k
    int clean_fence_code_detail = FALSE;
4816
882k
    int ret = 0;
4817
4818
882k
    memset(&det, 0, sizeof(det));
4819
4820
882k
    if(ctx->n_containers == 0)
4821
535k
        is_in_tight_list = FALSE;
4822
346k
    else
4823
346k
        is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4824
4825
882k
    switch(block->type) {
4826
203k
        case MD_BLOCK_H:
4827
203k
            det.header.level = block->data;
4828
203k
            break;
4829
4830
81.7k
        case MD_BLOCK_CODE:
4831
            /* For fenced code block, we may need to set the info string. */
4832
81.7k
            if(block->data != 0) {
4833
68.7k
                memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4834
68.7k
                clean_fence_code_detail = TRUE;
4835
68.7k
                MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4836
68.7k
            }
4837
81.7k
            break;
4838
4839
81.7k
        case MD_BLOCK_TABLE:
4840
14.3k
            det.table.col_count = block->data;
4841
14.3k
            det.table.head_row_count = 1;
4842
14.3k
            det.table.body_row_count = block->n_lines - 2;
4843
14.3k
            break;
4844
4845
583k
        default:
4846
            /* Noop. */
4847
583k
            break;
4848
882k
    }
4849
4850
882k
    if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4851
727k
        MD_ENTER_BLOCK(block->type, (void*) &det);
4852
4853
    /* Process the block contents accordingly to is type. */
4854
882k
    switch(block->type) {
4855
7.17k
        case MD_BLOCK_HR:
4856
            /* noop */
4857
7.17k
            break;
4858
4859
81.7k
        case MD_BLOCK_CODE:
4860
81.7k
            MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4861
81.7k
                            (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4862
81.7k
            break;
4863
4864
81.7k
        case MD_BLOCK_HTML:
4865
70.2k
            MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4866
70.2k
                            (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4867
70.2k
            break;
4868
4869
70.2k
        case MD_BLOCK_TABLE:
4870
14.3k
            MD_CHECK(md_process_table_block_contents(ctx, block->data,
4871
14.3k
                            (const MD_LINE*)(block + 1), block->n_lines));
4872
14.3k
            break;
4873
4874
709k
        default:
4875
709k
            MD_CHECK(md_process_normal_block_contents(ctx,
4876
709k
                            (const MD_LINE*)(block + 1), block->n_lines));
4877
709k
            break;
4878
882k
    }
4879
4880
882k
    if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4881
727k
        MD_LEAVE_BLOCK(block->type, (void*) &det);
4882
4883
882k
abort:
4884
882k
    if(clean_fence_code_detail) {
4885
68.7k
        md_free_attribute(ctx, &info_build);
4886
68.7k
        md_free_attribute(ctx, &lang_build);
4887
68.7k
    }
4888
882k
    return ret;
4889
882k
}
4890
4891
static int
4892
md_process_all_blocks(MD_CTX* ctx)
4893
16.6k
{
4894
16.6k
    int byte_off = 0;
4895
16.6k
    int ret = 0;
4896
4897
    /* ctx->containers now is not needed for detection of lists and list items
4898
     * so we reuse it for tracking what lists are loose or tight. We rely
4899
     * on the fact the vector is large enough to hold the deepest nesting
4900
     * level of lists. */
4901
16.6k
    ctx->n_containers = 0;
4902
4903
54.1M
    while(byte_off < ctx->n_block_bytes) {
4904
54.1M
        MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4905
54.1M
        union {
4906
54.1M
            MD_BLOCK_UL_DETAIL ul;
4907
54.1M
            MD_BLOCK_OL_DETAIL ol;
4908
54.1M
            MD_BLOCK_LI_DETAIL li;
4909
54.1M
        } det;
4910
4911
54.1M
        switch(block->type) {
4912
13.6M
            case MD_BLOCK_UL:
4913
13.6M
                det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4914
13.6M
                det.ul.mark = (CHAR) block->data;
4915
13.6M
                break;
4916
4917
42.3k
            case MD_BLOCK_OL:
4918
42.3k
                det.ol.start = block->n_lines;
4919
42.3k
                det.ol.is_tight =  (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4920
42.3k
                det.ol.mark_delimiter = (CHAR) block->data;
4921
42.3k
                break;
4922
4923
14.6M
            case MD_BLOCK_LI:
4924
14.6M
                det.li.is_task = (block->data != 0);
4925
14.6M
                det.li.task_mark = (CHAR) block->data;
4926
14.6M
                det.li.task_mark_offset = (OFF) block->n_lines;
4927
14.6M
                break;
4928
4929
25.7M
            default:
4930
                /* noop */
4931
25.7M
                break;
4932
54.1M
        }
4933
4934
54.1M
        if(block->flags & MD_BLOCK_CONTAINER) {
4935
53.2M
            if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4936
26.6M
                MD_LEAVE_BLOCK(block->type, &det);
4937
4938
26.6M
                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4939
19.2M
                    ctx->n_containers--;
4940
26.6M
            }
4941
4942
53.2M
            if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4943
26.6M
                MD_ENTER_BLOCK(block->type, &det);
4944
4945
26.6M
                if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4946
6.85M
                    ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4947
6.85M
                    ctx->n_containers++;
4948
19.7M
                } else if(block->type == MD_BLOCK_QUOTE) {
4949
                    /* This causes that any text in a block quote, even if
4950
                     * nested inside a tight list item, is wrapped with
4951
                     * <p>...</p>. */
4952
12.4M
                    ctx->containers[ctx->n_containers].is_loose = TRUE;
4953
12.4M
                    ctx->n_containers++;
4954
12.4M
                }
4955
26.6M
            }
4956
53.2M
        } else {
4957
882k
            MD_CHECK(md_process_leaf_block(ctx, block));
4958
4959
882k
            if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4960
152k
                byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4961
730k
            else
4962
730k
                byte_off += block->n_lines * sizeof(MD_LINE);
4963
882k
        }
4964
4965
54.1M
        byte_off += sizeof(MD_BLOCK);
4966
54.1M
    }
4967
4968
16.6k
    ctx->n_block_bytes = 0;
4969
4970
16.6k
abort:
4971
16.6k
    return ret;
4972
16.6k
}
4973
4974
4975
/************************************
4976
 ***  Grouping Lines into Blocks  ***
4977
 ************************************/
4978
4979
static void*
4980
md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4981
66.0M
{
4982
66.0M
    void* ptr;
4983
4984
66.0M
    if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4985
33.5k
        void* new_block_bytes;
4986
4987
33.5k
        ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4988
33.5k
                ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4989
33.5k
                : 512);
4990
33.5k
        new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4991
33.5k
        if(new_block_bytes == NULL) {
4992
0
            MD_LOG("realloc() failed.");
4993
0
            return NULL;
4994
0
        }
4995
4996
        /* Fix the ->current_block after the reallocation. */
4997
33.5k
        if(ctx->current_block != NULL) {
4998
9.47k
            OFF off_current_block = (OFF) ((char*) ctx->current_block - (char*) ctx->block_bytes);
4999
9.47k
            ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
5000
9.47k
        }
5001
5002
33.5k
        ctx->block_bytes = new_block_bytes;
5003
33.5k
    }
5004
5005
66.0M
    ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
5006
66.0M
    ctx->n_block_bytes += n_bytes;
5007
66.0M
    return ptr;
5008
66.0M
}
5009
5010
static int
5011
md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
5012
897k
{
5013
897k
    MD_BLOCK* block;
5014
5015
897k
    MD_ASSERT(ctx->current_block == NULL);
5016
5017
897k
    block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5018
897k
    if(block == NULL)
5019
0
        return -1;
5020
5021
897k
    switch(line->type) {
5022
7.17k
        case MD_LINE_HR:
5023
7.17k
            block->type = MD_BLOCK_HR;
5024
7.17k
            break;
5025
5026
171k
        case MD_LINE_ATXHEADER:
5027
171k
        case MD_LINE_SETEXTHEADER:
5028
171k
            block->type = MD_BLOCK_H;
5029
171k
            break;
5030
5031
68.7k
        case MD_LINE_FENCEDCODE:
5032
81.7k
        case MD_LINE_INDENTEDCODE:
5033
81.7k
            block->type = MD_BLOCK_CODE;
5034
81.7k
            break;
5035
5036
567k
        case MD_LINE_TEXT:
5037
567k
            block->type = MD_BLOCK_P;
5038
567k
            break;
5039
5040
70.2k
        case MD_LINE_HTML:
5041
70.2k
            block->type = MD_BLOCK_HTML;
5042
70.2k
            break;
5043
5044
0
        case MD_LINE_BLANK:
5045
0
        case MD_LINE_SETEXTUNDERLINE:
5046
0
        case MD_LINE_TABLEUNDERLINE:
5047
0
        default:
5048
0
            MD_UNREACHABLE();
5049
0
            break;
5050
897k
    }
5051
5052
897k
    block->flags = 0;
5053
897k
    block->data = line->data;
5054
897k
    block->n_lines = 0;
5055
5056
897k
    ctx->current_block = block;
5057
897k
    return 0;
5058
897k
}
5059
5060
/* Eat from start of current (textual) block any reference definitions and
5061
 * remember them so we can resolve any links referring to them.
5062
 *
5063
 * (Reference definitions can only be at start of it as they cannot break
5064
 * a paragraph.)
5065
 */
5066
static int
5067
md_consume_link_reference_definitions(MD_CTX* ctx)
5068
231k
{
5069
231k
    MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5070
231k
    MD_SIZE n_lines = ctx->current_block->n_lines;
5071
231k
    MD_SIZE n = 0;
5072
5073
    /* Compute how many lines at the start of the block form one or more
5074
     * reference definitions. */
5075
1.62M
    while(n < n_lines) {
5076
1.60M
        int n_link_ref_lines;
5077
5078
1.60M
        n_link_ref_lines = md_is_link_reference_definition(ctx,
5079
1.60M
                                    lines + n, n_lines - n);
5080
        /* Not a reference definition? */
5081
1.60M
        if(n_link_ref_lines == 0)
5082
216k
            break;
5083
5084
        /* We fail if it is the ref. def. but it could not be stored due
5085
         * a memory allocation error. */
5086
1.38M
        if(n_link_ref_lines < 0)
5087
0
            return -1;
5088
5089
1.38M
        n += n_link_ref_lines;
5090
1.38M
    }
5091
5092
    /* If there was at least one reference definition, we need to remove
5093
     * its lines from the block, or perhaps even the whole block. */
5094
231k
    if(n > 0) {
5095
38.5k
        if(n == n_lines) {
5096
            /* Remove complete block. */
5097
15.1k
            ctx->n_block_bytes -= n * sizeof(MD_LINE);
5098
15.1k
            ctx->n_block_bytes -= sizeof(MD_BLOCK);
5099
15.1k
            ctx->current_block = NULL;
5100
23.4k
        } else {
5101
            /* Remove just some initial lines from the block. */
5102
23.4k
            memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
5103
23.4k
            ctx->current_block->n_lines -= n;
5104
23.4k
            ctx->n_block_bytes -= n * sizeof(MD_LINE);
5105
23.4k
        }
5106
38.5k
    }
5107
5108
231k
    return 0;
5109
231k
}
5110
5111
static int
5112
md_end_current_block(MD_CTX* ctx)
5113
68.5M
{
5114
68.5M
    int ret = 0;
5115
5116
68.5M
    if(ctx->current_block == NULL)
5117
67.6M
        return ret;
5118
5119
    /* Check whether there is a reference definition. (We do this here instead
5120
     * of in md_analyze_line() because reference definition can take multiple
5121
     * lines.) */
5122
899k
    if(ctx->current_block->type == MD_BLOCK_P  ||
5123
899k
       (ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5124
554k
    {
5125
554k
        MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5126
554k
        if(lines[0].beg < ctx->size  &&  CH(lines[0].beg) == _T('[')) {
5127
231k
            MD_CHECK(md_consume_link_reference_definitions(ctx));
5128
231k
            if(ctx->current_block == NULL)
5129
15.1k
                return ret;
5130
231k
        }
5131
554k
    }
5132
5133
884k
    if(ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5134
33.3k
        MD_SIZE n_lines = ctx->current_block->n_lines;
5135
5136
33.3k
        if(n_lines > 1) {
5137
            /* Get rid of the underline. */
5138
31.4k
            ctx->current_block->n_lines--;
5139
31.4k
            ctx->n_block_bytes -= sizeof(MD_LINE);
5140
31.4k
        } else {
5141
            /* Only the underline has left after eating the ref. defs.
5142
             * Keep the line as beginning of a new ordinary paragraph. */
5143
1.97k
            ctx->current_block->type = MD_BLOCK_P;
5144
1.97k
            return 0;
5145
1.97k
        }
5146
33.3k
    }
5147
5148
    /* Mark we are not building any block anymore. */
5149
882k
    ctx->current_block = NULL;
5150
5151
882k
abort:
5152
882k
    return ret;
5153
882k
}
5154
5155
static int
5156
md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5157
11.9M
{
5158
11.9M
    MD_ASSERT(ctx->current_block != NULL);
5159
5160
11.9M
    if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5161
861k
        MD_VERBATIMLINE* line;
5162
5163
861k
        line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5164
861k
        if(line == NULL)
5165
0
            return -1;
5166
5167
861k
        line->indent = analysis->indent;
5168
861k
        line->beg = analysis->beg;
5169
861k
        line->end = analysis->end;
5170
11.0M
    } else {
5171
11.0M
        MD_LINE* line;
5172
5173
11.0M
        line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5174
11.0M
        if(line == NULL)
5175
0
            return -1;
5176
5177
11.0M
        line->beg = analysis->beg;
5178
11.0M
        line->end = analysis->end;
5179
11.0M
    }
5180
11.9M
    ctx->current_block->n_lines++;
5181
5182
11.9M
    return 0;
5183
11.9M
}
5184
5185
static int
5186
md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5187
                        unsigned data, unsigned flags)
5188
53.2M
{
5189
53.2M
    MD_BLOCK* block;
5190
53.2M
    int ret = 0;
5191
5192
53.2M
    MD_CHECK(md_end_current_block(ctx));
5193
5194
53.2M
    block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5195
53.2M
    if(block == NULL)
5196
0
        return -1;
5197
5198
53.2M
    block->type = type;
5199
53.2M
    block->flags = flags;
5200
53.2M
    block->data = data;
5201
53.2M
    block->n_lines = start;
5202
5203
53.2M
abort:
5204
53.2M
    return ret;
5205
53.2M
}
5206
5207
5208
5209
/***********************
5210
 ***  Line Analysis  ***
5211
 ***********************/
5212
5213
static int
5214
md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5215
7.32M
{
5216
7.32M
    OFF off = beg + 1;
5217
7.32M
    int n = 1;
5218
5219
17.4M
    while(off < ctx->size  &&  (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5220
10.1M
        if(CH(off) == CH(beg))
5221
7.50M
            n++;
5222
10.1M
        off++;
5223
10.1M
    }
5224
5225
7.32M
    if(n < 3) {
5226
7.30M
        *p_killer = off;
5227
7.30M
        return FALSE;
5228
7.30M
    }
5229
5230
    /* Nothing else can be present on the line. */
5231
23.2k
    if(off < ctx->size  &&  !ISNEWLINE(off)) {
5232
16.1k
        *p_killer = off;
5233
16.1k
        return FALSE;
5234
16.1k
    }
5235
5236
7.17k
    *p_end = off;
5237
7.17k
    return TRUE;
5238
23.2k
}
5239
5240
static int
5241
md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5242
202k
{
5243
202k
    int n;
5244
202k
    OFF off = beg + 1;
5245
5246
779k
    while(off < ctx->size  &&  CH(off) == _T('#')  &&  off - beg < 7)
5247
576k
        off++;
5248
202k
    n = off - beg;
5249
5250
202k
    if(n > 6)
5251
8.48k
        return FALSE;
5252
194k
    *p_level = n;
5253
5254
194k
    if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)  &&  off < ctx->size  &&
5255
194k
       !ISBLANK(off)  &&  !ISNEWLINE(off))
5256
22.5k
        return FALSE;
5257
5258
216k
    while(off < ctx->size  &&  ISBLANK(off))
5259
44.6k
        off++;
5260
171k
    *p_beg = off;
5261
171k
    *p_end = off;
5262
171k
    return TRUE;
5263
194k
}
5264
5265
static int
5266
md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5267
79.8k
{
5268
79.8k
    OFF off = beg + 1;
5269
5270
90.0k
    while(off < ctx->size  &&  CH(off) == CH(beg))
5271
10.2k
        off++;
5272
5273
    /* Optionally, space(s) or tabs can follow. */
5274
245k
    while(off < ctx->size  &&  ISBLANK(off))
5275
165k
        off++;
5276
5277
    /* But nothing more is allowed on the line. */
5278
79.8k
    if(off < ctx->size  &&  !ISNEWLINE(off))
5279
46.4k
        return FALSE;
5280
5281
33.3k
    *p_level = (CH(beg) == _T('=') ? 1 : 2);
5282
33.3k
    *p_end = off;
5283
33.3k
    return TRUE;
5284
79.8k
}
5285
5286
static int
5287
md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5288
28.4k
{
5289
28.4k
    OFF off = beg;
5290
28.4k
    int found_pipe = FALSE;
5291
28.4k
    unsigned col_count = 0;
5292
5293
28.4k
    if(off < ctx->size  &&  CH(off) == _T('|')) {
5294
13.0k
        found_pipe = TRUE;
5295
13.0k
        off++;
5296
627k
        while(off < ctx->size  &&  ISWHITESPACE(off))
5297
614k
            off++;
5298
13.0k
    }
5299
5300
181k
    while(1) {
5301
181k
        int delimited = FALSE;
5302
5303
        /* Cell underline ("-----", ":----", "----:" or ":----:") */
5304
181k
        if(off < ctx->size  &&  CH(off) == _T(':'))
5305
4.84k
            off++;
5306
181k
        if(off >= ctx->size  ||  CH(off) != _T('-'))
5307
8.07k
            return FALSE;
5308
353k
        while(off < ctx->size  &&  CH(off) == _T('-'))
5309
179k
            off++;
5310
173k
        if(off < ctx->size  &&  CH(off) == _T(':'))
5311
2.35k
            off++;
5312
5313
173k
        col_count++;
5314
173k
        if(col_count > TABLE_MAXCOLCOUNT) {
5315
1.09k
            MD_LOG("Suppressing table (column_count >" STRINGIZE(TABLE_MAXCOLCOUNT) ")");
5316
1.09k
            return FALSE;
5317
1.09k
        }
5318
5319
        /* Pipe delimiter (optional at the end of line). */
5320
220k
        while(off < ctx->size  &&  ISWHITESPACE(off))
5321
48.7k
            off++;
5322
172k
        if(off < ctx->size  &&  CH(off) == _T('|')) {
5323
160k
            delimited = TRUE;
5324
160k
            found_pipe =  TRUE;
5325
160k
            off++;
5326
166k
            while(off < ctx->size  &&  ISWHITESPACE(off))
5327
5.94k
                off++;
5328
160k
        }
5329
5330
        /* Success, if we reach end of line. */
5331
172k
        if(off >= ctx->size  ||  ISNEWLINE(off))
5332
14.9k
            break;
5333
5334
157k
        if(!delimited)
5335
4.39k
            return FALSE;
5336
157k
    }
5337
5338
14.9k
    if(!found_pipe)
5339
605
        return FALSE;
5340
5341
14.3k
    *p_end = off;
5342
14.3k
    *p_col_count = col_count;
5343
14.3k
    return TRUE;
5344
14.9k
}
5345
5346
static int
5347
md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5348
275k
{
5349
275k
    OFF off = beg;
5350
5351
835k
    while(off < ctx->size && CH(off) == CH(beg))
5352
559k
        off++;
5353
5354
    /* Fence must have at least three characters. */
5355
275k
    if(off - beg < 3)
5356
202k
        return FALSE;
5357
5358
73.6k
    ctx->code_fence_length = off - beg;
5359
5360
    /* Optionally, space(s) can follow. */
5361
90.0k
    while(off < ctx->size  &&  CH(off) == _T(' '))
5362
16.4k
        off++;
5363
5364
    /* Optionally, an info string can follow. */
5365
1.89M
    while(off < ctx->size  &&  !ISNEWLINE(off)) {
5366
        /* Backtick-based fence must not contain '`' in the info string. */
5367
1.82M
        if(CH(beg) == _T('`')  &&  CH(off) == _T('`'))
5368
4.84k
            return FALSE;
5369
1.82M
        off++;
5370
1.82M
    }
5371
5372
68.7k
    *p_end = off;
5373
68.7k
    return TRUE;
5374
73.6k
}
5375
5376
static int
5377
md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5378
364k
{
5379
364k
    OFF off = beg;
5380
364k
    int ret = FALSE;
5381
5382
    /* Closing fence must have at least the same length and use same char as
5383
     * opening one. */
5384
1.89M
    while(off < ctx->size  &&  CH(off) == ch)
5385
1.53M
        off++;
5386
364k
    if(off - beg < ctx->code_fence_length)
5387
358k
        goto out;
5388
5389
    /* Optionally, space(s) can follow */
5390
11.4k
    while(off < ctx->size  &&  CH(off) == _T(' '))
5391
5.94k
        off++;
5392
5393
    /* But nothing more is allowed on the line. */
5394
5.53k
    if(off < ctx->size  &&  !ISNEWLINE(off))
5395
3.60k
        goto out;
5396
5397
1.92k
    ret = TRUE;
5398
5399
364k
out:
5400
    /* Note we set *p_end even on failure: If we are not closing fence, caller
5401
     * would eat the line anyway without any parsing. */
5402
364k
    *p_end = off;
5403
364k
    return ret;
5404
1.92k
}
5405
5406
5407
/* Helper data for md_is_html_block_start_condition() and
5408
 * md_is_html_block_end_condition() */
5409
typedef struct TAG_tag TAG;
5410
struct TAG_tag {
5411
    const CHAR* name;
5412
    unsigned len    : 8;
5413
};
5414
5415
#ifdef X
5416
    #undef X
5417
#endif
5418
#define X(name)     { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5419
#define Xend        { NULL, 0 }
5420
5421
static const TAG t1[] = { X("pre"), X("script"), X("style"), X("textarea"), Xend };
5422
5423
static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5424
static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5425
static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5426
static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5427
                          X("div"), X("dl"), X("dt"), Xend };
5428
static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5429
                          X("form"), X("frame"), X("frameset"), Xend };
5430
static const TAG h6[] = { X("h1"), X("h2"), X("h3"), X("h4"), X("h5"), X("h6"),
5431
                          X("head"), X("header"), X("hr"), X("html"), Xend };
5432
static const TAG i6[] = { X("iframe"), Xend };
5433
static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5434
static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5435
static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5436
static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5437
static const TAG p6[] = { X("p"), X("param"), Xend };
5438
static const TAG s6[] = { X("search"), X("section"), X("summary"), Xend };
5439
static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5440
                          X("thead"), X("title"), X("tr"), X("track"), Xend };
5441
static const TAG u6[] = { X("ul"), Xend };
5442
static const TAG xx[] = { Xend };
5443
5444
#undef X
5445
#undef Xend
5446
5447
/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5448
 * (Refer to CommonMark specification for details about the types.)
5449
 */
5450
static int
5451
md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5452
538k
{
5453
    /* Type 6 is started by a long list of allowed tags. We use two-level
5454
     * tree to speed-up the search. */
5455
538k
    static const TAG* map6[26] = {
5456
538k
        a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5457
538k
        n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5458
538k
    };
5459
538k
    OFF off = beg + 1;
5460
538k
    int i;
5461
5462
    /* Check for type 1: <script, <pre, or <style */
5463
2.66M
    for(i = 0; t1[i].name != NULL; i++) {
5464
2.13M
        if(off + t1[i].len <= ctx->size) {
5465
2.13M
            if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5466
12.8k
                return 1;
5467
2.13M
        }
5468
2.13M
    }
5469
5470
    /* Check for type 2: <!-- */
5471
525k
    if(off + 3 < ctx->size  &&  CH(off) == _T('!')  &&  CH(off+1) == _T('-')  &&  CH(off+2) == _T('-'))
5472
3.92k
        return 2;
5473
5474
    /* Check for type 3: <? */
5475
521k
    if(off < ctx->size  &&  CH(off) == _T('?'))
5476
7.52k
        return 3;
5477
5478
    /* Check for type 4 or 5: <! */
5479
514k
    if(off < ctx->size  &&  CH(off) == _T('!')) {
5480
        /* Check for type 4: <! followed by uppercase letter. */
5481
8.16k
        if(off + 1 < ctx->size  &&  ISASCII(off+1))
5482
6.87k
            return 4;
5483
5484
        /* Check for type 5: <![CDATA[ */
5485
1.29k
        if(off + 8 < ctx->size) {
5486
1.25k
            if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5487
0
                return 5;
5488
1.25k
        }
5489
1.29k
    }
5490
5491
    /* Check for type 6: Many possible starting tags listed above. */
5492
507k
    if(off + 1 < ctx->size  &&  (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5493
192k
        int slot;
5494
192k
        const TAG* tags;
5495
5496
192k
        if(CH(off) == _T('/'))
5497
3.77k
            off++;
5498
5499
192k
        slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5500
192k
        tags = map6[slot];
5501
5502
907k
        for(i = 0; tags[i].name != NULL; i++) {
5503
777k
            if(off + tags[i].len <= ctx->size) {
5504
776k
                if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5505
63.3k
                    OFF tmp = off + tags[i].len;
5506
63.3k
                    if(tmp >= ctx->size)
5507
7
                        return 6;
5508
63.3k
                    if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5509
37.8k
                        return 6;
5510
25.5k
                    if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5511
547
                        return 6;
5512
24.9k
                    break;
5513
25.5k
                }
5514
776k
            }
5515
777k
        }
5516
192k
    }
5517
5518
    /* Check for type 7: any COMPLETE other opening or closing tag. */
5519
468k
    if(off + 1 < ctx->size) {
5520
468k
        OFF end;
5521
5522
468k
        if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5523
            /* Only optional whitespace and new line may follow. */
5524
11.6k
            while(end < ctx->size  &&  ISWHITESPACE(end))
5525
2.36k
                end++;
5526
9.30k
            if(end >= ctx->size  ||  ISNEWLINE(end))
5527
5.52k
                return 7;
5528
9.30k
        }
5529
468k
    }
5530
5531
463k
    return FALSE;
5532
468k
}
5533
5534
/* Case sensitive check whether there is a substring 'what' between 'beg'
5535
 * and end of line. */
5536
static int
5537
md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5538
155k
{
5539
155k
    OFF i;
5540
1.78M
    for(i = beg; i + what_len < ctx->size; i++) {
5541
1.78M
        if(ISNEWLINE(i))
5542
140k
            break;
5543
1.64M
        if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5544
15.2k
            *p_end = i + what_len;
5545
15.2k
            return TRUE;
5546
15.2k
        }
5547
1.64M
    }
5548
5549
140k
    *p_end = i;
5550
140k
    return FALSE;
5551
155k
}
5552
5553
/* Returns type of HTML block end condition or FALSE if not an end condition.
5554
 *
5555
 * Note it fills p_end even when it is not end condition as the caller
5556
 * does not need to analyze contents of a raw HTML block.
5557
 */
5558
static int
5559
md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5560
507k
{
5561
507k
    switch(ctx->html_block_type) {
5562
252k
        case 1:
5563
252k
        {
5564
252k
            OFF off = beg;
5565
252k
            int i;
5566
5567
3.73M
            while(off+1 < ctx->size  &&  !ISNEWLINE(off)) {
5568
3.48M
                if(CH(off) == _T('<')  &&  CH(off+1) == _T('/')) {
5569
345k
                    for(i = 0; t1[i].name != NULL; i++) {
5570
276k
                        if(off + 2 + t1[i].len < ctx->size) {
5571
275k
                            if(md_ascii_case_eq(STR(off+2), t1[i].name, t1[i].len)  &&
5572
275k
                               CH(off+2+t1[i].len) == _T('>'))
5573
1.68k
                            {
5574
1.68k
                                *p_end = off+2+t1[i].len+1;
5575
1.68k
                                return TRUE;
5576
1.68k
                            }
5577
275k
                        }
5578
276k
                    }
5579
69.8k
                }
5580
3.48M
                off++;
5581
3.48M
            }
5582
251k
            *p_end = off;
5583
251k
            return FALSE;
5584
252k
        }
5585
5586
24.7k
        case 2:
5587
24.7k
            return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5588
5589
22.2k
        case 3:
5590
22.2k
            return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5591
5592
108k
        case 4:
5593
108k
            return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5594
5595
0
        case 5:
5596
0
            return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5597
5598
96.4k
        case 6:     /* Pass through */
5599
98.6k
        case 7:
5600
98.6k
            if(beg >= ctx->size  ||  ISNEWLINE(beg)) {
5601
                /* Blank line ends types 6 and 7. */
5602
38.9k
                *p_end = beg;
5603
38.9k
                return ctx->html_block_type;
5604
38.9k
            }
5605
59.7k
            return FALSE;
5606
5607
0
        default:
5608
0
            MD_UNREACHABLE();
5609
507k
    }
5610
0
    return FALSE;
5611
507k
}
5612
5613
5614
static int
5615
md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5616
7.08M
{
5617
    /* Block quote has no "items" like lists. */
5618
7.08M
    if(container->ch == _T('>'))
5619
3.25M
        return FALSE;
5620
5621
3.83M
    if(container->ch != pivot->ch)
5622
3.36M
        return FALSE;
5623
468k
    if(container->mark_indent > pivot->contents_indent)
5624
2.80k
        return FALSE;
5625
5626
466k
    return TRUE;
5627
468k
}
5628
5629
static int
5630
md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5631
19.2M
{
5632
19.2M
    if(ctx->n_containers >= ctx->alloc_containers) {
5633
4.62k
        MD_CONTAINER* new_containers;
5634
5635
4.62k
        ctx->alloc_containers = (ctx->alloc_containers > 0
5636
4.62k
                ? ctx->alloc_containers + ctx->alloc_containers / 2
5637
4.62k
                : 16);
5638
4.62k
        new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5639
4.62k
        if(new_containers == NULL) {
5640
0
            MD_LOG("realloc() failed.");
5641
0
            return -1;
5642
0
        }
5643
5644
4.62k
        ctx->containers = new_containers;
5645
4.62k
    }
5646
5647
19.2M
    memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5648
19.2M
    return 0;
5649
19.2M
}
5650
5651
static int
5652
md_enter_child_containers(MD_CTX* ctx, int n_children)
5653
6.78M
{
5654
6.78M
    int i;
5655
6.78M
    int ret = 0;
5656
5657
26.0M
    for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5658
19.2M
        MD_CONTAINER* c = &ctx->containers[i];
5659
19.2M
        int is_ordered_list = FALSE;
5660
5661
19.2M
        switch(c->ch) {
5662
18.3k
            case _T(')'):
5663
21.1k
            case _T('.'):
5664
21.1k
                is_ordered_list = TRUE;
5665
21.1k
                MD_FALLTHROUGH();
5666
5667
6.61M
            case _T('-'):
5668
6.76M
            case _T('+'):
5669
6.85M
            case _T('*'):
5670
                /* Remember offset in ctx->block_bytes so we can revisit the
5671
                 * block if we detect it is a loose list. */
5672
6.85M
                md_end_current_block(ctx);
5673
6.85M
                c->block_byte_off = ctx->n_block_bytes;
5674
5675
6.85M
                MD_CHECK(md_push_container_bytes(ctx,
5676
6.85M
                                (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5677
6.85M
                                c->start, c->ch, MD_BLOCK_CONTAINER_OPENER));
5678
6.85M
                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5679
6.85M
                                c->task_mark_off,
5680
6.85M
                                (c->is_task ? CH(c->task_mark_off) : 0),
5681
6.85M
                                MD_BLOCK_CONTAINER_OPENER));
5682
6.85M
                break;
5683
5684
12.4M
            case _T('>'):
5685
12.4M
                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5686
12.4M
                break;
5687
5688
12.4M
            default:
5689
0
                MD_UNREACHABLE();
5690
0
                break;
5691
19.2M
        }
5692
19.2M
    }
5693
5694
6.78M
abort:
5695
6.78M
    return ret;
5696
6.78M
}
5697
5698
static int
5699
md_leave_child_containers(MD_CTX* ctx, int n_keep)
5700
6.96M
{
5701
6.96M
    int ret = 0;
5702
5703
26.2M
    while(ctx->n_containers > n_keep) {
5704
19.2M
        MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5705
19.2M
        int is_ordered_list = FALSE;
5706
5707
19.2M
        switch(c->ch) {
5708
18.3k
            case _T(')'):
5709
21.1k
            case _T('.'):
5710
21.1k
                is_ordered_list = TRUE;
5711
21.1k
                MD_FALLTHROUGH();
5712
5713
6.61M
            case _T('-'):
5714
6.76M
            case _T('+'):
5715
6.85M
            case _T('*'):
5716
6.85M
                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5717
6.85M
                                c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5718
6.85M
                                MD_BLOCK_CONTAINER_CLOSER));
5719
6.85M
                MD_CHECK(md_push_container_bytes(ctx,
5720
6.85M
                                (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5721
6.85M
                                c->ch, MD_BLOCK_CONTAINER_CLOSER));
5722
6.85M
                break;
5723
5724
12.4M
            case _T('>'):
5725
12.4M
                MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5726
12.4M
                                0, MD_BLOCK_CONTAINER_CLOSER));
5727
12.4M
                break;
5728
5729
12.4M
            default:
5730
0
                MD_UNREACHABLE();
5731
0
                break;
5732
19.2M
        }
5733
5734
19.2M
        ctx->n_containers--;
5735
19.2M
    }
5736
5737
6.96M
abort:
5738
6.96M
    return ret;
5739
6.96M
}
5740
5741
static int
5742
md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5743
38.5M
{
5744
38.5M
    OFF off = beg;
5745
38.5M
    OFF max_end;
5746
5747
38.5M
    if(off >= ctx->size  ||  indent >= ctx->code_indent_offset)
5748
4.48k
        return FALSE;
5749
5750
    /* Check for block quote mark. */
5751
38.5M
    if(CH(off) == _T('>')) {
5752
15.6M
        off++;
5753
15.6M
        p_container->ch = _T('>');
5754
15.6M
        p_container->is_loose = FALSE;
5755
15.6M
        p_container->is_task = FALSE;
5756
15.6M
        p_container->mark_indent = indent;
5757
15.6M
        p_container->contents_indent = indent + 1;
5758
15.6M
        *p_end = off;
5759
15.6M
        return TRUE;
5760
15.6M
    }
5761
5762
    /* Check for list item bullet mark. */
5763
22.8M
    if(ISANYOF(off, _T("-+*"))  &&  (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1))) {
5764
10.6M
        p_container->ch = CH(off);
5765
10.6M
        p_container->is_loose = FALSE;
5766
10.6M
        p_container->is_task = FALSE;
5767
10.6M
        p_container->mark_indent = indent;
5768
10.6M
        p_container->contents_indent = indent + 1;
5769
10.6M
        *p_end = off+1;
5770
10.6M
        return TRUE;
5771
10.6M
    }
5772
5773
    /* Check for ordered list item marks. */
5774
12.1M
    max_end = off + 9;
5775
12.1M
    if(max_end > ctx->size)
5776
13.8k
        max_end = ctx->size;
5777
12.1M
    p_container->start = 0;
5778
12.3M
    while(off < max_end  &&  ISDIGIT(off)) {
5779
124k
        p_container->start = p_container->start * 10 + CH(off) - _T('0');
5780
124k
        off++;
5781
124k
    }
5782
12.1M
    if(off > beg  &&
5783
12.1M
       off < ctx->size  &&
5784
12.1M
       (CH(off) == _T('.') || CH(off) == _T(')'))  &&
5785
12.1M
       (off+1 >= ctx->size || ISBLANK(off+1) || ISNEWLINE(off+1)))
5786
29.8k
    {
5787
29.8k
        p_container->ch = CH(off);
5788
29.8k
        p_container->is_loose = FALSE;
5789
29.8k
        p_container->is_task = FALSE;
5790
29.8k
        p_container->mark_indent = indent;
5791
29.8k
        p_container->contents_indent = indent + off - beg + 1;
5792
29.8k
        *p_end = off+1;
5793
29.8k
        return TRUE;
5794
29.8k
    }
5795
5796
12.1M
    return FALSE;
5797
12.1M
}
5798
5799
static unsigned
5800
md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5801
39.4M
{
5802
39.4M
    OFF off = beg;
5803
39.4M
    unsigned indent = total_indent;
5804
5805
52.5M
    while(off < ctx->size  &&  ISBLANK(off)) {
5806
13.1M
        if(CH(off) == _T('\t'))
5807
12.7M
            indent = (indent + 4) & ~3;
5808
439k
        else
5809
439k
            indent++;
5810
13.1M
        off++;
5811
13.1M
    }
5812
5813
39.4M
    *p_end = off;
5814
39.4M
    return indent - total_indent;
5815
39.4M
}
5816
5817
/* Constant line analysis of type MD_LINE_BLANK with all the other members
 * zeroed. md_analyze_line() substitutes it for the pivot line once it
 * recognizes a mark of another item of an already started list. */
static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0, 0, 0, 0, 0 };
5818
5819
/* Analyze type of the line and find some its properties. This serves as a
5820
 * main input for determining type and boundaries of a block. */
5821
static int
5822
md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5823
                const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5824
19.3M
{
5825
19.3M
    unsigned total_indent = 0;
5826
19.3M
    int n_parents = 0;
5827
19.3M
    int n_brothers = 0;
5828
19.3M
    int n_children = 0;
5829
19.3M
    MD_CONTAINER container = { 0 };
5830
19.3M
    int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5831
19.3M
    OFF off = beg;
5832
19.3M
    OFF hr_killer = 0;
5833
19.3M
    int ret = 0;
5834
5835
19.3M
    line->indent = md_line_indentation(ctx, total_indent, off, &off);
5836
19.3M
    total_indent += line->indent;
5837
19.3M
    line->beg = off;
5838
19.3M
    line->enforce_new_block = FALSE;
5839
5840
    /* Given the indentation and block quote marks '>', determine how many of
5841
     * the current containers are our parents. */
5842
19.7M
    while(n_parents < ctx->n_containers) {
5843
8.61M
        MD_CONTAINER* c = &ctx->containers[n_parents];
5844
5845
8.61M
        if(c->ch == _T('>')  &&  line->indent < ctx->code_indent_offset  &&
5846
8.61M
            off < ctx->size  &&  CH(off) == _T('>'))
5847
325k
        {
5848
            /* Block quote mark. */
5849
325k
            off++;
5850
325k
            total_indent++;
5851
325k
            line->indent = md_line_indentation(ctx, total_indent, off, &off);
5852
325k
            total_indent += line->indent;
5853
5854
            /* The optional 1st space after '>' is part of the block quote mark. */
5855
325k
            if(line->indent > 0)
5856
16.4k
                line->indent--;
5857
5858
325k
            line->beg = off;
5859
5860
8.28M
        } else if(c->ch != _T('>')  &&  line->indent >= c->contents_indent) {
5861
            /* List. */
5862
34.4k
            line->indent -= c->contents_indent;
5863
8.25M
        } else {
5864
8.25M
            break;
5865
8.25M
        }
5866
5867
359k
        n_parents++;
5868
359k
    }
5869
5870
19.3M
    if(off >= ctx->size  ||  ISNEWLINE(off)) {
5871
        /* Blank line does not need any real indentation to be nested inside
5872
         * a list. */
5873
699k
        if(n_brothers + n_children == 0) {
5874
828k
            while(n_parents < ctx->n_containers  &&  ctx->containers[n_parents].ch != _T('>'))
5875
128k
                n_parents++;
5876
699k
        }
5877
699k
    }
5878
5879
39.1M
    while(TRUE) {
5880
        /* Check whether we are fenced code continuation. */
5881
39.1M
        if(pivot_line->type == MD_LINE_FENCEDCODE) {
5882
365k
            line->beg = off;
5883
5884
            /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5885
             * which we transform into MD_LINE_BLANK. */
5886
365k
            if(line->indent < ctx->code_indent_offset) {
5887
364k
                if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5888
1.92k
                    line->type = MD_LINE_BLANK;
5889
1.92k
                    ctx->last_line_has_list_loosening_effect = FALSE;
5890
1.92k
                    break;
5891
1.92k
                }
5892
364k
            }
5893
5894
            /* Change indentation accordingly to the initial code fence. */
5895
363k
            if(n_parents == ctx->n_containers) {
5896
297k
                if(line->indent > pivot_line->indent)
5897
2.53k
                    line->indent -= pivot_line->indent;
5898
294k
                else
5899
294k
                    line->indent = 0;
5900
5901
297k
                line->type = MD_LINE_FENCEDCODE;
5902
297k
                break;
5903
297k
            }
5904
363k
        }
5905
5906
        /* Check whether we are HTML block continuation. */
5907
38.8M
        if(pivot_line->type == MD_LINE_HTML  &&  ctx->html_block_type > 0) {
5908
450k
            if(n_parents < ctx->n_containers) {
5909
                /* HTML block is implicitly ended if the enclosing container
5910
                 * block ends. */
5911
13.6k
                ctx->html_block_type = 0;
5912
436k
            } else {
5913
436k
                int html_block_type;
5914
5915
436k
                html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5916
436k
                if(html_block_type > 0) {
5917
42.4k
                    MD_ASSERT(html_block_type == ctx->html_block_type);
5918
5919
                    /* Make sure this is the last line of the block. */
5920
42.4k
                    ctx->html_block_type = 0;
5921
5922
                    /* Some end conditions serve as blank lines at the same time. */
5923
42.4k
                    if(html_block_type == 6 || html_block_type == 7) {
5924
38.9k
                        line->type = MD_LINE_BLANK;
5925
38.9k
                        line->indent = 0;
5926
38.9k
                        break;
5927
38.9k
                    }
5928
42.4k
                }
5929
5930
397k
                line->type = MD_LINE_HTML;
5931
397k
                n_parents = ctx->n_containers;
5932
397k
                break;
5933
436k
            }
5934
450k
        }
5935
5936
        /* Check for blank line. */
5937
38.3M
        if(off >= ctx->size  ||  ISNEWLINE(off)) {
5938
7.42M
            if(pivot_line->type == MD_LINE_INDENTEDCODE  &&  n_parents == ctx->n_containers) {
5939
12.4k
                line->type = MD_LINE_INDENTEDCODE;
5940
12.4k
                if(line->indent > ctx->code_indent_offset)
5941
2.55k
                    line->indent -= ctx->code_indent_offset;
5942
9.89k
                else
5943
9.89k
                    line->indent = 0;
5944
12.4k
                ctx->last_line_has_list_loosening_effect = FALSE;
5945
7.40M
            } else {
5946
7.40M
                line->type = MD_LINE_BLANK;
5947
7.40M
                ctx->last_line_has_list_loosening_effect = (n_parents > 0  &&
5948
7.40M
                        n_brothers + n_children == 0  &&
5949
7.40M
                        ctx->containers[n_parents-1].ch != _T('>'));
5950
5951
7.40M
    #if 1
5952
                /* See https://github.com/mity/md4c/issues/6
5953
                 *
5954
                 * This ugly checking tests we are in (yet empty) list item but
5955
                 * not its very first line (i.e. not the line with the list
5956
                 * item mark).
5957
                 *
5958
                 * If we are such a blank line, then any following non-blank
5959
                 * line which would be part of the list item actually has to
5960
                 * end the list because according to the specification, "a list
5961
                 * item can begin with at most one blank line."
5962
                 */
5963
7.40M
                if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
5964
7.40M
                   n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5965
7.40M
                   ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5966
64.2k
                {
5967
64.2k
                    MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5968
64.2k
                    if(top_block->type == MD_BLOCK_LI)
5969
56.5k
                        ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5970
64.2k
                }
5971
7.40M
    #endif
5972
7.40M
            }
5973
7.42M
            break;
5974
30.9M
        } else {
5975
30.9M
    #if 1
5976
            /* This is the 2nd half of the hack. If the flag is set (i.e. there
5977
             * was a 2nd blank line at the beginning of the list item) and if
5978
             * we would otherwise still belong to the list item, we enforce
5979
             * the end of the list. */
5980
30.9M
            if(ctx->last_list_item_starts_with_two_blank_lines) {
5981
39.8k
                if(n_parents > 0  &&  n_parents == ctx->n_containers  &&
5982
39.8k
                   ctx->containers[n_parents-1].ch != _T('>')  &&
5983
39.8k
                   n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5984
39.8k
                   ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5985
12.3k
                {
5986
12.3k
                    MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5987
12.3k
                    if(top_block->type == MD_BLOCK_LI) {
5988
7.32k
                        n_parents--;
5989
5990
7.32k
                        line->indent = total_indent;
5991
7.32k
                        if(n_parents > 0)
5992
2.65k
                            line->indent -= MIN(line->indent, ctx->containers[n_parents-1].contents_indent);
5993
7.32k
                    }
5994
12.3k
                }
5995
5996
39.8k
                ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5997
39.8k
            }
5998
30.9M
    #endif
5999
30.9M
            ctx->last_line_has_list_loosening_effect = FALSE;
6000
30.9M
        }
6001
6002
        /* Check whether we are Setext underline. */
6003
30.9M
        if(line->indent < ctx->code_indent_offset  &&  pivot_line->type == MD_LINE_TEXT
6004
30.9M
            &&  off < ctx->size  &&  ISANYOF2(off, _T('='), _T('-'))
6005
30.9M
            &&  (n_parents == ctx->n_containers))
6006
79.8k
        {
6007
79.8k
            unsigned level;
6008
6009
79.8k
            if(md_is_setext_underline(ctx, off, &off, &level)) {
6010
33.3k
                line->type = MD_LINE_SETEXTUNDERLINE;
6011
33.3k
                line->data = level;
6012
33.3k
                break;
6013
33.3k
            }
6014
79.8k
        }
6015
6016
        /* Check for thematic break line. */
6017
30.9M
        if(line->indent < ctx->code_indent_offset
6018
30.9M
            &&  off < ctx->size  &&  off >= hr_killer
6019
30.9M
            &&  ISANYOF(off, _T("-_*")))
6020
7.32M
        {
6021
7.32M
            if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
6022
7.17k
                line->type = MD_LINE_HR;
6023
7.17k
                break;
6024
7.17k
            }
6025
7.32M
        }
6026
6027
        /* Check for "brother" container. I.e. whether we are another list item
6028
         * in already started list. */
6029
30.9M
        if(n_parents < ctx->n_containers  &&  n_brothers + n_children == 0) {
6030
8.11M
            OFF tmp;
6031
6032
8.11M
            if(md_is_container_mark(ctx, line->indent, off, &tmp, &container)  &&
6033
8.11M
               md_is_container_compatible(&ctx->containers[n_parents], &container))
6034
466k
            {
6035
466k
                pivot_line = &md_dummy_blank_line;
6036
6037
466k
                off = tmp;
6038
6039
466k
                total_indent += container.contents_indent - container.mark_indent;
6040
466k
                line->indent = md_line_indentation(ctx, total_indent, off, &off);
6041
466k
                total_indent += line->indent;
6042
466k
                line->beg = off;
6043
6044
                /* Some of the following whitespace actually still belongs to the mark. */
6045
466k
                if(off >= ctx->size || ISNEWLINE(off)) {
6046
264k
                    container.contents_indent++;
6047
264k
                } else if(line->indent <= ctx->code_indent_offset) {
6048
199k
                    container.contents_indent += line->indent;
6049
199k
                    line->indent = 0;
6050
199k
                } else {
6051
1.79k
                    container.contents_indent += 1;
6052
1.79k
                    line->indent--;
6053
1.79k
                }
6054
6055
466k
                ctx->containers[n_parents].mark_indent = container.mark_indent;
6056
466k
                ctx->containers[n_parents].contents_indent = container.contents_indent;
6057
6058
466k
                n_brothers++;
6059
466k
                continue;
6060
466k
            }
6061
8.11M
        }
6062
6063
        /* Check for indented code.
6064
         * Note indented code block cannot interrupt a paragraph. */
6065
30.4M
        if(line->indent >= ctx->code_indent_offset  &&  (pivot_line->type != MD_LINE_TEXT)) {
6066
15.4k
            line->type = MD_LINE_INDENTEDCODE;
6067
15.4k
            line->indent -= ctx->code_indent_offset;
6068
15.4k
            line->data = 0;
6069
15.4k
            break;
6070
15.4k
        }
6071
6072
        /* Check for start of a new container block. */
6073
30.4M
        if(line->indent < ctx->code_indent_offset  &&
6074
30.4M
           md_is_container_mark(ctx, line->indent, off, &off, &container))
6075
19.3M
        {
6076
19.3M
            if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
6077
19.3M
                        (off >= ctx->size || ISNEWLINE(off))  &&  container.ch != _T('>'))
6078
13.6k
            {
6079
                /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
6080
19.2M
            } else if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
6081
19.2M
                        ISANYOF2_(container.ch, _T('.'), _T(')'))  &&  container.start != 1)
6082
2.70k
            {
6083
                /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
6084
19.2M
            } else {
6085
19.2M
                total_indent += container.contents_indent - container.mark_indent;
6086
19.2M
                line->indent = md_line_indentation(ctx, total_indent, off, &off);
6087
19.2M
                total_indent += line->indent;
6088
6089
19.2M
                line->beg = off;
6090
19.2M
                line->data = container.ch;
6091
6092
                /* Some of the following whitespace actually still belongs to the mark. */
6093
19.2M
                if(off >= ctx->size || ISNEWLINE(off)) {
6094
6.65M
                    container.contents_indent++;
6095
12.6M
                } else if(line->indent <= ctx->code_indent_offset) {
6096
12.6M
                    container.contents_indent += line->indent;
6097
12.6M
                    line->indent = 0;
6098
12.6M
                } else {
6099
2.76k
                    container.contents_indent += 1;
6100
2.76k
                    line->indent--;
6101
2.76k
                }
6102
6103
19.2M
                if(n_brothers + n_children == 0)
6104
6.78M
                    pivot_line = &md_dummy_blank_line;
6105
6106
19.2M
                if(n_children == 0)
6107
6.78M
                    MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6108
6109
19.2M
                n_children++;
6110
19.2M
                MD_CHECK(md_push_container(ctx, &container));
6111
19.2M
                continue;
6112
19.2M
            }
6113
19.3M
        }
6114
6115
        /* Check whether we are table continuation. */
6116
11.1M
        if(pivot_line->type == MD_LINE_TABLE  &&  n_parents == ctx->n_containers) {
6117
720k
            line->type = MD_LINE_TABLE;
6118
720k
            break;
6119
720k
        }
6120
6121
        /* Check for ATX header. */
6122
10.4M
        if(line->indent < ctx->code_indent_offset  &&
6123
10.4M
                off < ctx->size  &&  CH(off) == _T('#'))
6124
202k
        {
6125
202k
            unsigned level;
6126
6127
202k
            if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
6128
171k
                line->type = MD_LINE_ATXHEADER;
6129
171k
                line->data = level;
6130
171k
                break;
6131
171k
            }
6132
202k
        }
6133
6134
        /* Check whether we are starting code fence. */
6135
10.2M
        if(line->indent < ctx->code_indent_offset  &&
6136
10.2M
                off < ctx->size  &&  ISANYOF2(off, _T('`'), _T('~')))
6137
275k
        {
6138
275k
            if(md_is_opening_code_fence(ctx, off, &off)) {
6139
68.7k
                line->type = MD_LINE_FENCEDCODE;
6140
68.7k
                line->data = 1;
6141
68.7k
                line->enforce_new_block = TRUE;
6142
68.7k
                break;
6143
68.7k
            }
6144
275k
        }
6145
6146
        /* Check for start of raw HTML block. */
6147
10.1M
        if(off < ctx->size  &&  CH(off) == _T('<')
6148
10.1M
            &&  !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6149
538k
        {
6150
538k
            ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
6151
6152
            /* HTML block type 7 cannot interrupt paragraph. */
6153
538k
            if(ctx->html_block_type == 7  &&  pivot_line->type == MD_LINE_TEXT)
6154
4.83k
                ctx->html_block_type = 0;
6155
6156
538k
            if(ctx->html_block_type > 0) {
6157
                /* The line itself also may immediately close the block. */
6158
70.2k
                if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
6159
                    /* Make sure this is the last line of the block. */
6160
13.3k
                    ctx->html_block_type = 0;
6161
13.3k
                }
6162
6163
70.2k
                line->enforce_new_block = TRUE;
6164
70.2k
                line->type = MD_LINE_HTML;
6165
70.2k
                break;
6166
70.2k
            }
6167
538k
        }
6168
6169
        /* Check for table underline. */
6170
10.1M
        if((ctx->parser.flags & MD_FLAG_TABLES)  &&  pivot_line->type == MD_LINE_TEXT
6171
10.1M
            &&  off < ctx->size  &&  ISANYOF3(off, _T('|'), _T('-'), _T(':'))
6172
10.1M
            &&  n_parents == ctx->n_containers)
6173
284k
        {
6174
284k
            unsigned col_count;
6175
6176
284k
            if(ctx->current_block != NULL  &&  ctx->current_block->n_lines == 1  &&
6177
284k
                md_is_table_underline(ctx, off, &off, &col_count))
6178
14.3k
            {
6179
14.3k
                line->data = col_count;
6180
14.3k
                line->type = MD_LINE_TABLEUNDERLINE;
6181
14.3k
                break;
6182
14.3k
            }
6183
284k
        }
6184
6185
        /* By default, we are normal text line. */
6186
10.0M
        line->type = MD_LINE_TEXT;
6187
10.0M
        if(pivot_line->type == MD_LINE_TEXT  &&  n_brothers + n_children == 0) {
6188
            /* Lazy continuation. */
6189
9.53M
            n_parents = ctx->n_containers;
6190
9.53M
        }
6191
6192
        /* Check for task mark. */
6193
10.0M
        if((ctx->parser.flags & MD_FLAG_TASKLISTS)  &&  n_brothers + n_children > 0  &&
6194
10.0M
           ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6195
128k
        {
6196
128k
            OFF tmp = off;
6197
6198
128k
            while(tmp < ctx->size  &&  tmp < off + 3  &&  ISBLANK(tmp))
6199
0
                tmp++;
6200
128k
            if(tmp + 2 < ctx->size  &&  CH(tmp) == _T('[')  &&
6201
128k
               ISANYOF(tmp+1, _T("xX "))  &&  CH(tmp+2) == _T(']')  &&
6202
128k
               (tmp + 3 == ctx->size  ||  ISBLANK(tmp+3)  ||  ISNEWLINE(tmp+3)))
6203
31.4k
            {
6204
31.4k
                MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6205
31.4k
                task_container->is_task = TRUE;
6206
31.4k
                task_container->task_mark_off = tmp + 1;
6207
31.4k
                off = tmp + 3;
6208
62.5k
                while(off < ctx->size  &&  ISWHITESPACE(off))
6209
31.0k
                    off++;
6210
31.4k
                line->beg = off;
6211
31.4k
            }
6212
128k
        }
6213
6214
10.0M
        break;
6215
10.1M
    }
6216
6217
    /* Scan for end of the line.
6218
     *
6219
     * Note this is quite a bottleneck of the parsing as we here iterate almost
6220
     * over the complete document.
6221
     */
6222
19.3M
#if defined __linux__ && !defined MD4C_USE_UTF16
6223
    /* Recent glibc versions have superbly optimized strcspn(), even using
6224
     * vectorization if available. */
6225
19.3M
    if(ctx->doc_ends_with_newline  &&  off < ctx->size) {
6226
7.59M
        while(TRUE) {
6227
7.59M
            off += (OFF) strcspn(STR(off), "\r\n");
6228
6229
            /* strcspn() can stop on zero terminator; but that can appear
6230
             * anywhere in the Markdown input... */
6231
7.59M
            if(CH(off) == _T('\0'))
6232
1.32M
                off++;
6233
6.27M
            else
6234
6.27M
                break;
6235
7.59M
        }
6236
6.27M
    } else
6237
13.0M
#endif
6238
13.0M
    {
6239
        /* Optimization: Use some loop unrolling. */
6240
76.6M
        while(off + 3 < ctx->size  &&  !ISNEWLINE(off+0)  &&  !ISNEWLINE(off+1)
6241
76.6M
                                   &&  !ISNEWLINE(off+2)  &&  !ISNEWLINE(off+3))
6242
63.5M
            off += 4;
6243
25.2M
        while(off < ctx->size  &&  !ISNEWLINE(off))
6244
12.1M
            off++;
6245
13.0M
    }
6246
6247
    /* Set end of the line. */
6248
19.3M
    line->end = off;
6249
6250
    /* But for ATX header, we should exclude the optional trailing mark. */
6251
19.3M
    if(line->type == MD_LINE_ATXHEADER) {
6252
171k
        OFF tmp = line->end;
6253
177k
        while(tmp > line->beg && ISBLANK(tmp-1))
6254
5.97k
            tmp--;
6255
207k
        while(tmp > line->beg && CH(tmp-1) == _T('#'))
6256
35.8k
            tmp--;
6257
171k
        if(tmp == line->beg || ISBLANK(tmp-1) || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6258
155k
            line->end = tmp;
6259
171k
    }
6260
6261
    /* Trim trailing spaces. */
6262
19.3M
    if(line->type != MD_LINE_INDENTEDCODE  &&  line->type != MD_LINE_FENCEDCODE  && line->type != MD_LINE_HTML) {
6263
21.3M
        while(line->end > line->beg && ISBLANK(line->end-1))
6264
2.81M
            line->end--;
6265
18.4M
    }
6266
6267
    /* Eat also the new line. */
6268
19.3M
    if(off < ctx->size && CH(off) == _T('\r'))
6269
6.40M
        off++;
6270
19.3M
    if(off < ctx->size && CH(off) == _T('\n'))
6271
12.9M
        off++;
6272
6273
19.3M
    *p_end = off;
6274
6275
    /* If we belong to a list after seeing a blank line, the list is loose. */
6276
19.3M
    if(prev_line_has_list_loosening_effect  &&  line->type != MD_LINE_BLANK  &&  n_parents + n_brothers > 0) {
6277
8.49k
        MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6278
8.49k
        if(c->ch != _T('>')) {
6279
6.23k
            MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6280
6.23k
            block->flags |= MD_BLOCK_LOOSE_LIST;
6281
6.23k
        }
6282
8.49k
    }
6283
6284
    /* Leave any containers we are not part of anymore. */
6285
19.3M
    if(n_children == 0  &&  n_parents + n_brothers < ctx->n_containers)
6286
161k
        MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6287
6288
    /* Enter any container we found a mark for. */
6289
19.3M
    if(n_brothers > 0) {
6290
466k
        MD_ASSERT(n_brothers == 1);
6291
466k
        MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6292
466k
                    ctx->containers[n_parents].task_mark_off,
6293
466k
                    (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6294
466k
                    MD_BLOCK_CONTAINER_CLOSER));
6295
466k
        MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6296
466k
                    container.task_mark_off,
6297
466k
                    (container.is_task ? CH(container.task_mark_off) : 0),
6298
466k
                    MD_BLOCK_CONTAINER_OPENER));
6299
466k
        ctx->containers[n_parents].is_task = container.is_task;
6300
466k
        ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6301
466k
    }
6302
6303
19.3M
    if(n_children > 0)
6304
6.78M
        MD_CHECK(md_enter_child_containers(ctx, n_children));
6305
6306
19.3M
abort:
6307
19.3M
    return ret;
6308
19.3M
}
6309
6310
static int
6311
md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6312
19.3M
{
6313
19.3M
    const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6314
19.3M
    int ret = 0;
6315
6316
    /* Blank line ends current leaf block. */
6317
19.3M
    if(line->type == MD_LINE_BLANK) {
6318
7.45M
        MD_CHECK(md_end_current_block(ctx));
6319
7.45M
        *p_pivot_line = &md_dummy_blank_line;
6320
7.45M
        return 0;
6321
7.45M
    }
6322
6323
11.9M
    if(line->enforce_new_block)
6324
139k
        MD_CHECK(md_end_current_block(ctx));
6325
6326
    /* Some line types form block on their own. */
6327
11.9M
    if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6328
178k
        MD_CHECK(md_end_current_block(ctx));
6329
6330
        /* Add our single-line block. */
6331
178k
        MD_CHECK(md_start_new_block(ctx, line));
6332
178k
        MD_CHECK(md_add_line_into_current_block(ctx, line));
6333
178k
        MD_CHECK(md_end_current_block(ctx));
6334
178k
        *p_pivot_line = &md_dummy_blank_line;
6335
178k
        return 0;
6336
178k
    }
6337
6338
    /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6339
11.7M
    if(line->type == MD_LINE_SETEXTUNDERLINE) {
6340
33.3k
        MD_ASSERT(ctx->current_block != NULL);
6341
33.3k
        ctx->current_block->type = MD_BLOCK_H;
6342
33.3k
        ctx->current_block->data = line->data;
6343
33.3k
        ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6344
33.3k
        MD_CHECK(md_add_line_into_current_block(ctx, line));
6345
33.3k
        MD_CHECK(md_end_current_block(ctx));
6346
33.3k
        if(ctx->current_block == NULL) {
6347
31.4k
            *p_pivot_line = &md_dummy_blank_line;
6348
31.4k
        } else {
6349
            /* This happens if we have consumed all the body as link ref. defs.
6350
             * and downgraded the underline into start of a new paragraph block. */
6351
1.97k
            line->type = MD_LINE_TEXT;
6352
1.97k
            *p_pivot_line = line;
6353
1.97k
        }
6354
33.3k
        return 0;
6355
33.3k
    }
6356
6357
    /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6358
11.6M
    if(line->type == MD_LINE_TABLEUNDERLINE) {
6359
14.3k
        MD_ASSERT(ctx->current_block != NULL);
6360
14.3k
        MD_ASSERT(ctx->current_block->n_lines == 1);
6361
14.3k
        ctx->current_block->type = MD_BLOCK_TABLE;
6362
14.3k
        ctx->current_block->data = line->data;
6363
14.3k
        MD_ASSERT(pivot_line != &md_dummy_blank_line);
6364
14.3k
        ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6365
14.3k
        MD_CHECK(md_add_line_into_current_block(ctx, line));
6366
14.3k
        return 0;
6367
14.3k
    }
6368
6369
    /* The current block also ends if the line has different type. */
6370
11.6M
    if(line->type != pivot_line->type)
6371
502k
        MD_CHECK(md_end_current_block(ctx));
6372
6373
    /* The current line may start a new block. */
6374
11.6M
    if(ctx->current_block == NULL) {
6375
719k
        MD_CHECK(md_start_new_block(ctx, line));
6376
719k
        *p_pivot_line = line;
6377
719k
    }
6378
6379
    /* In all other cases the line is just a continuation of the current block. */
6380
11.6M
    MD_CHECK(md_add_line_into_current_block(ctx, line));
6381
6382
11.6M
abort:
6383
11.6M
    return ret;
6384
11.6M
}
6385
6386
static int
6387
md_process_doc(MD_CTX *ctx)
6388
16.6k
{
6389
16.6k
    const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6390
16.6k
    MD_LINE_ANALYSIS line_buf[2];
6391
16.6k
    MD_LINE_ANALYSIS* line = &line_buf[0];
6392
16.6k
    OFF off = 0;
6393
16.6k
    int ret = 0;
6394
6395
16.6k
    MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6396
6397
19.3M
    while(off < ctx->size) {
6398
19.3M
        if(line == pivot_line)
6399
713k
            line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6400
6401
19.3M
        MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6402
19.3M
        MD_CHECK(md_process_line(ctx, &pivot_line, line));
6403
19.3M
    }
6404
6405
16.6k
    md_end_current_block(ctx);
6406
6407
16.6k
    MD_CHECK(md_build_ref_def_hashtable(ctx));
6408
6409
    /* Process all blocks. */
6410
16.6k
    MD_CHECK(md_leave_child_containers(ctx, 0));
6411
16.6k
    MD_CHECK(md_process_all_blocks(ctx));
6412
6413
16.6k
    MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6414
6415
16.6k
abort:
6416
6417
#if 0
6418
    /* Output some memory consumption statistics. */
6419
    {
6420
        char buffer[256];
6421
        sprintf(buffer, "Alloced %u bytes for block buffer.",
6422
                    (unsigned)(ctx->alloc_block_bytes));
6423
        MD_LOG(buffer);
6424
6425
        sprintf(buffer, "Alloced %u bytes for containers buffer.",
6426
                    (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6427
        MD_LOG(buffer);
6428
6429
        sprintf(buffer, "Alloced %u bytes for marks buffer.",
6430
                    (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6431
        MD_LOG(buffer);
6432
6433
        sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6434
                    (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6435
        MD_LOG(buffer);
6436
    }
6437
#endif
6438
6439
16.6k
    return ret;
6440
16.6k
}
6441
6442
6443
/********************
6444
 ***  Public API  ***
6445
 ********************/
6446
6447
int
6448
md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6449
16.6k
{
6450
16.6k
    MD_CTX ctx;
6451
16.6k
    int i;
6452
16.6k
    int ret;
6453
6454
16.6k
    if(parser->abi_version != 0) {
6455
0
        if(parser->debug_log != NULL)
6456
0
            parser->debug_log("Unsupported abi_version.", userdata);
6457
0
        return -1;
6458
0
    }
6459
6460
    /* Setup context structure. */
6461
16.6k
    memset(&ctx, 0, sizeof(MD_CTX));
6462
16.6k
    ctx.text = text;
6463
16.6k
    ctx.size = size;
6464
16.6k
    memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6465
16.6k
    ctx.userdata = userdata;
6466
16.6k
    ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6467
16.6k
    md_build_mark_char_map(&ctx);
6468
16.6k
    ctx.doc_ends_with_newline = (size > 0  &&  ISNEWLINE_(text[size-1]));
6469
16.6k
    ctx.max_ref_def_output = MIN(MIN(16 * (uint64_t)size, (uint64_t)(1024 * 1024)), (uint64_t)SZ_MAX);
6470
6471
    /* Reset all mark stacks and lists. */
6472
282k
    for(i = 0; i < (int) SIZEOF_ARRAY(ctx.opener_stacks); i++)
6473
265k
        ctx.opener_stacks[i].top = -1;
6474
16.6k
    ctx.ptr_stack.top = -1;
6475
16.6k
    ctx.unresolved_link_head = -1;
6476
16.6k
    ctx.unresolved_link_tail = -1;
6477
16.6k
    ctx.table_cell_boundaries_head = -1;
6478
16.6k
    ctx.table_cell_boundaries_tail = -1;
6479
6480
    /* All the work. */
6481
16.6k
    ret = md_process_doc(&ctx);
6482
6483
    /* Clean-up. */
6484
16.6k
    md_free_ref_defs(&ctx);
6485
16.6k
    md_free_ref_def_hashtable(&ctx);
6486
16.6k
    free(ctx.buffer);
6487
16.6k
    free(ctx.marks);
6488
16.6k
    free(ctx.block_bytes);
6489
16.6k
    free(ctx.containers);
6490
6491
16.6k
    return ret;
6492
16.6k
}