Coverage Report

Created: 2026-03-08 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_sre/sre.c
Line
Count
Source
1
/*
2
 * Secret Labs' Regular Expression Engine
3
 *
4
 * regular expression matching engine
5
 *
6
 * partial history:
7
 * 1999-10-24 fl   created (based on existing template matcher code)
8
 * 2000-03-06 fl   first alpha, sort of
9
 * 2000-08-01 fl   fixes for 1.6b1
10
 * 2000-08-07 fl   use PyOS_CheckStack() if available
11
 * 2000-09-20 fl   added expand method
12
 * 2001-03-20 fl   lots of fixes for 2.1b2
13
 * 2001-04-15 fl   export copyright as Python attribute, not global
14
 * 2001-04-28 fl   added __copy__ methods (work in progress)
15
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
16
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18
 * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19
 * 2001-10-21 fl   added sub/subn primitive
20
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22
 * 2002-11-09 fl   fixed empty sub/subn return type
23
 * 2003-04-18 mvl  fully support 4-byte codes
24
 * 2003-10-17 gn   implemented non recursive scheme
25
 * 2013-02-04 mrab added fullmatch primitive
26
 *
27
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28
 *
29
 * This version of the SRE library can be redistributed under CNRI's
30
 * Python 1.6 license.  For any other use, please contact Secret Labs
31
 * AB (info@pythonware.com).
32
 *
33
 * Portions of this engine have been developed in cooperation with
34
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35
 * other compatibility work.
36
 */
37
38
static const char copyright[] =
39
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41
#include "Python.h"
42
#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
43
#include "pycore_dict.h"             // _PyDict_Next()
44
#include "pycore_long.h"             // _PyLong_GetZero()
45
#include "pycore_moduleobject.h"     // _PyModule_GetState()
46
#include "pycore_unicodeobject.h"    // _PyUnicode_Copy
47
#include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
48
49
#include "sre.h"                     // SRE_CODE
50
51
#include <ctype.h>                   // tolower(), toupper(), isalnum()
52
53
1.24G
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
54
55
// On macOS, use the wide character ctype API using btowc()
56
#if defined(__APPLE__)
57
#  define USE_CTYPE_WINT_T
58
#endif
59
60
0
/* Locale-aware alphanumeric test for one character value.
 *
 * Returns nonzero when the C library classifies `ch` as alphanumeric.
 * When USE_CTYPE_WINT_T is defined the value is widened with btowc()
 * and classified through the wide-character API instead.
 */
static int
sre_isalnum(unsigned int ch)
{
    const int c = (int)ch;
    unsigned int verdict;

#ifdef USE_CTYPE_WINT_T
    verdict = (unsigned int)iswalnum(btowc(c));
#else
    verdict = (unsigned int)isalnum(c);
#endif
    return verdict;
}
67
68
0
/* Locale-aware lowercase conversion for one character value.
 *
 * Uses towlower(btowc()) when USE_CTYPE_WINT_T is defined and plain
 * tolower() otherwise.
 */
static unsigned int
sre_tolower(unsigned int ch)
{
    const int c = (int)ch;

#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towlower(btowc(c));
#else
    return (unsigned int)tolower(c);
#endif
}
75
76
0
/* Locale-aware uppercase conversion for one character value.
 *
 * Uses towupper(btowc()) when USE_CTYPE_WINT_T is defined and plain
 * toupper() otherwise.
 */
static unsigned int
sre_toupper(unsigned int ch)
{
    const int c = (int)ch;

#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towupper(btowc(c));
#else
    return (unsigned int)toupper(c);
#endif
}
83
84
/* Defining this one controls tracing:
85
 * 0 -- disabled
86
 * 1 -- only if the DEBUG flag set
87
 * 2 -- always
88
 */
89
#ifndef VERBOSE
90
#  define VERBOSE 0
91
#endif
92
93
/* -------------------------------------------------------------------- */
94
95
#if defined(_MSC_VER) && !defined(__clang__)
96
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
97
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
98
/* fastest possible local call under MSVC */
99
#define LOCAL(type) static __inline type __fastcall
100
#else
101
#define LOCAL(type) static inline type
102
#endif
103
104
/* error codes */
105
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
106
#define SRE_ERROR_STATE -2 /* illegal state */
107
0
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
108
0
#define SRE_ERROR_MEMORY -9 /* out of memory */
109
0
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
110
111
#if VERBOSE == 0
112
#  define INIT_TRACE(state)
113
#  define DO_TRACE 0
114
#  define TRACE(v)
115
#elif VERBOSE == 1
116
#  define INIT_TRACE(state) int _debug = (state)->debug
117
#  define DO_TRACE (_debug)
118
#  define TRACE(v) do {     \
119
        if (_debug) { \
120
            printf v;       \
121
        }                   \
122
    } while (0)
123
#elif VERBOSE == 2
124
#  define INIT_TRACE(state)
125
#  define DO_TRACE 1
126
#  define TRACE(v) printf v
127
#else
128
#  error VERBOSE must be 0, 1 or 2
129
#endif
130
131
/* -------------------------------------------------------------------- */
132
/* search engine state */
133
134
#define SRE_IS_DIGIT(ch)\
135
462
    ((ch) <= '9' && Py_ISDIGIT(ch))
136
#define SRE_IS_SPACE(ch)\
137
32
    ((ch) <= ' ' && Py_ISSPACE(ch))
138
#define SRE_IS_LINEBREAK(ch)\
139
97.3M
    ((ch) == '\n')
140
#define SRE_IS_WORD(ch)\
141
11.5M
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
142
143
/* Lowercase `ch` using ASCII rules only; values of 128 and above are
   returned unchanged. */
static unsigned int
sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
147
148
/* locale-specific character predicates */
149
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
150
 * warnings when c's type supports only numbers < N+1 */
151
0
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? sre_isalnum((ch)) : 0)
152
0
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
153
154
/* Lowercase `ch` through the locale-aware helper; values of 256 and
   above are returned unchanged. */
static unsigned int
sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)sre_tolower(ch);
}
158
159
/* Uppercase `ch` through the locale-aware helper; values of 256 and
   above are returned unchanged. */
static unsigned int
sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)sre_toupper(ch);
}
163
164
/* unicode-specific character predicates */
165
166
16
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
167
88.6M
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
168
0
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
169
10.2k
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
170
5.14k
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
171
172
/* Lowercase `ch` via Py_UNICODE_TOLOWER(). */
static unsigned int
sre_lower_unicode(unsigned int ch)
{
    const unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
176
177
/* Uppercase `ch` via Py_UNICODE_TOUPPER(). */
static unsigned int
sre_upper_unicode(unsigned int ch)
{
    const unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
181
182
LOCAL(int)
183
sre_category(SRE_CODE category, unsigned int ch)
184
100M
{
185
100M
    switch (category) {
186
187
462
    case SRE_CATEGORY_DIGIT:
188
462
        return SRE_IS_DIGIT(ch);
189
0
    case SRE_CATEGORY_NOT_DIGIT:
190
0
        return !SRE_IS_DIGIT(ch);
191
32
    case SRE_CATEGORY_SPACE:
192
32
        return SRE_IS_SPACE(ch);
193
0
    case SRE_CATEGORY_NOT_SPACE:
194
0
        return !SRE_IS_SPACE(ch);
195
11.5M
    case SRE_CATEGORY_WORD:
196
11.5M
        return SRE_IS_WORD(ch);
197
0
    case SRE_CATEGORY_NOT_WORD:
198
0
        return !SRE_IS_WORD(ch);
199
0
    case SRE_CATEGORY_LINEBREAK:
200
0
        return SRE_IS_LINEBREAK(ch);
201
0
    case SRE_CATEGORY_NOT_LINEBREAK:
202
0
        return !SRE_IS_LINEBREAK(ch);
203
204
0
    case SRE_CATEGORY_LOC_WORD:
205
0
        return SRE_LOC_IS_WORD(ch);
206
0
    case SRE_CATEGORY_LOC_NOT_WORD:
207
0
        return !SRE_LOC_IS_WORD(ch);
208
209
16
    case SRE_CATEGORY_UNI_DIGIT:
210
16
        return SRE_UNI_IS_DIGIT(ch);
211
0
    case SRE_CATEGORY_UNI_NOT_DIGIT:
212
0
        return !SRE_UNI_IS_DIGIT(ch);
213
77.7M
    case SRE_CATEGORY_UNI_SPACE:
214
77.7M
        return SRE_UNI_IS_SPACE(ch);
215
10.9M
    case SRE_CATEGORY_UNI_NOT_SPACE:
216
10.9M
        return !SRE_UNI_IS_SPACE(ch);
217
5.14k
    case SRE_CATEGORY_UNI_WORD:
218
5.14k
        return SRE_UNI_IS_WORD(ch);
219
0
    case SRE_CATEGORY_UNI_NOT_WORD:
220
0
        return !SRE_UNI_IS_WORD(ch);
221
0
    case SRE_CATEGORY_UNI_LINEBREAK:
222
0
        return SRE_UNI_IS_LINEBREAK(ch);
223
0
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
224
0
        return !SRE_UNI_IS_LINEBREAK(ch);
225
100M
    }
226
0
    return 0;
227
100M
}
228
229
LOCAL(int)
230
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
231
0
{
232
0
    return ch == pattern
233
0
        || (SRE_CODE) sre_lower_locale(ch) == pattern
234
0
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
235
0
}
236
237
238
/* helpers */
239
240
static void
241
data_stack_dealloc(SRE_STATE* state)
242
195M
{
243
195M
    if (state->data_stack) {
244
169M
        PyMem_Free(state->data_stack);
245
169M
        state->data_stack = NULL;
246
169M
    }
247
195M
    state->data_stack_size = state->data_stack_base = 0;
248
195M
}
249
250
static int
251
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
252
170M
{
253
170M
    INIT_TRACE(state);
254
170M
    Py_ssize_t minsize, cursize;
255
170M
    minsize = state->data_stack_base+size;
256
170M
    cursize = state->data_stack_size;
257
170M
    if (cursize < minsize) {
258
170M
        void* stack;
259
170M
        cursize = minsize+minsize/4+1024;
260
170M
        TRACE(("allocate/grow stack %zd\n", cursize));
261
170M
        stack = PyMem_Realloc(state->data_stack, cursize);
262
170M
        if (!stack) {
263
0
            data_stack_dealloc(state);
264
0
            return SRE_ERROR_MEMORY;
265
0
        }
266
170M
        state->data_stack = (char *)stack;
267
170M
        state->data_stack_size = cursize;
268
170M
    }
269
170M
    return 0;
270
170M
}
271
272
/* memory pool functions for SRE_REPEAT, this can avoid memory
273
   leak when SRE(match) function terminates abruptly.
274
   state->repeat_pool_used is a doubly-linked list, so that we
275
   can remove a SRE_REPEAT node from it.
276
   state->repeat_pool_unused is a singly-linked list, we put/get
277
   node at the head. */
278
static SRE_REPEAT *
279
repeat_pool_malloc(SRE_STATE *state)
280
112M
{
281
112M
    SRE_REPEAT *repeat;
282
283
112M
    if (state->repeat_pool_unused) {
284
        /* remove from unused pool (singly-linked list) */
285
75.3M
        repeat = state->repeat_pool_unused;
286
75.3M
        state->repeat_pool_unused = repeat->pool_next;
287
75.3M
    }
288
37.1M
    else {
289
37.1M
        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
290
37.1M
        if (!repeat) {
291
0
            return NULL;
292
0
        }
293
37.1M
    }
294
295
    /* add to used pool (doubly-linked list) */
296
112M
    SRE_REPEAT *temp = state->repeat_pool_used;
297
112M
    if (temp) {
298
19.2M
        temp->pool_prev = repeat;
299
19.2M
    }
300
112M
    repeat->pool_prev = NULL;
301
112M
    repeat->pool_next = temp;
302
112M
    state->repeat_pool_used = repeat;
303
304
112M
    return repeat;
305
112M
}
306
307
static void
308
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
309
112M
{
310
112M
    SRE_REPEAT *prev = repeat->pool_prev;
311
112M
    SRE_REPEAT *next = repeat->pool_next;
312
313
    /* remove from used pool (doubly-linked list) */
314
112M
    if (prev) {
315
0
        prev->pool_next = next;
316
0
    }
317
112M
    else {
318
112M
        state->repeat_pool_used = next;
319
112M
    }
320
112M
    if (next) {
321
19.2M
        next->pool_prev = prev;
322
19.2M
    }
323
324
    /* add to unused pool (singly-linked list) */
325
112M
    repeat->pool_next = state->repeat_pool_unused;
326
112M
    state->repeat_pool_unused = repeat;
327
112M
}
328
329
static void
330
repeat_pool_clear(SRE_STATE *state)
331
82.1M
{
332
    /* clear used pool */
333
82.1M
    SRE_REPEAT *next = state->repeat_pool_used;
334
82.1M
    state->repeat_pool_used = NULL;
335
82.1M
    while (next) {
336
0
        SRE_REPEAT *temp = next;
337
0
        next = temp->pool_next;
338
0
        PyMem_Free(temp);
339
0
    }
340
341
    /* clear unused pool */
342
82.1M
    next = state->repeat_pool_unused;
343
82.1M
    state->repeat_pool_unused = NULL;
344
119M
    while (next) {
345
37.1M
        SRE_REPEAT *temp = next;
346
37.1M
        next = temp->pool_next;
347
37.1M
        PyMem_Free(temp);
348
37.1M
    }
349
82.1M
}
350
351
/* generate 8-bit version */
352
353
371M
#define SRE_CHAR Py_UCS1
354
#define SIZEOF_SRE_CHAR 1
355
1.48G
#define SRE(F) sre_ucs1_##F
356
#include "sre_lib.h"
357
358
/* generate 16-bit unicode version */
359
360
432M
#define SRE_CHAR Py_UCS2
361
#define SIZEOF_SRE_CHAR 2
362
1.86G
#define SRE(F) sre_ucs2_##F
363
#include "sre_lib.h"
364
365
/* generate 32-bit unicode version */
366
367
114M
#define SRE_CHAR Py_UCS4
368
#define SIZEOF_SRE_CHAR 4
369
593M
#define SRE(F) sre_ucs4_##F
370
#include "sre_lib.h"
371
372
/* -------------------------------------------------------------------- */
373
/* factories and destructors */
374
375
/* module state */
376
typedef struct {
377
    PyTypeObject *Pattern_Type;
378
    PyTypeObject *Match_Type;
379
    PyTypeObject *Scanner_Type;
380
    PyTypeObject *Template_Type;
381
    PyObject *compile_template;  // reference to re._compile_template
382
} _sremodulestate;
383
384
static _sremodulestate *
385
get_sre_module_state(PyObject *m)
386
80.1M
{
387
80.1M
    _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
388
80.1M
    assert(state);
389
80.1M
    return state;
390
80.1M
}
391
392
static struct PyModuleDef sremodule;
393
#define get_sre_module_state_by_class(cls) \
394
80.1M
    (get_sre_module_state(PyType_GetModule(cls)))
395
396
/* see sre.h for object declarations */
397
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
398
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
399
400
21.5k
#define _PatternObject_CAST(op)     ((PatternObject *)(op))
401
88.7M
#define _MatchObject_CAST(op)       ((MatchObject *)(op))
402
0
#define _TemplateObject_CAST(op)    ((TemplateObject *)(op))
403
772k
#define _ScannerObject_CAST(op)     ((ScannerObject *)(op))
404
405
/*[clinic input]
406
module _sre
407
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
408
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
409
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
410
[clinic start generated code]*/
411
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
412
413
/*[clinic input]
414
_sre.getcodesize -> int
415
[clinic start generated code]*/
416
417
static int
418
_sre_getcodesize_impl(PyObject *module)
419
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
420
0
{
421
0
    return sizeof(SRE_CODE);
422
0
}
423
424
/*[clinic input]
425
_sre.ascii_iscased -> bool
426
427
    character: int
428
    /
429
430
[clinic start generated code]*/
431
432
static int
433
_sre_ascii_iscased_impl(PyObject *module, int character)
434
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
435
97.7k
{
436
97.7k
    unsigned int ch = (unsigned int)character;
437
97.7k
    return ch < 128 && Py_ISALPHA(ch);
438
97.7k
}
439
440
/*[clinic input]
441
_sre.unicode_iscased -> bool
442
443
    character: int
444
    /
445
446
[clinic start generated code]*/
447
448
static int
449
_sre_unicode_iscased_impl(PyObject *module, int character)
450
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
451
34.1M
{
452
34.1M
    unsigned int ch = (unsigned int)character;
453
34.1M
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
454
34.1M
}
455
456
/*[clinic input]
457
_sre.ascii_tolower -> int
458
459
    character: int
460
    /
461
462
[clinic start generated code]*/
463
464
static int
465
_sre_ascii_tolower_impl(PyObject *module, int character)
466
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
467
1.42M
{
468
1.42M
    return sre_lower_ascii(character);
469
1.42M
}
470
471
/*[clinic input]
472
_sre.unicode_tolower -> int
473
474
    character: int
475
    /
476
477
[clinic start generated code]*/
478
479
static int
480
_sre_unicode_tolower_impl(PyObject *module, int character)
481
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
482
83.5M
{
483
83.5M
    return sre_lower_unicode(character);
484
83.5M
}
485
486
LOCAL(void)
487
state_reset(SRE_STATE* state)
488
113M
{
489
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
490
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
491
492
113M
    state->lastmark = -1;
493
113M
    state->lastindex = -1;
494
495
113M
    state->repeat = NULL;
496
497
113M
    data_stack_dealloc(state);
498
113M
}
499
500
static const void*
501
getstring(PyObject* string, Py_ssize_t* p_length,
502
          int* p_isbytes, int* p_charsize,
503
          Py_buffer *view)
504
131M
{
505
    /* given a python object, return a data pointer, a length (in
506
       characters), and a character size.  return NULL if the object
507
       is not a string (or not compatible) */
508
509
    /* Unicode objects do not support the buffer API. So, get the data
510
       directly instead. */
511
131M
    if (PyUnicode_Check(string)) {
512
131M
        *p_length = PyUnicode_GET_LENGTH(string);
513
131M
        *p_charsize = PyUnicode_KIND(string);
514
131M
        *p_isbytes = 0;
515
131M
        return PyUnicode_DATA(string);
516
131M
    }
517
518
    /* get pointer to byte string buffer */
519
63.0k
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
520
0
        PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
521
0
                     "object, got '%.200s'", Py_TYPE(string)->tp_name);
522
0
        return NULL;
523
0
    }
524
525
63.0k
    *p_length = view->len;
526
63.0k
    *p_charsize = 1;
527
63.0k
    *p_isbytes = 1;
528
529
63.0k
    if (view->buf == NULL) {
530
0
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
531
0
        PyBuffer_Release(view);
532
0
        view->buf = NULL;
533
0
        return NULL;
534
0
    }
535
63.0k
    return view->buf;
536
63.0k
}
537
538
/* Initialise `state` for matching `pattern` against `string` within
 * the character range [start, end).
 *
 * `start` and `end` are clamped to [0, length].  On success the state
 * holds a new reference to `string` and `string` itself is returned.
 * On failure NULL is returned with an exception set, and the mark
 * array and any acquired buffer have been released.
 */
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
           Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t length;
    int isbytes, charsize;
    const void *data;

    memset(state, 0, sizeof(SRE_STATE));

    /* two mark slots (start and end) per capturing group */
    state->mark = PyMem_New(const void *, pattern->groups * 2);
    if (state->mark == NULL) {
        PyErr_NoMemory();
        goto err;
    }
    state->lastindex = -1;
    state->lastmark = -1;

    state->buffer.buf = NULL;
    data = getstring(string, &length, &isbytes, &charsize, &state->buffer);
    if (data == NULL) {
        goto err;
    }

    /* the pattern and the subject must agree on str vs. bytes */
    if (isbytes) {
        if (pattern->isbytes == 0) {
            PyErr_SetString(PyExc_TypeError,
                            "cannot use a string pattern on a bytes-like object");
            goto err;
        }
    }
    else if (pattern->isbytes > 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a bytes pattern on a string-like object");
        goto err;
    }

    /* clamp both boundaries into [0, length] */
    start = (start < 0) ? 0 : ((start > length) ? length : start);
    end = (end < 0) ? 0 : ((end > length) ? length : end);

    state->isbytes = isbytes;
    state->charsize = charsize;
    state->match_all = 0;
    state->must_advance = 0;
    state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);

    /* convert character offsets into byte pointers */
    state->beginning = data;
    state->start = (void*) ((char*) data + start * charsize);
    state->end = (void*) ((char*) data + end * charsize);

    state->string = Py_NewRef(string);
    state->pos = start;
    state->endpos = end;

#ifdef Py_DEBUG
    state->fail_after_count = pattern->fail_after_count;
    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

    return string;

  err:
    /* The explicit cast works around an MSVC bug when compiling C code:
       it believes that `const void**` cannot be safely cast to `void*`
       (see bpo-39943 for details). */
    PyMem_Free((void*) state->mark);
    state->mark = NULL;
    if (state->buffer.buf != NULL) {
        PyBuffer_Release(&state->buffer);
    }
    return NULL;
}
616
617
LOCAL(void)
618
state_fini(SRE_STATE* state)
619
82.1M
{
620
82.1M
    if (state->buffer.buf)
621
32.2k
        PyBuffer_Release(&state->buffer);
622
82.1M
    Py_XDECREF(state->string);
623
82.1M
    data_stack_dealloc(state);
624
    /* See above PyMem_Free() for why we explicitly cast here. */
625
82.1M
    PyMem_Free((void*) state->mark);
626
82.1M
    state->mark = NULL;
627
    /* SRE_REPEAT pool */
628
82.1M
    repeat_pool_clear(state);
629
82.1M
}
630
631
/* calculate offset from start of string */
632
#define STATE_OFFSET(state, member)\
633
194M
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
634
635
/* Return a new reference to the slice [start, end) of the subject.
 *
 * For str subjects this is PyUnicode_Substring().  For bytes-like
 * subjects a new bytes object is built from `ptr`, except that an
 * exact `bytes` object sliced over its whole length is returned
 * as-is (with a new reference).
 */
LOCAL(PyObject*)
getslice(int isbytes, const void *ptr,
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
{
    if (!isbytes) {
        return PyUnicode_Substring(string, start, end);
    }

    if (PyBytes_CheckExact(string)
        && start == 0
        && end == PyBytes_GET_SIZE(string))
    {
        /* whole-object slice: no copy needed */
        return Py_NewRef(string);
    }

    const char *first = (const char *)ptr + start;
    return PyBytes_FromStringAndSize(first, end - start);
}
651
652
/* Return the text captured by group `index` (1-based) as a new object.
 *
 * If `string` is None, the group lies at or beyond state->lastmark, or
 * either of its marks is unset, the result is an empty slice when
 * `empty` is true and None otherwise.  A group whose start lies after
 * its end raises SystemError and returns NULL.
 */
LOCAL(PyObject*)
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
{
    /* group N uses mark slots 2*(N-1) and 2*(N-1)+1 */
    const Py_ssize_t slot = (index - 1) * 2;
    Py_ssize_t first = 0;
    Py_ssize_t last = 0;

    const int unmatched = (string == Py_None
                           || slot >= state->lastmark
                           || !state->mark[slot]
                           || !state->mark[slot + 1]);

    if (unmatched) {
        if (!empty) {
            Py_RETURN_NONE;
        }
        /* caller wants an empty string: keep first == last == 0 */
    }
    else {
        first = STATE_OFFSET(state, state->mark[slot]);
        last = STATE_OFFSET(state, state->mark[slot + 1]);

        /* check wrong span */
        if (first > last) {
            PyErr_SetString(PyExc_SystemError,
                            "The span of capturing group is wrong,"
                            " please report a bug for the re module.");
            return NULL;
        }
    }

    return getslice(state->isbytes, state->beginning, string, first, last);
}
681
682
static void
683
pattern_error(Py_ssize_t status)
684
0
{
685
0
    switch (status) {
686
0
    case SRE_ERROR_RECURSION_LIMIT:
687
        /* This error code seems to be unused. */
688
0
        PyErr_SetString(
689
0
            PyExc_RecursionError,
690
0
            "maximum recursion limit exceeded"
691
0
            );
692
0
        break;
693
0
    case SRE_ERROR_MEMORY:
694
0
        PyErr_NoMemory();
695
0
        break;
696
0
    case SRE_ERROR_INTERRUPTED:
697
    /* An exception has already been raised, so let it fly */
698
0
        break;
699
0
    default:
700
        /* other error codes indicate compiler/engine bugs */
701
0
        PyErr_SetString(
702
0
            PyExc_RuntimeError,
703
0
            "internal error in regular expression engine"
704
0
            );
705
0
    }
706
0
}
707
708
static int
709
pattern_traverse(PyObject *op, visitproc visit, void *arg)
710
18.4k
{
711
18.4k
    PatternObject *self = _PatternObject_CAST(op);
712
18.4k
    Py_VISIT(Py_TYPE(self));
713
18.4k
    Py_VISIT(self->groupindex);
714
18.4k
    Py_VISIT(self->indexgroup);
715
18.4k
    Py_VISIT(self->pattern);
716
#ifdef Py_DEBUG
717
    Py_VISIT(self->fail_after_exc);
718
#endif
719
18.4k
    return 0;
720
18.4k
}
721
722
static int
723
pattern_clear(PyObject *op)
724
3.01k
{
725
3.01k
    PatternObject *self = _PatternObject_CAST(op);
726
3.01k
    Py_CLEAR(self->groupindex);
727
3.01k
    Py_CLEAR(self->indexgroup);
728
3.01k
    Py_CLEAR(self->pattern);
729
#ifdef Py_DEBUG
730
    Py_CLEAR(self->fail_after_exc);
731
#endif
732
3.01k
    return 0;
733
3.01k
}
734
735
static void
736
pattern_dealloc(PyObject *self)
737
3.01k
{
738
3.01k
    PyTypeObject *tp = Py_TYPE(self);
739
3.01k
    PyObject_GC_UnTrack(self);
740
3.01k
    FT_CLEAR_WEAKREFS(self, _PatternObject_CAST(self)->weakreflist);
741
3.01k
    (void)pattern_clear(self);
742
3.01k
    tp->tp_free(self);
743
3.01k
    Py_DECREF(tp);
744
3.01k
}
745
746
LOCAL(Py_ssize_t)
747
sre_match(SRE_STATE* state, SRE_CODE* pattern)
748
61.6M
{
749
61.6M
    if (state->charsize == 1)
750
40.6M
        return sre_ucs1_match(state, pattern, 1);
751
20.9M
    if (state->charsize == 2)
752
13.2M
        return sre_ucs2_match(state, pattern, 1);
753
20.9M
    assert(state->charsize == 4);
754
7.72M
    return sre_ucs4_match(state, pattern, 1);
755
20.9M
}
756
757
LOCAL(Py_ssize_t)
758
sre_search(SRE_STATE* state, SRE_CODE* pattern)
759
117M
{
760
117M
    if (state->charsize == 1)
761
53.5M
        return sre_ucs1_search(state, pattern);
762
64.2M
    if (state->charsize == 2)
763
57.2M
        return sre_ucs2_search(state, pattern);
764
64.2M
    assert(state->charsize == 4);
765
6.95M
    return sre_ucs4_search(state, pattern);
766
64.2M
}
767
768
/*[clinic input]
769
_sre.SRE_Pattern.prefixmatch
770
771
    cls: defining_class
772
    /
773
    string: object
774
    pos: Py_ssize_t = 0
775
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
776
777
Matches zero or more characters at the beginning of the string.
778
[clinic start generated code]*/
779
780
static PyObject *
781
_sre_SRE_Pattern_prefixmatch_impl(PatternObject *self, PyTypeObject *cls,
782
                                  PyObject *string, Py_ssize_t pos,
783
                                  Py_ssize_t endpos)
784
/*[clinic end generated code: output=a0e079fb4f875240 input=e2a7e68ea47d048c]*/
785
61.6M
{
786
61.6M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
787
61.6M
    SRE_STATE state;
788
61.6M
    Py_ssize_t status;
789
61.6M
    PyObject *match;
790
791
61.6M
    if (!state_init(&state, self, string, pos, endpos))
792
0
        return NULL;
793
794
61.6M
    INIT_TRACE(&state);
795
61.6M
    state.ptr = state.start;
796
797
61.6M
    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
798
799
61.6M
    status = sre_match(&state, PatternObject_GetCode(self));
800
801
61.6M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
802
61.6M
    if (PyErr_Occurred()) {
803
0
        state_fini(&state);
804
0
        return NULL;
805
0
    }
806
807
61.6M
    match = pattern_new_match(module_state, self, &state, status);
808
61.6M
    state_fini(&state);
809
61.6M
    return match;
810
61.6M
}
811
812
813
/*[clinic input]
814
_sre.SRE_Pattern.fullmatch
815
816
    cls: defining_class
817
    /
818
    string: object
819
    pos: Py_ssize_t = 0
820
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
821
822
Matches against all of the string.
823
[clinic start generated code]*/
824
825
static PyObject *
826
_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
827
                                PyObject *string, Py_ssize_t pos,
828
                                Py_ssize_t endpos)
829
/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
830
0
{
831
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
832
0
    SRE_STATE state;
833
0
    Py_ssize_t status;
834
0
    PyObject *match;
835
836
0
    if (!state_init(&state, self, string, pos, endpos))
837
0
        return NULL;
838
839
0
    INIT_TRACE(&state);
840
0
    state.ptr = state.start;
841
842
0
    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
843
844
0
    state.match_all = 1;
845
0
    status = sre_match(&state, PatternObject_GetCode(self));
846
847
0
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
848
0
    if (PyErr_Occurred()) {
849
0
        state_fini(&state);
850
0
        return NULL;
851
0
    }
852
853
0
    match = pattern_new_match(module_state, self, &state, status);
854
0
    state_fini(&state);
855
0
    return match;
856
0
}
857
858
/*[clinic input]
859
@permit_long_summary
860
_sre.SRE_Pattern.search
861
862
    cls: defining_class
863
    /
864
    string: object
865
    pos: Py_ssize_t = 0
866
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
867
868
Scan through string looking for a match, and return a corresponding match object instance.
869
870
Return None if no position in the string matches.
871
[clinic start generated code]*/
872
873
static PyObject *
874
_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
875
                             PyObject *string, Py_ssize_t pos,
876
                             Py_ssize_t endpos)
877
/*[clinic end generated code: output=bd7f2d9d583e1463 input=05e9feee0334c156]*/
878
4.80M
{
879
4.80M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
880
4.80M
    SRE_STATE state;
881
4.80M
    Py_ssize_t status;
882
4.80M
    PyObject *match;
883
884
4.80M
    if (!state_init(&state, self, string, pos, endpos))
885
0
        return NULL;
886
887
4.80M
    INIT_TRACE(&state);
888
4.80M
    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
889
890
4.80M
    status = sre_search(&state, PatternObject_GetCode(self));
891
892
4.80M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
893
894
4.80M
    if (PyErr_Occurred()) {
895
0
        state_fini(&state);
896
0
        return NULL;
897
0
    }
898
899
4.80M
    match = pattern_new_match(module_state, self, &state, status);
900
4.80M
    state_fini(&state);
901
4.80M
    return match;
902
4.80M
}
903
904
/*[clinic input]
905
_sre.SRE_Pattern.findall
906
907
    string: object
908
    pos: Py_ssize_t = 0
909
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
910
911
Return a list of all non-overlapping matches of pattern in string.
912
[clinic start generated code]*/
913
914
static PyObject *
915
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
916
                              Py_ssize_t pos, Py_ssize_t endpos)
917
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
918
3.84M
{
919
3.84M
    SRE_STATE state;
920
3.84M
    PyObject* list;
921
3.84M
    Py_ssize_t status;
922
3.84M
    Py_ssize_t i, b, e;
923
924
3.84M
    if (!state_init(&state, self, string, pos, endpos))
925
0
        return NULL;
926
927
3.84M
    list = PyList_New(0);
928
3.84M
    if (!list) {
929
0
        state_fini(&state);
930
0
        return NULL;
931
0
    }
932
933
90.5M
    while (state.start <= state.end) {
934
935
90.5M
        PyObject* item;
936
937
90.5M
        state_reset(&state);
938
939
90.5M
        state.ptr = state.start;
940
941
90.5M
        status = sre_search(&state, PatternObject_GetCode(self));
942
90.5M
        if (PyErr_Occurred())
943
0
            goto error;
944
945
90.5M
        if (status <= 0) {
946
3.84M
            if (status == 0)
947
3.84M
                break;
948
0
            pattern_error(status);
949
0
            goto error;
950
3.84M
        }
951
952
        /* don't bother to build a match object */
953
86.6M
        switch (self->groups) {
954
86.6M
        case 0:
955
86.6M
            b = STATE_OFFSET(&state, state.start);
956
86.6M
            e = STATE_OFFSET(&state, state.ptr);
957
86.6M
            item = getslice(state.isbytes, state.beginning,
958
86.6M
                            string, b, e);
959
86.6M
            if (!item)
960
0
                goto error;
961
86.6M
            break;
962
86.6M
        case 1:
963
0
            item = state_getslice(&state, 1, string, 1);
964
0
            if (!item)
965
0
                goto error;
966
0
            break;
967
0
        default:
968
0
            item = PyTuple_New(self->groups);
969
0
            if (!item)
970
0
                goto error;
971
0
            for (i = 0; i < self->groups; i++) {
972
0
                PyObject* o = state_getslice(&state, i+1, string, 1);
973
0
                if (!o) {
974
0
                    Py_DECREF(item);
975
0
                    goto error;
976
0
                }
977
0
                PyTuple_SET_ITEM(item, i, o);
978
0
            }
979
0
            break;
980
86.6M
        }
981
982
86.6M
        status = PyList_Append(list, item);
983
86.6M
        Py_DECREF(item);
984
86.6M
        if (status < 0)
985
0
            goto error;
986
987
86.6M
        state.must_advance = (state.ptr == state.start);
988
86.6M
        state.start = state.ptr;
989
86.6M
    }
990
991
3.84M
    state_fini(&state);
992
3.84M
    return list;
993
994
0
error:
995
0
    Py_DECREF(list);
996
0
    state_fini(&state);
997
0
    return NULL;
998
999
3.84M
}
1000
1001
/*[clinic input]
1002
@permit_long_summary
1003
_sre.SRE_Pattern.finditer
1004
1005
    cls: defining_class
1006
    /
1007
    string: object
1008
    pos: Py_ssize_t = 0
1009
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1010
1011
Return an iterator over all non-overlapping matches for the RE pattern in string.
1012
1013
For each match, the iterator returns a match object.
1014
[clinic start generated code]*/
1015
1016
static PyObject *
1017
_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
1018
                               PyObject *string, Py_ssize_t pos,
1019
                               Py_ssize_t endpos)
1020
/*[clinic end generated code: output=1791dbf3618ade56 input=ee28865796048023]*/
1021
386k
{
1022
386k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1023
386k
    PyObject* scanner;
1024
386k
    PyObject* search;
1025
386k
    PyObject* iterator;
1026
1027
386k
    scanner = pattern_scanner(module_state, self, string, pos, endpos);
1028
386k
    if (!scanner)
1029
0
        return NULL;
1030
1031
386k
    search = PyObject_GetAttrString(scanner, "search");
1032
386k
    Py_DECREF(scanner);
1033
386k
    if (!search)
1034
0
        return NULL;
1035
1036
386k
    iterator = PyCallIter_New(search, Py_None);
1037
386k
    Py_DECREF(search);
1038
1039
386k
    return iterator;
1040
386k
}
1041
1042
/*[clinic input]
1043
_sre.SRE_Pattern.scanner
1044
1045
    cls: defining_class
1046
    /
1047
    string: object
1048
    pos: Py_ssize_t = 0
1049
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1050
1051
[clinic start generated code]*/
1052
1053
static PyObject *
1054
_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
1055
                              PyObject *string, Py_ssize_t pos,
1056
                              Py_ssize_t endpos)
1057
/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
1058
0
{
1059
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1060
1061
0
    return pattern_scanner(module_state, self, string, pos, endpos);
1062
0
}
1063
1064
/*[clinic input]
1065
_sre.SRE_Pattern.split
1066
1067
    string: object
1068
    maxsplit: Py_ssize_t = 0
1069
1070
Split string by the occurrences of pattern.
1071
[clinic start generated code]*/
1072
1073
static PyObject *
1074
_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
1075
                            Py_ssize_t maxsplit)
1076
/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
1077
1.55M
{
1078
1.55M
    SRE_STATE state;
1079
1.55M
    PyObject* list;
1080
1.55M
    PyObject* item;
1081
1.55M
    Py_ssize_t status;
1082
1.55M
    Py_ssize_t n;
1083
1.55M
    Py_ssize_t i;
1084
1.55M
    const void* last;
1085
1086
1.55M
    assert(self->codesize != 0);
1087
1088
1.55M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
1089
0
        return NULL;
1090
1091
1.55M
    list = PyList_New(0);
1092
1.55M
    if (!list) {
1093
0
        state_fini(&state);
1094
0
        return NULL;
1095
0
    }
1096
1097
1.55M
    n = 0;
1098
1.55M
    last = state.start;
1099
1100
2.67M
    while (!maxsplit || n < maxsplit) {
1101
1102
1.62M
        state_reset(&state);
1103
1104
1.62M
        state.ptr = state.start;
1105
1106
1.62M
        status = sre_search(&state, PatternObject_GetCode(self));
1107
1.62M
        if (PyErr_Occurred())
1108
0
            goto error;
1109
1110
1.62M
        if (status <= 0) {
1111
507k
            if (status == 0)
1112
507k
                break;
1113
0
            pattern_error(status);
1114
0
            goto error;
1115
507k
        }
1116
1117
        /* get segment before this match */
1118
1.12M
        item = getslice(state.isbytes, state.beginning,
1119
1.12M
            string, STATE_OFFSET(&state, last),
1120
1.12M
            STATE_OFFSET(&state, state.start)
1121
1.12M
            );
1122
1.12M
        if (!item)
1123
0
            goto error;
1124
1.12M
        status = PyList_Append(list, item);
1125
1.12M
        Py_DECREF(item);
1126
1.12M
        if (status < 0)
1127
0
            goto error;
1128
1129
        /* add groups (if any) */
1130
2.17M
        for (i = 0; i < self->groups; i++) {
1131
1.05M
            item = state_getslice(&state, i+1, string, 0);
1132
1.05M
            if (!item)
1133
0
                goto error;
1134
1.05M
            status = PyList_Append(list, item);
1135
1.05M
            Py_DECREF(item);
1136
1.05M
            if (status < 0)
1137
0
                goto error;
1138
1.05M
        }
1139
1140
1.12M
        n = n + 1;
1141
1.12M
        state.must_advance = (state.ptr == state.start);
1142
1.12M
        last = state.start = state.ptr;
1143
1144
1.12M
    }
1145
1146
    /* get segment following last match (even if empty) */
1147
1.55M
    item = getslice(state.isbytes, state.beginning,
1148
1.55M
        string, STATE_OFFSET(&state, last), state.endpos
1149
1.55M
        );
1150
1.55M
    if (!item)
1151
0
        goto error;
1152
1.55M
    status = PyList_Append(list, item);
1153
1.55M
    Py_DECREF(item);
1154
1.55M
    if (status < 0)
1155
0
        goto error;
1156
1157
1.55M
    state_fini(&state);
1158
1.55M
    return list;
1159
1160
0
error:
1161
0
    Py_DECREF(list);
1162
0
    state_fini(&state);
1163
0
    return NULL;
1164
1165
1.55M
}
1166
1167
static PyObject *
1168
compile_template(_sremodulestate *module_state,
1169
                 PatternObject *pattern, PyObject *template)
1170
0
{
1171
    /* delegate to Python code */
1172
0
    PyObject *func = FT_ATOMIC_LOAD_PTR(module_state->compile_template);
1173
0
    if (func == NULL) {
1174
0
        func = PyImport_ImportModuleAttrString("re", "_compile_template");
1175
0
        if (func == NULL) {
1176
0
            return NULL;
1177
0
        }
1178
#ifdef Py_GIL_DISABLED
1179
        PyObject *other_func = NULL;
1180
        if (!_Py_atomic_compare_exchange_ptr(&module_state->compile_template, &other_func, func))  {
1181
            Py_DECREF(func);
1182
            func = other_func;
1183
        }
1184
#else
1185
0
        Py_XSETREF(module_state->compile_template, func);
1186
0
#endif
1187
0
    }
1188
1189
0
    PyObject *args[] = {(PyObject *)pattern, template};
1190
0
    PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1191
1192
0
    if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1193
        /* If the replacement string is unhashable (e.g. bytearray),
1194
         * convert it to the basic type (str or bytes) and repeat. */
1195
0
        if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1196
0
            PyErr_Clear();
1197
0
            template = _PyUnicode_Copy(template);
1198
0
        }
1199
0
        else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1200
0
            PyErr_Clear();
1201
0
            template = PyBytes_FromObject(template);
1202
0
        }
1203
0
        else {
1204
0
            return NULL;
1205
0
        }
1206
0
        if (template == NULL) {
1207
0
            return NULL;
1208
0
        }
1209
0
        args[1] = template;
1210
0
        result = PyObject_Vectorcall(func, args, 2, NULL);
1211
0
        Py_DECREF(template);
1212
0
    }
1213
1214
0
    if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1215
0
        PyErr_Format(PyExc_RuntimeError,
1216
0
                    "the result of compiling a replacement string is %.200s",
1217
0
                    Py_TYPE(result)->tp_name);
1218
0
        Py_DECREF(result);
1219
0
        return NULL;
1220
0
    }
1221
0
    return result;
1222
0
}
1223
1224
static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1225
1226
static PyObject*
1227
pattern_subx(_sremodulestate* module_state,
1228
             PatternObject* self,
1229
             PyObject* ptemplate,
1230
             PyObject* string,
1231
             Py_ssize_t count,
1232
             Py_ssize_t subn)
1233
9.97M
{
1234
9.97M
    SRE_STATE state;
1235
9.97M
    PyObject* list;
1236
9.97M
    PyObject* joiner;
1237
9.97M
    PyObject* item;
1238
9.97M
    PyObject* filter;
1239
9.97M
    PyObject* match;
1240
9.97M
    const void* ptr;
1241
9.97M
    Py_ssize_t status;
1242
9.97M
    Py_ssize_t n;
1243
9.97M
    Py_ssize_t i, b, e;
1244
9.97M
    int isbytes, charsize;
1245
9.97M
    enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1246
9.97M
    Py_buffer view;
1247
1248
9.97M
    if (PyCallable_Check(ptemplate)) {
1249
        /* sub/subn takes either a function or a template */
1250
4.00M
        filter = Py_NewRef(ptemplate);
1251
4.00M
        filter_type = CALLABLE;
1252
5.97M
    } else {
1253
        /* if not callable, check if it's a literal string */
1254
5.97M
        int literal;
1255
5.97M
        view.buf = NULL;
1256
5.97M
        ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1257
5.97M
        if (ptr) {
1258
5.97M
            if (charsize == 1)
1259
5.97M
                literal = memchr(ptr, '\\', n) == NULL;
1260
0
            else
1261
0
                literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1262
5.97M
        } else {
1263
0
            PyErr_Clear();
1264
0
            literal = 0;
1265
0
        }
1266
5.97M
        if (view.buf)
1267
0
            PyBuffer_Release(&view);
1268
5.97M
        if (literal) {
1269
5.97M
            filter = Py_NewRef(ptemplate);
1270
5.97M
            filter_type = LITERAL;
1271
5.97M
        } else {
1272
            /* not a literal; hand it over to the template compiler */
1273
0
            filter = compile_template(module_state, self, ptemplate);
1274
0
            if (!filter)
1275
0
                return NULL;
1276
1277
0
            assert(Py_TYPE(filter) == module_state->Template_Type);
1278
0
            if (Py_SIZE(filter) == 0) {
1279
0
                Py_SETREF(filter,
1280
0
                          Py_NewRef(((TemplateObject *)filter)->literal));
1281
0
                filter_type = LITERAL;
1282
0
            }
1283
0
            else {
1284
0
                filter_type = TEMPLATE;
1285
0
            }
1286
0
        }
1287
5.97M
    }
1288
1289
9.97M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1290
0
        Py_DECREF(filter);
1291
0
        return NULL;
1292
0
    }
1293
1294
9.97M
    list = PyList_New(0);
1295
9.97M
    if (!list) {
1296
0
        Py_DECREF(filter);
1297
0
        state_fini(&state);
1298
0
        return NULL;
1299
0
    }
1300
1301
9.97M
    n = i = 0;
1302
1303
17.4M
    while (!count || n < count) {
1304
1305
17.4M
        state_reset(&state);
1306
1307
17.4M
        state.ptr = state.start;
1308
1309
17.4M
        status = sre_search(&state, PatternObject_GetCode(self));
1310
17.4M
        if (PyErr_Occurred())
1311
0
            goto error;
1312
1313
17.4M
        if (status <= 0) {
1314
9.97M
            if (status == 0)
1315
9.97M
                break;
1316
0
            pattern_error(status);
1317
0
            goto error;
1318
9.97M
        }
1319
1320
7.48M
        b = STATE_OFFSET(&state, state.start);
1321
7.48M
        e = STATE_OFFSET(&state, state.ptr);
1322
1323
7.48M
        if (i < b) {
1324
            /* get segment before this match */
1325
3.87M
            item = getslice(state.isbytes, state.beginning,
1326
3.87M
                string, i, b);
1327
3.87M
            if (!item)
1328
0
                goto error;
1329
3.87M
            status = PyList_Append(list, item);
1330
3.87M
            Py_DECREF(item);
1331
3.87M
            if (status < 0)
1332
0
                goto error;
1333
1334
3.87M
        }
1335
1336
7.48M
        if (filter_type != LITERAL) {
1337
            /* pass match object through filter */
1338
7.48M
            match = pattern_new_match(module_state, self, &state, 1);
1339
7.48M
            if (!match)
1340
0
                goto error;
1341
7.48M
            if (filter_type == TEMPLATE) {
1342
0
                item = expand_template((TemplateObject *)filter,
1343
0
                                       (MatchObject *)match);
1344
0
            }
1345
7.48M
            else {
1346
7.48M
                assert(filter_type == CALLABLE);
1347
7.48M
                item = PyObject_CallOneArg(filter, match);
1348
7.48M
            }
1349
7.48M
            Py_DECREF(match);
1350
7.48M
            if (!item)
1351
34
                goto error;
1352
7.48M
        } else {
1353
            /* filter is literal string */
1354
2.59k
            item = Py_NewRef(filter);
1355
2.59k
        }
1356
1357
        /* add to list */
1358
7.48M
        if (item != Py_None) {
1359
7.48M
            status = PyList_Append(list, item);
1360
7.48M
            Py_DECREF(item);
1361
7.48M
            if (status < 0)
1362
0
                goto error;
1363
7.48M
        }
1364
1365
7.48M
        i = e;
1366
7.48M
        n = n + 1;
1367
7.48M
        state.must_advance = (state.ptr == state.start);
1368
7.48M
        state.start = state.ptr;
1369
7.48M
    }
1370
1371
    /* get segment following last match */
1372
9.97M
    if (i < state.endpos) {
1373
7.09M
        item = getslice(state.isbytes, state.beginning,
1374
7.09M
                        string, i, state.endpos);
1375
7.09M
        if (!item)
1376
0
            goto error;
1377
7.09M
        status = PyList_Append(list, item);
1378
7.09M
        Py_DECREF(item);
1379
7.09M
        if (status < 0)
1380
0
            goto error;
1381
7.09M
    }
1382
1383
9.97M
    state_fini(&state);
1384
1385
9.97M
    Py_DECREF(filter);
1386
1387
    /* convert list to single string (also removes list) */
1388
9.97M
    joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1389
9.97M
    if (!joiner) {
1390
0
        Py_DECREF(list);
1391
0
        return NULL;
1392
0
    }
1393
9.97M
    if (PyList_GET_SIZE(list) == 0) {
1394
1.91M
        Py_DECREF(list);
1395
1.91M
        item = joiner;
1396
1.91M
    }
1397
8.05M
    else {
1398
8.05M
        if (state.isbytes)
1399
32.0k
            item = PyBytes_Join(joiner, list);
1400
8.02M
        else
1401
8.02M
            item = PyUnicode_Join(joiner, list);
1402
8.05M
        Py_DECREF(joiner);
1403
8.05M
        Py_DECREF(list);
1404
8.05M
        if (!item)
1405
0
            return NULL;
1406
8.05M
    }
1407
1408
9.97M
    if (subn)
1409
0
        return Py_BuildValue("Nn", item, n);
1410
1411
9.97M
    return item;
1412
1413
34
error:
1414
34
    Py_DECREF(list);
1415
34
    state_fini(&state);
1416
34
    Py_DECREF(filter);
1417
34
    return NULL;
1418
1419
9.97M
}
1420
1421
/*[clinic input]
1422
@permit_long_summary
1423
_sre.SRE_Pattern.sub
1424
1425
    cls: defining_class
1426
    /
1427
    repl: object
1428
    string: object
1429
    count: Py_ssize_t = 0
1430
1431
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1432
[clinic start generated code]*/
1433
1434
static PyObject *
1435
_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1436
                          PyObject *repl, PyObject *string, Py_ssize_t count)
1437
/*[clinic end generated code: output=4be141ab04bca60d input=eba511fd1c4908b7]*/
1438
9.97M
{
1439
9.97M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1440
1441
9.97M
    return pattern_subx(module_state, self, repl, string, count, 0);
1442
9.97M
}
1443
1444
/*[clinic input]
1445
@permit_long_summary
1446
_sre.SRE_Pattern.subn
1447
1448
    cls: defining_class
1449
    /
1450
    repl: object
1451
    string: object
1452
    count: Py_ssize_t = 0
1453
1454
Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1455
[clinic start generated code]*/
1456
1457
static PyObject *
1458
_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1459
                           PyObject *repl, PyObject *string,
1460
                           Py_ssize_t count)
1461
/*[clinic end generated code: output=da02fd85258b1e1f input=6a5bb5b61717abf0]*/
1462
0
{
1463
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1464
1465
0
    return pattern_subx(module_state, self, repl, string, count, 1);
1466
0
}
1467
1468
/*[clinic input]
1469
_sre.SRE_Pattern.__copy__
1470
1471
[clinic start generated code]*/
1472
1473
static PyObject *
1474
_sre_SRE_Pattern___copy___impl(PatternObject *self)
1475
/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1476
0
{
1477
0
    return Py_NewRef(self);
1478
0
}
1479
1480
/*[clinic input]
1481
_sre.SRE_Pattern.__deepcopy__
1482
1483
    memo: object
1484
    /
1485
1486
[clinic start generated code]*/
1487
1488
static PyObject *
1489
_sre_SRE_Pattern___deepcopy___impl(PatternObject *self, PyObject *memo)
1490
/*[clinic end generated code: output=75efe69bd12c5d7d input=a465b1602f997bed]*/
1491
0
{
1492
0
    return Py_NewRef(self);
1493
0
}
1494
1495
#ifdef Py_DEBUG
1496
/*[clinic input]
1497
_sre.SRE_Pattern._fail_after
1498
1499
    count: int
1500
    exception: object
1501
    /
1502
1503
For debugging.
1504
[clinic start generated code]*/
1505
1506
static PyObject *
1507
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1508
                                  PyObject *exception)
1509
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1510
{
1511
    self->fail_after_count = count;
1512
    Py_INCREF(exception);
1513
    Py_XSETREF(self->fail_after_exc, exception);
1514
    Py_RETURN_NONE;
1515
}
1516
#endif /* Py_DEBUG */
1517
1518
/* repr() of a pattern: "re.compile(<pattern>)" or, when any flags are left
 * to show, "re.compile(<pattern>, <flag>|<flag>|...)".  Known flag bits are
 * printed by name; whatever bits remain are printed as one hex literal.
 * The pattern's repr is limited to 200 characters by the format string. */
static PyObject *
pattern_repr(PyObject *self)
{
    static const struct {
        const char *name;
        int value;
    } flag_names[] = {
        {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
        {"re.LOCALE", SRE_FLAG_LOCALE},
        {"re.MULTILINE", SRE_FLAG_MULTILINE},
        {"re.DOTALL", SRE_FLAG_DOTALL},
        {"re.UNICODE", SRE_FLAG_UNICODE},
        {"re.VERBOSE", SRE_FLAG_VERBOSE},
        {"re.DEBUG", SRE_FLAG_DEBUG},
        {"re.ASCII", SRE_FLAG_ASCII},
    };

    PatternObject *pat = _PatternObject_CAST(self);
    PyObject *repr = NULL;
    int remaining = pat->flags;     /* flag bits not yet rendered */

    /* Omit re.UNICODE for valid string patterns. */
    const int kind_mask = SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII;
    if (pat->isbytes == 0 && (remaining & kind_mask) == SRE_FLAG_UNICODE) {
        remaining &= ~SRE_FLAG_UNICODE;
    }

    PyObject *parts = PyList_New(0);
    if (parts == NULL) {
        return NULL;
    }

    for (size_t k = 0; k < Py_ARRAY_LENGTH(flag_names); k++) {
        if (!(remaining & flag_names[k].value)) {
            continue;
        }
        PyObject *label = PyUnicode_FromString(flag_names[k].name);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(parts, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
        remaining &= ~flag_names[k].value;
    }

    if (remaining) {
        /* bits without a name in the table */
        PyObject *label = PyUnicode_FromFormat("0x%x", remaining);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(parts, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
    }

    if (PyList_Size(parts) > 0) {
        PyObject *sep = PyUnicode_FromString("|");
        if (sep == NULL) {
            goto done;
        }
        PyObject *joined = PyUnicode_Join(sep, parts);
        Py_DECREF(sep);
        if (joined == NULL) {
            goto done;
        }
        repr = PyUnicode_FromFormat("re.compile(%.200R, %S)",
                                    pat->pattern, joined);
        Py_DECREF(joined);
    }
    else {
        repr = PyUnicode_FromFormat("re.compile(%.200R)", pat->pattern);
    }

done:
    Py_DECREF(parts);
    return repr;
}
1598
1599
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1600
1601
/* PatternObject's 'groupindex' method. */
1602
static PyObject *
1603
pattern_groupindex(PyObject *op, void *Py_UNUSED(ignored))
1604
0
{
1605
0
    PatternObject *self = _PatternObject_CAST(op);
1606
0
    if (self->groupindex == NULL)
1607
0
        return PyDict_New();
1608
0
    return PyDictProxy_New(self->groupindex);
1609
0
}
1610
1611
static int _validate(PatternObject *self); /* Forward */
1612
1613
/*[clinic input]
1614
_sre.compile
1615
1616
    pattern: object
1617
    flags: int
1618
    code: object(subclass_of='&PyList_Type')
1619
    groups: Py_ssize_t
1620
    groupindex: object(subclass_of='&PyDict_Type')
1621
    indexgroup: object(subclass_of='&PyTuple_Type')
1622
1623
[clinic start generated code]*/
1624
1625
static PyObject *
1626
_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1627
                  PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1628
                  PyObject *indexgroup)
1629
/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1630
3.46k
{
1631
    /* "compile" pattern descriptor to pattern object */
1632
1633
3.46k
    _sremodulestate *module_state = get_sre_module_state(module);
1634
3.46k
    PatternObject* self;
1635
3.46k
    Py_ssize_t i, n;
1636
1637
3.46k
    n = PyList_GET_SIZE(code);
1638
    /* coverity[ampersand_in_size] */
1639
3.46k
    self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1640
3.46k
    if (!self)
1641
0
        return NULL;
1642
3.46k
    self->weakreflist = NULL;
1643
3.46k
    self->pattern = NULL;
1644
3.46k
    self->groupindex = NULL;
1645
3.46k
    self->indexgroup = NULL;
1646
#ifdef Py_DEBUG
1647
    self->fail_after_count = -1;
1648
    self->fail_after_exc = NULL;
1649
#endif
1650
1651
3.46k
    self->codesize = n;
1652
1653
98.8M
    for (i = 0; i < n; i++) {
1654
98.7M
        PyObject *o = PyList_GET_ITEM(code, i);
1655
98.7M
        unsigned long value = PyLong_AsUnsignedLong(o);
1656
98.7M
        if (value == (unsigned long)-1 && PyErr_Occurred()) {
1657
0
            break;
1658
0
        }
1659
98.7M
        self->code[i] = (SRE_CODE) value;
1660
98.7M
        if ((unsigned long) self->code[i] != value) {
1661
0
            PyErr_SetString(PyExc_OverflowError,
1662
0
                            "regular expression code size limit exceeded");
1663
0
            break;
1664
0
        }
1665
98.7M
    }
1666
3.46k
    PyObject_GC_Track(self);
1667
1668
3.46k
    if (PyErr_Occurred()) {
1669
0
        Py_DECREF(self);
1670
0
        return NULL;
1671
0
    }
1672
1673
3.46k
    if (pattern == Py_None) {
1674
0
        self->isbytes = -1;
1675
0
    }
1676
3.46k
    else {
1677
3.46k
        Py_ssize_t p_length;
1678
3.46k
        int charsize;
1679
3.46k
        Py_buffer view;
1680
3.46k
        view.buf = NULL;
1681
3.46k
        if (!getstring(pattern, &p_length, &self->isbytes,
1682
3.46k
                       &charsize, &view)) {
1683
0
            Py_DECREF(self);
1684
0
            return NULL;
1685
0
        }
1686
3.46k
        if (view.buf)
1687
56
            PyBuffer_Release(&view);
1688
3.46k
    }
1689
1690
3.46k
    self->pattern = Py_NewRef(pattern);
1691
1692
3.46k
    self->flags = flags;
1693
1694
3.46k
    self->groups = groups;
1695
1696
3.46k
    if (PyDict_GET_SIZE(groupindex) > 0) {
1697
66
        self->groupindex = Py_NewRef(groupindex);
1698
66
        if (PyTuple_GET_SIZE(indexgroup) > 0) {
1699
66
            self->indexgroup = Py_NewRef(indexgroup);
1700
66
        }
1701
66
    }
1702
1703
3.46k
    if (!_validate(self)) {
1704
0
        Py_DECREF(self);
1705
0
        return NULL;
1706
0
    }
1707
1708
3.46k
    return (PyObject*) self;
1709
3.46k
}
1710
1711
/*[clinic input]
1712
_sre.template
1713
1714
    pattern: object
1715
    template: object(subclass_of="&PyList_Type")
1716
        A list containing interleaved literal strings (str or bytes) and group
1717
        indices (int), as returned by re._parser.parse_template():
1718
            [literal1, group1, ..., literalN, groupN]
1719
    /
1720
1721
[clinic start generated code]*/
1722
1723
static PyObject *
_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
{
    /* template is a list containing interleaved literal strings (str or bytes)
     * and group indices (int), as returned by _parser.parse_template:
     * [literal1, group1, literal2, ..., literalN].
     *
     * The list must therefore have odd length.  Its first literal is stored
     * in self->literal; every following (group, literal) pair fills one
     * items[] slot.  Empty trailing literals are stored as NULL and are not
     * counted in self->chunks.
     */
    _sremodulestate *module_state = get_sre_module_state(module);
    TemplateObject *self = NULL;
    Py_ssize_t ngroups;
    Py_ssize_t length = PyList_GET_SIZE(template);

    if (length < 1 || (length & 1) == 0) {
        goto bad_template;
    }
    ngroups = length / 2;

    self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, ngroups);
    if (self == NULL) {
        return NULL;
    }
    self->chunks = 1 + 2*ngroups;
    self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));

    for (Py_ssize_t k = 0; k < ngroups; k++) {
        Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*k+1));
        if (index < 0) {
            /* shrink the object to the k slots filled so far before it
             * is released */
            Py_SET_SIZE(self, k);
            if (index == -1 && PyErr_Occurred()) {
                /* conversion error: keep the exception already set */
                Py_DECREF(self);
                return NULL;
            }
            goto bad_template;
        }
        self->items[k].index = index;

        PyObject *text = PyList_GET_ITEM(template, 2*k+2);
        // Skip empty literals.
        int empty =
            (PyUnicode_Check(text) && PyUnicode_GET_LENGTH(text) == 0) ||
            (PyBytes_Check(text) && PyBytes_GET_SIZE(text) == 0);
        if (empty) {
            self->items[k].literal = NULL;
            self->chunks--;
        }
        else {
            self->items[k].literal = Py_NewRef(text);
        }
    }
    PyObject_GC_Track(self);
    return (PyObject*) self;

bad_template:
    PyErr_SetString(PyExc_TypeError, "invalid template");
    Py_XDECREF(self);
    return NULL;
}
1774
1775
/* -------------------------------------------------------------------- */
1776
/* Code validation */
1777
1778
/* To learn more about this code, have a look at the _compile() function in
1779
   Lib/sre_compile.py.  The validation functions below check the code array
1780
   for conformance with the code patterns generated there.
1781
1782
   The nice thing about the generated code is that it is position-independent:
1783
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
1784
   the target of a later jump is always earlier than the target of an earlier
1785
   jump.  IOW, this is okay:
1786
1787
   J---------J-------T--------T
1788
    \         \_____/        /
1789
     \______________________/
1790
1791
   but this is not:
1792
1793
   J---------J-------T--------T
1794
    \_________\_____/        /
1795
               \____________/
1796
1797
   It also helps that SRE_CODE is always an unsigned type.
1798
*/
1799
1800
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf argument list: VTRACE(("fmt", args...)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return -1 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: the cursor
   `code`, the one-past-the-end pointer `end`, and the destination `op`,
   `arg` or `skip`.  Each one FAILs (returns -1 from the enclosing function)
   when the cursor is already at `end`. */

/* op = *code++ */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)

/* arg = *code++ */
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)

/* skip = *code++, additionally FAILing when skip - adj is larger than the
   number of code words left (counted from the skip word itself).  `adj` is
   parenthesized in the expansion so that an expression argument such as
   `a + b` is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip - (adj) > (uintptr_t)(end - code))     \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1840
1841
static int
1842
_validate_charset(SRE_CODE *code, SRE_CODE *end)
1843
2.71M
{
1844
    /* Some variables are manipulated by the macros above */
1845
2.71M
    SRE_CODE op;
1846
2.71M
    SRE_CODE arg;
1847
2.71M
    SRE_CODE offset;
1848
2.71M
    int i;
1849
1850
8.05M
    while (code < end) {
1851
5.34M
        GET_OP;
1852
5.34M
        switch (op) {
1853
1854
1.06k
        case SRE_OP_NEGATE:
1855
1.06k
            break;
1856
1857
5.24M
        case SRE_OP_LITERAL:
1858
5.24M
            GET_ARG;
1859
5.24M
            break;
1860
1861
5.24M
        case SRE_OP_RANGE:
1862
9.64k
        case SRE_OP_RANGE_UNI_IGNORE:
1863
9.64k
            GET_ARG;
1864
9.64k
            GET_ARG;
1865
9.64k
            break;
1866
1867
9.64k
        case SRE_OP_CHARSET:
1868
2.89k
            offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1869
2.89k
            if (offset > (uintptr_t)(end - code))
1870
0
                FAIL;
1871
2.89k
            code += offset;
1872
2.89k
            break;
1873
1874
84.4k
        case SRE_OP_BIGCHARSET:
1875
84.4k
            GET_ARG; /* Number of blocks */
1876
84.4k
            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1877
84.4k
            if (offset > (uintptr_t)(end - code))
1878
0
                FAIL;
1879
            /* Make sure that each byte points to a valid block */
1880
21.6M
            for (i = 0; i < 256; i++) {
1881
21.6M
                if (((unsigned char *)code)[i] >= arg)
1882
0
                    FAIL;
1883
21.6M
            }
1884
84.4k
            code += offset;
1885
84.4k
            offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1886
84.4k
            if (offset > (uintptr_t)(end - code))
1887
0
                FAIL;
1888
84.4k
            code += offset;
1889
84.4k
            break;
1890
1891
1.47k
        case SRE_OP_CATEGORY:
1892
1.47k
            GET_ARG;
1893
1.47k
            switch (arg) {
1894
36
            case SRE_CATEGORY_DIGIT:
1895
36
            case SRE_CATEGORY_NOT_DIGIT:
1896
72
            case SRE_CATEGORY_SPACE:
1897
72
            case SRE_CATEGORY_NOT_SPACE:
1898
100
            case SRE_CATEGORY_WORD:
1899
100
            case SRE_CATEGORY_NOT_WORD:
1900
100
            case SRE_CATEGORY_LINEBREAK:
1901
100
            case SRE_CATEGORY_NOT_LINEBREAK:
1902
100
            case SRE_CATEGORY_LOC_WORD:
1903
100
            case SRE_CATEGORY_LOC_NOT_WORD:
1904
229
            case SRE_CATEGORY_UNI_DIGIT:
1905
561
            case SRE_CATEGORY_UNI_NOT_DIGIT:
1906
1.29k
            case SRE_CATEGORY_UNI_SPACE:
1907
1.31k
            case SRE_CATEGORY_UNI_NOT_SPACE:
1908
1.45k
            case SRE_CATEGORY_UNI_WORD:
1909
1.47k
            case SRE_CATEGORY_UNI_NOT_WORD:
1910
1.47k
            case SRE_CATEGORY_UNI_LINEBREAK:
1911
1.47k
            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1912
1.47k
                break;
1913
0
            default:
1914
0
                FAIL;
1915
1.47k
            }
1916
1.47k
            break;
1917
1918
1.47k
        default:
1919
0
            FAIL;
1920
1921
5.34M
        }
1922
5.34M
    }
1923
1924
2.71M
    return 0;
1925
2.71M
}
1926
1927
/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1928
static int
1929
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1930
2.23M
{
1931
    /* Some variables are manipulated by the macros above */
1932
2.23M
    SRE_CODE op;
1933
2.23M
    SRE_CODE arg;
1934
2.23M
    SRE_CODE skip;
1935
1936
2.23M
    VTRACE(("code=%p, end=%p\n", code, end));
1937
1938
2.23M
    if (code > end)
1939
0
        FAIL;
1940
1941
30.1M
    while (code < end) {
1942
27.8M
        GET_OP;
1943
27.8M
        switch (op) {
1944
1945
114k
        case SRE_OP_MARK:
1946
            /* We don't check whether marks are properly nested; the
1947
               sre_match() code is robust even if they don't, and the worst
1948
               you can get is nonsensical match results. */
1949
114k
            GET_ARG;
1950
114k
            if (arg >= 2 * (size_t)groups) {
1951
0
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1952
0
                FAIL;
1953
0
            }
1954
114k
            break;
1955
1956
18.6M
        case SRE_OP_LITERAL:
1957
18.6M
        case SRE_OP_NOT_LITERAL:
1958
18.6M
        case SRE_OP_LITERAL_IGNORE:
1959
18.6M
        case SRE_OP_NOT_LITERAL_IGNORE:
1960
23.7M
        case SRE_OP_LITERAL_UNI_IGNORE:
1961
23.7M
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1962
23.7M
        case SRE_OP_LITERAL_LOC_IGNORE:
1963
23.7M
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1964
23.7M
            GET_ARG;
1965
            /* The arg is just a character, nothing to check */
1966
23.7M
            break;
1967
1968
23.7M
        case SRE_OP_SUCCESS:
1969
34
        case SRE_OP_FAILURE:
1970
            /* Nothing to check; these normally end the matching process */
1971
34
            break;
1972
1973
24.6k
        case SRE_OP_AT:
1974
24.6k
            GET_ARG;
1975
24.6k
            switch (arg) {
1976
57
            case SRE_AT_BEGINNING:
1977
65
            case SRE_AT_BEGINNING_STRING:
1978
2.59k
            case SRE_AT_BEGINNING_LINE:
1979
2.68k
            case SRE_AT_END:
1980
20.7k
            case SRE_AT_END_LINE:
1981
20.7k
            case SRE_AT_END_STRING:
1982
20.7k
            case SRE_AT_BOUNDARY:
1983
20.7k
            case SRE_AT_NON_BOUNDARY:
1984
20.7k
            case SRE_AT_LOC_BOUNDARY:
1985
20.7k
            case SRE_AT_LOC_NON_BOUNDARY:
1986
24.6k
            case SRE_AT_UNI_BOUNDARY:
1987
24.6k
            case SRE_AT_UNI_NON_BOUNDARY:
1988
24.6k
                break;
1989
0
            default:
1990
0
                FAIL;
1991
24.6k
            }
1992
24.6k
            break;
1993
1994
24.6k
        case SRE_OP_ANY:
1995
6.84k
        case SRE_OP_ANY_ALL:
1996
            /* These have no operands */
1997
6.84k
            break;
1998
1999
6.20k
        case SRE_OP_IN:
2000
6.55k
        case SRE_OP_IN_IGNORE:
2001
2.71M
        case SRE_OP_IN_UNI_IGNORE:
2002
2.71M
        case SRE_OP_IN_LOC_IGNORE:
2003
2.71M
            GET_SKIP;
2004
            /* Stop 1 before the end; we check the FAILURE below */
2005
2.71M
            if (_validate_charset(code, code+skip-2))
2006
0
                FAIL;
2007
2.71M
            if (code[skip-2] != SRE_OP_FAILURE)
2008
0
                FAIL;
2009
2.71M
            code += skip-1;
2010
2.71M
            break;
2011
2012
3.46k
        case SRE_OP_INFO:
2013
3.46k
            {
2014
                /* A minimal info field is
2015
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2016
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2017
                   more follows. */
2018
3.46k
                SRE_CODE flags, i;
2019
3.46k
                SRE_CODE *newcode;
2020
3.46k
                GET_SKIP;
2021
3.46k
                newcode = code+skip-1;
2022
3.46k
                GET_ARG; flags = arg;
2023
3.46k
                GET_ARG;
2024
3.46k
                GET_ARG;
2025
                /* Check that only valid flags are present */
2026
3.46k
                if ((flags & ~(SRE_INFO_PREFIX |
2027
3.46k
                               SRE_INFO_LITERAL |
2028
3.46k
                               SRE_INFO_CHARSET)) != 0)
2029
0
                    FAIL;
2030
                /* PREFIX and CHARSET are mutually exclusive */
2031
3.46k
                if ((flags & SRE_INFO_PREFIX) &&
2032
1.45k
                    (flags & SRE_INFO_CHARSET))
2033
0
                    FAIL;
2034
                /* LITERAL implies PREFIX */
2035
3.46k
                if ((flags & SRE_INFO_LITERAL) &&
2036
606
                    !(flags & SRE_INFO_PREFIX))
2037
0
                    FAIL;
2038
                /* Validate the prefix */
2039
3.46k
                if (flags & SRE_INFO_PREFIX) {
2040
1.45k
                    SRE_CODE prefix_len;
2041
1.45k
                    GET_ARG; prefix_len = arg;
2042
1.45k
                    GET_ARG;
2043
                    /* Here comes the prefix string */
2044
1.45k
                    if (prefix_len > (uintptr_t)(newcode - code))
2045
0
                        FAIL;
2046
1.45k
                    code += prefix_len;
2047
                    /* And here comes the overlap table */
2048
1.45k
                    if (prefix_len > (uintptr_t)(newcode - code))
2049
0
                        FAIL;
2050
                    /* Each overlap value should be < prefix_len */
2051
7.65M
                    for (i = 0; i < prefix_len; i++) {
2052
7.65M
                        if (code[i] >= prefix_len)
2053
0
                            FAIL;
2054
7.65M
                    }
2055
1.45k
                    code += prefix_len;
2056
1.45k
                }
2057
                /* Validate the charset */
2058
3.46k
                if (flags & SRE_INFO_CHARSET) {
2059
425
                    if (_validate_charset(code, newcode-1))
2060
0
                        FAIL;
2061
425
                    if (newcode[-1] != SRE_OP_FAILURE)
2062
0
                        FAIL;
2063
425
                    code = newcode;
2064
425
                }
2065
3.04k
                else if (code != newcode) {
2066
0
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
2067
0
                    FAIL;
2068
0
                }
2069
3.46k
            }
2070
3.46k
            break;
2071
2072
20.5k
        case SRE_OP_BRANCH:
2073
20.5k
            {
2074
20.5k
                SRE_CODE *target = NULL;
2075
1.00M
                for (;;) {
2076
1.00M
                    GET_SKIP;
2077
1.00M
                    if (skip == 0)
2078
20.5k
                        break;
2079
                    /* Stop 2 before the end; we check the JUMP below */
2080
982k
                    if (_validate_inner(code, code+skip-3, groups))
2081
0
                        FAIL;
2082
982k
                    code += skip-3;
2083
                    /* Check that it ends with a JUMP, and that each JUMP
2084
                       has the same target */
2085
982k
                    GET_OP;
2086
982k
                    if (op != SRE_OP_JUMP)
2087
0
                        FAIL;
2088
982k
                    GET_SKIP;
2089
982k
                    if (target == NULL)
2090
20.5k
                        target = code+skip-1;
2091
961k
                    else if (code+skip-1 != target)
2092
0
                        FAIL;
2093
982k
                }
2094
20.5k
                if (code != target)
2095
0
                    FAIL;
2096
20.5k
            }
2097
20.5k
            break;
2098
2099
1.22M
        case SRE_OP_REPEAT_ONE:
2100
1.22M
        case SRE_OP_MIN_REPEAT_ONE:
2101
1.22M
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
2102
1.22M
            {
2103
1.22M
                SRE_CODE min, max;
2104
1.22M
                GET_SKIP;
2105
1.22M
                GET_ARG; min = arg;
2106
1.22M
                GET_ARG; max = arg;
2107
1.22M
                if (min > max)
2108
0
                    FAIL;
2109
1.22M
                if (max > SRE_MAXREPEAT)
2110
0
                    FAIL;
2111
1.22M
                if (_validate_inner(code, code+skip-4, groups))
2112
0
                    FAIL;
2113
1.22M
                code += skip-4;
2114
1.22M
                GET_OP;
2115
1.22M
                if (op != SRE_OP_SUCCESS)
2116
0
                    FAIL;
2117
1.22M
            }
2118
1.22M
            break;
2119
2120
1.22M
        case SRE_OP_REPEAT:
2121
19.2k
        case SRE_OP_POSSESSIVE_REPEAT:
2122
19.2k
            {
2123
19.2k
                SRE_CODE op1 = op, min, max;
2124
19.2k
                GET_SKIP;
2125
19.2k
                GET_ARG; min = arg;
2126
19.2k
                GET_ARG; max = arg;
2127
19.2k
                if (min > max)
2128
0
                    FAIL;
2129
19.2k
                if (max > SRE_MAXREPEAT)
2130
0
                    FAIL;
2131
19.2k
                if (_validate_inner(code, code+skip-3, groups))
2132
0
                    FAIL;
2133
19.2k
                code += skip-3;
2134
19.2k
                GET_OP;
2135
19.2k
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
2136
43
                    if (op != SRE_OP_SUCCESS)
2137
0
                        FAIL;
2138
43
                }
2139
19.2k
                else {
2140
19.2k
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
2141
0
                        FAIL;
2142
19.2k
                }
2143
19.2k
            }
2144
19.2k
            break;
2145
2146
19.2k
        case SRE_OP_ATOMIC_GROUP:
2147
22
            {
2148
22
                GET_SKIP;
2149
22
                if (_validate_inner(code, code+skip-2, groups))
2150
0
                    FAIL;
2151
22
                code += skip-2;
2152
22
                GET_OP;
2153
22
                if (op != SRE_OP_SUCCESS)
2154
0
                    FAIL;
2155
22
            }
2156
22
            break;
2157
2158
22
        case SRE_OP_GROUPREF:
2159
0
        case SRE_OP_GROUPREF_IGNORE:
2160
772
        case SRE_OP_GROUPREF_UNI_IGNORE:
2161
772
        case SRE_OP_GROUPREF_LOC_IGNORE:
2162
772
            GET_ARG;
2163
772
            if (arg >= (size_t)groups)
2164
0
                FAIL;
2165
772
            break;
2166
2167
772
        case SRE_OP_GROUPREF_EXISTS:
2168
            /* The regex syntax for this is: '(?(group)then|else)', where
2169
               'group' is either an integer group number or a group name,
2170
               'then' and 'else' are sub-regexes, and 'else' is optional. */
2171
47
            GET_ARG;
2172
47
            if (arg >= (size_t)groups)
2173
0
                FAIL;
2174
47
            GET_SKIP_ADJ(1);
2175
47
            code--; /* The skip is relative to the first arg! */
2176
            /* There are two possibilities here: if there is both a 'then'
2177
               part and an 'else' part, the generated code looks like:
2178
2179
               GROUPREF_EXISTS
2180
               <group>
2181
               <skipyes>
2182
               ...then part...
2183
               JUMP
2184
               <skipno>
2185
               (<skipyes> jumps here)
2186
               ...else part...
2187
               (<skipno> jumps here)
2188
2189
               If there is only a 'then' part, it looks like:
2190
2191
               GROUPREF_EXISTS
2192
               <group>
2193
               <skip>
2194
               ...then part...
2195
               (<skip> jumps here)
2196
2197
               There is no direct way to decide which it is, and we don't want
2198
               to allow arbitrary jumps anywhere in the code; so we just look
2199
               for a JUMP opcode preceding our skip target.
2200
            */
2201
47
            VTRACE(("then part:\n"));
2202
47
            int rc = _validate_inner(code+1, code+skip-1, groups);
2203
47
            if (rc == 1) {
2204
40
                VTRACE(("else part:\n"));
2205
40
                code += skip-2; /* Position after JUMP, at <skipno> */
2206
40
                GET_SKIP;
2207
40
                rc = _validate_inner(code, code+skip-1, groups);
2208
40
            }
2209
47
            if (rc)
2210
0
                FAIL;
2211
47
            code += skip-1;
2212
47
            break;
2213
2214
102
        case SRE_OP_ASSERT:
2215
355
        case SRE_OP_ASSERT_NOT:
2216
355
            GET_SKIP;
2217
355
            GET_ARG; /* 0 for lookahead, width for lookbehind */
2218
355
            code--; /* Back up over arg to simplify math below */
2219
            /* Stop 1 before the end; we check the SUCCESS below */
2220
355
            if (_validate_inner(code+1, code+skip-2, groups))
2221
0
                FAIL;
2222
355
            code += skip-2;
2223
355
            GET_OP;
2224
355
            if (op != SRE_OP_SUCCESS)
2225
0
                FAIL;
2226
355
            break;
2227
2228
355
        case SRE_OP_JUMP:
2229
40
            if (code + 1 != end)
2230
0
                FAIL;
2231
40
            VTRACE(("JUMP: %d\n", __LINE__));
2232
40
            return 1;
2233
2234
0
        default:
2235
0
            FAIL;
2236
2237
27.8M
        }
2238
27.8M
    }
2239
2240
2.23M
    VTRACE(("okay\n"));
2241
2.23M
    return 0;
2242
2.23M
}
2243
2244
static int
2245
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2246
3.46k
{
2247
3.46k
    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2248
3.46k
        code >= end || end[-1] != SRE_OP_SUCCESS)
2249
0
        FAIL;
2250
3.46k
    return _validate_inner(code, end-1, groups);
2251
3.46k
}
2252
2253
static int
2254
_validate(PatternObject *self)
2255
3.46k
{
2256
3.46k
    if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2257
0
    {
2258
0
        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2259
0
        return 0;
2260
0
    }
2261
3.46k
    else
2262
3.46k
        VTRACE(("Success!\n"));
2263
3.46k
    return 1;
2264
3.46k
}
2265
2266
/* -------------------------------------------------------------------- */
2267
/* match methods */
2268
2269
static int
2270
match_traverse(PyObject *op, visitproc visit, void *arg)
2271
5.85k
{
2272
5.85k
    MatchObject *self = _MatchObject_CAST(op);
2273
5.85k
    Py_VISIT(Py_TYPE(self));
2274
5.85k
    Py_VISIT(self->string);
2275
5.85k
    Py_VISIT(self->regs);
2276
5.85k
    Py_VISIT(self->pattern);
2277
5.85k
    return 0;
2278
5.85k
}
2279
2280
static int
2281
match_clear(PyObject *op)
2282
59.2M
{
2283
59.2M
    MatchObject *self = _MatchObject_CAST(op);
2284
59.2M
    Py_CLEAR(self->string);
2285
59.2M
    Py_CLEAR(self->regs);
2286
59.2M
    Py_CLEAR(self->pattern);
2287
59.2M
    return 0;
2288
59.2M
}
2289
2290
static void
2291
match_dealloc(PyObject *self)
2292
59.2M
{
2293
59.2M
    PyTypeObject *tp = Py_TYPE(self);
2294
59.2M
    PyObject_GC_UnTrack(self);
2295
59.2M
    (void)match_clear(self);
2296
59.2M
    tp->tp_free(self);
2297
59.2M
    Py_DECREF(tp);
2298
59.2M
}
2299
2300
static PyObject*
2301
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2302
51.5M
{
2303
51.5M
    Py_ssize_t length;
2304
51.5M
    int isbytes, charsize;
2305
51.5M
    Py_buffer view;
2306
51.5M
    PyObject *result;
2307
51.5M
    const void* ptr;
2308
51.5M
    Py_ssize_t i, j;
2309
2310
51.5M
    assert(0 <= index && index < self->groups);
2311
51.5M
    index *= 2;
2312
2313
51.5M
    if (self->string == Py_None || self->mark[index] < 0) {
2314
        /* return default value if the string or group is undefined */
2315
8.60M
        return Py_NewRef(def);
2316
8.60M
    }
2317
2318
42.9M
    ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2319
42.9M
    if (ptr == NULL)
2320
0
        return NULL;
2321
2322
42.9M
    i = self->mark[index];
2323
42.9M
    j = self->mark[index+1];
2324
42.9M
    i = Py_MIN(i, length);
2325
42.9M
    j = Py_MIN(j, length);
2326
42.9M
    result = getslice(isbytes, ptr, self->string, i, j);
2327
42.9M
    if (isbytes && view.buf != NULL)
2328
30.7k
        PyBuffer_Release(&view);
2329
42.9M
    return result;
2330
42.9M
}
2331
2332
static Py_ssize_t
2333
match_getindex(MatchObject* self, PyObject* index)
2334
69.7M
{
2335
69.7M
    Py_ssize_t i;
2336
2337
69.7M
    if (index == NULL)
2338
        /* Default value */
2339
18.2M
        return 0;
2340
2341
51.5M
    if (PyIndex_Check(index)) {
2342
33.1M
        i = PyNumber_AsSsize_t(index, NULL);
2343
33.1M
    }
2344
18.4M
    else {
2345
18.4M
        i = -1;
2346
2347
18.4M
        if (self->pattern->groupindex) {
2348
18.4M
            index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2349
18.4M
            if (index && PyLong_Check(index)) {
2350
18.4M
                i = PyLong_AsSsize_t(index);
2351
18.4M
            }
2352
18.4M
        }
2353
18.4M
    }
2354
51.5M
    if (i < 0 || i >= self->groups) {
2355
        /* raise IndexError if we were given a bad group number */
2356
0
        if (!PyErr_Occurred()) {
2357
0
            PyErr_SetString(PyExc_IndexError, "no such group");
2358
0
        }
2359
0
        return -1;
2360
0
    }
2361
2362
    // Check that i*2 cannot overflow to make static analyzers happy
2363
51.5M
    assert((size_t)i <= SRE_MAXGROUPS);
2364
51.5M
    return i;
2365
51.5M
}
2366
2367
static PyObject*
2368
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2369
51.5M
{
2370
51.5M
    Py_ssize_t i = match_getindex(self, index);
2371
2372
51.5M
    if (i < 0) {
2373
0
        return NULL;
2374
0
    }
2375
2376
51.5M
    return match_getslice_by_index(self, i, def);
2377
51.5M
}
2378
2379
/*[clinic input]
2380
@permit_long_summary
2381
_sre.SRE_Match.expand
2382
2383
    template: object
2384
2385
Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2386
[clinic start generated code]*/
2387
2388
static PyObject *
2389
_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2390
/*[clinic end generated code: output=931b58ccc323c3a1 input=dc74d81265376ac3]*/
2391
0
{
2392
0
    _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2393
0
    PyObject *filter = compile_template(module_state, self->pattern, template);
2394
0
    if (filter == NULL) {
2395
0
        return NULL;
2396
0
    }
2397
0
    PyObject *result = expand_template((TemplateObject *)filter, self);
2398
0
    Py_DECREF(filter);
2399
0
    return result;
2400
0
}
2401
2402
/* Implementation of Match.group(*args).

   With no arguments returns group 0; with one argument returns that
   group; with several arguments returns a tuple holding one slice per
   argument.  Undefined groups yield None.  Returns NULL on error. */
static PyObject*
match_group(PyObject *op, PyObject* args)
{
    MatchObject *self = _MatchObject_CAST(op);
    const Py_ssize_t nargs = PyTuple_GET_SIZE(args);

    if (nargs == 0) {
        return match_getslice(self, _PyLong_GetZero(), Py_None);
    }
    if (nargs == 1) {
        return match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
    }

    /* fetch multiple items */
    PyObject *result = PyTuple_New(nargs);
    if (result == NULL) {
        return NULL;
    }
    for (Py_ssize_t pos = 0; pos < nargs; pos++) {
        PyObject *slice = match_getslice(self, PyTuple_GET_ITEM(args, pos),
                                         Py_None);
        if (slice == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyTuple_SET_ITEM(result, pos, slice);
    }
    return result;
}
2437
2438
/* Subscript hook: match[name] is the slice for that group, with None
   for a group that is undefined. */
static PyObject*
match_getitem(PyObject *op, PyObject* name)
{
    return match_getslice(_MatchObject_CAST(op), name, Py_None);
}
2444
2445
/*[clinic input]
2446
_sre.SRE_Match.groups
2447
2448
    default: object = None
2449
        Is used for groups that did not participate in the match.
2450
2451
Return a tuple containing all the subgroups of the match, from 1.
2452
[clinic start generated code]*/
2453
2454
static PyObject *
2455
_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2456
/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2457
253
{
2458
253
    PyObject* result;
2459
253
    Py_ssize_t index;
2460
2461
253
    result = PyTuple_New(self->groups-1);
2462
253
    if (!result)
2463
0
        return NULL;
2464
2465
1.96k
    for (index = 1; index < self->groups; index++) {
2466
1.71k
        PyObject* item;
2467
1.71k
        item = match_getslice_by_index(self, index, default_value);
2468
1.71k
        if (!item) {
2469
0
            Py_DECREF(result);
2470
0
            return NULL;
2471
0
        }
2472
1.71k
        PyTuple_SET_ITEM(result, index-1, item);
2473
1.71k
    }
2474
2475
253
    return result;
2476
253
}
2477
2478
/*[clinic input]
2479
@permit_long_summary
2480
_sre.SRE_Match.groupdict
2481
2482
    default: object = None
2483
        Is used for groups that did not participate in the match.
2484
2485
Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2486
[clinic start generated code]*/
2487
2488
static PyObject *
2489
_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2490
/*[clinic end generated code: output=29917c9073e41757 input=a8d3a1dc80336872]*/
2491
49
{
2492
49
    PyObject *result;
2493
49
    PyObject *key;
2494
49
    PyObject *value;
2495
49
    Py_ssize_t pos = 0;
2496
49
    Py_hash_t hash;
2497
2498
49
    result = PyDict_New();
2499
49
    if (!result || !self->pattern->groupindex)
2500
0
        return result;
2501
2502
49
    Py_BEGIN_CRITICAL_SECTION(self->pattern->groupindex);
2503
295
    while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2504
246
        int status;
2505
246
        Py_INCREF(key);
2506
246
        value = match_getslice(self, key, default_value);
2507
246
        if (!value) {
2508
0
            Py_DECREF(key);
2509
0
            Py_CLEAR(result);
2510
0
            goto exit;
2511
0
        }
2512
246
        status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2513
246
        Py_DECREF(value);
2514
246
        Py_DECREF(key);
2515
246
        if (status < 0) {
2516
0
            Py_CLEAR(result);
2517
0
            goto exit;
2518
0
        }
2519
246
    }
2520
49
exit:;
2521
49
    Py_END_CRITICAL_SECTION();
2522
2523
49
    return result;
2524
49
}
2525
2526
/*[clinic input]
2527
_sre.SRE_Match.start -> Py_ssize_t
2528
2529
    group: object(c_default="NULL") = 0
2530
    /
2531
2532
Return index of the start of the substring matched by group.
2533
[clinic start generated code]*/
2534
2535
static Py_ssize_t
2536
_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2537
/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2538
4.78M
{
2539
4.78M
    Py_ssize_t index = match_getindex(self, group);
2540
2541
4.78M
    if (index < 0) {
2542
0
        return -1;
2543
0
    }
2544
2545
    /* mark is -1 if group is undefined */
2546
4.78M
    return self->mark[index*2];
2547
4.78M
}
2548
2549
/*[clinic input]
2550
_sre.SRE_Match.end -> Py_ssize_t
2551
2552
    group: object(c_default="NULL") = 0
2553
    /
2554
2555
Return index of the end of the substring matched by group.
2556
[clinic start generated code]*/
2557
2558
static Py_ssize_t
2559
_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2560
/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2561
10.3M
{
2562
10.3M
    Py_ssize_t index = match_getindex(self, group);
2563
2564
10.3M
    if (index < 0) {
2565
0
        return -1;
2566
0
    }
2567
2568
    /* mark is -1 if group is undefined */
2569
10.3M
    return self->mark[index*2+1];
2570
10.3M
}
2571
2572
/* Build the 2-tuple (i1, i2) of Python ints.  Returns a new reference,
   or NULL if any allocation fails. */
LOCAL(PyObject*)
_pair(Py_ssize_t i1, Py_ssize_t i2)
{
    PyObject *pair = PyTuple_New(2);
    if (pair == NULL) {
        return NULL;
    }

    PyObject *first = PyLong_FromSsize_t(i1);
    if (first == NULL) {
        Py_DECREF(pair);
        return NULL;
    }
    PyTuple_SET_ITEM(pair, 0, first);

    PyObject *second = PyLong_FromSsize_t(i2);
    if (second == NULL) {
        Py_DECREF(pair);
        return NULL;
    }
    PyTuple_SET_ITEM(pair, 1, second);

    return pair;
}
2598
2599
/*[clinic input]
2600
_sre.SRE_Match.span
2601
2602
    group: object(c_default="NULL") = 0
2603
    /
2604
2605
For match object m, return the 2-tuple (m.start(group), m.end(group)).
2606
[clinic start generated code]*/
2607
2608
static PyObject *
2609
_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2610
/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2611
3.15M
{
2612
3.15M
    Py_ssize_t index = match_getindex(self, group);
2613
2614
3.15M
    if (index < 0) {
2615
0
        return NULL;
2616
0
    }
2617
2618
    /* marks are -1 if group is undefined */
2619
3.15M
    return _pair(self->mark[index*2], self->mark[index*2+1]);
2620
3.15M
}
2621
2622
static PyObject*
2623
match_regs(MatchObject* self)
2624
0
{
2625
0
    PyObject* regs;
2626
0
    PyObject* item;
2627
0
    Py_ssize_t index;
2628
2629
0
    regs = PyTuple_New(self->groups);
2630
0
    if (!regs)
2631
0
        return NULL;
2632
2633
0
    for (index = 0; index < self->groups; index++) {
2634
0
        item = _pair(self->mark[index*2], self->mark[index*2+1]);
2635
0
        if (!item) {
2636
0
            Py_DECREF(regs);
2637
0
            return NULL;
2638
0
        }
2639
0
        PyTuple_SET_ITEM(regs, index, item);
2640
0
    }
2641
2642
0
    self->regs = Py_NewRef(regs);
2643
2644
0
    return regs;
2645
0
}
2646
2647
/*[clinic input]
2648
_sre.SRE_Match.__copy__
2649
2650
[clinic start generated code]*/
2651
2652
static PyObject *
2653
_sre_SRE_Match___copy___impl(MatchObject *self)
2654
/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2655
0
{
2656
0
    return Py_NewRef(self);
2657
0
}
2658
2659
/*[clinic input]
2660
_sre.SRE_Match.__deepcopy__
2661
2662
    memo: object
2663
    /
2664
2665
[clinic start generated code]*/
2666
2667
static PyObject *
2668
_sre_SRE_Match___deepcopy___impl(MatchObject *self, PyObject *memo)
2669
/*[clinic end generated code: output=2b657578eb03f4a3 input=779d12a31c2c325e]*/
2670
0
{
2671
0
    return Py_NewRef(self);
2672
0
}
2673
2674
/* Docstring constants: match_doc describes the match type itself and
   match_group_doc describes its group() method. */
PyDoc_STRVAR(match_doc,
2675
"The result of re.search(), re.prefixmatch(), and re.fullmatch().\n\
2676
Match objects always have a boolean value of True.");
2677
2678
PyDoc_STRVAR(match_group_doc,
2679
"group([group1, ...]) -> str or tuple.\n\
2680
    Return subgroup(s) of the match by indices or names.\n\
2681
    For 0 returns the entire match.");
2682
2683
/* Getter for Match.lastindex: the stored lastindex as an int, or None
   when it is negative. */
static PyObject *
match_lastindex_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (self->lastindex < 0) {
        Py_RETURN_NONE;
    }
    return PyLong_FromSsize_t(self->lastindex);
}
2691
2692
/* Getter for Match.lastgroup: the entry of the pattern's indexgroup
   tuple at position lastindex, or None when there is no indexgroup
   tuple or lastindex does not fall inside it. */
static PyObject *
match_lastgroup_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (!self->pattern->indexgroup) {
        Py_RETURN_NONE;
    }
    if (self->lastindex < 0) {
        Py_RETURN_NONE;
    }
    if (self->lastindex >= PyTuple_GET_SIZE(self->pattern->indexgroup)) {
        Py_RETURN_NONE;
    }

    PyObject *name = PyTuple_GET_ITEM(self->pattern->indexgroup,
                                      self->lastindex);
    return Py_NewRef(name);
}
2706
2707
/* Getter for Match.regs: returns the cached tuple when there is one,
   otherwise builds (and caches) it via match_regs(). */
static PyObject *
match_regs_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (self->regs == NULL) {
        return match_regs(self);
    }
    return Py_NewRef(self->regs);
}
2716
2717
static PyObject *
match_repr(PyObject *op)
{
    /* repr() of a match: type name, the span stored in mark[0]/mark[1],
       and the repr of group 0 (the format limits it to 50 characters). */
    MatchObject *match = _MatchObject_CAST(op);

    PyObject *whole = match_getslice_by_index(match, 0, Py_None);
    if (!whole) {
        return NULL;
    }

    PyObject *text = PyUnicode_FromFormat(
            "<%s object; span=(%zd, %zd), match=%.50R>",
            Py_TYPE(match)->tp_name,
            match->mark[0], match->mark[1], whole);

    /* The temporary slice is released whether or not formatting worked. */
    Py_DECREF(whole);
    return text;
}
2732
2733
2734
static PyObject*
2735
pattern_new_match(_sremodulestate* module_state,
2736
                  PatternObject* pattern,
2737
                  SRE_STATE* state,
2738
                  Py_ssize_t status)
2739
77.3M
{
2740
    /* create match object (from state object) */
2741
2742
77.3M
    MatchObject* match;
2743
77.3M
    Py_ssize_t i, j;
2744
77.3M
    char* base;
2745
77.3M
    int n;
2746
2747
77.3M
    if (status > 0) {
2748
2749
        /* create match object (with room for extra group marks) */
2750
        /* coverity[ampersand_in_size] */
2751
59.2M
        match = PyObject_GC_NewVar(MatchObject,
2752
59.2M
                                   module_state->Match_Type,
2753
59.2M
                                   2*(pattern->groups+1));
2754
59.2M
        if (!match)
2755
0
            return NULL;
2756
2757
59.2M
        Py_INCREF(pattern);
2758
59.2M
        match->pattern = pattern;
2759
2760
59.2M
        match->string = Py_NewRef(state->string);
2761
2762
59.2M
        match->regs = NULL;
2763
59.2M
        match->groups = pattern->groups+1;
2764
2765
        /* fill in group slices */
2766
2767
59.2M
        base = (char*) state->beginning;
2768
59.2M
        n = state->charsize;
2769
2770
59.2M
        match->mark[0] = ((char*) state->start - base) / n;
2771
59.2M
        match->mark[1] = ((char*) state->ptr - base) / n;
2772
2773
112M
        for (i = j = 0; i < pattern->groups; i++, j+=2)
2774
53.1M
            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2775
44.0M
                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2776
44.0M
                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2777
2778
                /* check wrong span */
2779
44.0M
                if (match->mark[j+2] > match->mark[j+3]) {
2780
0
                    PyErr_SetString(PyExc_SystemError,
2781
0
                                    "The span of capturing group is wrong,"
2782
0
                                    " please report a bug for the re module.");
2783
0
                    Py_DECREF(match);
2784
0
                    return NULL;
2785
0
                }
2786
44.0M
            } else
2787
9.15M
                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2788
2789
59.2M
        match->pos = state->pos;
2790
59.2M
        match->endpos = state->endpos;
2791
2792
59.2M
        match->lastindex = state->lastindex;
2793
2794
59.2M
        PyObject_GC_Track(match);
2795
59.2M
        return (PyObject*) match;
2796
2797
59.2M
    } else if (status == 0) {
2798
2799
        /* no match */
2800
18.0M
        Py_RETURN_NONE;
2801
2802
18.0M
    }
2803
2804
    /* internal error */
2805
0
    pattern_error(status);
2806
0
    return NULL;
2807
77.3M
}
2808
2809
2810
/* -------------------------------------------------------------------- */
2811
/* scanner methods (experimental) */
2812
2813
static int
2814
scanner_traverse(PyObject *op, visitproc visit, void *arg)
2815
196
{
2816
196
    ScannerObject *self = _ScannerObject_CAST(op);
2817
196
    Py_VISIT(Py_TYPE(self));
2818
196
    Py_VISIT(self->pattern);
2819
196
    return 0;
2820
196
}
2821
2822
static int
2823
scanner_clear(PyObject *op)
2824
386k
{
2825
386k
    ScannerObject *self = _ScannerObject_CAST(op);
2826
386k
    Py_CLEAR(self->pattern);
2827
386k
    return 0;
2828
386k
}
2829
2830
static void
2831
scanner_dealloc(PyObject *self)
2832
386k
{
2833
386k
    PyTypeObject *tp = Py_TYPE(self);
2834
386k
    PyObject_GC_UnTrack(self);
2835
386k
    ScannerObject *scanner = _ScannerObject_CAST(self);
2836
386k
    state_fini(&scanner->state);
2837
386k
    (void)scanner_clear(self);
2838
386k
    tp->tp_free(self);
2839
386k
    Py_DECREF(tp);
2840
386k
}
2841
2842
static int
2843
scanner_begin(ScannerObject* self)
2844
3.41M
{
2845
#ifdef Py_GIL_DISABLED
2846
    int was_executing = _Py_atomic_exchange_int(&self->executing, 1);
2847
#else
2848
3.41M
    int was_executing = self->executing;
2849
3.41M
    self->executing = 1;
2850
3.41M
#endif
2851
3.41M
    if (was_executing) {
2852
0
        PyErr_SetString(PyExc_ValueError,
2853
0
                        "regular expression scanner already executing");
2854
0
        return 0;
2855
0
    }
2856
3.41M
    return 1;
2857
3.41M
}
2858
2859
static void
2860
scanner_end(ScannerObject* self)
2861
3.41M
{
2862
3.41M
    assert(FT_ATOMIC_LOAD_INT_RELAXED(self->executing));
2863
3.41M
    FT_ATOMIC_STORE_INT(self->executing, 0);
2864
3.41M
}
2865
2866
/*[clinic input]
2867
_sre.SRE_Scanner.prefixmatch
2868
2869
    cls: defining_class
2870
    /
2871
2872
[clinic start generated code]*/
2873
2874
static PyObject *
2875
_sre_SRE_Scanner_prefixmatch_impl(ScannerObject *self, PyTypeObject *cls)
2876
/*[clinic end generated code: output=02b3b9d2954a2157 input=3049b20466c56a8e]*/
2877
0
{
2878
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2879
0
    SRE_STATE* state = &self->state;
2880
0
    PyObject* match;
2881
0
    Py_ssize_t status;
2882
2883
0
    if (!scanner_begin(self)) {
2884
0
        return NULL;
2885
0
    }
2886
0
    if (state->start == NULL) {
2887
0
        scanner_end(self);
2888
0
        Py_RETURN_NONE;
2889
0
    }
2890
2891
0
    state_reset(state);
2892
2893
0
    state->ptr = state->start;
2894
2895
0
    status = sre_match(state, PatternObject_GetCode(self->pattern));
2896
0
    if (PyErr_Occurred()) {
2897
0
        scanner_end(self);
2898
0
        return NULL;
2899
0
    }
2900
2901
0
    match = pattern_new_match(module_state, self->pattern,
2902
0
                              state, status);
2903
2904
0
    if (status == 0)
2905
0
        state->start = NULL;
2906
0
    else {
2907
0
        state->must_advance = (state->ptr == state->start);
2908
0
        state->start = state->ptr;
2909
0
    }
2910
2911
0
    scanner_end(self);
2912
0
    return match;
2913
0
}
2914
2915
2916
/*[clinic input]
2917
_sre.SRE_Scanner.search
2918
2919
    cls: defining_class
2920
    /
2921
2922
[clinic start generated code]*/
2923
2924
static PyObject *
2925
_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2926
/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2927
3.41M
{
2928
3.41M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2929
3.41M
    SRE_STATE* state = &self->state;
2930
3.41M
    PyObject* match;
2931
3.41M
    Py_ssize_t status;
2932
2933
3.41M
    if (!scanner_begin(self)) {
2934
0
        return NULL;
2935
0
    }
2936
3.41M
    if (state->start == NULL) {
2937
0
        scanner_end(self);
2938
0
        Py_RETURN_NONE;
2939
0
    }
2940
2941
3.41M
    state_reset(state);
2942
2943
3.41M
    state->ptr = state->start;
2944
2945
3.41M
    status = sre_search(state, PatternObject_GetCode(self->pattern));
2946
3.41M
    if (PyErr_Occurred()) {
2947
0
        scanner_end(self);
2948
0
        return NULL;
2949
0
    }
2950
2951
3.41M
    match = pattern_new_match(module_state, self->pattern,
2952
3.41M
                              state, status);
2953
2954
3.41M
    if (status == 0)
2955
386k
        state->start = NULL;
2956
3.03M
    else {
2957
3.03M
        state->must_advance = (state->ptr == state->start);
2958
3.03M
        state->start = state->ptr;
2959
3.03M
    }
2960
2961
3.41M
    scanner_end(self);
2962
3.41M
    return match;
2963
3.41M
}
2964
2965
static PyObject *
2966
pattern_scanner(_sremodulestate *module_state,
2967
                PatternObject *self,
2968
                PyObject *string,
2969
                Py_ssize_t pos,
2970
                Py_ssize_t endpos)
2971
386k
{
2972
386k
    ScannerObject* scanner;
2973
2974
    /* create scanner object */
2975
386k
    scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2976
386k
    if (!scanner)
2977
0
        return NULL;
2978
386k
    scanner->pattern = NULL;
2979
386k
    scanner->executing = 0;
2980
2981
    /* create search state object */
2982
386k
    if (!state_init(&scanner->state, self, string, pos, endpos)) {
2983
0
        Py_DECREF(scanner);
2984
0
        return NULL;
2985
0
    }
2986
2987
386k
    Py_INCREF(self);
2988
386k
    scanner->pattern = self;
2989
2990
386k
    PyObject_GC_Track(scanner);
2991
386k
    return (PyObject*) scanner;
2992
386k
}
2993
2994
/* -------------------------------------------------------------------- */
2995
/* template methods */
2996
2997
static int
2998
template_traverse(PyObject *op, visitproc visit, void *arg)
2999
0
{
3000
0
    TemplateObject *self = _TemplateObject_CAST(op);
3001
0
    Py_VISIT(Py_TYPE(self));
3002
0
    Py_VISIT(self->literal);
3003
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3004
0
        Py_VISIT(self->items[i].literal);
3005
0
    }
3006
0
    return 0;
3007
0
}
3008
3009
static int
3010
template_clear(PyObject *op)
3011
0
{
3012
0
    TemplateObject *self = _TemplateObject_CAST(op);
3013
0
    Py_CLEAR(self->literal);
3014
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3015
0
        Py_CLEAR(self->items[i].literal);
3016
0
    }
3017
0
    return 0;
3018
0
}
3019
3020
static void
3021
template_dealloc(PyObject *self)
3022
0
{
3023
0
    PyTypeObject *tp = Py_TYPE(self);
3024
0
    PyObject_GC_UnTrack(self);
3025
0
    (void)template_clear(self);
3026
0
    tp->tp_free(self);
3027
0
    Py_DECREF(tp);
3028
0
}
3029
3030
static PyObject *
3031
expand_template(TemplateObject *self, MatchObject *match)
3032
0
{
3033
0
    if (Py_SIZE(self) == 0) {
3034
0
        return Py_NewRef(self->literal);
3035
0
    }
3036
3037
0
    PyObject *result = NULL;
3038
0
    Py_ssize_t count = 0;  // the number of non-empty chunks
3039
    /* For small number of strings use a buffer allocated on the stack,
3040
     * otherwise use a list object. */
3041
0
    PyObject *buffer[10];
3042
0
    PyObject **out = buffer;
3043
0
    PyObject *list = NULL;
3044
0
    if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
3045
0
        !PyUnicode_Check(self->literal))
3046
0
    {
3047
0
        list = PyList_New(self->chunks);
3048
0
        if (!list) {
3049
0
            return NULL;
3050
0
        }
3051
0
        out = &PyList_GET_ITEM(list, 0);
3052
0
    }
3053
3054
0
    out[count++] = Py_NewRef(self->literal);
3055
0
    for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
3056
0
        Py_ssize_t index = self->items[i].index;
3057
0
        if (index >= match->groups) {
3058
0
            PyErr_SetString(PyExc_IndexError, "no such group");
3059
0
            goto cleanup;
3060
0
        }
3061
0
        PyObject *item = match_getslice_by_index(match, index, Py_None);
3062
0
        if (item == NULL) {
3063
0
            goto cleanup;
3064
0
        }
3065
0
        if (item != Py_None) {
3066
0
            out[count++] = Py_NewRef(item);
3067
0
        }
3068
0
        Py_DECREF(item);
3069
3070
0
        PyObject *literal = self->items[i].literal;
3071
0
        if (literal != NULL) {
3072
0
            out[count++] = Py_NewRef(literal);
3073
0
        }
3074
0
    }
3075
3076
0
    if (PyUnicode_Check(self->literal)) {
3077
0
        result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
3078
0
    }
3079
0
    else {
3080
0
        Py_SET_SIZE(list, count);
3081
0
        result = PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
3082
0
    }
3083
3084
0
cleanup:
3085
0
    if (list) {
3086
0
        Py_DECREF(list);
3087
0
    }
3088
0
    else {
3089
0
        for (Py_ssize_t i = 0; i < count; i++) {
3090
0
            Py_DECREF(out[i]);
3091
0
        }
3092
0
    }
3093
0
    return result;
3094
0
}
3095
3096
3097
static Py_hash_t
3098
pattern_hash(PyObject *op)
3099
0
{
3100
0
    PatternObject *self = _PatternObject_CAST(op);
3101
3102
0
    Py_hash_t hash, hash2;
3103
3104
0
    hash = PyObject_Hash(self->pattern);
3105
0
    if (hash == -1) {
3106
0
        return -1;
3107
0
    }
3108
3109
0
    hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize);
3110
0
    hash ^= hash2;
3111
3112
0
    hash ^= self->flags;
3113
0
    hash ^= self->isbytes;
3114
0
    hash ^= self->codesize;
3115
3116
0
    if (hash == -1) {
3117
0
        hash = -2;
3118
0
    }
3119
0
    return hash;
3120
0
}
3121
3122
static PyObject*
pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
{
    /* tp_richcompare for patterns.  Only == and != are supported, and only
       against an object whose type is exactly Pattern_Type; everything
       else yields NotImplemented.  Returns NULL if comparing the pattern
       sources fails. */
    PyTypeObject *left_type = Py_TYPE(lefto);
    _sremodulestate *module_state = get_sre_module_state_by_class(left_type);

    if (!(op == Py_EQ || op == Py_NE)) {
        Py_RETURN_NOTIMPLEMENTED;
    }
    if (!Py_IS_TYPE(righto, module_state->Pattern_Type)) {
        Py_RETURN_NOTIMPLEMENTED;
    }
    if (lefto == righto) {
        /* a pattern is equal to itself */
        return PyBool_FromLong(op == Py_EQ);
    }

    PatternObject *a = (PatternObject *)lefto;
    PatternObject *b = (PatternObject *)righto;
    int equal = 0;

    /* Compare the code and the pattern because the same pattern can
       produce different codes depending on the locale used to compile the
       pattern when the re.LOCALE flag is used. Don't compare groups,
       indexgroup nor groupindex: they are derived from the pattern.
       The cheap scalar fields are tested first, then the code, and the
       pattern objects last. */
    if (a->flags == b->flags
        && a->isbytes == b->isbytes
        && a->codesize == b->codesize
        && memcmp(a->code, b->code,
                  sizeof(a->code[0]) * a->codesize) == 0)
    {
        equal = PyObject_RichCompareBool(a->pattern, b->pattern, Py_EQ);
        if (equal < 0) {
            return NULL;
        }
    }

    return PyBool_FromLong(op == Py_NE ? !equal : equal);
}
3170
3171
#include "clinic/sre.c.h"
3172
3173
/* Method table of the Pattern type (installed via Py_tp_methods in
   pattern_slots).  The first two entries must remain "prefixmatch"
   followed by "match": debug builds verify this through
   _assert_match_aliases_prefixmatch(), which inspects methods[0] and
   methods[1]. */
static PyMethodDef pattern_methods[] = {
3174
    _SRE_SRE_PATTERN_PREFIXMATCH_METHODDEF
3175
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3176
     * to avoid duplicating the argument parsing boilerplate code. */
3177
    {"match", _PyCFunction_CAST(_sre_SRE_Pattern_prefixmatch),
3178
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3179
     _sre_SRE_Pattern_prefixmatch__doc__},
3180
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
3181
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
3182
    _SRE_SRE_PATTERN_SUB_METHODDEF
3183
    _SRE_SRE_PATTERN_SUBN_METHODDEF
3184
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
3185
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
3186
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
3187
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
3188
    _SRE_SRE_PATTERN___COPY___METHODDEF
3189
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3190
    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
3191
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3192
     PyDoc_STR("See PEP 585")},
3193
    {NULL, NULL}
3194
};
3195
3196
/* Computed attributes of the Pattern type.  "groupindex" has a getter
   (pattern_groupindex) and no setter. */
static PyGetSetDef pattern_getset[] = {
3197
    {"groupindex", pattern_groupindex, NULL,
3198
      "A dictionary mapping group names to group numbers."},
3199
    {NULL}  /* Sentinel */
3200
};
3201
3202
/* PAT_OFF(x): byte offset of field x inside PatternObject.
   pattern_members exposes struct fields directly as read-only attributes;
   the "__weaklistoffset__" entry records the offset of the weakreflist
   field. */
#define PAT_OFF(x) offsetof(PatternObject, x)
3203
static PyMemberDef pattern_members[] = {
3204
    {"pattern",    _Py_T_OBJECT,    PAT_OFF(pattern),       Py_READONLY,
3205
     "The pattern string from which the RE object was compiled."},
3206
    {"flags",      Py_T_INT,       PAT_OFF(flags),         Py_READONLY,
3207
     "The regex matching flags."},
3208
    {"groups",     Py_T_PYSSIZET,  PAT_OFF(groups),        Py_READONLY,
3209
     "The number of capturing groups in the pattern."},
3210
    {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PatternObject, weakreflist), Py_READONLY},
3211
    {NULL}  /* Sentinel */
3212
};
3213
3214
/* Type slots of the Pattern heap type; consumed through pattern_spec. */
static PyType_Slot pattern_slots[] = {
3215
    {Py_tp_dealloc, pattern_dealloc},
3216
    {Py_tp_repr, pattern_repr},
3217
    {Py_tp_hash, pattern_hash},
3218
    {Py_tp_doc, (void *)pattern_doc},
3219
    {Py_tp_richcompare, pattern_richcompare},
3220
    {Py_tp_methods, pattern_methods},
3221
    {Py_tp_members, pattern_members},
3222
    {Py_tp_getset, pattern_getset},
3223
    {Py_tp_traverse, pattern_traverse},
3224
    {Py_tp_clear, pattern_clear},
3225
    {0, NULL},
3226
};
3227
3228
/* Spec from which sre_exec() creates the "re.Pattern" heap type.  It is a
   variable-size type whose items are SRE_CODE units; the flags make it
   immutable, GC-enabled and not directly instantiable. */
static PyType_Spec pattern_spec = {
3229
    .name = "re.Pattern",
3230
    .basicsize = sizeof(PatternObject),
3231
    .itemsize = sizeof(SRE_CODE),
3232
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3233
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3234
    .slots = pattern_slots,
3235
};
3236
3237
/* Method table of the Match type (installed via Py_tp_methods in
   match_slots).  "group" is registered by hand with METH_VARARGS; the
   remaining entries come from Clinic-generated METHODDEF macros. */
static PyMethodDef match_methods[] = {
3238
    {"group", match_group, METH_VARARGS, match_group_doc},
3239
    _SRE_SRE_MATCH_START_METHODDEF
3240
    _SRE_SRE_MATCH_END_METHODDEF
3241
    _SRE_SRE_MATCH_SPAN_METHODDEF
3242
    _SRE_SRE_MATCH_GROUPS_METHODDEF
3243
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3244
    _SRE_SRE_MATCH_EXPAND_METHODDEF
3245
    _SRE_SRE_MATCH___COPY___METHODDEF
3246
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3247
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3248
     PyDoc_STR("See PEP 585")},
3249
    {NULL, NULL}
3250
};
3251
3252
/* Computed, getter-only attributes of the Match type.  "regs" is
   registered without a docstring. */
static PyGetSetDef match_getset[] = {
3253
    {"lastindex", match_lastindex_get, NULL,
3254
     "The integer index of the last matched capturing group."},
3255
    {"lastgroup", match_lastgroup_get, NULL,
3256
     "The name of the last matched capturing group."},
3257
    {"regs", match_regs_get, NULL, NULL},
3258
    {NULL}
3259
};
3260
3261
/* MATCH_OFF(x): byte offset of field x inside MatchObject.
   match_members exposes struct fields directly as read-only attributes;
   note that the "re" attribute maps to the `pattern` field. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
3262
static PyMemberDef match_members[] = {
3263
    {"string",  _Py_T_OBJECT,   MATCH_OFF(string),  Py_READONLY,
3264
     "The string passed to match() or search()."},
3265
    {"re",      _Py_T_OBJECT,   MATCH_OFF(pattern), Py_READONLY,
3266
     "The regular expression object."},
3267
    {"pos",     Py_T_PYSSIZET, MATCH_OFF(pos),     Py_READONLY,
3268
     "The index into the string at which the RE engine started looking for a match."},
3269
    {"endpos",  Py_T_PYSSIZET, MATCH_OFF(endpos),  Py_READONLY,
3270
     "The index into the string beyond which the RE engine will not go."},
3271
    {NULL}
3272
};
3273
3274
/* FIXME: implement setattr("string", None) as a special case (to
3275
   detach the associated string, if any) */
3276
/* Type slots of the Match heap type; consumed through match_spec. */
static PyType_Slot match_slots[] = {
3277
    {Py_tp_dealloc, match_dealloc},
3278
    {Py_tp_repr, match_repr},
3279
    {Py_tp_doc, (void *)match_doc},
3280
    {Py_tp_methods, match_methods},
3281
    {Py_tp_members, match_members},
3282
    {Py_tp_getset, match_getset},
3283
    {Py_tp_traverse, match_traverse},
3284
    {Py_tp_clear, match_clear},
3285
3286
    /* As mapping.
3287
     *
3288
     * Match objects do not support length or assignment, but do support
3289
     * __getitem__.
3290
     */
3291
    {Py_mp_subscript, match_getitem},
3292
3293
    {0, NULL},
3294
};
3295
3296
/* Spec from which sre_exec() creates the "re.Match" heap type.  It is a
   variable-size type whose items are Py_ssize_t values (pattern_new_match
   allocates two per group plus two for the whole match); the flags make
   it immutable, GC-enabled and not directly instantiable. */
static PyType_Spec match_spec = {
3297
    .name = "re.Match",
3298
    .basicsize = sizeof(MatchObject),
3299
    .itemsize = sizeof(Py_ssize_t),
3300
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3301
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3302
    .slots = match_slots,
3303
};
3304
3305
/* Method table of the scanner type (installed via Py_tp_methods in
   scanner_slots).  The first two entries must remain "prefixmatch"
   followed by "match": debug builds verify this through
   _assert_match_aliases_prefixmatch(), which inspects methods[0] and
   methods[1]. */
static PyMethodDef scanner_methods[] = {
3306
    _SRE_SRE_SCANNER_PREFIXMATCH_METHODDEF
3307
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3308
     * to avoid duplicating the argument parsing boilerplate code. */
3309
    {"match", _PyCFunction_CAST(_sre_SRE_Scanner_prefixmatch),
3310
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3311
     _sre_SRE_Scanner_prefixmatch__doc__},
3312
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
3313
    {NULL, NULL}
3314
};
3315
3316
/* SCAN_OFF(x): byte offset of field x inside ScannerObject.
   scanner_members exposes the scanner's pattern as a read-only attribute
   (no docstring is attached). */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3317
static PyMemberDef scanner_members[] = {
3318
    {"pattern", _Py_T_OBJECT, SCAN_OFF(pattern), Py_READONLY},
3319
    {NULL}  /* Sentinel */
3320
};
3321
3322
/* Type slots of the scanner heap type; consumed through scanner_spec. */
static PyType_Slot scanner_slots[] = {
3323
    {Py_tp_dealloc, scanner_dealloc},
3324
    {Py_tp_methods, scanner_methods},
3325
    {Py_tp_members, scanner_members},
3326
    {Py_tp_traverse, scanner_traverse},
3327
    {Py_tp_clear, scanner_clear},
3328
    {0, NULL},
3329
};
3330
3331
/* Spec from which sre_exec() creates the "_sre.SRE_Scanner" heap type.
   Unlike the pattern, match and template specs it sets no itemsize; the
   flags make it immutable, GC-enabled and not directly instantiable. */
static PyType_Spec scanner_spec = {
3332
    .name = "_sre.SRE_Scanner",
3333
    .basicsize = sizeof(ScannerObject),
3334
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3335
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3336
    .slots = scanner_slots,
3337
};
3338
3339
/* Type slots of the template heap type; consumed through template_spec.
   Only the GC/deallocation hooks are provided: no methods, members or
   getset tables are attached here. */
static PyType_Slot template_slots[] = {
3340
    {Py_tp_dealloc, template_dealloc},
3341
    {Py_tp_traverse, template_traverse},
3342
    {Py_tp_clear, template_clear},
3343
    {0, NULL},
3344
};
3345
3346
/* Spec from which sre_exec() creates the "_sre.SRE_Template" heap type.
   It is a variable-size type whose item size is that of one element of
   TemplateObject.items; the flags make it immutable, GC-enabled and not
   directly instantiable. */
static PyType_Spec template_spec = {
3347
    .name = "_sre.SRE_Template",
3348
    .basicsize = sizeof(TemplateObject),
3349
    .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3350
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3351
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3352
    .slots = template_slots,
3353
};
3354
3355
/* Module-level functions of _sre (referenced by sremodule.m_methods).
   Every entry comes from a Clinic-generated METHODDEF macro. */
static PyMethodDef _functions[] = {
3356
    _SRE_COMPILE_METHODDEF
3357
    _SRE_TEMPLATE_METHODDEF
3358
    _SRE_GETCODESIZE_METHODDEF
3359
    _SRE_ASCII_ISCASED_METHODDEF
3360
    _SRE_UNICODE_ISCASED_METHODDEF
3361
    _SRE_ASCII_TOLOWER_METHODDEF
3362
    _SRE_UNICODE_TOLOWER_METHODDEF
3363
    {NULL, NULL}
3364
};
3365
3366
static int
3367
sre_traverse(PyObject *module, visitproc visit, void *arg)
3368
1.46k
{
3369
1.46k
    _sremodulestate *state = get_sre_module_state(module);
3370
3371
1.46k
    Py_VISIT(state->Pattern_Type);
3372
1.46k
    Py_VISIT(state->Match_Type);
3373
1.46k
    Py_VISIT(state->Scanner_Type);
3374
1.46k
    Py_VISIT(state->Template_Type);
3375
1.46k
    Py_VISIT(state->compile_template);
3376
3377
1.46k
    return 0;
3378
1.46k
}
3379
3380
static int
3381
sre_clear(PyObject *module)
3382
0
{
3383
0
    _sremodulestate *state = get_sre_module_state(module);
3384
3385
0
    Py_CLEAR(state->Pattern_Type);
3386
0
    Py_CLEAR(state->Match_Type);
3387
0
    Py_CLEAR(state->Scanner_Type);
3388
0
    Py_CLEAR(state->Template_Type);
3389
0
    Py_CLEAR(state->compile_template);
3390
3391
0
    return 0;
3392
0
}
3393
3394
static void
3395
sre_free(void *module)
3396
0
{
3397
0
    sre_clear((PyObject *)module);
3398
0
}
3399
3400
120
/* Create a heap type from `spec` for module `m` and store it in `type`.
   On failure control jumps to a label named `error`, which the enclosing
   function must provide. */
#define CREATE_TYPE(m, type, spec)                                          \
    do {                                                                    \
        (type) = (PyTypeObject *)PyType_FromModuleAndSpec((m), (spec), NULL); \
        if ((type) == NULL) {                                               \
            goto error;                                                     \
        }                                                                   \
    } while (0)
3407
3408
/* Add `value`, converted with PyLong_FromUnsignedLong(), to `module` under
   `name`.  On failure control jumps to a label named `error`, which the
   enclosing function must provide. */
#define ADD_ULONG_CONSTANT(module, name, value)                             \
    do {                                                                    \
        if (PyModule_Add((module), (name),                                  \
                         PyLong_FromUnsignedLong(value)) < 0) {             \
            goto error;                                                     \
        }                                                                   \
    } while (0)
3414
3415
3416
#ifdef Py_DEBUG
3417
static void
3418
_assert_match_aliases_prefixmatch(PyMethodDef *methods)
3419
{
3420
    PyMethodDef *prefixmatch_md = &methods[0];
3421
    PyMethodDef *match_md = &methods[1];
3422
    assert(strcmp(prefixmatch_md->ml_name, "prefixmatch") == 0);
3423
    assert(strcmp(match_md->ml_name, "match") == 0);
3424
    assert(match_md->ml_meth == prefixmatch_md->ml_meth);
3425
    assert(match_md->ml_flags == prefixmatch_md->ml_flags);
3426
    assert(match_md->ml_doc == prefixmatch_md->ml_doc);
3427
}
3428
#endif
3429
3430
static int
3431
sre_exec(PyObject *m)
3432
30
{
3433
30
    _sremodulestate *state;
3434
3435
#ifdef Py_DEBUG
3436
    _assert_match_aliases_prefixmatch(pattern_methods);
3437
    _assert_match_aliases_prefixmatch(scanner_methods);
3438
#endif
3439
3440
    /* Create heap types */
3441
30
    state = get_sre_module_state(m);
3442
30
    CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3443
30
    CREATE_TYPE(m, state->Match_Type, &match_spec);
3444
30
    CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3445
30
    CREATE_TYPE(m, state->Template_Type, &template_spec);
3446
3447
30
    if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3448
0
        goto error;
3449
0
    }
3450
3451
30
    if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3452
0
        goto error;
3453
0
    }
3454
3455
30
    ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3456
30
    ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3457
3458
30
    if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3459
0
        goto error;
3460
0
    }
3461
3462
30
    return 0;
3463
3464
0
error:
3465
0
    return -1;
3466
30
}
3467
3468
/* Module slots for multi-phase initialisation: sre_exec() populates the
   module; the other two entries set the module's multiple-interpreter and
   GIL slot values. */
static PyModuleDef_Slot sre_slots[] = {
3469
    {Py_mod_exec, sre_exec},
3470
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3471
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
3472
    {0, NULL},
3473
};
3474
3475
/* Definition of the "_sre" module: per-module state of type
   _sremodulestate, the module-level functions in _functions, the slots in
   sre_slots, and the traverse/clear/free hooks for the state. */
static struct PyModuleDef sremodule = {
3476
    .m_base = PyModuleDef_HEAD_INIT,
3477
    .m_name = "_sre",
3478
    .m_size = sizeof(_sremodulestate),
3479
    .m_methods = _functions,
3480
    .m_slots = sre_slots,
3481
    .m_traverse = sre_traverse,
3482
    .m_free = sre_free,
3483
    .m_clear = sre_clear,
3484
};
3485
3486
PyMODINIT_FUNC
3487
PyInit__sre(void)
3488
30
{
3489
30
    return PyModuleDef_Init(&sremodule);
3490
30
}
3491
3492
/* vim:ts=4:sw=4:et
3493
*/