Coverage Report

Created: 2026-01-10 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_sre/sre.c
Line
Count
Source
1
/*
2
 * Secret Labs' Regular Expression Engine
3
 *
4
 * regular expression matching engine
5
 *
6
 * partial history:
7
 * 1999-10-24 fl   created (based on existing template matcher code)
8
 * 2000-03-06 fl   first alpha, sort of
9
 * 2000-08-01 fl   fixes for 1.6b1
10
 * 2000-08-07 fl   use PyOS_CheckStack() if available
11
 * 2000-09-20 fl   added expand method
12
 * 2001-03-20 fl   lots of fixes for 2.1b2
13
 * 2001-04-15 fl   export copyright as Python attribute, not global
14
 * 2001-04-28 fl   added __copy__ methods (work in progress)
15
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
16
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18
 * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19
 * 2001-10-21 fl   added sub/subn primitive
20
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22
 * 2002-11-09 fl   fixed empty sub/subn return type
23
 * 2003-04-18 mvl  fully support 4-byte codes
24
 * 2003-10-17 gn   implemented non recursive scheme
25
 * 2013-02-04 mrab added fullmatch primitive
26
 *
27
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28
 *
29
 * This version of the SRE library can be redistributed under CNRI's
30
 * Python 1.6 license.  For any other use, please contact Secret Labs
31
 * AB (info@pythonware.com).
32
 *
33
 * Portions of this engine have been developed in cooperation with
34
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35
 * other compatibility work.
36
 */
37
38
/* Engine name, version and copyright notice kept as a single string. */
static const char copyright[] = " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41
#include "Python.h"
42
#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
43
#include "pycore_dict.h"             // _PyDict_Next()
44
#include "pycore_long.h"             // _PyLong_GetZero()
45
#include "pycore_moduleobject.h"     // _PyModule_GetState()
46
#include "pycore_unicodeobject.h"    // _PyUnicode_Copy
47
#include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
48
49
#include "sre.h"                     // SRE_CODE
50
51
#include <ctype.h>                   // tolower(), toupper(), isalnum()
52
53
1.19G
/* Width of one SRE_CODE unit, expressed in bits. */
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))

/* On macOS the locale helpers go through the wide character ctype API
 * (btowc() + isw...()/tow...()); this switch selects that code path. */
#ifdef __APPLE__
#  define USE_CTYPE_WINT_T
#endif
59
60
0
/* Alphanumeric test backed by the C library.
 *
 * With USE_CTYPE_WINT_T defined, the code is widened with btowc() and
 * classified with iswalnum(); otherwise isalnum() is applied directly.
 * Returns non-zero when the library reports the character as alphanumeric. */
static int sre_isalnum(unsigned int ch) {
    const int c = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)iswalnum(btowc(c));
#else
    return (unsigned int)isalnum(c);
#endif
}
67
68
0
/* Lower-case conversion backed by the C library.
 *
 * With USE_CTYPE_WINT_T defined, the code is widened with btowc() and
 * converted with towlower(); otherwise tolower() is applied directly. */
static unsigned int sre_tolower(unsigned int ch) {
    const int c = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towlower(btowc(c));
#else
    return (unsigned int)tolower(c);
#endif
}
75
76
0
/* Upper-case conversion backed by the C library.
 *
 * With USE_CTYPE_WINT_T defined, the code is widened with btowc() and
 * converted with towupper(); otherwise toupper() is applied directly. */
static unsigned int sre_toupper(unsigned int ch) {
    const int c = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towupper(btowc(c));
#else
    return (unsigned int)toupper(c);
#endif
}
83
84
/* Compile-time tracing level, overridable by defining VERBOSE beforehand:
 * 0 -- disabled
 * 1 -- only if the DEBUG flag set
 * 2 -- always
 */
#if !defined(VERBOSE)
#  define VERBOSE 0
#endif
92
93
/* -------------------------------------------------------------------- */
94
95
/* LOCAL(type) introduces a file-local helper function returning `type`. */
#if !defined(_MSC_VER) || defined(__clang__)
#define LOCAL(type) static inline type
#else
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#endif
103
104
/* Negative status codes reported by the matching engine. */

/* illegal opcode */
#define SRE_ERROR_ILLEGAL -1
/* illegal state */
#define SRE_ERROR_STATE -2
/* runaway recursion */
#define SRE_ERROR_RECURSION_LIMIT -3
/* out of memory */
#define SRE_ERROR_MEMORY -9
/* signal handler raised exception */
#define SRE_ERROR_INTERRUPTED -10
110
111
/* Tracing helpers, selected by VERBOSE:
 *   INIT_TRACE(state) -- per-function setup (captures state->debug at level 1)
 *   DO_TRACE          -- expression telling whether tracing is active
 *   TRACE(v)          -- printf with the parenthesised argument list `v`
 */
#if VERBOSE == 2
#  define INIT_TRACE(state)
#  define DO_TRACE 1
#  define TRACE(v) printf v
#elif VERBOSE == 1
#  define INIT_TRACE(state) int _debug = (state)->debug
#  define DO_TRACE (_debug)
#  define TRACE(v) do {     \
        if (_debug) { \
            printf v;       \
        }                   \
    } while (0)
#elif VERBOSE == 0
#  define INIT_TRACE(state)
#  define DO_TRACE 0
#  define TRACE(v)
#else
#  error VERBOSE must be 0, 1 or 2
#endif
130
131
/* -------------------------------------------------------------------- */
132
/* search engine state */
133
134
/* ASCII-only character predicates.  Each one first bounds `ch` by the
 * largest ASCII character that can satisfy it, then consults the
 * corresponding Py_IS* classification macro. */
#define SRE_IS_DIGIT(ch) ((ch) <= '9' && Py_ISDIGIT(ch))
#define SRE_IS_SPACE(ch) ((ch) <= ' ' && Py_ISSPACE(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_IS_WORD(ch) ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
142
143
/* Lower-case `ch` when it is an ASCII code (below 128); any other value is
 * returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
147
148
/* Locale-dependent character predicates.
 *
 * The range test is written as !(c & ~N), which equals (c < N+1) for any
 * unsigned c; this form avoids compiler warnings when c's type can only
 * hold numbers < N+1.  Codes above 255 are never alphanumeric here. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? sre_isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
153
154
/* Lower-case `ch` through sre_tolower() when it is below 256; larger
 * codes are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)sre_tolower(ch);
}
158
159
/* Upper-case `ch` through sre_toupper() when it is below 256; larger
 * codes are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return (unsigned int)sre_toupper(ch);
}
163
164
/* Unicode character predicates: thin aliases over the Py_UNICODE_IS*
 * classification macros, plus the "word" class (alphanumeric or '_'). */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
171
172
/* Lower-case mapping of `ch` as given by Py_UNICODE_TOLOWER(). */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    const unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
176
177
/* Upper-case mapping of `ch` as given by Py_UNICODE_TOUPPER(). */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    const unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
181
182
LOCAL(int)
183
sre_category(SRE_CODE category, unsigned int ch)
184
98.0M
{
185
98.0M
    switch (category) {
186
187
336
    case SRE_CATEGORY_DIGIT:
188
336
        return SRE_IS_DIGIT(ch);
189
0
    case SRE_CATEGORY_NOT_DIGIT:
190
0
        return !SRE_IS_DIGIT(ch);
191
0
    case SRE_CATEGORY_SPACE:
192
0
        return SRE_IS_SPACE(ch);
193
0
    case SRE_CATEGORY_NOT_SPACE:
194
0
        return !SRE_IS_SPACE(ch);
195
10.8M
    case SRE_CATEGORY_WORD:
196
10.8M
        return SRE_IS_WORD(ch);
197
0
    case SRE_CATEGORY_NOT_WORD:
198
0
        return !SRE_IS_WORD(ch);
199
0
    case SRE_CATEGORY_LINEBREAK:
200
0
        return SRE_IS_LINEBREAK(ch);
201
0
    case SRE_CATEGORY_NOT_LINEBREAK:
202
0
        return !SRE_IS_LINEBREAK(ch);
203
204
0
    case SRE_CATEGORY_LOC_WORD:
205
0
        return SRE_LOC_IS_WORD(ch);
206
0
    case SRE_CATEGORY_LOC_NOT_WORD:
207
0
        return !SRE_LOC_IS_WORD(ch);
208
209
0
    case SRE_CATEGORY_UNI_DIGIT:
210
0
        return SRE_UNI_IS_DIGIT(ch);
211
0
    case SRE_CATEGORY_UNI_NOT_DIGIT:
212
0
        return !SRE_UNI_IS_DIGIT(ch);
213
73.6M
    case SRE_CATEGORY_UNI_SPACE:
214
73.6M
        return SRE_UNI_IS_SPACE(ch);
215
13.5M
    case SRE_CATEGORY_UNI_NOT_SPACE:
216
13.5M
        return !SRE_UNI_IS_SPACE(ch);
217
0
    case SRE_CATEGORY_UNI_WORD:
218
0
        return SRE_UNI_IS_WORD(ch);
219
0
    case SRE_CATEGORY_UNI_NOT_WORD:
220
0
        return !SRE_UNI_IS_WORD(ch);
221
0
    case SRE_CATEGORY_UNI_LINEBREAK:
222
0
        return SRE_UNI_IS_LINEBREAK(ch);
223
0
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
224
0
        return !SRE_UNI_IS_LINEBREAK(ch);
225
98.0M
    }
226
0
    return 0;
227
98.0M
}
228
229
LOCAL(int)
230
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
231
0
{
232
0
    return ch == pattern
233
0
        || (SRE_CODE) sre_lower_locale(ch) == pattern
234
0
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
235
0
}
236
237
238
/* helpers */
239
240
static void
241
data_stack_dealloc(SRE_STATE* state)
242
190M
{
243
190M
    if (state->data_stack) {
244
161M
        PyMem_Free(state->data_stack);
245
161M
        state->data_stack = NULL;
246
161M
    }
247
190M
    state->data_stack_size = state->data_stack_base = 0;
248
190M
}
249
250
static int
251
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
252
162M
{
253
162M
    INIT_TRACE(state);
254
162M
    Py_ssize_t minsize, cursize;
255
162M
    minsize = state->data_stack_base+size;
256
162M
    cursize = state->data_stack_size;
257
162M
    if (cursize < minsize) {
258
162M
        void* stack;
259
162M
        cursize = minsize+minsize/4+1024;
260
162M
        TRACE(("allocate/grow stack %zd\n", cursize));
261
162M
        stack = PyMem_Realloc(state->data_stack, cursize);
262
162M
        if (!stack) {
263
0
            data_stack_dealloc(state);
264
0
            return SRE_ERROR_MEMORY;
265
0
        }
266
162M
        state->data_stack = (char *)stack;
267
162M
        state->data_stack_size = cursize;
268
162M
    }
269
162M
    return 0;
270
162M
}
271
272
/* memory pool functions for SRE_REPEAT, this can avoid memory
273
   leak when SRE(match) function terminates abruptly.
274
   state->repeat_pool_used is a doubly-linked list, so that we
275
   can remove a SRE_REPEAT node from it.
276
   state->repeat_pool_unused is a singly-linked list, we put/get
277
   node at the head. */
278
static SRE_REPEAT *
279
repeat_pool_malloc(SRE_STATE *state)
280
113M
{
281
113M
    SRE_REPEAT *repeat;
282
283
113M
    if (state->repeat_pool_unused) {
284
        /* remove from unused pool (singly-linked list) */
285
69.6M
        repeat = state->repeat_pool_unused;
286
69.6M
        state->repeat_pool_unused = repeat->pool_next;
287
69.6M
    }
288
43.8M
    else {
289
43.8M
        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
290
43.8M
        if (!repeat) {
291
0
            return NULL;
292
0
        }
293
43.8M
    }
294
295
    /* add to used pool (doubly-linked list) */
296
113M
    SRE_REPEAT *temp = state->repeat_pool_used;
297
113M
    if (temp) {
298
22.4M
        temp->pool_prev = repeat;
299
22.4M
    }
300
113M
    repeat->pool_prev = NULL;
301
113M
    repeat->pool_next = temp;
302
113M
    state->repeat_pool_used = repeat;
303
304
113M
    return repeat;
305
113M
}
306
307
static void
308
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
309
113M
{
310
113M
    SRE_REPEAT *prev = repeat->pool_prev;
311
113M
    SRE_REPEAT *next = repeat->pool_next;
312
313
    /* remove from used pool (doubly-linked list) */
314
113M
    if (prev) {
315
0
        prev->pool_next = next;
316
0
    }
317
113M
    else {
318
113M
        state->repeat_pool_used = next;
319
113M
    }
320
113M
    if (next) {
321
22.4M
        next->pool_prev = prev;
322
22.4M
    }
323
324
    /* add to unused pool (singly-linked list) */
325
113M
    repeat->pool_next = state->repeat_pool_unused;
326
113M
    state->repeat_pool_unused = repeat;
327
113M
}
328
329
static void
330
repeat_pool_clear(SRE_STATE *state)
331
86.8M
{
332
    /* clear used pool */
333
86.8M
    SRE_REPEAT *next = state->repeat_pool_used;
334
86.8M
    state->repeat_pool_used = NULL;
335
86.8M
    while (next) {
336
0
        SRE_REPEAT *temp = next;
337
0
        next = temp->pool_next;
338
0
        PyMem_Free(temp);
339
0
    }
340
341
    /* clear unused pool */
342
86.8M
    next = state->repeat_pool_unused;
343
86.8M
    state->repeat_pool_unused = NULL;
344
130M
    while (next) {
345
43.8M
        SRE_REPEAT *temp = next;
346
43.8M
        next = temp->pool_next;
347
43.8M
        PyMem_Free(temp);
348
43.8M
    }
349
86.8M
}
350
351
/* generate 8-bit version */
352
353
344M
#define SRE_CHAR Py_UCS1
354
#define SIZEOF_SRE_CHAR 1
355
1.35G
#define SRE(F) sre_ucs1_##F
356
#include "sre_lib.h"
357
358
/* generate 16-bit unicode version */
359
360
363M
#define SRE_CHAR Py_UCS2
361
#define SIZEOF_SRE_CHAR 2
362
1.66G
#define SRE(F) sre_ucs2_##F
363
#include "sre_lib.h"
364
365
/* generate 32-bit unicode version */
366
367
158M
#define SRE_CHAR Py_UCS4
368
#define SIZEOF_SRE_CHAR 4
369
802M
#define SRE(F) sre_ucs4_##F
370
#include "sre_lib.h"
371
372
/* -------------------------------------------------------------------- */
373
/* factories and destructors */
374
375
/* module state */
376
typedef struct {
377
    PyTypeObject *Pattern_Type;
378
    PyTypeObject *Match_Type;
379
    PyTypeObject *Scanner_Type;
380
    PyTypeObject *Template_Type;
381
    PyObject *compile_template;  // reference to re._compile_template
382
} _sremodulestate;
383
384
static _sremodulestate *
385
get_sre_module_state(PyObject *m)
386
84.4M
{
387
84.4M
    _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
388
84.4M
    assert(state);
389
84.4M
    return state;
390
84.4M
}
391
392
/* Forward declaration of the module definition object. */
static struct PyModuleDef sremodule;

/* Module state reached from a type object: resolve the type's module with
   PyType_GetModule(), then fetch that module's state. */
#define get_sre_module_state_by_class(cls) (get_sre_module_state(PyType_GetModule(cls)))
395
396
/* see sre.h for object declarations */
397
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
398
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
399
400
19.2k
/* Plain pointer casts from a generic object pointer to the concrete
   object structs used in this file. */
#define _PatternObject_CAST(op)  ((PatternObject *)(op))
#define _MatchObject_CAST(op)    ((MatchObject *)(op))
#define _TemplateObject_CAST(op) ((TemplateObject *)(op))
#define _ScannerObject_CAST(op)  ((ScannerObject *)(op))
404
405
/*[clinic input]
406
module _sre
407
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
408
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
409
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
410
[clinic start generated code]*/
411
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
412
413
/*[clinic input]
414
_sre.getcodesize -> int
415
[clinic start generated code]*/
416
417
static int
418
_sre_getcodesize_impl(PyObject *module)
419
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
420
0
{
421
0
    return sizeof(SRE_CODE);
422
0
}
423
424
/*[clinic input]
425
_sre.ascii_iscased -> bool
426
427
    character: int
428
    /
429
430
[clinic start generated code]*/
431
432
static int
433
_sre_ascii_iscased_impl(PyObject *module, int character)
434
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
435
11.9k
{
436
11.9k
    unsigned int ch = (unsigned int)character;
437
11.9k
    return ch < 128 && Py_ISALPHA(ch);
438
11.9k
}
439
440
/*[clinic input]
441
_sre.unicode_iscased -> bool
442
443
    character: int
444
    /
445
446
[clinic start generated code]*/
447
448
static int
449
_sre_unicode_iscased_impl(PyObject *module, int character)
450
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
451
31.3M
{
452
31.3M
    unsigned int ch = (unsigned int)character;
453
31.3M
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
454
31.3M
}
455
456
/*[clinic input]
457
_sre.ascii_tolower -> int
458
459
    character: int
460
    /
461
462
[clinic start generated code]*/
463
464
static int
465
_sre_ascii_tolower_impl(PyObject *module, int character)
466
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
467
2.42M
{
468
2.42M
    return sre_lower_ascii(character);
469
2.42M
}
470
471
/*[clinic input]
472
_sre.unicode_tolower -> int
473
474
    character: int
475
    /
476
477
[clinic start generated code]*/
478
479
static int
480
_sre_unicode_tolower_impl(PyObject *module, int character)
481
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
482
78.8M
{
483
78.8M
    return sre_lower_unicode(character);
484
78.8M
}
485
486
LOCAL(void)
487
state_reset(SRE_STATE* state)
488
103M
{
489
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
490
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
491
492
103M
    state->lastmark = -1;
493
103M
    state->lastindex = -1;
494
495
103M
    state->repeat = NULL;
496
497
103M
    data_stack_dealloc(state);
498
103M
}
499
500
static const void*
501
getstring(PyObject* string, Py_ssize_t* p_length,
502
          int* p_isbytes, int* p_charsize,
503
          Py_buffer *view)
504
143M
{
505
    /* given a python object, return a data pointer, a length (in
506
       characters), and a character size.  return NULL if the object
507
       is not a string (or not compatible) */
508
509
    /* Unicode objects do not support the buffer API. So, get the data
510
       directly instead. */
511
143M
    if (PyUnicode_Check(string)) {
512
142M
        *p_length = PyUnicode_GET_LENGTH(string);
513
142M
        *p_charsize = PyUnicode_KIND(string);
514
142M
        *p_isbytes = 0;
515
142M
        return PyUnicode_DATA(string);
516
142M
    }
517
518
    /* get pointer to byte string buffer */
519
1.09M
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
520
0
        PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
521
0
                     "object, got '%.200s'", Py_TYPE(string)->tp_name);
522
0
        return NULL;
523
0
    }
524
525
1.09M
    *p_length = view->len;
526
1.09M
    *p_charsize = 1;
527
1.09M
    *p_isbytes = 1;
528
529
1.09M
    if (view->buf == NULL) {
530
0
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
531
0
        PyBuffer_Release(view);
532
0
        view->buf = NULL;
533
0
        return NULL;
534
0
    }
535
1.09M
    return view->buf;
536
1.09M
}
537
538
/* Prepare `state` for running `pattern` against `string`.
 *
 * The state is zeroed, a mark array with two slots per pattern group is
 * allocated, the string data is obtained through getstring(), and
 * `start`/`end` are clamped to [0, length] before being turned into
 * pointers.  The state keeps its own reference to `string`.
 *
 * Returns `string` (the pointer passed in) on success.  On failure
 * returns NULL with an exception set, after freeing the mark array and
 * releasing any buffer that was acquired:
 *   - MemoryError if the mark array cannot be allocated,
 *   - whatever getstring() raised,
 *   - TypeError when a str pattern meets bytes-like data or a bytes
 *     pattern meets str data. */
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
           Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t length;
    int isbytes, charsize;
    const void *data;

    memset(state, 0, sizeof(SRE_STATE));

    state->mark = PyMem_New(const void *, pattern->groups * 2);
    if (state->mark == NULL) {
        PyErr_NoMemory();
        goto err;
    }
    state->lastmark = -1;
    state->lastindex = -1;

    /* buffer.buf stays NULL unless getstring() acquires a buffer; the
       cleanup code relies on that to decide whether to release it */
    state->buffer.buf = NULL;
    data = getstring(string, &length, &isbytes, &charsize, &state->buffer);
    if (data == NULL) {
        goto err;
    }

    /* the pattern kind has to agree with the subject kind */
    if (isbytes) {
        if (pattern->isbytes == 0) {
            PyErr_SetString(PyExc_TypeError,
                            "cannot use a string pattern on a bytes-like object");
            goto err;
        }
    }
    else if (pattern->isbytes > 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a bytes pattern on a string-like object");
        goto err;
    }

    /* clamp both boundaries to [0, length] */
    if (start < 0) {
        start = 0;
    }
    else if (start > length) {
        start = length;
    }
    if (end < 0) {
        end = 0;
    }
    else if (end > length) {
        end = length;
    }

    state->isbytes = isbytes;
    state->charsize = charsize;
    state->match_all = 0;
    state->must_advance = 0;
    state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);

    state->beginning = data;
    state->start = (void*) ((char*) data + start * state->charsize);
    state->end = (void*) ((char*) data + end * state->charsize);

    state->string = Py_NewRef(string);
    state->pos = start;
    state->endpos = end;

#ifdef Py_DEBUG
    state->fail_after_count = pattern->fail_after_count;
    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

    return string;

  err:
    /* The explicit cast works around an MSVC bug when compiling C code:
       it believes that `const void**` cannot be safely casted to
       `void*`, see bpo-39943 for details. */
    PyMem_Free((void*) state->mark);
    state->mark = NULL;
    if (state->buffer.buf != NULL) {
        PyBuffer_Release(&state->buffer);
    }
    return NULL;
}
616
617
LOCAL(void)
618
state_fini(SRE_STATE* state)
619
86.8M
{
620
86.8M
    if (state->buffer.buf)
621
551k
        PyBuffer_Release(&state->buffer);
622
86.8M
    Py_XDECREF(state->string);
623
86.8M
    data_stack_dealloc(state);
624
    /* See above PyMem_Free() for why we explicitly cast here. */
625
86.8M
    PyMem_Free((void*) state->mark);
626
86.8M
    state->mark = NULL;
627
    /* SRE_REPEAT pool */
628
86.8M
    repeat_pool_clear(state);
629
86.8M
}
630
631
/* Character offset of pointer `member` from the start of the subject
   string: the byte distance from state->beginning divided by the
   character size. */
#define STATE_OFFSET(state, member) (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
634
635
/* Build a new object holding string[start:end].
 *
 * For str subjects this is PyUnicode_Substring().  For bytes-like
 * subjects a new bytes object is created from `ptr`, except that an exact
 * bytes object sliced over its full length is returned as-is (with a new
 * reference).  Returns a new reference, or NULL on failure. */
LOCAL(PyObject*)
getslice(int isbytes, const void *ptr,
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
{
    if (!isbytes) {
        return PyUnicode_Substring(string, start, end);
    }

    /* whole-object slice of an exact bytes object: no copy needed */
    if (PyBytes_CheckExact(string) &&
        start == 0 && end == PyBytes_GET_SIZE(string)) {
        return Py_NewRef(string);
    }

    return PyBytes_FromStringAndSize(
            (const char *)ptr + start, end - start);
}
651
652
/* Return the text captured by group `index` (1-based) as a new object.
 *
 * A group counts as unset when `string` is None, when its mark slots lie
 * at or beyond state->lastmark, or when either mark is NULL.  For an
 * unset group the result is an empty slice if `empty` is true, otherwise
 * None.  Raises SystemError (returning NULL) if the recorded span starts
 * after it ends. */
LOCAL(PyObject*)
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
{
    Py_ssize_t lo = 0;
    Py_ssize_t hi = 0;

    /* convert the group number into the index of its first mark slot */
    index = (index - 1) * 2;

    /* the lastmark test must come before the mark[] reads */
    const int unset = (string == Py_None
                       || index >= state->lastmark
                       || !state->mark[index]
                       || !state->mark[index+1]);

    if (unset) {
        if (!empty) {
            Py_RETURN_NONE;
        }
        /* want empty string: lo and hi are both 0 already */
    }
    else {
        lo = STATE_OFFSET(state, state->mark[index]);
        hi = STATE_OFFSET(state, state->mark[index+1]);

        /* check wrong span */
        if (lo > hi) {
            PyErr_SetString(PyExc_SystemError,
                            "The span of capturing group is wrong,"
                            " please report a bug for the re module.");
            return NULL;
        }
    }

    return getslice(state->isbytes, state->beginning, string, lo, hi);
}
681
682
static void
683
pattern_error(Py_ssize_t status)
684
0
{
685
0
    switch (status) {
686
0
    case SRE_ERROR_RECURSION_LIMIT:
687
        /* This error code seems to be unused. */
688
0
        PyErr_SetString(
689
0
            PyExc_RecursionError,
690
0
            "maximum recursion limit exceeded"
691
0
            );
692
0
        break;
693
0
    case SRE_ERROR_MEMORY:
694
0
        PyErr_NoMemory();
695
0
        break;
696
0
    case SRE_ERROR_INTERRUPTED:
697
    /* An exception has already been raised, so let it fly */
698
0
        break;
699
0
    default:
700
        /* other error codes indicate compiler/engine bugs */
701
0
        PyErr_SetString(
702
0
            PyExc_RuntimeError,
703
0
            "internal error in regular expression engine"
704
0
            );
705
0
    }
706
0
}
707
708
static int
709
pattern_traverse(PyObject *op, visitproc visit, void *arg)
710
16.3k
{
711
16.3k
    PatternObject *self = _PatternObject_CAST(op);
712
16.3k
    Py_VISIT(Py_TYPE(self));
713
16.3k
    Py_VISIT(self->groupindex);
714
16.3k
    Py_VISIT(self->indexgroup);
715
16.3k
    Py_VISIT(self->pattern);
716
#ifdef Py_DEBUG
717
    Py_VISIT(self->fail_after_exc);
718
#endif
719
16.3k
    return 0;
720
16.3k
}
721
722
static int
723
pattern_clear(PyObject *op)
724
2.89k
{
725
2.89k
    PatternObject *self = _PatternObject_CAST(op);
726
2.89k
    Py_CLEAR(self->groupindex);
727
2.89k
    Py_CLEAR(self->indexgroup);
728
2.89k
    Py_CLEAR(self->pattern);
729
#ifdef Py_DEBUG
730
    Py_CLEAR(self->fail_after_exc);
731
#endif
732
2.89k
    return 0;
733
2.89k
}
734
735
static void
736
pattern_dealloc(PyObject *self)
737
2.89k
{
738
2.89k
    PyTypeObject *tp = Py_TYPE(self);
739
2.89k
    PyObject_GC_UnTrack(self);
740
2.89k
    FT_CLEAR_WEAKREFS(self, _PatternObject_CAST(self)->weakreflist);
741
2.89k
    (void)pattern_clear(self);
742
2.89k
    tp->tp_free(self);
743
2.89k
    Py_DECREF(tp);
744
2.89k
}
745
746
LOCAL(Py_ssize_t)
747
sre_match(SRE_STATE* state, SRE_CODE* pattern)
748
63.2M
{
749
63.2M
    if (state->charsize == 1)
750
40.0M
        return sre_ucs1_match(state, pattern, 1);
751
23.2M
    if (state->charsize == 2)
752
12.3M
        return sre_ucs2_match(state, pattern, 1);
753
23.2M
    assert(state->charsize == 4);
754
10.9M
    return sre_ucs4_match(state, pattern, 1);
755
23.2M
}
756
757
LOCAL(Py_ssize_t)
758
sre_search(SRE_STATE* state, SRE_CODE* pattern)
759
109M
{
760
109M
    if (state->charsize == 1)
761
48.7M
        return sre_ucs1_search(state, pattern);
762
60.5M
    if (state->charsize == 2)
763
52.0M
        return sre_ucs2_search(state, pattern);
764
60.5M
    assert(state->charsize == 4);
765
8.44M
    return sre_ucs4_search(state, pattern);
766
60.5M
}
767
768
/*[clinic input]
769
_sre.SRE_Pattern.match
770
771
    cls: defining_class
772
    /
773
    string: object
774
    pos: Py_ssize_t = 0
775
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
776
777
Matches zero or more characters at the beginning of the string.
778
[clinic start generated code]*/
779
780
static PyObject *
781
_sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
782
                            PyObject *string, Py_ssize_t pos,
783
                            Py_ssize_t endpos)
784
/*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
785
63.2M
{
786
63.2M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
787
63.2M
    SRE_STATE state;
788
63.2M
    Py_ssize_t status;
789
63.2M
    PyObject *match;
790
791
63.2M
    if (!state_init(&state, self, string, pos, endpos))
792
0
        return NULL;
793
794
63.2M
    INIT_TRACE(&state);
795
63.2M
    state.ptr = state.start;
796
797
63.2M
    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
798
799
63.2M
    status = sre_match(&state, PatternObject_GetCode(self));
800
801
63.2M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
802
63.2M
    if (PyErr_Occurred()) {
803
0
        state_fini(&state);
804
0
        return NULL;
805
0
    }
806
807
63.2M
    match = pattern_new_match(module_state, self, &state, status);
808
63.2M
    state_fini(&state);
809
63.2M
    return match;
810
63.2M
}
811
812
/*[clinic input]
813
_sre.SRE_Pattern.fullmatch
814
815
    cls: defining_class
816
    /
817
    string: object
818
    pos: Py_ssize_t = 0
819
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
820
821
Matches against all of the string.
822
[clinic start generated code]*/
823
824
static PyObject *
825
_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
826
                                PyObject *string, Py_ssize_t pos,
827
                                Py_ssize_t endpos)
828
/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
829
0
{
830
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
831
0
    SRE_STATE state;
832
0
    Py_ssize_t status;
833
0
    PyObject *match;
834
835
0
    if (!state_init(&state, self, string, pos, endpos))
836
0
        return NULL;
837
838
0
    INIT_TRACE(&state);
839
0
    state.ptr = state.start;
840
841
0
    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
842
843
0
    state.match_all = 1;
844
0
    status = sre_match(&state, PatternObject_GetCode(self));
845
846
0
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
847
0
    if (PyErr_Occurred()) {
848
0
        state_fini(&state);
849
0
        return NULL;
850
0
    }
851
852
0
    match = pattern_new_match(module_state, self, &state, status);
853
0
    state_fini(&state);
854
0
    return match;
855
0
}
856
857
/*[clinic input]
858
@permit_long_summary
859
_sre.SRE_Pattern.search
860
861
    cls: defining_class
862
    /
863
    string: object
864
    pos: Py_ssize_t = 0
865
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
866
867
Scan through string looking for a match, and return a corresponding match object instance.
868
869
Return None if no position in the string matches.
870
[clinic start generated code]*/
871
872
static PyObject *
873
_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
874
                             PyObject *string, Py_ssize_t pos,
875
                             Py_ssize_t endpos)
876
/*[clinic end generated code: output=bd7f2d9d583e1463 input=05e9feee0334c156]*/
877
5.79M
{
878
5.79M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
879
5.79M
    SRE_STATE state;
880
5.79M
    Py_ssize_t status;
881
5.79M
    PyObject *match;
882
883
5.79M
    if (!state_init(&state, self, string, pos, endpos))
884
0
        return NULL;
885
886
5.79M
    INIT_TRACE(&state);
887
5.79M
    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
888
889
5.79M
    status = sre_search(&state, PatternObject_GetCode(self));
890
891
5.79M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
892
893
5.79M
    if (PyErr_Occurred()) {
894
0
        state_fini(&state);
895
0
        return NULL;
896
0
    }
897
898
5.79M
    match = pattern_new_match(module_state, self, &state, status);
899
5.79M
    state_fini(&state);
900
5.79M
    return match;
901
5.79M
}
902
903
/*[clinic input]
904
_sre.SRE_Pattern.findall
905
906
    string: object
907
    pos: Py_ssize_t = 0
908
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
909
910
Return a list of all non-overlapping matches of pattern in string.
911
[clinic start generated code]*/
912
913
static PyObject *
914
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
915
                              Py_ssize_t pos, Py_ssize_t endpos)
916
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
917
4.14M
{
918
4.14M
    SRE_STATE state;
919
4.14M
    PyObject* list;
920
4.14M
    Py_ssize_t status;
921
4.14M
    Py_ssize_t i, b, e;
922
923
4.14M
    if (!state_init(&state, self, string, pos, endpos))
924
0
        return NULL;
925
926
4.14M
    list = PyList_New(0);
927
4.14M
    if (!list) {
928
0
        state_fini(&state);
929
0
        return NULL;
930
0
    }
931
932
79.9M
    while (state.start <= state.end) {
933
934
79.9M
        PyObject* item;
935
936
79.9M
        state_reset(&state);
937
938
79.9M
        state.ptr = state.start;
939
940
79.9M
        status = sre_search(&state, PatternObject_GetCode(self));
941
79.9M
        if (PyErr_Occurred())
942
0
            goto error;
943
944
79.9M
        if (status <= 0) {
945
4.14M
            if (status == 0)
946
4.14M
                break;
947
0
            pattern_error(status);
948
0
            goto error;
949
4.14M
        }
950
951
        /* don't bother to build a match object */
952
75.7M
        switch (self->groups) {
953
75.7M
        case 0:
954
75.7M
            b = STATE_OFFSET(&state, state.start);
955
75.7M
            e = STATE_OFFSET(&state, state.ptr);
956
75.7M
            item = getslice(state.isbytes, state.beginning,
957
75.7M
                            string, b, e);
958
75.7M
            if (!item)
959
0
                goto error;
960
75.7M
            break;
961
75.7M
        case 1:
962
0
            item = state_getslice(&state, 1, string, 1);
963
0
            if (!item)
964
0
                goto error;
965
0
            break;
966
0
        default:
967
0
            item = PyTuple_New(self->groups);
968
0
            if (!item)
969
0
                goto error;
970
0
            for (i = 0; i < self->groups; i++) {
971
0
                PyObject* o = state_getslice(&state, i+1, string, 1);
972
0
                if (!o) {
973
0
                    Py_DECREF(item);
974
0
                    goto error;
975
0
                }
976
0
                PyTuple_SET_ITEM(item, i, o);
977
0
            }
978
0
            break;
979
75.7M
        }
980
981
75.7M
        status = PyList_Append(list, item);
982
75.7M
        Py_DECREF(item);
983
75.7M
        if (status < 0)
984
0
            goto error;
985
986
75.7M
        state.must_advance = (state.ptr == state.start);
987
75.7M
        state.start = state.ptr;
988
75.7M
    }
989
990
4.14M
    state_fini(&state);
991
4.14M
    return list;
992
993
0
error:
994
0
    Py_DECREF(list);
995
0
    state_fini(&state);
996
0
    return NULL;
997
998
4.14M
}
999
1000
/*[clinic input]
1001
@permit_long_summary
1002
_sre.SRE_Pattern.finditer
1003
1004
    cls: defining_class
1005
    /
1006
    string: object
1007
    pos: Py_ssize_t = 0
1008
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1009
1010
Return an iterator over all non-overlapping matches for the RE pattern in string.
1011
1012
For each match, the iterator returns a match object.
1013
[clinic start generated code]*/
1014
1015
static PyObject *
1016
_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
1017
                               PyObject *string, Py_ssize_t pos,
1018
                               Py_ssize_t endpos)
1019
/*[clinic end generated code: output=1791dbf3618ade56 input=ee28865796048023]*/
1020
388k
{
1021
388k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1022
388k
    PyObject* scanner;
1023
388k
    PyObject* search;
1024
388k
    PyObject* iterator;
1025
1026
388k
    scanner = pattern_scanner(module_state, self, string, pos, endpos);
1027
388k
    if (!scanner)
1028
0
        return NULL;
1029
1030
388k
    search = PyObject_GetAttrString(scanner, "search");
1031
388k
    Py_DECREF(scanner);
1032
388k
    if (!search)
1033
0
        return NULL;
1034
1035
388k
    iterator = PyCallIter_New(search, Py_None);
1036
388k
    Py_DECREF(search);
1037
1038
388k
    return iterator;
1039
388k
}
1040
1041
/*[clinic input]
1042
_sre.SRE_Pattern.scanner
1043
1044
    cls: defining_class
1045
    /
1046
    string: object
1047
    pos: Py_ssize_t = 0
1048
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1049
1050
[clinic start generated code]*/
1051
1052
static PyObject *
1053
_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
1054
                              PyObject *string, Py_ssize_t pos,
1055
                              Py_ssize_t endpos)
1056
/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
1057
0
{
1058
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1059
1060
0
    return pattern_scanner(module_state, self, string, pos, endpos);
1061
0
}
1062
1063
/*[clinic input]
1064
_sre.SRE_Pattern.split
1065
1066
    string: object
1067
    maxsplit: Py_ssize_t = 0
1068
1069
Split string by the occurrences of pattern.
1070
[clinic start generated code]*/
1071
1072
static PyObject *
1073
_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
1074
                            Py_ssize_t maxsplit)
1075
/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
1076
1.71M
{
1077
1.71M
    SRE_STATE state;
1078
1.71M
    PyObject* list;
1079
1.71M
    PyObject* item;
1080
1.71M
    Py_ssize_t status;
1081
1.71M
    Py_ssize_t n;
1082
1.71M
    Py_ssize_t i;
1083
1.71M
    const void* last;
1084
1085
1.71M
    assert(self->codesize != 0);
1086
1087
1.71M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
1088
0
        return NULL;
1089
1090
1.71M
    list = PyList_New(0);
1091
1.71M
    if (!list) {
1092
0
        state_fini(&state);
1093
0
        return NULL;
1094
0
    }
1095
1096
1.71M
    n = 0;
1097
1.71M
    last = state.start;
1098
1099
2.88M
    while (!maxsplit || n < maxsplit) {
1100
1101
1.77M
        state_reset(&state);
1102
1103
1.77M
        state.ptr = state.start;
1104
1105
1.77M
        status = sre_search(&state, PatternObject_GetCode(self));
1106
1.77M
        if (PyErr_Occurred())
1107
0
            goto error;
1108
1109
1.77M
        if (status <= 0) {
1110
597k
            if (status == 0)
1111
597k
                break;
1112
0
            pattern_error(status);
1113
0
            goto error;
1114
597k
        }
1115
1116
        /* get segment before this match */
1117
1.17M
        item = getslice(state.isbytes, state.beginning,
1118
1.17M
            string, STATE_OFFSET(&state, last),
1119
1.17M
            STATE_OFFSET(&state, state.start)
1120
1.17M
            );
1121
1.17M
        if (!item)
1122
0
            goto error;
1123
1.17M
        status = PyList_Append(list, item);
1124
1.17M
        Py_DECREF(item);
1125
1.17M
        if (status < 0)
1126
0
            goto error;
1127
1128
        /* add groups (if any) */
1129
2.28M
        for (i = 0; i < self->groups; i++) {
1130
1.11M
            item = state_getslice(&state, i+1, string, 0);
1131
1.11M
            if (!item)
1132
0
                goto error;
1133
1.11M
            status = PyList_Append(list, item);
1134
1.11M
            Py_DECREF(item);
1135
1.11M
            if (status < 0)
1136
0
                goto error;
1137
1.11M
        }
1138
1139
1.17M
        n = n + 1;
1140
1.17M
        state.must_advance = (state.ptr == state.start);
1141
1.17M
        last = state.start = state.ptr;
1142
1143
1.17M
    }
1144
1145
    /* get segment following last match (even if empty) */
1146
1.71M
    item = getslice(state.isbytes, state.beginning,
1147
1.71M
        string, STATE_OFFSET(&state, last), state.endpos
1148
1.71M
        );
1149
1.71M
    if (!item)
1150
0
        goto error;
1151
1.71M
    status = PyList_Append(list, item);
1152
1.71M
    Py_DECREF(item);
1153
1.71M
    if (status < 0)
1154
0
        goto error;
1155
1156
1.71M
    state_fini(&state);
1157
1.71M
    return list;
1158
1159
0
error:
1160
0
    Py_DECREF(list);
1161
0
    state_fini(&state);
1162
0
    return NULL;
1163
1164
1.71M
}
1165
1166
static PyObject *
1167
compile_template(_sremodulestate *module_state,
1168
                 PatternObject *pattern, PyObject *template)
1169
0
{
1170
    /* delegate to Python code */
1171
0
    PyObject *func = FT_ATOMIC_LOAD_PTR(module_state->compile_template);
1172
0
    if (func == NULL) {
1173
0
        func = PyImport_ImportModuleAttrString("re", "_compile_template");
1174
0
        if (func == NULL) {
1175
0
            return NULL;
1176
0
        }
1177
#ifdef Py_GIL_DISABLED
1178
        PyObject *other_func = NULL;
1179
        if (!_Py_atomic_compare_exchange_ptr(&module_state->compile_template, &other_func, func))  {
1180
            Py_DECREF(func);
1181
            func = other_func;
1182
        }
1183
#else
1184
0
        Py_XSETREF(module_state->compile_template, func);
1185
0
#endif
1186
0
    }
1187
1188
0
    PyObject *args[] = {(PyObject *)pattern, template};
1189
0
    PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1190
1191
0
    if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1192
        /* If the replacement string is unhashable (e.g. bytearray),
1193
         * convert it to the basic type (str or bytes) and repeat. */
1194
0
        if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1195
0
            PyErr_Clear();
1196
0
            template = _PyUnicode_Copy(template);
1197
0
        }
1198
0
        else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1199
0
            PyErr_Clear();
1200
0
            template = PyBytes_FromObject(template);
1201
0
        }
1202
0
        else {
1203
0
            return NULL;
1204
0
        }
1205
0
        if (template == NULL) {
1206
0
            return NULL;
1207
0
        }
1208
0
        args[1] = template;
1209
0
        result = PyObject_Vectorcall(func, args, 2, NULL);
1210
0
        Py_DECREF(template);
1211
0
    }
1212
1213
0
    if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1214
0
        PyErr_Format(PyExc_RuntimeError,
1215
0
                    "the result of compiling a replacement string is %.200s",
1216
0
                    Py_TYPE(result)->tp_name);
1217
0
        Py_DECREF(result);
1218
0
        return NULL;
1219
0
    }
1220
0
    return result;
1221
0
}
1222
1223
static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1224
1225
static PyObject*
1226
pattern_subx(_sremodulestate* module_state,
1227
             PatternObject* self,
1228
             PyObject* ptemplate,
1229
             PyObject* string,
1230
             Py_ssize_t count,
1231
             Py_ssize_t subn)
1232
11.5M
{
1233
11.5M
    SRE_STATE state;
1234
11.5M
    PyObject* list;
1235
11.5M
    PyObject* joiner;
1236
11.5M
    PyObject* item;
1237
11.5M
    PyObject* filter;
1238
11.5M
    PyObject* match;
1239
11.5M
    const void* ptr;
1240
11.5M
    Py_ssize_t status;
1241
11.5M
    Py_ssize_t n;
1242
11.5M
    Py_ssize_t i, b, e;
1243
11.5M
    int isbytes, charsize;
1244
11.5M
    enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1245
11.5M
    Py_buffer view;
1246
1247
11.5M
    if (PyCallable_Check(ptemplate)) {
1248
        /* sub/subn takes either a function or a template */
1249
3.12M
        filter = Py_NewRef(ptemplate);
1250
3.12M
        filter_type = CALLABLE;
1251
8.40M
    } else {
1252
        /* if not callable, check if it's a literal string */
1253
8.40M
        int literal;
1254
8.40M
        view.buf = NULL;
1255
8.40M
        ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1256
8.40M
        if (ptr) {
1257
8.40M
            if (charsize == 1)
1258
8.40M
                literal = memchr(ptr, '\\', n) == NULL;
1259
0
            else
1260
0
                literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1261
8.40M
        } else {
1262
0
            PyErr_Clear();
1263
0
            literal = 0;
1264
0
        }
1265
8.40M
        if (view.buf)
1266
0
            PyBuffer_Release(&view);
1267
8.40M
        if (literal) {
1268
8.40M
            filter = Py_NewRef(ptemplate);
1269
8.40M
            filter_type = LITERAL;
1270
8.40M
        } else {
1271
            /* not a literal; hand it over to the template compiler */
1272
0
            filter = compile_template(module_state, self, ptemplate);
1273
0
            if (!filter)
1274
0
                return NULL;
1275
1276
0
            assert(Py_TYPE(filter) == module_state->Template_Type);
1277
0
            if (Py_SIZE(filter) == 0) {
1278
0
                Py_SETREF(filter,
1279
0
                          Py_NewRef(((TemplateObject *)filter)->literal));
1280
0
                filter_type = LITERAL;
1281
0
            }
1282
0
            else {
1283
0
                filter_type = TEMPLATE;
1284
0
            }
1285
0
        }
1286
8.40M
    }
1287
1288
11.5M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1289
0
        Py_DECREF(filter);
1290
0
        return NULL;
1291
0
    }
1292
1293
11.5M
    list = PyList_New(0);
1294
11.5M
    if (!list) {
1295
0
        Py_DECREF(filter);
1296
0
        state_fini(&state);
1297
0
        return NULL;
1298
0
    }
1299
1300
11.5M
    n = i = 0;
1301
1302
18.3M
    while (!count || n < count) {
1303
1304
18.3M
        state_reset(&state);
1305
1306
18.3M
        state.ptr = state.start;
1307
1308
18.3M
        status = sre_search(&state, PatternObject_GetCode(self));
1309
18.3M
        if (PyErr_Occurred())
1310
0
            goto error;
1311
1312
18.3M
        if (status <= 0) {
1313
11.5M
            if (status == 0)
1314
11.5M
                break;
1315
0
            pattern_error(status);
1316
0
            goto error;
1317
11.5M
        }
1318
1319
6.80M
        b = STATE_OFFSET(&state, state.start);
1320
6.80M
        e = STATE_OFFSET(&state, state.ptr);
1321
1322
6.80M
        if (i < b) {
1323
            /* get segment before this match */
1324
5.50M
            item = getslice(state.isbytes, state.beginning,
1325
5.50M
                string, i, b);
1326
5.50M
            if (!item)
1327
0
                goto error;
1328
5.50M
            status = PyList_Append(list, item);
1329
5.50M
            Py_DECREF(item);
1330
5.50M
            if (status < 0)
1331
0
                goto error;
1332
1333
5.50M
        }
1334
1335
6.80M
        if (filter_type != LITERAL) {
1336
            /* pass match object through filter */
1337
6.80M
            match = pattern_new_match(module_state, self, &state, 1);
1338
6.80M
            if (!match)
1339
0
                goto error;
1340
6.80M
            if (filter_type == TEMPLATE) {
1341
0
                item = expand_template((TemplateObject *)filter,
1342
0
                                       (MatchObject *)match);
1343
0
            }
1344
6.80M
            else {
1345
6.80M
                assert(filter_type == CALLABLE);
1346
6.80M
                item = PyObject_CallOneArg(filter, match);
1347
6.80M
            }
1348
6.80M
            Py_DECREF(match);
1349
6.80M
            if (!item)
1350
31
                goto error;
1351
6.80M
        } else {
1352
            /* filter is literal string */
1353
2.18k
            item = Py_NewRef(filter);
1354
2.18k
        }
1355
1356
        /* add to list */
1357
6.80M
        if (item != Py_None) {
1358
6.80M
            status = PyList_Append(list, item);
1359
6.80M
            Py_DECREF(item);
1360
6.80M
            if (status < 0)
1361
0
                goto error;
1362
6.80M
        }
1363
1364
6.80M
        i = e;
1365
6.80M
        n = n + 1;
1366
6.80M
        state.must_advance = (state.ptr == state.start);
1367
6.80M
        state.start = state.ptr;
1368
6.80M
    }
1369
1370
    /* get segment following last match */
1371
11.5M
    if (i < state.endpos) {
1372
7.94M
        item = getslice(state.isbytes, state.beginning,
1373
7.94M
                        string, i, state.endpos);
1374
7.94M
        if (!item)
1375
0
            goto error;
1376
7.94M
        status = PyList_Append(list, item);
1377
7.94M
        Py_DECREF(item);
1378
7.94M
        if (status < 0)
1379
0
            goto error;
1380
7.94M
    }
1381
1382
11.5M
    state_fini(&state);
1383
1384
11.5M
    Py_DECREF(filter);
1385
1386
    /* convert list to single string (also removes list) */
1387
11.5M
    joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1388
11.5M
    if (!joiner) {
1389
0
        Py_DECREF(list);
1390
0
        return NULL;
1391
0
    }
1392
11.5M
    if (PyList_GET_SIZE(list) == 0) {
1393
3.29M
        Py_DECREF(list);
1394
3.29M
        item = joiner;
1395
3.29M
    }
1396
8.22M
    else {
1397
8.22M
        if (state.isbytes)
1398
39.0k
            item = PyBytes_Join(joiner, list);
1399
8.18M
        else
1400
8.18M
            item = PyUnicode_Join(joiner, list);
1401
8.22M
        Py_DECREF(joiner);
1402
8.22M
        Py_DECREF(list);
1403
8.22M
        if (!item)
1404
0
            return NULL;
1405
8.22M
    }
1406
1407
11.5M
    if (subn)
1408
0
        return Py_BuildValue("Nn", item, n);
1409
1410
11.5M
    return item;
1411
1412
31
error:
1413
31
    Py_DECREF(list);
1414
31
    state_fini(&state);
1415
31
    Py_DECREF(filter);
1416
31
    return NULL;
1417
1418
11.5M
}
1419
1420
/*[clinic input]
1421
@permit_long_summary
1422
_sre.SRE_Pattern.sub
1423
1424
    cls: defining_class
1425
    /
1426
    repl: object
1427
    string: object
1428
    count: Py_ssize_t = 0
1429
1430
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1431
[clinic start generated code]*/
1432
1433
static PyObject *
1434
_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1435
                          PyObject *repl, PyObject *string, Py_ssize_t count)
1436
/*[clinic end generated code: output=4be141ab04bca60d input=eba511fd1c4908b7]*/
1437
11.5M
{
1438
11.5M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1439
1440
11.5M
    return pattern_subx(module_state, self, repl, string, count, 0);
1441
11.5M
}
1442
1443
/*[clinic input]
1444
@permit_long_summary
1445
_sre.SRE_Pattern.subn
1446
1447
    cls: defining_class
1448
    /
1449
    repl: object
1450
    string: object
1451
    count: Py_ssize_t = 0
1452
1453
Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1454
[clinic start generated code]*/
1455
1456
static PyObject *
1457
_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1458
                           PyObject *repl, PyObject *string,
1459
                           Py_ssize_t count)
1460
/*[clinic end generated code: output=da02fd85258b1e1f input=6a5bb5b61717abf0]*/
1461
0
{
1462
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1463
1464
0
    return pattern_subx(module_state, self, repl, string, count, 1);
1465
0
}
1466
1467
/*[clinic input]
1468
_sre.SRE_Pattern.__copy__
1469
1470
[clinic start generated code]*/
1471
1472
static PyObject *
1473
_sre_SRE_Pattern___copy___impl(PatternObject *self)
1474
/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1475
0
{
1476
0
    return Py_NewRef(self);
1477
0
}
1478
1479
/*[clinic input]
1480
_sre.SRE_Pattern.__deepcopy__
1481
1482
    memo: object
1483
    /
1484
1485
[clinic start generated code]*/
1486
1487
static PyObject *
1488
_sre_SRE_Pattern___deepcopy___impl(PatternObject *self, PyObject *memo)
1489
/*[clinic end generated code: output=75efe69bd12c5d7d input=a465b1602f997bed]*/
1490
0
{
1491
0
    return Py_NewRef(self);
1492
0
}
1493
1494
#ifdef Py_DEBUG
1495
/*[clinic input]
1496
_sre.SRE_Pattern._fail_after
1497
1498
    count: int
1499
    exception: object
1500
    /
1501
1502
For debugging.
1503
[clinic start generated code]*/
1504
1505
static PyObject *
1506
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1507
                                  PyObject *exception)
1508
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1509
{
1510
    self->fail_after_count = count;
1511
    Py_INCREF(exception);
1512
    Py_XSETREF(self->fail_after_exc, exception);
1513
    Py_RETURN_NONE;
1514
}
1515
#endif /* Py_DEBUG */
1516
1517
/* repr() of a pattern object: "re.compile(<pattern>)" or
 * "re.compile(<pattern>, <flags>)", where <flags> is a '|'-separated list of
 * the known flag names followed by any leftover bits in hexadecimal.
 * Returns a new str, or NULL with an exception set. */
static PyObject *
pattern_repr(PyObject *self)
{
    static const struct {
        const char *name;
        int value;
    } flag_names[] = {
        {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
        {"re.LOCALE", SRE_FLAG_LOCALE},
        {"re.MULTILINE", SRE_FLAG_MULTILINE},
        {"re.DOTALL", SRE_FLAG_DOTALL},
        {"re.UNICODE", SRE_FLAG_UNICODE},
        {"re.VERBOSE", SRE_FLAG_VERBOSE},
        {"re.DEBUG", SRE_FLAG_DEBUG},
        {"re.ASCII", SRE_FLAG_ASCII},
    };

    PatternObject *pat = _PatternObject_CAST(self);
    PyObject *text = NULL;
    int remaining = pat->flags;

    /* Omit re.UNICODE for valid string patterns. */
    if (pat->isbytes == 0) {
        int mode = remaining &
                   (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII);
        if (mode == SRE_FLAG_UNICODE) {
            remaining &= ~SRE_FLAG_UNICODE;
        }
    }

    PyObject *labels = PyList_New(0);
    if (labels == NULL) {
        return NULL;
    }

    /* Named flags first; every bit that gets a name is cleared. */
    for (size_t k = 0; k < Py_ARRAY_LENGTH(flag_names); k++) {
        if (!(remaining & flag_names[k].value)) {
            continue;
        }
        PyObject *label = PyUnicode_FromString(flag_names[k].name);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(labels, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
        remaining &= ~flag_names[k].value;
    }

    /* Whatever is still set has no name: show it as a hex literal. */
    if (remaining) {
        PyObject *label = PyUnicode_FromFormat("0x%x", remaining);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(labels, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
    }

    if (PyList_Size(labels) > 0) {
        PyObject *sep = PyUnicode_FromString("|");
        if (sep == NULL) {
            goto done;
        }
        PyObject *joined = PyUnicode_Join(sep, labels);
        Py_DECREF(sep);
        if (joined == NULL) {
            goto done;
        }
        text = PyUnicode_FromFormat("re.compile(%.200R, %S)",
                                    pat->pattern, joined);
        Py_DECREF(joined);
    }
    else {
        text = PyUnicode_FromFormat("re.compile(%.200R)", pat->pattern);
    }

done:
    Py_DECREF(labels);
    return text;
}
1597
1598
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1599
1600
/* PatternObject's 'groupindex' method. */
1601
/* Returns a new empty dict when the pattern stores no groupindex, otherwise
 * a mapping proxy wrapping the stored groupindex. */
static PyObject *
pattern_groupindex(PyObject *op, void *Py_UNUSED(ignored))
{
    PyObject *mapping = _PatternObject_CAST(op)->groupindex;

    return (mapping == NULL) ? PyDict_New() : PyDictProxy_New(mapping);
}
1609
1610
static int _validate(PatternObject *self); /* Forward */
1611
1612
/*[clinic input]
1613
_sre.compile
1614
1615
    pattern: object
1616
    flags: int
1617
    code: object(subclass_of='&PyList_Type')
1618
    groups: Py_ssize_t
1619
    groupindex: object(subclass_of='&PyDict_Type')
1620
    indexgroup: object(subclass_of='&PyTuple_Type')
1621
1622
[clinic start generated code]*/
1623
1624
static PyObject *
1625
_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1626
                  PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1627
                  PyObject *indexgroup)
1628
/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1629
3.14k
{
1630
    /* "compile" pattern descriptor to pattern object */
1631
1632
3.14k
    _sremodulestate *module_state = get_sre_module_state(module);
1633
3.14k
    PatternObject* self;
1634
3.14k
    Py_ssize_t i, n;
1635
1636
3.14k
    n = PyList_GET_SIZE(code);
1637
    /* coverity[ampersand_in_size] */
1638
3.14k
    self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1639
3.14k
    if (!self)
1640
0
        return NULL;
1641
3.14k
    self->weakreflist = NULL;
1642
3.14k
    self->pattern = NULL;
1643
3.14k
    self->groupindex = NULL;
1644
3.14k
    self->indexgroup = NULL;
1645
#ifdef Py_DEBUG
1646
    self->fail_after_count = -1;
1647
    self->fail_after_exc = NULL;
1648
#endif
1649
1650
3.14k
    self->codesize = n;
1651
1652
90.1M
    for (i = 0; i < n; i++) {
1653
90.1M
        PyObject *o = PyList_GET_ITEM(code, i);
1654
90.1M
        unsigned long value = PyLong_AsUnsignedLong(o);
1655
90.1M
        if (value == (unsigned long)-1 && PyErr_Occurred()) {
1656
0
            break;
1657
0
        }
1658
90.1M
        self->code[i] = (SRE_CODE) value;
1659
90.1M
        if ((unsigned long) self->code[i] != value) {
1660
0
            PyErr_SetString(PyExc_OverflowError,
1661
0
                            "regular expression code size limit exceeded");
1662
0
            break;
1663
0
        }
1664
90.1M
    }
1665
3.14k
    PyObject_GC_Track(self);
1666
1667
3.14k
    if (PyErr_Occurred()) {
1668
0
        Py_DECREF(self);
1669
0
        return NULL;
1670
0
    }
1671
1672
3.14k
    if (pattern == Py_None) {
1673
0
        self->isbytes = -1;
1674
0
    }
1675
3.14k
    else {
1676
3.14k
        Py_ssize_t p_length;
1677
3.14k
        int charsize;
1678
3.14k
        Py_buffer view;
1679
3.14k
        view.buf = NULL;
1680
3.14k
        if (!getstring(pattern, &p_length, &self->isbytes,
1681
3.14k
                       &charsize, &view)) {
1682
0
            Py_DECREF(self);
1683
0
            return NULL;
1684
0
        }
1685
3.14k
        if (view.buf)
1686
30
            PyBuffer_Release(&view);
1687
3.14k
    }
1688
1689
3.14k
    self->pattern = Py_NewRef(pattern);
1690
1691
3.14k
    self->flags = flags;
1692
1693
3.14k
    self->groups = groups;
1694
1695
3.14k
    if (PyDict_GET_SIZE(groupindex) > 0) {
1696
30
        self->groupindex = Py_NewRef(groupindex);
1697
30
        if (PyTuple_GET_SIZE(indexgroup) > 0) {
1698
30
            self->indexgroup = Py_NewRef(indexgroup);
1699
30
        }
1700
30
    }
1701
1702
3.14k
    if (!_validate(self)) {
1703
0
        Py_DECREF(self);
1704
0
        return NULL;
1705
0
    }
1706
1707
3.14k
    return (PyObject*) self;
1708
3.14k
}
1709
1710
/*[clinic input]
1711
_sre.template
1712
1713
    pattern: object
1714
    template: object(subclass_of="&PyList_Type")
1715
        A list containing interleaved literal strings (str or bytes) and group
1716
        indices (int), as returned by re._parser.parse_template():
1717
            [literal1, group1, ..., literalN, groupN]
1718
    /
1719
1720
[clinic start generated code]*/
1721
1722
static PyObject *
_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
{
    /* template is a list containing interleaved literal strings (str or bytes)
     * and group indices (int), as returned by _parser.parse_template:
     * [literal1, group1, literal2, ..., literalN].
     *
     * The list must therefore have odd length.  Its first element is stored
     * as self->literal; every following (group, literal) pair fills one entry
     * of self->items.  Raises TypeError("invalid template") for a list of
     * even length or a negative group index.
     */
    _sremodulestate *module_state = get_sre_module_state(module);
    TemplateObject *self = NULL;
    const Py_ssize_t total = PyList_GET_SIZE(template);

    if (total < 1 || (total & 1) == 0) {
        goto bad_template;
    }

    const Py_ssize_t npairs = total / 2;
    self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, npairs);
    if (self == NULL) {
        return NULL;
    }
    self->chunks = 1 + 2*npairs;
    self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));

    for (Py_ssize_t k = 0; k < npairs; k++) {
        PyObject *group = PyList_GET_ITEM(template, 2*k + 1);
        PyObject *text = PyList_GET_ITEM(template, 2*k + 2);

        Py_ssize_t index = PyLong_AsSsize_t(group);
        if (index == -1 && PyErr_Occurred()) {
            /* Shrink the object to the k entries filled so far before
             * releasing it. */
            Py_SET_SIZE(self, k);
            Py_DECREF(self);
            return NULL;
        }
        if (index < 0) {
            Py_SET_SIZE(self, k);
            goto bad_template;
        }
        self->items[k].index = index;

        // Skip empty literals.
        int is_empty =
            (PyUnicode_Check(text) && !PyUnicode_GET_LENGTH(text)) ||
            (PyBytes_Check(text) && !PyBytes_GET_SIZE(text));
        if (is_empty) {
            self->chunks--;
            self->items[k].literal = NULL;
        }
        else {
            self->items[k].literal = Py_NewRef(text);
        }
    }
    PyObject_GC_Track(self);
    return (PyObject*) self;

bad_template:
    PyErr_SetString(PyExc_TypeError, "invalid template");
    Py_XDECREF(self);
    return NULL;
}
1773
1774
/* -------------------------------------------------------------------- */
1775
/* Code validation */
1776
1777
/* To learn more about this code, have a look at the _compile() function in
1778
   Lib/sre_compile.py.  The validation functions below check the code array
1779
   for conformance with the code patterns generated there.
1780
1781
   The nice thing about the generated code is that it is position-independent:
1782
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
1783
   the target of a later jump is always earlier than the target of an earlier
1784
   jump.  IOW, this is okay:
1785
1786
   J---------J-------T--------T
1787
    \         \_____/        /
1788
     \______________________/
1789
1790
   but this is not:
1791
1792
   J---------J-------T--------T
1793
    \_________\_____/        /
1794
               \____________/
1795
1796
   It also helps that SRE_CODE is always an unsigned type.
1797
*/
1798
1799
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesised
   printf argument list, e.g. VTRACE(("%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: makes the enclosing function return -1. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: `code`
   (read cursor, advanced by one), `end` (one past the last readable word)
   and the destination `op`, `arg` or `skip`.  Each one FAILs instead of
   reading at or beyond `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if `skip - adj` is larger than the number of
   words left from the skip word to `end`.  `adj` is parenthesised in the
   expansion so that an expression argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip - (adj) > (uintptr_t)(end - code))     \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1839
1840
static int
1841
_validate_charset(SRE_CODE *code, SRE_CODE *end)
1842
2.32M
{
1843
    /* Some variables are manipulated by the macros above */
1844
2.32M
    SRE_CODE op;
1845
2.32M
    SRE_CODE arg;
1846
2.32M
    SRE_CODE offset;
1847
2.32M
    int i;
1848
1849
6.89M
    while (code < end) {
1850
4.56M
        GET_OP;
1851
4.56M
        switch (op) {
1852
1853
824
        case SRE_OP_NEGATE:
1854
824
            break;
1855
1856
4.46M
        case SRE_OP_LITERAL:
1857
4.46M
            GET_ARG;
1858
4.46M
            break;
1859
1860
4.46M
        case SRE_OP_RANGE:
1861
22.0k
        case SRE_OP_RANGE_UNI_IGNORE:
1862
22.0k
            GET_ARG;
1863
22.0k
            GET_ARG;
1864
22.0k
            break;
1865
1866
22.0k
        case SRE_OP_CHARSET:
1867
5.88k
            offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1868
5.88k
            if (offset > (uintptr_t)(end - code))
1869
0
                FAIL;
1870
5.88k
            code += offset;
1871
5.88k
            break;
1872
1873
71.4k
        case SRE_OP_BIGCHARSET:
1874
71.4k
            GET_ARG; /* Number of blocks */
1875
71.4k
            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1876
71.4k
            if (offset > (uintptr_t)(end - code))
1877
0
                FAIL;
1878
            /* Make sure that each byte points to a valid block */
1879
18.3M
            for (i = 0; i < 256; i++) {
1880
18.2M
                if (((unsigned char *)code)[i] >= arg)
1881
0
                    FAIL;
1882
18.2M
            }
1883
71.4k
            code += offset;
1884
71.4k
            offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1885
71.4k
            if (offset > (uintptr_t)(end - code))
1886
0
                FAIL;
1887
71.4k
            code += offset;
1888
71.4k
            break;
1889
1890
508
        case SRE_OP_CATEGORY:
1891
508
            GET_ARG;
1892
508
            switch (arg) {
1893
32
            case SRE_CATEGORY_DIGIT:
1894
32
            case SRE_CATEGORY_NOT_DIGIT:
1895
40
            case SRE_CATEGORY_SPACE:
1896
40
            case SRE_CATEGORY_NOT_SPACE:
1897
50
            case SRE_CATEGORY_WORD:
1898
50
            case SRE_CATEGORY_NOT_WORD:
1899
50
            case SRE_CATEGORY_LINEBREAK:
1900
50
            case SRE_CATEGORY_NOT_LINEBREAK:
1901
50
            case SRE_CATEGORY_LOC_WORD:
1902
50
            case SRE_CATEGORY_LOC_NOT_WORD:
1903
60
            case SRE_CATEGORY_UNI_DIGIT:
1904
315
            case SRE_CATEGORY_UNI_NOT_DIGIT:
1905
486
            case SRE_CATEGORY_UNI_SPACE:
1906
496
            case SRE_CATEGORY_UNI_NOT_SPACE:
1907
506
            case SRE_CATEGORY_UNI_WORD:
1908
508
            case SRE_CATEGORY_UNI_NOT_WORD:
1909
508
            case SRE_CATEGORY_UNI_LINEBREAK:
1910
508
            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1911
508
                break;
1912
0
            default:
1913
0
                FAIL;
1914
508
            }
1915
508
            break;
1916
1917
508
        default:
1918
0
            FAIL;
1919
1920
4.56M
        }
1921
4.56M
    }
1922
1923
2.32M
    return 0;
1924
2.32M
}
1925
1926
/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1927
static int
1928
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1929
2.24M
{
1930
    /* Some variables are manipulated by the macros above */
1931
2.24M
    SRE_CODE op;
1932
2.24M
    SRE_CODE arg;
1933
2.24M
    SRE_CODE skip;
1934
1935
2.24M
    VTRACE(("code=%p, end=%p\n", code, end));
1936
1937
2.24M
    if (code > end)
1938
0
        FAIL;
1939
1940
29.2M
    while (code < end) {
1941
26.9M
        GET_OP;
1942
26.9M
        switch (op) {
1943
1944
97.4k
        case SRE_OP_MARK:
1945
            /* We don't check whether marks are properly nested; the
1946
               sre_match() code is robust even if they don't, and the worst
1947
               you can get is nonsensical match results. */
1948
97.4k
            GET_ARG;
1949
97.4k
            if (arg >= 2 * (size_t)groups) {
1950
0
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1951
0
                FAIL;
1952
0
            }
1953
97.4k
            break;
1954
1955
18.3M
        case SRE_OP_LITERAL:
1956
18.3M
        case SRE_OP_NOT_LITERAL:
1957
18.3M
        case SRE_OP_LITERAL_IGNORE:
1958
18.3M
        case SRE_OP_NOT_LITERAL_IGNORE:
1959
22.9M
        case SRE_OP_LITERAL_UNI_IGNORE:
1960
22.9M
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1961
22.9M
        case SRE_OP_LITERAL_LOC_IGNORE:
1962
22.9M
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1963
22.9M
            GET_ARG;
1964
            /* The arg is just a character, nothing to check */
1965
22.9M
            break;
1966
1967
22.9M
        case SRE_OP_SUCCESS:
1968
50
        case SRE_OP_FAILURE:
1969
            /* Nothing to check; these normally end the matching process */
1970
50
            break;
1971
1972
43.0k
        case SRE_OP_AT:
1973
43.0k
            GET_ARG;
1974
43.0k
            switch (arg) {
1975
27
            case SRE_AT_BEGINNING:
1976
31
            case SRE_AT_BEGINNING_STRING:
1977
3.39k
            case SRE_AT_BEGINNING_LINE:
1978
3.42k
            case SRE_AT_END:
1979
39.5k
            case SRE_AT_END_LINE:
1980
39.5k
            case SRE_AT_END_STRING:
1981
39.5k
            case SRE_AT_BOUNDARY:
1982
39.5k
            case SRE_AT_NON_BOUNDARY:
1983
39.5k
            case SRE_AT_LOC_BOUNDARY:
1984
39.5k
            case SRE_AT_LOC_NON_BOUNDARY:
1985
43.0k
            case SRE_AT_UNI_BOUNDARY:
1986
43.0k
            case SRE_AT_UNI_NON_BOUNDARY:
1987
43.0k
                break;
1988
0
            default:
1989
0
                FAIL;
1990
43.0k
            }
1991
43.0k
            break;
1992
1993
244k
        case SRE_OP_ANY:
1994
244k
        case SRE_OP_ANY_ALL:
1995
            /* These have no operands */
1996
244k
            break;
1997
1998
32.2k
        case SRE_OP_IN:
1999
32.6k
        case SRE_OP_IN_IGNORE:
2000
2.32M
        case SRE_OP_IN_UNI_IGNORE:
2001
2.32M
        case SRE_OP_IN_LOC_IGNORE:
2002
2.32M
            GET_SKIP;
2003
            /* Stop 1 before the end; we check the FAILURE below */
2004
2.32M
            if (_validate_charset(code, code+skip-2))
2005
0
                FAIL;
2006
2.32M
            if (code[skip-2] != SRE_OP_FAILURE)
2007
0
                FAIL;
2008
2.32M
            code += skip-1;
2009
2.32M
            break;
2010
2011
3.14k
        case SRE_OP_INFO:
2012
3.14k
            {
2013
                /* A minimal info field is
2014
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2015
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2016
                   more follows. */
2017
3.14k
                SRE_CODE flags, i;
2018
3.14k
                SRE_CODE *newcode;
2019
3.14k
                GET_SKIP;
2020
3.14k
                newcode = code+skip-1;
2021
3.14k
                GET_ARG; flags = arg;
2022
3.14k
                GET_ARG;
2023
3.14k
                GET_ARG;
2024
                /* Check that only valid flags are present */
2025
3.14k
                if ((flags & ~(SRE_INFO_PREFIX |
2026
3.14k
                               SRE_INFO_LITERAL |
2027
3.14k
                               SRE_INFO_CHARSET)) != 0)
2028
0
                    FAIL;
2029
                /* PREFIX and CHARSET are mutually exclusive */
2030
3.14k
                if ((flags & SRE_INFO_PREFIX) &&
2031
1.39k
                    (flags & SRE_INFO_CHARSET))
2032
0
                    FAIL;
2033
                /* LITERAL implies PREFIX */
2034
3.14k
                if ((flags & SRE_INFO_LITERAL) &&
2035
581
                    !(flags & SRE_INFO_PREFIX))
2036
0
                    FAIL;
2037
                /* Validate the prefix */
2038
3.14k
                if (flags & SRE_INFO_PREFIX) {
2039
1.39k
                    SRE_CODE prefix_len;
2040
1.39k
                    GET_ARG; prefix_len = arg;
2041
1.39k
                    GET_ARG;
2042
                    /* Here comes the prefix string */
2043
1.39k
                    if (prefix_len > (uintptr_t)(newcode - code))
2044
0
                        FAIL;
2045
1.39k
                    code += prefix_len;
2046
                    /* And here comes the overlap table */
2047
1.39k
                    if (prefix_len > (uintptr_t)(newcode - code))
2048
0
                        FAIL;
2049
                    /* Each overlap value should be < prefix_len */
2050
5.86M
                    for (i = 0; i < prefix_len; i++) {
2051
5.86M
                        if (code[i] >= prefix_len)
2052
0
                            FAIL;
2053
5.86M
                    }
2054
1.39k
                    code += prefix_len;
2055
1.39k
                }
2056
                /* Validate the charset */
2057
3.14k
                if (flags & SRE_INFO_CHARSET) {
2058
366
                    if (_validate_charset(code, newcode-1))
2059
0
                        FAIL;
2060
366
                    if (newcode[-1] != SRE_OP_FAILURE)
2061
0
                        FAIL;
2062
366
                    code = newcode;
2063
366
                }
2064
2.77k
                else if (code != newcode) {
2065
0
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
2066
0
                    FAIL;
2067
0
                }
2068
3.14k
            }
2069
3.14k
            break;
2070
2071
6.03k
        case SRE_OP_BRANCH:
2072
6.03k
            {
2073
6.03k
                SRE_CODE *target = NULL;
2074
1.01M
                for (;;) {
2075
1.01M
                    GET_SKIP;
2076
1.01M
                    if (skip == 0)
2077
6.03k
                        break;
2078
                    /* Stop 2 before the end; we check the JUMP below */
2079
1.00M
                    if (_validate_inner(code, code+skip-3, groups))
2080
0
                        FAIL;
2081
1.00M
                    code += skip-3;
2082
                    /* Check that it ends with a JUMP, and that each JUMP
2083
                       has the same target */
2084
1.00M
                    GET_OP;
2085
1.00M
                    if (op != SRE_OP_JUMP)
2086
0
                        FAIL;
2087
1.00M
                    GET_SKIP;
2088
1.00M
                    if (target == NULL)
2089
6.03k
                        target = code+skip-1;
2090
998k
                    else if (code+skip-1 != target)
2091
0
                        FAIL;
2092
1.00M
                }
2093
6.03k
                if (code != target)
2094
0
                    FAIL;
2095
6.03k
            }
2096
6.03k
            break;
2097
2098
1.23M
        case SRE_OP_REPEAT_ONE:
2099
1.23M
        case SRE_OP_MIN_REPEAT_ONE:
2100
1.23M
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
2101
1.23M
            {
2102
1.23M
                SRE_CODE min, max;
2103
1.23M
                GET_SKIP;
2104
1.23M
                GET_ARG; min = arg;
2105
1.23M
                GET_ARG; max = arg;
2106
1.23M
                if (min > max)
2107
0
                    FAIL;
2108
1.23M
                if (max > SRE_MAXREPEAT)
2109
0
                    FAIL;
2110
1.23M
                if (_validate_inner(code, code+skip-4, groups))
2111
0
                    FAIL;
2112
1.23M
                code += skip-4;
2113
1.23M
                GET_OP;
2114
1.23M
                if (op != SRE_OP_SUCCESS)
2115
0
                    FAIL;
2116
1.23M
            }
2117
1.23M
            break;
2118
2119
1.23M
        case SRE_OP_REPEAT:
2120
6.01k
        case SRE_OP_POSSESSIVE_REPEAT:
2121
6.01k
            {
2122
6.01k
                SRE_CODE op1 = op, min, max;
2123
6.01k
                GET_SKIP;
2124
6.01k
                GET_ARG; min = arg;
2125
6.01k
                GET_ARG; max = arg;
2126
6.01k
                if (min > max)
2127
0
                    FAIL;
2128
6.01k
                if (max > SRE_MAXREPEAT)
2129
0
                    FAIL;
2130
6.01k
                if (_validate_inner(code, code+skip-3, groups))
2131
0
                    FAIL;
2132
6.01k
                code += skip-3;
2133
6.01k
                GET_OP;
2134
6.01k
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
2135
29
                    if (op != SRE_OP_SUCCESS)
2136
0
                        FAIL;
2137
29
                }
2138
5.99k
                else {
2139
5.99k
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
2140
0
                        FAIL;
2141
5.99k
                }
2142
6.01k
            }
2143
6.01k
            break;
2144
2145
6.01k
        case SRE_OP_ATOMIC_GROUP:
2146
14
            {
2147
14
                GET_SKIP;
2148
14
                if (_validate_inner(code, code+skip-2, groups))
2149
0
                    FAIL;
2150
14
                code += skip-2;
2151
14
                GET_OP;
2152
14
                if (op != SRE_OP_SUCCESS)
2153
0
                    FAIL;
2154
14
            }
2155
14
            break;
2156
2157
14
        case SRE_OP_GROUPREF:
2158
0
        case SRE_OP_GROUPREF_IGNORE:
2159
28
        case SRE_OP_GROUPREF_UNI_IGNORE:
2160
28
        case SRE_OP_GROUPREF_LOC_IGNORE:
2161
28
            GET_ARG;
2162
28
            if (arg >= (size_t)groups)
2163
0
                FAIL;
2164
28
            break;
2165
2166
28
        case SRE_OP_GROUPREF_EXISTS:
2167
            /* The regex syntax for this is: '(?(group)then|else)', where
2168
               'group' is either an integer group number or a group name,
2169
               'then' and 'else' are sub-regexes, and 'else' is optional. */
2170
8
            GET_ARG;
2171
8
            if (arg >= (size_t)groups)
2172
0
                FAIL;
2173
8
            GET_SKIP_ADJ(1);
2174
8
            code--; /* The skip is relative to the first arg! */
2175
            /* There are two possibilities here: if there is both a 'then'
2176
               part and an 'else' part, the generated code looks like:
2177
2178
               GROUPREF_EXISTS
2179
               <group>
2180
               <skipyes>
2181
               ...then part...
2182
               JUMP
2183
               <skipno>
2184
               (<skipyes> jumps here)
2185
               ...else part...
2186
               (<skipno> jumps here)
2187
2188
               If there is only a 'then' part, it looks like:
2189
2190
               GROUPREF_EXISTS
2191
               <group>
2192
               <skip>
2193
               ...then part...
2194
               (<skip> jumps here)
2195
2196
               There is no direct way to decide which it is, and we don't want
2197
               to allow arbitrary jumps anywhere in the code; so we just look
2198
               for a JUMP opcode preceding our skip target.
2199
            */
2200
8
            VTRACE(("then part:\n"));
2201
8
            int rc = _validate_inner(code+1, code+skip-1, groups);
2202
8
            if (rc == 1) {
2203
4
                VTRACE(("else part:\n"));
2204
4
                code += skip-2; /* Position after JUMP, at <skipno> */
2205
4
                GET_SKIP;
2206
4
                rc = _validate_inner(code, code+skip-1, groups);
2207
4
            }
2208
8
            if (rc)
2209
0
                FAIL;
2210
8
            code += skip-1;
2211
8
            break;
2212
2213
41
        case SRE_OP_ASSERT:
2214
258
        case SRE_OP_ASSERT_NOT:
2215
258
            GET_SKIP;
2216
258
            GET_ARG; /* 0 for lookahead, width for lookbehind */
2217
258
            code--; /* Back up over arg to simplify math below */
2218
            /* Stop 1 before the end; we check the SUCCESS below */
2219
258
            if (_validate_inner(code+1, code+skip-2, groups))
2220
0
                FAIL;
2221
258
            code += skip-2;
2222
258
            GET_OP;
2223
258
            if (op != SRE_OP_SUCCESS)
2224
0
                FAIL;
2225
258
            break;
2226
2227
258
        case SRE_OP_JUMP:
2228
4
            if (code + 1 != end)
2229
0
                FAIL;
2230
4
            VTRACE(("JUMP: %d\n", __LINE__));
2231
4
            return 1;
2232
2233
0
        default:
2234
0
            FAIL;
2235
2236
26.9M
        }
2237
26.9M
    }
2238
2239
2.24M
    VTRACE(("okay\n"));
2240
2.24M
    return 0;
2241
2.24M
}
2242
2243
static int
2244
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2245
3.14k
{
2246
3.14k
    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2247
3.14k
        code >= end || end[-1] != SRE_OP_SUCCESS)
2248
0
        FAIL;
2249
3.14k
    return _validate_inner(code, end-1, groups);
2250
3.14k
}
2251
2252
static int
2253
_validate(PatternObject *self)
2254
3.14k
{
2255
3.14k
    if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2256
0
    {
2257
0
        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2258
0
        return 0;
2259
0
    }
2260
3.14k
    else
2261
3.14k
        VTRACE(("Success!\n"));
2262
3.14k
    return 1;
2263
3.14k
}
2264
2265
/* -------------------------------------------------------------------- */
2266
/* match methods */
2267
2268
static int
2269
match_traverse(PyObject *op, visitproc visit, void *arg)
2270
5.91k
{
2271
5.91k
    MatchObject *self = _MatchObject_CAST(op);
2272
5.91k
    Py_VISIT(Py_TYPE(self));
2273
5.91k
    Py_VISIT(self->string);
2274
5.91k
    Py_VISIT(self->regs);
2275
5.91k
    Py_VISIT(self->pattern);
2276
5.91k
    return 0;
2277
5.91k
}
2278
2279
static int
2280
match_clear(PyObject *op)
2281
60.5M
{
2282
60.5M
    MatchObject *self = _MatchObject_CAST(op);
2283
60.5M
    Py_CLEAR(self->string);
2284
60.5M
    Py_CLEAR(self->regs);
2285
60.5M
    Py_CLEAR(self->pattern);
2286
60.5M
    return 0;
2287
60.5M
}
2288
2289
static void
2290
match_dealloc(PyObject *self)
2291
60.5M
{
2292
60.5M
    PyTypeObject *tp = Py_TYPE(self);
2293
60.5M
    PyObject_GC_UnTrack(self);
2294
60.5M
    (void)match_clear(self);
2295
60.5M
    tp->tp_free(self);
2296
60.5M
    Py_DECREF(tp);
2297
60.5M
}
2298
2299
static PyObject*
2300
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2301
62.5M
{
2302
62.5M
    Py_ssize_t length;
2303
62.5M
    int isbytes, charsize;
2304
62.5M
    Py_buffer view;
2305
62.5M
    PyObject *result;
2306
62.5M
    const void* ptr;
2307
62.5M
    Py_ssize_t i, j;
2308
2309
62.5M
    assert(0 <= index && index < self->groups);
2310
62.5M
    index *= 2;
2311
2312
62.5M
    if (self->string == Py_None || self->mark[index] < 0) {
2313
        /* return default value if the string or group is undefined */
2314
13.9M
        return Py_NewRef(def);
2315
13.9M
    }
2316
2317
48.6M
    ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2318
48.6M
    if (ptr == NULL)
2319
0
        return NULL;
2320
2321
48.6M
    i = self->mark[index];
2322
48.6M
    j = self->mark[index+1];
2323
48.6M
    i = Py_MIN(i, length);
2324
48.6M
    j = Py_MIN(j, length);
2325
48.6M
    result = getslice(isbytes, ptr, self->string, i, j);
2326
48.6M
    if (isbytes && view.buf != NULL)
2327
540k
        PyBuffer_Release(&view);
2328
48.6M
    return result;
2329
48.6M
}
2330
2331
static Py_ssize_t
2332
match_getindex(MatchObject* self, PyObject* index)
2333
85.3M
{
2334
85.3M
    Py_ssize_t i;
2335
2336
85.3M
    if (index == NULL)
2337
        /* Default value */
2338
21.7M
        return 0;
2339
2340
63.5M
    if (PyIndex_Check(index)) {
2341
42.5M
        i = PyNumber_AsSsize_t(index, NULL);
2342
42.5M
    }
2343
21.0M
    else {
2344
21.0M
        i = -1;
2345
2346
21.0M
        if (self->pattern->groupindex) {
2347
21.0M
            index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2348
21.0M
            if (index && PyLong_Check(index)) {
2349
21.0M
                i = PyLong_AsSsize_t(index);
2350
21.0M
            }
2351
21.0M
        }
2352
21.0M
    }
2353
63.5M
    if (i < 0 || i >= self->groups) {
2354
        /* raise IndexError if we were given a bad group number */
2355
0
        if (!PyErr_Occurred()) {
2356
0
            PyErr_SetString(PyExc_IndexError, "no such group");
2357
0
        }
2358
0
        return -1;
2359
0
    }
2360
2361
    // Check that i*2 cannot overflow to make static analyzers happy
2362
63.5M
    assert((size_t)i <= SRE_MAXGROUPS);
2363
63.5M
    return i;
2364
63.5M
}
2365
2366
static PyObject*
2367
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2368
62.5M
{
2369
62.5M
    Py_ssize_t i = match_getindex(self, index);
2370
2371
62.5M
    if (i < 0) {
2372
0
        return NULL;
2373
0
    }
2374
2375
62.5M
    return match_getslice_by_index(self, i, def);
2376
62.5M
}
2377
2378
/*[clinic input]
2379
@permit_long_summary
2380
_sre.SRE_Match.expand
2381
2382
    template: object
2383
2384
Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2385
[clinic start generated code]*/
2386
2387
static PyObject *
2388
_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2389
/*[clinic end generated code: output=931b58ccc323c3a1 input=dc74d81265376ac3]*/
2390
0
{
2391
0
    _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2392
0
    PyObject *filter = compile_template(module_state, self->pattern, template);
2393
0
    if (filter == NULL) {
2394
0
        return NULL;
2395
0
    }
2396
0
    PyObject *result = expand_template((TemplateObject *)filter, self);
2397
0
    Py_DECREF(filter);
2398
0
    return result;
2399
0
}
2400
2401
/* Implementation of Match.group(): with no argument return group 0, with
   one argument return that group, otherwise return a tuple with one entry
   per argument.  Groups that did not participate yield None. */
static PyObject*
match_group(PyObject *op, PyObject* args)
{
    MatchObject *self = _MatchObject_CAST(op);
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);

    if (nargs == 0) {
        return match_getslice(self, _PyLong_GetZero(), Py_None);
    }
    if (nargs == 1) {
        return match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
    }

    /* fetch multiple items */
    PyObject *slices = PyTuple_New(nargs);
    if (slices == NULL) {
        return NULL;
    }
    for (Py_ssize_t pos = 0; pos < nargs; pos++) {
        PyObject *slice = match_getslice(self, PyTuple_GET_ITEM(args, pos),
                                         Py_None);
        if (slice == NULL) {
            Py_DECREF(slices);
            return NULL;
        }
        PyTuple_SET_ITEM(slices, pos, slice);
    }
    return slices;
}
2436
2437
/* Subscript access on a match: same as fetching a single group, with None
   as the value for a group that did not participate. */
static PyObject*
match_getitem(PyObject *op, PyObject* name)
{
    return match_getslice(_MatchObject_CAST(op), name, Py_None);
}
2443
2444
/*[clinic input]
2445
_sre.SRE_Match.groups
2446
2447
    default: object = None
2448
        Is used for groups that did not participate in the match.
2449
2450
Return a tuple containing all the subgroups of the match, from 1.
2451
[clinic start generated code]*/
2452
2453
static PyObject *
2454
_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2455
/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2456
112
{
2457
112
    PyObject* result;
2458
112
    Py_ssize_t index;
2459
2460
112
    result = PyTuple_New(self->groups-1);
2461
112
    if (!result)
2462
0
        return NULL;
2463
2464
1.12k
    for (index = 1; index < self->groups; index++) {
2465
1.01k
        PyObject* item;
2466
1.01k
        item = match_getslice_by_index(self, index, default_value);
2467
1.01k
        if (!item) {
2468
0
            Py_DECREF(result);
2469
0
            return NULL;
2470
0
        }
2471
1.01k
        PyTuple_SET_ITEM(result, index-1, item);
2472
1.01k
    }
2473
2474
112
    return result;
2475
112
}
2476
2477
/*[clinic input]
2478
@permit_long_summary
2479
_sre.SRE_Match.groupdict
2480
2481
    default: object = None
2482
        Is used for groups that did not participate in the match.
2483
2484
Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2485
[clinic start generated code]*/
2486
2487
static PyObject *
2488
_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2489
/*[clinic end generated code: output=29917c9073e41757 input=a8d3a1dc80336872]*/
2490
24
{
2491
24
    PyObject *result;
2492
24
    PyObject *key;
2493
24
    PyObject *value;
2494
24
    Py_ssize_t pos = 0;
2495
24
    Py_hash_t hash;
2496
2497
24
    result = PyDict_New();
2498
24
    if (!result || !self->pattern->groupindex)
2499
0
        return result;
2500
2501
24
    Py_BEGIN_CRITICAL_SECTION(self->pattern->groupindex);
2502
168
    while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2503
144
        int status;
2504
144
        Py_INCREF(key);
2505
144
        value = match_getslice(self, key, default_value);
2506
144
        if (!value) {
2507
0
            Py_DECREF(key);
2508
0
            Py_CLEAR(result);
2509
0
            goto exit;
2510
0
        }
2511
144
        status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2512
144
        Py_DECREF(value);
2513
144
        Py_DECREF(key);
2514
144
        if (status < 0) {
2515
0
            Py_CLEAR(result);
2516
0
            goto exit;
2517
0
        }
2518
144
    }
2519
24
exit:;
2520
24
    Py_END_CRITICAL_SECTION();
2521
2522
24
    return result;
2523
24
}
2524
2525
/*[clinic input]
2526
_sre.SRE_Match.start -> Py_ssize_t
2527
2528
    group: object(c_default="NULL") = 0
2529
    /
2530
2531
Return index of the start of the substring matched by group.
2532
[clinic start generated code]*/
2533
2534
static Py_ssize_t
2535
_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2536
/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2537
6.28M
{
2538
6.28M
    Py_ssize_t index = match_getindex(self, group);
2539
2540
6.28M
    if (index < 0) {
2541
0
        return -1;
2542
0
    }
2543
2544
    /* mark is -1 if group is undefined */
2545
6.28M
    return self->mark[index*2];
2546
6.28M
}
2547
2548
/*[clinic input]
2549
_sre.SRE_Match.end -> Py_ssize_t
2550
2551
    group: object(c_default="NULL") = 0
2552
    /
2553
2554
Return index of the end of the substring matched by group.
2555
[clinic start generated code]*/
2556
2557
static Py_ssize_t
2558
_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2559
/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2560
13.2M
{
2561
13.2M
    Py_ssize_t index = match_getindex(self, group);
2562
2563
13.2M
    if (index < 0) {
2564
0
        return -1;
2565
0
    }
2566
2567
    /* mark is -1 if group is undefined */
2568
13.2M
    return self->mark[index*2+1];
2569
13.2M
}
2570
2571
/* Build the 2-tuple (i1, i2) of Python ints.  Returns a new reference, or
   NULL if any allocation fails. */
LOCAL(PyObject*)
_pair(Py_ssize_t i1, Py_ssize_t i2)
{
    const Py_ssize_t values[2] = {i1, i2};

    PyObject *tuple = PyTuple_New(2);
    if (tuple == NULL) {
        return NULL;
    }

    for (Py_ssize_t pos = 0; pos < 2; pos++) {
        PyObject *number = PyLong_FromSsize_t(values[pos]);
        if (number == NULL) {
            Py_DECREF(tuple);
            return NULL;
        }
        /* The tuple takes over the reference. */
        PyTuple_SET_ITEM(tuple, pos, number);
    }

    return tuple;
}
2597
2598
/*[clinic input]
2599
_sre.SRE_Match.span
2600
2601
    group: object(c_default="NULL") = 0
2602
    /
2603
2604
For match object m, return the 2-tuple (m.start(group), m.end(group)).
2605
[clinic start generated code]*/
2606
2607
static PyObject *
2608
_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2609
/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2610
3.21M
{
2611
3.21M
    Py_ssize_t index = match_getindex(self, group);
2612
2613
3.21M
    if (index < 0) {
2614
0
        return NULL;
2615
0
    }
2616
2617
    /* marks are -1 if group is undefined */
2618
3.21M
    return _pair(self->mark[index*2], self->mark[index*2+1]);
2619
3.21M
}
2620
2621
static PyObject*
2622
match_regs(MatchObject* self)
2623
0
{
2624
0
    PyObject* regs;
2625
0
    PyObject* item;
2626
0
    Py_ssize_t index;
2627
2628
0
    regs = PyTuple_New(self->groups);
2629
0
    if (!regs)
2630
0
        return NULL;
2631
2632
0
    for (index = 0; index < self->groups; index++) {
2633
0
        item = _pair(self->mark[index*2], self->mark[index*2+1]);
2634
0
        if (!item) {
2635
0
            Py_DECREF(regs);
2636
0
            return NULL;
2637
0
        }
2638
0
        PyTuple_SET_ITEM(regs, index, item);
2639
0
    }
2640
2641
0
    self->regs = Py_NewRef(regs);
2642
2643
0
    return regs;
2644
0
}
2645
2646
/*[clinic input]
2647
_sre.SRE_Match.__copy__
2648
2649
[clinic start generated code]*/
2650
2651
static PyObject *
2652
_sre_SRE_Match___copy___impl(MatchObject *self)
2653
/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2654
0
{
2655
0
    return Py_NewRef(self);
2656
0
}
2657
2658
/*[clinic input]
2659
_sre.SRE_Match.__deepcopy__
2660
2661
    memo: object
2662
    /
2663
2664
[clinic start generated code]*/
2665
2666
static PyObject *
2667
_sre_SRE_Match___deepcopy___impl(MatchObject *self, PyObject *memo)
2668
/*[clinic end generated code: output=2b657578eb03f4a3 input=779d12a31c2c325e]*/
2669
0
{
2670
0
    return Py_NewRef(self);
2671
0
}
2672
2673
PyDoc_STRVAR(match_doc,
2674
"The result of re.match() and re.search().\n\
2675
Match objects always have a boolean value of True.");
2676
2677
PyDoc_STRVAR(match_group_doc,
2678
"group([group1, ...]) -> str or tuple.\n\
2679
    Return subgroup(s) of the match by indices or names.\n\
2680
    For 0 returns the entire match.");
2681
2682
/* Getter for Match.lastindex: the stored index as an int, or None when it
   is negative. */
static PyObject *
match_lastindex_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->lastindex < 0) {
        Py_RETURN_NONE;
    }
    return PyLong_FromSsize_t(match->lastindex);
}
2690
2691
/* Getter for Match.lastgroup: the entry of the pattern's indexgroup tuple
   at position lastindex, or None when there is no such tuple or lastindex
   is outside of it. */
static PyObject *
match_lastgroup_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (!match->pattern->indexgroup) {
        Py_RETURN_NONE;
    }
    if (match->lastindex < 0) {
        Py_RETURN_NONE;
    }
    if (match->lastindex >= PyTuple_GET_SIZE(match->pattern->indexgroup)) {
        Py_RETURN_NONE;
    }

    PyObject *name = PyTuple_GET_ITEM(match->pattern->indexgroup,
                                      match->lastindex);
    return Py_NewRef(name);
}
2705
2706
/* Getter for Match.regs: return the cached tuple if there is one,
   otherwise build (and cache) it with match_regs(). */
static PyObject *
match_regs_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->regs == NULL) {
        return match_regs(match);
    }
    return Py_NewRef(match->regs);
}
2715
2716
/* repr() of a match object: type name, the span taken from mark[0] and
   mark[1], and the repr of group 0 limited by the %.50R format. */
static PyObject *
match_repr(PyObject *op)
{
    MatchObject *match = _MatchObject_CAST(op);

    PyObject *whole = match_getslice_by_index(match, 0, Py_None);
    if (whole == NULL) {
        return NULL;
    }

    PyObject *text = PyUnicode_FromFormat(
            "<%s object; span=(%zd, %zd), match=%.50R>",
            Py_TYPE(match)->tp_name,
            match->mark[0], match->mark[1], whole);
    Py_DECREF(whole);
    return text;
}
2731
2732
2733
static PyObject*
2734
pattern_new_match(_sremodulestate* module_state,
2735
                  PatternObject* pattern,
2736
                  SRE_STATE* state,
2737
                  Py_ssize_t status)
2738
79.3M
{
2739
    /* create match object (from state object) */
2740
2741
79.3M
    MatchObject* match;
2742
79.3M
    Py_ssize_t i, j;
2743
79.3M
    char* base;
2744
79.3M
    int n;
2745
2746
79.3M
    if (status > 0) {
2747
2748
        /* create match object (with room for extra group marks) */
2749
        /* coverity[ampersand_in_size] */
2750
60.5M
        match = PyObject_GC_NewVar(MatchObject,
2751
60.5M
                                   module_state->Match_Type,
2752
60.5M
                                   2*(pattern->groups+1));
2753
60.5M
        if (!match)
2754
0
            return NULL;
2755
2756
60.5M
        Py_INCREF(pattern);
2757
60.5M
        match->pattern = pattern;
2758
2759
60.5M
        match->string = Py_NewRef(state->string);
2760
2761
60.5M
        match->regs = NULL;
2762
60.5M
        match->groups = pattern->groups+1;
2763
2764
        /* fill in group slices */
2765
2766
60.5M
        base = (char*) state->beginning;
2767
60.5M
        n = state->charsize;
2768
2769
60.5M
        match->mark[0] = ((char*) state->start - base) / n;
2770
60.5M
        match->mark[1] = ((char*) state->ptr - base) / n;
2771
2772
124M
        for (i = j = 0; i < pattern->groups; i++, j+=2)
2773
63.8M
            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2774
49.3M
                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2775
49.3M
                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2776
2777
                /* check wrong span */
2778
49.3M
                if (match->mark[j+2] > match->mark[j+3]) {
2779
0
                    PyErr_SetString(PyExc_SystemError,
2780
0
                                    "The span of capturing group is wrong,"
2781
0
                                    " please report a bug for the re module.");
2782
0
                    Py_DECREF(match);
2783
0
                    return NULL;
2784
0
                }
2785
49.3M
            } else
2786
14.4M
                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2787
2788
60.5M
        match->pos = state->pos;
2789
60.5M
        match->endpos = state->endpos;
2790
2791
60.5M
        match->lastindex = state->lastindex;
2792
2793
60.5M
        PyObject_GC_Track(match);
2794
60.5M
        return (PyObject*) match;
2795
2796
60.5M
    } else if (status == 0) {
2797
2798
        /* no match */
2799
18.7M
        Py_RETURN_NONE;
2800
2801
18.7M
    }
2802
2803
    /* internal error */
2804
0
    pattern_error(status);
2805
0
    return NULL;
2806
79.3M
}
2807
2808
2809
/* -------------------------------------------------------------------- */
2810
/* scanner methods (experimental) */
2811
2812
static int
2813
scanner_traverse(PyObject *op, visitproc visit, void *arg)
2814
168
{
2815
168
    ScannerObject *self = _ScannerObject_CAST(op);
2816
168
    Py_VISIT(Py_TYPE(self));
2817
168
    Py_VISIT(self->pattern);
2818
168
    return 0;
2819
168
}
2820
2821
static int
2822
scanner_clear(PyObject *op)
2823
388k
{
2824
388k
    ScannerObject *self = _ScannerObject_CAST(op);
2825
388k
    Py_CLEAR(self->pattern);
2826
388k
    return 0;
2827
388k
}
2828
2829
static void
2830
scanner_dealloc(PyObject *self)
2831
388k
{
2832
388k
    PyTypeObject *tp = Py_TYPE(self);
2833
388k
    PyObject_GC_UnTrack(self);
2834
388k
    ScannerObject *scanner = _ScannerObject_CAST(self);
2835
388k
    state_fini(&scanner->state);
2836
388k
    (void)scanner_clear(self);
2837
388k
    tp->tp_free(self);
2838
388k
    Py_DECREF(tp);
2839
388k
}
2840
2841
static int
2842
scanner_begin(ScannerObject* self)
2843
3.45M
{
2844
#ifdef Py_GIL_DISABLED
2845
    int was_executing = _Py_atomic_exchange_int(&self->executing, 1);
2846
#else
2847
3.45M
    int was_executing = self->executing;
2848
3.45M
    self->executing = 1;
2849
3.45M
#endif
2850
3.45M
    if (was_executing) {
2851
0
        PyErr_SetString(PyExc_ValueError,
2852
0
                        "regular expression scanner already executing");
2853
0
        return 0;
2854
0
    }
2855
3.45M
    return 1;
2856
3.45M
}
2857
2858
static void
2859
scanner_end(ScannerObject* self)
2860
3.45M
{
2861
3.45M
    assert(FT_ATOMIC_LOAD_INT_RELAXED(self->executing));
2862
3.45M
    FT_ATOMIC_STORE_INT(self->executing, 0);
2863
3.45M
}
2864
2865
/*[clinic input]
2866
_sre.SRE_Scanner.match
2867
2868
    cls: defining_class
2869
    /
2870
2871
[clinic start generated code]*/
2872
2873
static PyObject *
2874
_sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
2875
/*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
2876
0
{
2877
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2878
0
    SRE_STATE* state = &self->state;
2879
0
    PyObject* match;
2880
0
    Py_ssize_t status;
2881
2882
0
    if (!scanner_begin(self)) {
2883
0
        return NULL;
2884
0
    }
2885
0
    if (state->start == NULL) {
2886
0
        scanner_end(self);
2887
0
        Py_RETURN_NONE;
2888
0
    }
2889
2890
0
    state_reset(state);
2891
2892
0
    state->ptr = state->start;
2893
2894
0
    status = sre_match(state, PatternObject_GetCode(self->pattern));
2895
0
    if (PyErr_Occurred()) {
2896
0
        scanner_end(self);
2897
0
        return NULL;
2898
0
    }
2899
2900
0
    match = pattern_new_match(module_state, self->pattern,
2901
0
                              state, status);
2902
2903
0
    if (status == 0)
2904
0
        state->start = NULL;
2905
0
    else {
2906
0
        state->must_advance = (state->ptr == state->start);
2907
0
        state->start = state->ptr;
2908
0
    }
2909
2910
0
    scanner_end(self);
2911
0
    return match;
2912
0
}
2913
2914
2915
/*[clinic input]
2916
_sre.SRE_Scanner.search
2917
2918
    cls: defining_class
2919
    /
2920
2921
[clinic start generated code]*/
2922
2923
static PyObject *
2924
_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2925
/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2926
3.45M
{
2927
3.45M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2928
3.45M
    SRE_STATE* state = &self->state;
2929
3.45M
    PyObject* match;
2930
3.45M
    Py_ssize_t status;
2931
2932
3.45M
    if (!scanner_begin(self)) {
2933
0
        return NULL;
2934
0
    }
2935
3.45M
    if (state->start == NULL) {
2936
0
        scanner_end(self);
2937
0
        Py_RETURN_NONE;
2938
0
    }
2939
2940
3.45M
    state_reset(state);
2941
2942
3.45M
    state->ptr = state->start;
2943
2944
3.45M
    status = sre_search(state, PatternObject_GetCode(self->pattern));
2945
3.45M
    if (PyErr_Occurred()) {
2946
0
        scanner_end(self);
2947
0
        return NULL;
2948
0
    }
2949
2950
3.45M
    match = pattern_new_match(module_state, self->pattern,
2951
3.45M
                              state, status);
2952
2953
3.45M
    if (status == 0)
2954
388k
        state->start = NULL;
2955
3.07M
    else {
2956
3.07M
        state->must_advance = (state->ptr == state->start);
2957
3.07M
        state->start = state->ptr;
2958
3.07M
    }
2959
2960
3.45M
    scanner_end(self);
2961
3.45M
    return match;
2962
3.45M
}
2963
2964
static PyObject *
2965
pattern_scanner(_sremodulestate *module_state,
2966
                PatternObject *self,
2967
                PyObject *string,
2968
                Py_ssize_t pos,
2969
                Py_ssize_t endpos)
2970
388k
{
2971
388k
    ScannerObject* scanner;
2972
2973
    /* create scanner object */
2974
388k
    scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2975
388k
    if (!scanner)
2976
0
        return NULL;
2977
388k
    scanner->pattern = NULL;
2978
388k
    scanner->executing = 0;
2979
2980
    /* create search state object */
2981
388k
    if (!state_init(&scanner->state, self, string, pos, endpos)) {
2982
0
        Py_DECREF(scanner);
2983
0
        return NULL;
2984
0
    }
2985
2986
388k
    Py_INCREF(self);
2987
388k
    scanner->pattern = self;
2988
2989
388k
    PyObject_GC_Track(scanner);
2990
388k
    return (PyObject*) scanner;
2991
388k
}
2992
2993
/* -------------------------------------------------------------------- */
2994
/* template methods */
2995
2996
static int
2997
template_traverse(PyObject *op, visitproc visit, void *arg)
2998
0
{
2999
0
    TemplateObject *self = _TemplateObject_CAST(op);
3000
0
    Py_VISIT(Py_TYPE(self));
3001
0
    Py_VISIT(self->literal);
3002
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3003
0
        Py_VISIT(self->items[i].literal);
3004
0
    }
3005
0
    return 0;
3006
0
}
3007
3008
static int
3009
template_clear(PyObject *op)
3010
0
{
3011
0
    TemplateObject *self = _TemplateObject_CAST(op);
3012
0
    Py_CLEAR(self->literal);
3013
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3014
0
        Py_CLEAR(self->items[i].literal);
3015
0
    }
3016
0
    return 0;
3017
0
}
3018
3019
static void
3020
template_dealloc(PyObject *self)
3021
0
{
3022
0
    PyTypeObject *tp = Py_TYPE(self);
3023
0
    PyObject_GC_UnTrack(self);
3024
0
    (void)template_clear(self);
3025
0
    tp->tp_free(self);
3026
0
    Py_DECREF(tp);
3027
0
}
3028
3029
static PyObject *
3030
expand_template(TemplateObject *self, MatchObject *match)
3031
0
{
3032
0
    if (Py_SIZE(self) == 0) {
3033
0
        return Py_NewRef(self->literal);
3034
0
    }
3035
3036
0
    PyObject *result = NULL;
3037
0
    Py_ssize_t count = 0;  // the number of non-empty chunks
3038
    /* For small number of strings use a buffer allocated on the stack,
3039
     * otherwise use a list object. */
3040
0
    PyObject *buffer[10];
3041
0
    PyObject **out = buffer;
3042
0
    PyObject *list = NULL;
3043
0
    if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
3044
0
        !PyUnicode_Check(self->literal))
3045
0
    {
3046
0
        list = PyList_New(self->chunks);
3047
0
        if (!list) {
3048
0
            return NULL;
3049
0
        }
3050
0
        out = &PyList_GET_ITEM(list, 0);
3051
0
    }
3052
3053
0
    out[count++] = Py_NewRef(self->literal);
3054
0
    for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
3055
0
        Py_ssize_t index = self->items[i].index;
3056
0
        if (index >= match->groups) {
3057
0
            PyErr_SetString(PyExc_IndexError, "no such group");
3058
0
            goto cleanup;
3059
0
        }
3060
0
        PyObject *item = match_getslice_by_index(match, index, Py_None);
3061
0
        if (item == NULL) {
3062
0
            goto cleanup;
3063
0
        }
3064
0
        if (item != Py_None) {
3065
0
            out[count++] = Py_NewRef(item);
3066
0
        }
3067
0
        Py_DECREF(item);
3068
3069
0
        PyObject *literal = self->items[i].literal;
3070
0
        if (literal != NULL) {
3071
0
            out[count++] = Py_NewRef(literal);
3072
0
        }
3073
0
    }
3074
3075
0
    if (PyUnicode_Check(self->literal)) {
3076
0
        result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
3077
0
    }
3078
0
    else {
3079
0
        Py_SET_SIZE(list, count);
3080
0
        result = PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
3081
0
    }
3082
3083
0
cleanup:
3084
0
    if (list) {
3085
0
        Py_DECREF(list);
3086
0
    }
3087
0
    else {
3088
0
        for (Py_ssize_t i = 0; i < count; i++) {
3089
0
            Py_DECREF(out[i]);
3090
0
        }
3091
0
    }
3092
0
    return result;
3093
0
}
3094
3095
3096
static Py_hash_t
3097
pattern_hash(PyObject *op)
3098
0
{
3099
0
    PatternObject *self = _PatternObject_CAST(op);
3100
3101
0
    Py_hash_t hash, hash2;
3102
3103
0
    hash = PyObject_Hash(self->pattern);
3104
0
    if (hash == -1) {
3105
0
        return -1;
3106
0
    }
3107
3108
0
    hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize);
3109
0
    hash ^= hash2;
3110
3111
0
    hash ^= self->flags;
3112
0
    hash ^= self->isbytes;
3113
0
    hash ^= self->codesize;
3114
3115
0
    if (hash == -1) {
3116
0
        hash = -2;
3117
0
    }
3118
0
    return hash;
3119
0
}
3120
3121
/* Rich comparison for pattern objects.  Only == and != against another
   object of exactly the pattern type are handled; everything else yields
   NotImplemented.  Returns NULL if comparing the pattern sources fails. */
static PyObject*
pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
{
    PyTypeObject *tp = Py_TYPE(lefto);
    _sremodulestate *module_state = get_sre_module_state_by_class(tp);

    if ((op != Py_EQ && op != Py_NE)
        || !Py_IS_TYPE(righto, module_state->Pattern_Type))
    {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (lefto == righto) {
        /* a pattern is equal to itself */
        return PyBool_FromLong(op == Py_EQ);
    }

    PatternObject *left = (PatternObject *)lefto;
    PatternObject *right = (PatternObject *)righto;

    /* Compare the code and the pattern because the same pattern can
       produce different codes depending on the locale used to compile the
       pattern when the re.LOCALE flag is used. Don't compare groups,
       indexgroup nor groupindex: they are derived from the pattern. */
    int equal = (left->flags == right->flags
                 && left->isbytes == right->isbytes
                 && left->codesize == right->codesize
                 && memcmp(left->code, right->code,
                           sizeof(left->code[0]) * left->codesize) == 0);
    if (equal) {
        equal = PyObject_RichCompareBool(left->pattern, right->pattern,
                                         Py_EQ);
        if (equal < 0) {
            return NULL;
        }
    }

    return PyBool_FromLong(op == Py_NE ? !equal : equal);
}
3169
3170
#include "clinic/sre.c.h"
3171
3172
static PyMethodDef pattern_methods[] = {
3173
    _SRE_SRE_PATTERN_MATCH_METHODDEF
3174
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
3175
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
3176
    _SRE_SRE_PATTERN_SUB_METHODDEF
3177
    _SRE_SRE_PATTERN_SUBN_METHODDEF
3178
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
3179
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
3180
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
3181
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
3182
    _SRE_SRE_PATTERN___COPY___METHODDEF
3183
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3184
    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
3185
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3186
     PyDoc_STR("See PEP 585")},
3187
    {NULL, NULL}
3188
};
3189
3190
static PyGetSetDef pattern_getset[] = {
3191
    {"groupindex", pattern_groupindex, NULL,
3192
      "A dictionary mapping group names to group numbers."},
3193
    {NULL}  /* Sentinel */
3194
};
3195
3196
#define PAT_OFF(x) offsetof(PatternObject, x)
3197
static PyMemberDef pattern_members[] = {
3198
    {"pattern",    _Py_T_OBJECT,    PAT_OFF(pattern),       Py_READONLY,
3199
     "The pattern string from which the RE object was compiled."},
3200
    {"flags",      Py_T_INT,       PAT_OFF(flags),         Py_READONLY,
3201
     "The regex matching flags."},
3202
    {"groups",     Py_T_PYSSIZET,  PAT_OFF(groups),        Py_READONLY,
3203
     "The number of capturing groups in the pattern."},
3204
    {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PatternObject, weakreflist), Py_READONLY},
3205
    {NULL}  /* Sentinel */
3206
};
3207
3208
static PyType_Slot pattern_slots[] = {
3209
    {Py_tp_dealloc, pattern_dealloc},
3210
    {Py_tp_repr, pattern_repr},
3211
    {Py_tp_hash, pattern_hash},
3212
    {Py_tp_doc, (void *)pattern_doc},
3213
    {Py_tp_richcompare, pattern_richcompare},
3214
    {Py_tp_methods, pattern_methods},
3215
    {Py_tp_members, pattern_members},
3216
    {Py_tp_getset, pattern_getset},
3217
    {Py_tp_traverse, pattern_traverse},
3218
    {Py_tp_clear, pattern_clear},
3219
    {0, NULL},
3220
};
3221
3222
static PyType_Spec pattern_spec = {
3223
    .name = "re.Pattern",
3224
    .basicsize = sizeof(PatternObject),
3225
    .itemsize = sizeof(SRE_CODE),
3226
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3227
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3228
    .slots = pattern_slots,
3229
};
3230
3231
static PyMethodDef match_methods[] = {
3232
    {"group", match_group, METH_VARARGS, match_group_doc},
3233
    _SRE_SRE_MATCH_START_METHODDEF
3234
    _SRE_SRE_MATCH_END_METHODDEF
3235
    _SRE_SRE_MATCH_SPAN_METHODDEF
3236
    _SRE_SRE_MATCH_GROUPS_METHODDEF
3237
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3238
    _SRE_SRE_MATCH_EXPAND_METHODDEF
3239
    _SRE_SRE_MATCH___COPY___METHODDEF
3240
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3241
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3242
     PyDoc_STR("See PEP 585")},
3243
    {NULL, NULL}
3244
};
3245
3246
static PyGetSetDef match_getset[] = {
3247
    {"lastindex", match_lastindex_get, NULL,
3248
     "The integer index of the last matched capturing group."},
3249
    {"lastgroup", match_lastgroup_get, NULL,
3250
     "The name of the last matched capturing group."},
3251
    {"regs", match_regs_get, NULL, NULL},
3252
    {NULL}
3253
};
3254
3255
#define MATCH_OFF(x) offsetof(MatchObject, x)
3256
static PyMemberDef match_members[] = {
3257
    {"string",  _Py_T_OBJECT,   MATCH_OFF(string),  Py_READONLY,
3258
     "The string passed to match() or search()."},
3259
    {"re",      _Py_T_OBJECT,   MATCH_OFF(pattern), Py_READONLY,
3260
     "The regular expression object."},
3261
    {"pos",     Py_T_PYSSIZET, MATCH_OFF(pos),     Py_READONLY,
3262
     "The index into the string at which the RE engine started looking for a match."},
3263
    {"endpos",  Py_T_PYSSIZET, MATCH_OFF(endpos),  Py_READONLY,
3264
     "The index into the string beyond which the RE engine will not go."},
3265
    {NULL}
3266
};
3267
3268
/* FIXME: implement setattr("string", None) as a special case (to
3269
   detach the associated string, if any */
3270
static PyType_Slot match_slots[] = {
3271
    {Py_tp_dealloc, match_dealloc},
3272
    {Py_tp_repr, match_repr},
3273
    {Py_tp_doc, (void *)match_doc},
3274
    {Py_tp_methods, match_methods},
3275
    {Py_tp_members, match_members},
3276
    {Py_tp_getset, match_getset},
3277
    {Py_tp_traverse, match_traverse},
3278
    {Py_tp_clear, match_clear},
3279
3280
    /* As mapping.
3281
     *
3282
     * Match objects do not support length or assignment, but do support
3283
     * __getitem__.
3284
     */
3285
    {Py_mp_subscript, match_getitem},
3286
3287
    {0, NULL},
3288
};
3289
3290
static PyType_Spec match_spec = {
3291
    .name = "re.Match",
3292
    .basicsize = sizeof(MatchObject),
3293
    .itemsize = sizeof(Py_ssize_t),
3294
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3295
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3296
    .slots = match_slots,
3297
};
3298
3299
static PyMethodDef scanner_methods[] = {
3300
    _SRE_SRE_SCANNER_MATCH_METHODDEF
3301
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
3302
    {NULL, NULL}
3303
};
3304
3305
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3306
static PyMemberDef scanner_members[] = {
3307
    {"pattern", _Py_T_OBJECT, SCAN_OFF(pattern), Py_READONLY},
3308
    {NULL}  /* Sentinel */
3309
};
3310
3311
static PyType_Slot scanner_slots[] = {
3312
    {Py_tp_dealloc, scanner_dealloc},
3313
    {Py_tp_methods, scanner_methods},
3314
    {Py_tp_members, scanner_members},
3315
    {Py_tp_traverse, scanner_traverse},
3316
    {Py_tp_clear, scanner_clear},
3317
    {0, NULL},
3318
};
3319
3320
static PyType_Spec scanner_spec = {
3321
    .name = "_sre.SRE_Scanner",
3322
    .basicsize = sizeof(ScannerObject),
3323
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3324
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3325
    .slots = scanner_slots,
3326
};
3327
3328
static PyType_Slot template_slots[] = {
3329
    {Py_tp_dealloc, template_dealloc},
3330
    {Py_tp_traverse, template_traverse},
3331
    {Py_tp_clear, template_clear},
3332
    {0, NULL},
3333
};
3334
3335
static PyType_Spec template_spec = {
3336
    .name = "_sre.SRE_Template",
3337
    .basicsize = sizeof(TemplateObject),
3338
    .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3339
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3340
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3341
    .slots = template_slots,
3342
};
3343
3344
static PyMethodDef _functions[] = {
3345
    _SRE_COMPILE_METHODDEF
3346
    _SRE_TEMPLATE_METHODDEF
3347
    _SRE_GETCODESIZE_METHODDEF
3348
    _SRE_ASCII_ISCASED_METHODDEF
3349
    _SRE_UNICODE_ISCASED_METHODDEF
3350
    _SRE_ASCII_TOLOWER_METHODDEF
3351
    _SRE_UNICODE_TOLOWER_METHODDEF
3352
    {NULL, NULL}
3353
};
3354
3355
static int
3356
sre_traverse(PyObject *module, visitproc visit, void *arg)
3357
1.42k
{
3358
1.42k
    _sremodulestate *state = get_sre_module_state(module);
3359
3360
1.42k
    Py_VISIT(state->Pattern_Type);
3361
1.42k
    Py_VISIT(state->Match_Type);
3362
1.42k
    Py_VISIT(state->Scanner_Type);
3363
1.42k
    Py_VISIT(state->Template_Type);
3364
1.42k
    Py_VISIT(state->compile_template);
3365
3366
1.42k
    return 0;
3367
1.42k
}
3368
3369
static int
3370
sre_clear(PyObject *module)
3371
0
{
3372
0
    _sremodulestate *state = get_sre_module_state(module);
3373
3374
0
    Py_CLEAR(state->Pattern_Type);
3375
0
    Py_CLEAR(state->Match_Type);
3376
0
    Py_CLEAR(state->Scanner_Type);
3377
0
    Py_CLEAR(state->Template_Type);
3378
0
    Py_CLEAR(state->compile_template);
3379
3380
0
    return 0;
3381
0
}
3382
3383
static void
3384
sre_free(void *module)
3385
0
{
3386
0
    sre_clear((PyObject *)module);
3387
0
}
3388
3389
96
/* Create a heap type from `spec` for module `m` and store it in `type`.
   On failure, jumps to the `error` label of the enclosing function. */
#define CREATE_TYPE(m, type, spec) \
do { \
    type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
    if (type == NULL) { \
        goto error; \
    } \
} while (0)
3396
3397
/* Add `value`, converted with PyLong_FromUnsignedLong(), to `module` under
   `name`.  On failure, jumps to the `error` label of the enclosing
   function. */
#define ADD_ULONG_CONSTANT(module, name, value) \
    do { \
        if (PyModule_Add(module, name, PyLong_FromUnsignedLong(value)) < 0) { \
            goto error; \
        } \
} while (0)
3403
3404
static int
3405
sre_exec(PyObject *m)
3406
24
{
3407
24
    _sremodulestate *state;
3408
3409
    /* Create heap types */
3410
24
    state = get_sre_module_state(m);
3411
24
    CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3412
24
    CREATE_TYPE(m, state->Match_Type, &match_spec);
3413
24
    CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3414
24
    CREATE_TYPE(m, state->Template_Type, &template_spec);
3415
3416
24
    if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3417
0
        goto error;
3418
0
    }
3419
3420
24
    if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3421
0
        goto error;
3422
0
    }
3423
3424
24
    ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3425
24
    ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3426
3427
24
    if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3428
0
        goto error;
3429
0
    }
3430
3431
24
    return 0;
3432
3433
0
error:
3434
0
    return -1;
3435
24
}
3436
3437
static PyModuleDef_Slot sre_slots[] = {
3438
    {Py_mod_exec, sre_exec},
3439
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3440
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
3441
    {0, NULL},
3442
};
3443
3444
static struct PyModuleDef sremodule = {
3445
    .m_base = PyModuleDef_HEAD_INIT,
3446
    .m_name = "_sre",
3447
    .m_size = sizeof(_sremodulestate),
3448
    .m_methods = _functions,
3449
    .m_slots = sre_slots,
3450
    .m_traverse = sre_traverse,
3451
    .m_free = sre_free,
3452
    .m_clear = sre_clear,
3453
};
3454
3455
PyMODINIT_FUNC
3456
PyInit__sre(void)
3457
24
{
3458
24
    return PyModuleDef_Init(&sremodule);
3459
24
}
3460
3461
/* vim:ts=4:sw=4:et
3462
*/