Coverage Report

Created: 2026-05-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_sre/sre.c
Line
Count
Source
1
/*
2
 * Secret Labs' Regular Expression Engine
3
 *
4
 * regular expression matching engine
5
 *
6
 * partial history:
7
 * 1999-10-24 fl   created (based on existing template matcher code)
8
 * 2000-03-06 fl   first alpha, sort of
9
 * 2000-08-01 fl   fixes for 1.6b1
10
 * 2000-08-07 fl   use PyOS_CheckStack() if available
11
 * 2000-09-20 fl   added expand method
12
 * 2001-03-20 fl   lots of fixes for 2.1b2
13
 * 2001-04-15 fl   export copyright as Python attribute, not global
14
 * 2001-04-28 fl   added __copy__ methods (work in progress)
15
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
16
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18
 * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19
 * 2001-10-21 fl   added sub/subn primitive
20
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22
 * 2002-11-09 fl   fixed empty sub/subn return type
23
 * 2003-04-18 mvl  fully support 4-byte codes
24
 * 2003-10-17 gn   implemented non recursive scheme
25
 * 2013-02-04 mrab added fullmatch primitive
26
 *
27
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28
 *
29
 * This version of the SRE library can be redistributed under CNRI's
30
 * Python 1.6 license.  For any other use, please contact Secret Labs
31
 * AB (info@pythonware.com).
32
 *
33
 * Portions of this engine have been developed in cooperation with
34
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35
 * other compatibility work.
36
 */
37
38
static const char copyright[] =
39
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41
#include "Python.h"
42
#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
43
#include "pycore_dict.h"             // _PyDict_Next()
44
#include "pycore_long.h"             // _PyLong_GetZero()
45
#include "pycore_moduleobject.h"     // _PyModule_GetState()
46
#include "pycore_tuple.h"            // _PyTuple_FromPairSteal
47
#include "pycore_unicodeobject.h"    // _PyUnicode_Copy
48
#include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
49
50
#include "sre.h"                     // SRE_CODE
51
52
#include <ctype.h>                   // tolower(), toupper(), isalnum()
53
54
1.33G
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
55
56
// On macOS, use the wide character ctype API using btowc()
57
#if defined(__APPLE__)
58
#  define USE_CTYPE_WINT_T
59
#endif
60
61
0
/* Report whether a single code unit is alphanumeric according to the
 * C library's ctype tables.
 *
 * When USE_CTYPE_WINT_T is defined, the value is widened with btowc() and
 * classified with iswalnum(); otherwise isalnum() is used directly.
 * Returns non-zero for an alphanumeric character and zero otherwise.
 */
static int
sre_isalnum(unsigned int ch)
{
    int byte = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)iswalnum(btowc(byte));
#else
    return (unsigned int)isalnum(byte);
#endif
}
68
69
0
/* Lower-case a single code unit using the C library's ctype tables.
 *
 * When USE_CTYPE_WINT_T is defined, the value is widened with btowc() and
 * mapped with towlower(); otherwise tolower() is used directly.
 */
static unsigned int
sre_tolower(unsigned int ch)
{
    int byte = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towlower(btowc(byte));
#else
    return (unsigned int)tolower(byte);
#endif
}
76
77
0
/* Upper-case a single code unit using the C library's ctype tables.
 *
 * When USE_CTYPE_WINT_T is defined, the value is widened with btowc() and
 * mapped with towupper(); otherwise toupper() is used directly.
 */
static unsigned int
sre_toupper(unsigned int ch)
{
    int byte = (int)ch;
#ifdef USE_CTYPE_WINT_T
    return (unsigned int)towupper(btowc(byte));
#else
    return (unsigned int)toupper(byte);
#endif
}
84
85
/* Defining this one controls tracing:
86
 * 0 -- disabled
87
 * 1 -- only if the DEBUG flag set
88
 * 2 -- always
89
 */
90
#ifndef VERBOSE
91
#  define VERBOSE 0
92
#endif
93
94
/* -------------------------------------------------------------------- */
95
96
#if defined(_MSC_VER) && !defined(__clang__)
97
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
98
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
99
/* fastest possible local call under MSVC */
100
#define LOCAL(type) static __inline type __fastcall
101
#else
102
#define LOCAL(type) static inline type
103
#endif
104
105
/* error codes */
106
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
107
#define SRE_ERROR_STATE -2 /* illegal state */
108
0
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
109
0
#define SRE_ERROR_MEMORY -9 /* out of memory */
110
0
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
111
112
#if VERBOSE == 0
113
#  define INIT_TRACE(state)
114
#  define DO_TRACE 0
115
#  define TRACE(v)
116
#elif VERBOSE == 1
117
#  define INIT_TRACE(state) int _debug = (state)->debug
118
#  define DO_TRACE (_debug)
119
#  define TRACE(v) do {     \
120
        if (_debug) { \
121
            printf v;       \
122
        }                   \
123
    } while (0)
124
#elif VERBOSE == 2
125
#  define INIT_TRACE(state)
126
#  define DO_TRACE 1
127
#  define TRACE(v) printf v
128
#else
129
#  error VERBOSE must be 0, 1 or 2
130
#endif
131
132
/* -------------------------------------------------------------------- */
133
/* search engine state */
134
135
#define SRE_IS_DIGIT(ch)\
136
1.48k
    ((ch) <= '9' && Py_ISDIGIT(ch))
137
#define SRE_IS_SPACE(ch)\
138
32
    ((ch) <= ' ' && Py_ISSPACE(ch))
139
#define SRE_IS_LINEBREAK(ch)\
140
36.0M
    ((ch) == '\n')
141
#define SRE_IS_WORD(ch)\
142
10.9M
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
143
144
/* Lower-case ch with Py_TOLOWER when it is below 128; any other value is
 * returned unchanged. */
static unsigned int sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
148
149
/* locale-specific character predicates */
150
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
151
 * warnings when c's type supports only numbers < N+1 */
152
0
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? sre_isalnum((ch)) : 0)
153
0
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
154
155
/* Lower-case ch through sre_tolower() when it is below 256; larger values
 * are returned unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return sre_tolower(ch);
}
159
160
/* Upper-case ch through sre_toupper() when it is below 256; larger values
 * are returned unchanged. */
static unsigned int sre_upper_locale(unsigned int ch)
{
    if (ch >= 256) {
        return ch;
    }
    return sre_toupper(ch);
}
164
165
/* unicode-specific character predicates */
166
167
16
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
168
75.7M
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
169
0
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
170
1.49k
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
171
748
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
172
173
/* Return the result of Py_UNICODE_TOLOWER() for ch as an unsigned int. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
177
178
/* Return the result of Py_UNICODE_TOUPPER() for ch as an unsigned int. */
static unsigned int sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
182
183
LOCAL(int)
184
sre_category(SRE_CODE category, unsigned int ch)
185
86.7M
{
186
86.7M
    switch (category) {
187
188
1.48k
    case SRE_CATEGORY_DIGIT:
189
1.48k
        return SRE_IS_DIGIT(ch);
190
0
    case SRE_CATEGORY_NOT_DIGIT:
191
0
        return !SRE_IS_DIGIT(ch);
192
32
    case SRE_CATEGORY_SPACE:
193
32
        return SRE_IS_SPACE(ch);
194
0
    case SRE_CATEGORY_NOT_SPACE:
195
0
        return !SRE_IS_SPACE(ch);
196
10.9M
    case SRE_CATEGORY_WORD:
197
10.9M
        return SRE_IS_WORD(ch);
198
0
    case SRE_CATEGORY_NOT_WORD:
199
0
        return !SRE_IS_WORD(ch);
200
0
    case SRE_CATEGORY_LINEBREAK:
201
0
        return SRE_IS_LINEBREAK(ch);
202
0
    case SRE_CATEGORY_NOT_LINEBREAK:
203
0
        return !SRE_IS_LINEBREAK(ch);
204
205
0
    case SRE_CATEGORY_LOC_WORD:
206
0
        return SRE_LOC_IS_WORD(ch);
207
0
    case SRE_CATEGORY_LOC_NOT_WORD:
208
0
        return !SRE_LOC_IS_WORD(ch);
209
210
16
    case SRE_CATEGORY_UNI_DIGIT:
211
16
        return SRE_UNI_IS_DIGIT(ch);
212
0
    case SRE_CATEGORY_UNI_NOT_DIGIT:
213
0
        return !SRE_UNI_IS_DIGIT(ch);
214
5.63M
    case SRE_CATEGORY_UNI_SPACE:
215
5.63M
        return SRE_UNI_IS_SPACE(ch);
216
70.1M
    case SRE_CATEGORY_UNI_NOT_SPACE:
217
70.1M
        return !SRE_UNI_IS_SPACE(ch);
218
748
    case SRE_CATEGORY_UNI_WORD:
219
748
        return SRE_UNI_IS_WORD(ch);
220
0
    case SRE_CATEGORY_UNI_NOT_WORD:
221
0
        return !SRE_UNI_IS_WORD(ch);
222
0
    case SRE_CATEGORY_UNI_LINEBREAK:
223
0
        return SRE_UNI_IS_LINEBREAK(ch);
224
0
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
225
0
        return !SRE_UNI_IS_LINEBREAK(ch);
226
86.7M
    }
227
0
    return 0;
228
86.7M
}
229
230
LOCAL(int)
231
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
232
0
{
233
0
    return ch == pattern
234
0
        || (SRE_CODE) sre_lower_locale(ch) == pattern
235
0
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
236
0
}
237
238
239
/* helpers */
240
241
static void
242
data_stack_dealloc(SRE_STATE* state)
243
183M
{
244
183M
    if (state->data_stack) {
245
159M
        PyMem_Free(state->data_stack);
246
159M
        state->data_stack = NULL;
247
159M
    }
248
183M
    state->data_stack_size = state->data_stack_base = 0;
249
183M
}
250
251
static int
252
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
253
162M
{
254
162M
    INIT_TRACE(state);
255
162M
    Py_ssize_t minsize, cursize;
256
162M
    minsize = state->data_stack_base+size;
257
162M
    cursize = state->data_stack_size;
258
162M
    if (cursize < minsize) {
259
162M
        void* stack;
260
162M
        cursize = minsize+minsize/4+1024;
261
162M
        TRACE(("allocate/grow stack %zd\n", cursize));
262
162M
        stack = PyMem_Realloc(state->data_stack, cursize);
263
162M
        if (!stack) {
264
0
            data_stack_dealloc(state);
265
0
            return SRE_ERROR_MEMORY;
266
0
        }
267
162M
        state->data_stack = (char *)stack;
268
162M
        state->data_stack_size = cursize;
269
162M
    }
270
162M
    return 0;
271
162M
}
272
273
/* memory pool functions for SRE_REPEAT, this can avoid memory
274
   leak when SRE(match) function terminates abruptly.
275
   state->repeat_pool_used is a doubly-linked list, so that we
276
   can remove a SRE_REPEAT node from it.
277
   state->repeat_pool_unused is a singly-linked list, we put/get
278
   node at the head. */
279
static SRE_REPEAT *
280
repeat_pool_malloc(SRE_STATE *state)
281
48.5M
{
282
48.5M
    SRE_REPEAT *repeat;
283
284
48.5M
    if (state->repeat_pool_unused) {
285
        /* remove from unused pool (singly-linked list) */
286
16.3k
        repeat = state->repeat_pool_unused;
287
16.3k
        state->repeat_pool_unused = repeat->pool_next;
288
16.3k
    }
289
48.5M
    else {
290
48.5M
        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
291
48.5M
        if (!repeat) {
292
0
            return NULL;
293
0
        }
294
48.5M
    }
295
296
    /* add to used pool (doubly-linked list) */
297
48.5M
    SRE_REPEAT *temp = state->repeat_pool_used;
298
48.5M
    if (temp) {
299
29.6M
        temp->pool_prev = repeat;
300
29.6M
    }
301
48.5M
    repeat->pool_prev = NULL;
302
48.5M
    repeat->pool_next = temp;
303
48.5M
    state->repeat_pool_used = repeat;
304
305
48.5M
    return repeat;
306
48.5M
}
307
308
static void
309
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
310
48.5M
{
311
48.5M
    SRE_REPEAT *prev = repeat->pool_prev;
312
48.5M
    SRE_REPEAT *next = repeat->pool_next;
313
314
    /* remove from used pool (doubly-linked list) */
315
48.5M
    if (prev) {
316
0
        prev->pool_next = next;
317
0
    }
318
48.5M
    else {
319
48.5M
        state->repeat_pool_used = next;
320
48.5M
    }
321
48.5M
    if (next) {
322
29.6M
        next->pool_prev = prev;
323
29.6M
    }
324
325
    /* add to unused pool (singly-linked list) */
326
48.5M
    repeat->pool_next = state->repeat_pool_unused;
327
48.5M
    state->repeat_pool_unused = repeat;
328
48.5M
}
329
330
static void
331
repeat_pool_clear(SRE_STATE *state)
332
78.2M
{
333
    /* clear used pool */
334
78.2M
    SRE_REPEAT *next = state->repeat_pool_used;
335
78.2M
    state->repeat_pool_used = NULL;
336
78.2M
    while (next) {
337
0
        SRE_REPEAT *temp = next;
338
0
        next = temp->pool_next;
339
0
        PyMem_Free(temp);
340
0
    }
341
342
    /* clear unused pool */
343
78.2M
    next = state->repeat_pool_unused;
344
78.2M
    state->repeat_pool_unused = NULL;
345
126M
    while (next) {
346
48.5M
        SRE_REPEAT *temp = next;
347
48.5M
        next = temp->pool_next;
348
48.5M
        PyMem_Free(temp);
349
48.5M
    }
350
78.2M
}
351
352
/* generate 8-bit version */
353
354
216M
#define SRE_CHAR Py_UCS1
355
#define SIZEOF_SRE_CHAR 1
356
923M
#define SRE(F) sre_ucs1_##F
357
#include "sre_lib.h"
358
359
/* generate 16-bit unicode version */
360
361
279M
#define SRE_CHAR Py_UCS2
362
#define SIZEOF_SRE_CHAR 2
363
1.40G
#define SRE(F) sre_ucs2_##F
364
#include "sre_lib.h"
365
366
/* generate 32-bit unicode version */
367
368
104M
#define SRE_CHAR Py_UCS4
369
#define SIZEOF_SRE_CHAR 4
370
560M
#define SRE(F) sre_ucs4_##F
371
#include "sre_lib.h"
372
373
/* -------------------------------------------------------------------- */
374
/* factories and destructors */
375
376
/* module state */
377
typedef struct {
378
    PyTypeObject *Pattern_Type;
379
    PyTypeObject *Match_Type;
380
    PyTypeObject *Scanner_Type;
381
    PyTypeObject *Template_Type;
382
    PyObject *compile_template;  // reference to re._compile_template
383
} _sremodulestate;
384
385
static _sremodulestate *
386
get_sre_module_state(PyObject *m)
387
76.9M
{
388
76.9M
    _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
389
76.9M
    assert(state);
390
76.9M
    return state;
391
76.9M
}
392
393
static struct PyModuleDef sremodule;
394
#define get_sre_module_state_by_class(cls) \
395
76.9M
    (get_sre_module_state(PyType_GetModule(cls)))
396
397
/* see sre.h for object declarations */
398
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
399
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
400
401
16.0k
#define _PatternObject_CAST(op)     ((PatternObject *)(op))
402
83.7M
#define _MatchObject_CAST(op)       ((MatchObject *)(op))
403
0
#define _TemplateObject_CAST(op)    ((TemplateObject *)(op))
404
705k
#define _ScannerObject_CAST(op)     ((ScannerObject *)(op))
405
406
/*[clinic input]
407
module _sre
408
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
409
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
410
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
411
[clinic start generated code]*/
412
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
413
414
/*[clinic input]
415
_sre.getcodesize -> int
416
[clinic start generated code]*/
417
418
static int
419
_sre_getcodesize_impl(PyObject *module)
420
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
421
0
{
422
0
    return sizeof(SRE_CODE);
423
0
}
424
425
/*[clinic input]
426
_sre.ascii_iscased -> bool
427
428
    character: int
429
    /
430
431
[clinic start generated code]*/
432
433
static int
434
_sre_ascii_iscased_impl(PyObject *module, int character)
435
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
436
7.40k
{
437
7.40k
    unsigned int ch = (unsigned int)character;
438
7.40k
    return ch < 128 && Py_ISALPHA(ch);
439
7.40k
}
440
441
/*[clinic input]
442
_sre.unicode_iscased -> bool
443
444
    character: int
445
    /
446
447
[clinic start generated code]*/
448
449
static int
450
_sre_unicode_iscased_impl(PyObject *module, int character)
451
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
452
30.8M
{
453
30.8M
    unsigned int ch = (unsigned int)character;
454
30.8M
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
455
30.8M
}
456
457
/*[clinic input]
458
_sre.ascii_tolower -> int
459
460
    character: int
461
    /
462
463
[clinic start generated code]*/
464
465
static int
466
_sre_ascii_tolower_impl(PyObject *module, int character)
467
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
468
1.37M
{
469
1.37M
    return sre_lower_ascii(character);
470
1.37M
}
471
472
/*[clinic input]
473
_sre.unicode_tolower -> int
474
475
    character: int
476
    /
477
478
[clinic start generated code]*/
479
480
static int
481
_sre_unicode_tolower_impl(PyObject *module, int character)
482
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
483
85.9M
{
484
85.9M
    return sre_lower_unicode(character);
485
85.9M
}
486
487
LOCAL(void)
488
state_reset(SRE_STATE* state)
489
105M
{
490
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
491
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
492
493
105M
    state->lastmark = -1;
494
105M
    state->lastindex = -1;
495
496
105M
    state->repeat = NULL;
497
498
105M
    data_stack_dealloc(state);
499
105M
}
500
501
static const void*
502
getstring(PyObject* string, Py_ssize_t* p_length,
503
          int* p_isbytes, int* p_charsize,
504
          Py_buffer *view)
505
127M
{
506
    /* given a python object, return a data pointer, a length (in
507
       characters), and a character size.  return NULL if the object
508
       is not a string (or not compatible) */
509
510
    /* Unicode objects do not support the buffer API. So, get the data
511
       directly instead. */
512
127M
    if (PyUnicode_Check(string)) {
513
126M
        *p_length = PyUnicode_GET_LENGTH(string);
514
126M
        *p_charsize = PyUnicode_KIND(string);
515
126M
        *p_isbytes = 0;
516
126M
        return PyUnicode_DATA(string);
517
126M
    }
518
519
    /* get pointer to byte string buffer */
520
806k
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
521
0
        PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
522
0
                     "object, got '%.200s'", Py_TYPE(string)->tp_name);
523
0
        return NULL;
524
0
    }
525
526
806k
    *p_length = view->len;
527
806k
    *p_charsize = 1;
528
806k
    *p_isbytes = 1;
529
530
806k
    if (view->buf == NULL) {
531
0
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
532
0
        PyBuffer_Release(view);
533
0
        view->buf = NULL;
534
0
        return NULL;
535
0
    }
536
806k
    return view->buf;
537
806k
}
538
539
/* Prepare `state` for matching `pattern` against `string` within the
 * character range [start, end).
 *
 * The state is zeroed, the mark array is allocated (two slots per group),
 * the string data is obtained via getstring(), and `start`/`end` are
 * clamped to [0, length].  A new reference to `string` is stored in the
 * state.
 *
 * Returns `string` on success.  On failure returns NULL with an exception
 * set, after freeing the mark array and releasing any acquired buffer.
 */
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
           Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t nchars;
    int is_bytes, width;
    const void *base;

    memset(state, 0, sizeof(SRE_STATE));

    state->mark = PyMem_New(const void *, pattern->groups * 2);
    if (state->mark == NULL) {
        PyErr_NoMemory();
        goto err;
    }
    state->lastindex = -1;
    state->lastmark = -1;

    state->buffer.buf = NULL;
    base = getstring(string, &nchars, &is_bytes, &width, &state->buffer);
    if (base == NULL) {
        goto err;
    }

    /* the pattern and the subject must be of the same flavour */
    if (is_bytes) {
        if (pattern->isbytes == 0) {
            PyErr_SetString(PyExc_TypeError,
                            "cannot use a string pattern on a bytes-like object");
            goto err;
        }
    }
    else if (pattern->isbytes > 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a bytes pattern on a string-like object");
        goto err;
    }

    /* clamp the boundaries to [0, nchars] */
    if (start < 0) {
        start = 0;
    }
    else if (start > nchars) {
        start = nchars;
    }

    if (end < 0) {
        end = 0;
    }
    else if (end > nchars) {
        end = nchars;
    }

    state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);
    state->must_advance = 0;
    state->match_all = 0;
    state->charsize = width;
    state->isbytes = is_bytes;

    state->beginning = base;

    state->start = (void*) ((char*) base + start * state->charsize);
    state->end = (void*) ((char*) base + end * state->charsize);

    state->string = Py_NewRef(string);
    state->pos = start;
    state->endpos = end;

#ifdef Py_DEBUG
    state->fail_after_count = pattern->fail_after_count;
    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

    return string;

  err:
    /* We add an explicit cast here because MSVC has a bug when
       compiling C code where it believes that `const void**` cannot be
       safely casted to `void*`, see bpo-39943 for details. */
    PyMem_Free((void*) state->mark);
    state->mark = NULL;
    if (state->buffer.buf != NULL) {
        PyBuffer_Release(&state->buffer);
    }
    return NULL;
}
617
618
LOCAL(void)
619
state_fini(SRE_STATE* state)
620
78.2M
{
621
78.2M
    if (state->buffer.buf)
622
412k
        PyBuffer_Release(&state->buffer);
623
78.2M
    Py_XDECREF(state->string);
624
78.2M
    data_stack_dealloc(state);
625
    /* See above PyMem_Free() for why we explicitly cast here. */
626
78.2M
    PyMem_Free((void*) state->mark);
627
78.2M
    state->mark = NULL;
628
    /* SRE_REPEAT pool */
629
78.2M
    repeat_pool_clear(state);
630
78.2M
}
631
632
/* calculate offset from start of string */
633
#define STATE_OFFSET(state, member)\
634
180M
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
635
636
/* Build a new object holding string[start:end].
 *
 * For str subjects this is PyUnicode_Substring().  For bytes-like
 * subjects a bytes object is created from `ptr`, except that an exact
 * bytes object sliced over its full length is returned as a new
 * reference to itself.
 */
LOCAL(PyObject*)
getslice(int isbytes, const void *ptr,
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
{
    if (!isbytes) {
        return PyUnicode_Substring(string, start, end);
    }

    if (PyBytes_CheckExact(string) &&
        start == 0 && end == PyBytes_GET_SIZE(string)) {
        /* whole-object slice: no copy needed */
        return Py_NewRef(string);
    }

    const char *bytes = (const char *)ptr;
    return PyBytes_FromStringAndSize(bytes + start, end - start);
}
652
653
/* Return the text captured by group `index` (1-based) as a new object.
 *
 * If `string` is None, or the group's marks are beyond state->lastmark or
 * unset, the result is an empty slice when `empty` is true and None
 * otherwise.  Returns NULL with SystemError set if the group's start mark
 * lies after its end mark.
 */
LOCAL(PyObject*)
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
{
    /* each group owns two consecutive mark slots: start and end */
    Py_ssize_t lo = (index - 1) * 2;
    Py_ssize_t hi = lo + 1;

    int unset = (string == Py_None
                 || lo >= state->lastmark
                 || !state->mark[lo]
                 || !state->mark[hi]);

    if (unset) {
        if (!empty) {
            Py_RETURN_NONE;
        }
        /* want empty string */
        return getslice(state->isbytes, state->beginning, string, 0, 0);
    }

    Py_ssize_t first = STATE_OFFSET(state, state->mark[lo]);
    Py_ssize_t last = STATE_OFFSET(state, state->mark[hi]);

    /* check wrong span */
    if (first > last) {
        PyErr_SetString(PyExc_SystemError,
                        "The span of capturing group is wrong,"
                        " please report a bug for the re module.");
        return NULL;
    }

    return getslice(state->isbytes, state->beginning, string, first, last);
}
682
683
static void
684
pattern_error(Py_ssize_t status)
685
0
{
686
0
    switch (status) {
687
0
    case SRE_ERROR_RECURSION_LIMIT:
688
        /* This error code seems to be unused. */
689
0
        PyErr_SetString(
690
0
            PyExc_RecursionError,
691
0
            "maximum recursion limit exceeded"
692
0
            );
693
0
        break;
694
0
    case SRE_ERROR_MEMORY:
695
0
        PyErr_NoMemory();
696
0
        break;
697
0
    case SRE_ERROR_INTERRUPTED:
698
    /* An exception has already been raised, so let it fly */
699
0
        break;
700
0
    default:
701
        /* other error codes indicate compiler/engine bugs */
702
0
        PyErr_SetString(
703
0
            PyExc_RuntimeError,
704
0
            "internal error in regular expression engine"
705
0
            );
706
0
    }
707
0
}
708
709
static int
710
pattern_traverse(PyObject *op, visitproc visit, void *arg)
711
12.7k
{
712
12.7k
    PatternObject *self = _PatternObject_CAST(op);
713
12.7k
    Py_VISIT(Py_TYPE(self));
714
12.7k
    Py_VISIT(self->groupindex);
715
12.7k
    Py_VISIT(self->indexgroup);
716
12.7k
    Py_VISIT(self->pattern);
717
#ifdef Py_DEBUG
718
    Py_VISIT(self->fail_after_exc);
719
#endif
720
12.7k
    return 0;
721
12.7k
}
722
723
static int
724
pattern_clear(PyObject *op)
725
3.35k
{
726
3.35k
    PatternObject *self = _PatternObject_CAST(op);
727
3.35k
    Py_CLEAR(self->groupindex);
728
3.35k
    Py_CLEAR(self->indexgroup);
729
3.35k
    Py_CLEAR(self->pattern);
730
#ifdef Py_DEBUG
731
    Py_CLEAR(self->fail_after_exc);
732
#endif
733
3.35k
    return 0;
734
3.35k
}
735
736
static void
737
pattern_dealloc(PyObject *self)
738
3.35k
{
739
3.35k
    PyTypeObject *tp = Py_TYPE(self);
740
3.35k
    PyObject_GC_UnTrack(self);
741
3.35k
    FT_CLEAR_WEAKREFS(self, _PatternObject_CAST(self)->weakreflist);
742
3.35k
    (void)pattern_clear(self);
743
3.35k
    tp->tp_free(self);
744
3.35k
    Py_DECREF(tp);
745
3.35k
}
746
747
LOCAL(Py_ssize_t)
748
sre_match(SRE_STATE* state, SRE_CODE* pattern)
749
58.6M
{
750
58.6M
    if (state->charsize == 1)
751
35.2M
        return sre_ucs1_match(state, pattern, 1);
752
23.4M
    if (state->charsize == 2)
753
14.2M
        return sre_ucs2_match(state, pattern, 1);
754
23.4M
    assert(state->charsize == 4);
755
9.16M
    return sre_ucs4_match(state, pattern, 1);
756
23.4M
}
757
758
LOCAL(Py_ssize_t)
759
sre_search(SRE_STATE* state, SRE_CODE* pattern)
760
110M
{
761
110M
    if (state->charsize == 1)
762
51.3M
        return sre_ucs1_search(state, pattern);
763
59.1M
    if (state->charsize == 2)
764
51.8M
        return sre_ucs2_search(state, pattern);
765
59.1M
    assert(state->charsize == 4);
766
7.28M
    return sre_ucs4_search(state, pattern);
767
59.1M
}
768
769
/*[clinic input]
770
_sre.SRE_Pattern.prefixmatch
771
772
    cls: defining_class
773
    /
774
    string: object
775
    pos: Py_ssize_t = 0
776
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
777
778
Matches zero or more characters at the beginning of the string.
779
[clinic start generated code]*/
780
781
static PyObject *
782
_sre_SRE_Pattern_prefixmatch_impl(PatternObject *self, PyTypeObject *cls,
783
                                  PyObject *string, Py_ssize_t pos,
784
                                  Py_ssize_t endpos)
785
/*[clinic end generated code: output=a0e079fb4f875240 input=e2a7e68ea47d048c]*/
786
58.6M
{
787
58.6M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
788
58.6M
    SRE_STATE state;
789
58.6M
    Py_ssize_t status;
790
58.6M
    PyObject *match;
791
792
58.6M
    if (!state_init(&state, self, string, pos, endpos))
793
0
        return NULL;
794
795
58.6M
    INIT_TRACE(&state);
796
58.6M
    state.ptr = state.start;
797
798
58.6M
    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
799
800
58.6M
    status = sre_match(&state, PatternObject_GetCode(self));
801
802
58.6M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
803
58.6M
    if (PyErr_Occurred()) {
804
0
        state_fini(&state);
805
0
        return NULL;
806
0
    }
807
808
58.6M
    match = pattern_new_match(module_state, self, &state, status);
809
58.6M
    state_fini(&state);
810
58.6M
    return match;
811
58.6M
}
812
813
814
/*[clinic input]
815
_sre.SRE_Pattern.fullmatch
816
817
    cls: defining_class
818
    /
819
    string: object
820
    pos: Py_ssize_t = 0
821
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
822
823
Matches against all of the string.
824
[clinic start generated code]*/
825
826
static PyObject *
827
_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
828
                                PyObject *string, Py_ssize_t pos,
829
                                Py_ssize_t endpos)
830
/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
831
0
{
832
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
833
0
    SRE_STATE state;
834
0
    Py_ssize_t status;
835
0
    PyObject *match;
836
837
0
    if (!state_init(&state, self, string, pos, endpos))
838
0
        return NULL;
839
840
0
    INIT_TRACE(&state);
841
0
    state.ptr = state.start;
842
843
0
    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
844
845
0
    state.match_all = 1;
846
0
    status = sre_match(&state, PatternObject_GetCode(self));
847
848
0
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
849
0
    if (PyErr_Occurred()) {
850
0
        state_fini(&state);
851
0
        return NULL;
852
0
    }
853
854
0
    match = pattern_new_match(module_state, self, &state, status);
855
0
    state_fini(&state);
856
0
    return match;
857
0
}
858
859
/*[clinic input]
860
@permit_long_summary
861
_sre.SRE_Pattern.search
862
863
    cls: defining_class
864
    /
865
    string: object
866
    pos: Py_ssize_t = 0
867
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
868
869
Scan through string looking for a match, and return a corresponding match object instance.
870
871
Return None if no position in the string matches.
872
[clinic start generated code]*/
873
874
static PyObject *
875
_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
876
                             PyObject *string, Py_ssize_t pos,
877
                             Py_ssize_t endpos)
878
/*[clinic end generated code: output=bd7f2d9d583e1463 input=05e9feee0334c156]*/
879
4.92M
{
880
4.92M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
881
4.92M
    SRE_STATE state;
882
4.92M
    Py_ssize_t status;
883
4.92M
    PyObject *match;
884
885
4.92M
    if (!state_init(&state, self, string, pos, endpos))
886
0
        return NULL;
887
888
4.92M
    INIT_TRACE(&state);
889
4.92M
    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
890
891
4.92M
    status = sre_search(&state, PatternObject_GetCode(self));
892
893
4.92M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
894
895
4.92M
    if (PyErr_Occurred()) {
896
0
        state_fini(&state);
897
0
        return NULL;
898
0
    }
899
900
4.92M
    match = pattern_new_match(module_state, self, &state, status);
901
4.92M
    state_fini(&state);
902
4.92M
    return match;
903
4.92M
}
904
905
/*[clinic input]
906
_sre.SRE_Pattern.findall
907
908
    string: object
909
    pos: Py_ssize_t = 0
910
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
911
912
Return a list of all non-overlapping matches of pattern in string.
913
[clinic start generated code]*/
914
915
static PyObject *
916
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
917
                              Py_ssize_t pos, Py_ssize_t endpos)
918
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
919
3.34M
{
920
3.34M
    SRE_STATE state;
921
3.34M
    PyObject* list;
922
3.34M
    Py_ssize_t status;
923
3.34M
    Py_ssize_t i, b, e;
924
925
3.34M
    if (!state_init(&state, self, string, pos, endpos))
926
0
        return NULL;
927
928
3.34M
    list = PyList_New(0);
929
3.34M
    if (!list) {
930
0
        state_fini(&state);
931
0
        return NULL;
932
0
    }
933
934
84.6M
    while (state.start <= state.end) {
935
936
84.6M
        PyObject* item;
937
938
84.6M
        state_reset(&state);
939
940
84.6M
        state.ptr = state.start;
941
942
84.6M
        status = sre_search(&state, PatternObject_GetCode(self));
943
84.6M
        if (PyErr_Occurred())
944
0
            goto error;
945
946
84.6M
        if (status <= 0) {
947
3.34M
            if (status == 0)
948
3.34M
                break;
949
0
            pattern_error(status);
950
0
            goto error;
951
3.34M
        }
952
953
        /* don't bother to build a match object */
954
81.2M
        switch (self->groups) {
955
81.2M
        case 0:
956
81.2M
            b = STATE_OFFSET(&state, state.start);
957
81.2M
            e = STATE_OFFSET(&state, state.ptr);
958
81.2M
            item = getslice(state.isbytes, state.beginning,
959
81.2M
                            string, b, e);
960
81.2M
            if (!item)
961
0
                goto error;
962
81.2M
            break;
963
81.2M
        case 1:
964
0
            item = state_getslice(&state, 1, string, 1);
965
0
            if (!item)
966
0
                goto error;
967
0
            break;
968
0
        default:
969
0
            item = PyTuple_New(self->groups);
970
0
            if (!item)
971
0
                goto error;
972
0
            for (i = 0; i < self->groups; i++) {
973
0
                PyObject* o = state_getslice(&state, i+1, string, 1);
974
0
                if (!o) {
975
0
                    Py_DECREF(item);
976
0
                    goto error;
977
0
                }
978
0
                PyTuple_SET_ITEM(item, i, o);
979
0
            }
980
0
            break;
981
81.2M
        }
982
983
81.2M
        status = PyList_Append(list, item);
984
81.2M
        Py_DECREF(item);
985
81.2M
        if (status < 0)
986
0
            goto error;
987
988
81.2M
        state.must_advance = (state.ptr == state.start);
989
81.2M
        state.start = state.ptr;
990
81.2M
    }
991
992
3.34M
    state_fini(&state);
993
3.34M
    return list;
994
995
0
error:
996
0
    Py_DECREF(list);
997
0
    state_fini(&state);
998
0
    return NULL;
999
1000
3.34M
}
1001
1002
/*[clinic input]
1003
@permit_long_summary
1004
_sre.SRE_Pattern.finditer
1005
1006
    cls: defining_class
1007
    /
1008
    string: object
1009
    pos: Py_ssize_t = 0
1010
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1011
1012
Return an iterator over all non-overlapping matches for the RE pattern in string.
1013
1014
For each match, the iterator returns a match object.
1015
[clinic start generated code]*/
1016
1017
static PyObject *
1018
_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
1019
                               PyObject *string, Py_ssize_t pos,
1020
                               Py_ssize_t endpos)
1021
/*[clinic end generated code: output=1791dbf3618ade56 input=ee28865796048023]*/
1022
352k
{
1023
352k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1024
352k
    PyObject* scanner;
1025
352k
    PyObject* search;
1026
352k
    PyObject* iterator;
1027
1028
352k
    scanner = pattern_scanner(module_state, self, string, pos, endpos);
1029
352k
    if (!scanner)
1030
0
        return NULL;
1031
1032
352k
    search = PyObject_GetAttrString(scanner, "search");
1033
352k
    Py_DECREF(scanner);
1034
352k
    if (!search)
1035
0
        return NULL;
1036
1037
352k
    iterator = PyCallIter_New(search, Py_None);
1038
352k
    Py_DECREF(search);
1039
1040
352k
    return iterator;
1041
352k
}
1042
1043
/*[clinic input]
1044
_sre.SRE_Pattern.scanner
1045
1046
    cls: defining_class
1047
    /
1048
    string: object
1049
    pos: Py_ssize_t = 0
1050
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1051
1052
[clinic start generated code]*/
1053
1054
static PyObject *
1055
_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
1056
                              PyObject *string, Py_ssize_t pos,
1057
                              Py_ssize_t endpos)
1058
/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
1059
0
{
1060
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1061
1062
0
    return pattern_scanner(module_state, self, string, pos, endpos);
1063
0
}
1064
1065
/*[clinic input]
1066
_sre.SRE_Pattern.split
1067
1068
    string: object
1069
    maxsplit: Py_ssize_t = 0
1070
1071
Split string by the occurrences of pattern.
1072
[clinic start generated code]*/
1073
1074
static PyObject *
1075
_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
1076
                            Py_ssize_t maxsplit)
1077
/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
1078
1.21M
{
1079
1.21M
    SRE_STATE state;
1080
1.21M
    PyObject* list;
1081
1.21M
    PyObject* item;
1082
1.21M
    Py_ssize_t status;
1083
1.21M
    Py_ssize_t n;
1084
1.21M
    Py_ssize_t i;
1085
1.21M
    const void* last;
1086
1087
1.21M
    assert(self->codesize != 0);
1088
1089
1.21M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
1090
0
        return NULL;
1091
1092
1.21M
    list = PyList_New(0);
1093
1.21M
    if (!list) {
1094
0
        state_fini(&state);
1095
0
        return NULL;
1096
0
    }
1097
1098
1.21M
    n = 0;
1099
1.21M
    last = state.start;
1100
1101
2.06M
    while (!maxsplit || n < maxsplit) {
1102
1103
1.28M
        state_reset(&state);
1104
1105
1.28M
        state.ptr = state.start;
1106
1107
1.28M
        status = sre_search(&state, PatternObject_GetCode(self));
1108
1.28M
        if (PyErr_Occurred())
1109
0
            goto error;
1110
1111
1.28M
        if (status <= 0) {
1112
442k
            if (status == 0)
1113
442k
                break;
1114
0
            pattern_error(status);
1115
0
            goto error;
1116
442k
        }
1117
1118
        /* get segment before this match */
1119
846k
        item = getslice(state.isbytes, state.beginning,
1120
846k
            string, STATE_OFFSET(&state, last),
1121
846k
            STATE_OFFSET(&state, state.start)
1122
846k
            );
1123
846k
        if (!item)
1124
0
            goto error;
1125
846k
        status = PyList_Append(list, item);
1126
846k
        Py_DECREF(item);
1127
846k
        if (status < 0)
1128
0
            goto error;
1129
1130
        /* add groups (if any) */
1131
1.61M
        for (i = 0; i < self->groups; i++) {
1132
771k
            item = state_getslice(&state, i+1, string, 0);
1133
771k
            if (!item)
1134
0
                goto error;
1135
771k
            status = PyList_Append(list, item);
1136
771k
            Py_DECREF(item);
1137
771k
            if (status < 0)
1138
0
                goto error;
1139
771k
        }
1140
1141
846k
        n = n + 1;
1142
846k
        state.must_advance = (state.ptr == state.start);
1143
846k
        last = state.start = state.ptr;
1144
1145
846k
    }
1146
1147
    /* get segment following last match (even if empty) */
1148
1.21M
    item = getslice(state.isbytes, state.beginning,
1149
1.21M
        string, STATE_OFFSET(&state, last), state.endpos
1150
1.21M
        );
1151
1.21M
    if (!item)
1152
0
        goto error;
1153
1.21M
    status = PyList_Append(list, item);
1154
1.21M
    Py_DECREF(item);
1155
1.21M
    if (status < 0)
1156
0
        goto error;
1157
1158
1.21M
    state_fini(&state);
1159
1.21M
    return list;
1160
1161
0
error:
1162
0
    Py_DECREF(list);
1163
0
    state_fini(&state);
1164
0
    return NULL;
1165
1166
1.21M
}
1167
1168
static PyObject *
1169
compile_template(_sremodulestate *module_state,
1170
                 PatternObject *pattern, PyObject *template)
1171
0
{
1172
    /* delegate to Python code */
1173
0
    PyObject *func = FT_ATOMIC_LOAD_PTR(module_state->compile_template);
1174
0
    if (func == NULL) {
1175
0
        func = PyImport_ImportModuleAttrString("re", "_compile_template");
1176
0
        if (func == NULL) {
1177
0
            return NULL;
1178
0
        }
1179
#ifdef Py_GIL_DISABLED
1180
        PyObject *other_func = NULL;
1181
        if (!_Py_atomic_compare_exchange_ptr(&module_state->compile_template, &other_func, func))  {
1182
            Py_DECREF(func);
1183
            func = other_func;
1184
        }
1185
#else
1186
0
        Py_XSETREF(module_state->compile_template, func);
1187
0
#endif
1188
0
    }
1189
1190
0
    PyObject *args[] = {(PyObject *)pattern, template};
1191
0
    PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1192
1193
0
    if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1194
        /* If the replacement string is unhashable (e.g. bytearray),
1195
         * convert it to the basic type (str or bytes) and repeat. */
1196
0
        if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1197
0
            PyErr_Clear();
1198
0
            template = _PyUnicode_Copy(template);
1199
0
        }
1200
0
        else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1201
0
            PyErr_Clear();
1202
0
            template = PyBytes_FromObject(template);
1203
0
        }
1204
0
        else {
1205
0
            return NULL;
1206
0
        }
1207
0
        if (template == NULL) {
1208
0
            return NULL;
1209
0
        }
1210
0
        args[1] = template;
1211
0
        result = PyObject_Vectorcall(func, args, 2, NULL);
1212
0
        Py_DECREF(template);
1213
0
    }
1214
1215
0
    if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1216
0
        PyErr_Format(PyExc_RuntimeError,
1217
0
                    "the result of compiling a replacement string is %.200s",
1218
0
                    Py_TYPE(result)->tp_name);
1219
0
        Py_DECREF(result);
1220
0
        return NULL;
1221
0
    }
1222
0
    return result;
1223
0
}
1224
1225
static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1226
1227
static PyObject*
1228
pattern_subx(_sremodulestate* module_state,
1229
             PatternObject* self,
1230
             PyObject* ptemplate,
1231
             PyObject* string,
1232
             Py_ssize_t count,
1233
             Py_ssize_t subn)
1234
9.79M
{
1235
9.79M
    SRE_STATE state;
1236
9.79M
    PyObject* list;
1237
9.79M
    PyObject* joiner;
1238
9.79M
    PyObject* item;
1239
9.79M
    PyObject* filter;
1240
9.79M
    PyObject* match;
1241
9.79M
    const void* ptr;
1242
9.79M
    Py_ssize_t status;
1243
9.79M
    Py_ssize_t n;
1244
9.79M
    Py_ssize_t i, b, e;
1245
9.79M
    int isbytes, charsize;
1246
9.79M
    enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1247
9.79M
    Py_buffer view;
1248
1249
9.79M
    if (PyCallable_Check(ptemplate)) {
1250
        /* sub/subn takes either a function or a template */
1251
3.82M
        filter = Py_NewRef(ptemplate);
1252
3.82M
        filter_type = CALLABLE;
1253
5.97M
    } else {
1254
        /* if not callable, check if it's a literal string */
1255
5.97M
        int literal;
1256
5.97M
        view.buf = NULL;
1257
5.97M
        ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1258
5.97M
        if (ptr) {
1259
5.97M
            if (charsize == 1)
1260
5.97M
                literal = memchr(ptr, '\\', n) == NULL;
1261
0
            else
1262
0
                literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1263
5.97M
        } else {
1264
0
            PyErr_Clear();
1265
0
            literal = 0;
1266
0
        }
1267
5.97M
        if (view.buf)
1268
0
            PyBuffer_Release(&view);
1269
5.97M
        if (literal) {
1270
5.97M
            filter = Py_NewRef(ptemplate);
1271
5.97M
            filter_type = LITERAL;
1272
5.97M
        } else {
1273
            /* not a literal; hand it over to the template compiler */
1274
0
            filter = compile_template(module_state, self, ptemplate);
1275
0
            if (!filter)
1276
0
                return NULL;
1277
1278
0
            assert(Py_TYPE(filter) == module_state->Template_Type);
1279
0
            if (Py_SIZE(filter) == 0) {
1280
0
                Py_SETREF(filter,
1281
0
                          Py_NewRef(((TemplateObject *)filter)->literal));
1282
0
                filter_type = LITERAL;
1283
0
            }
1284
0
            else {
1285
0
                filter_type = TEMPLATE;
1286
0
            }
1287
0
        }
1288
5.97M
    }
1289
1290
9.79M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1291
0
        Py_DECREF(filter);
1292
0
        return NULL;
1293
0
    }
1294
1295
9.79M
    list = PyList_New(0);
1296
9.79M
    if (!list) {
1297
0
        Py_DECREF(filter);
1298
0
        state_fini(&state);
1299
0
        return NULL;
1300
0
    }
1301
1302
9.79M
    n = i = 0;
1303
1304
16.3M
    while (!count || n < count) {
1305
1306
16.3M
        state_reset(&state);
1307
1308
16.3M
        state.ptr = state.start;
1309
1310
16.3M
        status = sre_search(&state, PatternObject_GetCode(self));
1311
16.3M
        if (PyErr_Occurred())
1312
0
            goto error;
1313
1314
16.3M
        if (status <= 0) {
1315
9.79M
            if (status == 0)
1316
9.79M
                break;
1317
0
            pattern_error(status);
1318
0
            goto error;
1319
9.79M
        }
1320
1321
6.56M
        b = STATE_OFFSET(&state, state.start);
1322
6.56M
        e = STATE_OFFSET(&state, state.ptr);
1323
1324
6.56M
        if (i < b) {
1325
            /* get segment before this match */
1326
3.35M
            item = getslice(state.isbytes, state.beginning,
1327
3.35M
                string, i, b);
1328
3.35M
            if (!item)
1329
0
                goto error;
1330
3.35M
            status = PyList_Append(list, item);
1331
3.35M
            Py_DECREF(item);
1332
3.35M
            if (status < 0)
1333
0
                goto error;
1334
1335
3.35M
        }
1336
1337
6.56M
        if (filter_type != LITERAL) {
1338
            /* pass match object through filter */
1339
6.56M
            match = pattern_new_match(module_state, self, &state, 1);
1340
6.56M
            if (!match)
1341
0
                goto error;
1342
6.56M
            if (filter_type == TEMPLATE) {
1343
0
                item = expand_template((TemplateObject *)filter,
1344
0
                                       (MatchObject *)match);
1345
0
            }
1346
6.56M
            else {
1347
6.56M
                assert(filter_type == CALLABLE);
1348
6.56M
                item = PyObject_CallOneArg(filter, match);
1349
6.56M
            }
1350
6.56M
            Py_DECREF(match);
1351
6.56M
            if (!item)
1352
56
                goto error;
1353
6.56M
        } else {
1354
            /* filter is literal string */
1355
2.59k
            item = Py_NewRef(filter);
1356
2.59k
        }
1357
1358
        /* add to list */
1359
6.56M
        if (item != Py_None) {
1360
6.56M
            status = PyList_Append(list, item);
1361
6.56M
            Py_DECREF(item);
1362
6.56M
            if (status < 0)
1363
0
                goto error;
1364
6.56M
        }
1365
1366
6.56M
        i = e;
1367
6.56M
        n = n + 1;
1368
6.56M
        state.must_advance = (state.ptr == state.start);
1369
6.56M
        state.start = state.ptr;
1370
6.56M
    }
1371
1372
    /* get segment following last match */
1373
9.79M
    if (i < state.endpos) {
1374
7.24M
        item = getslice(state.isbytes, state.beginning,
1375
7.24M
                        string, i, state.endpos);
1376
7.24M
        if (!item)
1377
0
            goto error;
1378
7.24M
        status = PyList_Append(list, item);
1379
7.24M
        Py_DECREF(item);
1380
7.24M
        if (status < 0)
1381
0
            goto error;
1382
7.24M
    }
1383
1384
9.79M
    state_fini(&state);
1385
1386
9.79M
    Py_DECREF(filter);
1387
1388
    /* convert list to single string (also removes list) */
1389
9.79M
    joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1390
9.79M
    if (!joiner) {
1391
0
        Py_DECREF(list);
1392
0
        return NULL;
1393
0
    }
1394
9.79M
    if (PyList_GET_SIZE(list) == 0) {
1395
1.91M
        Py_DECREF(list);
1396
1.91M
        item = joiner;
1397
1.91M
    }
1398
7.87M
    else {
1399
7.87M
        if (state.isbytes)
1400
33.5k
            item = PyBytes_Join(joiner, list);
1401
7.84M
        else
1402
7.84M
            item = PyUnicode_Join(joiner, list);
1403
7.87M
        Py_DECREF(joiner);
1404
7.87M
        Py_DECREF(list);
1405
7.87M
        if (!item)
1406
0
            return NULL;
1407
7.87M
    }
1408
1409
9.79M
    if (subn)
1410
0
        return Py_BuildValue("Nn", item, n);
1411
1412
9.79M
    return item;
1413
1414
56
error:
1415
56
    Py_DECREF(list);
1416
56
    state_fini(&state);
1417
56
    Py_DECREF(filter);
1418
56
    return NULL;
1419
1420
9.79M
}
1421
1422
/*[clinic input]
1423
@permit_long_summary
1424
_sre.SRE_Pattern.sub
1425
1426
    cls: defining_class
1427
    /
1428
    repl: object
1429
    string: object
1430
    count: Py_ssize_t = 0
1431
1432
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1433
[clinic start generated code]*/
1434
1435
static PyObject *
1436
_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1437
                          PyObject *repl, PyObject *string, Py_ssize_t count)
1438
/*[clinic end generated code: output=4be141ab04bca60d input=eba511fd1c4908b7]*/
1439
9.79M
{
1440
9.79M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1441
1442
9.79M
    return pattern_subx(module_state, self, repl, string, count, 0);
1443
9.79M
}
1444
1445
/*[clinic input]
1446
@permit_long_summary
1447
_sre.SRE_Pattern.subn
1448
1449
    cls: defining_class
1450
    /
1451
    repl: object
1452
    string: object
1453
    count: Py_ssize_t = 0
1454
1455
Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1456
[clinic start generated code]*/
1457
1458
static PyObject *
1459
_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1460
                           PyObject *repl, PyObject *string,
1461
                           Py_ssize_t count)
1462
/*[clinic end generated code: output=da02fd85258b1e1f input=6a5bb5b61717abf0]*/
1463
0
{
1464
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1465
1466
0
    return pattern_subx(module_state, self, repl, string, count, 1);
1467
0
}
1468
1469
/*[clinic input]
1470
_sre.SRE_Pattern.__copy__
1471
1472
[clinic start generated code]*/
1473
1474
static PyObject *
1475
_sre_SRE_Pattern___copy___impl(PatternObject *self)
1476
/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1477
0
{
1478
0
    return Py_NewRef(self);
1479
0
}
1480
1481
/*[clinic input]
1482
_sre.SRE_Pattern.__deepcopy__
1483
1484
    memo: object
1485
    /
1486
1487
[clinic start generated code]*/
1488
1489
static PyObject *
1490
_sre_SRE_Pattern___deepcopy___impl(PatternObject *self, PyObject *memo)
1491
/*[clinic end generated code: output=75efe69bd12c5d7d input=a465b1602f997bed]*/
1492
0
{
1493
0
    return Py_NewRef(self);
1494
0
}
1495
1496
#ifdef Py_DEBUG
1497
/*[clinic input]
1498
_sre.SRE_Pattern._fail_after
1499
1500
    count: int
1501
    exception: object
1502
    /
1503
1504
For debugging.
1505
[clinic start generated code]*/
1506
1507
static PyObject *
1508
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1509
                                  PyObject *exception)
1510
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1511
{
1512
    self->fail_after_count = count;
1513
    Py_INCREF(exception);
1514
    Py_XSETREF(self->fail_after_exc, exception);
1515
    Py_RETURN_NONE;
1516
}
1517
#endif /* Py_DEBUG */
1518
1519
static PyObject *
1520
pattern_repr(PyObject *self)
1521
0
{
1522
0
    static const struct {
1523
0
        const char *name;
1524
0
        int value;
1525
0
    } flag_names[] = {
1526
0
        {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
1527
0
        {"re.LOCALE", SRE_FLAG_LOCALE},
1528
0
        {"re.MULTILINE", SRE_FLAG_MULTILINE},
1529
0
        {"re.DOTALL", SRE_FLAG_DOTALL},
1530
0
        {"re.UNICODE", SRE_FLAG_UNICODE},
1531
0
        {"re.VERBOSE", SRE_FLAG_VERBOSE},
1532
0
        {"re.DEBUG", SRE_FLAG_DEBUG},
1533
0
        {"re.ASCII", SRE_FLAG_ASCII},
1534
0
    };
1535
1536
0
    PatternObject *obj = _PatternObject_CAST(self);
1537
0
    PyObject *result = NULL;
1538
0
    PyObject *flag_items;
1539
0
    size_t i;
1540
0
    int flags = obj->flags;
1541
1542
    /* Omit re.UNICODE for valid string patterns. */
1543
0
    if (obj->isbytes == 0 &&
1544
0
        (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
1545
0
         SRE_FLAG_UNICODE)
1546
0
        flags &= ~SRE_FLAG_UNICODE;
1547
1548
0
    flag_items = PyList_New(0);
1549
0
    if (!flag_items)
1550
0
        return NULL;
1551
1552
0
    for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
1553
0
        if (flags & flag_names[i].value) {
1554
0
            PyObject *item = PyUnicode_FromString(flag_names[i].name);
1555
0
            if (!item)
1556
0
                goto done;
1557
1558
0
            if (PyList_Append(flag_items, item) < 0) {
1559
0
                Py_DECREF(item);
1560
0
                goto done;
1561
0
            }
1562
0
            Py_DECREF(item);
1563
0
            flags &= ~flag_names[i].value;
1564
0
        }
1565
0
    }
1566
0
    if (flags) {
1567
0
        PyObject *item = PyUnicode_FromFormat("0x%x", flags);
1568
0
        if (!item)
1569
0
            goto done;
1570
1571
0
        if (PyList_Append(flag_items, item) < 0) {
1572
0
            Py_DECREF(item);
1573
0
            goto done;
1574
0
        }
1575
0
        Py_DECREF(item);
1576
0
    }
1577
1578
0
    if (PyList_Size(flag_items) > 0) {
1579
0
        PyObject *flags_result;
1580
0
        PyObject *sep = PyUnicode_FromString("|");
1581
0
        if (!sep)
1582
0
            goto done;
1583
0
        flags_result = PyUnicode_Join(sep, flag_items);
1584
0
        Py_DECREF(sep);
1585
0
        if (!flags_result)
1586
0
            goto done;
1587
0
        result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
1588
0
                                      obj->pattern, flags_result);
1589
0
        Py_DECREF(flags_result);
1590
0
    }
1591
0
    else {
1592
0
        result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
1593
0
    }
1594
1595
0
done:
1596
0
    Py_DECREF(flag_items);
1597
0
    return result;
1598
0
}
1599
1600
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1601
1602
/* PatternObject's 'groupindex' method. */
1603
static PyObject *
1604
pattern_groupindex(PyObject *op, void *Py_UNUSED(ignored))
1605
0
{
1606
0
    PatternObject *self = _PatternObject_CAST(op);
1607
0
    if (self->groupindex == NULL)
1608
0
        return PyDict_New();
1609
0
    return PyDictProxy_New(self->groupindex);
1610
0
}
1611
1612
static int _validate(PatternObject *self); /* Forward */
1613
1614
/*[clinic input]
1615
_sre.compile
1616
1617
    pattern: object
1618
    flags: int
1619
    code: object(subclass_of='&PyList_Type')
1620
    groups: Py_ssize_t
1621
    groupindex: object(subclass_of='&PyDict_Type')
1622
    indexgroup: object(subclass_of='&PyTuple_Type')
1623
1624
[clinic start generated code]*/
1625
1626
static PyObject *
1627
_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1628
                  PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1629
                  PyObject *indexgroup)
1630
/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1631
3.74k
{
1632
    /* "compile" pattern descriptor to pattern object */
1633
1634
3.74k
    _sremodulestate *module_state = get_sre_module_state(module);
1635
3.74k
    PatternObject* self;
1636
3.74k
    Py_ssize_t i, n;
1637
1638
3.74k
    n = PyList_GET_SIZE(code);
1639
    /* coverity[ampersand_in_size] */
1640
3.74k
    self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1641
3.74k
    if (!self)
1642
0
        return NULL;
1643
3.74k
    self->weakreflist = NULL;
1644
3.74k
    self->pattern = NULL;
1645
3.74k
    self->groupindex = NULL;
1646
3.74k
    self->indexgroup = NULL;
1647
#ifdef Py_DEBUG
1648
    self->fail_after_count = -1;
1649
    self->fail_after_exc = NULL;
1650
#endif
1651
1652
3.74k
    self->codesize = n;
1653
1654
96.5M
    for (i = 0; i < n; i++) {
1655
96.5M
        PyObject *o = PyList_GET_ITEM(code, i);
1656
96.5M
        unsigned long value = PyLong_AsUnsignedLong(o);
1657
96.5M
        if (value == (unsigned long)-1 && PyErr_Occurred()) {
1658
0
            break;
1659
0
        }
1660
96.5M
        self->code[i] = (SRE_CODE) value;
1661
96.5M
        if ((unsigned long) self->code[i] != value) {
1662
0
            PyErr_SetString(PyExc_OverflowError,
1663
0
                            "regular expression code size limit exceeded");
1664
0
            break;
1665
0
        }
1666
96.5M
    }
1667
3.74k
    PyObject_GC_Track(self);
1668
1669
3.74k
    if (PyErr_Occurred()) {
1670
0
        Py_DECREF(self);
1671
0
        return NULL;
1672
0
    }
1673
1674
3.74k
    if (pattern == Py_None) {
1675
0
        self->isbytes = -1;
1676
0
    }
1677
3.74k
    else {
1678
3.74k
        Py_ssize_t p_length;
1679
3.74k
        int charsize;
1680
3.74k
        Py_buffer view;
1681
3.74k
        view.buf = NULL;
1682
3.74k
        if (!getstring(pattern, &p_length, &self->isbytes,
1683
3.74k
                       &charsize, &view)) {
1684
0
            Py_DECREF(self);
1685
0
            return NULL;
1686
0
        }
1687
3.74k
        if (view.buf)
1688
42
            PyBuffer_Release(&view);
1689
3.74k
    }
1690
1691
3.74k
    self->pattern = Py_NewRef(pattern);
1692
1693
3.74k
    self->flags = flags;
1694
1695
3.74k
    self->groups = groups;
1696
1697
3.74k
    if (PyDict_GET_SIZE(groupindex) > 0) {
1698
59
        self->groupindex = Py_NewRef(groupindex);
1699
59
        if (PyTuple_GET_SIZE(indexgroup) > 0) {
1700
59
            self->indexgroup = Py_NewRef(indexgroup);
1701
59
        }
1702
59
    }
1703
1704
3.74k
    if (!_validate(self)) {
1705
0
        Py_DECREF(self);
1706
0
        return NULL;
1707
0
    }
1708
1709
3.74k
    return (PyObject*) self;
1710
3.74k
}
1711
1712
/*[clinic input]
1713
_sre.template
1714
1715
    pattern: object
1716
    template: object(subclass_of="&PyList_Type")
1717
        A list containing interleaved literal strings (str or bytes) and group
1718
        indices (int), as returned by re._parser.parse_template():
1719
            [literal1, group1, ..., literalN, groupN]
1720
    /
1721
1722
[clinic start generated code]*/
1723
1724
static PyObject *
1725
_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
1726
/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
1727
0
{
1728
    /* template is a list containing interleaved literal strings (str or bytes)
1729
     * and group indices (int), as returned by _parser.parse_template:
1730
     * [literal1, group1, literal2, ..., literalN].
1731
     */
1732
0
    _sremodulestate *module_state = get_sre_module_state(module);
1733
0
    TemplateObject *self = NULL;
1734
0
    Py_ssize_t n = PyList_GET_SIZE(template);
1735
0
    if ((n & 1) == 0 || n < 1) {
1736
0
        goto bad_template;
1737
0
    }
1738
0
    n /= 2;
1739
0
    self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, n);
1740
0
    if (!self)
1741
0
        return NULL;
1742
0
    self->chunks = 1 + 2*n;
1743
0
    self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));
1744
0
    for (Py_ssize_t i = 0; i < n; i++) {
1745
0
        Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*i+1));
1746
0
        if (index == -1 && PyErr_Occurred()) {
1747
0
            Py_SET_SIZE(self, i);
1748
0
            Py_DECREF(self);
1749
0
            return NULL;
1750
0
        }
1751
0
        if (index < 0) {
1752
0
            Py_SET_SIZE(self, i);
1753
0
            goto bad_template;
1754
0
        }
1755
0
        self->items[i].index = index;
1756
1757
0
        PyObject *literal = PyList_GET_ITEM(template, 2*i+2);
1758
        // Skip empty literals.
1759
0
        if ((PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) ||
1760
0
            (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal)))
1761
0
        {
1762
0
            literal = NULL;
1763
0
            self->chunks--;
1764
0
        }
1765
0
        self->items[i].literal = Py_XNewRef(literal);
1766
0
    }
1767
0
    PyObject_GC_Track(self);
1768
0
    return (PyObject*) self;
1769
1770
0
bad_template:
1771
0
    PyErr_SetString(PyExc_TypeError, "invalid template");
1772
0
    Py_XDECREF(self);
1773
0
    return NULL;
1774
0
}
1775
1776
/* -------------------------------------------------------------------- */
1777
/* Code validation */
1778
1779
/* To learn more about this code, have a look at the _compile() function in
1780
   Lib/re/_compiler.py.  The validation functions below check the code array
1781
   for conformance with the code patterns generated there.
1782
1783
   The nice thing about the generated code is that it is position-independent:
1784
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
1785
   the target of a later jump is always earlier than the target of an earlier
1786
   jump.  IOW, this is okay:
1787
1788
   J---------J-------T--------T
1789
    \         \_____/        /
1790
     \______________________/
1791
1792
   but this is not:
1793
1794
   J---------J-------T--------T
1795
    \_________\_____/        /
1796
               \____________/
1797
1798
   It also helps that SRE_CODE is always an unsigned type.
1799
*/
1800
1801
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete,
   parenthesized printf() argument list, e.g. VTRACE(("x=%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns -1 from the *enclosing function*. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on locals of the enclosing function: they read and
   advance `code`, compare it against `end`, and store into `op`, `arg` or
   `skip` respectively.  Each one FAILs (returns -1 from the enclosing
   function) when `code` has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and check that `skip - adj`, counted from the skip word
   itself, does not point past `end`.  `adj` is parenthesized so that an
   expression argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", (void *)code));                 \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip,                     \
               (void *)(code+skip)));                   \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1841
1842
static int
1843
_validate_charset(SRE_CODE *code, SRE_CODE *end)
1844
3.40M
{
1845
    /* Some variables are manipulated by the macros above */
1846
3.40M
    SRE_CODE op;
1847
3.40M
    SRE_CODE arg;
1848
3.40M
    SRE_CODE offset;
1849
3.40M
    int i;
1850
1851
10.1M
    while (code < end) {
1852
6.71M
        GET_OP;
1853
6.71M
        switch (op) {
1854
1855
1.26k
        case SRE_OP_NEGATE:
1856
1.26k
            break;
1857
1858
6.61M
        case SRE_OP_LITERAL:
1859
6.61M
            GET_ARG;
1860
6.61M
            break;
1861
1862
6.61M
        case SRE_OP_RANGE:
1863
11.7k
        case SRE_OP_RANGE_UNI_IGNORE:
1864
11.7k
            GET_ARG;
1865
11.7k
            GET_ARG;
1866
11.7k
            break;
1867
1868
11.7k
        case SRE_OP_CHARSET:
1869
784
            offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1870
784
            if (offset > (uintptr_t)(end - code))
1871
0
                FAIL;
1872
784
            code += offset;
1873
784
            break;
1874
1875
89.6k
        case SRE_OP_BIGCHARSET:
1876
89.6k
            GET_ARG; /* Number of blocks */
1877
89.6k
            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1878
89.6k
            if (offset > (uintptr_t)(end - code))
1879
0
                FAIL;
1880
            /* Make sure that each byte points to a valid block */
1881
23.0M
            for (i = 0; i < 256; i++) {
1882
22.9M
                if (((unsigned char *)code)[i] >= arg)
1883
0
                    FAIL;
1884
22.9M
            }
1885
89.6k
            code += offset;
1886
89.6k
            offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1887
89.6k
            if (offset > (uintptr_t)(end - code))
1888
0
                FAIL;
1889
89.6k
            code += offset;
1890
89.6k
            break;
1891
1892
1.63k
        case SRE_OP_CATEGORY:
1893
1.63k
            GET_ARG;
1894
1.63k
            switch (arg) {
1895
34
            case SRE_CATEGORY_DIGIT:
1896
34
            case SRE_CATEGORY_NOT_DIGIT:
1897
66
            case SRE_CATEGORY_SPACE:
1898
66
            case SRE_CATEGORY_NOT_SPACE:
1899
92
            case SRE_CATEGORY_WORD:
1900
92
            case SRE_CATEGORY_NOT_WORD:
1901
92
            case SRE_CATEGORY_LINEBREAK:
1902
92
            case SRE_CATEGORY_NOT_LINEBREAK:
1903
92
            case SRE_CATEGORY_LOC_WORD:
1904
92
            case SRE_CATEGORY_LOC_NOT_WORD:
1905
222
            case SRE_CATEGORY_UNI_DIGIT:
1906
766
            case SRE_CATEGORY_UNI_NOT_DIGIT:
1907
1.46k
            case SRE_CATEGORY_UNI_SPACE:
1908
1.47k
            case SRE_CATEGORY_UNI_NOT_SPACE:
1909
1.57k
            case SRE_CATEGORY_UNI_WORD:
1910
1.63k
            case SRE_CATEGORY_UNI_NOT_WORD:
1911
1.63k
            case SRE_CATEGORY_UNI_LINEBREAK:
1912
1.63k
            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1913
1.63k
                break;
1914
0
            default:
1915
0
                FAIL;
1916
1.63k
            }
1917
1.63k
            break;
1918
1919
1.63k
        default:
1920
0
            FAIL;
1921
1922
6.71M
        }
1923
6.71M
    }
1924
1925
3.40M
    return 0;
1926
3.40M
}
1927
1928
/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1929
static int
1930
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1931
2.20M
{
1932
    /* Some variables are manipulated by the macros above */
1933
2.20M
    SRE_CODE op;
1934
2.20M
    SRE_CODE arg;
1935
2.20M
    SRE_CODE skip;
1936
1937
2.20M
    VTRACE(("code=%p, end=%p\n", code, end));
1938
1939
2.20M
    if (code > end)
1940
0
        FAIL;
1941
1942
27.6M
    while (code < end) {
1943
25.4M
        GET_OP;
1944
25.4M
        switch (op) {
1945
1946
358k
        case SRE_OP_MARK:
1947
            /* We don't check whether marks are properly nested; the
1948
               sre_match() code is robust even if they don't, and the worst
1949
               you can get is nonsensical match results. */
1950
358k
            GET_ARG;
1951
358k
            if (arg >= 2 * (size_t)groups) {
1952
0
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1953
0
                FAIL;
1954
0
            }
1955
358k
            break;
1956
1957
16.3M
        case SRE_OP_LITERAL:
1958
16.3M
        case SRE_OP_NOT_LITERAL:
1959
16.3M
        case SRE_OP_LITERAL_IGNORE:
1960
16.3M
        case SRE_OP_NOT_LITERAL_IGNORE:
1961
20.1M
        case SRE_OP_LITERAL_UNI_IGNORE:
1962
20.1M
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1963
20.1M
        case SRE_OP_LITERAL_LOC_IGNORE:
1964
20.1M
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1965
20.1M
            GET_ARG;
1966
            /* The arg is just a character, nothing to check */
1967
20.1M
            break;
1968
1969
20.1M
        case SRE_OP_SUCCESS:
1970
81
        case SRE_OP_FAILURE:
1971
            /* Nothing to check; these normally end the matching process */
1972
81
            break;
1973
1974
83.9k
        case SRE_OP_AT:
1975
83.9k
            GET_ARG;
1976
83.9k
            switch (arg) {
1977
49
            case SRE_AT_BEGINNING:
1978
57
            case SRE_AT_BEGINNING_STRING:
1979
71.7k
            case SRE_AT_BEGINNING_LINE:
1980
71.8k
            case SRE_AT_END:
1981
79.6k
            case SRE_AT_END_LINE:
1982
79.7k
            case SRE_AT_END_STRING:
1983
79.7k
            case SRE_AT_BOUNDARY:
1984
79.7k
            case SRE_AT_NON_BOUNDARY:
1985
79.7k
            case SRE_AT_LOC_BOUNDARY:
1986
79.7k
            case SRE_AT_LOC_NON_BOUNDARY:
1987
83.9k
            case SRE_AT_UNI_BOUNDARY:
1988
83.9k
            case SRE_AT_UNI_NON_BOUNDARY:
1989
83.9k
                break;
1990
0
            default:
1991
0
                FAIL;
1992
83.9k
            }
1993
83.9k
            break;
1994
1995
83.9k
        case SRE_OP_ANY:
1996
38.2k
        case SRE_OP_ANY_ALL:
1997
            /* These have no operands */
1998
38.2k
            break;
1999
2000
5.79k
        case SRE_OP_IN:
2001
6.04k
        case SRE_OP_IN_IGNORE:
2002
3.40M
        case SRE_OP_IN_UNI_IGNORE:
2003
3.40M
        case SRE_OP_IN_LOC_IGNORE:
2004
3.40M
            GET_SKIP;
2005
            /* Stop 1 before the end; we check the FAILURE below */
2006
3.40M
            if (_validate_charset(code, code+skip-2))
2007
0
                FAIL;
2008
3.40M
            if (code[skip-2] != SRE_OP_FAILURE)
2009
0
                FAIL;
2010
3.40M
            code += skip-1;
2011
3.40M
            break;
2012
2013
3.74k
        case SRE_OP_INFO:
2014
3.74k
            {
2015
                /* A minimal info field is
2016
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2017
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2018
                   more follows. */
2019
3.74k
                SRE_CODE flags, i;
2020
3.74k
                SRE_CODE *newcode;
2021
3.74k
                GET_SKIP;
2022
3.74k
                newcode = code+skip-1;
2023
3.74k
                GET_ARG; flags = arg;
2024
3.74k
                GET_ARG;
2025
3.74k
                GET_ARG;
2026
                /* Check that only valid flags are present */
2027
3.74k
                if ((flags & ~(SRE_INFO_PREFIX |
2028
3.74k
                               SRE_INFO_LITERAL |
2029
3.74k
                               SRE_INFO_CHARSET)) != 0)
2030
0
                    FAIL;
2031
                /* PREFIX and CHARSET are mutually exclusive */
2032
3.74k
                if ((flags & SRE_INFO_PREFIX) &&
2033
1.64k
                    (flags & SRE_INFO_CHARSET))
2034
0
                    FAIL;
2035
                /* LITERAL implies PREFIX */
2036
3.74k
                if ((flags & SRE_INFO_LITERAL) &&
2037
714
                    !(flags & SRE_INFO_PREFIX))
2038
0
                    FAIL;
2039
                /* Validate the prefix */
2040
3.74k
                if (flags & SRE_INFO_PREFIX) {
2041
1.64k
                    SRE_CODE prefix_len;
2042
1.64k
                    GET_ARG; prefix_len = arg;
2043
1.64k
                    GET_ARG;
2044
                    /* Here comes the prefix string */
2045
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2046
0
                        FAIL;
2047
1.64k
                    code += prefix_len;
2048
                    /* And here comes the overlap table */
2049
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2050
0
                        FAIL;
2051
                    /* Each overlap value should be < prefix_len */
2052
7.14M
                    for (i = 0; i < prefix_len; i++) {
2053
7.14M
                        if (code[i] >= prefix_len)
2054
0
                            FAIL;
2055
7.14M
                    }
2056
1.64k
                    code += prefix_len;
2057
1.64k
                }
2058
                /* Validate the charset */
2059
3.74k
                if (flags & SRE_INFO_CHARSET) {
2060
409
                    if (_validate_charset(code, newcode-1))
2061
0
                        FAIL;
2062
409
                    if (newcode[-1] != SRE_OP_FAILURE)
2063
0
                        FAIL;
2064
409
                    code = newcode;
2065
409
                }
2066
3.33k
                else if (code != newcode) {
2067
0
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
2068
0
                    FAIL;
2069
0
                }
2070
3.74k
            }
2071
3.74k
            break;
2072
2073
28.0k
        case SRE_OP_BRANCH:
2074
28.0k
            {
2075
28.0k
                SRE_CODE *target = NULL;
2076
909k
                for (;;) {
2077
909k
                    GET_SKIP;
2078
909k
                    if (skip == 0)
2079
28.0k
                        break;
2080
                    /* Stop 2 before the end; we check the JUMP below */
2081
881k
                    if (_validate_inner(code, code+skip-3, groups))
2082
0
                        FAIL;
2083
881k
                    code += skip-3;
2084
                    /* Check that it ends with a JUMP, and that each JUMP
2085
                       has the same target */
2086
881k
                    GET_OP;
2087
881k
                    if (op != SRE_OP_JUMP)
2088
0
                        FAIL;
2089
881k
                    GET_SKIP;
2090
881k
                    if (target == NULL)
2091
28.0k
                        target = code+skip-1;
2092
853k
                    else if (code+skip-1 != target)
2093
0
                        FAIL;
2094
881k
                }
2095
28.0k
                if (code != target)
2096
0
                    FAIL;
2097
28.0k
            }
2098
28.0k
            break;
2099
2100
1.27M
        case SRE_OP_REPEAT_ONE:
2101
1.27M
        case SRE_OP_MIN_REPEAT_ONE:
2102
1.27M
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
2103
1.27M
            {
2104
1.27M
                SRE_CODE min, max;
2105
1.27M
                GET_SKIP;
2106
1.27M
                GET_ARG; min = arg;
2107
1.27M
                GET_ARG; max = arg;
2108
1.27M
                if (min > max)
2109
0
                    FAIL;
2110
1.27M
                if (max > SRE_MAXREPEAT)
2111
0
                    FAIL;
2112
1.27M
                if (_validate_inner(code, code+skip-4, groups))
2113
0
                    FAIL;
2114
1.27M
                code += skip-4;
2115
1.27M
                GET_OP;
2116
1.27M
                if (op != SRE_OP_SUCCESS)
2117
0
                    FAIL;
2118
1.27M
            }
2119
1.27M
            break;
2120
2121
1.27M
        case SRE_OP_REPEAT:
2122
41.9k
        case SRE_OP_POSSESSIVE_REPEAT:
2123
41.9k
            {
2124
41.9k
                SRE_CODE op1 = op, min, max;
2125
41.9k
                GET_SKIP;
2126
41.9k
                GET_ARG; min = arg;
2127
41.9k
                GET_ARG; max = arg;
2128
41.9k
                if (min > max)
2129
0
                    FAIL;
2130
41.9k
                if (max > SRE_MAXREPEAT)
2131
0
                    FAIL;
2132
41.9k
                if (_validate_inner(code, code+skip-3, groups))
2133
0
                    FAIL;
2134
41.9k
                code += skip-3;
2135
41.9k
                GET_OP;
2136
41.9k
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
2137
46
                    if (op != SRE_OP_SUCCESS)
2138
0
                        FAIL;
2139
46
                }
2140
41.9k
                else {
2141
41.9k
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
2142
0
                        FAIL;
2143
41.9k
                }
2144
41.9k
            }
2145
41.9k
            break;
2146
2147
41.9k
        case SRE_OP_ATOMIC_GROUP:
2148
157
            {
2149
157
                GET_SKIP;
2150
157
                if (_validate_inner(code, code+skip-2, groups))
2151
0
                    FAIL;
2152
157
                code += skip-2;
2153
157
                GET_OP;
2154
157
                if (op != SRE_OP_SUCCESS)
2155
0
                    FAIL;
2156
157
            }
2157
157
            break;
2158
2159
157
        case SRE_OP_GROUPREF:
2160
849
        case SRE_OP_GROUPREF_IGNORE:
2161
1.60k
        case SRE_OP_GROUPREF_UNI_IGNORE:
2162
1.60k
        case SRE_OP_GROUPREF_LOC_IGNORE:
2163
1.60k
            GET_ARG;
2164
1.60k
            if (arg >= (size_t)groups)
2165
0
                FAIL;
2166
1.60k
            break;
2167
2168
1.60k
        case SRE_OP_GROUPREF_EXISTS:
2169
            /* The regex syntax for this is: '(?(group)then|else)', where
2170
               'group' is either an integer group number or a group name,
2171
               'then' and 'else' are sub-regexes, and 'else' is optional. */
2172
54
            GET_ARG;
2173
54
            if (arg >= (size_t)groups)
2174
0
                FAIL;
2175
54
            GET_SKIP_ADJ(1);
2176
54
            code--; /* The skip is relative to the first arg! */
2177
            /* There are two possibilities here: if there is both a 'then'
2178
               part and an 'else' part, the generated code looks like:
2179
2180
               GROUPREF_EXISTS
2181
               <group>
2182
               <skipyes>
2183
               ...then part...
2184
               JUMP
2185
               <skipno>
2186
               (<skipyes> jumps here)
2187
               ...else part...
2188
               (<skipno> jumps here)
2189
2190
               If there is only a 'then' part, it looks like:
2191
2192
               GROUPREF_EXISTS
2193
               <group>
2194
               <skip>
2195
               ...then part...
2196
               (<skip> jumps here)
2197
2198
               There is no direct way to decide which it is, and we don't want
2199
               to allow arbitrary jumps anywhere in the code; so we just look
2200
               for a JUMP opcode preceding our skip target.
2201
            */
2202
54
            VTRACE(("then part:\n"));
2203
54
            int rc = _validate_inner(code+1, code+skip-1, groups);
2204
54
            if (rc == 1) {
2205
32
                VTRACE(("else part:\n"));
2206
32
                code += skip-2; /* Position after JUMP, at <skipno> */
2207
32
                GET_SKIP;
2208
32
                rc = _validate_inner(code, code+skip-1, groups);
2209
32
            }
2210
54
            if (rc)
2211
0
                FAIL;
2212
54
            code += skip-1;
2213
54
            break;
2214
2215
117
        case SRE_OP_ASSERT:
2216
369
        case SRE_OP_ASSERT_NOT:
2217
369
            GET_SKIP;
2218
369
            GET_ARG; /* 0 for lookahead, width for lookbehind */
2219
369
            code--; /* Back up over arg to simplify math below */
2220
            /* Stop 1 before the end; we check the SUCCESS below */
2221
369
            if (_validate_inner(code+1, code+skip-2, groups))
2222
0
                FAIL;
2223
369
            code += skip-2;
2224
369
            GET_OP;
2225
369
            if (op != SRE_OP_SUCCESS)
2226
0
                FAIL;
2227
369
            break;
2228
2229
369
        case SRE_OP_JUMP:
2230
32
            if (code + 1 != end)
2231
0
                FAIL;
2232
32
            VTRACE(("JUMP: %d\n", __LINE__));
2233
32
            return 1;
2234
2235
0
        default:
2236
0
            FAIL;
2237
2238
25.4M
        }
2239
25.4M
    }
2240
2241
2.20M
    VTRACE(("okay\n"));
2242
2.20M
    return 0;
2243
2.20M
}
2244
2245
static int
2246
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2247
3.74k
{
2248
3.74k
    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2249
3.74k
        code >= end || end[-1] != SRE_OP_SUCCESS)
2250
0
        FAIL;
2251
3.74k
    return _validate_inner(code, end-1, groups);
2252
3.74k
}
2253
2254
static int
2255
_validate(PatternObject *self)
2256
3.74k
{
2257
3.74k
    if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2258
0
    {
2259
0
        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2260
0
        return 0;
2261
0
    }
2262
3.74k
    else
2263
3.74k
        VTRACE(("Success!\n"));
2264
3.74k
    return 1;
2265
3.74k
}
2266
2267
/* -------------------------------------------------------------------- */
2268
/* match methods */
2269
2270
static int
2271
match_traverse(PyObject *op, visitproc visit, void *arg)
2272
48.5k
{
2273
48.5k
    MatchObject *self = _MatchObject_CAST(op);
2274
48.5k
    Py_VISIT(Py_TYPE(self));
2275
48.5k
    Py_VISIT(self->string);
2276
48.5k
    Py_VISIT(self->regs);
2277
48.5k
    Py_VISIT(self->pattern);
2278
48.5k
    return 0;
2279
48.5k
}
2280
2281
static int
2282
match_clear(PyObject *op)
2283
54.3M
{
2284
54.3M
    MatchObject *self = _MatchObject_CAST(op);
2285
54.3M
    Py_CLEAR(self->string);
2286
54.3M
    Py_CLEAR(self->regs);
2287
54.3M
    Py_CLEAR(self->pattern);
2288
54.3M
    return 0;
2289
54.3M
}
2290
2291
static void
2292
match_dealloc(PyObject *self)
2293
54.3M
{
2294
54.3M
    PyTypeObject *tp = Py_TYPE(self);
2295
54.3M
    PyObject_GC_UnTrack(self);
2296
54.3M
    (void)match_clear(self);
2297
54.3M
    tp->tp_free(self);
2298
54.3M
    Py_DECREF(tp);
2299
54.3M
}
2300
2301
static PyObject*
2302
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2303
52.5M
{
2304
52.5M
    Py_ssize_t length;
2305
52.5M
    int isbytes, charsize;
2306
52.5M
    Py_buffer view;
2307
52.5M
    PyObject *result;
2308
52.5M
    const void* ptr;
2309
52.5M
    Py_ssize_t i, j;
2310
2311
52.5M
    assert(0 <= index && index < self->groups);
2312
52.5M
    index *= 2;
2313
2314
52.5M
    if (self->string == Py_None || self->mark[index] < 0) {
2315
        /* return default value if the string or group is undefined */
2316
9.74M
        return Py_NewRef(def);
2317
9.74M
    }
2318
2319
42.7M
    ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2320
42.7M
    if (ptr == NULL)
2321
0
        return NULL;
2322
2323
42.7M
    i = self->mark[index];
2324
42.7M
    j = self->mark[index+1];
2325
42.7M
    i = Py_MIN(i, length);
2326
42.7M
    j = Py_MIN(j, length);
2327
42.7M
    result = getslice(isbytes, ptr, self->string, i, j);
2328
42.7M
    if (isbytes && view.buf != NULL)
2329
394k
        PyBuffer_Release(&view);
2330
42.7M
    return result;
2331
42.7M
}
2332
2333
static Py_ssize_t
2334
match_getindex(MatchObject* self, PyObject* index)
2335
72.2M
{
2336
72.2M
    Py_ssize_t i;
2337
2338
72.2M
    if (index == NULL)
2339
        /* Default value */
2340
19.0M
        return 0;
2341
2342
53.2M
    if (PyIndex_Check(index)) {
2343
35.2M
        i = PyNumber_AsSsize_t(index, NULL);
2344
35.2M
    }
2345
17.9M
    else {
2346
17.9M
        i = -1;
2347
2348
17.9M
        if (self->pattern->groupindex) {
2349
17.9M
            index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2350
17.9M
            if (index && PyLong_Check(index)) {
2351
17.9M
                i = PyLong_AsSsize_t(index);
2352
17.9M
            }
2353
17.9M
        }
2354
17.9M
    }
2355
53.2M
    if (i < 0 || i >= self->groups) {
2356
        /* raise IndexError if we were given a bad group number */
2357
0
        if (!PyErr_Occurred()) {
2358
0
            PyErr_SetString(PyExc_IndexError, "no such group");
2359
0
        }
2360
0
        return -1;
2361
0
    }
2362
2363
    // Check that i*2 cannot overflow to make static analyzers happy
2364
53.2M
    assert((size_t)i <= SRE_MAXGROUPS);
2365
53.2M
    return i;
2366
53.2M
}
2367
2368
static PyObject*
2369
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2370
52.5M
{
2371
52.5M
    Py_ssize_t i = match_getindex(self, index);
2372
2373
52.5M
    if (i < 0) {
2374
0
        return NULL;
2375
0
    }
2376
2377
52.5M
    return match_getslice_by_index(self, i, def);
2378
52.5M
}
2379
2380
/*[clinic input]
2381
@permit_long_summary
2382
_sre.SRE_Match.expand
2383
2384
    template: object
2385
2386
Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2387
[clinic start generated code]*/
2388
2389
static PyObject *
2390
_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2391
/*[clinic end generated code: output=931b58ccc323c3a1 input=dc74d81265376ac3]*/
2392
0
{
2393
0
    _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2394
0
    PyObject *filter = compile_template(module_state, self->pattern, template);
2395
0
    if (filter == NULL) {
2396
0
        return NULL;
2397
0
    }
2398
0
    PyObject *result = expand_template((TemplateObject *)filter, self);
2399
0
    Py_DECREF(filter);
2400
0
    return result;
2401
0
}
2402
2403
/* Implementation of Match.group(): `args` is the tuple of positional
   arguments.  With no argument group 0 is returned, with one argument
   that group, and with several arguments a tuple with one entry per
   argument.  Undefined groups yield None.  Returns NULL on error. */
static PyObject*
match_group(PyObject *op, PyObject* args)
{
    MatchObject *self = _MatchObject_CAST(op);
    const Py_ssize_t nargs = PyTuple_GET_SIZE(args);

    if (nargs == 0) {
        return match_getslice(self, _PyLong_GetZero(), Py_None);
    }
    if (nargs == 1) {
        return match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
    }

    /* fetch multiple items */
    PyObject *items = PyTuple_New(nargs);
    if (items == NULL) {
        return NULL;
    }
    for (Py_ssize_t pos = 0; pos < nargs; pos++) {
        PyObject *slice = match_getslice(self, PyTuple_GET_ITEM(args, pos),
                                         Py_None);
        if (slice == NULL) {
            Py_DECREF(items);
            return NULL;
        }
        /* The tuple takes over the reference to the slice. */
        PyTuple_SET_ITEM(items, pos, slice);
    }
    return items;
}
2438
2439
/* Subscript support for match objects: returns the group designated by
   `name`, using None for an undefined group. */
static PyObject*
match_getitem(PyObject *op, PyObject* name)
{
    MatchObject *match = _MatchObject_CAST(op);

    return match_getslice(match, name, Py_None);
}
2445
2446
/*[clinic input]
2447
_sre.SRE_Match.groups
2448
2449
    default: object = None
2450
        Is used for groups that did not participate in the match.
2451
2452
Return a tuple containing all the subgroups of the match, from 1.
2453
[clinic start generated code]*/
2454
2455
static PyObject *
2456
_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2457
/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2458
323
{
2459
323
    PyObject* result;
2460
323
    Py_ssize_t index;
2461
2462
323
    result = PyTuple_New(self->groups-1);
2463
323
    if (!result)
2464
0
        return NULL;
2465
2466
2.74k
    for (index = 1; index < self->groups; index++) {
2467
2.42k
        PyObject* item;
2468
2.42k
        item = match_getslice_by_index(self, index, default_value);
2469
2.42k
        if (!item) {
2470
0
            Py_DECREF(result);
2471
0
            return NULL;
2472
0
        }
2473
2.42k
        PyTuple_SET_ITEM(result, index-1, item);
2474
2.42k
    }
2475
2476
323
    return result;
2477
323
}
2478
2479
/*[clinic input]
2480
@permit_long_summary
2481
_sre.SRE_Match.groupdict
2482
2483
    default: object = None
2484
        Is used for groups that did not participate in the match.
2485
2486
Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2487
[clinic start generated code]*/
2488
2489
static PyObject *
2490
_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2491
/*[clinic end generated code: output=29917c9073e41757 input=a8d3a1dc80336872]*/
2492
122
{
2493
122
    PyObject *result;
2494
122
    PyObject *key;
2495
122
    PyObject *value;
2496
122
    Py_ssize_t pos = 0;
2497
122
    Py_hash_t hash;
2498
2499
122
    result = PyDict_New();
2500
122
    if (!result || !self->pattern->groupindex)
2501
0
        return result;
2502
2503
122
    Py_BEGIN_CRITICAL_SECTION(self->pattern->groupindex);
2504
806
    while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2505
684
        int status;
2506
684
        Py_INCREF(key);
2507
684
        value = match_getslice(self, key, default_value);
2508
684
        if (!value) {
2509
0
            Py_DECREF(key);
2510
0
            Py_CLEAR(result);
2511
0
            goto exit;
2512
0
        }
2513
684
        status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2514
684
        Py_DECREF(value);
2515
684
        Py_DECREF(key);
2516
684
        if (status < 0) {
2517
0
            Py_CLEAR(result);
2518
0
            goto exit;
2519
0
        }
2520
684
    }
2521
122
exit:;
2522
122
    Py_END_CRITICAL_SECTION();
2523
2524
122
    return result;
2525
122
}
2526
2527
/*[clinic input]
2528
_sre.SRE_Match.start -> Py_ssize_t
2529
2530
    group: object(c_default="NULL") = 0
2531
    /
2532
2533
Return index of the start of the substring matched by group.
2534
[clinic start generated code]*/
2535
2536
static Py_ssize_t
2537
_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2538
/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2539
5.27M
{
2540
5.27M
    Py_ssize_t index = match_getindex(self, group);
2541
2542
5.27M
    if (index < 0) {
2543
0
        return -1;
2544
0
    }
2545
2546
    /* mark is -1 if group is undefined */
2547
5.27M
    return self->mark[index*2];
2548
5.27M
}
2549
2550
/*[clinic input]
2551
_sre.SRE_Match.end -> Py_ssize_t
2552
2553
    group: object(c_default="NULL") = 0
2554
    /
2555
2556
Return index of the end of the substring matched by group.
2557
[clinic start generated code]*/
2558
2559
static Py_ssize_t
2560
_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2561
/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2562
11.6M
{
2563
11.6M
    Py_ssize_t index = match_getindex(self, group);
2564
2565
11.6M
    if (index < 0) {
2566
0
        return -1;
2567
0
    }
2568
2569
    /* mark is -1 if group is undefined */
2570
11.6M
    return self->mark[index*2+1];
2571
11.6M
}
2572
2573
/* Build the 2-tuple (i1, i2) of Python integers.  Returns NULL if
   either integer or the tuple cannot be created. */
LOCAL(PyObject*)
_pair(Py_ssize_t i1, Py_ssize_t i2)
{
    PyObject *first = PyLong_FromSsize_t(i1);
    if (first == NULL) {
        return NULL;
    }

    PyObject *second = PyLong_FromSsize_t(i2);
    if (second == NULL) {
        Py_DECREF(first);
        return NULL;
    }

    /* Both references are handed over to the helper. */
    return _PyTuple_FromPairSteal(first, second);
}
2588
2589
/*[clinic input]
2590
_sre.SRE_Match.span
2591
2592
    group: object(c_default="NULL") = 0
2593
    /
2594
2595
For match object m, return the 2-tuple (m.start(group), m.end(group)).
2596
[clinic start generated code]*/
2597
2598
static PyObject *
2599
_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2600
/*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
2601
2.87M
{
2602
2.87M
    Py_ssize_t index = match_getindex(self, group);
2603
2604
2.87M
    if (index < 0) {
2605
0
        return NULL;
2606
0
    }
2607
2608
    /* marks are -1 if group is undefined */
2609
2.87M
    return _pair(self->mark[index*2], self->mark[index*2+1]);
2610
2.87M
}
2611
2612
static PyObject*
2613
match_regs(MatchObject* self)
2614
0
{
2615
0
    PyObject* regs;
2616
0
    PyObject* item;
2617
0
    Py_ssize_t index;
2618
2619
0
    regs = PyTuple_New(self->groups);
2620
0
    if (!regs)
2621
0
        return NULL;
2622
2623
0
    for (index = 0; index < self->groups; index++) {
2624
0
        item = _pair(self->mark[index*2], self->mark[index*2+1]);
2625
0
        if (!item) {
2626
0
            Py_DECREF(regs);
2627
0
            return NULL;
2628
0
        }
2629
0
        PyTuple_SET_ITEM(regs, index, item);
2630
0
    }
2631
2632
0
    self->regs = Py_NewRef(regs);
2633
2634
0
    return regs;
2635
0
}
2636
2637
/*[clinic input]
2638
_sre.SRE_Match.__copy__
2639
2640
[clinic start generated code]*/
2641
2642
static PyObject *
2643
_sre_SRE_Match___copy___impl(MatchObject *self)
2644
/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2645
0
{
2646
0
    return Py_NewRef(self);
2647
0
}
2648
2649
/*[clinic input]
2650
_sre.SRE_Match.__deepcopy__
2651
2652
    memo: object
2653
    /
2654
2655
[clinic start generated code]*/
2656
2657
static PyObject *
2658
_sre_SRE_Match___deepcopy___impl(MatchObject *self, PyObject *memo)
2659
/*[clinic end generated code: output=2b657578eb03f4a3 input=779d12a31c2c325e]*/
2660
0
{
2661
0
    return Py_NewRef(self);
2662
0
}
2663
2664
PyDoc_STRVAR(match_doc,
2665
"The result of re.search(), re.prefixmatch(), and re.fullmatch().\n\
2666
Match objects always have a boolean value of True.");
2667
2668
PyDoc_STRVAR(match_group_doc,
2669
"group([group1, ...]) -> str or tuple.\n\
2670
    Return subgroup(s) of the match by indices or names.\n\
2671
    For 0 returns the entire match.");
2672
2673
/* Getter for Match.lastindex: the stored lastindex as a Python integer,
   or None when it is negative. */
static PyObject *
match_lastindex_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->lastindex < 0) {
        Py_RETURN_NONE;
    }
    return PyLong_FromSsize_t(match->lastindex);
}
2681
2682
/* Getter for Match.lastgroup: the entry of the pattern's indexgroup
   tuple at position lastindex, or None when the pattern has no
   indexgroup or lastindex is outside that tuple. */
static PyObject *
match_lastgroup_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);
    PyObject *names = match->pattern->indexgroup;

    if (!names) {
        Py_RETURN_NONE;
    }
    if (match->lastindex < 0) {
        Py_RETURN_NONE;
    }
    if (match->lastindex >= PyTuple_GET_SIZE(names)) {
        Py_RETURN_NONE;
    }

    return Py_NewRef(PyTuple_GET_ITEM(names, match->lastindex));
}
2696
2697
/* Getter for Match.regs: returns the stored regs tuple if there is one,
   otherwise builds it with match_regs(). */
static PyObject *
match_regs_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->regs == NULL) {
        return match_regs(match);
    }
    return Py_NewRef(match->regs);
}
2706
2707
/* repr() of a Match: type name, the span of group 0 and the repr of the
   group-0 slice (limited by the %.50R precision). */
static PyObject *
match_repr(PyObject *op)
{
    MatchObject *match = _MatchObject_CAST(op);

    PyObject *whole = match_getslice_by_index(match, 0, Py_None);
    if (whole == NULL) {
        return NULL;
    }

    PyObject *text = PyUnicode_FromFormat(
            "<%s object; span=(%zd, %zd), match=%.50R>",
            Py_TYPE(match)->tp_name,
            match->mark[0], match->mark[1], whole);
    Py_DECREF(whole);
    return text;
}
2722
2723
2724
static PyObject*
2725
pattern_new_match(_sremodulestate* module_state,
2726
                  PatternObject* pattern,
2727
                  SRE_STATE* state,
2728
                  Py_ssize_t status)
2729
73.3M
{
2730
    /* create match object (from state object) */
2731
2732
73.3M
    MatchObject* match;
2733
73.3M
    Py_ssize_t i, j;
2734
73.3M
    char* base;
2735
73.3M
    int n;
2736
2737
73.3M
    if (status > 0) {
2738
2739
        /* create match object (with room for extra group marks) */
2740
        /* coverity[ampersand_in_size] */
2741
54.3M
        match = PyObject_GC_NewVar(MatchObject,
2742
54.3M
                                   module_state->Match_Type,
2743
54.3M
                                   2*(pattern->groups+1));
2744
54.3M
        if (!match)
2745
0
            return NULL;
2746
2747
54.3M
        Py_INCREF(pattern);
2748
54.3M
        match->pattern = pattern;
2749
2750
54.3M
        match->string = Py_NewRef(state->string);
2751
2752
54.3M
        match->regs = NULL;
2753
54.3M
        match->groups = pattern->groups+1;
2754
2755
        /* fill in group slices */
2756
2757
54.3M
        base = (char*) state->beginning;
2758
54.3M
        n = state->charsize;
2759
2760
54.3M
        match->mark[0] = ((char*) state->start - base) / n;
2761
54.3M
        match->mark[1] = ((char*) state->ptr - base) / n;
2762
2763
108M
        for (i = j = 0; i < pattern->groups; i++, j+=2)
2764
54.1M
            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2765
43.8M
                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2766
43.8M
                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2767
2768
                /* check wrong span */
2769
43.8M
                if (match->mark[j+2] > match->mark[j+3]) {
2770
0
                    PyErr_SetString(PyExc_SystemError,
2771
0
                                    "The span of capturing group is wrong,"
2772
0
                                    " please report a bug for the re module.");
2773
0
                    Py_DECREF(match);
2774
0
                    return NULL;
2775
0
                }
2776
43.8M
            } else
2777
10.2M
                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2778
2779
54.3M
        match->pos = state->pos;
2780
54.3M
        match->endpos = state->endpos;
2781
2782
54.3M
        match->lastindex = state->lastindex;
2783
2784
54.3M
        PyObject_GC_Track(match);
2785
54.3M
        return (PyObject*) match;
2786
2787
54.3M
    } else if (status == 0) {
2788
2789
        /* no match */
2790
19.0M
        Py_RETURN_NONE;
2791
2792
19.0M
    }
2793
2794
    /* internal error */
2795
0
    pattern_error(status);
2796
0
    return NULL;
2797
73.3M
}
2798
2799
2800
/* -------------------------------------------------------------------- */
2801
/* scanner methods (experimental) */
2802
2803
static int
2804
scanner_traverse(PyObject *op, visitproc visit, void *arg)
2805
946
{
2806
946
    ScannerObject *self = _ScannerObject_CAST(op);
2807
946
    Py_VISIT(Py_TYPE(self));
2808
946
    Py_VISIT(self->pattern);
2809
946
    return 0;
2810
946
}
2811
2812
static int
2813
scanner_clear(PyObject *op)
2814
352k
{
2815
352k
    ScannerObject *self = _ScannerObject_CAST(op);
2816
352k
    Py_CLEAR(self->pattern);
2817
352k
    return 0;
2818
352k
}
2819
2820
static void
2821
scanner_dealloc(PyObject *self)
2822
352k
{
2823
352k
    PyTypeObject *tp = Py_TYPE(self);
2824
352k
    PyObject_GC_UnTrack(self);
2825
352k
    ScannerObject *scanner = _ScannerObject_CAST(self);
2826
352k
    state_fini(&scanner->state);
2827
352k
    (void)scanner_clear(self);
2828
352k
    tp->tp_free(self);
2829
352k
    Py_DECREF(tp);
2830
352k
}
2831
2832
static int
2833
scanner_begin(ScannerObject* self)
2834
3.21M
{
2835
#ifdef Py_GIL_DISABLED
2836
    int was_executing = _Py_atomic_exchange_int(&self->executing, 1);
2837
#else
2838
3.21M
    int was_executing = self->executing;
2839
3.21M
    self->executing = 1;
2840
3.21M
#endif
2841
3.21M
    if (was_executing) {
2842
0
        PyErr_SetString(PyExc_ValueError,
2843
0
                        "regular expression scanner already executing");
2844
0
        return 0;
2845
0
    }
2846
3.21M
    return 1;
2847
3.21M
}
2848
2849
static void
2850
scanner_end(ScannerObject* self)
2851
3.21M
{
2852
3.21M
    assert(FT_ATOMIC_LOAD_INT_RELAXED(self->executing));
2853
3.21M
    FT_ATOMIC_STORE_INT(self->executing, 0);
2854
3.21M
}
2855
2856
/*[clinic input]
2857
_sre.SRE_Scanner.prefixmatch
2858
2859
    cls: defining_class
2860
    /
2861
2862
[clinic start generated code]*/
2863
2864
static PyObject *
2865
_sre_SRE_Scanner_prefixmatch_impl(ScannerObject *self, PyTypeObject *cls)
2866
/*[clinic end generated code: output=02b3b9d2954a2157 input=3049b20466c56a8e]*/
2867
0
{
2868
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2869
0
    SRE_STATE* state = &self->state;
2870
0
    PyObject* match;
2871
0
    Py_ssize_t status;
2872
2873
0
    if (!scanner_begin(self)) {
2874
0
        return NULL;
2875
0
    }
2876
0
    if (state->start == NULL) {
2877
0
        scanner_end(self);
2878
0
        Py_RETURN_NONE;
2879
0
    }
2880
2881
0
    state_reset(state);
2882
2883
0
    state->ptr = state->start;
2884
2885
0
    status = sre_match(state, PatternObject_GetCode(self->pattern));
2886
0
    if (PyErr_Occurred()) {
2887
0
        scanner_end(self);
2888
0
        return NULL;
2889
0
    }
2890
2891
0
    match = pattern_new_match(module_state, self->pattern,
2892
0
                              state, status);
2893
2894
0
    if (status == 0)
2895
0
        state->start = NULL;
2896
0
    else {
2897
0
        state->must_advance = (state->ptr == state->start);
2898
0
        state->start = state->ptr;
2899
0
    }
2900
2901
0
    scanner_end(self);
2902
0
    return match;
2903
0
}
2904
2905
2906
/*[clinic input]
2907
_sre.SRE_Scanner.search
2908
2909
    cls: defining_class
2910
    /
2911
2912
[clinic start generated code]*/
2913
2914
static PyObject *
2915
_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2916
/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2917
3.21M
{
2918
3.21M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2919
3.21M
    SRE_STATE* state = &self->state;
2920
3.21M
    PyObject* match;
2921
3.21M
    Py_ssize_t status;
2922
2923
3.21M
    if (!scanner_begin(self)) {
2924
0
        return NULL;
2925
0
    }
2926
3.21M
    if (state->start == NULL) {
2927
0
        scanner_end(self);
2928
0
        Py_RETURN_NONE;
2929
0
    }
2930
2931
3.21M
    state_reset(state);
2932
2933
3.21M
    state->ptr = state->start;
2934
2935
3.21M
    status = sre_search(state, PatternObject_GetCode(self->pattern));
2936
3.21M
    if (PyErr_Occurred()) {
2937
0
        scanner_end(self);
2938
0
        return NULL;
2939
0
    }
2940
2941
3.21M
    match = pattern_new_match(module_state, self->pattern,
2942
3.21M
                              state, status);
2943
2944
3.21M
    if (status == 0)
2945
352k
        state->start = NULL;
2946
2.85M
    else {
2947
2.85M
        state->must_advance = (state->ptr == state->start);
2948
2.85M
        state->start = state->ptr;
2949
2.85M
    }
2950
2951
3.21M
    scanner_end(self);
2952
3.21M
    return match;
2953
3.21M
}
2954
2955
static PyObject *
2956
pattern_scanner(_sremodulestate *module_state,
2957
                PatternObject *self,
2958
                PyObject *string,
2959
                Py_ssize_t pos,
2960
                Py_ssize_t endpos)
2961
352k
{
2962
352k
    ScannerObject* scanner;
2963
2964
    /* create scanner object */
2965
352k
    scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2966
352k
    if (!scanner)
2967
0
        return NULL;
2968
352k
    scanner->pattern = NULL;
2969
352k
    scanner->executing = 0;
2970
2971
    /* create search state object */
2972
352k
    if (!state_init(&scanner->state, self, string, pos, endpos)) {
2973
0
        Py_DECREF(scanner);
2974
0
        return NULL;
2975
0
    }
2976
2977
352k
    Py_INCREF(self);
2978
352k
    scanner->pattern = self;
2979
2980
352k
    PyObject_GC_Track(scanner);
2981
352k
    return (PyObject*) scanner;
2982
352k
}
2983
2984
/* -------------------------------------------------------------------- */
2985
/* template methods */
2986
2987
static int
2988
template_traverse(PyObject *op, visitproc visit, void *arg)
2989
0
{
2990
0
    TemplateObject *self = _TemplateObject_CAST(op);
2991
0
    Py_VISIT(Py_TYPE(self));
2992
0
    Py_VISIT(self->literal);
2993
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
2994
0
        Py_VISIT(self->items[i].literal);
2995
0
    }
2996
0
    return 0;
2997
0
}
2998
2999
static int
3000
template_clear(PyObject *op)
3001
0
{
3002
0
    TemplateObject *self = _TemplateObject_CAST(op);
3003
0
    Py_CLEAR(self->literal);
3004
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3005
0
        Py_CLEAR(self->items[i].literal);
3006
0
    }
3007
0
    return 0;
3008
0
}
3009
3010
static void
3011
template_dealloc(PyObject *self)
3012
0
{
3013
0
    PyTypeObject *tp = Py_TYPE(self);
3014
0
    PyObject_GC_UnTrack(self);
3015
0
    (void)template_clear(self);
3016
0
    tp->tp_free(self);
3017
0
    Py_DECREF(tp);
3018
0
}
3019
3020
static PyObject *
3021
expand_template(TemplateObject *self, MatchObject *match)
3022
0
{
3023
0
    if (Py_SIZE(self) == 0) {
3024
0
        return Py_NewRef(self->literal);
3025
0
    }
3026
3027
0
    PyObject *result = NULL;
3028
0
    Py_ssize_t count = 0;  // the number of non-empty chunks
3029
    /* For small number of strings use a buffer allocated on the stack,
3030
     * otherwise use a list object. */
3031
0
    PyObject *buffer[10];
3032
0
    PyObject **out = buffer;
3033
0
    PyObject *list = NULL;
3034
0
    if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
3035
0
        !PyUnicode_Check(self->literal))
3036
0
    {
3037
0
        list = PyList_New(self->chunks);
3038
0
        if (!list) {
3039
0
            return NULL;
3040
0
        }
3041
0
        out = &PyList_GET_ITEM(list, 0);
3042
0
    }
3043
3044
0
    out[count++] = Py_NewRef(self->literal);
3045
0
    for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
3046
0
        Py_ssize_t index = self->items[i].index;
3047
0
        if (index >= match->groups) {
3048
0
            PyErr_SetString(PyExc_IndexError, "no such group");
3049
0
            goto cleanup;
3050
0
        }
3051
0
        PyObject *item = match_getslice_by_index(match, index, Py_None);
3052
0
        if (item == NULL) {
3053
0
            goto cleanup;
3054
0
        }
3055
0
        if (item != Py_None) {
3056
0
            out[count++] = Py_NewRef(item);
3057
0
        }
3058
0
        Py_DECREF(item);
3059
3060
0
        PyObject *literal = self->items[i].literal;
3061
0
        if (literal != NULL) {
3062
0
            out[count++] = Py_NewRef(literal);
3063
0
        }
3064
0
    }
3065
3066
0
    if (PyUnicode_Check(self->literal)) {
3067
0
        result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
3068
0
    }
3069
0
    else {
3070
0
        Py_SET_SIZE(list, count);
3071
0
        result = PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
3072
0
    }
3073
3074
0
cleanup:
3075
0
    if (list) {
3076
0
        Py_DECREF(list);
3077
0
    }
3078
0
    else {
3079
0
        for (Py_ssize_t i = 0; i < count; i++) {
3080
0
            Py_DECREF(out[i]);
3081
0
        }
3082
0
    }
3083
0
    return result;
3084
0
}
3085
3086
3087
static Py_hash_t
3088
pattern_hash(PyObject *op)
3089
0
{
3090
0
    PatternObject *self = _PatternObject_CAST(op);
3091
3092
0
    Py_hash_t hash, hash2;
3093
3094
0
    hash = PyObject_Hash(self->pattern);
3095
0
    if (hash == -1) {
3096
0
        return -1;
3097
0
    }
3098
3099
0
    hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize);
3100
0
    hash ^= hash2;
3101
3102
0
    hash ^= self->flags;
3103
0
    hash ^= self->isbytes;
3104
0
    hash ^= self->codesize;
3105
3106
0
    if (hash == -1) {
3107
0
        hash = -2;
3108
0
    }
3109
0
    return hash;
3110
0
}
3111
3112
/* tp_richcompare for Pattern.  Only == and != are supported, and only
   against an object whose type is exactly Pattern_Type; everything else
   yields NotImplemented. */
static PyObject*
pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
{
    PyTypeObject *tp = Py_TYPE(lefto);
    _sremodulestate *module_state = get_sre_module_state_by_class(tp);

    if (op != Py_EQ && op != Py_NE) {
        Py_RETURN_NOTIMPLEMENTED;
    }
    if (!Py_IS_TYPE(righto, module_state->Pattern_Type)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    int equal;
    if (lefto == righto) {
        /* a pattern is equal to itself */
        equal = 1;
    }
    else {
        PatternObject *a = (PatternObject *)lefto;
        PatternObject *b = (PatternObject *)righto;

        equal = 0;
        /* Compare the code and the pattern because the same pattern can
           produce different codes depending on the locale used to compile
           the pattern when the re.LOCALE flag is used. Don't compare
           groups, indexgroup nor groupindex: they are derived from the
           pattern. */
        if (a->flags == b->flags
            && a->isbytes == b->isbytes
            && a->codesize == b->codesize
            && memcmp(a->code, b->code,
                      sizeof(a->code[0]) * a->codesize) == 0)
        {
            equal = PyObject_RichCompareBool(a->pattern, b->pattern, Py_EQ);
            if (equal < 0) {
                return NULL;
            }
        }
    }

    return PyBool_FromLong(op == Py_NE ? !equal : equal);
}
3160
3161
#include "clinic/sre.c.h"
3162
3163
/* Method table of the Pattern type (installed through Py_tp_methods in
 * pattern_slots).  Entry 0 must remain "prefixmatch" and entry 1 its
 * "match" alias: _assert_match_aliases_prefixmatch() checks exactly
 * those two positions in Py_DEBUG builds. */
static PyMethodDef pattern_methods[] = {
3164
    _SRE_SRE_PATTERN_PREFIXMATCH_METHODDEF
3165
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3166
     * to avoid duplicating the argument parsing boilerplate code. */
3167
    {"match", _PyCFunction_CAST(_sre_SRE_Pattern_prefixmatch),
3168
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3169
     _sre_SRE_Pattern_prefixmatch__doc__},
3170
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
3171
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
3172
    _SRE_SRE_PATTERN_SUB_METHODDEF
3173
    _SRE_SRE_PATTERN_SUBN_METHODDEF
3174
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
3175
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
3176
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
3177
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
3178
    _SRE_SRE_PATTERN___COPY___METHODDEF
3179
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3180
    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
3181
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3182
     PyDoc_STR("See PEP 585")},
3183
    {NULL, NULL}
3184
};
3185
3186
/* Computed attributes of the Pattern type (installed through
 * Py_tp_getset in pattern_slots).  No setter is provided. */
static PyGetSetDef pattern_getset[] = {
3187
    {"groupindex", pattern_groupindex, NULL,
3188
      "A dictionary mapping group names to group numbers."},
3189
    {NULL}  /* Sentinel */
3190
};
3191
3192
/* Read-only struct members exposed on the Pattern type (installed
 * through Py_tp_members in pattern_slots).  PAT_OFF(x) is the byte
 * offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)
3193
static PyMemberDef pattern_members[] = {
3194
    {"pattern",    _Py_T_OBJECT,    PAT_OFF(pattern),       Py_READONLY,
3195
     "The pattern string from which the RE object was compiled."},
3196
    {"flags",      Py_T_INT,       PAT_OFF(flags),         Py_READONLY,
3197
     "The regex matching flags."},
3198
    {"groups",     Py_T_PYSSIZET,  PAT_OFF(groups),        Py_READONLY,
3199
     "The number of capturing groups in the pattern."},
3200
    {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PatternObject, weakreflist), Py_READONLY},
3201
    {NULL}  /* Sentinel */
3202
};
3203
3204
/* Type slots of the Pattern heap type; referenced by pattern_spec. */
static PyType_Slot pattern_slots[] = {
3205
    {Py_tp_dealloc, pattern_dealloc},
3206
    {Py_tp_repr, pattern_repr},
3207
    {Py_tp_hash, pattern_hash},
3208
    {Py_tp_doc, (void *)pattern_doc},
3209
    {Py_tp_richcompare, pattern_richcompare},
3210
    {Py_tp_methods, pattern_methods},
3211
    {Py_tp_members, pattern_members},
3212
    {Py_tp_getset, pattern_getset},
3213
    {Py_tp_traverse, pattern_traverse},
3214
    {Py_tp_clear, pattern_clear},
3215
    {0, NULL},
3216
};
3217
3218
/* Spec from which sre_exec() creates the "re.Pattern" heap type.
 * Instances are variable-sized with SRE_CODE-sized items. */
static PyType_Spec pattern_spec = {
3219
    .name = "re.Pattern",
3220
    .basicsize = sizeof(PatternObject),
3221
    .itemsize = sizeof(SRE_CODE),
3222
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3223
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3224
    .slots = pattern_slots,
3225
};
3226
3227
/* Method table of the Match type (installed through Py_tp_methods in
 * match_slots).  "group" is the only hand-written METH_VARARGS entry;
 * the other methods come from Argument Clinic METHODDEF macros. */
static PyMethodDef match_methods[] = {
3228
    {"group", match_group, METH_VARARGS, match_group_doc},
3229
    _SRE_SRE_MATCH_START_METHODDEF
3230
    _SRE_SRE_MATCH_END_METHODDEF
3231
    _SRE_SRE_MATCH_SPAN_METHODDEF
3232
    _SRE_SRE_MATCH_GROUPS_METHODDEF
3233
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3234
    _SRE_SRE_MATCH_EXPAND_METHODDEF
3235
    _SRE_SRE_MATCH___COPY___METHODDEF
3236
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3237
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3238
     PyDoc_STR("See PEP 585")},
3239
    {NULL, NULL}
3240
};
3241
3242
/* Computed attributes of the Match type (installed through Py_tp_getset
 * in match_slots).  All three are getter-only; "regs" has no docstring. */
static PyGetSetDef match_getset[] = {
3243
    {"lastindex", match_lastindex_get, NULL,
3244
     "The integer index of the last matched capturing group."},
3245
    {"lastgroup", match_lastgroup_get, NULL,
3246
     "The name of the last matched capturing group."},
3247
    {"regs", match_regs_get, NULL, NULL},
3248
    {NULL}
3249
};
3250
3251
/* Read-only struct members exposed on the Match type (installed through
 * Py_tp_members in match_slots).  MATCH_OFF(x) is the byte offset of
 * field x inside MatchObject; note that attribute "re" maps to the
 * `pattern` field. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
3252
static PyMemberDef match_members[] = {
3253
    {"string",  _Py_T_OBJECT,   MATCH_OFF(string),  Py_READONLY,
3254
     "The string passed to match() or search()."},
3255
    {"re",      _Py_T_OBJECT,   MATCH_OFF(pattern), Py_READONLY,
3256
     "The regular expression object."},
3257
    {"pos",     Py_T_PYSSIZET, MATCH_OFF(pos),     Py_READONLY,
3258
     "The index into the string at which the RE engine started looking for a match."},
3259
    {"endpos",  Py_T_PYSSIZET, MATCH_OFF(endpos),  Py_READONLY,
3260
     "The index into the string beyond which the RE engine will not go."},
3261
    {NULL}
3262
};
3263
3264
/* FIXME: implement setattr("string", None) as a special case (to
3265
   detach the associated string, if any) */
3266
/* Type slots of the Match heap type; referenced by match_spec. */
static PyType_Slot match_slots[] = {
3267
    {Py_tp_dealloc, match_dealloc},
3268
    {Py_tp_repr, match_repr},
3269
    {Py_tp_doc, (void *)match_doc},
3270
    {Py_tp_methods, match_methods},
3271
    {Py_tp_members, match_members},
3272
    {Py_tp_getset, match_getset},
3273
    {Py_tp_traverse, match_traverse},
3274
    {Py_tp_clear, match_clear},
3275
3276
    /* As mapping.
3277
     *
3278
     * Match objects do not support length or assignment, but do support
3279
     * __getitem__.
3280
     */
3281
    {Py_mp_subscript, match_getitem},
3282
3283
    {0, NULL},
3284
};
3285
3286
/* Spec from which sre_exec() creates the "re.Match" heap type.
 * Instances are variable-sized with Py_ssize_t-sized items;
 * pattern_new_match() allocates 2*(groups+1) of them. */
static PyType_Spec match_spec = {
3287
    .name = "re.Match",
3288
    .basicsize = sizeof(MatchObject),
3289
    .itemsize = sizeof(Py_ssize_t),
3290
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3291
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3292
    .slots = match_slots,
3293
};
3294
3295
/* Method table of the Scanner type (installed through Py_tp_methods in
 * scanner_slots).  Entry 0 must remain "prefixmatch" and entry 1 its
 * "match" alias: _assert_match_aliases_prefixmatch() checks exactly
 * those two positions in Py_DEBUG builds. */
static PyMethodDef scanner_methods[] = {
3296
    _SRE_SRE_SCANNER_PREFIXMATCH_METHODDEF
3297
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3298
     * to avoid duplicating the argument parsing boilerplate code. */
3299
    {"match", _PyCFunction_CAST(_sre_SRE_Scanner_prefixmatch),
3300
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3301
     _sre_SRE_Scanner_prefixmatch__doc__},
3302
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
3303
    {NULL, NULL}
3304
};
3305
3306
/* Read-only struct members exposed on the Scanner type (installed
 * through Py_tp_members in scanner_slots).  SCAN_OFF(x) is the byte
 * offset of field x inside ScannerObject. */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3307
static PyMemberDef scanner_members[] = {
3308
    {"pattern", _Py_T_OBJECT, SCAN_OFF(pattern), Py_READONLY},
3309
    {NULL}  /* Sentinel */
3310
};
3311
3312
/* Type slots of the Scanner heap type; referenced by scanner_spec. */
static PyType_Slot scanner_slots[] = {
3313
    {Py_tp_dealloc, scanner_dealloc},
3314
    {Py_tp_methods, scanner_methods},
3315
    {Py_tp_members, scanner_members},
3316
    {Py_tp_traverse, scanner_traverse},
3317
    {Py_tp_clear, scanner_clear},
3318
    {0, NULL},
3319
};
3320
3321
/* Spec from which sre_exec() creates the "_sre.SRE_Scanner" heap type.
 * No itemsize is given, so instances are fixed-size. */
static PyType_Spec scanner_spec = {
3322
    .name = "_sre.SRE_Scanner",
3323
    .basicsize = sizeof(ScannerObject),
3324
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3325
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3326
    .slots = scanner_slots,
3327
};
3328
3329
/* Type slots of the Template heap type; referenced by template_spec.
 * Only deallocation and GC support are provided: the type exposes no
 * methods, members or getset descriptors of its own. */
static PyType_Slot template_slots[] = {
3330
    {Py_tp_dealloc, template_dealloc},
3331
    {Py_tp_traverse, template_traverse},
3332
    {Py_tp_clear, template_clear},
3333
    {0, NULL},
3334
};
3335
3336
/* Spec from which sre_exec() creates the "_sre.SRE_Template" heap type.
 * Instances are variable-sized; the item size is that of one element of
 * TemplateObject.items. */
static PyType_Spec template_spec = {
3337
    .name = "_sre.SRE_Template",
3338
    .basicsize = sizeof(TemplateObject),
3339
    .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3340
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3341
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3342
    .slots = template_slots,
3343
};
3344
3345
/* Module-level functions of _sre (sremodule.m_methods); every entry is
 * an Argument Clinic METHODDEF macro. */
static PyMethodDef _functions[] = {
3346
    _SRE_COMPILE_METHODDEF
3347
    _SRE_TEMPLATE_METHODDEF
3348
    _SRE_GETCODESIZE_METHODDEF
3349
    _SRE_ASCII_ISCASED_METHODDEF
3350
    _SRE_UNICODE_ISCASED_METHODDEF
3351
    _SRE_ASCII_TOLOWER_METHODDEF
3352
    _SRE_UNICODE_TOLOWER_METHODDEF
3353
    {NULL, NULL}
3354
};
3355
3356
static int
3357
sre_traverse(PyObject *module, visitproc visit, void *arg)
3358
1.31k
{
3359
1.31k
    _sremodulestate *state = get_sre_module_state(module);
3360
3361
1.31k
    Py_VISIT(state->Pattern_Type);
3362
1.31k
    Py_VISIT(state->Match_Type);
3363
1.31k
    Py_VISIT(state->Scanner_Type);
3364
1.31k
    Py_VISIT(state->Template_Type);
3365
1.31k
    Py_VISIT(state->compile_template);
3366
3367
1.31k
    return 0;
3368
1.31k
}
3369
3370
static int
3371
sre_clear(PyObject *module)
3372
0
{
3373
0
    _sremodulestate *state = get_sre_module_state(module);
3374
3375
0
    Py_CLEAR(state->Pattern_Type);
3376
0
    Py_CLEAR(state->Match_Type);
3377
0
    Py_CLEAR(state->Scanner_Type);
3378
0
    Py_CLEAR(state->Template_Type);
3379
0
    Py_CLEAR(state->compile_template);
3380
3381
0
    return 0;
3382
0
}
3383
3384
static void
3385
sre_free(void *module)
3386
0
{
3387
0
    sre_clear((PyObject *)module);
3388
0
}
3389
3390
112
/* Create a heap type from `spec` for module `m` and store it in `type`.
   On failure, jumps to the `error` label of the enclosing function. */
#define CREATE_TYPE(m, type, spec)                                      \
    do {                                                                \
        type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
        if (!type) {                                                    \
            goto error;                                                 \
        }                                                               \
    } while (0)
3397
3398
/* Add `value`, converted with PyLong_FromUnsignedLong(), to `module`
   under `name`.  On failure, jumps to the `error` label of the
   enclosing function. */
#define ADD_ULONG_CONSTANT(module, name, value)                         \
    do {                                                                \
        if (PyModule_Add(module, name,                                  \
                         PyLong_FromUnsignedLong(value)) < 0) {         \
            goto error;                                                 \
        }                                                               \
    } while (0)
3404
3405
3406
#ifdef Py_DEBUG
3407
static void
3408
_assert_match_aliases_prefixmatch(PyMethodDef *methods)
3409
{
3410
    PyMethodDef *prefixmatch_md = &methods[0];
3411
    PyMethodDef *match_md = &methods[1];
3412
    assert(strcmp(prefixmatch_md->ml_name, "prefixmatch") == 0);
3413
    assert(strcmp(match_md->ml_name, "match") == 0);
3414
    assert(match_md->ml_meth == prefixmatch_md->ml_meth);
3415
    assert(match_md->ml_flags == prefixmatch_md->ml_flags);
3416
    assert(match_md->ml_doc == prefixmatch_md->ml_doc);
3417
}
3418
#endif
3419
3420
static int
3421
sre_exec(PyObject *m)
3422
28
{
3423
28
    _sremodulestate *state;
3424
3425
#ifdef Py_DEBUG
3426
    _assert_match_aliases_prefixmatch(pattern_methods);
3427
    _assert_match_aliases_prefixmatch(scanner_methods);
3428
#endif
3429
3430
    /* Create heap types */
3431
28
    state = get_sre_module_state(m);
3432
28
    CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3433
28
    CREATE_TYPE(m, state->Match_Type, &match_spec);
3434
28
    CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3435
28
    CREATE_TYPE(m, state->Template_Type, &template_spec);
3436
3437
28
    if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3438
0
        goto error;
3439
0
    }
3440
3441
28
    if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3442
0
        goto error;
3443
0
    }
3444
3445
28
    ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3446
28
    ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3447
3448
28
    if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3449
0
        goto error;
3450
0
    }
3451
3452
28
    return 0;
3453
3454
0
error:
3455
0
    return -1;
3456
28
}
3457
3458
/* Module slots for multi-phase initialisation (sremodule.m_slots):
 * sre_exec() is the exec function, and the remaining entries carry the
 * multiple-interpreter and GIL declarations named by their constants. */
static PyModuleDef_Slot sre_slots[] = {
3459
    _Py_ABI_SLOT,
3460
    {Py_mod_exec, sre_exec},
3461
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3462
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
3463
    {0, NULL},
3464
};
3465
3466
/* Definition of the "_sre" extension module.  Per-module state is one
 * _sremodulestate; sre_traverse / sre_clear / sre_free manage the
 * references stored in it. */
static struct PyModuleDef sremodule = {
3467
    .m_base = PyModuleDef_HEAD_INIT,
3468
    .m_name = "_sre",
3469
    .m_size = sizeof(_sremodulestate),
3470
    .m_methods = _functions,
3471
    .m_slots = sre_slots,
3472
    .m_traverse = sre_traverse,
3473
    .m_free = sre_free,
3474
    .m_clear = sre_clear,
3475
};
3476
3477
PyMODINIT_FUNC
3478
PyInit__sre(void)
3479
28
{
3480
28
    return PyModuleDef_Init(&sremodule);
3481
28
}
3482
3483
/* vim:ts=4:sw=4:et
3484
*/