Coverage Report

Created: 2026-06-21 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_sre/sre.c
Line
Count
Source
1
/*
2
 * Secret Labs' Regular Expression Engine
3
 *
4
 * regular expression matching engine
5
 *
6
 * partial history:
7
 * 1999-10-24 fl   created (based on existing template matcher code)
8
 * 2000-03-06 fl   first alpha, sort of
9
 * 2000-08-01 fl   fixes for 1.6b1
10
 * 2000-08-07 fl   use PyOS_CheckStack() if available
11
 * 2000-09-20 fl   added expand method
12
 * 2001-03-20 fl   lots of fixes for 2.1b2
13
 * 2001-04-15 fl   export copyright as Python attribute, not global
14
 * 2001-04-28 fl   added __copy__ methods (work in progress)
15
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
16
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18
 * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19
 * 2001-10-21 fl   added sub/subn primitive
20
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22
 * 2002-11-09 fl   fixed empty sub/subn return type
23
 * 2003-04-18 mvl  fully support 4-byte codes
24
 * 2003-10-17 gn   implemented non recursive scheme
25
 * 2013-02-04 mrab added fullmatch primitive
26
 *
27
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28
 *
29
 * This version of the SRE library can be redistributed under CNRI's
30
 * Python 1.6 license.  For any other use, please contact Secret Labs
31
 * AB (info@pythonware.com).
32
 *
33
 * Portions of this engine have been developed in cooperation with
34
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35
 * other compatibility work.
36
 */
37
38
static const char copyright[] =
39
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41
#include "Python.h"
42
#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
43
#include "pycore_dict.h"             // _PyDict_Next()
44
#include "pycore_long.h"             // _PyLong_GetZero()
45
#include "pycore_list.h"             // _PyList_AppendTakeRef()
46
#include "pycore_moduleobject.h"     // _PyModule_GetState()
47
#include "pycore_tuple.h"            // _PyTuple_FromPairSteal
48
#include "pycore_unicodeobject.h"    // _PyUnicode_Copy
49
#include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
50
51
#include "sre.h"                     // SRE_CODE
52
53
#include <ctype.h>                   // tolower(), toupper(), isalnum()
54
55
1.36G
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
56
57
// On macOS, use the wide character ctype API using btowc()
58
#if defined(__APPLE__)
59
#  define USE_CTYPE_WINT_T
60
#endif
61
62
0
static int sre_isalnum(unsigned int ch) {
63
#ifdef USE_CTYPE_WINT_T
64
    return (unsigned int)iswalnum(btowc((int)ch));
65
#else
66
0
    return (unsigned int)isalnum((int)ch);
67
0
#endif
68
0
}
69
70
0
static unsigned int sre_tolower(unsigned int ch) {
71
#ifdef USE_CTYPE_WINT_T
72
    return (unsigned int)towlower(btowc((int)ch));
73
#else
74
0
    return (unsigned int)tolower((int)ch);
75
0
#endif
76
0
}
77
78
0
static unsigned int sre_toupper(unsigned int ch) {
79
#ifdef USE_CTYPE_WINT_T
80
    return (unsigned int)towupper(btowc((int)ch));
81
#else
82
0
    return (unsigned int)toupper((int)ch);
83
0
#endif
84
0
}
85
86
/* Defining this one controls tracing:
87
 * 0 -- disabled
88
 * 1 -- only if the DEBUG flag set
89
 * 2 -- always
90
 */
91
#ifndef VERBOSE
92
#  define VERBOSE 0
93
#endif
94
95
/* -------------------------------------------------------------------- */
96
97
#if defined(_MSC_VER) && !defined(__clang__)
98
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
99
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
100
/* fastest possible local call under MSVC */
101
#define LOCAL(type) static __inline type __fastcall
102
#else
103
#define LOCAL(type) static inline type
104
#endif
105
106
/* error codes */
107
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
108
#define SRE_ERROR_STATE -2 /* illegal state */
109
0
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
110
0
#define SRE_ERROR_MEMORY -9 /* out of memory */
111
0
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
112
113
#if VERBOSE == 0
114
#  define INIT_TRACE(state)
115
#  define DO_TRACE 0
116
#  define TRACE(v)
117
#elif VERBOSE == 1
118
#  define INIT_TRACE(state) int _debug = (state)->debug
119
#  define DO_TRACE (_debug)
120
#  define TRACE(v) do {     \
121
        if (_debug) { \
122
            printf v;       \
123
        }                   \
124
    } while (0)
125
#elif VERBOSE == 2
126
#  define INIT_TRACE(state)
127
#  define DO_TRACE 1
128
#  define TRACE(v) printf v
129
#else
130
#  error VERBOSE must be 0, 1 or 2
131
#endif
132
133
/* -------------------------------------------------------------------- */
134
/* search engine state */
135
136
#define SRE_IS_DIGIT(ch)\
137
1.79k
    ((ch) <= '9' && Py_ISDIGIT(ch))
138
#define SRE_IS_SPACE(ch)\
139
32
    ((ch) <= ' ' && Py_ISSPACE(ch))
140
#define SRE_IS_LINEBREAK(ch)\
141
23.9k
    ((ch) == '\n')
142
#define SRE_IS_WORD(ch)\
143
10.9M
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
144
145
static unsigned int sre_lower_ascii(unsigned int ch)
146
9.24M
{
147
9.24M
    return ((ch) < 128 ? Py_TOLOWER(ch) : ch);
148
9.24M
}
149
150
/* locale-specific character predicates */
151
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
152
 * warnings when c's type supports only numbers < N+1 */
153
0
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? sre_isalnum((ch)) : 0)
154
0
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
155
156
static unsigned int sre_lower_locale(unsigned int ch)
157
0
{
158
0
    return ((ch) < 256 ? (unsigned int)sre_tolower((ch)) : ch);
159
0
}
160
161
static unsigned int sre_upper_locale(unsigned int ch)
162
0
{
163
0
    return ((ch) < 256 ? (unsigned int)sre_toupper((ch)) : ch);
164
0
}
165
166
/* unicode-specific character predicates */
167
168
12
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
169
80
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
170
0
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
171
1.43k
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
172
716
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
173
174
static unsigned int sre_lower_unicode(unsigned int ch)
175
116M
{
176
116M
    return (unsigned int) Py_UNICODE_TOLOWER(ch);
177
116M
}
178
179
static unsigned int sre_upper_unicode(unsigned int ch)
180
27.8M
{
181
27.8M
    return (unsigned int) Py_UNICODE_TOUPPER(ch);
182
27.8M
}
183
184
LOCAL(int)
185
sre_category(SRE_CODE category, unsigned int ch)
186
10.9M
{
187
10.9M
    switch (category) {
188
189
1.79k
    case SRE_CATEGORY_DIGIT:
190
1.79k
        return SRE_IS_DIGIT(ch);
191
0
    case SRE_CATEGORY_NOT_DIGIT:
192
0
        return !SRE_IS_DIGIT(ch);
193
32
    case SRE_CATEGORY_SPACE:
194
32
        return SRE_IS_SPACE(ch);
195
0
    case SRE_CATEGORY_NOT_SPACE:
196
0
        return !SRE_IS_SPACE(ch);
197
10.9M
    case SRE_CATEGORY_WORD:
198
10.9M
        return SRE_IS_WORD(ch);
199
0
    case SRE_CATEGORY_NOT_WORD:
200
0
        return !SRE_IS_WORD(ch);
201
0
    case SRE_CATEGORY_LINEBREAK:
202
0
        return SRE_IS_LINEBREAK(ch);
203
0
    case SRE_CATEGORY_NOT_LINEBREAK:
204
0
        return !SRE_IS_LINEBREAK(ch);
205
206
0
    case SRE_CATEGORY_LOC_WORD:
207
0
        return SRE_LOC_IS_WORD(ch);
208
0
    case SRE_CATEGORY_LOC_NOT_WORD:
209
0
        return !SRE_LOC_IS_WORD(ch);
210
211
12
    case SRE_CATEGORY_UNI_DIGIT:
212
12
        return SRE_UNI_IS_DIGIT(ch);
213
0
    case SRE_CATEGORY_UNI_NOT_DIGIT:
214
0
        return !SRE_UNI_IS_DIGIT(ch);
215
80
    case SRE_CATEGORY_UNI_SPACE:
216
80
        return SRE_UNI_IS_SPACE(ch);
217
0
    case SRE_CATEGORY_UNI_NOT_SPACE:
218
0
        return !SRE_UNI_IS_SPACE(ch);
219
716
    case SRE_CATEGORY_UNI_WORD:
220
716
        return SRE_UNI_IS_WORD(ch);
221
0
    case SRE_CATEGORY_UNI_NOT_WORD:
222
0
        return !SRE_UNI_IS_WORD(ch);
223
0
    case SRE_CATEGORY_UNI_LINEBREAK:
224
0
        return SRE_UNI_IS_LINEBREAK(ch);
225
0
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
226
0
        return !SRE_UNI_IS_LINEBREAK(ch);
227
10.9M
    }
228
0
    return 0;
229
10.9M
}
230
231
LOCAL(int)
232
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
233
0
{
234
0
    return ch == pattern
235
0
        || (SRE_CODE) sre_lower_locale(ch) == pattern
236
0
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
237
0
}
238
239
240
/* helpers */
241
242
static void
243
data_stack_dealloc(SRE_STATE* state)
244
175M
{
245
175M
    if (state->data_stack) {
246
158M
        PyMem_Free(state->data_stack);
247
158M
        state->data_stack = NULL;
248
158M
    }
249
175M
    state->data_stack_size = state->data_stack_base = 0;
250
175M
}
251
252
static int
253
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
254
159M
{
255
159M
    INIT_TRACE(state);
256
159M
    Py_ssize_t minsize, cursize;
257
159M
    minsize = state->data_stack_base+size;
258
159M
    cursize = state->data_stack_size;
259
159M
    if (cursize < minsize) {
260
159M
        void* stack;
261
159M
        cursize = minsize+minsize/4+1024;
262
159M
        TRACE(("allocate/grow stack %zd\n", cursize));
263
159M
        stack = PyMem_Realloc(state->data_stack, cursize);
264
159M
        if (!stack) {
265
0
            data_stack_dealloc(state);
266
0
            return SRE_ERROR_MEMORY;
267
0
        }
268
159M
        state->data_stack = (char *)stack;
269
159M
        state->data_stack_size = cursize;
270
159M
    }
271
159M
    return 0;
272
159M
}
273
274
/* memory pool functions for SRE_REPEAT, this can avoid memory
275
   leak when SRE(match) function terminates abruptly.
276
   state->repeat_pool_used is a doubly-linked list, so that we
277
   can remove a SRE_REPEAT node from it.
278
   state->repeat_pool_unused is a singly-linked list, we put/get
279
   node at the head. */
280
static SRE_REPEAT *
281
repeat_pool_malloc(SRE_STATE *state)
282
37.1M
{
283
37.1M
    SRE_REPEAT *repeat;
284
285
37.1M
    if (state->repeat_pool_unused) {
286
        /* remove from unused pool (singly-linked list) */
287
409
        repeat = state->repeat_pool_unused;
288
409
        state->repeat_pool_unused = repeat->pool_next;
289
409
    }
290
37.1M
    else {
291
37.1M
        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
292
37.1M
        if (!repeat) {
293
0
            return NULL;
294
0
        }
295
37.1M
    }
296
297
    /* add to used pool (doubly-linked list) */
298
37.1M
    SRE_REPEAT *temp = state->repeat_pool_used;
299
37.1M
    if (temp) {
300
22.0M
        temp->pool_prev = repeat;
301
22.0M
    }
302
37.1M
    repeat->pool_prev = NULL;
303
37.1M
    repeat->pool_next = temp;
304
37.1M
    state->repeat_pool_used = repeat;
305
306
37.1M
    return repeat;
307
37.1M
}
308
309
static void
310
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
311
37.1M
{
312
37.1M
    SRE_REPEAT *prev = repeat->pool_prev;
313
37.1M
    SRE_REPEAT *next = repeat->pool_next;
314
315
    /* remove from used pool (doubly-linked list) */
316
37.1M
    if (prev) {
317
0
        prev->pool_next = next;
318
0
    }
319
37.1M
    else {
320
37.1M
        state->repeat_pool_used = next;
321
37.1M
    }
322
37.1M
    if (next) {
323
22.0M
        next->pool_prev = prev;
324
22.0M
    }
325
326
    /* add to unused pool (singly-linked list) */
327
37.1M
    repeat->pool_next = state->repeat_pool_unused;
328
37.1M
    state->repeat_pool_unused = repeat;
329
37.1M
}
330
331
static void
332
repeat_pool_clear(SRE_STATE *state)
333
63.1M
{
334
    /* clear used pool */
335
63.1M
    SRE_REPEAT *next = state->repeat_pool_used;
336
63.1M
    state->repeat_pool_used = NULL;
337
63.1M
    while (next) {
338
0
        SRE_REPEAT *temp = next;
339
0
        next = temp->pool_next;
340
0
        PyMem_Free(temp);
341
0
    }
342
343
    /* clear unused pool */
344
63.1M
    next = state->repeat_pool_unused;
345
63.1M
    state->repeat_pool_unused = NULL;
346
100M
    while (next) {
347
37.1M
        SRE_REPEAT *temp = next;
348
37.1M
        next = temp->pool_next;
349
37.1M
        PyMem_Free(temp);
350
37.1M
    }
351
63.1M
}
352
353
/* generate 8-bit version */
354
355
206M
#define SRE_CHAR Py_UCS1
356
#define SIZEOF_SRE_CHAR 1
357
902M
#define SRE(F) sre_ucs1_##F
358
#include "sre_lib.h"
359
360
/* generate 16-bit unicode version */
361
362
300M
#define SRE_CHAR Py_UCS2
363
#define SIZEOF_SRE_CHAR 2
364
1.41G
#define SRE(F) sre_ucs2_##F
365
#include "sre_lib.h"
366
367
/* generate 32-bit unicode version */
368
369
106M
#define SRE_CHAR Py_UCS4
370
#define SIZEOF_SRE_CHAR 4
371
575M
#define SRE(F) sre_ucs4_##F
372
#include "sre_lib.h"
373
374
/* -------------------------------------------------------------------- */
375
/* factories and destructors */
376
377
/* module state */
378
typedef struct {
379
    PyTypeObject *Pattern_Type;
380
    PyTypeObject *Match_Type;
381
    PyTypeObject *Scanner_Type;
382
    PyTypeObject *Template_Type;
383
    PyObject *compile_template;  // reference to re._compile_template
384
} _sremodulestate;
385
386
static _sremodulestate *
387
get_sre_module_state(PyObject *m)
388
61.6M
{
389
61.6M
    _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
390
61.6M
    assert(state);
391
61.6M
    return state;
392
61.6M
}
393
394
static struct PyModuleDef sremodule;
395
#define get_sre_module_state_by_class(cls) \
396
61.6M
    (get_sre_module_state(PyType_GetModule(cls)))
397
398
/* see sre.h for object declarations */
399
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
400
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
401
402
16.1k
#define _PatternObject_CAST(op)     ((PatternObject *)(op))
403
73.2M
#define _MatchObject_CAST(op)       ((MatchObject *)(op))
404
0
#define _TemplateObject_CAST(op)    ((TemplateObject *)(op))
405
711k
#define _ScannerObject_CAST(op)     ((ScannerObject *)(op))
406
407
/*[clinic input]
408
module _sre
409
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
410
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
411
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
412
[clinic start generated code]*/
413
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
414
415
/*[clinic input]
416
_sre.getcodesize -> int
417
[clinic start generated code]*/
418
419
static int
420
_sre_getcodesize_impl(PyObject *module)
421
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
422
0
{
423
0
    return sizeof(SRE_CODE);
424
0
}
425
426
/*[clinic input]
427
_sre.ascii_iscased -> bool
428
429
    character: int
430
    /
431
432
[clinic start generated code]*/
433
434
static int
435
_sre_ascii_iscased_impl(PyObject *module, int character)
436
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
437
7.40k
{
438
7.40k
    unsigned int ch = (unsigned int)character;
439
7.40k
    return ch < 128 && Py_ISALPHA(ch);
440
7.40k
}
441
442
/*[clinic input]
443
_sre.unicode_iscased -> bool
444
445
    character: int
446
    /
447
448
[clinic start generated code]*/
449
450
static int
451
_sre_unicode_iscased_impl(PyObject *module, int character)
452
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
453
30.8M
{
454
30.8M
    unsigned int ch = (unsigned int)character;
455
30.8M
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
456
30.8M
}
457
458
/*[clinic input]
459
_sre.ascii_tolower -> int
460
461
    character: int
462
    /
463
464
[clinic start generated code]*/
465
466
static int
467
_sre_ascii_tolower_impl(PyObject *module, int character)
468
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
469
1.37M
{
470
1.37M
    return sre_lower_ascii(character);
471
1.37M
}
472
473
/*[clinic input]
474
_sre.unicode_tolower -> int
475
476
    character: int
477
    /
478
479
[clinic start generated code]*/
480
481
static int
482
_sre_unicode_tolower_impl(PyObject *module, int character)
483
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
484
85.9M
{
485
85.9M
    return sre_lower_unicode(character);
486
85.9M
}
487
488
LOCAL(void)
489
state_reset(SRE_STATE* state)
490
111M
{
491
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
492
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
493
494
111M
    state->lastmark = -1;
495
111M
    state->lastindex = -1;
496
497
111M
    state->repeat = NULL;
498
499
111M
    data_stack_dealloc(state);
500
111M
}
501
502
static const void*
503
getstring(PyObject* string, Py_ssize_t* p_length,
504
          int* p_isbytes, int* p_charsize,
505
          Py_buffer *view)
506
97.4M
{
507
    /* given a python object, return a data pointer, a length (in
508
       characters), and a character size.  return NULL if the object
509
       is not a string (or not compatible) */
510
511
    /* Unicode objects do not support the buffer API. So, get the data
512
       directly instead. */
513
97.4M
    if (PyUnicode_Check(string)) {
514
96.6M
        *p_length = PyUnicode_GET_LENGTH(string);
515
96.6M
        *p_charsize = PyUnicode_KIND(string);
516
96.6M
        *p_isbytes = 0;
517
96.6M
        return PyUnicode_DATA(string);
518
96.6M
    }
519
520
    /* get pointer to byte string buffer */
521
755k
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
522
0
        PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
523
0
                     "object, got '%.200s'", Py_TYPE(string)->tp_name);
524
0
        return NULL;
525
0
    }
526
527
755k
    *p_length = view->len;
528
755k
    *p_charsize = 1;
529
755k
    *p_isbytes = 1;
530
531
755k
    if (view->buf == NULL) {
532
0
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
533
0
        PyBuffer_Release(view);
534
0
        view->buf = NULL;
535
0
        return NULL;
536
0
    }
537
755k
    return view->buf;
538
755k
}
539
540
LOCAL(PyObject*)
541
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
542
           Py_ssize_t start, Py_ssize_t end)
543
63.1M
{
544
    /* prepare state object */
545
546
63.1M
    Py_ssize_t length;
547
63.1M
    int isbytes, charsize;
548
63.1M
    const void* ptr;
549
550
63.1M
    memset(state, 0, sizeof(SRE_STATE));
551
552
    /* Patterns with no capturing groups never emit MARK opcodes and never
553
       read state->mark (group 0's span comes from state->start/ptr), so skip
554
       the allocation entirely -- state->mark stays NULL, which both the err
555
       path and state_fini already free safely. */
556
63.1M
    if (pattern->groups) {
557
26.3M
        state->mark = PyMem_New(const void *, pattern->groups * 2);
558
26.3M
        if (!state->mark) {
559
0
            PyErr_NoMemory();
560
0
            goto err;
561
0
        }
562
26.3M
    }
563
63.1M
    state->lastmark = -1;
564
63.1M
    state->lastindex = -1;
565
566
63.1M
    state->buffer.buf = NULL;
567
63.1M
    ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
568
63.1M
    if (!ptr)
569
0
        goto err;
570
571
63.1M
    if (isbytes && pattern->isbytes == 0) {
572
0
        PyErr_SetString(PyExc_TypeError,
573
0
                        "cannot use a string pattern on a bytes-like object");
574
0
        goto err;
575
0
    }
576
63.1M
    if (!isbytes && pattern->isbytes > 0) {
577
0
        PyErr_SetString(PyExc_TypeError,
578
0
                        "cannot use a bytes pattern on a string-like object");
579
0
        goto err;
580
0
    }
581
582
    /* adjust boundaries */
583
63.1M
    if (start < 0)
584
0
        start = 0;
585
63.1M
    else if (start > length)
586
0
        start = length;
587
588
63.1M
    if (end < 0)
589
0
        end = 0;
590
63.1M
    else if (end > length)
591
63.1M
        end = length;
592
593
63.1M
    state->isbytes = isbytes;
594
63.1M
    state->charsize = charsize;
595
63.1M
    state->match_all = 0;
596
63.1M
    state->must_advance = 0;
597
63.1M
    state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);
598
599
63.1M
    state->beginning = ptr;
600
601
63.1M
    state->start = (void*) ((char*) ptr + start * state->charsize);
602
63.1M
    state->end = (void*) ((char*) ptr + end * state->charsize);
603
604
63.1M
    state->string = Py_NewRef(string);
605
63.1M
    state->pos = start;
606
63.1M
    state->endpos = end;
607
608
#ifdef Py_DEBUG
609
    state->fail_after_count = pattern->fail_after_count;
610
    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
611
#endif
612
613
63.1M
    return string;
614
0
  err:
615
    /* We add an explicit cast here because MSVC has a bug when
616
       compiling C code where it believes that `const void**` cannot be
617
       safely casted to `void*`, see bpo-39943 for details. */
618
0
    PyMem_Free((void*) state->mark);
619
0
    state->mark = NULL;
620
0
    if (state->buffer.buf)
621
0
        PyBuffer_Release(&state->buffer);
622
0
    return NULL;
623
63.1M
}
624
625
LOCAL(void)
626
state_fini(SRE_STATE* state)
627
63.1M
{
628
63.1M
    if (state->buffer.buf)
629
384k
        PyBuffer_Release(&state->buffer);
630
63.1M
    Py_XDECREF(state->string);
631
63.1M
    data_stack_dealloc(state);
632
    /* See above PyMem_Free() for why we explicitly cast here. */
633
63.1M
    PyMem_Free((void*) state->mark);
634
63.1M
    state->mark = NULL;
635
    /* SRE_REPEAT pool */
636
63.1M
    repeat_pool_clear(state);
637
63.1M
}
638
639
/* calculate offset from start of string */
640
#define STATE_OFFSET(state, member)\
641
204M
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
642
643
LOCAL(PyObject*)
644
getslice(int isbytes, const void *ptr,
645
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
646
140M
{
647
140M
    if (isbytes) {
648
441k
        if (PyBytes_CheckExact(string) &&
649
441k
            start == 0 && end == PyBytes_GET_SIZE(string)) {
650
2.00k
            return Py_NewRef(string);
651
2.00k
        }
652
439k
        return PyBytes_FromStringAndSize(
653
439k
                (const char *)ptr + start, end - start);
654
441k
    }
655
140M
    else {
656
140M
        return PyUnicode_Substring(string, start, end);
657
140M
    }
658
140M
}
659
660
LOCAL(PyObject*)
661
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
662
791k
{
663
791k
    Py_ssize_t i, j;
664
665
791k
    index = (index - 1) * 2;
666
667
791k
    if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
668
0
        if (empty)
669
            /* want empty string */
670
0
            i = j = 0;
671
0
        else {
672
0
            Py_RETURN_NONE;
673
0
        }
674
791k
    } else {
675
791k
        i = STATE_OFFSET(state, state->mark[index]);
676
791k
        j = STATE_OFFSET(state, state->mark[index+1]);
677
678
        /* check wrong span */
679
791k
        if (i > j) {
680
0
            PyErr_SetString(PyExc_SystemError,
681
0
                            "The span of capturing group is wrong,"
682
0
                            " please report a bug for the re module.");
683
0
            return NULL;
684
0
        }
685
791k
    }
686
687
791k
    return getslice(state->isbytes, state->beginning, string, i, j);
688
791k
}
689
690
static void
691
pattern_error(Py_ssize_t status)
692
0
{
693
0
    switch (status) {
694
0
    case SRE_ERROR_RECURSION_LIMIT:
695
        /* This error code seems to be unused. */
696
0
        PyErr_SetString(
697
0
            PyExc_RecursionError,
698
0
            "maximum recursion limit exceeded"
699
0
            );
700
0
        break;
701
0
    case SRE_ERROR_MEMORY:
702
0
        PyErr_NoMemory();
703
0
        break;
704
0
    case SRE_ERROR_INTERRUPTED:
705
    /* An exception has already been raised, so let it fly */
706
0
        break;
707
0
    default:
708
        /* other error codes indicate compiler/engine bugs */
709
0
        PyErr_SetString(
710
0
            PyExc_RuntimeError,
711
0
            "internal error in regular expression engine"
712
0
            );
713
0
    }
714
0
}
715
716
static int
717
pattern_traverse(PyObject *op, visitproc visit, void *arg)
718
12.8k
{
719
12.8k
    PatternObject *self = _PatternObject_CAST(op);
720
12.8k
    Py_VISIT(Py_TYPE(self));
721
12.8k
    Py_VISIT(self->groupindex);
722
12.8k
    Py_VISIT(self->indexgroup);
723
12.8k
    Py_VISIT(self->pattern);
724
#ifdef Py_DEBUG
725
    Py_VISIT(self->fail_after_exc);
726
#endif
727
12.8k
    return 0;
728
12.8k
}
729
730
static int
731
pattern_clear(PyObject *op)
732
3.35k
{
733
3.35k
    PatternObject *self = _PatternObject_CAST(op);
734
3.35k
    Py_CLEAR(self->groupindex);
735
3.35k
    Py_CLEAR(self->indexgroup);
736
3.35k
    Py_CLEAR(self->pattern);
737
#ifdef Py_DEBUG
738
    Py_CLEAR(self->fail_after_exc);
739
#endif
740
3.35k
    return 0;
741
3.35k
}
742
743
static void
744
pattern_dealloc(PyObject *self)
745
3.35k
{
746
3.35k
    PyTypeObject *tp = Py_TYPE(self);
747
3.35k
    PyObject_GC_UnTrack(self);
748
3.35k
    FT_CLEAR_WEAKREFS(self, _PatternObject_CAST(self)->weakreflist);
749
3.35k
    (void)pattern_clear(self);
750
3.35k
    tp->tp_free(self);
751
3.35k
    Py_DECREF(tp);
752
3.35k
}
753
754
LOCAL(Py_ssize_t)
755
sre_match(SRE_STATE* state, SRE_CODE* pattern)
756
53.3M
{
757
53.3M
    if (state->charsize == 1)
758
32.4M
        return sre_ucs1_match(state, pattern, 1);
759
20.9M
    if (state->charsize == 2)
760
11.7M
        return sre_ucs2_match(state, pattern, 1);
761
20.9M
    assert(state->charsize == 4);
762
9.17M
    return sre_ucs4_match(state, pattern, 1);
763
20.9M
}
764
765
LOCAL(Py_ssize_t)
766
sre_search(SRE_STATE* state, SRE_CODE* pattern)
767
112M
{
768
112M
    if (state->charsize == 1)
769
49.5M
        return sre_ucs1_search(state, pattern);
770
63.2M
    if (state->charsize == 2)
771
56.8M
        return sre_ucs2_search(state, pattern);
772
63.2M
    assert(state->charsize == 4);
773
6.40M
    return sre_ucs4_search(state, pattern);
774
63.2M
}
775
776
/*[clinic input]
777
_sre.SRE_Pattern.prefixmatch
778
779
    cls: defining_class
780
    /
781
    string: object
782
    pos: Py_ssize_t = 0
783
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
784
785
Matches zero or more characters at the beginning of the string.
786
[clinic start generated code]*/
787
788
static PyObject *
789
_sre_SRE_Pattern_prefixmatch_impl(PatternObject *self, PyTypeObject *cls,
790
                                  PyObject *string, Py_ssize_t pos,
791
                                  Py_ssize_t endpos)
792
/*[clinic end generated code: output=a0e079fb4f875240 input=e2a7e68ea47d048c]*/
793
53.3M
{
794
53.3M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
795
53.3M
    SRE_STATE state;
796
53.3M
    Py_ssize_t status;
797
53.3M
    PyObject *match;
798
799
53.3M
    if (!state_init(&state, self, string, pos, endpos))
800
0
        return NULL;
801
802
53.3M
    INIT_TRACE(&state);
803
53.3M
    state.ptr = state.start;
804
805
53.3M
    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
806
807
53.3M
    status = sre_match(&state, PatternObject_GetCode(self));
808
809
53.3M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
810
53.3M
    if (PyErr_Occurred()) {
811
0
        state_fini(&state);
812
0
        return NULL;
813
0
    }
814
815
53.3M
    match = pattern_new_match(module_state, self, &state, status);
816
53.3M
    state_fini(&state);
817
53.3M
    return match;
818
53.3M
}
819
820
821
/*[clinic input]
822
_sre.SRE_Pattern.fullmatch
823
824
    cls: defining_class
825
    /
826
    string: object
827
    pos: Py_ssize_t = 0
828
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
829
830
Matches against all of the string.
831
[clinic start generated code]*/
832
833
static PyObject *
834
_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
835
                                PyObject *string, Py_ssize_t pos,
836
                                Py_ssize_t endpos)
837
/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
838
0
{
839
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
840
0
    SRE_STATE state;
841
0
    Py_ssize_t status;
842
0
    PyObject *match;
843
844
0
    if (!state_init(&state, self, string, pos, endpos))
845
0
        return NULL;
846
847
0
    INIT_TRACE(&state);
848
0
    state.ptr = state.start;
849
850
0
    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
851
852
0
    state.match_all = 1;
853
0
    status = sre_match(&state, PatternObject_GetCode(self));
854
855
0
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
856
0
    if (PyErr_Occurred()) {
857
0
        state_fini(&state);
858
0
        return NULL;
859
0
    }
860
861
0
    match = pattern_new_match(module_state, self, &state, status);
862
0
    state_fini(&state);
863
0
    return match;
864
0
}
865
866
/*[clinic input]
867
@permit_long_summary
868
_sre.SRE_Pattern.search
869
870
    cls: defining_class
871
    /
872
    string: object
873
    pos: Py_ssize_t = 0
874
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
875
876
Scan through string looking for a match, and return a corresponding match object instance.
877
878
Return None if no position in the string matches.
879
[clinic start generated code]*/
880
881
static PyObject *
882
_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
883
                             PyObject *string, Py_ssize_t pos,
884
                             Py_ssize_t endpos)
885
/*[clinic end generated code: output=bd7f2d9d583e1463 input=05e9feee0334c156]*/
886
867k
{
887
867k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
888
867k
    SRE_STATE state;
889
867k
    Py_ssize_t status;
890
867k
    PyObject *match;
891
892
867k
    if (!state_init(&state, self, string, pos, endpos))
893
0
        return NULL;
894
895
867k
    INIT_TRACE(&state);
896
867k
    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
897
898
867k
    status = sre_search(&state, PatternObject_GetCode(self));
899
900
867k
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
901
902
867k
    if (PyErr_Occurred()) {
903
0
        state_fini(&state);
904
0
        return NULL;
905
0
    }
906
907
867k
    match = pattern_new_match(module_state, self, &state, status);
908
867k
    state_fini(&state);
909
867k
    return match;
910
867k
}
911
912
/*[clinic input]
913
_sre.SRE_Pattern.findall
914
915
    string: object
916
    pos: Py_ssize_t = 0
917
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
918
919
Return a list of all non-overlapping matches of pattern in string.
920
[clinic start generated code]*/
921
922
static PyObject *
923
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
924
                              Py_ssize_t pos, Py_ssize_t endpos)
925
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
926
3.54M
{
927
3.54M
    SRE_STATE state;
928
3.54M
    PyObject* list;
929
3.54M
    Py_ssize_t status;
930
3.54M
    Py_ssize_t i, b, e;
931
932
3.54M
    if (!state_init(&state, self, string, pos, endpos))
933
0
        return NULL;
934
935
3.54M
    list = PyList_New(0);
936
3.54M
    if (!list) {
937
0
        state_fini(&state);
938
0
        return NULL;
939
0
    }
940
941
96.9M
    while (state.start <= state.end) {
942
943
96.9M
        PyObject* item;
944
945
96.9M
        state_reset(&state);
946
947
96.9M
        state.ptr = state.start;
948
949
96.9M
        status = sre_search(&state, PatternObject_GetCode(self));
950
96.9M
        if (PyErr_Occurred())
951
0
            goto error;
952
953
96.9M
        if (status <= 0) {
954
3.54M
            if (status == 0)
955
3.54M
                break;
956
0
            pattern_error(status);
957
0
            goto error;
958
3.54M
        }
959
960
        /* don't bother to build a match object */
961
93.4M
        switch (self->groups) {
962
93.4M
        case 0:
963
93.4M
            b = STATE_OFFSET(&state, state.start);
964
93.4M
            e = STATE_OFFSET(&state, state.ptr);
965
93.4M
            item = getslice(state.isbytes, state.beginning,
966
93.4M
                            string, b, e);
967
93.4M
            if (!item)
968
0
                goto error;
969
93.4M
            break;
970
93.4M
        case 1:
971
0
            item = state_getslice(&state, 1, string, 1);
972
0
            if (!item)
973
0
                goto error;
974
0
            break;
975
0
        default:
976
0
            item = PyTuple_New(self->groups);
977
0
            if (!item)
978
0
                goto error;
979
0
            for (i = 0; i < self->groups; i++) {
980
0
                PyObject* o = state_getslice(&state, i+1, string, 1);
981
0
                if (!o) {
982
0
                    Py_DECREF(item);
983
0
                    goto error;
984
0
                }
985
0
                PyTuple_SET_ITEM(item, i, o);
986
0
            }
987
0
            break;
988
93.4M
        }
989
990
93.4M
        status = _PyList_AppendTakeRef((PyListObject *)list, item);
991
93.4M
        if (status < 0)
992
0
            goto error;
993
994
93.4M
        state.must_advance = (state.ptr == state.start);
995
93.4M
        state.start = state.ptr;
996
93.4M
    }
997
998
3.54M
    state_fini(&state);
999
3.54M
    return list;
1000
1001
0
error:
1002
0
    Py_DECREF(list);
1003
0
    state_fini(&state);
1004
0
    return NULL;
1005
1006
3.54M
}
1007
1008
/*[clinic input]
1009
@permit_long_summary
1010
_sre.SRE_Pattern.finditer
1011
1012
    cls: defining_class
1013
    /
1014
    string: object
1015
    pos: Py_ssize_t = 0
1016
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1017
1018
Return an iterator over all non-overlapping matches for the RE pattern in string.
1019
1020
For each match, the iterator returns a match object.
1021
[clinic start generated code]*/
1022
1023
static PyObject *
1024
_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
1025
                               PyObject *string, Py_ssize_t pos,
1026
                               Py_ssize_t endpos)
1027
/*[clinic end generated code: output=1791dbf3618ade56 input=ee28865796048023]*/
1028
355k
{
1029
355k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1030
355k
    PyObject* scanner;
1031
355k
    PyObject* search;
1032
355k
    PyObject* iterator;
1033
1034
355k
    scanner = pattern_scanner(module_state, self, string, pos, endpos);
1035
355k
    if (!scanner)
1036
0
        return NULL;
1037
1038
355k
    search = PyObject_GetAttrString(scanner, "search");
1039
355k
    Py_DECREF(scanner);
1040
355k
    if (!search)
1041
0
        return NULL;
1042
1043
355k
    iterator = PyCallIter_New(search, Py_None);
1044
355k
    Py_DECREF(search);
1045
1046
355k
    return iterator;
1047
355k
}
1048
1049
/*[clinic input]
1050
_sre.SRE_Pattern.scanner
1051
1052
    cls: defining_class
1053
    /
1054
    string: object
1055
    pos: Py_ssize_t = 0
1056
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1057
1058
[clinic start generated code]*/
1059
1060
static PyObject *
1061
_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
1062
                              PyObject *string, Py_ssize_t pos,
1063
                              Py_ssize_t endpos)
1064
/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
1065
0
{
1066
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1067
1068
0
    return pattern_scanner(module_state, self, string, pos, endpos);
1069
0
}
1070
1071
/*[clinic input]
1072
_sre.SRE_Pattern.split
1073
1074
    string: object
1075
    maxsplit: Py_ssize_t = 0
1076
1077
Split string by the occurrences of pattern.
1078
[clinic start generated code]*/
1079
1080
static PyObject *
1081
_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
1082
                            Py_ssize_t maxsplit)
1083
/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
1084
1.24M
{
1085
1.24M
    SRE_STATE state;
1086
1.24M
    PyObject* list;
1087
1.24M
    PyObject* item;
1088
1.24M
    Py_ssize_t status;
1089
1.24M
    Py_ssize_t n;
1090
1.24M
    Py_ssize_t i;
1091
1.24M
    const void* last;
1092
1093
1.24M
    assert(self->codesize != 0);
1094
1095
1.24M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
1096
0
        return NULL;
1097
1098
1.24M
    list = PyList_New(0);
1099
1.24M
    if (!list) {
1100
0
        state_fini(&state);
1101
0
        return NULL;
1102
0
    }
1103
1104
1.24M
    n = 0;
1105
1.24M
    last = state.start;
1106
1107
2.10M
    while (!maxsplit || n < maxsplit) {
1108
1109
1.30M
        state_reset(&state);
1110
1111
1.30M
        state.ptr = state.start;
1112
1113
1.30M
        status = sre_search(&state, PatternObject_GetCode(self));
1114
1.30M
        if (PyErr_Occurred())
1115
0
            goto error;
1116
1117
1.30M
        if (status <= 0) {
1118
453k
            if (status == 0)
1119
453k
                break;
1120
0
            pattern_error(status);
1121
0
            goto error;
1122
453k
        }
1123
1124
        /* get segment before this match */
1125
855k
        item = getslice(state.isbytes, state.beginning,
1126
855k
            string, STATE_OFFSET(&state, last),
1127
855k
            STATE_OFFSET(&state, state.start)
1128
855k
            );
1129
855k
        if (!item)
1130
0
            goto error;
1131
855k
        status = PyList_Append(list, item);
1132
855k
        Py_DECREF(item);
1133
855k
        if (status < 0)
1134
0
            goto error;
1135
1136
        /* add groups (if any) */
1137
1.64M
        for (i = 0; i < self->groups; i++) {
1138
791k
            item = state_getslice(&state, i+1, string, 0);
1139
791k
            if (!item)
1140
0
                goto error;
1141
791k
            status = PyList_Append(list, item);
1142
791k
            Py_DECREF(item);
1143
791k
            if (status < 0)
1144
0
                goto error;
1145
791k
        }
1146
1147
855k
        n = n + 1;
1148
855k
        state.must_advance = (state.ptr == state.start);
1149
855k
        last = state.start = state.ptr;
1150
1151
855k
    }
1152
1153
    /* get segment following last match (even if empty) */
1154
1.24M
    item = getslice(state.isbytes, state.beginning,
1155
1.24M
        string, STATE_OFFSET(&state, last), state.endpos
1156
1.24M
        );
1157
1.24M
    if (!item)
1158
0
        goto error;
1159
1.24M
    status = PyList_Append(list, item);
1160
1.24M
    Py_DECREF(item);
1161
1.24M
    if (status < 0)
1162
0
        goto error;
1163
1164
1.24M
    state_fini(&state);
1165
1.24M
    return list;
1166
1167
0
error:
1168
0
    Py_DECREF(list);
1169
0
    state_fini(&state);
1170
0
    return NULL;
1171
1172
1.24M
}
1173
1174
static PyObject *
1175
compile_template(_sremodulestate *module_state,
1176
                 PatternObject *pattern, PyObject *template)
1177
0
{
1178
    /* delegate to Python code */
1179
0
    PyObject *func = FT_ATOMIC_LOAD_PTR(module_state->compile_template);
1180
0
    if (func == NULL) {
1181
0
        func = PyImport_ImportModuleAttrString("re", "_compile_template");
1182
0
        if (func == NULL) {
1183
0
            return NULL;
1184
0
        }
1185
#ifdef Py_GIL_DISABLED
1186
        PyObject *other_func = NULL;
1187
        if (!_Py_atomic_compare_exchange_ptr(&module_state->compile_template, &other_func, func))  {
1188
            Py_DECREF(func);
1189
            func = other_func;
1190
        }
1191
#else
1192
0
        Py_XSETREF(module_state->compile_template, func);
1193
0
#endif
1194
0
    }
1195
1196
0
    PyObject *args[] = {(PyObject *)pattern, template};
1197
0
    PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1198
1199
0
    if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1200
        /* If the replacement string is unhashable (e.g. bytearray),
1201
         * convert it to the basic type (str or bytes) and repeat. */
1202
0
        if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1203
0
            PyErr_Clear();
1204
0
            template = _PyUnicode_Copy(template);
1205
0
        }
1206
0
        else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1207
0
            PyErr_Clear();
1208
0
            template = PyBytes_FromObject(template);
1209
0
        }
1210
0
        else {
1211
0
            return NULL;
1212
0
        }
1213
0
        if (template == NULL) {
1214
0
            return NULL;
1215
0
        }
1216
0
        args[1] = template;
1217
0
        result = PyObject_Vectorcall(func, args, 2, NULL);
1218
0
        Py_DECREF(template);
1219
0
    }
1220
1221
0
    if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1222
0
        PyErr_Format(PyExc_RuntimeError,
1223
0
                    "the result of compiling a replacement string is %.200s",
1224
0
                    Py_TYPE(result)->tp_name);
1225
0
        Py_DECREF(result);
1226
0
        return NULL;
1227
0
    }
1228
0
    return result;
1229
0
}
1230
1231
static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1232
1233
static PyObject*
1234
pattern_subx(_sremodulestate* module_state,
1235
             PatternObject* self,
1236
             PyObject* ptemplate,
1237
             PyObject* string,
1238
             Py_ssize_t count,
1239
             Py_ssize_t subn)
1240
3.82M
{
1241
3.82M
    SRE_STATE state;
1242
3.82M
    PyObject* list;
1243
3.82M
    PyObject* joiner;
1244
3.82M
    PyObject* item;
1245
3.82M
    PyObject* filter;
1246
3.82M
    PyObject* match;
1247
3.82M
    const void* ptr;
1248
3.82M
    Py_ssize_t status;
1249
3.82M
    Py_ssize_t n;
1250
3.82M
    Py_ssize_t i, b, e;
1251
3.82M
    int isbytes, charsize;
1252
3.82M
    enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1253
3.82M
    Py_buffer view;
1254
1255
3.82M
    if (PyCallable_Check(ptemplate)) {
1256
        /* sub/subn takes either a function or a template */
1257
3.82M
        filter = Py_NewRef(ptemplate);
1258
3.82M
        filter_type = CALLABLE;
1259
3.82M
    } else {
1260
        /* if not callable, check if it's a literal string */
1261
0
        int literal;
1262
0
        view.buf = NULL;
1263
0
        ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1264
0
        if (ptr) {
1265
0
            if (charsize == 1)
1266
0
                literal = memchr(ptr, '\\', n) == NULL;
1267
0
            else
1268
0
                literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1269
0
        } else {
1270
0
            PyErr_Clear();
1271
0
            literal = 0;
1272
0
        }
1273
0
        if (view.buf)
1274
0
            PyBuffer_Release(&view);
1275
0
        if (literal) {
1276
0
            filter = Py_NewRef(ptemplate);
1277
0
            filter_type = LITERAL;
1278
0
        } else {
1279
            /* not a literal; hand it over to the template compiler */
1280
0
            filter = compile_template(module_state, self, ptemplate);
1281
0
            if (!filter)
1282
0
                return NULL;
1283
1284
0
            assert(Py_TYPE(filter) == module_state->Template_Type);
1285
0
            if (Py_SIZE(filter) == 0) {
1286
0
                Py_SETREF(filter,
1287
0
                          Py_NewRef(((TemplateObject *)filter)->literal));
1288
0
                filter_type = LITERAL;
1289
0
            }
1290
0
            else {
1291
0
                filter_type = TEMPLATE;
1292
0
            }
1293
0
        }
1294
0
    }
1295
1296
3.82M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1297
0
        Py_DECREF(filter);
1298
0
        return NULL;
1299
0
    }
1300
1301
3.82M
    list = PyList_New(0);
1302
3.82M
    if (!list) {
1303
0
        Py_DECREF(filter);
1304
0
        state_fini(&state);
1305
0
        return NULL;
1306
0
    }
1307
1308
3.82M
    n = i = 0;
1309
1310
10.3M
    while (!count || n < count) {
1311
1312
10.3M
        state_reset(&state);
1313
1314
10.3M
        state.ptr = state.start;
1315
1316
10.3M
        status = sre_search(&state, PatternObject_GetCode(self));
1317
10.3M
        if (PyErr_Occurred())
1318
0
            goto error;
1319
1320
10.3M
        if (status <= 0) {
1321
3.82M
            if (status == 0)
1322
3.82M
                break;
1323
0
            pattern_error(status);
1324
0
            goto error;
1325
3.82M
        }
1326
1327
6.56M
        b = STATE_OFFSET(&state, state.start);
1328
6.56M
        e = STATE_OFFSET(&state, state.ptr);
1329
1330
6.56M
        if (i < b) {
1331
            /* get segment before this match */
1332
3.35M
            item = getslice(state.isbytes, state.beginning,
1333
3.35M
                string, i, b);
1334
3.35M
            if (!item)
1335
0
                goto error;
1336
3.35M
            status = _PyList_AppendTakeRef((PyListObject *)list, item);
1337
3.35M
            if (status < 0)
1338
0
                goto error;
1339
1340
3.35M
        }
1341
1342
6.56M
        if (filter_type != LITERAL) {
1343
            /* pass match object through filter */
1344
6.56M
            match = pattern_new_match(module_state, self, &state, 1);
1345
6.56M
            if (!match)
1346
0
                goto error;
1347
6.56M
            if (filter_type == TEMPLATE) {
1348
0
                item = expand_template((TemplateObject *)filter,
1349
0
                                       (MatchObject *)match);
1350
0
            }
1351
6.56M
            else {
1352
6.56M
                assert(filter_type == CALLABLE);
1353
6.56M
                item = PyObject_CallOneArg(filter, match);
1354
6.56M
            }
1355
6.56M
            Py_DECREF(match);
1356
6.56M
            if (!item)
1357
56
                goto error;
1358
6.56M
        } else {
1359
            /* filter is literal string */
1360
0
            item = Py_NewRef(filter);
1361
0
        }
1362
1363
        /* add to list */
1364
6.56M
        if (item != Py_None) {
1365
6.56M
            status = _PyList_AppendTakeRef((PyListObject *)list, item);
1366
6.56M
            if (status < 0)
1367
0
                goto error;
1368
6.56M
        }
1369
1370
6.56M
        i = e;
1371
6.56M
        n = n + 1;
1372
6.56M
        state.must_advance = (state.ptr == state.start);
1373
6.56M
        state.start = state.ptr;
1374
6.56M
    }
1375
1376
    /* get segment following last match */
1377
3.82M
    if (i < state.endpos) {
1378
3.18M
        item = getslice(state.isbytes, state.beginning,
1379
3.18M
                        string, i, state.endpos);
1380
3.18M
        if (!item)
1381
0
            goto error;
1382
3.18M
        status = _PyList_AppendTakeRef((PyListObject *)list, item);
1383
3.18M
        if (status < 0)
1384
0
            goto error;
1385
3.18M
    }
1386
1387
3.82M
    state_fini(&state);
1388
1389
3.82M
    Py_DECREF(filter);
1390
1391
    /* convert list to single string (also removes list) */
1392
3.82M
    joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1393
3.82M
    if (!joiner) {
1394
0
        Py_DECREF(list);
1395
0
        return NULL;
1396
0
    }
1397
3.82M
    if (PyList_GET_SIZE(list) == 0) {
1398
224
        Py_DECREF(list);
1399
224
        item = joiner;
1400
224
    }
1401
3.82M
    else {
1402
3.82M
        if (state.isbytes)
1403
34.0k
            item = PyBytes_Join(joiner, list);
1404
3.78M
        else
1405
3.78M
            item = PyUnicode_Join(joiner, list);
1406
3.82M
        Py_DECREF(joiner);
1407
3.82M
        Py_DECREF(list);
1408
3.82M
        if (!item)
1409
0
            return NULL;
1410
3.82M
    }
1411
1412
3.82M
    if (subn)
1413
0
        return Py_BuildValue("Nn", item, n);
1414
1415
3.82M
    return item;
1416
1417
56
error:
1418
56
    Py_DECREF(list);
1419
56
    state_fini(&state);
1420
56
    Py_DECREF(filter);
1421
56
    return NULL;
1422
1423
3.82M
}
1424
1425
/*[clinic input]
1426
@permit_long_summary
1427
_sre.SRE_Pattern.sub
1428
1429
    cls: defining_class
1430
    /
1431
    repl: object
1432
    string: object
1433
    count: Py_ssize_t = 0
1434
1435
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1436
[clinic start generated code]*/
1437
1438
static PyObject *
1439
_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1440
                          PyObject *repl, PyObject *string, Py_ssize_t count)
1441
/*[clinic end generated code: output=4be141ab04bca60d input=eba511fd1c4908b7]*/
1442
3.82M
{
1443
3.82M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1444
1445
3.82M
    return pattern_subx(module_state, self, repl, string, count, 0);
1446
3.82M
}
1447
1448
/*[clinic input]
1449
@permit_long_summary
1450
_sre.SRE_Pattern.subn
1451
1452
    cls: defining_class
1453
    /
1454
    repl: object
1455
    string: object
1456
    count: Py_ssize_t = 0
1457
1458
Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1459
[clinic start generated code]*/
1460
1461
static PyObject *
1462
_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1463
                           PyObject *repl, PyObject *string,
1464
                           Py_ssize_t count)
1465
/*[clinic end generated code: output=da02fd85258b1e1f input=6a5bb5b61717abf0]*/
1466
0
{
1467
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1468
1469
0
    return pattern_subx(module_state, self, repl, string, count, 1);
1470
0
}
1471
1472
/*[clinic input]
1473
_sre.SRE_Pattern.__copy__
1474
1475
[clinic start generated code]*/
1476
1477
static PyObject *
1478
_sre_SRE_Pattern___copy___impl(PatternObject *self)
1479
/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1480
0
{
1481
0
    return Py_NewRef(self);
1482
0
}
1483
1484
/*[clinic input]
1485
_sre.SRE_Pattern.__deepcopy__
1486
1487
    memo: object
1488
    /
1489
1490
[clinic start generated code]*/
1491
1492
static PyObject *
1493
_sre_SRE_Pattern___deepcopy___impl(PatternObject *self, PyObject *memo)
1494
/*[clinic end generated code: output=75efe69bd12c5d7d input=a465b1602f997bed]*/
1495
0
{
1496
0
    return Py_NewRef(self);
1497
0
}
1498
1499
#ifdef Py_DEBUG
1500
/*[clinic input]
1501
_sre.SRE_Pattern._fail_after
1502
1503
    count: int
1504
    exception: object
1505
    /
1506
1507
For debugging.
1508
[clinic start generated code]*/
1509
1510
static PyObject *
1511
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1512
                                  PyObject *exception)
1513
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1514
{
1515
    self->fail_after_count = count;
1516
    Py_INCREF(exception);
1517
    Py_XSETREF(self->fail_after_exc, exception);
1518
    Py_RETURN_NONE;
1519
}
1520
#endif /* Py_DEBUG */
1521
1522
static PyObject *
pattern_repr(PyObject *self)
{
    /* Produce "re.compile(<pattern>)" or "re.compile(<pattern>, <flags>)"
       where <flags> is a "|"-joined list of the known flag names plus,
       if any bits remain, their hexadecimal value. */
    static const struct {
        const char *name;
        int value;
    } flag_names[] = {
        {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
        {"re.LOCALE", SRE_FLAG_LOCALE},
        {"re.MULTILINE", SRE_FLAG_MULTILINE},
        {"re.DOTALL", SRE_FLAG_DOTALL},
        {"re.UNICODE", SRE_FLAG_UNICODE},
        {"re.VERBOSE", SRE_FLAG_VERBOSE},
        {"re.DEBUG", SRE_FLAG_DEBUG},
        {"re.ASCII", SRE_FLAG_ASCII},
    };

    PatternObject *obj = _PatternObject_CAST(self);
    int flags = obj->flags;
    PyObject *result = NULL;

    /* Omit re.UNICODE for valid string patterns. */
    if (obj->isbytes == 0) {
        int mode = flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII);
        if (mode == SRE_FLAG_UNICODE) {
            flags &= ~SRE_FLAG_UNICODE;
        }
    }

    PyObject *flag_items = PyList_New(0);
    if (flag_items == NULL) {
        return NULL;
    }

    /* named flags, in table order; clear each bit once it is reported */
    for (size_t k = 0; k < Py_ARRAY_LENGTH(flag_names); k++) {
        if (!(flags & flag_names[k].value)) {
            continue;
        }
        PyObject *label = PyUnicode_FromString(flag_names[k].name);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(flag_items, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
        flags &= ~flag_names[k].value;
    }

    /* whatever is left has no name */
    if (flags) {
        PyObject *label = PyUnicode_FromFormat("0x%x", flags);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(flag_items, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
    }

    if (PyList_Size(flag_items) > 0) {
        PyObject *sep = PyUnicode_FromString("|");
        if (sep == NULL) {
            goto done;
        }
        PyObject *flags_result = PyUnicode_Join(sep, flag_items);
        Py_DECREF(sep);
        if (flags_result == NULL) {
            goto done;
        }
        result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
                                      obj->pattern, flags_result);
        Py_DECREF(flags_result);
    }
    else {
        result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
    }

done:
    Py_DECREF(flag_items);
    return result;
}
1602
1603
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1604
1605
/* PatternObject's 'groupindex' method. */
1606
static PyObject *
1607
pattern_groupindex(PyObject *op, void *Py_UNUSED(ignored))
1608
0
{
1609
0
    PatternObject *self = _PatternObject_CAST(op);
1610
0
    if (self->groupindex == NULL)
1611
0
        return PyDict_New();
1612
0
    return PyDictProxy_New(self->groupindex);
1613
0
}
1614
1615
static int _validate(PatternObject *self); /* Forward */
1616
1617
/*[clinic input]
1618
_sre.compile
1619
1620
    pattern: object
1621
    flags: int
1622
    code: object(subclass_of='&PyList_Type')
1623
    groups: Py_ssize_t
1624
    groupindex: object(subclass_of='&PyDict_Type')
1625
    indexgroup: object(subclass_of='&PyTuple_Type')
1626
1627
[clinic start generated code]*/
1628
1629
static PyObject *
1630
_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1631
                  PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1632
                  PyObject *indexgroup)
1633
/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1634
3.72k
{
1635
    /* "compile" pattern descriptor to pattern object */
1636
1637
3.72k
    _sremodulestate *module_state = get_sre_module_state(module);
1638
3.72k
    PatternObject* self;
1639
3.72k
    Py_ssize_t i, n;
1640
1641
3.72k
    n = PyList_GET_SIZE(code);
1642
    /* coverity[ampersand_in_size] */
1643
3.72k
    self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1644
3.72k
    if (!self)
1645
0
        return NULL;
1646
3.72k
    self->weakreflist = NULL;
1647
3.72k
    self->pattern = NULL;
1648
3.72k
    self->groupindex = NULL;
1649
3.72k
    self->indexgroup = NULL;
1650
#ifdef Py_DEBUG
1651
    self->fail_after_count = -1;
1652
    self->fail_after_exc = NULL;
1653
#endif
1654
1655
3.72k
    self->codesize = n;
1656
1657
96.5M
    for (i = 0; i < n; i++) {
1658
96.5M
        PyObject *o = PyList_GET_ITEM(code, i);
1659
96.5M
        unsigned long value = PyLong_AsUnsignedLong(o);
1660
96.5M
        if (value == (unsigned long)-1 && PyErr_Occurred()) {
1661
0
            break;
1662
0
        }
1663
96.5M
        self->code[i] = (SRE_CODE) value;
1664
96.5M
        if ((unsigned long) self->code[i] != value) {
1665
0
            PyErr_SetString(PyExc_OverflowError,
1666
0
                            "regular expression code size limit exceeded");
1667
0
            break;
1668
0
        }
1669
96.5M
    }
1670
3.72k
    PyObject_GC_Track(self);
1671
1672
3.72k
    if (PyErr_Occurred()) {
1673
0
        Py_DECREF(self);
1674
0
        return NULL;
1675
0
    }
1676
1677
3.72k
    if (pattern == Py_None) {
1678
0
        self->isbytes = -1;
1679
0
    }
1680
3.72k
    else {
1681
3.72k
        Py_ssize_t p_length;
1682
3.72k
        int charsize;
1683
3.72k
        Py_buffer view;
1684
3.72k
        view.buf = NULL;
1685
3.72k
        if (!getstring(pattern, &p_length, &self->isbytes,
1686
3.72k
                       &charsize, &view)) {
1687
0
            Py_DECREF(self);
1688
0
            return NULL;
1689
0
        }
1690
3.72k
        if (view.buf)
1691
42
            PyBuffer_Release(&view);
1692
3.72k
    }
1693
1694
3.72k
    self->pattern = Py_NewRef(pattern);
1695
1696
3.72k
    self->flags = flags;
1697
1698
3.72k
    self->groups = groups;
1699
1700
3.72k
    if (PyDict_GET_SIZE(groupindex) > 0) {
1701
55
        self->groupindex = Py_NewRef(groupindex);
1702
55
        if (PyTuple_GET_SIZE(indexgroup) > 0) {
1703
55
            self->indexgroup = Py_NewRef(indexgroup);
1704
55
        }
1705
55
    }
1706
1707
3.72k
    if (!_validate(self)) {
1708
0
        Py_DECREF(self);
1709
0
        return NULL;
1710
0
    }
1711
1712
3.72k
    return (PyObject*) self;
1713
3.72k
}
1714
1715
/*[clinic input]
1716
_sre.template
1717
1718
    pattern: object
1719
    template: object(subclass_of="&PyList_Type")
1720
        A list containing interleaved literal strings (str or bytes) and group
1721
        indices (int), as returned by re._parser.parse_template():
1722
            [literal1, group1, ..., literalN, groupN]
1723
    /
1724
1725
[clinic start generated code]*/
1726
1727
static PyObject *
_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
{
    /* template is a list of odd length: a leading literal followed by
     * (group index, literal) pairs, i.e.
     * [literal1, group1, literal2, ..., literalN].
     * Literals are str or bytes, group indices are non-negative ints.
     */
    _sremodulestate *module_state = get_sre_module_state(module);
    TemplateObject *self = NULL;
    Py_ssize_t length = PyList_GET_SIZE(template);
    Py_ssize_t npairs;

    if (length < 1 || (length & 1) == 0) {
        goto bad_template;
    }

    npairs = length / 2;
    self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, npairs);
    if (self == NULL) {
        return NULL;
    }
    self->chunks = 1 + 2*npairs;
    self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));

    for (Py_ssize_t k = 0; k < npairs; k++) {
        Py_ssize_t group = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*k + 1));
        if (group < 0) {
            /* Shrink to the items filled in so far before releasing
               (presumably so deallocation skips uninitialised slots). */
            Py_SET_SIZE(self, k);
            if (group == -1 && PyErr_Occurred()) {
                /* conversion failed: keep that exception */
                Py_DECREF(self);
                return NULL;
            }
            goto bad_template;
        }
        self->items[k].index = group;

        PyObject *text = PyList_GET_ITEM(template, 2*k + 2);
        int is_empty =
            (PyUnicode_Check(text) && !PyUnicode_GET_LENGTH(text)) ||
            (PyBytes_Check(text) && !PyBytes_GET_SIZE(text));
        if (is_empty) {
            /* empty literals are stored as NULL and not counted */
            text = NULL;
            self->chunks--;
        }
        self->items[k].literal = Py_XNewRef(text);
    }
    PyObject_GC_Track(self);
    return (PyObject*) self;

bad_template:
    PyErr_SetString(PyExc_TypeError, "invalid template");
    Py_XDECREF(self);
    return NULL;
}
1778
1779
/* -------------------------------------------------------------------- */
1780
/* Code validation */
1781
1782
/* To learn more about this code, have a look at the _compile() function in
1783
   Lib/sre_compile.py.  The validation functions below check the code array
1784
   for conformance with the code patterns generated there.
1785
1786
   The nice thing about the generated code is that it is position-independent:
1787
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
1788
   the target of a later jump is always earlier than the target of an earlier
1789
   jump.  IOW, this is okay:
1790
1791
   J---------J-------T--------T
1792
    \         \_____/        /
1793
     \______________________/
1794
1795
   but this is not:
1796
1797
   J---------J-------T--------T
1798
    \_________\_____/        /
1799
               \____________/
1800
1801
   It also helps that SRE_CODE is always an unsigned type.
1802
*/
1803
1804
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns -1 from the enclosing function */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros read and update variables of the enclosing function:
   `code` (cursor), `end` (one past the last code word) and the
   destination `op`, `arg` or `skip`.  Each one FAILs when the cursor
   has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL if (skip - adj) exceeds the number of
   code words left, counted from the skip word itself.  `adj` is
   parenthesized so that a compound argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1844
1845
static int
1846
_validate_charset(SRE_CODE *code, SRE_CODE *end)
1847
3.40M
{
1848
    /* Some variables are manipulated by the macros above */
1849
3.40M
    SRE_CODE op;
1850
3.40M
    SRE_CODE arg;
1851
3.40M
    SRE_CODE offset;
1852
3.40M
    int i;
1853
1854
10.1M
    while (code < end) {
1855
6.71M
        GET_OP;
1856
6.71M
        switch (op) {
1857
1858
1.26k
        case SRE_OP_NEGATE:
1859
1.26k
            break;
1860
1861
6.61M
        case SRE_OP_LITERAL:
1862
6.61M
            GET_ARG;
1863
6.61M
            break;
1864
1865
6.61M
        case SRE_OP_RANGE:
1866
11.7k
        case SRE_OP_RANGE_UNI_IGNORE:
1867
11.7k
            GET_ARG;
1868
11.7k
            GET_ARG;
1869
11.7k
            break;
1870
1871
11.7k
        case SRE_OP_CHARSET:
1872
776
            offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1873
776
            if (offset > (uintptr_t)(end - code))
1874
0
                FAIL;
1875
776
            code += offset;
1876
776
            break;
1877
1878
89.5k
        case SRE_OP_BIGCHARSET:
1879
89.5k
            GET_ARG; /* Number of blocks */
1880
89.5k
            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1881
89.5k
            if (offset > (uintptr_t)(end - code))
1882
0
                FAIL;
1883
            /* Make sure that each byte points to a valid block */
1884
23.0M
            for (i = 0; i < 256; i++) {
1885
22.9M
                if (((unsigned char *)code)[i] >= arg)
1886
0
                    FAIL;
1887
22.9M
            }
1888
89.5k
            code += offset;
1889
89.5k
            offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1890
89.5k
            if (offset > (uintptr_t)(end - code))
1891
0
                FAIL;
1892
89.5k
            code += offset;
1893
89.5k
            break;
1894
1895
1.58k
        case SRE_OP_CATEGORY:
1896
1.58k
            GET_ARG;
1897
1.58k
            switch (arg) {
1898
34
            case SRE_CATEGORY_DIGIT:
1899
34
            case SRE_CATEGORY_NOT_DIGIT:
1900
66
            case SRE_CATEGORY_SPACE:
1901
66
            case SRE_CATEGORY_NOT_SPACE:
1902
92
            case SRE_CATEGORY_WORD:
1903
92
            case SRE_CATEGORY_NOT_WORD:
1904
92
            case SRE_CATEGORY_LINEBREAK:
1905
92
            case SRE_CATEGORY_NOT_LINEBREAK:
1906
92
            case SRE_CATEGORY_LOC_WORD:
1907
92
            case SRE_CATEGORY_LOC_NOT_WORD:
1908
202
            case SRE_CATEGORY_UNI_DIGIT:
1909
746
            case SRE_CATEGORY_UNI_NOT_DIGIT:
1910
1.43k
            case SRE_CATEGORY_UNI_SPACE:
1911
1.44k
            case SRE_CATEGORY_UNI_NOT_SPACE:
1912
1.51k
            case SRE_CATEGORY_UNI_WORD:
1913
1.58k
            case SRE_CATEGORY_UNI_NOT_WORD:
1914
1.58k
            case SRE_CATEGORY_UNI_LINEBREAK:
1915
1.58k
            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1916
1.58k
                break;
1917
0
            default:
1918
0
                FAIL;
1919
1.58k
            }
1920
1.58k
            break;
1921
1922
1.58k
        default:
1923
0
            FAIL;
1924
1925
6.71M
        }
1926
6.71M
    }
1927
1928
3.40M
    return 0;
1929
3.40M
}
1930
1931
/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1932
static int
1933
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1934
2.20M
{
1935
    /* Some variables are manipulated by the macros above */
1936
2.20M
    SRE_CODE op;
1937
2.20M
    SRE_CODE arg;
1938
2.20M
    SRE_CODE skip;
1939
1940
2.20M
    VTRACE(("code=%p, end=%p\n", code, end));
1941
1942
2.20M
    if (code > end)
1943
0
        FAIL;
1944
1945
27.6M
    while (code < end) {
1946
25.4M
        GET_OP;
1947
25.4M
        switch (op) {
1948
1949
358k
        case SRE_OP_MARK:
1950
            /* We don't check whether marks are properly nested; the
1951
               sre_match() code is robust even if they don't, and the worst
1952
               you can get is nonsensical match results. */
1953
358k
            GET_ARG;
1954
358k
            if (arg >= 2 * (size_t)groups) {
1955
0
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1956
0
                FAIL;
1957
0
            }
1958
358k
            break;
1959
1960
16.3M
        case SRE_OP_LITERAL:
1961
16.3M
        case SRE_OP_NOT_LITERAL:
1962
16.3M
        case SRE_OP_LITERAL_IGNORE:
1963
16.3M
        case SRE_OP_NOT_LITERAL_IGNORE:
1964
20.1M
        case SRE_OP_LITERAL_UNI_IGNORE:
1965
20.1M
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1966
20.1M
        case SRE_OP_LITERAL_LOC_IGNORE:
1967
20.1M
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1968
20.1M
            GET_ARG;
1969
            /* The arg is just a character, nothing to check */
1970
20.1M
            break;
1971
1972
20.1M
        case SRE_OP_SUCCESS:
1973
81
        case SRE_OP_FAILURE:
1974
            /* Nothing to check; these normally end the matching process */
1975
81
            break;
1976
1977
83.9k
        case SRE_OP_AT:
1978
83.9k
            GET_ARG;
1979
83.9k
            switch (arg) {
1980
39
            case SRE_AT_BEGINNING:
1981
47
            case SRE_AT_BEGINNING_STRING:
1982
71.7k
            case SRE_AT_BEGINNING_LINE:
1983
71.7k
            case SRE_AT_END:
1984
79.6k
            case SRE_AT_END_LINE:
1985
79.7k
            case SRE_AT_END_STRING:
1986
79.7k
            case SRE_AT_BOUNDARY:
1987
79.7k
            case SRE_AT_NON_BOUNDARY:
1988
79.7k
            case SRE_AT_LOC_BOUNDARY:
1989
79.7k
            case SRE_AT_LOC_NON_BOUNDARY:
1990
83.9k
            case SRE_AT_UNI_BOUNDARY:
1991
83.9k
            case SRE_AT_UNI_NON_BOUNDARY:
1992
83.9k
                break;
1993
0
            default:
1994
0
                FAIL;
1995
83.9k
            }
1996
83.9k
            break;
1997
1998
83.9k
        case SRE_OP_ANY:
1999
38.1k
        case SRE_OP_ANY_ALL:
2000
            /* These have no operands */
2001
38.1k
            break;
2002
2003
5.72k
        case SRE_OP_IN:
2004
5.96k
        case SRE_OP_IN_IGNORE:
2005
3.40M
        case SRE_OP_IN_UNI_IGNORE:
2006
3.40M
        case SRE_OP_IN_LOC_IGNORE:
2007
3.40M
            GET_SKIP;
2008
            /* Stop 1 before the end; we check the FAILURE below */
2009
3.40M
            if (_validate_charset(code, code+skip-2))
2010
0
                FAIL;
2011
3.40M
            if (code[skip-2] != SRE_OP_FAILURE)
2012
0
                FAIL;
2013
3.40M
            code += skip-1;
2014
3.40M
            break;
2015
2016
3.72k
        case SRE_OP_INFO:
2017
3.72k
            {
2018
                /* A minimal info field is
2019
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2020
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2021
                   more follows. */
2022
3.72k
                SRE_CODE flags, i;
2023
3.72k
                SRE_CODE *newcode;
2024
3.72k
                GET_SKIP;
2025
3.72k
                newcode = code+skip-1;
2026
3.72k
                GET_ARG; flags = arg;
2027
3.72k
                GET_ARG;
2028
3.72k
                GET_ARG;
2029
                /* Check that only valid flags are present */
2030
3.72k
                if ((flags & ~(SRE_INFO_PREFIX |
2031
3.72k
                               SRE_INFO_LITERAL |
2032
3.72k
                               SRE_INFO_CHARSET)) != 0)
2033
0
                    FAIL;
2034
                /* PREFIX and CHARSET are mutually exclusive */
2035
3.72k
                if ((flags & SRE_INFO_PREFIX) &&
2036
1.64k
                    (flags & SRE_INFO_CHARSET))
2037
0
                    FAIL;
2038
                /* LITERAL implies PREFIX */
2039
3.72k
                if ((flags & SRE_INFO_LITERAL) &&
2040
714
                    !(flags & SRE_INFO_PREFIX))
2041
0
                    FAIL;
2042
                /* Validate the prefix */
2043
3.72k
                if (flags & SRE_INFO_PREFIX) {
2044
1.64k
                    SRE_CODE prefix_len;
2045
1.64k
                    GET_ARG; prefix_len = arg;
2046
1.64k
                    GET_ARG;
2047
                    /* Here comes the prefix string */
2048
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2049
0
                        FAIL;
2050
1.64k
                    code += prefix_len;
2051
                    /* And here comes the overlap table */
2052
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2053
0
                        FAIL;
2054
                    /* Each overlap value should be < prefix_len */
2055
7.14M
                    for (i = 0; i < prefix_len; i++) {
2056
7.14M
                        if (code[i] >= prefix_len)
2057
0
                            FAIL;
2058
7.14M
                    }
2059
1.64k
                    code += prefix_len;
2060
1.64k
                }
2061
                /* Validate the charset */
2062
3.72k
                if (flags & SRE_INFO_CHARSET) {
2063
408
                    if (_validate_charset(code, newcode-1))
2064
0
                        FAIL;
2065
408
                    if (newcode[-1] != SRE_OP_FAILURE)
2066
0
                        FAIL;
2067
408
                    code = newcode;
2068
408
                }
2069
3.31k
                else if (code != newcode) {
2070
0
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
2071
0
                    FAIL;
2072
0
                }
2073
3.72k
            }
2074
3.72k
            break;
2075
2076
28.0k
        case SRE_OP_BRANCH:
2077
28.0k
            {
2078
28.0k
                SRE_CODE *target = NULL;
2079
909k
                for (;;) {
2080
909k
                    GET_SKIP;
2081
909k
                    if (skip == 0)
2082
28.0k
                        break;
2083
                    /* Stop 2 before the end; we check the JUMP below */
2084
881k
                    if (_validate_inner(code, code+skip-3, groups))
2085
0
                        FAIL;
2086
881k
                    code += skip-3;
2087
                    /* Check that it ends with a JUMP, and that each JUMP
2088
                       has the same target */
2089
881k
                    GET_OP;
2090
881k
                    if (op != SRE_OP_JUMP)
2091
0
                        FAIL;
2092
881k
                    GET_SKIP;
2093
881k
                    if (target == NULL)
2094
28.0k
                        target = code+skip-1;
2095
853k
                    else if (code+skip-1 != target)
2096
0
                        FAIL;
2097
881k
                }
2098
28.0k
                if (code != target)
2099
0
                    FAIL;
2100
28.0k
            }
2101
28.0k
            break;
2102
2103
1.27M
        case SRE_OP_REPEAT_ONE:
2104
1.27M
        case SRE_OP_MIN_REPEAT_ONE:
2105
1.27M
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
2106
1.27M
            {
2107
1.27M
                SRE_CODE min, max;
2108
1.27M
                GET_SKIP;
2109
1.27M
                GET_ARG; min = arg;
2110
1.27M
                GET_ARG; max = arg;
2111
1.27M
                if (min > max)
2112
0
                    FAIL;
2113
1.27M
                if (max > SRE_MAXREPEAT)
2114
0
                    FAIL;
2115
1.27M
                if (_validate_inner(code, code+skip-4, groups))
2116
0
                    FAIL;
2117
1.27M
                code += skip-4;
2118
1.27M
                GET_OP;
2119
1.27M
                if (op != SRE_OP_SUCCESS)
2120
0
                    FAIL;
2121
1.27M
            }
2122
1.27M
            break;
2123
2124
1.27M
        case SRE_OP_REPEAT:
2125
41.9k
        case SRE_OP_POSSESSIVE_REPEAT:
2126
41.9k
            {
2127
41.9k
                SRE_CODE op1 = op, min, max;
2128
41.9k
                GET_SKIP;
2129
41.9k
                GET_ARG; min = arg;
2130
41.9k
                GET_ARG; max = arg;
2131
41.9k
                if (min > max)
2132
0
                    FAIL;
2133
41.9k
                if (max > SRE_MAXREPEAT)
2134
0
                    FAIL;
2135
41.9k
                if (_validate_inner(code, code+skip-3, groups))
2136
0
                    FAIL;
2137
41.9k
                code += skip-3;
2138
41.9k
                GET_OP;
2139
41.9k
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
2140
46
                    if (op != SRE_OP_SUCCESS)
2141
0
                        FAIL;
2142
46
                }
2143
41.9k
                else {
2144
41.9k
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
2145
0
                        FAIL;
2146
41.9k
                }
2147
41.9k
            }
2148
41.9k
            break;
2149
2150
41.9k
        case SRE_OP_ATOMIC_GROUP:
2151
157
            {
2152
157
                GET_SKIP;
2153
157
                if (_validate_inner(code, code+skip-2, groups))
2154
0
                    FAIL;
2155
157
                code += skip-2;
2156
157
                GET_OP;
2157
157
                if (op != SRE_OP_SUCCESS)
2158
0
                    FAIL;
2159
157
            }
2160
157
            break;
2161
2162
157
        case SRE_OP_GROUPREF:
2163
849
        case SRE_OP_GROUPREF_IGNORE:
2164
1.60k
        case SRE_OP_GROUPREF_UNI_IGNORE:
2165
1.60k
        case SRE_OP_GROUPREF_LOC_IGNORE:
2166
1.60k
            GET_ARG;
2167
1.60k
            if (arg >= (size_t)groups)
2168
0
                FAIL;
2169
1.60k
            break;
2170
2171
1.60k
        case SRE_OP_GROUPREF_EXISTS:
2172
            /* The regex syntax for this is: '(?(group)then|else)', where
2173
               'group' is either an integer group number or a group name,
2174
               'then' and 'else' are sub-regexes, and 'else' is optional. */
2175
54
            GET_ARG;
2176
54
            if (arg >= (size_t)groups)
2177
0
                FAIL;
2178
54
            GET_SKIP_ADJ(1);
2179
54
            code--; /* The skip is relative to the first arg! */
2180
            /* There are two possibilities here: if there is both a 'then'
2181
               part and an 'else' part, the generated code looks like:
2182
2183
               GROUPREF_EXISTS
2184
               <group>
2185
               <skipyes>
2186
               ...then part...
2187
               JUMP
2188
               <skipno>
2189
               (<skipyes> jumps here)
2190
               ...else part...
2191
               (<skipno> jumps here)
2192
2193
               If there is only a 'then' part, it looks like:
2194
2195
               GROUPREF_EXISTS
2196
               <group>
2197
               <skip>
2198
               ...then part...
2199
               (<skip> jumps here)
2200
2201
               There is no direct way to decide which it is, and we don't want
2202
               to allow arbitrary jumps anywhere in the code; so we just look
2203
               for a JUMP opcode preceding our skip target.
2204
            */
2205
54
            VTRACE(("then part:\n"));
2206
54
            int rc = _validate_inner(code+1, code+skip-1, groups);
2207
54
            if (rc == 1) {
2208
32
                VTRACE(("else part:\n"));
2209
32
                code += skip-2; /* Position after JUMP, at <skipno> */
2210
32
                GET_SKIP;
2211
32
                rc = _validate_inner(code, code+skip-1, groups);
2212
32
            }
2213
54
            if (rc)
2214
0
                FAIL;
2215
54
            code += skip-1;
2216
54
            break;
2217
2218
117
        case SRE_OP_ASSERT:
2219
365
        case SRE_OP_ASSERT_NOT:
2220
365
            GET_SKIP;
2221
365
            GET_ARG; /* 0 for lookahead, width for lookbehind */
2222
365
            code--; /* Back up over arg to simplify math below */
2223
            /* Stop 1 before the end; we check the SUCCESS below */
2224
365
            if (_validate_inner(code+1, code+skip-2, groups))
2225
0
                FAIL;
2226
365
            code += skip-2;
2227
365
            GET_OP;
2228
365
            if (op != SRE_OP_SUCCESS)
2229
0
                FAIL;
2230
365
            break;
2231
2232
365
        case SRE_OP_JUMP:
2233
32
            if (code + 1 != end)
2234
0
                FAIL;
2235
32
            VTRACE(("JUMP: %d\n", __LINE__));
2236
32
            return 1;
2237
2238
0
        default:
2239
0
            FAIL;
2240
2241
25.4M
        }
2242
25.4M
    }
2243
2244
2.20M
    VTRACE(("okay\n"));
2245
2.20M
    return 0;
2246
2.20M
}
2247
2248
static int
2249
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2250
3.72k
{
2251
3.72k
    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2252
3.72k
        code >= end || end[-1] != SRE_OP_SUCCESS)
2253
0
        FAIL;
2254
3.72k
    return _validate_inner(code, end-1, groups);
2255
3.72k
}
2256
2257
static int
2258
_validate(PatternObject *self)
2259
3.72k
{
2260
3.72k
    if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2261
0
    {
2262
0
        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2263
0
        return 0;
2264
0
    }
2265
3.72k
    else
2266
3.72k
        VTRACE(("Success!\n"));
2267
3.72k
    return 1;
2268
3.72k
}
2269
2270
/* -------------------------------------------------------------------- */
2271
/* match methods */
2272
2273
static int
2274
match_traverse(PyObject *op, visitproc visit, void *arg)
2275
31.0k
{
2276
31.0k
    MatchObject *self = _MatchObject_CAST(op);
2277
31.0k
    Py_VISIT(Py_TYPE(self));
2278
31.0k
    Py_VISIT(self->string);
2279
31.0k
    Py_VISIT(self->regs);
2280
31.0k
    Py_VISIT(self->pattern);
2281
31.0k
    return 0;
2282
31.0k
}
2283
2284
static int
2285
match_clear(PyObject *op)
2286
47.8M
{
2287
47.8M
    MatchObject *self = _MatchObject_CAST(op);
2288
47.8M
    Py_CLEAR(self->string);
2289
47.8M
    Py_CLEAR(self->regs);
2290
47.8M
    Py_CLEAR(self->pattern);
2291
47.8M
    return 0;
2292
47.8M
}
2293
2294
static void
2295
match_dealloc(PyObject *self)
2296
47.8M
{
2297
47.8M
    PyTypeObject *tp = Py_TYPE(self);
2298
47.8M
    PyObject_GC_UnTrack(self);
2299
47.8M
    (void)match_clear(self);
2300
47.8M
    tp->tp_free(self);
2301
47.8M
    Py_DECREF(tp);
2302
47.8M
}
2303
2304
static PyObject*
2305
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2306
41.0M
{
2307
41.0M
    Py_ssize_t length;
2308
41.0M
    int isbytes, charsize;
2309
41.0M
    Py_buffer view;
2310
41.0M
    PyObject *result;
2311
41.0M
    const void* ptr;
2312
41.0M
    Py_ssize_t i, j;
2313
2314
41.0M
    assert(0 <= index && index < self->groups);
2315
41.0M
    index *= 2;
2316
2317
41.0M
    if (self->string == Py_None || self->mark[index] < 0) {
2318
        /* return default value if the string or group is undefined */
2319
6.80M
        return Py_NewRef(def);
2320
6.80M
    }
2321
2322
34.2M
    ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2323
34.2M
    if (ptr == NULL)
2324
0
        return NULL;
2325
2326
34.2M
    i = self->mark[index];
2327
34.2M
    j = self->mark[index+1];
2328
34.2M
    i = Py_MIN(i, length);
2329
34.2M
    j = Py_MIN(j, length);
2330
34.2M
    result = getslice(isbytes, ptr, self->string, i, j);
2331
34.2M
    if (isbytes && view.buf != NULL)
2332
370k
        PyBuffer_Release(&view);
2333
34.2M
    return result;
2334
34.2M
}
2335
2336
static Py_ssize_t
2337
match_getindex(MatchObject* self, PyObject* index)
2338
56.7M
{
2339
56.7M
    Py_ssize_t i;
2340
2341
56.7M
    if (index == NULL)
2342
        /* Default value */
2343
15.0M
        return 0;
2344
2345
41.7M
    if (PyIndex_Check(index)) {
2346
35.3M
        i = PyNumber_AsSsize_t(index, NULL);
2347
35.3M
    }
2348
6.32M
    else {
2349
6.32M
        i = -1;
2350
2351
6.32M
        if (self->pattern->groupindex) {
2352
6.32M
            index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2353
6.32M
            if (index && PyLong_Check(index)) {
2354
6.32M
                i = PyLong_AsSsize_t(index);
2355
6.32M
            }
2356
6.32M
        }
2357
6.32M
    }
2358
41.7M
    if (i < 0 || i >= self->groups) {
2359
        /* raise IndexError if we were given a bad group number */
2360
0
        if (!PyErr_Occurred()) {
2361
0
            PyErr_SetString(PyExc_IndexError, "no such group");
2362
0
        }
2363
0
        return -1;
2364
0
    }
2365
2366
    // Check that i*2 cannot overflow to make static analyzers happy
2367
41.7M
    assert((size_t)i <= SRE_MAXGROUPS);
2368
41.7M
    return i;
2369
41.7M
}
2370
2371
static PyObject*
2372
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2373
41.0M
{
2374
41.0M
    Py_ssize_t i = match_getindex(self, index);
2375
2376
41.0M
    if (i < 0) {
2377
0
        return NULL;
2378
0
    }
2379
2380
41.0M
    return match_getslice_by_index(self, i, def);
2381
41.0M
}
2382
2383
/*[clinic input]
2384
@permit_long_summary
2385
_sre.SRE_Match.expand
2386
2387
    template: object
2388
2389
Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2390
[clinic start generated code]*/
2391
2392
static PyObject *
2393
_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2394
/*[clinic end generated code: output=931b58ccc323c3a1 input=dc74d81265376ac3]*/
2395
0
{
2396
0
    _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2397
0
    PyObject *filter = compile_template(module_state, self->pattern, template);
2398
0
    if (filter == NULL) {
2399
0
        return NULL;
2400
0
    }
2401
0
    PyObject *result = expand_template((TemplateObject *)filter, self);
2402
0
    Py_DECREF(filter);
2403
0
    return result;
2404
0
}
2405
2406
/* Implementation of Match.group(): with no arguments return group 0, with
   one argument return that group, otherwise return a tuple with one item
   per argument.  Undefined groups yield None. */
static PyObject*
match_group(PyObject *op, PyObject* args)
{
    MatchObject *self = _MatchObject_CAST(op);
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);

    if (nargs == 0) {
        return match_getslice(self, _PyLong_GetZero(), Py_None);
    }
    if (nargs == 1) {
        return match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
    }

    /* fetch multiple items */
    PyObject *items = PyTuple_New(nargs);
    if (items == NULL) {
        return NULL;
    }
    for (Py_ssize_t pos = 0; pos < nargs; pos++) {
        PyObject *slice = match_getslice(
            self, PyTuple_GET_ITEM(args, pos), Py_None);
        if (slice == NULL) {
            Py_DECREF(items);
            return NULL;
        }
        PyTuple_SET_ITEM(items, pos, slice);
    }
    return items;
}
2441
2442
/* Subscript access: look up a single group by number or name, with None
   as the value for an undefined group. */
static PyObject*
match_getitem(PyObject *op, PyObject* name)
{
    MatchObject *match = _MatchObject_CAST(op);

    return match_getslice(match, name, Py_None);
}
2448
2449
/*[clinic input]
2450
_sre.SRE_Match.groups
2451
2452
    default: object = None
2453
        Is used for groups that did not participate in the match.
2454
2455
Return a tuple containing all the subgroups of the match, from 1.
2456
[clinic start generated code]*/
2457
2458
static PyObject *
2459
_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2460
/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2461
323
{
2462
323
    PyObject* result;
2463
323
    Py_ssize_t index;
2464
2465
323
    result = PyTuple_New(self->groups-1);
2466
323
    if (!result)
2467
0
        return NULL;
2468
2469
2.74k
    for (index = 1; index < self->groups; index++) {
2470
2.42k
        PyObject* item;
2471
2.42k
        item = match_getslice_by_index(self, index, default_value);
2472
2.42k
        if (!item) {
2473
0
            Py_DECREF(result);
2474
0
            return NULL;
2475
0
        }
2476
2.42k
        PyTuple_SET_ITEM(result, index-1, item);
2477
2.42k
    }
2478
2479
323
    return result;
2480
323
}
2481
2482
/*[clinic input]
2483
@permit_long_summary
2484
_sre.SRE_Match.groupdict
2485
2486
    default: object = None
2487
        Is used for groups that did not participate in the match.
2488
2489
Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2490
[clinic start generated code]*/
2491
2492
static PyObject *
2493
_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2494
/*[clinic end generated code: output=29917c9073e41757 input=a8d3a1dc80336872]*/
2495
144
{
2496
144
    PyObject *result;
2497
144
    PyObject *key;
2498
144
    PyObject *value;
2499
144
    Py_ssize_t pos = 0;
2500
144
    Py_hash_t hash;
2501
2502
144
    result = PyDict_New();
2503
144
    if (!result || !self->pattern->groupindex)
2504
0
        return result;
2505
2506
144
    Py_BEGIN_CRITICAL_SECTION(self->pattern->groupindex);
2507
960
    while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2508
816
        int status;
2509
816
        Py_INCREF(key);
2510
816
        value = match_getslice(self, key, default_value);
2511
816
        if (!value) {
2512
0
            Py_DECREF(key);
2513
0
            Py_CLEAR(result);
2514
0
            goto exit;
2515
0
        }
2516
816
        status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2517
816
        Py_DECREF(value);
2518
816
        Py_DECREF(key);
2519
816
        if (status < 0) {
2520
0
            Py_CLEAR(result);
2521
0
            goto exit;
2522
0
        }
2523
816
    }
2524
144
exit:;
2525
144
    Py_END_CRITICAL_SECTION();
2526
2527
144
    return result;
2528
144
}
2529
2530
/*[clinic input]
2531
_sre.SRE_Match.start -> Py_ssize_t
2532
2533
    group: object(c_default="NULL") = 0
2534
    /
2535
2536
Return index of the start of the substring matched by group.
2537
[clinic start generated code]*/
2538
2539
static Py_ssize_t
2540
_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2541
/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2542
1.19M
{
2543
1.19M
    Py_ssize_t index = match_getindex(self, group);
2544
2545
1.19M
    if (index < 0) {
2546
0
        return -1;
2547
0
    }
2548
2549
    /* mark is -1 if group is undefined */
2550
1.19M
    return self->mark[index*2];
2551
1.19M
}
2552
2553
/*[clinic input]
2554
_sre.SRE_Match.end -> Py_ssize_t
2555
2556
    group: object(c_default="NULL") = 0
2557
    /
2558
2559
Return index of the end of the substring matched by group.
2560
[clinic start generated code]*/
2561
2562
static Py_ssize_t
2563
_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2564
/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2565
11.5M
{
2566
11.5M
    Py_ssize_t index = match_getindex(self, group);
2567
2568
11.5M
    if (index < 0) {
2569
0
        return -1;
2570
0
    }
2571
2572
    /* mark is -1 if group is undefined */
2573
11.5M
    return self->mark[index*2+1];
2574
11.5M
}
2575
2576
/* Build the tuple (i1, i2) from two Py_ssize_t values.  Returns NULL if
   either integer cannot be created. */
LOCAL(PyObject*)
_pair(Py_ssize_t i1, Py_ssize_t i2)
{
    PyObject *first = PyLong_FromSsize_t(i1);
    if (first == NULL) {
        return NULL;
    }

    PyObject *second = PyLong_FromSsize_t(i2);
    if (second == NULL) {
        Py_DECREF(first);
        return NULL;
    }

    /* Both references are handed over to the tuple constructor */
    return _PyTuple_FromPairSteal(first, second);
}
2591
2592
/*[clinic input]
2593
@permit_long_summary
2594
_sre.SRE_Match.span
2595
2596
    group: object(c_default="NULL") = 0
2597
    /
2598
2599
For match object m, return the 2-tuple (m.start(group), m.end(group)).
2600
[clinic start generated code]*/
2601
2602
static PyObject *
2603
_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2604
/*[clinic end generated code: output=f02ae40594d14fe6 input=834cfe444f0f55cf]*/
2605
2.92M
{
2606
2.92M
    Py_ssize_t index = match_getindex(self, group);
2607
2608
2.92M
    if (index < 0) {
2609
0
        return NULL;
2610
0
    }
2611
2612
    /* marks are -1 if group is undefined */
2613
2.92M
    return _pair(self->mark[index*2], self->mark[index*2+1]);
2614
2.92M
}
2615
2616
static PyObject*
2617
match_regs(MatchObject* self)
2618
0
{
2619
0
    PyObject* regs;
2620
0
    PyObject* item;
2621
0
    Py_ssize_t index;
2622
2623
0
    regs = PyTuple_New(self->groups);
2624
0
    if (!regs)
2625
0
        return NULL;
2626
2627
0
    for (index = 0; index < self->groups; index++) {
2628
0
        item = _pair(self->mark[index*2], self->mark[index*2+1]);
2629
0
        if (!item) {
2630
0
            Py_DECREF(regs);
2631
0
            return NULL;
2632
0
        }
2633
0
        PyTuple_SET_ITEM(regs, index, item);
2634
0
    }
2635
2636
0
    self->regs = Py_NewRef(regs);
2637
2638
0
    return regs;
2639
0
}
2640
2641
/*[clinic input]
2642
_sre.SRE_Match.__copy__
2643
2644
[clinic start generated code]*/
2645
2646
static PyObject *
2647
_sre_SRE_Match___copy___impl(MatchObject *self)
2648
/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2649
0
{
2650
0
    return Py_NewRef(self);
2651
0
}
2652
2653
/*[clinic input]
2654
_sre.SRE_Match.__deepcopy__
2655
2656
    memo: object
2657
    /
2658
2659
[clinic start generated code]*/
2660
2661
static PyObject *
2662
_sre_SRE_Match___deepcopy___impl(MatchObject *self, PyObject *memo)
2663
/*[clinic end generated code: output=2b657578eb03f4a3 input=779d12a31c2c325e]*/
2664
0
{
2665
0
    return Py_NewRef(self);
2666
0
}
2667
2668
PyDoc_STRVAR(match_doc,
2669
"The result of re.search(), re.prefixmatch(), and re.fullmatch().\n\
2670
Match objects always have a boolean value of True.");
2671
2672
PyDoc_STRVAR(match_group_doc,
2673
"group([group1, ...]) -> str or tuple.\n\
2674
    Return subgroup(s) of the match by indices or names.\n\
2675
    For 0 returns the entire match.");
2676
2677
/* Getter for lastindex: the stored index as an int, or None when the
   stored value is negative. */
static PyObject *
match_lastindex_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->lastindex < 0) {
        Py_RETURN_NONE;
    }
    return PyLong_FromSsize_t(match->lastindex);
}
2685
2686
/* Getter for lastgroup: the entry of the pattern's indexgroup tuple at
   position lastindex.  Returns None when the pattern has no indexgroup
   tuple or lastindex is outside it. */
static PyObject *
match_lastgroup_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);
    PyObject *names = match->pattern->indexgroup;

    if (names == NULL) {
        Py_RETURN_NONE;
    }
    if (match->lastindex < 0 ||
        match->lastindex >= PyTuple_GET_SIZE(names))
    {
        Py_RETURN_NONE;
    }

    PyObject *name = PyTuple_GET_ITEM(names, match->lastindex);
    return Py_NewRef(name);
}
2700
2701
/* Getter for regs: return the cached tuple if there is one, otherwise
   build it with match_regs(), which also stores it on the match. */
static PyObject *
match_regs_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *match = _MatchObject_CAST(op);

    if (match->regs == NULL) {
        return match_regs(match);
    }
    return Py_NewRef(match->regs);
}
2710
2711
/* repr() of a match object.
 *
 * Formats the type name, the span of group 0 (mark[0], mark[1]) and the
 * repr of the group-0 slice, the latter limited by the "%.50R" precision.
 * Returns NULL if the slice cannot be obtained or formatting fails. */
static PyObject *
match_repr(PyObject *op)
{
    MatchObject *self = _MatchObject_CAST(op);

    PyObject *whole = match_getslice_by_index(self, 0, Py_None);
    if (whole == NULL) {
        return NULL;
    }

    PyObject *text = PyUnicode_FromFormat(
            "<%s object; span=(%zd, %zd), match=%.50R>",
            Py_TYPE(self)->tp_name,
            self->mark[0], self->mark[1], whole);

    /* The slice was only needed for formatting. */
    Py_DECREF(whole);
    return text;
}
2726
2727
2728
static PyObject*
2729
pattern_new_match(_sremodulestate* module_state,
2730
                  PatternObject* pattern,
2731
                  SRE_STATE* state,
2732
                  Py_ssize_t status)
2733
64.0M
{
2734
    /* create match object (from state object) */
2735
2736
64.0M
    MatchObject* match;
2737
64.0M
    Py_ssize_t i, j;
2738
64.0M
    char* base;
2739
64.0M
    int n;
2740
2741
64.0M
    if (status > 0) {
2742
2743
        /* create match object (with room for extra group marks) */
2744
        /* coverity[ampersand_in_size] */
2745
47.8M
        match = PyObject_GC_NewVar(MatchObject,
2746
47.8M
                                   module_state->Match_Type,
2747
47.8M
                                   2*(pattern->groups+1));
2748
47.8M
        if (!match)
2749
0
            return NULL;
2750
2751
47.8M
        Py_INCREF(pattern);
2752
47.8M
        match->pattern = pattern;
2753
2754
47.8M
        match->string = Py_NewRef(state->string);
2755
2756
47.8M
        match->regs = NULL;
2757
47.8M
        match->groups = pattern->groups+1;
2758
2759
        /* fill in group slices */
2760
2761
47.8M
        base = (char*) state->beginning;
2762
47.8M
        n = state->charsize;
2763
2764
47.8M
        match->mark[0] = ((char*) state->start - base) / n;
2765
47.8M
        match->mark[1] = ((char*) state->ptr - base) / n;
2766
2767
90.9M
        for (i = j = 0; i < pattern->groups; i++, j+=2)
2768
43.1M
            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2769
35.7M
                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2770
35.7M
                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2771
2772
                /* check wrong span */
2773
35.7M
                if (match->mark[j+2] > match->mark[j+3]) {
2774
0
                    PyErr_SetString(PyExc_SystemError,
2775
0
                                    "The span of capturing group is wrong,"
2776
0
                                    " please report a bug for the re module.");
2777
0
                    Py_DECREF(match);
2778
0
                    return NULL;
2779
0
                }
2780
35.7M
            } else
2781
7.32M
                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2782
2783
47.8M
        match->pos = state->pos;
2784
47.8M
        match->endpos = state->endpos;
2785
2786
47.8M
        match->lastindex = state->lastindex;
2787
2788
47.8M
        PyObject_GC_Track(match);
2789
47.8M
        return (PyObject*) match;
2790
2791
47.8M
    } else if (status == 0) {
2792
2793
        /* no match */
2794
16.1M
        Py_RETURN_NONE;
2795
2796
16.1M
    }
2797
2798
    /* internal error */
2799
0
    pattern_error(status);
2800
0
    return NULL;
2801
64.0M
}
2802
2803
2804
/* -------------------------------------------------------------------- */
2805
/* scanner methods (experimental) */
2806
2807
static int
2808
scanner_traverse(PyObject *op, visitproc visit, void *arg)
2809
1.07k
{
2810
1.07k
    ScannerObject *self = _ScannerObject_CAST(op);
2811
1.07k
    Py_VISIT(Py_TYPE(self));
2812
1.07k
    Py_VISIT(self->pattern);
2813
1.07k
    return 0;
2814
1.07k
}
2815
2816
static int
2817
scanner_clear(PyObject *op)
2818
355k
{
2819
355k
    ScannerObject *self = _ScannerObject_CAST(op);
2820
355k
    Py_CLEAR(self->pattern);
2821
355k
    return 0;
2822
355k
}
2823
2824
static void
2825
scanner_dealloc(PyObject *self)
2826
355k
{
2827
355k
    PyTypeObject *tp = Py_TYPE(self);
2828
355k
    PyObject_GC_UnTrack(self);
2829
355k
    ScannerObject *scanner = _ScannerObject_CAST(self);
2830
355k
    state_fini(&scanner->state);
2831
355k
    (void)scanner_clear(self);
2832
355k
    tp->tp_free(self);
2833
355k
    Py_DECREF(tp);
2834
355k
}
2835
2836
static int
2837
scanner_begin(ScannerObject* self)
2838
3.26M
{
2839
#ifdef Py_GIL_DISABLED
2840
    int was_executing = _Py_atomic_exchange_int(&self->executing, 1);
2841
#else
2842
3.26M
    int was_executing = self->executing;
2843
3.26M
    self->executing = 1;
2844
3.26M
#endif
2845
3.26M
    if (was_executing) {
2846
0
        PyErr_SetString(PyExc_ValueError,
2847
0
                        "regular expression scanner already executing");
2848
0
        return 0;
2849
0
    }
2850
3.26M
    return 1;
2851
3.26M
}
2852
2853
static void
2854
scanner_end(ScannerObject* self)
2855
3.26M
{
2856
3.26M
    assert(FT_ATOMIC_LOAD_INT_RELAXED(self->executing));
2857
3.26M
    FT_ATOMIC_STORE_INT(self->executing, 0);
2858
3.26M
}
2859
2860
/*[clinic input]
2861
_sre.SRE_Scanner.prefixmatch
2862
2863
    cls: defining_class
2864
    /
2865
2866
[clinic start generated code]*/
2867
2868
static PyObject *
2869
_sre_SRE_Scanner_prefixmatch_impl(ScannerObject *self, PyTypeObject *cls)
2870
/*[clinic end generated code: output=02b3b9d2954a2157 input=3049b20466c56a8e]*/
2871
0
{
2872
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2873
0
    SRE_STATE* state = &self->state;
2874
0
    PyObject* match;
2875
0
    Py_ssize_t status;
2876
2877
0
    if (!scanner_begin(self)) {
2878
0
        return NULL;
2879
0
    }
2880
0
    if (state->start == NULL) {
2881
0
        scanner_end(self);
2882
0
        Py_RETURN_NONE;
2883
0
    }
2884
2885
0
    state_reset(state);
2886
2887
0
    state->ptr = state->start;
2888
2889
0
    status = sre_match(state, PatternObject_GetCode(self->pattern));
2890
0
    if (PyErr_Occurred()) {
2891
0
        scanner_end(self);
2892
0
        return NULL;
2893
0
    }
2894
2895
0
    match = pattern_new_match(module_state, self->pattern,
2896
0
                              state, status);
2897
2898
0
    if (status == 0)
2899
0
        state->start = NULL;
2900
0
    else {
2901
0
        state->must_advance = (state->ptr == state->start);
2902
0
        state->start = state->ptr;
2903
0
    }
2904
2905
0
    scanner_end(self);
2906
0
    return match;
2907
0
}
2908
2909
2910
/*[clinic input]
2911
_sre.SRE_Scanner.search
2912
2913
    cls: defining_class
2914
    /
2915
2916
[clinic start generated code]*/
2917
2918
static PyObject *
2919
_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2920
/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2921
3.26M
{
2922
3.26M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2923
3.26M
    SRE_STATE* state = &self->state;
2924
3.26M
    PyObject* match;
2925
3.26M
    Py_ssize_t status;
2926
2927
3.26M
    if (!scanner_begin(self)) {
2928
0
        return NULL;
2929
0
    }
2930
3.26M
    if (state->start == NULL) {
2931
0
        scanner_end(self);
2932
0
        Py_RETURN_NONE;
2933
0
    }
2934
2935
3.26M
    state_reset(state);
2936
2937
3.26M
    state->ptr = state->start;
2938
2939
3.26M
    status = sre_search(state, PatternObject_GetCode(self->pattern));
2940
3.26M
    if (PyErr_Occurred()) {
2941
0
        scanner_end(self);
2942
0
        return NULL;
2943
0
    }
2944
2945
3.26M
    match = pattern_new_match(module_state, self->pattern,
2946
3.26M
                              state, status);
2947
2948
3.26M
    if (status == 0)
2949
355k
        state->start = NULL;
2950
2.90M
    else {
2951
2.90M
        state->must_advance = (state->ptr == state->start);
2952
2.90M
        state->start = state->ptr;
2953
2.90M
    }
2954
2955
3.26M
    scanner_end(self);
2956
3.26M
    return match;
2957
3.26M
}
2958
2959
static PyObject *
2960
pattern_scanner(_sremodulestate *module_state,
2961
                PatternObject *self,
2962
                PyObject *string,
2963
                Py_ssize_t pos,
2964
                Py_ssize_t endpos)
2965
355k
{
2966
355k
    ScannerObject* scanner;
2967
2968
    /* create scanner object */
2969
355k
    scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2970
355k
    if (!scanner)
2971
0
        return NULL;
2972
355k
    scanner->pattern = NULL;
2973
355k
    scanner->executing = 0;
2974
2975
    /* create search state object */
2976
355k
    if (!state_init(&scanner->state, self, string, pos, endpos)) {
2977
0
        Py_DECREF(scanner);
2978
0
        return NULL;
2979
0
    }
2980
2981
355k
    Py_INCREF(self);
2982
355k
    scanner->pattern = self;
2983
2984
355k
    PyObject_GC_Track(scanner);
2985
355k
    return (PyObject*) scanner;
2986
355k
}
2987
2988
/* -------------------------------------------------------------------- */
2989
/* template methods */
2990
2991
static int
2992
template_traverse(PyObject *op, visitproc visit, void *arg)
2993
0
{
2994
0
    TemplateObject *self = _TemplateObject_CAST(op);
2995
0
    Py_VISIT(Py_TYPE(self));
2996
0
    Py_VISIT(self->literal);
2997
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
2998
0
        Py_VISIT(self->items[i].literal);
2999
0
    }
3000
0
    return 0;
3001
0
}
3002
3003
static int
3004
template_clear(PyObject *op)
3005
0
{
3006
0
    TemplateObject *self = _TemplateObject_CAST(op);
3007
0
    Py_CLEAR(self->literal);
3008
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3009
0
        Py_CLEAR(self->items[i].literal);
3010
0
    }
3011
0
    return 0;
3012
0
}
3013
3014
static void
3015
template_dealloc(PyObject *self)
3016
0
{
3017
0
    PyTypeObject *tp = Py_TYPE(self);
3018
0
    PyObject_GC_UnTrack(self);
3019
0
    (void)template_clear(self);
3020
0
    tp->tp_free(self);
3021
0
    Py_DECREF(tp);
3022
0
}
3023
3024
static PyObject *
3025
expand_template(TemplateObject *self, MatchObject *match)
3026
0
{
3027
0
    if (Py_SIZE(self) == 0) {
3028
0
        return Py_NewRef(self->literal);
3029
0
    }
3030
3031
0
    PyObject *result = NULL;
3032
0
    Py_ssize_t count = 0;  // the number of non-empty chunks
3033
    /* For small number of strings use a buffer allocated on the stack,
3034
     * otherwise use a list object. */
3035
0
    PyObject *buffer[10];
3036
0
    PyObject **out = buffer;
3037
0
    PyObject *list = NULL;
3038
0
    if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
3039
0
        !PyUnicode_Check(self->literal))
3040
0
    {
3041
0
        list = PyList_New(self->chunks);
3042
0
        if (!list) {
3043
0
            return NULL;
3044
0
        }
3045
0
        out = &PyList_GET_ITEM(list, 0);
3046
0
    }
3047
3048
0
    out[count++] = Py_NewRef(self->literal);
3049
0
    for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
3050
0
        Py_ssize_t index = self->items[i].index;
3051
0
        if (index >= match->groups) {
3052
0
            PyErr_SetString(PyExc_IndexError, "no such group");
3053
0
            goto cleanup;
3054
0
        }
3055
0
        PyObject *item = match_getslice_by_index(match, index, Py_None);
3056
0
        if (item == NULL) {
3057
0
            goto cleanup;
3058
0
        }
3059
0
        if (item != Py_None) {
3060
0
            out[count++] = Py_NewRef(item);
3061
0
        }
3062
0
        Py_DECREF(item);
3063
3064
0
        PyObject *literal = self->items[i].literal;
3065
0
        if (literal != NULL) {
3066
0
            out[count++] = Py_NewRef(literal);
3067
0
        }
3068
0
    }
3069
3070
0
    if (PyUnicode_Check(self->literal)) {
3071
0
        result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
3072
0
    }
3073
0
    else {
3074
0
        Py_SET_SIZE(list, count);
3075
0
        result = PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
3076
0
    }
3077
3078
0
cleanup:
3079
0
    if (list) {
3080
0
        Py_DECREF(list);
3081
0
    }
3082
0
    else {
3083
0
        for (Py_ssize_t i = 0; i < count; i++) {
3084
0
            Py_DECREF(out[i]);
3085
0
        }
3086
0
    }
3087
0
    return result;
3088
0
}
3089
3090
3091
static Py_hash_t
3092
pattern_hash(PyObject *op)
3093
0
{
3094
0
    PatternObject *self = _PatternObject_CAST(op);
3095
3096
0
    Py_hash_t hash, hash2;
3097
3098
0
    hash = PyObject_Hash(self->pattern);
3099
0
    if (hash == -1) {
3100
0
        return -1;
3101
0
    }
3102
3103
0
    hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize);
3104
0
    hash ^= hash2;
3105
3106
0
    hash ^= self->flags;
3107
0
    hash ^= self->isbytes;
3108
0
    hash ^= self->codesize;
3109
3110
0
    if (hash == -1) {
3111
0
        hash = -2;
3112
0
    }
3113
0
    return hash;
3114
0
}
3115
3116
/* Rich comparison for pattern objects.
 *
 * Only == and != are supported, and only against an object whose type is
 * exactly the Pattern type of this module; everything else yields
 * NotImplemented.  Two patterns are equal when flags, isbytes, codesize,
 * the compiled code and the pattern source objects all compare equal.
 * Returns NULL if comparing the source objects raises. */
static PyObject*
pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
{
    PyTypeObject *tp = Py_TYPE(lefto);
    _sremodulestate *module_state = get_sre_module_state_by_class(tp);

    if (op != Py_EQ && op != Py_NE) {
        Py_RETURN_NOTIMPLEMENTED;
    }
    if (!Py_IS_TYPE(righto, module_state->Pattern_Type)) {
        Py_RETURN_NOTIMPLEMENTED;
    }
    if (lefto == righto) {
        /* a pattern is equal to itself */
        return PyBool_FromLong(op == Py_EQ);
    }

    PatternObject *left = (PatternObject *)lefto;
    PatternObject *right = (PatternObject *)righto;

    /* Compare the code and the pattern because the same pattern can
       produce different codes depending on the locale used to compile the
       pattern when the re.LOCALE flag is used. Don't compare groups,
       indexgroup nor groupindex: they are derived from the pattern. */
    int equal = (left->flags == right->flags
                 && left->isbytes == right->isbytes
                 && left->codesize == right->codesize
                 && memcmp(left->code, right->code,
                           sizeof(left->code[0]) * left->codesize) == 0);

    if (equal) {
        equal = PyObject_RichCompareBool(left->pattern, right->pattern,
                                         Py_EQ);
        if (equal < 0) {
            return NULL;
        }
    }

    return PyBool_FromLong(op == Py_NE ? !equal : equal);
}
3164
3165
#include "clinic/sre.c.h"
3166
3167
/* Method table for Pattern objects.
 *
 * The first two entries must remain "prefixmatch" followed by "match":
 * in Py_DEBUG builds sre_exec() passes this table to
 * _assert_match_aliases_prefixmatch(), which checks entries 0 and 1. */
static PyMethodDef pattern_methods[] = {
3168
    _SRE_SRE_PATTERN_PREFIXMATCH_METHODDEF
3169
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3170
     * to avoid duplicating the argument parsing boilerplate code. */
3171
    {"match", _PyCFunction_CAST(_sre_SRE_Pattern_prefixmatch),
3172
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3173
     _sre_SRE_Pattern_prefixmatch__doc__},
3174
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
3175
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
3176
    _SRE_SRE_PATTERN_SUB_METHODDEF
3177
    _SRE_SRE_PATTERN_SUBN_METHODDEF
3178
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
3179
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
3180
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
3181
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
3182
    _SRE_SRE_PATTERN___COPY___METHODDEF
3183
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3184
    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
3185
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3186
     PyDoc_STR("Patterns are generic over the type of string they handle (str or bytes)")},
3187
    {NULL, NULL}
3188
};
3189
3190
/* Computed attributes of Pattern objects.  "groupindex" has a getter
 * only; its setter slot is NULL. */
static PyGetSetDef pattern_getset[] = {
3191
    {"groupindex", pattern_groupindex, NULL,
3192
      "A dictionary mapping group names to group numbers."},
3193
    {NULL}  /* Sentinel */
3194
};
3195
3196
/* PAT_OFF(x) is the byte offset of field x inside PatternObject. */
#define PAT_OFF(x) offsetof(PatternObject, x)
3197
/* PatternObject fields exposed directly as read-only attributes.
 * "__weaklistoffset__" publishes the offset of the weakreflist field. */
static PyMemberDef pattern_members[] = {
3198
    {"pattern",    _Py_T_OBJECT,    PAT_OFF(pattern),       Py_READONLY,
3199
     "The pattern string from which the RE object was compiled."},
3200
    {"flags",      Py_T_INT,       PAT_OFF(flags),         Py_READONLY,
3201
     "The regex matching flags."},
3202
    {"groups",     Py_T_PYSSIZET,  PAT_OFF(groups),        Py_READONLY,
3203
     "The number of capturing groups in the pattern."},
3204
    {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PatternObject, weakreflist), Py_READONLY},
3205
    {NULL}  /* Sentinel */
3206
};
3207
3208
/* Type slots for the Pattern type; referenced by pattern_spec. */
static PyType_Slot pattern_slots[] = {
3209
    {Py_tp_dealloc, pattern_dealloc},
3210
    {Py_tp_repr, pattern_repr},
3211
    {Py_tp_hash, pattern_hash},
3212
    {Py_tp_doc, (void *)pattern_doc},
3213
    {Py_tp_richcompare, pattern_richcompare},
3214
    {Py_tp_methods, pattern_methods},
3215
    {Py_tp_members, pattern_members},
3216
    {Py_tp_getset, pattern_getset},
3217
    {Py_tp_traverse, pattern_traverse},
3218
    {Py_tp_clear, pattern_clear},
3219
    {0, NULL},
3220
};
3221
3222
/* Spec from which sre_exec() creates the "re.Pattern" heap type.
 * Instances are variable-sized, with one SRE_CODE per item.  The flags
 * make the type immutable, GC-enabled and not directly instantiable. */
static PyType_Spec pattern_spec = {
3223
    .name = "re.Pattern",
3224
    .basicsize = sizeof(PatternObject),
3225
    .itemsize = sizeof(SRE_CODE),
3226
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3227
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3228
    .slots = pattern_slots,
3229
};
3230
3231
/* Method table for Match objects.  "group" is registered by hand with
 * METH_VARARGS; the remaining methods come from Clinic-generated
 * METHODDEF macros. */
static PyMethodDef match_methods[] = {
3232
    {"group", match_group, METH_VARARGS, match_group_doc},
3233
    _SRE_SRE_MATCH_START_METHODDEF
3234
    _SRE_SRE_MATCH_END_METHODDEF
3235
    _SRE_SRE_MATCH_SPAN_METHODDEF
3236
    _SRE_SRE_MATCH_GROUPS_METHODDEF
3237
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3238
    _SRE_SRE_MATCH_EXPAND_METHODDEF
3239
    _SRE_SRE_MATCH___COPY___METHODDEF
3240
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3241
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3242
     PyDoc_STR("Matches are generic over the type of string which was matched (str or bytes)")},
3243
    {NULL, NULL}
3244
};
3245
3246
/* Computed attributes of Match objects.  All three have a getter only;
 * "regs" carries no docstring. */
static PyGetSetDef match_getset[] = {
3247
    {"lastindex", match_lastindex_get, NULL,
3248
     "The integer index of the last matched capturing group."},
3249
    {"lastgroup", match_lastgroup_get, NULL,
3250
     "The name of the last matched capturing group."},
3251
    {"regs", match_regs_get, NULL, NULL},
3252
    {NULL}
3253
};
3254
3255
/* MATCH_OFF(x) is the byte offset of field x inside MatchObject. */
#define MATCH_OFF(x) offsetof(MatchObject, x)
3256
/* MatchObject fields exposed directly as read-only attributes.  The
 * attribute "re" maps to the struct field named "pattern". */
static PyMemberDef match_members[] = {
3257
    {"string",  _Py_T_OBJECT,   MATCH_OFF(string),  Py_READONLY,
3258
     "The string passed to match() or search()."},
3259
    {"re",      _Py_T_OBJECT,   MATCH_OFF(pattern), Py_READONLY,
3260
     "The regular expression object."},
3261
    {"pos",     Py_T_PYSSIZET, MATCH_OFF(pos),     Py_READONLY,
3262
     "The index into the string at which the RE engine started looking for a match."},
3263
    {"endpos",  Py_T_PYSSIZET, MATCH_OFF(endpos),  Py_READONLY,
3264
     "The index into the string beyond which the RE engine will not go."},
3265
    {NULL}
3266
};
3267
3268
/* FIXME: implement setattr("string", None) as a special case (to
3269
   detach the associated string, if any) */
/* Type slots for the Match type; referenced by match_spec. */
static PyType_Slot match_slots[] = {
3271
    {Py_tp_dealloc, match_dealloc},
3272
    {Py_tp_repr, match_repr},
3273
    {Py_tp_doc, (void *)match_doc},
3274
    {Py_tp_methods, match_methods},
3275
    {Py_tp_members, match_members},
3276
    {Py_tp_getset, match_getset},
3277
    {Py_tp_traverse, match_traverse},
3278
    {Py_tp_clear, match_clear},
3279
3280
    /* As mapping.
3281
     *
3282
     * Match objects do not support length or assignment, but do support
3283
     * __getitem__.
3284
     */
3285
    {Py_mp_subscript, match_getitem},
3286
3287
    {0, NULL},
3288
};
3289
3290
/* Spec from which sre_exec() creates the "re.Match" heap type.
 * Instances are variable-sized, with one Py_ssize_t per item.  The flags
 * make the type immutable, GC-enabled and not directly instantiable. */
static PyType_Spec match_spec = {
3291
    .name = "re.Match",
3292
    .basicsize = sizeof(MatchObject),
3293
    .itemsize = sizeof(Py_ssize_t),
3294
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3295
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3296
    .slots = match_slots,
3297
};
3298
3299
/* Method table for scanner objects.
 *
 * The first two entries must remain "prefixmatch" followed by "match":
 * in Py_DEBUG builds sre_exec() passes this table to
 * _assert_match_aliases_prefixmatch(), which checks entries 0 and 1. */
static PyMethodDef scanner_methods[] = {
3300
    _SRE_SRE_SCANNER_PREFIXMATCH_METHODDEF
3301
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3302
     * to avoid duplicating the argument parsing boilerplate code. */
3303
    {"match", _PyCFunction_CAST(_sre_SRE_Scanner_prefixmatch),
3304
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3305
     _sre_SRE_Scanner_prefixmatch__doc__},
3306
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
3307
    {NULL, NULL}
3308
};
3309
3310
/* SCAN_OFF(x) is the byte offset of field x inside ScannerObject. */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3311
/* ScannerObject fields exposed directly as read-only attributes. */
static PyMemberDef scanner_members[] = {
3312
    {"pattern", _Py_T_OBJECT, SCAN_OFF(pattern), Py_READONLY},
3313
    {NULL}  /* Sentinel */
3314
};
3315
3316
/* Type slots for the scanner type; referenced by scanner_spec. */
static PyType_Slot scanner_slots[] = {
3317
    {Py_tp_dealloc, scanner_dealloc},
3318
    {Py_tp_methods, scanner_methods},
3319
    {Py_tp_members, scanner_members},
3320
    {Py_tp_traverse, scanner_traverse},
3321
    {Py_tp_clear, scanner_clear},
3322
    {0, NULL},
3323
};
3324
3325
/* Spec from which sre_exec() creates the "_sre.SRE_Scanner" heap type.
 * Fixed-size instances (no itemsize).  The flags make the type immutable,
 * GC-enabled and not directly instantiable. */
static PyType_Spec scanner_spec = {
3326
    .name = "_sre.SRE_Scanner",
3327
    .basicsize = sizeof(ScannerObject),
3328
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3329
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3330
    .slots = scanner_slots,
3331
};
3332
3333
/* Type slots for the template type; referenced by template_spec.  Only
 * the deallocator and the GC hooks are provided. */
static PyType_Slot template_slots[] = {
3334
    {Py_tp_dealloc, template_dealloc},
3335
    {Py_tp_traverse, template_traverse},
3336
    {Py_tp_clear, template_clear},
3337
    {0, NULL},
3338
};
3339
3340
/* Spec from which sre_exec() creates the "_sre.SRE_Template" heap type.
 * Instances are variable-sized; the item size is that of one element of
 * TemplateObject.items.  The flags make the type immutable, GC-enabled
 * and not directly instantiable. */
static PyType_Spec template_spec = {
3341
    .name = "_sre.SRE_Template",
3342
    .basicsize = sizeof(TemplateObject),
3343
    .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3344
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3345
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3346
    .slots = template_slots,
3347
};
3348
3349
/* Module-level functions of _sre; installed through sremodule.m_methods.
 * Every entry comes from a Clinic-generated METHODDEF macro. */
static PyMethodDef _functions[] = {
3350
    _SRE_COMPILE_METHODDEF
3351
    _SRE_TEMPLATE_METHODDEF
3352
    _SRE_GETCODESIZE_METHODDEF
3353
    _SRE_ASCII_ISCASED_METHODDEF
3354
    _SRE_UNICODE_ISCASED_METHODDEF
3355
    _SRE_ASCII_TOLOWER_METHODDEF
3356
    _SRE_UNICODE_TOLOWER_METHODDEF
3357
    {NULL, NULL}
3358
};
3359
3360
static int
3361
sre_traverse(PyObject *module, visitproc visit, void *arg)
3362
1.27k
{
3363
1.27k
    _sremodulestate *state = get_sre_module_state(module);
3364
3365
1.27k
    Py_VISIT(state->Pattern_Type);
3366
1.27k
    Py_VISIT(state->Match_Type);
3367
1.27k
    Py_VISIT(state->Scanner_Type);
3368
1.27k
    Py_VISIT(state->Template_Type);
3369
1.27k
    Py_VISIT(state->compile_template);
3370
3371
1.27k
    return 0;
3372
1.27k
}
3373
3374
static int
3375
sre_clear(PyObject *module)
3376
0
{
3377
0
    _sremodulestate *state = get_sre_module_state(module);
3378
3379
0
    Py_CLEAR(state->Pattern_Type);
3380
0
    Py_CLEAR(state->Match_Type);
3381
0
    Py_CLEAR(state->Scanner_Type);
3382
0
    Py_CLEAR(state->Template_Type);
3383
0
    Py_CLEAR(state->compile_template);
3384
3385
0
    return 0;
3386
0
}
3387
3388
static void
3389
sre_free(void *module)
3390
0
{
3391
0
    sre_clear((PyObject *)module);
3392
0
}
3393
3394
108
/* CREATE_TYPE(m, type, spec): create a heap type for module m from spec
 * and store it in the lvalue `type`.  On failure it jumps to a label
 * named `error`, which the enclosing function must provide. */
#define CREATE_TYPE(m, type, spec)                                  \
3395
108
do {                                                                \
3396
108
    type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
3397
108
    if (type == NULL) {                                             \
3398
0
        goto error;                                                 \
3399
0
    }                                                               \
3400
108
} while (0)
3401
3402
/* ADD_ULONG_CONSTANT(module, name, value): add `value`, converted with
 * PyLong_FromUnsignedLong(), to `module` under `name`.  On failure it
 * jumps to a label named `error`, which the enclosing function must
 * provide. */
#define ADD_ULONG_CONSTANT(module, name, value)           \
3403
54
    do {                                                  \
3404
54
        if (PyModule_Add(module, name, PyLong_FromUnsignedLong(value)) < 0) { \
3405
0
            goto error;                                   \
3406
0
        }                                                 \
3407
54
} while (0)
3408
3409
3410
#ifdef Py_DEBUG
/* Debug-only sanity check on a method table: entry 0 must be named
 * "prefixmatch", entry 1 must be named "match", and the two must share
 * the same function pointer, flags and docstring pointer. */
static void
_assert_match_aliases_prefixmatch(PyMethodDef *methods)
{
    PyMethodDef *canonical = &methods[0];
    PyMethodDef *alias = &methods[1];

    assert(strcmp(canonical->ml_name, "prefixmatch") == 0);
    assert(strcmp(alias->ml_name, "match") == 0);
    assert(alias->ml_meth == canonical->ml_meth);
    assert(alias->ml_flags == canonical->ml_flags);
    assert(alias->ml_doc == canonical->ml_doc);
}
#endif
3423
3424
static int
3425
sre_exec(PyObject *m)
3426
27
{
3427
27
    _sremodulestate *state;
3428
3429
#ifdef Py_DEBUG
3430
    _assert_match_aliases_prefixmatch(pattern_methods);
3431
    _assert_match_aliases_prefixmatch(scanner_methods);
3432
#endif
3433
3434
    /* Create heap types */
3435
27
    state = get_sre_module_state(m);
3436
27
    CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3437
27
    CREATE_TYPE(m, state->Match_Type, &match_spec);
3438
27
    CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3439
27
    CREATE_TYPE(m, state->Template_Type, &template_spec);
3440
3441
27
    if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3442
0
        goto error;
3443
0
    }
3444
3445
27
    if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3446
0
        goto error;
3447
0
    }
3448
3449
27
    ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3450
27
    ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3451
3452
27
    if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3453
0
        goto error;
3454
0
    }
3455
3456
27
    return 0;
3457
3458
0
error:
3459
0
    return -1;
3460
27
}
3461
3462
/* Module slots: sre_exec() initializes the module; the module declares
 * per-interpreter-GIL support and that it does not use the GIL. */
static PyModuleDef_Slot sre_slots[] = {
3463
    _Py_ABI_SLOT,
3464
    {Py_mod_exec, sre_exec},
3465
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3466
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
3467
    {0, NULL},
3468
};
3469
3470
/* Definition of the "_sre" module.  Per-module state is one
 * _sremodulestate, managed by sre_traverse/sre_clear/sre_free. */
static struct PyModuleDef sremodule = {
3471
    .m_base = PyModuleDef_HEAD_INIT,
3472
    .m_name = "_sre",
3473
    .m_size = sizeof(_sremodulestate),
3474
    .m_methods = _functions,
3475
    .m_slots = sre_slots,
3476
    .m_traverse = sre_traverse,
3477
    .m_free = sre_free,
3478
    .m_clear = sre_clear,
3479
};
3480
3481
PyMODINIT_FUNC
3482
PyInit__sre(void)
3483
27
{
3484
27
    return PyModuleDef_Init(&sremodule);
3485
27
}
3486
3487
/* vim:ts=4:sw=4:et
3488
*/