Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Modules/_sre/sre.c
Line
Count
Source
1
/*
2
 * Secret Labs' Regular Expression Engine
3
 *
4
 * regular expression matching engine
5
 *
6
 * partial history:
7
 * 1999-10-24 fl   created (based on existing template matcher code)
8
 * 2000-03-06 fl   first alpha, sort of
9
 * 2000-08-01 fl   fixes for 1.6b1
10
 * 2000-08-07 fl   use PyOS_CheckStack() if available
11
 * 2000-09-20 fl   added expand method
12
 * 2001-03-20 fl   lots of fixes for 2.1b2
13
 * 2001-04-15 fl   export copyright as Python attribute, not global
14
 * 2001-04-28 fl   added __copy__ methods (work in progress)
15
 * 2001-05-14 fl   fixes for 1.5.2 compatibility
16
 * 2001-07-01 fl   added BIGCHARSET support (from Martin von Loewis)
17
 * 2001-10-18 fl   fixed group reset issue (from Matthew Mueller)
18
 * 2001-10-20 fl   added split primitive; re-enable unicode for 1.6/2.0/2.1
19
 * 2001-10-21 fl   added sub/subn primitive
20
 * 2001-10-24 fl   added finditer primitive (for 2.2 only)
21
 * 2001-12-07 fl   fixed memory leak in sub/subn (Guido van Rossum)
22
 * 2002-11-09 fl   fixed empty sub/subn return type
23
 * 2003-04-18 mvl  fully support 4-byte codes
24
 * 2003-10-17 gn   implemented non recursive scheme
25
 * 2013-02-04 mrab added fullmatch primitive
26
 *
27
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
28
 *
29
 * This version of the SRE library can be redistributed under CNRI's
30
 * Python 1.6 license.  For any other use, please contact Secret Labs
31
 * AB (info@pythonware.com).
32
 *
33
 * Portions of this engine have been developed in cooperation with
34
 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
35
 * other compatibility work.
36
 */
37
38
/* Engine version and copyright banner string.
 * NOTE(review): the history comment says this is exported as a Python
 * attribute; the export itself is not visible here -- confirm. */
static const char copyright[] =
39
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
40
41
#include "Python.h"
42
#include "pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
43
#include "pycore_dict.h"             // _PyDict_Next()
44
#include "pycore_long.h"             // _PyLong_GetZero()
45
#include "pycore_moduleobject.h"     // _PyModule_GetState()
46
#include "pycore_tuple.h"            // _PyTuple_FromPairSteal
47
#include "pycore_unicodeobject.h"    // _PyUnicode_Copy
48
#include "pycore_weakref.h"          // FT_CLEAR_WEAKREFS()
49
50
#include "sre.h"                     // SRE_CODE
51
52
#include <ctype.h>                   // tolower(), toupper(), isalnum()
53
54
1.38G
/* Width of one SRE_CODE unit in bits (8 bits per byte of sizeof). */
#define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
55
56
// On macOS, use the wide character ctype API using btowc()
57
/* When defined, the sre_isalnum()/sre_tolower()/sre_toupper() wrappers
 * take their iswalnum()/towlower()/towupper() branch instead of the
 * plain <ctype.h> one. */
#if defined(__APPLE__)
58
#  define USE_CTYPE_WINT_T
59
#endif
60
61
0
/* Alphanumeric test for a single character code using the C library.
 * Goes through btowc()/iswalnum() when USE_CTYPE_WINT_T is defined and
 * through isalnum() otherwise.  Returns nonzero when `ch` is
 * alphanumeric, zero when it is not. */
static int
sre_isalnum(unsigned int ch)
{
    unsigned int verdict;

#ifdef USE_CTYPE_WINT_T
    verdict = (unsigned int)iswalnum(btowc((int)ch));
#else
    verdict = (unsigned int)isalnum((int)ch);
#endif
    return verdict;
}
68
69
0
/* Lower-case a single character code using the C library.
 * Goes through btowc()/towlower() when USE_CTYPE_WINT_T is defined and
 * through tolower() otherwise. */
static unsigned int
sre_tolower(unsigned int ch)
{
    unsigned int lowered;

#ifdef USE_CTYPE_WINT_T
    lowered = (unsigned int)towlower(btowc((int)ch));
#else
    lowered = (unsigned int)tolower((int)ch);
#endif
    return lowered;
}
76
77
0
/* Upper-case a single character code using the C library.
 * Goes through btowc()/towupper() when USE_CTYPE_WINT_T is defined and
 * through toupper() otherwise. */
static unsigned int
sre_toupper(unsigned int ch)
{
    unsigned int raised;

#ifdef USE_CTYPE_WINT_T
    raised = (unsigned int)towupper(btowc((int)ch));
#else
    raised = (unsigned int)toupper((int)ch);
#endif
    return raised;
}
84
85
/* Defining this one controls tracing:
86
 * 0 -- disabled
87
 * 1 -- only if the DEBUG flag set
88
 * 2 -- always
89
 */
90
/* Default to 0 (tracing disabled) unless the build already defined
 * VERBOSE. */
#ifndef VERBOSE
91
#  define VERBOSE 0
92
#endif
93
94
/* -------------------------------------------------------------------- */
95
96
/* LOCAL(type): declaration prefix for file-private helper functions that
 * return `type`.  Under MSVC (but not clang in MSVC mode) it expands to
 * static __inline with the __fastcall convention; everywhere else it is
 * plain `static inline`. */
#if defined(_MSC_VER) && !defined(__clang__)
97
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
98
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
99
/* fastest possible local call under MSVC */
100
#define LOCAL(type) static __inline type __fastcall
101
#else
102
#define LOCAL(type) static inline type
103
#endif
104
105
/* error codes */
106
/* Negative status values.  pattern_error() in this file turns
 * RECURSION_LIMIT into RecursionError and MEMORY into MemoryError, leaves
 * the already-set exception alone for INTERRUPTED, and reports every
 * other code as an internal RuntimeError. */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
107
#define SRE_ERROR_STATE -2 /* illegal state */
108
0
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
109
0
#define SRE_ERROR_MEMORY -9 /* out of memory */
110
0
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
111
112
/* Tracing helpers, selected by VERBOSE:
 *   INIT_TRACE(state) -- placed at the top of a function that traces;
 *                        with VERBOSE == 1 it declares the local _debug
 *                        flag copied from state->debug.
 *   DO_TRACE          -- expression that is true when tracing is active.
 *   TRACE(v)          -- `v` is a parenthesized printf argument list,
 *                        e.g. TRACE(("%d\n", x)).
 */
#if VERBOSE == 0
113
/* tracing compiled out: every helper expands to nothing / 0 */
#  define INIT_TRACE(state)
114
#  define DO_TRACE 0
115
#  define TRACE(v)
116
#elif VERBOSE == 1
117
/* tracing gated at run time by the per-state debug flag */
#  define INIT_TRACE(state) int _debug = (state)->debug
118
#  define DO_TRACE (_debug)
119
#  define TRACE(v) do {     \
120
        if (_debug) { \
121
            printf v;       \
122
        }                   \
123
    } while (0)
124
#elif VERBOSE == 2
125
/* tracing unconditional */
#  define INIT_TRACE(state)
126
#  define DO_TRACE 1
127
#  define TRACE(v) printf v
128
#else
129
#  error VERBOSE must be 0, 1 or 2
130
#endif
131
132
/* -------------------------------------------------------------------- */
133
/* search engine state */
134
135
/* ASCII character-class predicates.  Each one first compares `ch`
 * against an upper bound ('9', ' ', 'z') and only then consults the
 * Py_IS* macro, so larger codes are rejected by the comparison alone.
 * `ch` is evaluated more than once; pass a side-effect-free expression. */
#define SRE_IS_DIGIT(ch)\
136
1.51k
    ((ch) <= '9' && Py_ISDIGIT(ch))
137
#define SRE_IS_SPACE(ch)\
138
32
    ((ch) <= ' ' && Py_ISSPACE(ch))
139
#define SRE_IS_LINEBREAK(ch)\
140
36.0M
    ((ch) == '\n')
141
#define SRE_IS_WORD(ch)\
142
10.9M
    ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
143
144
/* ASCII-only lower-casing: codes below 128 go through Py_TOLOWER(),
 * every other code is returned unchanged. */
static unsigned int
sre_lower_ascii(unsigned int ch)
{
    if (ch >= 128) {
        return ch;
    }
    return Py_TOLOWER(ch);
}
148
149
/* locale-specific character predicates */
150
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
151
 * warnings when c's type supports only numbers < N+1 */
152
0
/* SRE_LOC_IS_ALNUM yields 0 for any code with bits set above the low
 * eight, and otherwise defers to sre_isalnum().  SRE_LOC_IS_WORD also
 * accepts '_'. */
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? sre_isalnum((ch)) : 0)
153
0
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
154
155
/* Lower-casing for codes below 256 via sre_tolower(); codes from 256
 * upward are returned unchanged. */
static unsigned int
sre_lower_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)sre_tolower(ch);
    }
    return ch;
}
159
160
/* Upper-casing for codes below 256 via sre_toupper(); codes from 256
 * upward are returned unchanged. */
static unsigned int
sre_upper_locale(unsigned int ch)
{
    if (ch < 256) {
        return (unsigned int)sre_toupper(ch);
    }
    return ch;
}
164
165
/* unicode-specific character predicates */
166
167
16
/* Thin aliases over the Py_UNICODE_IS* macros.  Note that the DIGIT
 * predicate maps to Py_UNICODE_ISDECIMAL, and that WORD is ALNUM plus
 * the underscore. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL(ch)
168
75.7M
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE(ch)
169
0
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
170
1.49k
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM(ch)
171
748
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
172
173
/* Lower-case `ch` with Py_UNICODE_TOLOWER(). */
static unsigned int
sre_lower_unicode(unsigned int ch)
{
    unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
    return lowered;
}
177
178
/* Upper-case `ch` with Py_UNICODE_TOUPPER(). */
static unsigned int
sre_upper_unicode(unsigned int ch)
{
    unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
    return raised;
}
182
183
LOCAL(int)
184
sre_category(SRE_CODE category, unsigned int ch)
185
86.7M
{
186
86.7M
    switch (category) {
187
188
1.51k
    case SRE_CATEGORY_DIGIT:
189
1.51k
        return SRE_IS_DIGIT(ch);
190
0
    case SRE_CATEGORY_NOT_DIGIT:
191
0
        return !SRE_IS_DIGIT(ch);
192
32
    case SRE_CATEGORY_SPACE:
193
32
        return SRE_IS_SPACE(ch);
194
0
    case SRE_CATEGORY_NOT_SPACE:
195
0
        return !SRE_IS_SPACE(ch);
196
10.9M
    case SRE_CATEGORY_WORD:
197
10.9M
        return SRE_IS_WORD(ch);
198
0
    case SRE_CATEGORY_NOT_WORD:
199
0
        return !SRE_IS_WORD(ch);
200
0
    case SRE_CATEGORY_LINEBREAK:
201
0
        return SRE_IS_LINEBREAK(ch);
202
0
    case SRE_CATEGORY_NOT_LINEBREAK:
203
0
        return !SRE_IS_LINEBREAK(ch);
204
205
0
    case SRE_CATEGORY_LOC_WORD:
206
0
        return SRE_LOC_IS_WORD(ch);
207
0
    case SRE_CATEGORY_LOC_NOT_WORD:
208
0
        return !SRE_LOC_IS_WORD(ch);
209
210
16
    case SRE_CATEGORY_UNI_DIGIT:
211
16
        return SRE_UNI_IS_DIGIT(ch);
212
0
    case SRE_CATEGORY_UNI_NOT_DIGIT:
213
0
        return !SRE_UNI_IS_DIGIT(ch);
214
5.63M
    case SRE_CATEGORY_UNI_SPACE:
215
5.63M
        return SRE_UNI_IS_SPACE(ch);
216
70.1M
    case SRE_CATEGORY_UNI_NOT_SPACE:
217
70.1M
        return !SRE_UNI_IS_SPACE(ch);
218
748
    case SRE_CATEGORY_UNI_WORD:
219
748
        return SRE_UNI_IS_WORD(ch);
220
0
    case SRE_CATEGORY_UNI_NOT_WORD:
221
0
        return !SRE_UNI_IS_WORD(ch);
222
0
    case SRE_CATEGORY_UNI_LINEBREAK:
223
0
        return SRE_UNI_IS_LINEBREAK(ch);
224
0
    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
225
0
        return !SRE_UNI_IS_LINEBREAK(ch);
226
86.7M
    }
227
0
    return 0;
228
86.7M
}
229
230
LOCAL(int)
231
char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
232
0
{
233
0
    return ch == pattern
234
0
        || (SRE_CODE) sre_lower_locale(ch) == pattern
235
0
        || (SRE_CODE) sre_upper_locale(ch) == pattern;
236
0
}
237
238
239
/* helpers */
240
241
static void
242
data_stack_dealloc(SRE_STATE* state)
243
201M
{
244
201M
    if (state->data_stack) {
245
176M
        PyMem_Free(state->data_stack);
246
176M
        state->data_stack = NULL;
247
176M
    }
248
201M
    state->data_stack_size = state->data_stack_base = 0;
249
201M
}
250
251
static int
252
data_stack_grow(SRE_STATE* state, Py_ssize_t size)
253
179M
{
254
179M
    INIT_TRACE(state);
255
179M
    Py_ssize_t minsize, cursize;
256
179M
    minsize = state->data_stack_base+size;
257
179M
    cursize = state->data_stack_size;
258
179M
    if (cursize < minsize) {
259
179M
        void* stack;
260
179M
        cursize = minsize+minsize/4+1024;
261
179M
        TRACE(("allocate/grow stack %zd\n", cursize));
262
179M
        stack = PyMem_Realloc(state->data_stack, cursize);
263
179M
        if (!stack) {
264
0
            data_stack_dealloc(state);
265
0
            return SRE_ERROR_MEMORY;
266
0
        }
267
179M
        state->data_stack = (char *)stack;
268
179M
        state->data_stack_size = cursize;
269
179M
    }
270
179M
    return 0;
271
179M
}
272
273
/* Memory pool functions for SRE_REPEAT.  Pooling avoids a memory
274
   leak when an SRE(match) function terminates abruptly.
275
   state->repeat_pool_used is a doubly-linked list, so that we
276
   can remove an SRE_REPEAT node from it.
277
   state->repeat_pool_unused is a singly-linked list; nodes are
278
   pushed and popped at the head. */
279
static SRE_REPEAT *
280
repeat_pool_malloc(SRE_STATE *state)
281
48.4M
{
282
48.4M
    SRE_REPEAT *repeat;
283
284
48.4M
    if (state->repeat_pool_unused) {
285
        /* remove from unused pool (singly-linked list) */
286
16.3k
        repeat = state->repeat_pool_unused;
287
16.3k
        state->repeat_pool_unused = repeat->pool_next;
288
16.3k
    }
289
48.4M
    else {
290
48.4M
        repeat = PyMem_Malloc(sizeof(SRE_REPEAT));
291
48.4M
        if (!repeat) {
292
0
            return NULL;
293
0
        }
294
48.4M
    }
295
296
    /* add to used pool (doubly-linked list) */
297
48.4M
    SRE_REPEAT *temp = state->repeat_pool_used;
298
48.4M
    if (temp) {
299
29.6M
        temp->pool_prev = repeat;
300
29.6M
    }
301
48.4M
    repeat->pool_prev = NULL;
302
48.4M
    repeat->pool_next = temp;
303
48.4M
    state->repeat_pool_used = repeat;
304
305
48.4M
    return repeat;
306
48.4M
}
307
308
static void
309
repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat)
310
48.4M
{
311
48.4M
    SRE_REPEAT *prev = repeat->pool_prev;
312
48.4M
    SRE_REPEAT *next = repeat->pool_next;
313
314
    /* remove from used pool (doubly-linked list) */
315
48.4M
    if (prev) {
316
0
        prev->pool_next = next;
317
0
    }
318
48.4M
    else {
319
48.4M
        state->repeat_pool_used = next;
320
48.4M
    }
321
48.4M
    if (next) {
322
29.6M
        next->pool_prev = prev;
323
29.6M
    }
324
325
    /* add to unused pool (singly-linked list) */
326
48.4M
    repeat->pool_next = state->repeat_pool_unused;
327
48.4M
    state->repeat_pool_unused = repeat;
328
48.4M
}
329
330
static void
331
repeat_pool_clear(SRE_STATE *state)
332
81.9M
{
333
    /* clear used pool */
334
81.9M
    SRE_REPEAT *next = state->repeat_pool_used;
335
81.9M
    state->repeat_pool_used = NULL;
336
81.9M
    while (next) {
337
0
        SRE_REPEAT *temp = next;
338
0
        next = temp->pool_next;
339
0
        PyMem_Free(temp);
340
0
    }
341
342
    /* clear unused pool */
343
81.9M
    next = state->repeat_pool_unused;
344
81.9M
    state->repeat_pool_unused = NULL;
345
130M
    while (next) {
346
48.4M
        SRE_REPEAT *temp = next;
347
48.4M
        next = temp->pool_next;
348
48.4M
        PyMem_Free(temp);
349
48.4M
    }
350
81.9M
}
351
352
/* generate 8-bit version */
353
354
228M
/* sre_lib.h is included three times, once per character width.  Before
 * each inclusion SRE_CHAR names the character type, SIZEOF_SRE_CHAR its
 * size in bytes, and SRE(F) prefixes every generated function name
 * (sre_ucs1_F, sre_ucs2_F, sre_ucs4_F).
 * NOTE(review): the three macros are redefined without an #undef here;
 * presumably sre_lib.h undefines them at its end -- confirm. */
#define SRE_CHAR Py_UCS1
355
#define SIZEOF_SRE_CHAR 1
356
974M
#define SRE(F) sre_ucs1_##F
357
#include "sre_lib.h"
358
359
/* generate 16-bit unicode version */
360
361
311M
#define SRE_CHAR Py_UCS2
362
#define SIZEOF_SRE_CHAR 2
363
1.54G
#define SRE(F) sre_ucs2_##F
364
#include "sre_lib.h"
365
366
/* generate 32-bit unicode version */
367
368
109M
#define SRE_CHAR Py_UCS4
369
#define SIZEOF_SRE_CHAR 4
370
600M
#define SRE(F) sre_ucs4_##F
371
#include "sre_lib.h"
372
373
/* -------------------------------------------------------------------- */
374
/* factories and destructors */
375
376
/* module state */
377
/* Per-module state, retrieved with get_sre_module_state().  The *_Type
 * members hold the type objects the clinic class declarations in this
 * file refer to (Pattern_Type, Match_Type, Scanner_Type). */
typedef struct {
378
    PyTypeObject *Pattern_Type;
379
    PyTypeObject *Match_Type;
380
    PyTypeObject *Scanner_Type;
381
    // NOTE(review): presumably the type behind TemplateObject -- confirm.
    PyTypeObject *Template_Type;
382
    PyObject *compile_template;  // reference to re._compile_template
383
} _sremodulestate;
384
385
static _sremodulestate *
386
get_sre_module_state(PyObject *m)
387
80.1M
{
388
80.1M
    _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
389
80.1M
    assert(state);
390
80.1M
    return state;
391
80.1M
}
392
393
/* Declaration of the module definition object.
 * NOTE(review): its initializer is presumably later in the file. */
static struct PyModuleDef sremodule;
394
/* Fetch the _sre module state through the module associated with type
 * object `cls` (looked up with PyType_GetModule). */
#define get_sre_module_state_by_class(cls) \
395
80.1M
    (get_sre_module_state(PyType_GetModule(cls)))
396
397
/* see sre.h for object declarations */
398
/* Forward declarations.  pattern_new_match() is called by the
 * prefixmatch/fullmatch/search entry points with the engine status as
 * its last argument; both functions are defined outside this part of
 * the file. */
static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
399
static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
400
401
17.2k
/* Unchecked pointer casts from a generic object pointer to the concrete
 * _sre object structs; no type test is performed. */
#define _PatternObject_CAST(op)     ((PatternObject *)(op))
402
85.9M
#define _MatchObject_CAST(op)       ((MatchObject *)(op))
403
0
#define _TemplateObject_CAST(op)    ((TemplateObject *)(op))
404
718k
#define _ScannerObject_CAST(op)     ((ScannerObject *)(op))
405
406
/*[clinic input]
407
module _sre
408
class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
409
class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
410
class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
411
[clinic start generated code]*/
412
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
413
414
/*[clinic input]
415
_sre.getcodesize -> int
416
[clinic start generated code]*/
417
418
static int
419
_sre_getcodesize_impl(PyObject *module)
420
/*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
421
0
{
422
0
    return sizeof(SRE_CODE);
423
0
}
424
425
/*[clinic input]
426
_sre.ascii_iscased -> bool
427
428
    character: int
429
    /
430
431
[clinic start generated code]*/
432
433
static int
434
_sre_ascii_iscased_impl(PyObject *module, int character)
435
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
436
7.40k
{
437
7.40k
    unsigned int ch = (unsigned int)character;
438
7.40k
    return ch < 128 && Py_ISALPHA(ch);
439
7.40k
}
440
441
/*[clinic input]
442
_sre.unicode_iscased -> bool
443
444
    character: int
445
    /
446
447
[clinic start generated code]*/
448
449
static int
450
_sre_unicode_iscased_impl(PyObject *module, int character)
451
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
452
30.8M
{
453
30.8M
    unsigned int ch = (unsigned int)character;
454
30.8M
    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
455
30.8M
}
456
457
/*[clinic input]
458
_sre.ascii_tolower -> int
459
460
    character: int
461
    /
462
463
[clinic start generated code]*/
464
465
static int
466
_sre_ascii_tolower_impl(PyObject *module, int character)
467
/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
468
1.37M
{
469
1.37M
    return sre_lower_ascii(character);
470
1.37M
}
471
472
/*[clinic input]
473
_sre.unicode_tolower -> int
474
475
    character: int
476
    /
477
478
[clinic start generated code]*/
479
480
static int
481
_sre_unicode_tolower_impl(PyObject *module, int character)
482
/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
483
85.9M
{
484
85.9M
    return sre_lower_unicode(character);
485
85.9M
}
486
487
LOCAL(void)
488
state_reset(SRE_STATE* state)
489
119M
{
490
    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
491
    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
492
493
119M
    state->lastmark = -1;
494
119M
    state->lastindex = -1;
495
496
119M
    state->repeat = NULL;
497
498
119M
    data_stack_dealloc(state);
499
119M
}
500
501
static const void*
502
getstring(PyObject* string, Py_ssize_t* p_length,
503
          int* p_isbytes, int* p_charsize,
504
          Py_buffer *view)
505
130M
{
506
    /* given a python object, return a data pointer, a length (in
507
       characters), and a character size.  return NULL if the object
508
       is not a string (or not compatible) */
509
510
    /* Unicode objects do not support the buffer API. So, get the data
511
       directly instead. */
512
130M
    if (PyUnicode_Check(string)) {
513
130M
        *p_length = PyUnicode_GET_LENGTH(string);
514
130M
        *p_charsize = PyUnicode_KIND(string);
515
130M
        *p_isbytes = 0;
516
130M
        return PyUnicode_DATA(string);
517
130M
    }
518
519
    /* get pointer to byte string buffer */
520
850k
    if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
521
0
        PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
522
0
                     "object, got '%.200s'", Py_TYPE(string)->tp_name);
523
0
        return NULL;
524
0
    }
525
526
850k
    *p_length = view->len;
527
850k
    *p_charsize = 1;
528
850k
    *p_isbytes = 1;
529
530
850k
    if (view->buf == NULL) {
531
0
        PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
532
0
        PyBuffer_Release(view);
533
0
        view->buf = NULL;
534
0
        return NULL;
535
0
    }
536
850k
    return view->buf;
537
850k
}
538
539
/* Initialize `state` for matching `pattern` against `string` within
 * [start, end).
 *
 * The state is zeroed, a mark array of 2 * pattern->groups entries is
 * allocated, the subject data is obtained with getstring(), and the
 * str/bytes kind of pattern and subject is checked.  `start` and `end`
 * are clamped to [0, length].  On success the state holds a new
 * reference to `string` and `string` is returned (borrowed).
 *
 * On failure an exception is set, the mark array and any acquired
 * buffer are released, and NULL is returned. */
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
           Py_ssize_t start, Py_ssize_t end)
{
    Py_ssize_t length;
    int isbytes, charsize;
    const void* ptr;

    memset(state, 0, sizeof(SRE_STATE));

    state->mark = PyMem_New(const void *, pattern->groups * 2);
    if (state->mark == NULL) {
        PyErr_NoMemory();
        goto err;
    }
    state->lastmark = -1;
    state->lastindex = -1;

    state->buffer.buf = NULL;
    ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
    if (ptr == NULL) {
        goto err;
    }

    /* pattern and subject must both be str or both be bytes-like */
    if (isbytes) {
        if (pattern->isbytes == 0) {
            PyErr_SetString(PyExc_TypeError,
                            "cannot use a string pattern on a bytes-like object");
            goto err;
        }
    }
    else if (pattern->isbytes > 0) {
        PyErr_SetString(PyExc_TypeError,
                        "cannot use a bytes pattern on a string-like object");
        goto err;
    }

    /* clamp both boundaries to [0, length] */
    if (start < 0) {
        start = 0;
    }
    else if (start > length) {
        start = length;
    }
    if (end < 0) {
        end = 0;
    }
    else if (end > length) {
        end = length;
    }

    state->isbytes = isbytes;
    state->charsize = charsize;
    state->match_all = 0;
    state->must_advance = 0;
    state->debug = ((pattern->flags & SRE_FLAG_DEBUG) != 0);

    state->beginning = ptr;
    state->start = (void*) ((char*) ptr + start * state->charsize);
    state->end = (void*) ((char*) ptr + end * state->charsize);

    state->string = Py_NewRef(string);
    state->pos = start;
    state->endpos = end;

#ifdef Py_DEBUG
    state->fail_after_count = pattern->fail_after_count;
    state->fail_after_exc = pattern->fail_after_exc; // borrowed ref
#endif

    return string;

  err:
    /* The explicit cast works around an MSVC bug when compiling C code:
       it believes that `const void**` cannot be safely cast to
       `void*`; see bpo-39943 for details. */
    PyMem_Free((void*) state->mark);
    state->mark = NULL;
    if (state->buffer.buf) {
        PyBuffer_Release(&state->buffer);
    }
    return NULL;
}
617
618
LOCAL(void)
619
state_fini(SRE_STATE* state)
620
81.9M
{
621
81.9M
    if (state->buffer.buf)
622
435k
        PyBuffer_Release(&state->buffer);
623
81.9M
    Py_XDECREF(state->string);
624
81.9M
    data_stack_dealloc(state);
625
    /* See above PyMem_Free() for why we explicitly cast here. */
626
81.9M
    PyMem_Free((void*) state->mark);
627
81.9M
    state->mark = NULL;
628
    /* SRE_REPEAT pool */
629
81.9M
    repeat_pool_clear(state);
630
81.9M
}
631
632
/* calculate offset from start of string */
633
/* Convert pointer `member` into a character index: its byte distance
 * from (state)->beginning divided by (state)->charsize. */
#define STATE_OFFSET(state, member)\
634
207M
    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
635
636
/* Return a new reference to the [start, end) slice of the subject.
 *
 * For str subjects the slice is taken with PyUnicode_Substring().  For
 * bytes-like subjects a new bytes object is built from `ptr`, except
 * that an exact bytes object sliced over its whole length is returned
 * itself (with a new reference) instead of being copied. */
LOCAL(PyObject*)
getslice(int isbytes, const void *ptr,
         PyObject* string, Py_ssize_t start, Py_ssize_t end)
{
    if (!isbytes) {
        return PyUnicode_Substring(string, start, end);
    }

    if (PyBytes_CheckExact(string) &&
        start == 0 && end == PyBytes_GET_SIZE(string))
    {
        return Py_NewRef(string);
    }
    return PyBytes_FromStringAndSize((const char *)ptr + start,
                                     end - start);
}
652
653
/* Return the text captured by group `index` (1-based) as a new object.
 *
 * When the group did not take part in the match (or `string` is None)
 * the result is an empty slice if `empty` is true and None otherwise.
 * A group whose start lies after its end raises SystemError and
 * returns NULL. */
LOCAL(PyObject*)
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
{
    Py_ssize_t lo = 0;
    Py_ssize_t hi = 0;

    /* position of the group's start mark; the end mark follows it */
    index = (index - 1) * 2;

    int unmatched = (string == Py_None
                     || index >= state->lastmark
                     || !state->mark[index]
                     || !state->mark[index+1]);

    if (unmatched) {
        if (!empty) {
            Py_RETURN_NONE;
        }
        /* want empty string: lo == hi == 0 */
    }
    else {
        lo = STATE_OFFSET(state, state->mark[index]);
        hi = STATE_OFFSET(state, state->mark[index+1]);

        /* check wrong span */
        if (lo > hi) {
            PyErr_SetString(PyExc_SystemError,
                            "The span of capturing group is wrong,"
                            " please report a bug for the re module.");
            return NULL;
        }
    }

    return getslice(state->isbytes, state->beginning, string, lo, hi);
}
682
683
static void
684
pattern_error(Py_ssize_t status)
685
0
{
686
0
    switch (status) {
687
0
    case SRE_ERROR_RECURSION_LIMIT:
688
        /* This error code seems to be unused. */
689
0
        PyErr_SetString(
690
0
            PyExc_RecursionError,
691
0
            "maximum recursion limit exceeded"
692
0
            );
693
0
        break;
694
0
    case SRE_ERROR_MEMORY:
695
0
        PyErr_NoMemory();
696
0
        break;
697
0
    case SRE_ERROR_INTERRUPTED:
698
    /* An exception has already been raised, so let it fly */
699
0
        break;
700
0
    default:
701
        /* other error codes indicate compiler/engine bugs */
702
0
        PyErr_SetString(
703
0
            PyExc_RuntimeError,
704
0
            "internal error in regular expression engine"
705
0
            );
706
0
    }
707
0
}
708
709
static int
710
pattern_traverse(PyObject *op, visitproc visit, void *arg)
711
13.8k
{
712
13.8k
    PatternObject *self = _PatternObject_CAST(op);
713
13.8k
    Py_VISIT(Py_TYPE(self));
714
13.8k
    Py_VISIT(self->groupindex);
715
13.8k
    Py_VISIT(self->indexgroup);
716
13.8k
    Py_VISIT(self->pattern);
717
#ifdef Py_DEBUG
718
    Py_VISIT(self->fail_after_exc);
719
#endif
720
13.8k
    return 0;
721
13.8k
}
722
723
static int
724
pattern_clear(PyObject *op)
725
3.35k
{
726
3.35k
    PatternObject *self = _PatternObject_CAST(op);
727
3.35k
    Py_CLEAR(self->groupindex);
728
3.35k
    Py_CLEAR(self->indexgroup);
729
3.35k
    Py_CLEAR(self->pattern);
730
#ifdef Py_DEBUG
731
    Py_CLEAR(self->fail_after_exc);
732
#endif
733
3.35k
    return 0;
734
3.35k
}
735
736
static void
737
pattern_dealloc(PyObject *self)
738
3.35k
{
739
3.35k
    PyTypeObject *tp = Py_TYPE(self);
740
3.35k
    PyObject_GC_UnTrack(self);
741
3.35k
    FT_CLEAR_WEAKREFS(self, _PatternObject_CAST(self)->weakreflist);
742
3.35k
    (void)pattern_clear(self);
743
3.35k
    tp->tp_free(self);
744
3.35k
    Py_DECREF(tp);
745
3.35k
}
746
747
LOCAL(Py_ssize_t)
748
sre_match(SRE_STATE* state, SRE_CODE* pattern)
749
61.7M
{
750
61.7M
    if (state->charsize == 1)
751
38.0M
        return sre_ucs1_match(state, pattern, 1);
752
23.7M
    if (state->charsize == 2)
753
14.5M
        return sre_ucs2_match(state, pattern, 1);
754
23.7M
    assert(state->charsize == 4);
755
9.20M
    return sre_ucs4_match(state, pattern, 1);
756
23.7M
}
757
758
LOCAL(Py_ssize_t)
759
sre_search(SRE_STATE* state, SRE_CODE* pattern)
760
124M
{
761
124M
    if (state->charsize == 1)
762
55.8M
        return sre_ucs1_search(state, pattern);
763
68.6M
    if (state->charsize == 2)
764
61.3M
        return sre_ucs2_search(state, pattern);
765
68.6M
    assert(state->charsize == 4);
766
7.30M
    return sre_ucs4_search(state, pattern);
767
68.6M
}
768
769
/*[clinic input]
770
_sre.SRE_Pattern.prefixmatch
771
772
    cls: defining_class
773
    /
774
    string: object
775
    pos: Py_ssize_t = 0
776
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
777
778
Matches zero or more characters at the beginning of the string.
779
[clinic start generated code]*/
780
781
static PyObject *
782
_sre_SRE_Pattern_prefixmatch_impl(PatternObject *self, PyTypeObject *cls,
783
                                  PyObject *string, Py_ssize_t pos,
784
                                  Py_ssize_t endpos)
785
/*[clinic end generated code: output=a0e079fb4f875240 input=e2a7e68ea47d048c]*/
786
61.7M
{
787
61.7M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
788
61.7M
    SRE_STATE state;
789
61.7M
    Py_ssize_t status;
790
61.7M
    PyObject *match;
791
792
61.7M
    if (!state_init(&state, self, string, pos, endpos))
793
0
        return NULL;
794
795
61.7M
    INIT_TRACE(&state);
796
61.7M
    state.ptr = state.start;
797
798
61.7M
    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
799
800
61.7M
    status = sre_match(&state, PatternObject_GetCode(self));
801
802
61.7M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
803
61.7M
    if (PyErr_Occurred()) {
804
0
        state_fini(&state);
805
0
        return NULL;
806
0
    }
807
808
61.7M
    match = pattern_new_match(module_state, self, &state, status);
809
61.7M
    state_fini(&state);
810
61.7M
    return match;
811
61.7M
}
812
813
814
/*[clinic input]
815
_sre.SRE_Pattern.fullmatch
816
817
    cls: defining_class
818
    /
819
    string: object
820
    pos: Py_ssize_t = 0
821
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
822
823
Matches against all of the string.
824
[clinic start generated code]*/
825
826
static PyObject *
827
_sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
828
                                PyObject *string, Py_ssize_t pos,
829
                                Py_ssize_t endpos)
830
/*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
831
0
{
832
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
833
0
    SRE_STATE state;
834
0
    Py_ssize_t status;
835
0
    PyObject *match;
836
837
0
    if (!state_init(&state, self, string, pos, endpos))
838
0
        return NULL;
839
840
0
    INIT_TRACE(&state);
841
0
    state.ptr = state.start;
842
843
0
    TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
844
845
0
    state.match_all = 1;
846
0
    status = sre_match(&state, PatternObject_GetCode(self));
847
848
0
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
849
0
    if (PyErr_Occurred()) {
850
0
        state_fini(&state);
851
0
        return NULL;
852
0
    }
853
854
0
    match = pattern_new_match(module_state, self, &state, status);
855
0
    state_fini(&state);
856
0
    return match;
857
0
}
858
859
/*[clinic input]
860
@permit_long_summary
861
_sre.SRE_Pattern.search
862
863
    cls: defining_class
864
    /
865
    string: object
866
    pos: Py_ssize_t = 0
867
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
868
869
Scan through string looking for a match, and return a corresponding match object instance.
870
871
Return None if no position in the string matches.
872
[clinic start generated code]*/
873
874
static PyObject *
875
_sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
876
                             PyObject *string, Py_ssize_t pos,
877
                             Py_ssize_t endpos)
878
/*[clinic end generated code: output=bd7f2d9d583e1463 input=05e9feee0334c156]*/
879
4.92M
{
880
4.92M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
881
4.92M
    SRE_STATE state;
882
4.92M
    Py_ssize_t status;
883
4.92M
    PyObject *match;
884
885
4.92M
    if (!state_init(&state, self, string, pos, endpos))
886
0
        return NULL;
887
888
4.92M
    INIT_TRACE(&state);
889
4.92M
    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
890
891
4.92M
    status = sre_search(&state, PatternObject_GetCode(self));
892
893
4.92M
    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
894
895
4.92M
    if (PyErr_Occurred()) {
896
0
        state_fini(&state);
897
0
        return NULL;
898
0
    }
899
900
4.92M
    match = pattern_new_match(module_state, self, &state, status);
901
4.92M
    state_fini(&state);
902
4.92M
    return match;
903
4.92M
}
904
905
/*[clinic input]
906
_sre.SRE_Pattern.findall
907
908
    string: object
909
    pos: Py_ssize_t = 0
910
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
911
912
Return a list of all non-overlapping matches of pattern in string.
913
[clinic start generated code]*/
914
915
static PyObject *
916
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
917
                              Py_ssize_t pos, Py_ssize_t endpos)
918
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
919
3.75M
{
920
3.75M
    SRE_STATE state;
921
3.75M
    PyObject* list;
922
3.75M
    Py_ssize_t status;
923
3.75M
    Py_ssize_t i, b, e;
924
925
3.75M
    if (!state_init(&state, self, string, pos, endpos))
926
0
        return NULL;
927
928
3.75M
    list = PyList_New(0);
929
3.75M
    if (!list) {
930
0
        state_fini(&state);
931
0
        return NULL;
932
0
    }
933
934
98.5M
    while (state.start <= state.end) {
935
936
98.5M
        PyObject* item;
937
938
98.5M
        state_reset(&state);
939
940
98.5M
        state.ptr = state.start;
941
942
98.5M
        status = sre_search(&state, PatternObject_GetCode(self));
943
98.5M
        if (PyErr_Occurred())
944
0
            goto error;
945
946
98.5M
        if (status <= 0) {
947
3.75M
            if (status == 0)
948
3.75M
                break;
949
0
            pattern_error(status);
950
0
            goto error;
951
3.75M
        }
952
953
        /* don't bother to build a match object */
954
94.8M
        switch (self->groups) {
955
94.8M
        case 0:
956
94.8M
            b = STATE_OFFSET(&state, state.start);
957
94.8M
            e = STATE_OFFSET(&state, state.ptr);
958
94.8M
            item = getslice(state.isbytes, state.beginning,
959
94.8M
                            string, b, e);
960
94.8M
            if (!item)
961
0
                goto error;
962
94.8M
            break;
963
94.8M
        case 1:
964
0
            item = state_getslice(&state, 1, string, 1);
965
0
            if (!item)
966
0
                goto error;
967
0
            break;
968
0
        default:
969
0
            item = PyTuple_New(self->groups);
970
0
            if (!item)
971
0
                goto error;
972
0
            for (i = 0; i < self->groups; i++) {
973
0
                PyObject* o = state_getslice(&state, i+1, string, 1);
974
0
                if (!o) {
975
0
                    Py_DECREF(item);
976
0
                    goto error;
977
0
                }
978
0
                PyTuple_SET_ITEM(item, i, o);
979
0
            }
980
0
            break;
981
94.8M
        }
982
983
94.8M
        status = PyList_Append(list, item);
984
94.8M
        Py_DECREF(item);
985
94.8M
        if (status < 0)
986
0
            goto error;
987
988
94.8M
        state.must_advance = (state.ptr == state.start);
989
94.8M
        state.start = state.ptr;
990
94.8M
    }
991
992
3.75M
    state_fini(&state);
993
3.75M
    return list;
994
995
0
error:
996
0
    Py_DECREF(list);
997
0
    state_fini(&state);
998
0
    return NULL;
999
1000
3.75M
}
1001
1002
/*[clinic input]
1003
@permit_long_summary
1004
_sre.SRE_Pattern.finditer
1005
1006
    cls: defining_class
1007
    /
1008
    string: object
1009
    pos: Py_ssize_t = 0
1010
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1011
1012
Return an iterator over all non-overlapping matches for the RE pattern in string.
1013
1014
For each match, the iterator returns a match object.
1015
[clinic start generated code]*/
1016
1017
static PyObject *
1018
_sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
1019
                               PyObject *string, Py_ssize_t pos,
1020
                               Py_ssize_t endpos)
1021
/*[clinic end generated code: output=1791dbf3618ade56 input=ee28865796048023]*/
1022
358k
{
1023
358k
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1024
358k
    PyObject* scanner;
1025
358k
    PyObject* search;
1026
358k
    PyObject* iterator;
1027
1028
358k
    scanner = pattern_scanner(module_state, self, string, pos, endpos);
1029
358k
    if (!scanner)
1030
0
        return NULL;
1031
1032
358k
    search = PyObject_GetAttrString(scanner, "search");
1033
358k
    Py_DECREF(scanner);
1034
358k
    if (!search)
1035
0
        return NULL;
1036
1037
358k
    iterator = PyCallIter_New(search, Py_None);
1038
358k
    Py_DECREF(search);
1039
1040
358k
    return iterator;
1041
358k
}
1042
1043
/*[clinic input]
1044
_sre.SRE_Pattern.scanner
1045
1046
    cls: defining_class
1047
    /
1048
    string: object
1049
    pos: Py_ssize_t = 0
1050
    endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
1051
1052
[clinic start generated code]*/
1053
1054
static PyObject *
1055
_sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
1056
                              PyObject *string, Py_ssize_t pos,
1057
                              Py_ssize_t endpos)
1058
/*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
1059
0
{
1060
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1061
1062
0
    return pattern_scanner(module_state, self, string, pos, endpos);
1063
0
}
1064
1065
/*[clinic input]
1066
_sre.SRE_Pattern.split
1067
1068
    string: object
1069
    maxsplit: Py_ssize_t = 0
1070
1071
Split string by the occurrences of pattern.
1072
[clinic start generated code]*/
1073
1074
static PyObject *
1075
_sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
1076
                            Py_ssize_t maxsplit)
1077
/*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
1078
1.32M
{
1079
1.32M
    SRE_STATE state;
1080
1.32M
    PyObject* list;
1081
1.32M
    PyObject* item;
1082
1.32M
    Py_ssize_t status;
1083
1.32M
    Py_ssize_t n;
1084
1.32M
    Py_ssize_t i;
1085
1.32M
    const void* last;
1086
1087
1.32M
    assert(self->codesize != 0);
1088
1089
1.32M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
1090
0
        return NULL;
1091
1092
1.32M
    list = PyList_New(0);
1093
1.32M
    if (!list) {
1094
0
        state_fini(&state);
1095
0
        return NULL;
1096
0
    }
1097
1098
1.32M
    n = 0;
1099
1.32M
    last = state.start;
1100
1101
2.23M
    while (!maxsplit || n < maxsplit) {
1102
1103
1.40M
        state_reset(&state);
1104
1105
1.40M
        state.ptr = state.start;
1106
1107
1.40M
        status = sre_search(&state, PatternObject_GetCode(self));
1108
1.40M
        if (PyErr_Occurred())
1109
0
            goto error;
1110
1111
1.40M
        if (status <= 0) {
1112
488k
            if (status == 0)
1113
488k
                break;
1114
0
            pattern_error(status);
1115
0
            goto error;
1116
488k
        }
1117
1118
        /* get segment before this match */
1119
912k
        item = getslice(state.isbytes, state.beginning,
1120
912k
            string, STATE_OFFSET(&state, last),
1121
912k
            STATE_OFFSET(&state, state.start)
1122
912k
            );
1123
912k
        if (!item)
1124
0
            goto error;
1125
912k
        status = PyList_Append(list, item);
1126
912k
        Py_DECREF(item);
1127
912k
        if (status < 0)
1128
0
            goto error;
1129
1130
        /* add groups (if any) */
1131
1.74M
        for (i = 0; i < self->groups; i++) {
1132
835k
            item = state_getslice(&state, i+1, string, 0);
1133
835k
            if (!item)
1134
0
                goto error;
1135
835k
            status = PyList_Append(list, item);
1136
835k
            Py_DECREF(item);
1137
835k
            if (status < 0)
1138
0
                goto error;
1139
835k
        }
1140
1141
912k
        n = n + 1;
1142
912k
        state.must_advance = (state.ptr == state.start);
1143
912k
        last = state.start = state.ptr;
1144
1145
912k
    }
1146
1147
    /* get segment following last match (even if empty) */
1148
1.32M
    item = getslice(state.isbytes, state.beginning,
1149
1.32M
        string, STATE_OFFSET(&state, last), state.endpos
1150
1.32M
        );
1151
1.32M
    if (!item)
1152
0
        goto error;
1153
1.32M
    status = PyList_Append(list, item);
1154
1.32M
    Py_DECREF(item);
1155
1.32M
    if (status < 0)
1156
0
        goto error;
1157
1158
1.32M
    state_fini(&state);
1159
1.32M
    return list;
1160
1161
0
error:
1162
0
    Py_DECREF(list);
1163
0
    state_fini(&state);
1164
0
    return NULL;
1165
1166
1.32M
}
1167
1168
static PyObject *
1169
compile_template(_sremodulestate *module_state,
1170
                 PatternObject *pattern, PyObject *template)
1171
0
{
1172
    /* delegate to Python code */
1173
0
    PyObject *func = FT_ATOMIC_LOAD_PTR(module_state->compile_template);
1174
0
    if (func == NULL) {
1175
0
        func = PyImport_ImportModuleAttrString("re", "_compile_template");
1176
0
        if (func == NULL) {
1177
0
            return NULL;
1178
0
        }
1179
#ifdef Py_GIL_DISABLED
1180
        PyObject *other_func = NULL;
1181
        if (!_Py_atomic_compare_exchange_ptr(&module_state->compile_template, &other_func, func))  {
1182
            Py_DECREF(func);
1183
            func = other_func;
1184
        }
1185
#else
1186
0
        Py_XSETREF(module_state->compile_template, func);
1187
0
#endif
1188
0
    }
1189
1190
0
    PyObject *args[] = {(PyObject *)pattern, template};
1191
0
    PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
1192
1193
0
    if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
1194
        /* If the replacement string is unhashable (e.g. bytearray),
1195
         * convert it to the basic type (str or bytes) and repeat. */
1196
0
        if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
1197
0
            PyErr_Clear();
1198
0
            template = _PyUnicode_Copy(template);
1199
0
        }
1200
0
        else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
1201
0
            PyErr_Clear();
1202
0
            template = PyBytes_FromObject(template);
1203
0
        }
1204
0
        else {
1205
0
            return NULL;
1206
0
        }
1207
0
        if (template == NULL) {
1208
0
            return NULL;
1209
0
        }
1210
0
        args[1] = template;
1211
0
        result = PyObject_Vectorcall(func, args, 2, NULL);
1212
0
        Py_DECREF(template);
1213
0
    }
1214
1215
0
    if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
1216
0
        PyErr_Format(PyExc_RuntimeError,
1217
0
                    "the result of compiling a replacement string is %.200s",
1218
0
                    Py_TYPE(result)->tp_name);
1219
0
        Py_DECREF(result);
1220
0
        return NULL;
1221
0
    }
1222
0
    return result;
1223
0
}
1224
1225
static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
1226
1227
static PyObject*
1228
pattern_subx(_sremodulestate* module_state,
1229
             PatternObject* self,
1230
             PyObject* ptemplate,
1231
             PyObject* string,
1232
             Py_ssize_t count,
1233
             Py_ssize_t subn)
1234
9.79M
{
1235
9.79M
    SRE_STATE state;
1236
9.79M
    PyObject* list;
1237
9.79M
    PyObject* joiner;
1238
9.79M
    PyObject* item;
1239
9.79M
    PyObject* filter;
1240
9.79M
    PyObject* match;
1241
9.79M
    const void* ptr;
1242
9.79M
    Py_ssize_t status;
1243
9.79M
    Py_ssize_t n;
1244
9.79M
    Py_ssize_t i, b, e;
1245
9.79M
    int isbytes, charsize;
1246
9.79M
    enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
1247
9.79M
    Py_buffer view;
1248
1249
9.79M
    if (PyCallable_Check(ptemplate)) {
1250
        /* sub/subn takes either a function or a template */
1251
3.82M
        filter = Py_NewRef(ptemplate);
1252
3.82M
        filter_type = CALLABLE;
1253
5.97M
    } else {
1254
        /* if not callable, check if it's a literal string */
1255
5.97M
        int literal;
1256
5.97M
        view.buf = NULL;
1257
5.97M
        ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
1258
5.97M
        if (ptr) {
1259
5.97M
            if (charsize == 1)
1260
5.97M
                literal = memchr(ptr, '\\', n) == NULL;
1261
0
            else
1262
0
                literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
1263
5.97M
        } else {
1264
0
            PyErr_Clear();
1265
0
            literal = 0;
1266
0
        }
1267
5.97M
        if (view.buf)
1268
0
            PyBuffer_Release(&view);
1269
5.97M
        if (literal) {
1270
5.97M
            filter = Py_NewRef(ptemplate);
1271
5.97M
            filter_type = LITERAL;
1272
5.97M
        } else {
1273
            /* not a literal; hand it over to the template compiler */
1274
0
            filter = compile_template(module_state, self, ptemplate);
1275
0
            if (!filter)
1276
0
                return NULL;
1277
1278
0
            assert(Py_TYPE(filter) == module_state->Template_Type);
1279
0
            if (Py_SIZE(filter) == 0) {
1280
0
                Py_SETREF(filter,
1281
0
                          Py_NewRef(((TemplateObject *)filter)->literal));
1282
0
                filter_type = LITERAL;
1283
0
            }
1284
0
            else {
1285
0
                filter_type = TEMPLATE;
1286
0
            }
1287
0
        }
1288
5.97M
    }
1289
1290
9.79M
    if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
1291
0
        Py_DECREF(filter);
1292
0
        return NULL;
1293
0
    }
1294
1295
9.79M
    list = PyList_New(0);
1296
9.79M
    if (!list) {
1297
0
        Py_DECREF(filter);
1298
0
        state_fini(&state);
1299
0
        return NULL;
1300
0
    }
1301
1302
9.79M
    n = i = 0;
1303
1304
16.3M
    while (!count || n < count) {
1305
1306
16.3M
        state_reset(&state);
1307
1308
16.3M
        state.ptr = state.start;
1309
1310
16.3M
        status = sre_search(&state, PatternObject_GetCode(self));
1311
16.3M
        if (PyErr_Occurred())
1312
0
            goto error;
1313
1314
16.3M
        if (status <= 0) {
1315
9.79M
            if (status == 0)
1316
9.79M
                break;
1317
0
            pattern_error(status);
1318
0
            goto error;
1319
9.79M
        }
1320
1321
6.56M
        b = STATE_OFFSET(&state, state.start);
1322
6.56M
        e = STATE_OFFSET(&state, state.ptr);
1323
1324
6.56M
        if (i < b) {
1325
            /* get segment before this match */
1326
3.35M
            item = getslice(state.isbytes, state.beginning,
1327
3.35M
                string, i, b);
1328
3.35M
            if (!item)
1329
0
                goto error;
1330
3.35M
            status = PyList_Append(list, item);
1331
3.35M
            Py_DECREF(item);
1332
3.35M
            if (status < 0)
1333
0
                goto error;
1334
1335
3.35M
        }
1336
1337
6.56M
        if (filter_type != LITERAL) {
1338
            /* pass match object through filter */
1339
6.56M
            match = pattern_new_match(module_state, self, &state, 1);
1340
6.56M
            if (!match)
1341
0
                goto error;
1342
6.56M
            if (filter_type == TEMPLATE) {
1343
0
                item = expand_template((TemplateObject *)filter,
1344
0
                                       (MatchObject *)match);
1345
0
            }
1346
6.56M
            else {
1347
6.56M
                assert(filter_type == CALLABLE);
1348
6.56M
                item = PyObject_CallOneArg(filter, match);
1349
6.56M
            }
1350
6.56M
            Py_DECREF(match);
1351
6.56M
            if (!item)
1352
56
                goto error;
1353
6.56M
        } else {
1354
            /* filter is literal string */
1355
2.59k
            item = Py_NewRef(filter);
1356
2.59k
        }
1357
1358
        /* add to list */
1359
6.56M
        if (item != Py_None) {
1360
6.56M
            status = PyList_Append(list, item);
1361
6.56M
            Py_DECREF(item);
1362
6.56M
            if (status < 0)
1363
0
                goto error;
1364
6.56M
        }
1365
1366
6.56M
        i = e;
1367
6.56M
        n = n + 1;
1368
6.56M
        state.must_advance = (state.ptr == state.start);
1369
6.56M
        state.start = state.ptr;
1370
6.56M
    }
1371
1372
    /* get segment following last match */
1373
9.79M
    if (i < state.endpos) {
1374
7.24M
        item = getslice(state.isbytes, state.beginning,
1375
7.24M
                        string, i, state.endpos);
1376
7.24M
        if (!item)
1377
0
            goto error;
1378
7.24M
        status = PyList_Append(list, item);
1379
7.24M
        Py_DECREF(item);
1380
7.24M
        if (status < 0)
1381
0
            goto error;
1382
7.24M
    }
1383
1384
9.79M
    state_fini(&state);
1385
1386
9.79M
    Py_DECREF(filter);
1387
1388
    /* convert list to single string (also removes list) */
1389
9.79M
    joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
1390
9.79M
    if (!joiner) {
1391
0
        Py_DECREF(list);
1392
0
        return NULL;
1393
0
    }
1394
9.79M
    if (PyList_GET_SIZE(list) == 0) {
1395
1.91M
        Py_DECREF(list);
1396
1.91M
        item = joiner;
1397
1.91M
    }
1398
7.87M
    else {
1399
7.87M
        if (state.isbytes)
1400
35.2k
            item = PyBytes_Join(joiner, list);
1401
7.84M
        else
1402
7.84M
            item = PyUnicode_Join(joiner, list);
1403
7.87M
        Py_DECREF(joiner);
1404
7.87M
        Py_DECREF(list);
1405
7.87M
        if (!item)
1406
0
            return NULL;
1407
7.87M
    }
1408
1409
9.79M
    if (subn)
1410
0
        return Py_BuildValue("Nn", item, n);
1411
1412
9.79M
    return item;
1413
1414
56
error:
1415
56
    Py_DECREF(list);
1416
56
    state_fini(&state);
1417
56
    Py_DECREF(filter);
1418
56
    return NULL;
1419
1420
9.79M
}
1421
1422
/*[clinic input]
1423
@permit_long_summary
1424
_sre.SRE_Pattern.sub
1425
1426
    cls: defining_class
1427
    /
1428
    repl: object
1429
    string: object
1430
    count: Py_ssize_t = 0
1431
1432
Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
1433
[clinic start generated code]*/
1434
1435
static PyObject *
1436
_sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
1437
                          PyObject *repl, PyObject *string, Py_ssize_t count)
1438
/*[clinic end generated code: output=4be141ab04bca60d input=eba511fd1c4908b7]*/
1439
9.79M
{
1440
9.79M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1441
1442
9.79M
    return pattern_subx(module_state, self, repl, string, count, 0);
1443
9.79M
}
1444
1445
/*[clinic input]
1446
@permit_long_summary
1447
_sre.SRE_Pattern.subn
1448
1449
    cls: defining_class
1450
    /
1451
    repl: object
1452
    string: object
1453
    count: Py_ssize_t = 0
1454
1455
Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
1456
[clinic start generated code]*/
1457
1458
static PyObject *
1459
_sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
1460
                           PyObject *repl, PyObject *string,
1461
                           Py_ssize_t count)
1462
/*[clinic end generated code: output=da02fd85258b1e1f input=6a5bb5b61717abf0]*/
1463
0
{
1464
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
1465
1466
0
    return pattern_subx(module_state, self, repl, string, count, 1);
1467
0
}
1468
1469
/*[clinic input]
1470
_sre.SRE_Pattern.__copy__
1471
1472
[clinic start generated code]*/
1473
1474
static PyObject *
1475
_sre_SRE_Pattern___copy___impl(PatternObject *self)
1476
/*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
1477
0
{
1478
0
    return Py_NewRef(self);
1479
0
}
1480
1481
/*[clinic input]
1482
_sre.SRE_Pattern.__deepcopy__
1483
1484
    memo: object
1485
    /
1486
1487
[clinic start generated code]*/
1488
1489
static PyObject *
1490
_sre_SRE_Pattern___deepcopy___impl(PatternObject *self, PyObject *memo)
1491
/*[clinic end generated code: output=75efe69bd12c5d7d input=a465b1602f997bed]*/
1492
0
{
1493
0
    return Py_NewRef(self);
1494
0
}
1495
1496
#ifdef Py_DEBUG
1497
/*[clinic input]
1498
_sre.SRE_Pattern._fail_after
1499
1500
    count: int
1501
    exception: object
1502
    /
1503
1504
For debugging.
1505
[clinic start generated code]*/
1506
1507
static PyObject *
1508
_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count,
1509
                                  PyObject *exception)
1510
/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/
1511
{
1512
    self->fail_after_count = count;
1513
    Py_INCREF(exception);
1514
    Py_XSETREF(self->fail_after_exc, exception);
1515
    Py_RETURN_NONE;
1516
}
1517
#endif /* Py_DEBUG */
1518
1519
static PyObject *
pattern_repr(PyObject *self)
{
    /* Produce "re.compile(<pattern>)" or "re.compile(<pattern>, <flags>)",
     * where <flags> is a "|"-joined list of the known flag names followed,
     * if any bits are left over, by their hexadecimal value. */
    static const struct {
        const char *name;
        int value;
    } flag_names[] = {
        {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
        {"re.LOCALE", SRE_FLAG_LOCALE},
        {"re.MULTILINE", SRE_FLAG_MULTILINE},
        {"re.DOTALL", SRE_FLAG_DOTALL},
        {"re.UNICODE", SRE_FLAG_UNICODE},
        {"re.VERBOSE", SRE_FLAG_VERBOSE},
        {"re.DEBUG", SRE_FLAG_DEBUG},
        {"re.ASCII", SRE_FLAG_ASCII},
    };

    PatternObject *pat = _PatternObject_CAST(self);
    PyObject *text = NULL;
    PyObject *names;
    int remaining = pat->flags;

    /* Omit re.UNICODE for valid string patterns. */
    if (pat->isbytes == 0) {
        int mode = remaining & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII);
        if (mode == SRE_FLAG_UNICODE) {
            remaining &= ~SRE_FLAG_UNICODE;
        }
    }

    names = PyList_New(0);
    if (names == NULL) {
        return NULL;
    }

    for (size_t k = 0; k < Py_ARRAY_LENGTH(flag_names); k++) {
        if (!(remaining & flag_names[k].value)) {
            continue;
        }
        PyObject *label = PyUnicode_FromString(flag_names[k].name);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(names, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
        /* Clear the bit so only unnamed bits are left at the end. */
        remaining &= ~flag_names[k].value;
    }
    if (remaining) {
        PyObject *label = PyUnicode_FromFormat("0x%x", remaining);
        if (label == NULL) {
            goto done;
        }
        int rc = PyList_Append(names, label);
        Py_DECREF(label);
        if (rc < 0) {
            goto done;
        }
    }

    if (PyList_Size(names) > 0) {
        PyObject *bar = PyUnicode_FromString("|");
        if (bar == NULL) {
            goto done;
        }
        PyObject *joined = PyUnicode_Join(bar, names);
        Py_DECREF(bar);
        if (joined == NULL) {
            goto done;
        }
        text = PyUnicode_FromFormat("re.compile(%.200R, %S)",
                                    pat->pattern, joined);
        Py_DECREF(joined);
    }
    else {
        text = PyUnicode_FromFormat("re.compile(%.200R)", pat->pattern);
    }

done:
    Py_DECREF(names);
    return text;
}
1599
1600
PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
1601
1602
/* PatternObject's 'groupindex' method. */
1603
static PyObject *
pattern_groupindex(PyObject *op, void *Py_UNUSED(ignored))
{
    /* Getter: a fresh empty dict when the pattern stores no group index,
     * otherwise a dict proxy wrapping the stored mapping. */
    PatternObject *pat = _PatternObject_CAST(op);
    return (pat->groupindex != NULL) ? PyDictProxy_New(pat->groupindex)
                                     : PyDict_New();
}
1611
1612
static int _validate(PatternObject *self); /* Forward */
1613
1614
/*[clinic input]
1615
_sre.compile
1616
1617
    pattern: object
1618
    flags: int
1619
    code: object(subclass_of='&PyList_Type')
1620
    groups: Py_ssize_t
1621
    groupindex: object(subclass_of='&PyDict_Type')
1622
    indexgroup: object(subclass_of='&PyTuple_Type')
1623
1624
[clinic start generated code]*/
1625
1626
static PyObject *
1627
_sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
1628
                  PyObject *code, Py_ssize_t groups, PyObject *groupindex,
1629
                  PyObject *indexgroup)
1630
/*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
1631
3.74k
{
1632
    /* "compile" pattern descriptor to pattern object */
1633
1634
3.74k
    _sremodulestate *module_state = get_sre_module_state(module);
1635
3.74k
    PatternObject* self;
1636
3.74k
    Py_ssize_t i, n;
1637
1638
3.74k
    n = PyList_GET_SIZE(code);
1639
    /* coverity[ampersand_in_size] */
1640
3.74k
    self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
1641
3.74k
    if (!self)
1642
0
        return NULL;
1643
3.74k
    self->weakreflist = NULL;
1644
3.74k
    self->pattern = NULL;
1645
3.74k
    self->groupindex = NULL;
1646
3.74k
    self->indexgroup = NULL;
1647
#ifdef Py_DEBUG
1648
    self->fail_after_count = -1;
1649
    self->fail_after_exc = NULL;
1650
#endif
1651
1652
3.74k
    self->codesize = n;
1653
1654
96.5M
    for (i = 0; i < n; i++) {
1655
96.5M
        PyObject *o = PyList_GET_ITEM(code, i);
1656
96.5M
        unsigned long value = PyLong_AsUnsignedLong(o);
1657
96.5M
        if (value == (unsigned long)-1 && PyErr_Occurred()) {
1658
0
            break;
1659
0
        }
1660
96.5M
        self->code[i] = (SRE_CODE) value;
1661
96.5M
        if ((unsigned long) self->code[i] != value) {
1662
0
            PyErr_SetString(PyExc_OverflowError,
1663
0
                            "regular expression code size limit exceeded");
1664
0
            break;
1665
0
        }
1666
96.5M
    }
1667
3.74k
    PyObject_GC_Track(self);
1668
1669
3.74k
    if (PyErr_Occurred()) {
1670
0
        Py_DECREF(self);
1671
0
        return NULL;
1672
0
    }
1673
1674
3.74k
    if (pattern == Py_None) {
1675
0
        self->isbytes = -1;
1676
0
    }
1677
3.74k
    else {
1678
3.74k
        Py_ssize_t p_length;
1679
3.74k
        int charsize;
1680
3.74k
        Py_buffer view;
1681
3.74k
        view.buf = NULL;
1682
3.74k
        if (!getstring(pattern, &p_length, &self->isbytes,
1683
3.74k
                       &charsize, &view)) {
1684
0
            Py_DECREF(self);
1685
0
            return NULL;
1686
0
        }
1687
3.74k
        if (view.buf)
1688
42
            PyBuffer_Release(&view);
1689
3.74k
    }
1690
1691
3.74k
    self->pattern = Py_NewRef(pattern);
1692
1693
3.74k
    self->flags = flags;
1694
1695
3.74k
    self->groups = groups;
1696
1697
3.74k
    if (PyDict_GET_SIZE(groupindex) > 0) {
1698
59
        self->groupindex = Py_NewRef(groupindex);
1699
59
        if (PyTuple_GET_SIZE(indexgroup) > 0) {
1700
59
            self->indexgroup = Py_NewRef(indexgroup);
1701
59
        }
1702
59
    }
1703
1704
3.74k
    if (!_validate(self)) {
1705
0
        Py_DECREF(self);
1706
0
        return NULL;
1707
0
    }
1708
1709
3.74k
    return (PyObject*) self;
1710
3.74k
}
1711
1712
/*[clinic input]
1713
_sre.template
1714
1715
    pattern: object
1716
    template: object(subclass_of="&PyList_Type")
1717
        A list containing interleaved literal strings (str or bytes) and group
1718
        indices (int), as returned by re._parser.parse_template():
1719
            [literal1, group1, ..., literalN, groupN]
1720
    /
1721
1722
[clinic start generated code]*/
1723
1724
static PyObject *
_sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
/*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
{
    /* template is a list of odd length: a leading literal (str or bytes)
     * followed by (group index, literal) pairs:
     *   [literal0, group1, literal1, ..., groupN, literalN]
     * Raises TypeError("invalid template") for an empty or even-length
     * list and for a negative group index.
     */
    _sremodulestate *module_state = get_sre_module_state(module);
    TemplateObject *self = NULL;
    Py_ssize_t length = PyList_GET_SIZE(template);
    Py_ssize_t pairs;

    if (length < 1 || (length & 1) == 0) {
        goto bad_template;
    }
    pairs = length / 2;

    self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, pairs);
    if (self == NULL) {
        return NULL;
    }
    self->chunks = 1 + 2*pairs;
    self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));

    for (Py_ssize_t k = 0; k < pairs; k++) {
        Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*k+1));
        if (index < 0) {
            /* Shrink the object to the k items filled in so far before
             * releasing it (presumably so that deallocation only visits
             * initialised items -- the deallocator is not shown here). */
            Py_SET_SIZE(self, k);
            if (index == -1 && PyErr_Occurred()) {
                /* Conversion failed; keep the exception already set. */
                Py_DECREF(self);
                return NULL;
            }
            goto bad_template;
        }
        self->items[k].index = index;

        PyObject *literal = PyList_GET_ITEM(template, 2*k+2);
        int is_empty =
            (PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) ||
            (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal));
        if (is_empty) {
            // Skip empty literals.
            self->items[k].literal = NULL;
            self->chunks--;
        }
        else {
            self->items[k].literal = Py_NewRef(literal);
        }
    }
    PyObject_GC_Track(self);
    return (PyObject *)self;

bad_template:
    PyErr_SetString(PyExc_TypeError, "invalid template");
    Py_XDECREF(self);
    return NULL;
}
1775
1776
/* -------------------------------------------------------------------- */
1777
/* Code validation */
1778
1779
/* To learn more about this code, have a look at the _compile() function in
1780
   Lib/sre_compile.py.  The validation functions below check the code array
1781
   for conformance with the code patterns generated there.
1782
1783
   The nice thing about the generated code is that it is position-independent:
1784
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
1785
   the target of a later jump is always earlier than the target of an earlier
1786
   jump.  IOW, this is okay:
1787
1788
   J---------J-------T--------T
1789
    \         \_____/        /
1790
     \______________________/
1791
1792
   but this is not:
1793
1794
   J---------J-------T--------T
1795
    \_________\_____/        /
1796
               \____________/
1797
1798
   It also helps that SRE_CODE is always an unsigned type.
1799
*/
1800
1801
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator.  The argument is a complete, parenthesised
   printf() argument list, e.g. VTRACE(("%d\n", x)). */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v) do {} while(0)  /* do nothing */
#endif

/* Report failure: returns -1 from the enclosing function. */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros operate on variables of the enclosing function: they read
   `end`, advance `code`, and store into `op`, `arg` or `skip` respectively.
   Each one FAILs when `code` has already reached `end`. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* Read a skip count and FAIL when `skip - adj` exceeds the number of code
   words left from the skip word up to `end`.  `adj` is parenthesised in the
   expansion so that an expression argument is subtracted as a whole. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip-(adj) > (uintptr_t)(end - code))       \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
1841
1842
static int
1843
_validate_charset(SRE_CODE *code, SRE_CODE *end)
1844
3.40M
{
1845
    /* Some variables are manipulated by the macros above */
1846
3.40M
    SRE_CODE op;
1847
3.40M
    SRE_CODE arg;
1848
3.40M
    SRE_CODE offset;
1849
3.40M
    int i;
1850
1851
10.1M
    while (code < end) {
1852
6.71M
        GET_OP;
1853
6.71M
        switch (op) {
1854
1855
1.26k
        case SRE_OP_NEGATE:
1856
1.26k
            break;
1857
1858
6.61M
        case SRE_OP_LITERAL:
1859
6.61M
            GET_ARG;
1860
6.61M
            break;
1861
1862
6.61M
        case SRE_OP_RANGE:
1863
11.7k
        case SRE_OP_RANGE_UNI_IGNORE:
1864
11.7k
            GET_ARG;
1865
11.7k
            GET_ARG;
1866
11.7k
            break;
1867
1868
11.7k
        case SRE_OP_CHARSET:
1869
784
            offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
1870
784
            if (offset > (uintptr_t)(end - code))
1871
0
                FAIL;
1872
784
            code += offset;
1873
784
            break;
1874
1875
89.6k
        case SRE_OP_BIGCHARSET:
1876
89.6k
            GET_ARG; /* Number of blocks */
1877
89.6k
            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
1878
89.6k
            if (offset > (uintptr_t)(end - code))
1879
0
                FAIL;
1880
            /* Make sure that each byte points to a valid block */
1881
23.0M
            for (i = 0; i < 256; i++) {
1882
22.9M
                if (((unsigned char *)code)[i] >= arg)
1883
0
                    FAIL;
1884
22.9M
            }
1885
89.6k
            code += offset;
1886
89.6k
            offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
1887
89.6k
            if (offset > (uintptr_t)(end - code))
1888
0
                FAIL;
1889
89.6k
            code += offset;
1890
89.6k
            break;
1891
1892
1.63k
        case SRE_OP_CATEGORY:
1893
1.63k
            GET_ARG;
1894
1.63k
            switch (arg) {
1895
34
            case SRE_CATEGORY_DIGIT:
1896
34
            case SRE_CATEGORY_NOT_DIGIT:
1897
66
            case SRE_CATEGORY_SPACE:
1898
66
            case SRE_CATEGORY_NOT_SPACE:
1899
92
            case SRE_CATEGORY_WORD:
1900
92
            case SRE_CATEGORY_NOT_WORD:
1901
92
            case SRE_CATEGORY_LINEBREAK:
1902
92
            case SRE_CATEGORY_NOT_LINEBREAK:
1903
92
            case SRE_CATEGORY_LOC_WORD:
1904
92
            case SRE_CATEGORY_LOC_NOT_WORD:
1905
222
            case SRE_CATEGORY_UNI_DIGIT:
1906
766
            case SRE_CATEGORY_UNI_NOT_DIGIT:
1907
1.46k
            case SRE_CATEGORY_UNI_SPACE:
1908
1.47k
            case SRE_CATEGORY_UNI_NOT_SPACE:
1909
1.57k
            case SRE_CATEGORY_UNI_WORD:
1910
1.63k
            case SRE_CATEGORY_UNI_NOT_WORD:
1911
1.63k
            case SRE_CATEGORY_UNI_LINEBREAK:
1912
1.63k
            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
1913
1.63k
                break;
1914
0
            default:
1915
0
                FAIL;
1916
1.63k
            }
1917
1.63k
            break;
1918
1919
1.63k
        default:
1920
0
            FAIL;
1921
1922
6.71M
        }
1923
6.71M
    }
1924
1925
3.40M
    return 0;
1926
3.40M
}
1927
1928
/* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
1929
static int
1930
_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
1931
2.20M
{
1932
    /* Some variables are manipulated by the macros above */
1933
2.20M
    SRE_CODE op;
1934
2.20M
    SRE_CODE arg;
1935
2.20M
    SRE_CODE skip;
1936
1937
2.20M
    VTRACE(("code=%p, end=%p\n", code, end));
1938
1939
2.20M
    if (code > end)
1940
0
        FAIL;
1941
1942
27.6M
    while (code < end) {
1943
25.4M
        GET_OP;
1944
25.4M
        switch (op) {
1945
1946
358k
        case SRE_OP_MARK:
1947
            /* We don't check whether marks are properly nested; the
1948
               sre_match() code is robust even if they don't, and the worst
1949
               you can get is nonsensical match results. */
1950
358k
            GET_ARG;
1951
358k
            if (arg >= 2 * (size_t)groups) {
1952
0
                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
1953
0
                FAIL;
1954
0
            }
1955
358k
            break;
1956
1957
16.3M
        case SRE_OP_LITERAL:
1958
16.3M
        case SRE_OP_NOT_LITERAL:
1959
16.3M
        case SRE_OP_LITERAL_IGNORE:
1960
16.3M
        case SRE_OP_NOT_LITERAL_IGNORE:
1961
20.1M
        case SRE_OP_LITERAL_UNI_IGNORE:
1962
20.1M
        case SRE_OP_NOT_LITERAL_UNI_IGNORE:
1963
20.1M
        case SRE_OP_LITERAL_LOC_IGNORE:
1964
20.1M
        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
1965
20.1M
            GET_ARG;
1966
            /* The arg is just a character, nothing to check */
1967
20.1M
            break;
1968
1969
20.1M
        case SRE_OP_SUCCESS:
1970
81
        case SRE_OP_FAILURE:
1971
            /* Nothing to check; these normally end the matching process */
1972
81
            break;
1973
1974
83.9k
        case SRE_OP_AT:
1975
83.9k
            GET_ARG;
1976
83.9k
            switch (arg) {
1977
49
            case SRE_AT_BEGINNING:
1978
57
            case SRE_AT_BEGINNING_STRING:
1979
71.7k
            case SRE_AT_BEGINNING_LINE:
1980
71.8k
            case SRE_AT_END:
1981
79.6k
            case SRE_AT_END_LINE:
1982
79.7k
            case SRE_AT_END_STRING:
1983
79.7k
            case SRE_AT_BOUNDARY:
1984
79.7k
            case SRE_AT_NON_BOUNDARY:
1985
79.7k
            case SRE_AT_LOC_BOUNDARY:
1986
79.7k
            case SRE_AT_LOC_NON_BOUNDARY:
1987
83.9k
            case SRE_AT_UNI_BOUNDARY:
1988
83.9k
            case SRE_AT_UNI_NON_BOUNDARY:
1989
83.9k
                break;
1990
0
            default:
1991
0
                FAIL;
1992
83.9k
            }
1993
83.9k
            break;
1994
1995
83.9k
        case SRE_OP_ANY:
1996
38.2k
        case SRE_OP_ANY_ALL:
1997
            /* These have no operands */
1998
38.2k
            break;
1999
2000
5.79k
        case SRE_OP_IN:
2001
6.04k
        case SRE_OP_IN_IGNORE:
2002
3.40M
        case SRE_OP_IN_UNI_IGNORE:
2003
3.40M
        case SRE_OP_IN_LOC_IGNORE:
2004
3.40M
            GET_SKIP;
2005
            /* Stop 1 before the end; we check the FAILURE below */
2006
3.40M
            if (_validate_charset(code, code+skip-2))
2007
0
                FAIL;
2008
3.40M
            if (code[skip-2] != SRE_OP_FAILURE)
2009
0
                FAIL;
2010
3.40M
            code += skip-1;
2011
3.40M
            break;
2012
2013
3.74k
        case SRE_OP_INFO:
2014
3.74k
            {
2015
                /* A minimal info field is
2016
                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2017
                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2018
                   more follows. */
2019
3.74k
                SRE_CODE flags, i;
2020
3.74k
                SRE_CODE *newcode;
2021
3.74k
                GET_SKIP;
2022
3.74k
                newcode = code+skip-1;
2023
3.74k
                GET_ARG; flags = arg;
2024
3.74k
                GET_ARG;
2025
3.74k
                GET_ARG;
2026
                /* Check that only valid flags are present */
2027
3.74k
                if ((flags & ~(SRE_INFO_PREFIX |
2028
3.74k
                               SRE_INFO_LITERAL |
2029
3.74k
                               SRE_INFO_CHARSET)) != 0)
2030
0
                    FAIL;
2031
                /* PREFIX and CHARSET are mutually exclusive */
2032
3.74k
                if ((flags & SRE_INFO_PREFIX) &&
2033
1.64k
                    (flags & SRE_INFO_CHARSET))
2034
0
                    FAIL;
2035
                /* LITERAL implies PREFIX */
2036
3.74k
                if ((flags & SRE_INFO_LITERAL) &&
2037
714
                    !(flags & SRE_INFO_PREFIX))
2038
0
                    FAIL;
2039
                /* Validate the prefix */
2040
3.74k
                if (flags & SRE_INFO_PREFIX) {
2041
1.64k
                    SRE_CODE prefix_len;
2042
1.64k
                    GET_ARG; prefix_len = arg;
2043
1.64k
                    GET_ARG;
2044
                    /* Here comes the prefix string */
2045
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2046
0
                        FAIL;
2047
1.64k
                    code += prefix_len;
2048
                    /* And here comes the overlap table */
2049
1.64k
                    if (prefix_len > (uintptr_t)(newcode - code))
2050
0
                        FAIL;
2051
                    /* Each overlap value should be < prefix_len */
2052
7.14M
                    for (i = 0; i < prefix_len; i++) {
2053
7.14M
                        if (code[i] >= prefix_len)
2054
0
                            FAIL;
2055
7.14M
                    }
2056
1.64k
                    code += prefix_len;
2057
1.64k
                }
2058
                /* Validate the charset */
2059
3.74k
                if (flags & SRE_INFO_CHARSET) {
2060
409
                    if (_validate_charset(code, newcode-1))
2061
0
                        FAIL;
2062
409
                    if (newcode[-1] != SRE_OP_FAILURE)
2063
0
                        FAIL;
2064
409
                    code = newcode;
2065
409
                }
2066
3.33k
                else if (code != newcode) {
2067
0
                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
2068
0
                    FAIL;
2069
0
                }
2070
3.74k
            }
2071
3.74k
            break;
2072
2073
28.0k
        case SRE_OP_BRANCH:
2074
28.0k
            {
2075
28.0k
                SRE_CODE *target = NULL;
2076
909k
                for (;;) {
2077
909k
                    GET_SKIP;
2078
909k
                    if (skip == 0)
2079
28.0k
                        break;
2080
                    /* Stop 2 before the end; we check the JUMP below */
2081
881k
                    if (_validate_inner(code, code+skip-3, groups))
2082
0
                        FAIL;
2083
881k
                    code += skip-3;
2084
                    /* Check that it ends with a JUMP, and that each JUMP
2085
                       has the same target */
2086
881k
                    GET_OP;
2087
881k
                    if (op != SRE_OP_JUMP)
2088
0
                        FAIL;
2089
881k
                    GET_SKIP;
2090
881k
                    if (target == NULL)
2091
28.0k
                        target = code+skip-1;
2092
853k
                    else if (code+skip-1 != target)
2093
0
                        FAIL;
2094
881k
                }
2095
28.0k
                if (code != target)
2096
0
                    FAIL;
2097
28.0k
            }
2098
28.0k
            break;
2099
2100
1.27M
        case SRE_OP_REPEAT_ONE:
2101
1.27M
        case SRE_OP_MIN_REPEAT_ONE:
2102
1.27M
        case SRE_OP_POSSESSIVE_REPEAT_ONE:
2103
1.27M
            {
2104
1.27M
                SRE_CODE min, max;
2105
1.27M
                GET_SKIP;
2106
1.27M
                GET_ARG; min = arg;
2107
1.27M
                GET_ARG; max = arg;
2108
1.27M
                if (min > max)
2109
0
                    FAIL;
2110
1.27M
                if (max > SRE_MAXREPEAT)
2111
0
                    FAIL;
2112
1.27M
                if (_validate_inner(code, code+skip-4, groups))
2113
0
                    FAIL;
2114
1.27M
                code += skip-4;
2115
1.27M
                GET_OP;
2116
1.27M
                if (op != SRE_OP_SUCCESS)
2117
0
                    FAIL;
2118
1.27M
            }
2119
1.27M
            break;
2120
2121
1.27M
        case SRE_OP_REPEAT:
2122
41.9k
        case SRE_OP_POSSESSIVE_REPEAT:
2123
41.9k
            {
2124
41.9k
                SRE_CODE op1 = op, min, max;
2125
41.9k
                GET_SKIP;
2126
41.9k
                GET_ARG; min = arg;
2127
41.9k
                GET_ARG; max = arg;
2128
41.9k
                if (min > max)
2129
0
                    FAIL;
2130
41.9k
                if (max > SRE_MAXREPEAT)
2131
0
                    FAIL;
2132
41.9k
                if (_validate_inner(code, code+skip-3, groups))
2133
0
                    FAIL;
2134
41.9k
                code += skip-3;
2135
41.9k
                GET_OP;
2136
41.9k
                if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
2137
46
                    if (op != SRE_OP_SUCCESS)
2138
0
                        FAIL;
2139
46
                }
2140
41.9k
                else {
2141
41.9k
                    if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
2142
0
                        FAIL;
2143
41.9k
                }
2144
41.9k
            }
2145
41.9k
            break;
2146
2147
41.9k
        case SRE_OP_ATOMIC_GROUP:
2148
157
            {
2149
157
                GET_SKIP;
2150
157
                if (_validate_inner(code, code+skip-2, groups))
2151
0
                    FAIL;
2152
157
                code += skip-2;
2153
157
                GET_OP;
2154
157
                if (op != SRE_OP_SUCCESS)
2155
0
                    FAIL;
2156
157
            }
2157
157
            break;
2158
2159
157
        case SRE_OP_GROUPREF:
2160
849
        case SRE_OP_GROUPREF_IGNORE:
2161
1.60k
        case SRE_OP_GROUPREF_UNI_IGNORE:
2162
1.60k
        case SRE_OP_GROUPREF_LOC_IGNORE:
2163
1.60k
            GET_ARG;
2164
1.60k
            if (arg >= (size_t)groups)
2165
0
                FAIL;
2166
1.60k
            break;
2167
2168
1.60k
        case SRE_OP_GROUPREF_EXISTS:
2169
            /* The regex syntax for this is: '(?(group)then|else)', where
2170
               'group' is either an integer group number or a group name,
2171
               'then' and 'else' are sub-regexes, and 'else' is optional. */
2172
54
            GET_ARG;
2173
54
            if (arg >= (size_t)groups)
2174
0
                FAIL;
2175
54
            GET_SKIP_ADJ(1);
2176
54
            code--; /* The skip is relative to the first arg! */
2177
            /* There are two possibilities here: if there is both a 'then'
2178
               part and an 'else' part, the generated code looks like:
2179
2180
               GROUPREF_EXISTS
2181
               <group>
2182
               <skipyes>
2183
               ...then part...
2184
               JUMP
2185
               <skipno>
2186
               (<skipyes> jumps here)
2187
               ...else part...
2188
               (<skipno> jumps here)
2189
2190
               If there is only a 'then' part, it looks like:
2191
2192
               GROUPREF_EXISTS
2193
               <group>
2194
               <skip>
2195
               ...then part...
2196
               (<skip> jumps here)
2197
2198
               There is no direct way to decide which it is, and we don't want
2199
               to allow arbitrary jumps anywhere in the code; so we just look
2200
               for a JUMP opcode preceding our skip target.
2201
            */
2202
54
            VTRACE(("then part:\n"));
2203
54
            int rc = _validate_inner(code+1, code+skip-1, groups);
2204
54
            if (rc == 1) {
2205
32
                VTRACE(("else part:\n"));
2206
32
                code += skip-2; /* Position after JUMP, at <skipno> */
2207
32
                GET_SKIP;
2208
32
                rc = _validate_inner(code, code+skip-1, groups);
2209
32
            }
2210
54
            if (rc)
2211
0
                FAIL;
2212
54
            code += skip-1;
2213
54
            break;
2214
2215
117
        case SRE_OP_ASSERT:
2216
369
        case SRE_OP_ASSERT_NOT:
2217
369
            GET_SKIP;
2218
369
            GET_ARG; /* 0 for lookahead, width for lookbehind */
2219
369
            code--; /* Back up over arg to simplify math below */
2220
            /* Stop 1 before the end; we check the SUCCESS below */
2221
369
            if (_validate_inner(code+1, code+skip-2, groups))
2222
0
                FAIL;
2223
369
            code += skip-2;
2224
369
            GET_OP;
2225
369
            if (op != SRE_OP_SUCCESS)
2226
0
                FAIL;
2227
369
            break;
2228
2229
369
        case SRE_OP_JUMP:
2230
32
            if (code + 1 != end)
2231
0
                FAIL;
2232
32
            VTRACE(("JUMP: %d\n", __LINE__));
2233
32
            return 1;
2234
2235
0
        default:
2236
0
            FAIL;
2237
2238
25.4M
        }
2239
25.4M
    }
2240
2241
2.20M
    VTRACE(("okay\n"));
2242
2.20M
    return 0;
2243
2.20M
}
2244
2245
static int
2246
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2247
3.74k
{
2248
3.74k
    if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
2249
3.74k
        code >= end || end[-1] != SRE_OP_SUCCESS)
2250
0
        FAIL;
2251
3.74k
    return _validate_inner(code, end-1, groups);
2252
3.74k
}
2253
2254
static int
2255
_validate(PatternObject *self)
2256
3.74k
{
2257
3.74k
    if (_validate_outer(self->code, self->code+self->codesize, self->groups))
2258
0
    {
2259
0
        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
2260
0
        return 0;
2261
0
    }
2262
3.74k
    else
2263
3.74k
        VTRACE(("Success!\n"));
2264
3.74k
    return 1;
2265
3.74k
}
2266
2267
/* -------------------------------------------------------------------- */
2268
/* match methods */
2269
2270
static int
2271
match_traverse(PyObject *op, visitproc visit, void *arg)
2272
49.7k
{
2273
49.7k
    MatchObject *self = _MatchObject_CAST(op);
2274
49.7k
    Py_VISIT(Py_TYPE(self));
2275
49.7k
    Py_VISIT(self->string);
2276
49.7k
    Py_VISIT(self->regs);
2277
49.7k
    Py_VISIT(self->pattern);
2278
49.7k
    return 0;
2279
49.7k
}
2280
2281
static int
2282
match_clear(PyObject *op)
2283
56.1M
{
2284
56.1M
    MatchObject *self = _MatchObject_CAST(op);
2285
56.1M
    Py_CLEAR(self->string);
2286
56.1M
    Py_CLEAR(self->regs);
2287
56.1M
    Py_CLEAR(self->pattern);
2288
56.1M
    return 0;
2289
56.1M
}
2290
2291
static void
2292
match_dealloc(PyObject *self)
2293
56.1M
{
2294
56.1M
    PyTypeObject *tp = Py_TYPE(self);
2295
56.1M
    PyObject_GC_UnTrack(self);
2296
56.1M
    (void)match_clear(self);
2297
56.1M
    tp->tp_free(self);
2298
56.1M
    Py_DECREF(tp);
2299
56.1M
}
2300
2301
static PyObject*
2302
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2303
52.7M
{
2304
52.7M
    Py_ssize_t length;
2305
52.7M
    int isbytes, charsize;
2306
52.7M
    Py_buffer view;
2307
52.7M
    PyObject *result;
2308
52.7M
    const void* ptr;
2309
52.7M
    Py_ssize_t i, j;
2310
2311
52.7M
    assert(0 <= index && index < self->groups);
2312
52.7M
    index *= 2;
2313
2314
52.7M
    if (self->string == Py_None || self->mark[index] < 0) {
2315
        /* return default value if the string or group is undefined */
2316
9.75M
        return Py_NewRef(def);
2317
9.75M
    }
2318
2319
43.0M
    ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
2320
43.0M
    if (ptr == NULL)
2321
0
        return NULL;
2322
2323
43.0M
    i = self->mark[index];
2324
43.0M
    j = self->mark[index+1];
2325
43.0M
    i = Py_MIN(i, length);
2326
43.0M
    j = Py_MIN(j, length);
2327
43.0M
    result = getslice(isbytes, ptr, self->string, i, j);
2328
43.0M
    if (isbytes && view.buf != NULL)
2329
414k
        PyBuffer_Release(&view);
2330
43.0M
    return result;
2331
43.0M
}
2332
2333
static Py_ssize_t
2334
match_getindex(MatchObject* self, PyObject* index)
2335
72.6M
{
2336
72.6M
    Py_ssize_t i;
2337
2338
72.6M
    if (index == NULL)
2339
        /* Default value */
2340
19.1M
        return 0;
2341
2342
53.5M
    if (PyIndex_Check(index)) {
2343
35.7M
        i = PyNumber_AsSsize_t(index, NULL);
2344
35.7M
    }
2345
17.8M
    else {
2346
17.8M
        i = -1;
2347
2348
17.8M
        if (self->pattern->groupindex) {
2349
17.8M
            index = PyDict_GetItemWithError(self->pattern->groupindex, index);
2350
17.8M
            if (index && PyLong_Check(index)) {
2351
17.8M
                i = PyLong_AsSsize_t(index);
2352
17.8M
            }
2353
17.8M
        }
2354
17.8M
    }
2355
53.5M
    if (i < 0 || i >= self->groups) {
2356
        /* raise IndexError if we were given a bad group number */
2357
0
        if (!PyErr_Occurred()) {
2358
0
            PyErr_SetString(PyExc_IndexError, "no such group");
2359
0
        }
2360
0
        return -1;
2361
0
    }
2362
2363
    // Check that i*2 cannot overflow to make static analyzers happy
2364
53.5M
    assert((size_t)i <= SRE_MAXGROUPS);
2365
53.5M
    return i;
2366
53.5M
}
2367
2368
static PyObject*
2369
match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2370
52.7M
{
2371
52.7M
    Py_ssize_t i = match_getindex(self, index);
2372
2373
52.7M
    if (i < 0) {
2374
0
        return NULL;
2375
0
    }
2376
2377
52.7M
    return match_getslice_by_index(self, i, def);
2378
52.7M
}
2379
2380
/*[clinic input]
2381
@permit_long_summary
2382
_sre.SRE_Match.expand
2383
2384
    template: object
2385
2386
Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
2387
[clinic start generated code]*/
2388
2389
static PyObject *
2390
_sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
2391
/*[clinic end generated code: output=931b58ccc323c3a1 input=dc74d81265376ac3]*/
2392
0
{
2393
0
    _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
2394
0
    PyObject *filter = compile_template(module_state, self->pattern, template);
2395
0
    if (filter == NULL) {
2396
0
        return NULL;
2397
0
    }
2398
0
    PyObject *result = expand_template((TemplateObject *)filter, self);
2399
0
    Py_DECREF(filter);
2400
0
    return result;
2401
0
}
2402
2403
/* Implementation of Match.group(*args).

   With no argument the whole match (group 0) is returned, with one
   argument that group's substring, and with several arguments a tuple of
   the substrings in argument order.  Unmatched groups yield None.
   Returns NULL with an exception set on failure. */
static PyObject*
match_group(PyObject *op, PyObject* args)
{
    MatchObject *self = _MatchObject_CAST(op);
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);

    if (nargs == 0) {
        return match_getslice(self, _PyLong_GetZero(), Py_None);
    }
    if (nargs == 1) {
        return match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
    }

    /* fetch multiple items */
    PyObject *slices = PyTuple_New(nargs);
    if (slices == NULL) {
        return NULL;
    }
    for (Py_ssize_t pos = 0; pos < nargs; pos++) {
        PyObject *slice = match_getslice(self, PyTuple_GET_ITEM(args, pos),
                                         Py_None);
        if (slice == NULL) {
            Py_DECREF(slices);
            return NULL;
        }
        /* The tuple takes over the reference to the slice. */
        PyTuple_SET_ITEM(slices, pos, slice);
    }
    return slices;
}
2438
2439
/* Subscript hook (match[name]): same as group(name) for a single group,
   with None for a group that did not participate. */
static PyObject*
match_getitem(PyObject *op, PyObject* name)
{
    return match_getslice(_MatchObject_CAST(op), name, Py_None);
}
2445
2446
/*[clinic input]
2447
_sre.SRE_Match.groups
2448
2449
    default: object = None
2450
        Is used for groups that did not participate in the match.
2451
2452
Return a tuple containing all the subgroups of the match, from 1.
2453
[clinic start generated code]*/
2454
2455
static PyObject *
2456
_sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
2457
/*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
2458
323
{
2459
323
    PyObject* result;
2460
323
    Py_ssize_t index;
2461
2462
323
    result = PyTuple_New(self->groups-1);
2463
323
    if (!result)
2464
0
        return NULL;
2465
2466
2.74k
    for (index = 1; index < self->groups; index++) {
2467
2.42k
        PyObject* item;
2468
2.42k
        item = match_getslice_by_index(self, index, default_value);
2469
2.42k
        if (!item) {
2470
0
            Py_DECREF(result);
2471
0
            return NULL;
2472
0
        }
2473
2.42k
        PyTuple_SET_ITEM(result, index-1, item);
2474
2.42k
    }
2475
2476
323
    return result;
2477
323
}
2478
2479
/*[clinic input]
2480
@permit_long_summary
2481
_sre.SRE_Match.groupdict
2482
2483
    default: object = None
2484
        Is used for groups that did not participate in the match.
2485
2486
Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
2487
[clinic start generated code]*/
2488
2489
static PyObject *
2490
_sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
2491
/*[clinic end generated code: output=29917c9073e41757 input=a8d3a1dc80336872]*/
2492
124
{
2493
124
    PyObject *result;
2494
124
    PyObject *key;
2495
124
    PyObject *value;
2496
124
    Py_ssize_t pos = 0;
2497
124
    Py_hash_t hash;
2498
2499
124
    result = PyDict_New();
2500
124
    if (!result || !self->pattern->groupindex)
2501
0
        return result;
2502
2503
124
    Py_BEGIN_CRITICAL_SECTION(self->pattern->groupindex);
2504
820
    while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
2505
696
        int status;
2506
696
        Py_INCREF(key);
2507
696
        value = match_getslice(self, key, default_value);
2508
696
        if (!value) {
2509
0
            Py_DECREF(key);
2510
0
            Py_CLEAR(result);
2511
0
            goto exit;
2512
0
        }
2513
696
        status = _PyDict_SetItem_KnownHash(result, key, value, hash);
2514
696
        Py_DECREF(value);
2515
696
        Py_DECREF(key);
2516
696
        if (status < 0) {
2517
0
            Py_CLEAR(result);
2518
0
            goto exit;
2519
0
        }
2520
696
    }
2521
124
exit:;
2522
124
    Py_END_CRITICAL_SECTION();
2523
2524
124
    return result;
2525
124
}
2526
2527
/*[clinic input]
2528
_sre.SRE_Match.start -> Py_ssize_t
2529
2530
    group: object(c_default="NULL") = 0
2531
    /
2532
2533
Return index of the start of the substring matched by group.
2534
[clinic start generated code]*/
2535
2536
static Py_ssize_t
2537
_sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
2538
/*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
2539
5.29M
{
2540
5.29M
    Py_ssize_t index = match_getindex(self, group);
2541
2542
5.29M
    if (index < 0) {
2543
0
        return -1;
2544
0
    }
2545
2546
    /* mark is -1 if group is undefined */
2547
5.29M
    return self->mark[index*2];
2548
5.29M
}
2549
2550
/*[clinic input]
2551
_sre.SRE_Match.end -> Py_ssize_t
2552
2553
    group: object(c_default="NULL") = 0
2554
    /
2555
2556
Return index of the end of the substring matched by group.
2557
[clinic start generated code]*/
2558
2559
static Py_ssize_t
2560
_sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
2561
/*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
2562
11.6M
{
2563
11.6M
    Py_ssize_t index = match_getindex(self, group);
2564
2565
11.6M
    if (index < 0) {
2566
0
        return -1;
2567
0
    }
2568
2569
    /* mark is -1 if group is undefined */
2570
11.6M
    return self->mark[index*2+1];
2571
11.6M
}
2572
2573
/* Build the 2-tuple (i1, i2) of Python ints.  Returns NULL if either int
   cannot be created; the tuple constructor takes over both references. */
LOCAL(PyObject*)
_pair(Py_ssize_t i1, Py_ssize_t i2)
{
    PyObject *first = PyLong_FromSsize_t(i1);
    if (first == NULL) {
        return NULL;
    }

    PyObject *second = PyLong_FromSsize_t(i2);
    if (second == NULL) {
        Py_DECREF(first);
        return NULL;
    }

    return _PyTuple_FromPairSteal(first, second);
}
2588
2589
/*[clinic input]
2590
@permit_long_summary
2591
_sre.SRE_Match.span
2592
2593
    group: object(c_default="NULL") = 0
2594
    /
2595
2596
For match object m, return the 2-tuple (m.start(group), m.end(group)).
2597
[clinic start generated code]*/
2598
2599
static PyObject *
2600
_sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
2601
/*[clinic end generated code: output=f02ae40594d14fe6 input=834cfe444f0f55cf]*/
2602
2.94M
{
2603
2.94M
    Py_ssize_t index = match_getindex(self, group);
2604
2605
2.94M
    if (index < 0) {
2606
0
        return NULL;
2607
0
    }
2608
2609
    /* marks are -1 if group is undefined */
2610
2.94M
    return _pair(self->mark[index*2], self->mark[index*2+1]);
2611
2.94M
}
2612
2613
static PyObject*
2614
match_regs(MatchObject* self)
2615
0
{
2616
0
    PyObject* regs;
2617
0
    PyObject* item;
2618
0
    Py_ssize_t index;
2619
2620
0
    regs = PyTuple_New(self->groups);
2621
0
    if (!regs)
2622
0
        return NULL;
2623
2624
0
    for (index = 0; index < self->groups; index++) {
2625
0
        item = _pair(self->mark[index*2], self->mark[index*2+1]);
2626
0
        if (!item) {
2627
0
            Py_DECREF(regs);
2628
0
            return NULL;
2629
0
        }
2630
0
        PyTuple_SET_ITEM(regs, index, item);
2631
0
    }
2632
2633
0
    self->regs = Py_NewRef(regs);
2634
2635
0
    return regs;
2636
0
}
2637
2638
/*[clinic input]
2639
_sre.SRE_Match.__copy__
2640
2641
[clinic start generated code]*/
2642
2643
static PyObject *
2644
_sre_SRE_Match___copy___impl(MatchObject *self)
2645
/*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
2646
0
{
2647
0
    return Py_NewRef(self);
2648
0
}
2649
2650
/*[clinic input]
2651
_sre.SRE_Match.__deepcopy__
2652
2653
    memo: object
2654
    /
2655
2656
[clinic start generated code]*/
2657
2658
static PyObject *
2659
_sre_SRE_Match___deepcopy___impl(MatchObject *self, PyObject *memo)
2660
/*[clinic end generated code: output=2b657578eb03f4a3 input=779d12a31c2c325e]*/
2661
0
{
2662
0
    return Py_NewRef(self);
2663
0
}
2664
2665
/* Docstring text bound to match_doc.  NOTE(review): presumably installed as
   the Match type's documentation -- confirm at the type definition. */
PyDoc_STRVAR(match_doc,
2666
"The result of re.search(), re.prefixmatch(), and re.fullmatch().\n\
2667
Match objects always have a boolean value of True.");
2668
2669
/* Docstring text bound to match_group_doc.  NOTE(review): presumably
   attached to the group() method entry -- confirm at the method table. */
PyDoc_STRVAR(match_group_doc,
2670
"group([group1, ...]) -> str or tuple.\n\
2671
    Return subgroup(s) of the match by indices or names.\n\
2672
    For 0 returns the entire match.");
2673
2674
/* Getter for lastindex: the stored index as an int, or None when it is
   negative. */
static PyObject *
match_lastindex_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (self->lastindex < 0) {
        Py_RETURN_NONE;
    }
    return PyLong_FromSsize_t(self->lastindex);
}
2682
2683
/* Getter for lastgroup: the entry of the pattern's indexgroup tuple at
   position lastindex, or None when there is no such tuple or lastindex is
   outside of it. */
static PyObject *
match_lastgroup_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (!self->pattern->indexgroup) {
        Py_RETURN_NONE;
    }
    if (self->lastindex < 0 ||
        self->lastindex >= PyTuple_GET_SIZE(self->pattern->indexgroup))
    {
        Py_RETURN_NONE;
    }

    PyObject *name = PyTuple_GET_ITEM(self->pattern->indexgroup,
                                      self->lastindex);
    return Py_NewRef(name);
}
2697
2698
/* Getter for regs: returns the cached tuple when there is one, otherwise
   builds (and caches) it through match_regs(). */
static PyObject *
match_regs_get(PyObject *op, void *Py_UNUSED(ignored))
{
    MatchObject *self = _MatchObject_CAST(op);

    if (self->regs == NULL) {
        return match_regs(self);
    }
    return Py_NewRef(self->regs);
}
2707
2708
static PyObject *
match_repr(PyObject *op)
{
    /* tp_repr: "<TYPE object; span=(START, END), match=REPR>", where REPR
     * is the repr of group 0 truncated by the %.50R conversion. */
    MatchObject *match = _MatchObject_CAST(op);

    PyObject *whole = match_getslice_by_index(match, 0, Py_None);
    if (whole == NULL) {
        return NULL;
    }

    PyObject *text = PyUnicode_FromFormat(
            "<%s object; span=(%zd, %zd), match=%.50R>",
            Py_TYPE(match)->tp_name,
            match->mark[0], match->mark[1], whole);
    Py_DECREF(whole);
    return text;
}
2723
2724
2725
static PyObject*
2726
pattern_new_match(_sremodulestate* module_state,
2727
                  PatternObject* pattern,
2728
                  SRE_STATE* state,
2729
                  Py_ssize_t status)
2730
76.5M
{
2731
    /* create match object (from state object) */
2732
2733
76.5M
    MatchObject* match;
2734
76.5M
    Py_ssize_t i, j;
2735
76.5M
    char* base;
2736
76.5M
    int n;
2737
2738
76.5M
    if (status > 0) {
2739
2740
        /* create match object (with room for extra group marks) */
2741
        /* coverity[ampersand_in_size] */
2742
56.1M
        match = PyObject_GC_NewVar(MatchObject,
2743
56.1M
                                   module_state->Match_Type,
2744
56.1M
                                   2*(pattern->groups+1));
2745
56.1M
        if (!match)
2746
0
            return NULL;
2747
2748
56.1M
        Py_INCREF(pattern);
2749
56.1M
        match->pattern = pattern;
2750
2751
56.1M
        match->string = Py_NewRef(state->string);
2752
2753
56.1M
        match->regs = NULL;
2754
56.1M
        match->groups = pattern->groups+1;
2755
2756
        /* fill in group slices */
2757
2758
56.1M
        base = (char*) state->beginning;
2759
56.1M
        n = state->charsize;
2760
2761
56.1M
        match->mark[0] = ((char*) state->start - base) / n;
2762
56.1M
        match->mark[1] = ((char*) state->ptr - base) / n;
2763
2764
110M
        for (i = j = 0; i < pattern->groups; i++, j+=2)
2765
54.6M
            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
2766
44.3M
                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
2767
44.3M
                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2768
2769
                /* check wrong span */
2770
44.3M
                if (match->mark[j+2] > match->mark[j+3]) {
2771
0
                    PyErr_SetString(PyExc_SystemError,
2772
0
                                    "The span of capturing group is wrong,"
2773
0
                                    " please report a bug for the re module.");
2774
0
                    Py_DECREF(match);
2775
0
                    return NULL;
2776
0
                }
2777
44.3M
            } else
2778
10.2M
                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
2779
2780
56.1M
        match->pos = state->pos;
2781
56.1M
        match->endpos = state->endpos;
2782
2783
56.1M
        match->lastindex = state->lastindex;
2784
2785
56.1M
        PyObject_GC_Track(match);
2786
56.1M
        return (PyObject*) match;
2787
2788
56.1M
    } else if (status == 0) {
2789
2790
        /* no match */
2791
20.3M
        Py_RETURN_NONE;
2792
2793
20.3M
    }
2794
2795
    /* internal error */
2796
0
    pattern_error(status);
2797
0
    return NULL;
2798
76.5M
}
2799
2800
2801
/* -------------------------------------------------------------------- */
2802
/* scanner methods (experimental) */
2803
2804
static int
2805
scanner_traverse(PyObject *op, visitproc visit, void *arg)
2806
1.08k
{
2807
1.08k
    ScannerObject *self = _ScannerObject_CAST(op);
2808
1.08k
    Py_VISIT(Py_TYPE(self));
2809
1.08k
    Py_VISIT(self->pattern);
2810
1.08k
    return 0;
2811
1.08k
}
2812
2813
static int
2814
scanner_clear(PyObject *op)
2815
358k
{
2816
358k
    ScannerObject *self = _ScannerObject_CAST(op);
2817
358k
    Py_CLEAR(self->pattern);
2818
358k
    return 0;
2819
358k
}
2820
2821
static void
2822
scanner_dealloc(PyObject *self)
2823
358k
{
2824
358k
    PyTypeObject *tp = Py_TYPE(self);
2825
358k
    PyObject_GC_UnTrack(self);
2826
358k
    ScannerObject *scanner = _ScannerObject_CAST(self);
2827
358k
    state_fini(&scanner->state);
2828
358k
    (void)scanner_clear(self);
2829
358k
    tp->tp_free(self);
2830
358k
    Py_DECREF(tp);
2831
358k
}
2832
2833
static int
2834
scanner_begin(ScannerObject* self)
2835
3.28M
{
2836
#ifdef Py_GIL_DISABLED
2837
    int was_executing = _Py_atomic_exchange_int(&self->executing, 1);
2838
#else
2839
3.28M
    int was_executing = self->executing;
2840
3.28M
    self->executing = 1;
2841
3.28M
#endif
2842
3.28M
    if (was_executing) {
2843
0
        PyErr_SetString(PyExc_ValueError,
2844
0
                        "regular expression scanner already executing");
2845
0
        return 0;
2846
0
    }
2847
3.28M
    return 1;
2848
3.28M
}
2849
2850
static void
2851
scanner_end(ScannerObject* self)
2852
3.28M
{
2853
3.28M
    assert(FT_ATOMIC_LOAD_INT_RELAXED(self->executing));
2854
3.28M
    FT_ATOMIC_STORE_INT(self->executing, 0);
2855
3.28M
}
2856
2857
/*[clinic input]
2858
_sre.SRE_Scanner.prefixmatch
2859
2860
    cls: defining_class
2861
    /
2862
2863
[clinic start generated code]*/
2864
2865
static PyObject *
2866
_sre_SRE_Scanner_prefixmatch_impl(ScannerObject *self, PyTypeObject *cls)
2867
/*[clinic end generated code: output=02b3b9d2954a2157 input=3049b20466c56a8e]*/
2868
0
{
2869
0
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2870
0
    SRE_STATE* state = &self->state;
2871
0
    PyObject* match;
2872
0
    Py_ssize_t status;
2873
2874
0
    if (!scanner_begin(self)) {
2875
0
        return NULL;
2876
0
    }
2877
0
    if (state->start == NULL) {
2878
0
        scanner_end(self);
2879
0
        Py_RETURN_NONE;
2880
0
    }
2881
2882
0
    state_reset(state);
2883
2884
0
    state->ptr = state->start;
2885
2886
0
    status = sre_match(state, PatternObject_GetCode(self->pattern));
2887
0
    if (PyErr_Occurred()) {
2888
0
        scanner_end(self);
2889
0
        return NULL;
2890
0
    }
2891
2892
0
    match = pattern_new_match(module_state, self->pattern,
2893
0
                              state, status);
2894
2895
0
    if (status == 0)
2896
0
        state->start = NULL;
2897
0
    else {
2898
0
        state->must_advance = (state->ptr == state->start);
2899
0
        state->start = state->ptr;
2900
0
    }
2901
2902
0
    scanner_end(self);
2903
0
    return match;
2904
0
}
2905
2906
2907
/*[clinic input]
2908
_sre.SRE_Scanner.search
2909
2910
    cls: defining_class
2911
    /
2912
2913
[clinic start generated code]*/
2914
2915
static PyObject *
2916
_sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
2917
/*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
2918
3.28M
{
2919
3.28M
    _sremodulestate *module_state = get_sre_module_state_by_class(cls);
2920
3.28M
    SRE_STATE* state = &self->state;
2921
3.28M
    PyObject* match;
2922
3.28M
    Py_ssize_t status;
2923
2924
3.28M
    if (!scanner_begin(self)) {
2925
0
        return NULL;
2926
0
    }
2927
3.28M
    if (state->start == NULL) {
2928
0
        scanner_end(self);
2929
0
        Py_RETURN_NONE;
2930
0
    }
2931
2932
3.28M
    state_reset(state);
2933
2934
3.28M
    state->ptr = state->start;
2935
2936
3.28M
    status = sre_search(state, PatternObject_GetCode(self->pattern));
2937
3.28M
    if (PyErr_Occurred()) {
2938
0
        scanner_end(self);
2939
0
        return NULL;
2940
0
    }
2941
2942
3.28M
    match = pattern_new_match(module_state, self->pattern,
2943
3.28M
                              state, status);
2944
2945
3.28M
    if (status == 0)
2946
358k
        state->start = NULL;
2947
2.92M
    else {
2948
2.92M
        state->must_advance = (state->ptr == state->start);
2949
2.92M
        state->start = state->ptr;
2950
2.92M
    }
2951
2952
3.28M
    scanner_end(self);
2953
3.28M
    return match;
2954
3.28M
}
2955
2956
static PyObject *
2957
pattern_scanner(_sremodulestate *module_state,
2958
                PatternObject *self,
2959
                PyObject *string,
2960
                Py_ssize_t pos,
2961
                Py_ssize_t endpos)
2962
358k
{
2963
358k
    ScannerObject* scanner;
2964
2965
    /* create scanner object */
2966
358k
    scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
2967
358k
    if (!scanner)
2968
0
        return NULL;
2969
358k
    scanner->pattern = NULL;
2970
358k
    scanner->executing = 0;
2971
2972
    /* create search state object */
2973
358k
    if (!state_init(&scanner->state, self, string, pos, endpos)) {
2974
0
        Py_DECREF(scanner);
2975
0
        return NULL;
2976
0
    }
2977
2978
358k
    Py_INCREF(self);
2979
358k
    scanner->pattern = self;
2980
2981
358k
    PyObject_GC_Track(scanner);
2982
358k
    return (PyObject*) scanner;
2983
358k
}
2984
2985
/* -------------------------------------------------------------------- */
2986
/* template methods */
2987
2988
static int
2989
template_traverse(PyObject *op, visitproc visit, void *arg)
2990
0
{
2991
0
    TemplateObject *self = _TemplateObject_CAST(op);
2992
0
    Py_VISIT(Py_TYPE(self));
2993
0
    Py_VISIT(self->literal);
2994
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
2995
0
        Py_VISIT(self->items[i].literal);
2996
0
    }
2997
0
    return 0;
2998
0
}
2999
3000
static int
3001
template_clear(PyObject *op)
3002
0
{
3003
0
    TemplateObject *self = _TemplateObject_CAST(op);
3004
0
    Py_CLEAR(self->literal);
3005
0
    for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
3006
0
        Py_CLEAR(self->items[i].literal);
3007
0
    }
3008
0
    return 0;
3009
0
}
3010
3011
static void
3012
template_dealloc(PyObject *self)
3013
0
{
3014
0
    PyTypeObject *tp = Py_TYPE(self);
3015
0
    PyObject_GC_UnTrack(self);
3016
0
    (void)template_clear(self);
3017
0
    tp->tp_free(self);
3018
0
    Py_DECREF(tp);
3019
0
}
3020
3021
static PyObject *
3022
expand_template(TemplateObject *self, MatchObject *match)
3023
0
{
3024
0
    if (Py_SIZE(self) == 0) {
3025
0
        return Py_NewRef(self->literal);
3026
0
    }
3027
3028
0
    PyObject *result = NULL;
3029
0
    Py_ssize_t count = 0;  // the number of non-empty chunks
3030
    /* For small number of strings use a buffer allocated on the stack,
3031
     * otherwise use a list object. */
3032
0
    PyObject *buffer[10];
3033
0
    PyObject **out = buffer;
3034
0
    PyObject *list = NULL;
3035
0
    if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
3036
0
        !PyUnicode_Check(self->literal))
3037
0
    {
3038
0
        list = PyList_New(self->chunks);
3039
0
        if (!list) {
3040
0
            return NULL;
3041
0
        }
3042
0
        out = &PyList_GET_ITEM(list, 0);
3043
0
    }
3044
3045
0
    out[count++] = Py_NewRef(self->literal);
3046
0
    for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
3047
0
        Py_ssize_t index = self->items[i].index;
3048
0
        if (index >= match->groups) {
3049
0
            PyErr_SetString(PyExc_IndexError, "no such group");
3050
0
            goto cleanup;
3051
0
        }
3052
0
        PyObject *item = match_getslice_by_index(match, index, Py_None);
3053
0
        if (item == NULL) {
3054
0
            goto cleanup;
3055
0
        }
3056
0
        if (item != Py_None) {
3057
0
            out[count++] = Py_NewRef(item);
3058
0
        }
3059
0
        Py_DECREF(item);
3060
3061
0
        PyObject *literal = self->items[i].literal;
3062
0
        if (literal != NULL) {
3063
0
            out[count++] = Py_NewRef(literal);
3064
0
        }
3065
0
    }
3066
3067
0
    if (PyUnicode_Check(self->literal)) {
3068
0
        result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
3069
0
    }
3070
0
    else {
3071
0
        Py_SET_SIZE(list, count);
3072
0
        result = PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
3073
0
    }
3074
3075
0
cleanup:
3076
0
    if (list) {
3077
0
        Py_DECREF(list);
3078
0
    }
3079
0
    else {
3080
0
        for (Py_ssize_t i = 0; i < count; i++) {
3081
0
            Py_DECREF(out[i]);
3082
0
        }
3083
0
    }
3084
0
    return result;
3085
0
}
3086
3087
3088
static Py_hash_t
3089
pattern_hash(PyObject *op)
3090
0
{
3091
0
    PatternObject *self = _PatternObject_CAST(op);
3092
3093
0
    Py_hash_t hash, hash2;
3094
3095
0
    hash = PyObject_Hash(self->pattern);
3096
0
    if (hash == -1) {
3097
0
        return -1;
3098
0
    }
3099
3100
0
    hash2 = Py_HashBuffer(self->code, sizeof(self->code[0]) * self->codesize);
3101
0
    hash ^= hash2;
3102
3103
0
    hash ^= self->flags;
3104
0
    hash ^= self->isbytes;
3105
0
    hash ^= self->codesize;
3106
3107
0
    if (hash == -1) {
3108
0
        hash = -2;
3109
0
    }
3110
0
    return hash;
3111
0
}
3112
3113
static PyObject*
pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
{
    /* tp_richcompare: only == and != are supported, and only against an
     * object whose type is exactly Pattern_Type; anything else yields
     * NotImplemented. */
    _sremodulestate *module_state =
        get_sre_module_state_by_class(Py_TYPE(lefto));

    if ((op != Py_EQ && op != Py_NE) ||
        !Py_IS_TYPE(righto, module_state->Pattern_Type))
    {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (lefto == righto) {
        /* a pattern is equal to itself */
        return PyBool_FromLong(op == Py_EQ);
    }

    PatternObject *left = (PatternObject *)lefto;
    PatternObject *right = (PatternObject *)righto;
    int equal = 0;

    /* Compare the code and the pattern because the same pattern can
       produce different codes depending on the locale used to compile the
       pattern when the re.LOCALE flag is used. Don't compare groups,
       indexgroup nor groupindex: they are derived from the pattern. */
    if (left->flags == right->flags
        && left->isbytes == right->isbytes
        && left->codesize == right->codesize
        && memcmp(left->code, right->code,
                  sizeof(left->code[0]) * left->codesize) == 0)
    {
        equal = PyObject_RichCompareBool(left->pattern, right->pattern,
                                         Py_EQ);
        if (equal < 0) {
            return NULL;
        }
    }

    return PyBool_FromLong(op == Py_NE ? !equal : equal);
}
3161
3162
#include "clinic/sre.c.h"
3163
3164
static PyMethodDef pattern_methods[] = {
3165
    _SRE_SRE_PATTERN_PREFIXMATCH_METHODDEF
3166
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3167
     * to avoid duplicating the argument parsing boilerplate code. */
3168
    {"match", _PyCFunction_CAST(_sre_SRE_Pattern_prefixmatch),
3169
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3170
     _sre_SRE_Pattern_prefixmatch__doc__},
3171
    _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
3172
    _SRE_SRE_PATTERN_SEARCH_METHODDEF
3173
    _SRE_SRE_PATTERN_SUB_METHODDEF
3174
    _SRE_SRE_PATTERN_SUBN_METHODDEF
3175
    _SRE_SRE_PATTERN_FINDALL_METHODDEF
3176
    _SRE_SRE_PATTERN_SPLIT_METHODDEF
3177
    _SRE_SRE_PATTERN_FINDITER_METHODDEF
3178
    _SRE_SRE_PATTERN_SCANNER_METHODDEF
3179
    _SRE_SRE_PATTERN___COPY___METHODDEF
3180
    _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
3181
    _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF
3182
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3183
     PyDoc_STR("See PEP 585")},
3184
    {NULL, NULL}
3185
};
3186
3187
static PyGetSetDef pattern_getset[] = {
3188
    {"groupindex", pattern_groupindex, NULL,
3189
      "A dictionary mapping group names to group numbers."},
3190
    {NULL}  /* Sentinel */
3191
};
3192
3193
#define PAT_OFF(x) offsetof(PatternObject, x)
3194
static PyMemberDef pattern_members[] = {
3195
    {"pattern",    _Py_T_OBJECT,    PAT_OFF(pattern),       Py_READONLY,
3196
     "The pattern string from which the RE object was compiled."},
3197
    {"flags",      Py_T_INT,       PAT_OFF(flags),         Py_READONLY,
3198
     "The regex matching flags."},
3199
    {"groups",     Py_T_PYSSIZET,  PAT_OFF(groups),        Py_READONLY,
3200
     "The number of capturing groups in the pattern."},
3201
    {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(PatternObject, weakreflist), Py_READONLY},
3202
    {NULL}  /* Sentinel */
3203
};
3204
3205
static PyType_Slot pattern_slots[] = {
3206
    {Py_tp_dealloc, pattern_dealloc},
3207
    {Py_tp_repr, pattern_repr},
3208
    {Py_tp_hash, pattern_hash},
3209
    {Py_tp_doc, (void *)pattern_doc},
3210
    {Py_tp_richcompare, pattern_richcompare},
3211
    {Py_tp_methods, pattern_methods},
3212
    {Py_tp_members, pattern_members},
3213
    {Py_tp_getset, pattern_getset},
3214
    {Py_tp_traverse, pattern_traverse},
3215
    {Py_tp_clear, pattern_clear},
3216
    {0, NULL},
3217
};
3218
3219
static PyType_Spec pattern_spec = {
3220
    .name = "re.Pattern",
3221
    .basicsize = sizeof(PatternObject),
3222
    .itemsize = sizeof(SRE_CODE),
3223
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3224
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3225
    .slots = pattern_slots,
3226
};
3227
3228
static PyMethodDef match_methods[] = {
3229
    {"group", match_group, METH_VARARGS, match_group_doc},
3230
    _SRE_SRE_MATCH_START_METHODDEF
3231
    _SRE_SRE_MATCH_END_METHODDEF
3232
    _SRE_SRE_MATCH_SPAN_METHODDEF
3233
    _SRE_SRE_MATCH_GROUPS_METHODDEF
3234
    _SRE_SRE_MATCH_GROUPDICT_METHODDEF
3235
    _SRE_SRE_MATCH_EXPAND_METHODDEF
3236
    _SRE_SRE_MATCH___COPY___METHODDEF
3237
    _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
3238
    {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
3239
     PyDoc_STR("See PEP 585")},
3240
    {NULL, NULL}
3241
};
3242
3243
static PyGetSetDef match_getset[] = {
3244
    {"lastindex", match_lastindex_get, NULL,
3245
     "The integer index of the last matched capturing group."},
3246
    {"lastgroup", match_lastgroup_get, NULL,
3247
     "The name of the last matched capturing group."},
3248
    {"regs", match_regs_get, NULL, NULL},
3249
    {NULL}
3250
};
3251
3252
#define MATCH_OFF(x) offsetof(MatchObject, x)
3253
static PyMemberDef match_members[] = {
3254
    {"string",  _Py_T_OBJECT,   MATCH_OFF(string),  Py_READONLY,
3255
     "The string passed to match() or search()."},
3256
    {"re",      _Py_T_OBJECT,   MATCH_OFF(pattern), Py_READONLY,
3257
     "The regular expression object."},
3258
    {"pos",     Py_T_PYSSIZET, MATCH_OFF(pos),     Py_READONLY,
3259
     "The index into the string at which the RE engine started looking for a match."},
3260
    {"endpos",  Py_T_PYSSIZET, MATCH_OFF(endpos),  Py_READONLY,
3261
     "The index into the string beyond which the RE engine will not go."},
3262
    {NULL}
3263
};
3264
3265
/* FIXME: implement setattr("string", None) as a special case (to
3266
   detach the associated string, if any) */
3267
static PyType_Slot match_slots[] = {
3268
    {Py_tp_dealloc, match_dealloc},
3269
    {Py_tp_repr, match_repr},
3270
    {Py_tp_doc, (void *)match_doc},
3271
    {Py_tp_methods, match_methods},
3272
    {Py_tp_members, match_members},
3273
    {Py_tp_getset, match_getset},
3274
    {Py_tp_traverse, match_traverse},
3275
    {Py_tp_clear, match_clear},
3276
3277
    /* As mapping.
3278
     *
3279
     * Match objects do not support length or assignment, but do support
3280
     * __getitem__.
3281
     */
3282
    {Py_mp_subscript, match_getitem},
3283
3284
    {0, NULL},
3285
};
3286
3287
static PyType_Spec match_spec = {
3288
    .name = "re.Match",
3289
    .basicsize = sizeof(MatchObject),
3290
    .itemsize = sizeof(Py_ssize_t),
3291
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3292
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3293
    .slots = match_slots,
3294
};
3295
3296
static PyMethodDef scanner_methods[] = {
3297
    _SRE_SRE_SCANNER_PREFIXMATCH_METHODDEF
3298
    /* "match" reuses the prefixmatch Clinic-generated parser and impl
3299
     * to avoid duplicating the argument parsing boilerplate code. */
3300
    {"match", _PyCFunction_CAST(_sre_SRE_Scanner_prefixmatch),
3301
     METH_METHOD|METH_FASTCALL|METH_KEYWORDS,
3302
     _sre_SRE_Scanner_prefixmatch__doc__},
3303
    _SRE_SRE_SCANNER_SEARCH_METHODDEF
3304
    {NULL, NULL}
3305
};
3306
3307
#define SCAN_OFF(x) offsetof(ScannerObject, x)
3308
static PyMemberDef scanner_members[] = {
3309
    {"pattern", _Py_T_OBJECT, SCAN_OFF(pattern), Py_READONLY},
3310
    {NULL}  /* Sentinel */
3311
};
3312
3313
static PyType_Slot scanner_slots[] = {
3314
    {Py_tp_dealloc, scanner_dealloc},
3315
    {Py_tp_methods, scanner_methods},
3316
    {Py_tp_members, scanner_members},
3317
    {Py_tp_traverse, scanner_traverse},
3318
    {Py_tp_clear, scanner_clear},
3319
    {0, NULL},
3320
};
3321
3322
static PyType_Spec scanner_spec = {
3323
    .name = "_sre.SRE_Scanner",
3324
    .basicsize = sizeof(ScannerObject),
3325
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3326
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3327
    .slots = scanner_slots,
3328
};
3329
3330
static PyType_Slot template_slots[] = {
3331
    {Py_tp_dealloc, template_dealloc},
3332
    {Py_tp_traverse, template_traverse},
3333
    {Py_tp_clear, template_clear},
3334
    {0, NULL},
3335
};
3336
3337
static PyType_Spec template_spec = {
3338
    .name = "_sre.SRE_Template",
3339
    .basicsize = sizeof(TemplateObject),
3340
    .itemsize = sizeof(((TemplateObject *)0)->items[0]),
3341
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
3342
              Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
3343
    .slots = template_slots,
3344
};
3345
3346
static PyMethodDef _functions[] = {
3347
    _SRE_COMPILE_METHODDEF
3348
    _SRE_TEMPLATE_METHODDEF
3349
    _SRE_GETCODESIZE_METHODDEF
3350
    _SRE_ASCII_ISCASED_METHODDEF
3351
    _SRE_UNICODE_ISCASED_METHODDEF
3352
    _SRE_ASCII_TOLOWER_METHODDEF
3353
    _SRE_UNICODE_TOLOWER_METHODDEF
3354
    {NULL, NULL}
3355
};
3356
3357
static int
3358
sre_traverse(PyObject *module, visitproc visit, void *arg)
3359
1.35k
{
3360
1.35k
    _sremodulestate *state = get_sre_module_state(module);
3361
3362
1.35k
    Py_VISIT(state->Pattern_Type);
3363
1.35k
    Py_VISIT(state->Match_Type);
3364
1.35k
    Py_VISIT(state->Scanner_Type);
3365
1.35k
    Py_VISIT(state->Template_Type);
3366
1.35k
    Py_VISIT(state->compile_template);
3367
3368
1.35k
    return 0;
3369
1.35k
}
3370
3371
static int
3372
sre_clear(PyObject *module)
3373
0
{
3374
0
    _sremodulestate *state = get_sre_module_state(module);
3375
3376
0
    Py_CLEAR(state->Pattern_Type);
3377
0
    Py_CLEAR(state->Match_Type);
3378
0
    Py_CLEAR(state->Scanner_Type);
3379
0
    Py_CLEAR(state->Template_Type);
3380
0
    Py_CLEAR(state->compile_template);
3381
3382
0
    return 0;
3383
0
}
3384
3385
static void
3386
sre_free(void *module)
3387
0
{
3388
0
    sre_clear((PyObject *)module);
3389
0
}
3390
3391
112
/* Create a heap type from `spec` for module `m` and store it in `type`.
 * On failure control jumps to a label named `error`, which the
 * expanding function has to provide. */
#define CREATE_TYPE(m, type, spec)                                        \
    do {                                                                  \
        (type) = (PyTypeObject *)PyType_FromModuleAndSpec((m), (spec),    \
                                                          NULL);          \
        if ((type) == NULL) {                                             \
            goto error;                                                   \
        }                                                                 \
    } while (0)
3398
3399
/* Add `value`, converted with PyLong_FromUnsignedLong(), to `module`
 * under `name`.  On failure control jumps to a label named `error`,
 * which the expanding function has to provide. */
#define ADD_ULONG_CONSTANT(module, name, value)                           \
    do {                                                                  \
        if (PyModule_Add((module), (name),                                \
                         PyLong_FromUnsignedLong(value)) < 0) {           \
            goto error;                                                   \
        }                                                                 \
    } while (0)
3405
3406
3407
#ifdef Py_DEBUG
/* Debug-build check that entry 1 of `methods` ("match") is an exact
 * alias of entry 0 ("prefixmatch"): same C function, flags and
 * docstring pointer. */
static void
_assert_match_aliases_prefixmatch(PyMethodDef *methods)
{
    const PyMethodDef *canonical = &methods[0];
    const PyMethodDef *alias = &methods[1];

    assert(strcmp(canonical->ml_name, "prefixmatch") == 0);
    assert(strcmp(alias->ml_name, "match") == 0);
    assert(alias->ml_meth == canonical->ml_meth);
    assert(alias->ml_flags == canonical->ml_flags);
    assert(alias->ml_doc == canonical->ml_doc);
}
#endif
3420
3421
static int
3422
sre_exec(PyObject *m)
3423
28
{
3424
28
    _sremodulestate *state;
3425
3426
#ifdef Py_DEBUG
3427
    _assert_match_aliases_prefixmatch(pattern_methods);
3428
    _assert_match_aliases_prefixmatch(scanner_methods);
3429
#endif
3430
3431
    /* Create heap types */
3432
28
    state = get_sre_module_state(m);
3433
28
    CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
3434
28
    CREATE_TYPE(m, state->Match_Type, &match_spec);
3435
28
    CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
3436
28
    CREATE_TYPE(m, state->Template_Type, &template_spec);
3437
3438
28
    if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
3439
0
        goto error;
3440
0
    }
3441
3442
28
    if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
3443
0
        goto error;
3444
0
    }
3445
3446
28
    ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
3447
28
    ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
3448
3449
28
    if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
3450
0
        goto error;
3451
0
    }
3452
3453
28
    return 0;
3454
3455
0
error:
3456
0
    return -1;
3457
28
}
3458
3459
static PyModuleDef_Slot sre_slots[] = {
3460
    _Py_ABI_SLOT,
3461
    {Py_mod_exec, sre_exec},
3462
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
3463
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
3464
    {0, NULL},
3465
};
3466
3467
static struct PyModuleDef sremodule = {
3468
    .m_base = PyModuleDef_HEAD_INIT,
3469
    .m_name = "_sre",
3470
    .m_size = sizeof(_sremodulestate),
3471
    .m_methods = _functions,
3472
    .m_slots = sre_slots,
3473
    .m_traverse = sre_traverse,
3474
    .m_free = sre_free,
3475
    .m_clear = sre_clear,
3476
};
3477
3478
PyMODINIT_FUNC
3479
PyInit__sre(void)
3480
28
{
3481
28
    return PyModuleDef_Init(&sremodule);
3482
28
}
3483
3484
/* vim:ts=4:sw=4:et
3485
*/