Coverage Report

Created: 2026-05-16 06:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gstreamer/subprojects/glib-2.86.3/glib/gregex.c
Line
Count
Source
1
/* GRegex -- regular expression API wrapper around PCRE.
2
 *
3
 * Copyright (C) 1999, 2000 Scott Wimer
4
 * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5
 * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
6
 * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com>
7
 *
8
 * SPDX-License-Identifier: LGPL-2.1-or-later
9
 *
10
 * This library is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * This library is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public License
21
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
22
 */
23
24
#include "config.h"
25
26
#include <stdint.h>
27
#include <string.h>
28
29
#define PCRE2_CODE_UNIT_WIDTH 8
30
#include <pcre2.h>
31
32
#include "gtypes.h"
33
#include "gregex.h"
34
#include "glibintl.h"
35
#include "glist.h"
36
#include "gmessages.h"
37
#include "gstrfuncs.h"
38
#include "gatomic.h"
39
#include "gtestutils.h"
40
#include "gthread.h"
41
42
/**
43
 * GRegex:
44
 *
45
 * A `GRegex` is a compiled form of a regular expression.
46
 * 
47
 * After instantiating a `GRegex`, you can use its methods to find matches
48
 * in a string, replace matches within a string, or split the string at matches.
49
 *
50
 * `GRegex` implements regular expression pattern matching using syntax and 
51
 * semantics (such as character classes, quantifiers, and capture groups) 
52
 * similar to Perl regular expressions. See the 
53
 * [PCRE documentation](man:pcre2pattern(3)) for details.
54
 *
55
 * A typical scenario for regex pattern matching is to check if a string 
56
 * matches a pattern. The following statements implement this scenario.
57
 * 
58
 * ``` { .c }
59
 * const char *regex_pattern = ".*GLib.*";
60
 * const char *string_to_search = "You will love the GLib implementation of regex";
61
 * g_autoptr(GMatchInfo) match_info = NULL;
62
 * g_autoptr(GRegex) regex = NULL;
63
 *
64
 * regex = g_regex_new (regex_pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
65
 * g_assert (regex != NULL);
66
 * 
67
 * if (g_regex_match (regex, string_to_search, G_REGEX_MATCH_DEFAULT, &match_info))
68
 *   {
69
 *     int start_pos, end_pos;
70
 *     g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos);
71
 *     g_print ("Match successful! Overall pattern matches bytes %d to %d\n", start_pos, end_pos);
72
 *   }
73
 * else
74
 *   {
75
 *     g_print ("No match!\n");
76
 *   }
77
 * ```
78
 * 
79
 * The constructor for `GRegex` includes two sets of bitmapped flags:
80
81
 * * [flags@GLib.RegexCompileFlags]—These flags 
82
 * control how GLib compiles the regex. There are options for case 
83
 * sensitivity, multiline, ignoring whitespace, etc.
84
 * * [flags@GLib.RegexMatchFlags]—These flags control 
85
 * `GRegex`’s matching behavior, such as anchoring and customizing definitions 
86
 * for newline characters.
87
 * 
88
 * Some regex patterns include backslash assertions, such as `\d` (digit) or 
89
 * `\D` (non-digit). The regex pattern must escape those backslashes. For 
90
 * example, the pattern `"\\d\\D"` matches a digit followed by a non-digit.
91
 *
92
 * GLib’s implementation of pattern matching includes a `start_position` 
93
 * argument for some of the match, replace, and split methods. Specifying 
94
 * a start position provides flexibility when you want to ignore the first 
95
 * _n_ characters of a string, but want to incorporate backslash assertions 
96
 * at character _n_ - 1. For example, a database field contains inconsistent
97
 * spelling for a job title: `healthcare provider` and `health-care provider`.
98
 * The database manager wants to make the spelling consistent by adding a 
99
 * hyphen when it is missing. The following regex pattern tests for the string 
100
 * `care` preceded by a non-word boundary character (instead of a hyphen) 
101
 * and followed by a space.
102
 *
103
 * ``` { .c }
104
 * const char *regex_pattern = "\\Bcare\\s";
105
 * ```
106
 *
107
 * An efficient way to match with this pattern is to start examining at 
108
 * `start_position` 6 in the string `healthcare` or `health-care`.
109
110
 * ``` { .c }
111
 * const char *regex_pattern = "\\Bcare\\s";
112
 * const char *string_to_search = "healthcare provider";
113
 * g_autoptr(GMatchInfo) match_info = NULL;
114
 * g_autoptr(GRegex) regex = NULL;
115
 *
116
 * regex = g_regex_new (
117
 *   regex_pattern,
118
 *   G_REGEX_DEFAULT,
119
 *   G_REGEX_MATCH_DEFAULT,
120
 *   NULL);
121
 * g_assert (regex != NULL);
122
 * 
123
 * g_regex_match_full (
124
 *   regex, 
125
 *   string_to_search, 
126
 *   -1,
127
 *   6, // position of 'c' in the test string.
128
 *   G_REGEX_MATCH_DEFAULT, 
129
 *   &match_info,
130
 *   NULL);
131
 * ```
132
 * 
133
 * The method [method@GLib.Regex.match_full] (and other methods implementing 
134
 * `start_pos`) allows for lookback before the start position to determine if 
135
 * the previous character satisfies an assertion.
136
 *
137
 * Unless you set the [flags@GLib.RegexCompileFlags.RAW] as one of 
138
 * the `GRegexCompileFlags`, all the strings passed to `GRegex` methods must 
139
 * be encoded in UTF-8. The lengths and the positions inside the strings are 
140
 * in bytes and not in characters, so, for instance, `\xc3\xa0` (i.e., `à`) 
141
 * is two bytes long but it is treated as a single character. If you set 
142
 * `G_REGEX_RAW`, the strings can be non-valid UTF-8 strings and a byte is 
143
 * treated as a character, so `\xc3\xa0` is two bytes and two characters long.
144
 *
145
 * Regarding line endings, `\n` matches a `\n` character, and `\r` matches 
146
 * a `\r` character. More generally, `\R` matches all typical line endings: 
147
 * CR + LF (`\r\n`), LF (linefeed, U+000A, `\n`), VT (vertical tab, U+000B, 
148
 * `\v`), FF (formfeed, U+000C, `\f`), CR (carriage return, U+000D, `\r`), 
149
 * NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph 
150
 * separator, U+2029).
151
 * 
152
 * The behaviour of the dot, circumflex, and dollar metacharacters is 
153
 * affected by newline characters. By default, `GRegex` matches any newline 
154
 * character matched by `\R`. You can limit the matched newline characters by 
155
 * specifying the [flags@GLib.RegexCompileFlags.NEWLINE_CR], 
156
 * [flags@GLib.RegexCompileFlags.NEWLINE_LF], and 
157
 * [flags@GLib.RegexCompileFlags.NEWLINE_CRLF] compile options, and 
158
 * with [flags@GLib.RegexMatchFlags.NEWLINE_ANY], 
159
 * [flags@GLib.RegexMatchFlags.NEWLINE_CR], 
160
 * [flags@GLib.RegexMatchFlags.NEWLINE_LF] and 
161
 * [flags@GLib.RegexMatchFlags.NEWLINE_CRLF] match options. 
162
 * These settings are also relevant when compiling a pattern if 
163
 * [flags@GLib.RegexCompileFlags.EXTENDED] is set and an unescaped 
164
 * `#` outside a character class is encountered. This indicates a comment 
165
 * that lasts until after the next newline.
166
 * 
167
 * Because `GRegex` does not modify its internal state between creation and 
168
 * destruction, you can create and use the same `GRegex` instance from 
169
 * different threads. In contrast, [struct@GLib.MatchInfo] is not thread safe.
170
 * 
171
 * The regular expression low-level functionalities are obtained through
172
 * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel.
173
 *
174
 * Since: 2.14
175
 */
176
177
131k
#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED       | \
178
131k
                                   PCRE2_NO_UTF_CHECK   | \
179
131k
                                   PCRE2_ENDANCHORED)
180
181
/* Mask of all the possible values for GRegexCompileFlags. */
182
0
#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT          | \
183
0
                              G_REGEX_CASELESS         | \
184
0
                              G_REGEX_MULTILINE        | \
185
0
                              G_REGEX_DOTALL           | \
186
0
                              G_REGEX_EXTENDED         | \
187
0
                              G_REGEX_ANCHORED         | \
188
0
                              G_REGEX_DOLLAR_ENDONLY   | \
189
0
                              G_REGEX_UNGREEDY         | \
190
0
                              G_REGEX_RAW              | \
191
0
                              G_REGEX_NO_AUTO_CAPTURE  | \
192
0
                              G_REGEX_OPTIMIZE         | \
193
0
                              G_REGEX_FIRSTLINE        | \
194
0
                              G_REGEX_DUPNAMES         | \
195
0
                              G_REGEX_NEWLINE_CR       | \
196
0
                              G_REGEX_NEWLINE_LF       | \
197
0
                              G_REGEX_NEWLINE_CRLF     | \
198
0
                              G_REGEX_NEWLINE_ANYCRLF  | \
199
0
                              G_REGEX_BSR_ANYCRLF)
200
201
32
#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS    | \
202
32
                                    PCRE2_ALT_BSUX             | \
203
32
                                    PCRE2_AUTO_CALLOUT         | \
204
32
                                    PCRE2_CASELESS             | \
205
32
                                    PCRE2_DOLLAR_ENDONLY       | \
206
32
                                    PCRE2_DOTALL               | \
207
32
                                    PCRE2_DUPNAMES             | \
208
32
                                    PCRE2_EXTENDED             | \
209
32
                                    PCRE2_FIRSTLINE            | \
210
32
                                    PCRE2_MATCH_UNSET_BACKREF  | \
211
32
                                    PCRE2_MULTILINE            | \
212
32
                                    PCRE2_NEVER_UCP            | \
213
32
                                    PCRE2_NEVER_UTF            | \
214
32
                                    PCRE2_NO_AUTO_CAPTURE      | \
215
32
                                    PCRE2_NO_AUTO_POSSESS      | \
216
32
                                    PCRE2_NO_DOTSTAR_ANCHOR    | \
217
32
                                    PCRE2_NO_START_OPTIMIZE    | \
218
32
                                    PCRE2_UCP                  | \
219
32
                                    PCRE2_UNGREEDY             | \
220
32
                                    PCRE2_UTF                  | \
221
32
                                    PCRE2_NEVER_BACKSLASH_C    | \
222
32
                                    PCRE2_ALT_CIRCUMFLEX       | \
223
32
                                    PCRE2_ALT_VERBNAMES        | \
224
32
                                    PCRE2_USE_OFFSET_LIMIT     | \
225
32
                                    PCRE2_EXTENDED_MORE        | \
226
32
                                    PCRE2_LITERAL              | \
227
32
                                    PCRE2_MATCH_INVALID_UTF    | \
228
32
                                    G_REGEX_PCRE_GENERIC_MASK)
229
230
16
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
231
232
/* Mask of all the possible values for GRegexMatchFlags. */
233
0
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT          | \
234
0
                            G_REGEX_MATCH_ANCHORED         | \
235
0
                            G_REGEX_MATCH_NOTBOL           | \
236
0
                            G_REGEX_MATCH_NOTEOL           | \
237
0
                            G_REGEX_MATCH_NOTEMPTY         | \
238
0
                            G_REGEX_MATCH_PARTIAL          | \
239
0
                            G_REGEX_MATCH_NEWLINE_CR       | \
240
0
                            G_REGEX_MATCH_NEWLINE_LF       | \
241
0
                            G_REGEX_MATCH_NEWLINE_CRLF     | \
242
0
                            G_REGEX_MATCH_NEWLINE_ANY      | \
243
0
                            G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
244
0
                            G_REGEX_MATCH_BSR_ANYCRLF      | \
245
0
                            G_REGEX_MATCH_BSR_ANY          | \
246
0
                            G_REGEX_MATCH_PARTIAL_SOFT     | \
247
0
                            G_REGEX_MATCH_PARTIAL_HARD     | \
248
0
                            G_REGEX_MATCH_NOTEMPTY_ATSTART)
249
250
131k
#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL                      |\
251
131k
                                  PCRE2_NOTEOL                      |\
252
131k
                                  PCRE2_NOTEMPTY                    |\
253
131k
                                  PCRE2_NOTEMPTY_ATSTART            |\
254
131k
                                  PCRE2_PARTIAL_SOFT                |\
255
131k
                                  PCRE2_PARTIAL_HARD                |\
256
131k
                                  PCRE2_NO_JIT                      |\
257
131k
                                  PCRE2_COPY_MATCHED_SUBJECT        |\
258
131k
                                  G_REGEX_PCRE_GENERIC_MASK)
259
260
/* TODO: Support PCRE2_NEWLINE_NUL */
261
#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR |     \
262
                              PCRE2_NEWLINE_LF |     \
263
                              PCRE2_NEWLINE_CRLF |   \
264
                              PCRE2_NEWLINE_ANYCRLF)
265
266
/* Some match options are not supported when using JIT as stated in the
267
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
268
 *   https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
269
 */
270
99.6k
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \
271
99.6k
                                               PCRE2_ENDANCHORED)
272
273
16
#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR      | \
274
16
                                      G_REGEX_NEWLINE_LF      | \
275
16
                                      G_REGEX_NEWLINE_CRLF    | \
276
16
                                      G_REGEX_NEWLINE_ANYCRLF)
277
278
16
#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR      | \
279
16
                                    G_REGEX_MATCH_NEWLINE_LF      | \
280
16
                                    G_REGEX_MATCH_NEWLINE_CRLF    | \
281
16
                                    G_REGEX_MATCH_NEWLINE_ANY    | \
282
16
                                    G_REGEX_MATCH_NEWLINE_ANYCRLF)
283
284
/* if the string is in UTF-8 use g_utf8_ functions, else use
285
 * just +/- 1. */
286
0
#define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
287
0
                                ((s) + 1) : \
288
0
                                g_utf8_next_char (s))
289
0
#define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
290
0
                                ((s) - 1) : \
291
0
                                g_utf8_prev_char (s))
292
293
/* State of one matching operation: the regex and subject string being
 * matched, the options used, and the offset/workspace buffers that hold
 * the results. */
struct _GMatchInfo
294
{
295
  gint ref_count;               /* the ref count (atomic) */
296
  GRegex *regex;                /* the regex */
297
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
298
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
299
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
300
  gint pos;                     /* position in the string where last match left off */
301
  uint32_t n_offsets;           /* number of offsets */
302
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
303
  gint *workspace;              /* workspace for pcre2_dfa_match() */
304
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
305
  const gchar *string;          /* string passed to the match function */
306
  gssize string_len;            /* length of string, in bytes */
307
  pcre2_match_context *match_context; /* PCRE2 match context; NOTE(review): its use is outside this chunk */
308
  pcre2_match_data *match_data; /* PCRE2 match data block; NOTE(review): its use is outside this chunk */
309
  pcre2_jit_stack *jit_stack;   /* PCRE2 JIT stack; NOTE(review): its use is outside this chunk */
310
};
311
312
/* Records whether JIT enablement has been attempted for a compiled regex
 * and how that attempt ended (stored in the jit_status field of
 * struct _GRegex). */
typedef enum
313
{
314
  JIT_STATUS_DEFAULT,           /* enablement was never tried */
315
  JIT_STATUS_ENABLED,           /* enablement was tried and succeeded */
316
  JIT_STATUS_DISABLED           /* enablement was tried and failed */
317
} JITStatus;
318
319
/* Internal representation of a compiled regular expression: the pattern
 * text, its PCRE2 compiled form, and the compile, match and JIT options
 * recorded for it (both as PCRE2 values and as the original GRegex flags). */
struct _GRegex
320
{
321
  gint ref_count;               /* the ref count for the immutable part (atomic) */
322
  gchar *pattern;               /* the pattern */
323
  pcre2_code *pcre_re;          /* compiled form of the pattern */
324
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
325
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
326
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
327
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
328
  uint32_t jit_options;         /* options which were enabled for jit compiler */
329
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
330
  /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation,
331
   * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever
332
   * enabled for the current regex AND current set of jit_options.
333
   * JIT_STATUS_DEFAULT means enablement was never tried,
334
   * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it),
335
   * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again).
336
   */
337
};
338
339
/* TRUE if ret is an error code, FALSE otherwise. */
340
131k
#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
341
342
typedef struct _InterpolationData InterpolationData;
343
static gboolean  interpolation_list_needs_match (GList *list);
344
static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
345
                                                 GString *result,
346
                                                 gpointer data);
347
static GList    *split_replacement              (const gchar *replacement,
348
                                                 GError **error);
349
static void      free_interpolation_data        (InterpolationData *data);
350
351
static uint32_t
352
get_pcre2_compile_options (GRegexCompileFlags compile_flags)
353
16
{
354
  /* Maps compile flags to pcre2 values */
355
16
  uint32_t pcre2_flags = 0;
356
357
16
  if (compile_flags & G_REGEX_CASELESS)
358
0
    pcre2_flags |= PCRE2_CASELESS;
359
16
  if (compile_flags & G_REGEX_MULTILINE)
360
0
    pcre2_flags |= PCRE2_MULTILINE;
361
16
  if (compile_flags & G_REGEX_DOTALL)
362
0
    pcre2_flags |= PCRE2_DOTALL;
363
16
  if (compile_flags & G_REGEX_EXTENDED)
364
0
    pcre2_flags |= PCRE2_EXTENDED;
365
16
  if (compile_flags & G_REGEX_ANCHORED)
366
0
    pcre2_flags |= PCRE2_ANCHORED;
367
16
  if (compile_flags & G_REGEX_DOLLAR_ENDONLY)
368
0
    pcre2_flags |= PCRE2_DOLLAR_ENDONLY;
369
16
  if (compile_flags & G_REGEX_UNGREEDY)
370
0
    pcre2_flags |= PCRE2_UNGREEDY;
371
16
  if (!(compile_flags & G_REGEX_RAW))
372
4
    pcre2_flags |= PCRE2_UTF;
373
16
  if (compile_flags & G_REGEX_NO_AUTO_CAPTURE)
374
0
    pcre2_flags |= PCRE2_NO_AUTO_CAPTURE;
375
16
  if (compile_flags & G_REGEX_FIRSTLINE)
376
0
    pcre2_flags |= PCRE2_FIRSTLINE;
377
16
  if (compile_flags & G_REGEX_DUPNAMES)
378
0
    pcre2_flags |= PCRE2_DUPNAMES;
379
380
16
  return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK;
381
16
}
382
383
static uint32_t
384
get_pcre2_match_options (GRegexMatchFlags   match_flags,
385
                         GRegexCompileFlags compile_flags)
386
131k
{
387
  /* Maps match flags to pcre2 values */
388
131k
  uint32_t pcre2_flags = 0;
389
390
131k
  if (match_flags & G_REGEX_MATCH_ANCHORED)
391
0
    pcre2_flags |= PCRE2_ANCHORED;
392
131k
  if (match_flags & G_REGEX_MATCH_NOTBOL)
393
0
    pcre2_flags |= PCRE2_NOTBOL;
394
131k
  if (match_flags & G_REGEX_MATCH_NOTEOL)
395
0
    pcre2_flags |= PCRE2_NOTEOL;
396
131k
  if (match_flags & G_REGEX_MATCH_NOTEMPTY)
397
0
    pcre2_flags |= PCRE2_NOTEMPTY;
398
131k
  if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT)
399
0
    pcre2_flags |= PCRE2_PARTIAL_SOFT;
400
131k
  if (match_flags & G_REGEX_MATCH_PARTIAL_HARD)
401
0
    pcre2_flags |= PCRE2_PARTIAL_HARD;
402
131k
  if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART)
403
0
    pcre2_flags |= PCRE2_NOTEMPTY_ATSTART;
404
405
131k
  if (compile_flags & G_REGEX_RAW)
406
99.6k
    pcre2_flags |= PCRE2_NO_UTF_CHECK;
407
408
131k
  return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK;
409
131k
}
410
411
static GRegexCompileFlags
412
g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags)
413
0
{
414
0
  GRegexCompileFlags compile_flags = G_REGEX_DEFAULT;
415
416
0
  if (pcre2_flags & PCRE2_CASELESS)
417
0
    compile_flags |= G_REGEX_CASELESS;
418
0
  if (pcre2_flags & PCRE2_MULTILINE)
419
0
    compile_flags |= G_REGEX_MULTILINE;
420
0
  if (pcre2_flags & PCRE2_DOTALL)
421
0
    compile_flags |= G_REGEX_DOTALL;
422
0
  if (pcre2_flags & PCRE2_EXTENDED)
423
0
    compile_flags |= G_REGEX_EXTENDED;
424
0
  if (pcre2_flags & PCRE2_ANCHORED)
425
0
    compile_flags |= G_REGEX_ANCHORED;
426
0
  if (pcre2_flags & PCRE2_DOLLAR_ENDONLY)
427
0
    compile_flags |= G_REGEX_DOLLAR_ENDONLY;
428
0
  if (pcre2_flags & PCRE2_UNGREEDY)
429
0
    compile_flags |= G_REGEX_UNGREEDY;
430
0
  if (!(pcre2_flags & PCRE2_UTF))
431
0
    compile_flags |= G_REGEX_RAW;
432
0
  if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE)
433
0
    compile_flags |= G_REGEX_NO_AUTO_CAPTURE;
434
0
  if (pcre2_flags & PCRE2_FIRSTLINE)
435
0
    compile_flags |= G_REGEX_FIRSTLINE;
436
0
  if (pcre2_flags & PCRE2_DUPNAMES)
437
0
    compile_flags |= G_REGEX_DUPNAMES;
438
439
0
  return compile_flags & G_REGEX_COMPILE_MASK;
440
0
}
441
442
static GRegexMatchFlags
443
g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags)
444
0
{
445
0
  GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT;
446
447
0
  if (pcre2_flags & PCRE2_ANCHORED)
448
0
    match_flags |= G_REGEX_MATCH_ANCHORED;
449
0
  if (pcre2_flags & PCRE2_NOTBOL)
450
0
    match_flags |= G_REGEX_MATCH_NOTBOL;
451
0
  if (pcre2_flags & PCRE2_NOTEOL)
452
0
    match_flags |= G_REGEX_MATCH_NOTEOL;
453
0
  if (pcre2_flags & PCRE2_NOTEMPTY)
454
0
    match_flags |= G_REGEX_MATCH_NOTEMPTY;
455
0
  if (pcre2_flags & PCRE2_PARTIAL_SOFT)
456
0
    match_flags |= G_REGEX_MATCH_PARTIAL_SOFT;
457
0
  if (pcre2_flags & PCRE2_PARTIAL_HARD)
458
0
    match_flags |= G_REGEX_MATCH_PARTIAL_HARD;
459
0
  if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART)
460
0
    match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART;
461
462
0
  return (match_flags & G_REGEX_MATCH_MASK);
463
0
}
464
465
static uint32_t
466
get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags)
467
16
{
468
16
  compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK;
469
470
16
  switch (compile_flags)
471
16
    {
472
0
    case G_REGEX_NEWLINE_CR:
473
0
      return PCRE2_NEWLINE_CR;
474
0
    case G_REGEX_NEWLINE_LF:
475
0
      return PCRE2_NEWLINE_LF;
476
0
    case G_REGEX_NEWLINE_CRLF:
477
0
      return PCRE2_NEWLINE_CRLF;
478
0
    case G_REGEX_NEWLINE_ANYCRLF:
479
0
      return PCRE2_NEWLINE_ANYCRLF;
480
16
    default:
481
16
      if (compile_flags != 0)
482
0
        return 0;
483
484
16
      return PCRE2_NEWLINE_ANY;
485
16
    }
486
16
}
487
488
static uint32_t
489
get_pcre2_newline_match_options (GRegexMatchFlags match_flags)
490
16
{
491
16
  switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK)
492
16
    {
493
0
    case G_REGEX_MATCH_NEWLINE_CR:
494
0
      return PCRE2_NEWLINE_CR;
495
0
    case G_REGEX_MATCH_NEWLINE_LF:
496
0
      return PCRE2_NEWLINE_LF;
497
0
    case G_REGEX_MATCH_NEWLINE_CRLF:
498
0
      return PCRE2_NEWLINE_CRLF;
499
0
    case G_REGEX_MATCH_NEWLINE_ANY:
500
0
      return PCRE2_NEWLINE_ANY;
501
0
    case G_REGEX_MATCH_NEWLINE_ANYCRLF:
502
0
      return PCRE2_NEWLINE_ANYCRLF;
503
16
    default:
504
16
      return 0;
505
16
    }
506
16
}
507
508
static uint32_t
509
get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags)
510
16
{
511
16
  if (compile_flags & G_REGEX_BSR_ANYCRLF)
512
0
    return PCRE2_BSR_ANYCRLF;
513
514
16
  return PCRE2_BSR_UNICODE;
515
16
}
516
517
static uint32_t
518
get_pcre2_bsr_match_options (GRegexMatchFlags match_flags)
519
16
{
520
16
  if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF)
521
0
    return PCRE2_BSR_ANYCRLF;
522
523
16
  if (match_flags & G_REGEX_MATCH_BSR_ANY)
524
0
    return PCRE2_BSR_UNICODE;
525
526
16
  return 0;
527
16
}
528
529
static char *
530
get_pcre2_error_string (int errcode)
531
13.4k
{
532
13.4k
  PCRE2_UCHAR8 error_msg[2048];
533
13.4k
  int err_length;
534
535
13.4k
  err_length = pcre2_get_error_message (errcode, error_msg,
536
13.4k
                                        G_N_ELEMENTS (error_msg));
537
538
13.4k
  if (err_length <= 0)
539
0
    return NULL;
540
541
  /* The array is always filled with a trailing zero */
542
13.4k
  g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg));
543
13.4k
  return g_memdup2 (error_msg, err_length + 1);
544
13.4k
}
545
546
static const gchar *
547
translate_match_error (gint errcode)
548
13.4k
{
549
13.4k
  switch (errcode)
550
13.4k
    {
551
0
    case PCRE2_ERROR_NOMATCH:
552
      /* not an error */
553
0
      break;
554
0
    case PCRE2_ERROR_NULL:
555
      /* NULL argument, this should not happen in GRegex */
556
0
      g_critical ("A NULL argument was passed to PCRE");
557
0
      break;
558
0
    case PCRE2_ERROR_BADOPTION:
559
0
      return "bad options";
560
0
    case PCRE2_ERROR_BADMAGIC:
561
0
      return _("corrupted object");
562
0
    case PCRE2_ERROR_NOMEMORY:
563
0
      return _("out of memory");
564
0
    case PCRE2_ERROR_NOSUBSTRING:
565
      /* not used by pcre2_match() */
566
0
      break;
567
0
    case PCRE2_ERROR_MATCHLIMIT:
568
0
    case PCRE2_ERROR_CALLOUT:
569
      /* callouts are not implemented */
570
0
      break;
571
0
    case PCRE2_ERROR_BADUTFOFFSET:
572
      /* we do not check if strings are valid */
573
0
      break;
574
0
    case PCRE2_ERROR_PARTIAL:
575
      /* not an error */
576
0
      break;
577
0
    case PCRE2_ERROR_INTERNAL:
578
0
      return _("internal error");
579
0
    case PCRE2_ERROR_DFA_UITEM:
580
0
      return _("the pattern contains items not supported for partial matching");
581
0
    case PCRE2_ERROR_DFA_UCOND:
582
0
      return _("back references as conditions are not supported for partial matching");
583
0
    case PCRE2_ERROR_DFA_WSSIZE:
584
      /* handled expanding the workspace */
585
0
      break;
586
0
    case PCRE2_ERROR_DFA_RECURSE:
587
0
    case PCRE2_ERROR_RECURSIONLIMIT:
588
0
      return _("recursion limit reached");
589
0
    case PCRE2_ERROR_BADOFFSET:
590
0
      return _("bad offset");
591
0
    case PCRE2_ERROR_RECURSELOOP:
592
0
      return _("recursion loop");
593
0
    case PCRE2_ERROR_JIT_BADOPTION:
594
      /* should not happen in GRegex since we check modes before each match */
595
0
      return _("matching mode is requested that was not compiled for JIT");
596
13.4k
    default:
597
13.4k
      break;
598
13.4k
    }
599
13.4k
  return NULL;
600
13.4k
}
601
602
static char *
603
get_match_error_message (int errcode)
604
13.4k
{
605
13.4k
  const char *msg = translate_match_error (errcode);
606
13.4k
  char *error_string;
607
608
13.4k
  if (msg)
609
0
    return g_strdup (msg);
610
611
13.4k
  error_string = get_pcre2_error_string (errcode);
612
613
13.4k
  if (error_string)
614
13.4k
    return error_string;
615
616
0
  return g_strdup (_("unknown error"));
617
13.4k
}
618
619
static void
620
translate_compile_error (gint *errcode, const gchar **errmsg)
621
0
{
622
  /* If errcode is known we put the translatable error message in
623
   * errmsg. If errcode is unknown we put the generic
624
   * G_REGEX_ERROR_COMPILE error code in errcode.
625
   * Note that there can be more PCRE errors with the same GRegexError
626
   * and that some PCRE errors are useless for us.
627
   */
628
0
  gint original_errcode = *errcode;
629
630
0
  *errcode = -1;
631
0
  *errmsg = NULL;
632
633
0
  switch (original_errcode)
634
0
    {
635
0
    case PCRE2_ERROR_END_BACKSLASH:
636
0
      *errcode = G_REGEX_ERROR_STRAY_BACKSLASH;
637
0
      *errmsg = _("\\ at end of pattern");
638
0
      break;
639
0
    case PCRE2_ERROR_END_BACKSLASH_C:
640
0
      *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR;
641
0
      *errmsg = _("\\c at end of pattern");
642
0
      break;
643
0
    case PCRE2_ERROR_UNKNOWN_ESCAPE:
644
0
    case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
645
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
646
0
      *errmsg = _("unrecognized character following \\");
647
0
      break;
648
0
    case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
649
0
      *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
650
0
      *errmsg = _("numbers out of order in {} quantifier");
651
0
      break;
652
0
    case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
653
0
      *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
654
0
      *errmsg = _("number too big in {} quantifier");
655
0
      break;
656
0
    case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
657
0
      *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
658
0
      *errmsg = _("missing terminating ] for character class");
659
0
      break;
660
0
    case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
661
0
      *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
662
0
      *errmsg = _("invalid escape sequence in character class");
663
0
      break;
664
0
    case PCRE2_ERROR_CLASS_RANGE_ORDER:
665
0
      *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
666
0
      *errmsg = _("range out of order in character class");
667
0
      break;
668
0
    case PCRE2_ERROR_QUANTIFIER_INVALID:
669
0
    case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
670
0
      *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT;
671
0
      *errmsg = _("nothing to repeat");
672
0
      break;
673
0
    case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
674
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
675
0
      *errmsg = _("unrecognized character after (? or (?-");
676
0
      break;
677
0
    case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
678
0
      *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
679
0
      *errmsg = _("POSIX named classes are supported only within a class");
680
0
      break;
681
0
    case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
682
0
      *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
683
0
      *errmsg = _("POSIX collating elements are not supported");
684
0
      break;
685
0
    case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
686
0
    case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
687
0
    case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
688
0
      *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
689
0
      *errmsg = _("missing terminating )");
690
0
      break;
691
0
    case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
692
0
      *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
693
0
      *errmsg = _("reference to non-existent subpattern");
694
0
      break;
695
0
    case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
696
0
      *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT;
697
0
      *errmsg = _("missing ) after comment");
698
0
      break;
699
0
    case PCRE2_ERROR_PATTERN_TOO_LARGE:
700
0
      *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
701
0
      *errmsg = _("regular expression is too large");
702
0
      break;
703
0
    case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
704
0
      *errcode = G_REGEX_ERROR_MALFORMED_CONDITION;
705
0
      *errmsg = _("malformed number or name after (?(");
706
0
      break;
707
0
    case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
708
0
      *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
709
0
      *errmsg = _("lookbehind assertion is not fixed length");
710
0
      break;
711
0
    case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
712
0
      *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
713
0
      *errmsg = _("conditional group contains more than two branches");
714
0
      break;
715
0
    case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
716
0
      *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED;
717
0
      *errmsg = _("assertion expected after (?(");
718
0
      break;
719
0
    case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
720
0
      *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
721
0
      *errmsg = _("a numbered reference must not be zero");
722
0
      break;
723
0
    case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
724
0
      *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
725
0
      *errmsg = _("unknown POSIX class name");
726
0
      break;
727
0
    case PCRE2_ERROR_CODE_POINT_TOO_BIG:
728
0
    case PCRE2_ERROR_INVALID_HEXADECIMAL:
729
0
      *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
730
0
      *errmsg = _("character value in \\x{...} sequence is too large");
731
0
      break;
732
0
    case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
733
0
      *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
734
0
      *errmsg = _("\\C not allowed in lookbehind assertion");
735
0
      break;
736
0
    case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
737
0
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
738
0
      *errmsg = _("missing terminator in subpattern name");
739
0
      break;
740
0
    case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
741
0
      *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
742
0
      *errmsg = _("two named subpatterns have the same name");
743
0
      break;
744
0
    case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
745
0
      *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY;
746
0
      *errmsg = _("malformed \\P or \\p sequence");
747
0
      break;
748
0
    case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
749
0
      *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY;
750
0
      *errmsg = _("unknown property name after \\P or \\p");
751
0
      break;
752
0
    case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
753
0
      *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
754
0
      *errmsg = _("subpattern name is too long (maximum 32 characters)");
755
0
      break;
756
0
    case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
757
0
      *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
758
0
      *errmsg = _("too many named subpatterns (maximum 10,000)");
759
0
      break;
760
0
    case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
761
0
      *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE;
762
0
      *errmsg = _("octal value is greater than \\377");
763
0
      break;
764
0
    case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
765
0
      *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
766
0
      *errmsg = _("DEFINE group contains more than one branch");
767
0
      break;
768
0
    case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
769
0
      *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
770
0
      *errmsg = _("inconsistent NEWLINE options");
771
0
      break;
772
0
    case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
773
0
#ifdef PCRE2_ERROR_MISSING_NUMBER_TERMINATOR
774
0
    case PCRE2_ERROR_MISSING_NUMBER_TERMINATOR:
775
0
#endif
776
0
      *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE;
777
0
      *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
778
0
                  "number, or by a plain number");
779
0
      break;
780
0
    case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
781
0
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
782
0
      *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
783
0
      break;
784
0
    case PCRE2_ERROR_VERB_UNKNOWN:
785
0
      *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
786
0
      *errmsg = _("(*VERB) not recognized");
787
0
      break;
788
0
    case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
789
0
      *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG;
790
0
      *errmsg = _("number is too big");
791
0
      break;
792
0
    case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
793
0
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
794
0
      *errmsg = _("missing subpattern name after (?&");
795
0
      break;
796
0
    case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
797
0
      *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
798
0
      *errmsg = _("different names for subpatterns of the same number are not allowed");
799
0
      break;
800
0
    case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
801
0
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
802
0
      *errmsg = _("(*MARK) must have an argument");
803
0
      break;
804
0
    case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
805
0
      *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR;
806
0
      *errmsg = _( "\\c must be followed by an ASCII character");
807
0
      break;
808
0
    case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
809
0
      *errcode = G_REGEX_ERROR_MISSING_NAME;
810
0
      *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
811
0
      break;
812
0
    case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
813
0
      *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
814
0
      *errmsg = _("\\N is not supported in a class");
815
0
      break;
816
0
    case PCRE2_ERROR_VERB_NAME_TOO_LONG:
817
0
      *errcode = G_REGEX_ERROR_NAME_TOO_LONG;
818
0
      *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
819
0
      break;
820
0
    case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
821
0
      *errcode = G_REGEX_ERROR_INTERNAL;
822
0
      *errmsg = _("code overflow");
823
0
      break;
824
0
    case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
825
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
826
0
      *errmsg = _("unrecognized character after (?P");
827
0
      break;
828
0
    case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
829
0
      *errcode = G_REGEX_ERROR_INTERNAL;
830
0
      *errmsg = _("overran compiling workspace");
831
0
      break;
832
0
    case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
833
0
      *errcode = G_REGEX_ERROR_INTERNAL;
834
0
      *errmsg = _("previously-checked referenced subpattern not found");
835
0
      break;
836
0
    case PCRE2_ERROR_HEAP_FAILED:
837
0
    case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
838
0
    case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
839
0
    case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
840
0
    case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
841
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
842
0
    case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
843
0
    case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
844
0
    case PCRE2_ERROR_UTF_IS_DISABLED:
845
0
    case PCRE2_ERROR_UCP_IS_DISABLED:
846
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
847
0
    case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
848
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE:
849
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
850
0
      *errcode = G_REGEX_ERROR_INTERNAL;
851
0
      break;
852
0
    case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
853
0
    case PCRE2_ERROR_CLASS_INVALID_RANGE:
854
0
    case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
855
0
    case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
856
0
    case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
857
0
    case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
858
0
    case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
859
0
    case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
860
0
    case PCRE2_ERROR_NULL_PATTERN:
861
0
    case PCRE2_ERROR_BAD_OPTIONS:
862
0
    case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
863
0
    case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
864
0
    case PCRE2_ERROR_INVALID_OCTAL:
865
0
    case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
866
0
    case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
867
0
    case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
868
0
    case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
869
0
    case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
870
0
    case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
871
0
    case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
872
0
    case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
873
0
    case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
874
0
    case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
875
0
    case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
876
0
    case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
877
0
    default:
878
0
      *errcode = G_REGEX_ERROR_COMPILE;
879
0
      break;
880
0
    }
881
882
0
  g_assert (*errcode != -1);
883
0
}
884
885
/* GMatchInfo */
886
887
static GMatchInfo *
888
match_info_new (const GRegex     *regex,
889
                const gchar      *string,
890
                gint              string_len,
891
                gint              start_position,
892
                GRegexMatchFlags  match_options,
893
                gboolean          is_dfa)
894
131k
{
895
131k
  GMatchInfo *match_info;
896
897
131k
  if (string_len < 0)
898
131k
    string_len = strlen (string);
899
900
131k
  match_info = g_new0 (GMatchInfo, 1);
901
131k
  match_info->ref_count = 1;
902
131k
  match_info->regex = g_regex_ref ((GRegex *)regex);
903
131k
  match_info->string = string;
904
131k
  match_info->string_len = string_len;
905
131k
  match_info->matches = PCRE2_ERROR_NOMATCH;
906
131k
  match_info->pos = start_position;
907
131k
  match_info->match_opts =
908
131k
    get_pcre2_match_options (match_options, regex->orig_compile_opts);
909
910
131k
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
911
131k
                      &match_info->n_subpatterns);
912
913
131k
  match_info->match_context = pcre2_match_context_create (NULL);
914
915
131k
  if (is_dfa)
916
0
    {
917
      /* These values should be enough for most cases; if they are not
918
       * enough, g_regex_match_all_full() will expand them. */
919
0
      match_info->n_workspace = 100;
920
0
      match_info->workspace = g_new (gint, match_info->n_workspace);
921
0
    }
922
923
131k
  match_info->n_offsets = 2;
924
131k
  match_info->offsets = g_new0 (gint, match_info->n_offsets);
925
  /* Set an invalid position for the previous match. */
926
131k
  match_info->offsets[0] = -1;
927
131k
  match_info->offsets[1] = -1;
928
929
131k
  match_info->match_data = pcre2_match_data_create_from_pattern (
930
131k
      match_info->regex->pcre_re,
931
131k
      NULL);
932
933
131k
  return match_info;
934
131k
}
935
936
static gboolean
937
recalc_match_offsets (GMatchInfo *match_info,
938
                      GError     **error)
939
1.70k
{
940
1.70k
  PCRE2_SIZE *ovector;
941
1.70k
  uint32_t ovector_size = 0;
942
1.70k
  uint32_t pre_n_offset;
943
1.70k
  uint32_t i;
944
945
1.70k
  g_assert (!IS_PCRE2_ERROR (match_info->matches));
946
947
1.70k
  if (match_info->matches == PCRE2_ERROR_PARTIAL)
948
0
    ovector_size = 1;
949
1.70k
  else if (match_info->matches > 0)
950
1.70k
    ovector_size = match_info->matches;
951
952
1.70k
  g_assert (ovector_size != 0);
953
954
1.70k
  if (pcre2_get_ovector_count (match_info->match_data) < ovector_size)
955
0
    {
956
0
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
957
0
                   _("Error while matching regular expression %s: %s"),
958
0
                   match_info->regex->pattern, _("code overflow"));
959
0
      return FALSE;
960
0
    }
961
962
1.70k
  pre_n_offset = match_info->n_offsets;
963
1.70k
  match_info->n_offsets = ovector_size * 2;
964
1.70k
  ovector = pcre2_get_ovector_pointer (match_info->match_data);
965
966
1.70k
  if (match_info->n_offsets != pre_n_offset)
967
6
    {
968
6
      match_info->offsets = g_realloc_n (match_info->offsets,
969
6
                                         match_info->n_offsets,
970
6
                                         sizeof (gint));
971
6
    }
972
973
5.11k
  for (i = 0; i < match_info->n_offsets; i++)
974
3.41k
    {
975
3.41k
      match_info->offsets[i] = (int) ovector[i];
976
3.41k
    }
977
978
1.70k
  return TRUE;
979
1.70k
}
980
981
static JITStatus
982
enable_jit_with_match_options (GMatchInfo  *match_info,
983
                               uint32_t  match_options)
984
131k
{
985
131k
  gint retval;
986
131k
  uint32_t old_jit_options, new_jit_options;
987
988
131k
  if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE))
989
32.0k
    return JIT_STATUS_DISABLED;
990
991
99.6k
  if (match_info->regex->jit_status == JIT_STATUS_DISABLED)
992
0
    return JIT_STATUS_DISABLED;
993
994
99.6k
  if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS)
995
0
    return JIT_STATUS_DISABLED;
996
997
99.6k
  old_jit_options = match_info->regex->jit_options;
998
99.6k
  new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE;
999
99.6k
  if (match_options & PCRE2_PARTIAL_HARD)
1000
0
    new_jit_options |= PCRE2_JIT_PARTIAL_HARD;
1001
99.6k
  if (match_options & PCRE2_PARTIAL_SOFT)
1002
0
    new_jit_options |= PCRE2_JIT_PARTIAL_SOFT;
1003
1004
  /* no new options enabled */
1005
99.6k
  if (new_jit_options == old_jit_options)
1006
99.6k
    {
1007
99.6k
      g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT);
1008
99.6k
      return match_info->regex->jit_status;
1009
99.6k
    }
1010
1011
12
  retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options);
1012
12
  if (retval == 0)
1013
12
    {
1014
12
      match_info->regex->jit_status = JIT_STATUS_ENABLED;
1015
1016
12
      match_info->regex->jit_options = new_jit_options;
1017
      /* Set min stack size for JIT to 32KiB and max to 512KiB */
1018
12
      match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL);
1019
12
      pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack);
1020
12
    }
1021
0
  else
1022
0
    {
1023
0
      match_info->regex->jit_status = JIT_STATUS_DISABLED;
1024
1025
0
      switch (retval)
1026
0
        {
1027
0
        case PCRE2_ERROR_NOMEMORY:
1028
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1029
0
                   "but JIT was unable to allocate executable memory for the "
1030
0
                   "compiler. Falling back to interpretive code.");
1031
0
          break;
1032
0
        case PCRE2_ERROR_JIT_BADOPTION:
1033
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1034
0
                   "but JIT support is not available. Falling back to "
1035
0
                   "interpretive code.");
1036
0
          break;
1037
0
        default:
1038
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1039
0
                   "but request for JIT support had unexpectedly failed (error %d). "
1040
0
                   "Falling back to interpretive code.",
1041
0
                   retval);
1042
0
          break;
1043
0
        }
1044
0
    }
1045
1046
12
  return match_info->regex->jit_status;
1047
1048
12
  g_assert_not_reached ();
1049
0
}
1050
1051
/**
1052
 * g_match_info_get_regex:
1053
 * @match_info: a #GMatchInfo
1054
 *
1055
 * Returns the #GRegex object used in @match_info. It belongs to GLib
1056
 * and must not be freed. Use g_regex_ref() if you need to keep it
1057
 * after you free the @match_info object.
1058
 *
1059
 * Returns: (transfer none): #GRegex object used in @match_info
1060
 *
1061
 * Since: 2.14
1062
 */
1063
GRegex *
1064
g_match_info_get_regex (const GMatchInfo *match_info)
1065
0
{
1066
0
  g_return_val_if_fail (match_info != NULL, NULL);
1067
0
  return match_info->regex;
1068
0
}
1069
1070
/**
1071
 * g_match_info_get_string:
1072
 * @match_info: a #GMatchInfo
1073
 *
1074
 * Returns the string searched with @match_info. This is the
1075
 * string passed to g_regex_match() or g_regex_replace(), so
1076
 * you must not free it before calling this function.
1077
 *
1078
 * Returns: the string searched with @match_info
1079
 *
1080
 * Since: 2.14
1081
 */
1082
const gchar *
1083
g_match_info_get_string (const GMatchInfo *match_info)
1084
0
{
1085
0
  g_return_val_if_fail (match_info != NULL, NULL);
1086
0
  return match_info->string;
1087
0
}
1088
1089
/**
1090
 * g_match_info_ref:
1091
 * @match_info: a #GMatchInfo
1092
 *
1093
 * Increases the reference count of @match_info by 1.
1094
 *
1095
 * Returns: @match_info
1096
 *
1097
 * Since: 2.30
1098
 */
1099
GMatchInfo       *
1100
g_match_info_ref (GMatchInfo *match_info)
1101
0
{
1102
0
  g_return_val_if_fail (match_info != NULL, NULL);
1103
0
  g_atomic_int_inc (&match_info->ref_count);
1104
0
  return match_info;
1105
0
}
1106
1107
/**
1108
 * g_match_info_unref:
1109
 * @match_info: a #GMatchInfo
1110
 *
1111
 * Decreases the reference count of @match_info by 1. When the reference count drops
1112
 * to zero, it frees all the memory associated with the match_info structure.
1113
 *
1114
 * Since: 2.30
1115
 */
1116
void
1117
g_match_info_unref (GMatchInfo *match_info)
1118
131k
{
1119
131k
  if (g_atomic_int_dec_and_test (&match_info->ref_count))
1120
131k
    {
1121
131k
      g_regex_unref (match_info->regex);
1122
131k
      if (match_info->match_context)
1123
131k
        pcre2_match_context_free (match_info->match_context);
1124
131k
      if (match_info->jit_stack)
1125
12
        pcre2_jit_stack_free (match_info->jit_stack);
1126
131k
      if (match_info->match_data)
1127
131k
        pcre2_match_data_free (match_info->match_data);
1128
131k
      g_free (match_info->offsets);
1129
131k
      g_free (match_info->workspace);
1130
131k
      g_free (match_info);
1131
131k
    }
1132
131k
}
1133
1134
/**
1135
 * g_match_info_free:
1136
 * @match_info: (nullable): a #GMatchInfo, or %NULL
1137
 *
1138
 * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
1139
 * nothing.
1140
 *
1141
 * Since: 2.14
1142
 */
1143
void
1144
g_match_info_free (GMatchInfo *match_info)
1145
131k
{
1146
131k
  if (match_info == NULL)
1147
0
    return;
1148
1149
131k
  g_match_info_unref (match_info);
1150
131k
}
1151
1152
/**
1153
 * g_match_info_next:
1154
 * @match_info: a #GMatchInfo structure
1155
 * @error: location to store the error occurring, or %NULL to ignore errors
1156
 *
1157
 * Scans for the next match using the same parameters as the previous
1158
 * call to g_regex_match_full() or g_regex_match() that returned
1159
 * @match_info.
1160
 *
1161
 * The match is done on the string passed to the match function, so you
1162
 * cannot free it before calling this function.
1163
 *
1164
 * Returns: %TRUE if the string matched, %FALSE otherwise
1165
 *
1166
 * Since: 2.14
1167
 */
1168
gboolean
1169
g_match_info_next (GMatchInfo  *match_info,
1170
                   GError     **error)
1171
131k
{
1172
131k
  JITStatus jit_status;
1173
131k
  gint prev_match_start;
1174
131k
  gint prev_match_end;
1175
131k
  uint32_t opts;
1176
1177
131k
  g_return_val_if_fail (match_info != NULL, FALSE);
1178
131k
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1179
131k
  g_return_val_if_fail (match_info->pos >= 0, FALSE);
1180
1181
131k
  prev_match_start = match_info->offsets[0];
1182
131k
  prev_match_end = match_info->offsets[1];
1183
1184
131k
  if (match_info->pos > match_info->string_len)
1185
0
    {
1186
      /* we have reached the end of the string */
1187
0
      match_info->pos = -1;
1188
0
      match_info->matches = PCRE2_ERROR_NOMATCH;
1189
0
      return FALSE;
1190
0
    }
1191
1192
131k
  opts = match_info->regex->match_opts | match_info->match_opts;
1193
1194
131k
  jit_status = enable_jit_with_match_options (match_info, opts);
1195
131k
  if (jit_status == JIT_STATUS_ENABLED)
1196
99.6k
    {
1197
99.6k
      match_info->matches = pcre2_jit_match (match_info->regex->pcre_re,
1198
99.6k
                                             (PCRE2_SPTR8) match_info->string,
1199
99.6k
                                             match_info->string_len,
1200
99.6k
                                             match_info->pos,
1201
99.6k
                                             opts,
1202
99.6k
                                             match_info->match_data,
1203
99.6k
                                             match_info->match_context);
1204
      /* if the JIT stack limit was reached, fall back to non-JIT matching in
1205
       * the next conditional statement */
1206
99.6k
      if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT)
1207
0
        {
1208
0
          g_debug ("PCRE2 JIT stack limit reached, falling back to "
1209
0
                   "non-optimized matching.");
1210
0
          opts |= PCRE2_NO_JIT;
1211
0
          jit_status = JIT_STATUS_DISABLED;
1212
0
        }
1213
99.6k
    }
1214
1215
131k
  if (jit_status != JIT_STATUS_ENABLED)
1216
32.0k
    {
1217
32.0k
      match_info->matches = pcre2_match (match_info->regex->pcre_re,
1218
32.0k
                                         (PCRE2_SPTR8) match_info->string,
1219
32.0k
                                         match_info->string_len,
1220
32.0k
                                         match_info->pos,
1221
32.0k
                                         opts,
1222
32.0k
                                         match_info->match_data,
1223
32.0k
                                         match_info->match_context);
1224
32.0k
    }
1225
1226
131k
  if (IS_PCRE2_ERROR (match_info->matches))
1227
13.4k
    {
1228
13.4k
      gchar *error_msg = get_match_error_message (match_info->matches);
1229
1230
13.4k
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1231
13.4k
                   _("Error while matching regular expression %s: %s"),
1232
13.4k
                   match_info->regex->pattern, error_msg);
1233
13.4k
      g_clear_pointer (&error_msg, g_free);
1234
13.4k
      return FALSE;
1235
13.4k
    }
1236
118k
  else if (match_info->matches == 0)
1237
0
    {
1238
      /* match_info->offsets is too small. */
1239
0
      match_info->n_offsets *= 2;
1240
0
      match_info->offsets = g_realloc_n (match_info->offsets,
1241
0
                                         match_info->n_offsets,
1242
0
                                         sizeof (gint));
1243
1244
0
      pcre2_match_data_free (match_info->match_data);
1245
0
      match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL);
1246
1247
0
      return g_match_info_next (match_info, error);
1248
0
    }
1249
118k
  else if (match_info->matches == PCRE2_ERROR_NOMATCH)
1250
116k
    {
1251
      /* We're done with this match info */
1252
116k
      match_info->pos = -1;
1253
116k
      return FALSE;
1254
116k
    }
1255
1.70k
  else
1256
1.70k
    if (!recalc_match_offsets (match_info, error))
1257
0
      return FALSE;
1258
1259
  /* avoid infinite loops if the pattern is an empty string or something
1260
   * equivalent */
1261
1.70k
  if (match_info->pos == match_info->offsets[1])
1262
0
    {
1263
0
      if (match_info->pos > match_info->string_len)
1264
0
        {
1265
          /* we have reached the end of the string */
1266
0
          match_info->pos = -1;
1267
0
          match_info->matches = PCRE2_ERROR_NOMATCH;
1268
0
          return FALSE;
1269
0
        }
1270
1271
0
      match_info->pos = NEXT_CHAR (match_info->regex,
1272
0
                                   &match_info->string[match_info->pos]) -
1273
0
                                   match_info->string;
1274
0
    }
1275
1.70k
  else
1276
1.70k
    {
1277
1.70k
      match_info->pos = match_info->offsets[1];
1278
1.70k
    }
1279
1280
1.70k
  g_assert (match_info->matches < 0 ||
1281
1.70k
            (uint32_t) match_info->matches <= match_info->n_subpatterns + 1);
1282
1283
  /* it's possible to get two identical matches when we are matching
1284
   * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
1285
   * the string is "RegExTest" we have:
1286
   *  - search at position 0: match from 0 to 0
1287
   *  - search at position 1: match from 3 to 3
1288
   *  - search at position 3: match from 3 to 3 (duplicate)
1289
   *  - search at position 4: match from 5 to 5
1290
   *  - search at position 5: match from 5 to 5 (duplicate)
1291
   *  - search at position 6: no match -> stop
1292
   * so we have to ignore the duplicates.
1293
   * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
1294
1.70k
  if (match_info->matches >= 0 &&
1295
1.70k
      prev_match_start == match_info->offsets[0] &&
1296
0
      prev_match_end == match_info->offsets[1])
1297
0
    {
1298
      /* ignore this match and search the next one */
1299
0
      return g_match_info_next (match_info, error);
1300
0
    }
1301
1302
1.70k
  return match_info->matches >= 0;
1303
1.70k
}
1304
1305
/**
1306
 * g_match_info_matches:
1307
 * @match_info: a #GMatchInfo structure
1308
 *
1309
 * Returns whether the previous match operation succeeded.
1310
 *
1311
 * Returns: %TRUE if the previous match operation succeeded,
1312
 *   %FALSE otherwise
1313
 *
1314
 * Since: 2.14
1315
 */
1316
gboolean
1317
g_match_info_matches (const GMatchInfo *match_info)
1318
0
{
1319
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1320
1321
0
  return match_info->matches >= 0;
1322
0
}
1323
1324
/**
1325
 * g_match_info_get_match_count:
1326
 * @match_info: a #GMatchInfo structure
1327
 *
1328
 * Retrieves the number of matched substrings (including substring 0,
1329
 * that is the whole matched text), so 1 is returned if the pattern
1330
 * has no substrings in it and 0 is returned if the match failed.
1331
 *
1332
 * If the last match was obtained using the DFA algorithm, that is
1333
 * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
1334
 * count is not that of the number of capturing parentheses but that of
1335
 * the number of matched substrings.
1336
 *
1337
 * Returns: Number of matched substrings, or -1 if an error occurred
1338
 *
1339
 * Since: 2.14
1340
 */
1341
gint
1342
g_match_info_get_match_count (const GMatchInfo *match_info)
1343
0
{
1344
0
  g_return_val_if_fail (match_info, -1);
1345
1346
0
  if (match_info->matches == PCRE2_ERROR_NOMATCH)
1347
    /* no match */
1348
0
    return 0;
1349
0
  else if (match_info->matches < PCRE2_ERROR_NOMATCH)
1350
    /* error */
1351
0
    return -1;
1352
0
  else
1353
    /* match */
1354
0
    return match_info->matches;
1355
0
}
1356
1357
/**
1358
 * g_match_info_is_partial_match:
1359
 * @match_info: a #GMatchInfo structure
1360
 *
1361
 * Usually if the string passed to g_regex_match*() matches as far as
1362
 * it goes, but is too short to match the entire pattern, %FALSE is
1363
 * returned. There are circumstances where it might be helpful to
1364
 * distinguish this case from other cases in which there is no match.
1365
 *
1366
 * Consider, for example, an application where a human is required to
1367
 * type in data for a field with specific formatting requirements. An
1368
 * example might be a date in the form ddmmmyy, defined by the pattern
1369
 * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
1370
 * If the application sees the user’s keystrokes one by one, and can
1371
 * check that what has been typed so far is potentially valid, it is
1372
 * able to raise an error as soon as a mistake is made.
1373
 *
1374
 * GRegex supports the concept of partial matching by means of the
1375
 * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags.
1376
 * When they are used, the return code for
1377
 * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
1378
 * for a complete match, %FALSE otherwise. But, when these functions
1379
 * return %FALSE, you can check if the match was partial calling
1380
 * g_match_info_is_partial_match().
1381
 *
1382
 * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and
1383
 * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
1384
 * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
1385
 * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching
1386
 * stops at the partial match.
1387
 * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD
1388
 * are set, the latter takes precedence.
1389
 *
1390
 * There were formerly some restrictions on the pattern for partial matching.
1391
 * The restrictions no longer apply.
1392
 *
1393
 * See pcre2partial(3) for more information on partial matching.
1394
 *
1395
 * Returns: %TRUE if the match was partial, %FALSE otherwise
1396
 *
1397
 * Since: 2.14
1398
 */
1399
gboolean
1400
g_match_info_is_partial_match (const GMatchInfo *match_info)
1401
0
{
1402
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1403
1404
0
  return match_info->matches == PCRE2_ERROR_PARTIAL;
1405
0
}
1406
1407
/**
1408
 * g_match_info_expand_references:
1409
 * @match_info: (nullable): a #GMatchInfo or %NULL
1410
 * @string_to_expand: the string to expand
1411
 * @error: location to store the error occurring, or %NULL to ignore errors
1412
 *
1413
 * Returns a new string containing the text in @string_to_expand with
1414
 * references and escape sequences expanded. References refer to the last
1415
 * match done with @string against @regex and have the same syntax used by
1416
 * g_regex_replace().
1417
 *
1418
 * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was
1419
 * passed to g_regex_new().
1420
 *
1421
 * The backreferences are extracted from the string passed to the match
1422
 * function, so you cannot call this function after freeing the string.
1423
 *
1424
 * @match_info may be %NULL in which case @string_to_expand must not
1425
 * contain references. For instance "foo\n" does not refer to an actual
1426
 * pattern and '\n' will merely be replaced with the \n character,
1427
 * while to expand "\0" (whole match) one needs the result of a match.
1428
 * Use g_regex_check_replacement() to find out whether @string_to_expand
1429
 * contains references.
1430
 *
1431
 * Returns: (nullable): the expanded string, or %NULL if an error occurred
1432
 *
1433
 * Since: 2.14
1434
 */
1435
gchar *
1436
g_match_info_expand_references (const GMatchInfo  *match_info,
1437
                                const gchar       *string_to_expand,
1438
                                GError           **error)
1439
0
{
1440
0
  GString *result;
1441
0
  GList *list;
1442
0
  GError *tmp_error = NULL;
1443
1444
0
  g_return_val_if_fail (string_to_expand != NULL, NULL);
1445
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1446
1447
0
  list = split_replacement (string_to_expand, &tmp_error);
1448
0
  if (tmp_error != NULL)
1449
0
    {
1450
0
      g_propagate_error (error, tmp_error);
1451
0
      return NULL;
1452
0
    }
1453
1454
0
  if (!match_info && interpolation_list_needs_match (list))
1455
0
    {
1456
0
      g_critical ("String '%s' contains references to the match, can't "
1457
0
                  "expand references without GMatchInfo object",
1458
0
                  string_to_expand);
1459
0
      return NULL;
1460
0
    }
1461
1462
0
  result = g_string_sized_new (strlen (string_to_expand));
1463
0
  interpolate_replacement (match_info, result, list);
1464
1465
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
1466
1467
0
  return g_string_free (result, FALSE);
1468
0
}
1469
1470
/**
1471
 * g_match_info_fetch:
1472
 * @match_info: #GMatchInfo structure
1473
 * @match_num: number of the sub expression
1474
 *
1475
 * Retrieves the text matching the @match_num'th capturing
1476
 * parentheses. 0 is the full text of the match, 1 is the first paren
1477
 * set, 2 the second, and so on.
1478
 *
1479
 * If @match_num is a valid sub pattern but it didn't match anything
1480
 * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
1481
 * string is returned.
1482
 *
1483
 * If the match was obtained using the DFA algorithm, that is using
1484
 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1485
 * string is not that of a set of parentheses but that of a matched
1486
 * substring. Substrings are matched in reverse order of length, so
1487
 * 0 is the longest match.
1488
 *
1489
 * The string is fetched from the string passed to the match function,
1490
 * so you cannot call this function after freeing the string.
1491
 *
1492
 * Returns: (nullable): The matched substring, or %NULL if an error
1493
 *     occurred. You have to free the string yourself
1494
 *
1495
 * Since: 2.14
1496
 */
1497
gchar *
1498
g_match_info_fetch (const GMatchInfo *match_info,
1499
                    gint              match_num)
1500
0
{
1501
0
  gchar *match = NULL;
1502
0
  gint start, end;
1503
1504
0
  g_return_val_if_fail (match_info != NULL, NULL);
1505
0
  g_return_val_if_fail (match_num >= 0, NULL);
1506
1507
  /* match_num does not exist or it didn't match, e.g. when matching "b"
1508
   * against "(a)?b", group 1 is empty. */
1509
0
  if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
1510
0
    match = NULL;
1511
0
  else if (start == -1)
1512
0
    match = g_strdup ("");
1513
0
  else
1514
0
    match = g_strndup (&match_info->string[start], end - start);
1515
1516
0
  return match;
1517
0
}
1518
1519
/**
1520
 * g_match_info_fetch_pos:
1521
 * @match_info: #GMatchInfo structure
1522
 * @match_num: number of the capture parenthesis
1523
 * @start_pos: (out) (optional): pointer to location where to store
1524
 *     the start position, or %NULL
1525
 * @end_pos: (out) (optional): pointer to location where to store
1526
 *     the end position (the byte after the final byte of the match), or %NULL
1527
 *
1528
 * Returns the start and end positions (in bytes) of a successfully matching 
1529
 * capture parenthesis.
1530
 * 
1531
 * Valid values for @match_num are `0` for the full text of the match,
1532
 * `1` for the first paren set, `2` for the second, and so on.
1533
 *
1534
 * As @end_pos is set to the byte after the final byte of the match (on success),
1535
 * the length of the match can be calculated as `end_pos - start_pos`.
1536
 *
1537
 * As a best practice, initialize @start_pos and @end_pos to identifiable 
1538
 * values, such as `G_MAXINT`, so that you can test if 
1539
 * `g_match_info_fetch_pos()` actually changed the value for a given 
1540
 * capture parenthesis.
1541
 *
1542
 * The parameter @match_num corresponds to a matched capture parenthesis. The 
1543
 * actual value you use for @match_num depends on the method used to generate
1544
 * @match_info. The following sections describe those methods.
1545
 * 
1546
 * ## Methods Using Non-deterministic Finite Automata Matching
1547
 *
1548
 * The methods [method@GLib.Regex.match] and [method@GLib.Regex.match_full]
1549
 * return a [struct@GLib.MatchInfo] using traditional (greedy) pattern
1550
 * matching, also known as 
1551
 * [Non-deterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
1552
 * (NFA) matching. You pass the returned `GMatchInfo` from these methods to 
1553
 * `g_match_info_fetch_pos()` to determine the start and end positions 
1554
 * of capture parentheses. The values for @match_num correspond to the capture 
1555
 * parentheses in order, with `0` corresponding to the entire matched string.
1556
 * 
1557
 * @match_num can refer to a capture parenthesis with no match. For example, 
1558
 * the string `b` matches against the pattern `(a)?b`, but the capture
1559
 * parenthesis `(a)` has no match. In this case, `g_match_info_fetch_pos()`
1560
 * returns true and sets @start_pos and @end_pos to `-1` when called with
1561
 * `match_num` as `1` (for `(a)`).
1562
 *
1563
 * For an expanded example, a regex pattern is `(a)?(.*?)the (.*)`, 
1564
 * and a candidate string is `glib regexes are the best`. In this scenario 
1565
 * there are four capture parentheses numbered 0–3: an implicit one 
1566
 * for the entire string, and three explicitly declared in the regex pattern.
1567
 *
1568
 * Given this example, the following table describes the return values 
1569
 * from `g_match_info_fetch_pos()` for various values of @match_num.
1570
 *
1571
 * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos`
1572
 * ----------- | -------- | ------------ | -------------------- | ------------------
1573
 * 0 | Matches entire string | True | 0 | 25
1574
 * 1 | Does not match first character | True | -1 | -1
1575
 * 2 | All text before `the ` | True | 0 | 17
1576
 * 3 | All text after `the ` | True | 21 | 25
1577
 * 4 | Capture paren out of range | False | Unchanged | Unchanged
1578
 *
1579
 * The following code sample and output implements this example.
1580
 *
1581
 * ``` { .c }
1582
 * #include <glib.h>
1583
 *
1584
 * int
1585
 * main (int argc, char *argv[])
1586
 * {
1587
 *   g_autoptr(GError) local_error = NULL;
1588
 *   const char *regex_pattern = "(a)?(.*?)the (.*)";
1589
 *   const char *test_string = "glib regexes are the best";
1590
 *   g_autoptr(GRegex) regex = NULL;
1591
 *
1592
 *   regex = g_regex_new (regex_pattern,
1593
 *                        G_REGEX_DEFAULT,
1594
 *                        G_REGEX_MATCH_DEFAULT,
1595
 *                        &local_error);
1596
 *   if (regex == NULL)
1597
 *     {
1598
 *       g_printerr ("Error creating regex: %s\n", local_error->message);
1599
 *       return 1;
1600
 *     }
1601
 *
1602
 *   g_autoptr(GMatchInfo) match_info = NULL;
1603
 *   g_regex_match (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info);
1604
 *
1605
 *   int n_matched_strings = g_match_info_get_match_count (match_info);
1606
 *
1607
 *   // Print header line
1608
 *   g_print ("match_num Contents                  Return value returned start_pos returned end_pos\n");
1609
 *
1610
 *   // Iterate over each capture paren, including one that is out of range as a demonstration.
1611
 *   for (int match_num = 0; match_num <= n_matched_strings; match_num++)
1612
 *     {
1613
 *       gboolean found_match;
1614
 *       g_autofree char *paren_string = NULL;
1615
 *       int start_pos = G_MAXINT;
1616
 *       int end_pos = G_MAXINT;
1617
 *
1618
 *       found_match = g_match_info_fetch_pos (match_info,
1619
 *                                             match_num,
1620
 *                                             &start_pos,
1621
 *                                             &end_pos);
1622
 *
1623
 *       // If no match, display N/A as the found string.
1624
 *       if (start_pos == G_MAXINT || start_pos == -1)
1625
 *         paren_string = g_strdup ("N/A");
1626
 *       else
1627
 *         paren_string = g_strndup (test_string + start_pos, end_pos - start_pos);
1628
 *
1629
 *       g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos);
1630
 *     }
1631
 *
1632
 *   return 0;
1633
 * }
1634
 * ```
1635
 *
1636
 * ```
1637
 * match_num Contents                  Return value returned start_pos returned end_pos
1638
 * 0         glib regexes are the best 1            0                  25
1639
 * 1         N/A                       1            -1                 -1
1640
 * 2         glib regexes are          1            0                  17
1641
 * 3         best                      1            21                 25
1642
 * 4         N/A                       0            2147483647         2147483647
1643
 * ```
1644
 * ## Methods Using Deterministic Finite Automata Matching
1645
 *
1646
 * The methods [method@GLib.Regex.match_all] and 
1647
 * [method@GLib.Regex.match_all_full]
1648
 * return a `GMatchInfo` using
1649
 * [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton)
1650
 * (DFA) pattern matching. This algorithm detects overlapping matches. You pass
1651
 * the returned `GMatchInfo` from these methods to `g_match_info_fetch_pos()`
1652
 * to determine the start and end positions of each overlapping match. Use the 
1653
 * method [method@GLib.MatchInfo.get_match_count] to determine the number 
1654
 * of overlapping matches.
1655
 *
1656
 * For example, a regex pattern is `<.*>`, and a candidate string is 
1657
 * `<a> <b> <c>`. In this scenario there are three implicit capture 
1658
 * parentheses: one for the entire string, one for `<a> <b>`, and one for `<a>`.
1659
 *
1660
 * Given this example, the following table describes the return values from
1661
 * `g_match_info_fetch_pos()` for various values of @match_num.
1662
 *
1663
 * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos`
1664
 * ----------- | -------- | ------------ | -------------------- | ------------------
1665
 * 0 | Matches entire string | True | 0 | 11
1666
 * 1 | Matches `<a> <b>` | True | 0 | 7
1667
 * 2 | Matches `<a>` | True | 0 | 3
1668
 * 3 | Capture paren out of range | False | Unchanged | Unchanged
1669
 *
1670
 * The following code sample and output implements this example.
1671
 *
1672
 * ``` { .c }
1673
 * #include <glib.h>
1674
 *
1675
 * int
1676
 * main (int argc, char *argv[])
1677
 * {
1678
 *   g_autoptr(GError) local_error = NULL;
1679
 *   const char *regex_pattern = "<.*>";
1680
 *   const char *test_string = "<a> <b> <c>";
1681
 *   g_autoptr(GRegex) regex = NULL;
1682
 * 
1683
 *   regex = g_regex_new (regex_pattern,
1684
 *                        G_REGEX_DEFAULT,
1685
 *                        G_REGEX_MATCH_DEFAULT,
1686
 *                        &local_error);
1687
 *   if (regex == NULL)
1688
 *     {
1689
 *       g_printerr ("Error creating regex: %s\n", local_error->message);
1690
 *       return -1;
1691
 *     }
1692
 *
1693
 *   g_autoptr(GMatchInfo) match_info = NULL;
1694
 *   g_regex_match_all (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info);
1695
 *
1696
 *   int n_matched_strings = g_match_info_get_match_count (match_info);
1697
 *
1698
 *   // Print header line 
1699
 *   g_print ("match_num Contents                  Return value returned start_pos returned end_pos\n");
1700
 * 
1701
 *   // Iterate over each capture paren, including one that is out of range as a demonstration.
1702
 *   for (int match_num = 0; match_num <= n_matched_strings; match_num++)
1703
 *     {
1704
 *       gboolean found_match;
1705
 *       g_autofree char *paren_string = NULL;
1706
 *       int start_pos = G_MAXINT;
1707
 *       int end_pos = G_MAXINT;
1708
 *
1709
 *       found_match = g_match_info_fetch_pos (match_info, match_num, &start_pos, &end_pos);
1710
 *
1711
 *       // If no match, display N/A as the found string.
1712
 *       if (start_pos == G_MAXINT || start_pos == -1)
1713
 *         paren_string = g_strdup ("N/A");
1714
 *       else
1715
 *         paren_string = g_strndup (test_string + start_pos, end_pos - start_pos);
1716
 *
1717
 *       g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos);
1718
 *     }
1719
 *
1720
 *   return 0;
1721
 * }
1722
 * ```
1723
 *
1724
 * ```
1725
 * match_num Contents                  Return value returned start_pos returned end_pos
1726
 * 0         <a> <b> <c>               1            0                  11
1727
 * 1         <a> <b>                   1            0                  7
1728
 * 2         <a>                       1            0                  3
1729
 * 3         N/A                       0            2147483647         2147483647
1730
 * ```
1731
 *
1732
 * Returns: True if @match_num is within range, false otherwise. If
1733
 *   the capture paren has a match, @start_pos and @end_pos contain the 
1734
 *   start and end positions (in bytes) of the matching substring. If the 
1735
 *   capture paren has no match, @start_pos and @end_pos are `-1`. If 
1736
 *   @match_num is out of range, @start_pos and @end_pos are left unchanged.
1737
 *
1738
 * Since: 2.14
1739
 */
1740
gboolean
1741
g_match_info_fetch_pos (const GMatchInfo *match_info,
1742
                        gint              match_num,
1743
                        gint             *start_pos,
1744
                        gint             *end_pos)
1745
0
{
1746
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1747
0
  g_return_val_if_fail (match_num >= 0, FALSE);
1748
1749
  /* check whether there was an error */
1750
0
  if (match_info->matches < 0)
1751
0
    return FALSE;
1752
1753
  /* make sure the sub expression number they're requesting is less than
1754
   * the total number of sub expressions in the regex. When matching all
1755
   * (g_regex_match_all()), also compare against the number of matches */
1756
0
  if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches))
1757
0
    return FALSE;
1758
1759
0
  if (start_pos != NULL)
1760
0
    *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1;
1761
1762
0
  if (end_pos != NULL)
1763
0
    *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1;
1764
1765
0
  return TRUE;
1766
0
}
1767
1768
/*
1769
 * Returns number of first matched subpattern with name @name.
1770
 * There may be more than one in case when DUPNAMES is used,
1771
 * and not all subpatterns with that name match;
1772
 * pcre2_substring_number_from_name() does not work in that case.
1773
 */
1774
static gint
1775
get_matched_substring_number (const GMatchInfo *match_info,
1776
                              const gchar      *name)
1777
0
{
1778
0
  gint entrysize;
1779
0
  PCRE2_SPTR first, last;
1780
0
  guchar *entry;
1781
1782
0
  if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))
1783
0
    return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name);
1784
1785
  /* This code is analogous to code from pcre2_substring.c:
1786
   * pcre2_substring_get_byname() */
1787
0
  entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
1788
0
                                              (PCRE2_SPTR8) name,
1789
0
                                              &first,
1790
0
                                              &last);
1791
1792
0
  if (entrysize <= 0)
1793
0
    return entrysize;
1794
1795
0
  for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1796
0
    {
1797
0
      guint n = (entry[0] << 8) + entry[1];
1798
0
      if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0)
1799
0
        return n;
1800
0
    }
1801
1802
0
  return (first[0] << 8) + first[1];
1803
0
}
1804
1805
/**
1806
 * g_match_info_fetch_named:
1807
 * @match_info: #GMatchInfo structure
1808
 * @name: name of the subexpression
1809
 *
1810
 * Retrieves the text matching the capturing parentheses named @name.
1811
 *
1812
 * If @name is a valid sub pattern name but it didn't match anything
1813
 * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`)
1814
 * then an empty string is returned.
1815
 *
1816
 * The string is fetched from the string passed to the match function,
1817
 * so you cannot call this function after freeing the string.
1818
 *
1819
 * Returns: (nullable): The matched substring, or %NULL if an error
1820
 *     occurred. You have to free the string yourself
1821
 *
1822
 * Since: 2.14
1823
 */
1824
gchar *
1825
g_match_info_fetch_named (const GMatchInfo *match_info,
1826
                          const gchar      *name)
1827
0
{
1828
0
  gint num;
1829
1830
0
  g_return_val_if_fail (match_info != NULL, NULL);
1831
0
  g_return_val_if_fail (name != NULL, NULL);
1832
1833
0
  num = get_matched_substring_number (match_info, name);
1834
0
  if (num < 0)
1835
0
    return NULL;
1836
0
  else
1837
0
    return g_match_info_fetch (match_info, num);
1838
0
}
1839
1840
/**
1841
 * g_match_info_fetch_named_pos:
1842
 * @match_info: #GMatchInfo structure
1843
 * @name: name of the subexpression
1844
 * @start_pos: (out) (optional): pointer to location where to store
1845
 *     the start position, or %NULL
1846
 * @end_pos: (out) (optional): pointer to location where to store
1847
 *     the end position (the byte after the final byte of the match), or %NULL
1848
 *
1849
 * Retrieves the position in bytes of the capturing parentheses named @name.
1850
 *
1851
 * If @name is a valid sub pattern name but it didn't match anything
1852
 * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`)
1853
 * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1854
 *
1855
 * As @end_pos is set to the byte after the final byte of the match (on success),
1856
 * the length of the match can be calculated as `end_pos - start_pos`.
1857
 *
1858
 * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1859
 *     If the position cannot be fetched, @start_pos and @end_pos
1860
 *     are left unchanged.
1861
 *
1862
 * Since: 2.14
1863
 */
1864
gboolean
1865
g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1866
                              const gchar      *name,
1867
                              gint             *start_pos,
1868
                              gint             *end_pos)
1869
0
{
1870
0
  gint num;
1871
1872
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1873
0
  g_return_val_if_fail (name != NULL, FALSE);
1874
1875
0
  num = get_matched_substring_number (match_info, name);
1876
0
  if (num < 0)
1877
0
    return FALSE;
1878
1879
0
  return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1880
0
}
1881
1882
/**
1883
 * g_match_info_fetch_all:
1884
 * @match_info: a #GMatchInfo structure
1885
 *
1886
 * Bundles up pointers to each of the matching substrings from a match
1887
 * and stores them in an array of gchar pointers. The first element in
1888
 * the returned array is the match number 0, i.e. the entire matched
1889
 * text.
1890
 *
1891
 * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1892
 * "b" against "(a)?b") then an empty string is inserted.
1893
 *
1894
 * If the last match was obtained using the DFA algorithm, that is using
1895
 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1896
 * strings are not that matched by sets of parentheses but that of the
1897
 * matched substring. Substrings are matched in reverse order of length,
1898
 * so the first one is the longest match.
1899
 *
1900
 * The strings are fetched from the string passed to the match function,
1901
 * so you cannot call this function after freeing the string.
1902
 *
1903
 * Returns: (transfer full): a %NULL-terminated array of gchar *
1904
 *     pointers.  It must be freed using g_strfreev(). If the previous
1905
 *     match failed %NULL is returned
1906
 *
1907
 * Since: 2.14
1908
 */
1909
gchar **
1910
g_match_info_fetch_all (const GMatchInfo *match_info)
1911
0
{
1912
0
  gchar **result;
1913
0
  gint i;
1914
1915
0
  g_return_val_if_fail (match_info != NULL, NULL);
1916
1917
0
  if (match_info->matches < 0)
1918
0
    return NULL;
1919
1920
0
  result = g_new (gchar *, match_info->matches + 1);
1921
0
  for (i = 0; i < match_info->matches; i++)
1922
0
    result[i] = g_match_info_fetch (match_info, i);
1923
0
  result[i] = NULL;
1924
1925
0
  return result;
1926
0
}
1927
1928
1929
/* GRegex */
1930
1931
/* Expands to the definition of g_regex_error_quark(), which returns the
 * GQuark for the string "g-regex-error-quark".
 * NOTE(review): presumably this is the quark behind the G_REGEX_ERROR
 * domain used by g_set_error() in this file -- confirm in gregex.h. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1932
1933
/**
1934
 * g_regex_ref:
1935
 * @regex: a #GRegex
1936
 *
1937
 * Increases reference count of @regex by 1.
1938
 *
1939
 * Returns: @regex
1940
 *
1941
 * Since: 2.14
1942
 */
1943
GRegex *
1944
g_regex_ref (GRegex *regex)
1945
131k
{
1946
131k
  g_return_val_if_fail (regex != NULL, NULL);
1947
131k
  g_atomic_int_inc (&regex->ref_count);
1948
131k
  return regex;
1949
131k
}
1950
1951
/**
1952
 * g_regex_unref:
1953
 * @regex: a #GRegex
1954
 *
1955
 * Decreases reference count of @regex by 1. When reference count drops
1956
 * to zero, it frees all the memory associated with the regex structure.
1957
 *
1958
 * Since: 2.14
1959
 */
1960
void
1961
g_regex_unref (GRegex *regex)
1962
131k
{
1963
131k
  g_return_if_fail (regex != NULL);
1964
1965
131k
  if (g_atomic_int_dec_and_test (&regex->ref_count))
1966
0
    {
1967
0
      g_free (regex->pattern);
1968
0
      if (regex->pcre_re != NULL)
1969
0
        pcre2_code_free (regex->pcre_re);
1970
0
      g_free (regex);
1971
0
    }
1972
131k
}
1973
1974
static pcre2_code * regex_compile (const gchar  *pattern,
1975
                                   uint32_t      compile_options,
1976
                                   uint32_t      newline_options,
1977
                                   uint32_t      bsr_options,
1978
                                   GError      **error);
1979
1980
static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
1981
                                                  uint32_t    compile_options);
1982
1983
/**
1984
 * g_regex_new:
1985
 * @pattern: the regular expression
1986
 * @compile_options: compile options for the regular expression, or 0
1987
 * @match_options: match options for the regular expression, or 0
1988
 * @error: return location for a #GError
1989
 *
1990
 * Compiles the regular expression to an internal form, and does
1991
 * the initial setup of the #GRegex structure.
1992
 *
1993
 * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call
1994
 *   g_regex_unref() when you are done with it
1995
 *
1996
 * Since: 2.14
1997
 */
1998
GRegex *
1999
g_regex_new (const gchar         *pattern,
2000
             GRegexCompileFlags   compile_options,
2001
             GRegexMatchFlags     match_options,
2002
             GError             **error)
2003
16
{
2004
16
  GRegex *regex;
2005
16
  pcre2_code *re;
2006
16
  static gsize initialised = 0;
2007
16
  uint32_t pcre_compile_options;
2008
16
  uint32_t pcre_match_options;
2009
16
  uint32_t newline_options;
2010
16
  uint32_t bsr_options;
2011
2012
16
  g_return_val_if_fail (pattern != NULL, NULL);
2013
16
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2014
16
G_GNUC_BEGIN_IGNORE_DEPRECATIONS
2015
16
  g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK |
2016
16
                                             G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL);
2017
16
G_GNUC_END_IGNORE_DEPRECATIONS
2018
16
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2019
2020
16
  if (g_once_init_enter (&initialised))
2021
4
    {
2022
4
      int supports_utf8;
2023
2024
4
      pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8);
2025
4
      if (!supports_utf8)
2026
0
        g_critical (_("PCRE library is compiled without UTF8 support"));
2027
2028
4
      g_once_init_leave (&initialised, supports_utf8 ? 1 : 2);
2029
4
    }
2030
2031
16
  if (G_UNLIKELY (initialised != 1))
2032
0
    {
2033
0
      g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, 
2034
0
                           _("PCRE library is compiled with incompatible options"));
2035
0
      return NULL;
2036
0
    }
2037
2038
16
  pcre_compile_options = get_pcre2_compile_options (compile_options);
2039
16
  pcre_match_options = get_pcre2_match_options (match_options, compile_options);
2040
2041
16
  newline_options = get_pcre2_newline_match_options (match_options);
2042
16
  if (newline_options == 0)
2043
16
    newline_options = get_pcre2_newline_compile_options (compile_options);
2044
2045
16
  if (newline_options == 0)
2046
0
    {
2047
0
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2048
0
                   "Invalid newline flags");
2049
0
      return NULL;
2050
0
    }
2051
2052
16
  bsr_options = get_pcre2_bsr_match_options (match_options);
2053
16
  if (!bsr_options)
2054
16
    bsr_options = get_pcre2_bsr_compile_options (compile_options);
2055
2056
16
  re = regex_compile (pattern, pcre_compile_options,
2057
16
                      newline_options, bsr_options, error);
2058
16
  if (re == NULL)
2059
0
    return NULL;
2060
2061
16
  pcre_compile_options |=
2062
16
    get_pcre2_inline_compile_options (re, pcre_compile_options);
2063
2064
16
  regex = g_new0 (GRegex, 1);
2065
16
  regex->ref_count = 1;
2066
16
  regex->pattern = g_strdup (pattern);
2067
16
  regex->pcre_re = re;
2068
16
  regex->compile_opts = pcre_compile_options;
2069
16
  regex->orig_compile_opts = compile_options;
2070
16
  regex->match_opts = pcre_match_options;
2071
16
  regex->orig_match_opts = match_options;
2072
2073
16
  return regex;
2074
16
}
2075
2076
static pcre2_code *
2077
regex_compile (const gchar  *pattern,
2078
               uint32_t      compile_options,
2079
               uint32_t      newline_options,
2080
               uint32_t      bsr_options,
2081
               GError      **error)
2082
16
{
2083
16
  pcre2_code *re;
2084
16
  pcre2_compile_context *context;
2085
16
  const gchar *errmsg;
2086
16
  PCRE2_SIZE erroffset;
2087
16
  gint errcode;
2088
2089
16
  context = pcre2_compile_context_create (NULL);
2090
2091
  /* set newline options */
2092
16
  if (pcre2_set_newline (context, newline_options) != 0)
2093
0
    {
2094
0
      g_set_error (error, G_REGEX_ERROR,
2095
0
                   G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2096
0
                   "Invalid newline flags");
2097
0
      pcre2_compile_context_free (context);
2098
0
      return NULL;
2099
0
    }
2100
2101
  /* set bsr options */
2102
16
  if (pcre2_set_bsr (context, bsr_options) != 0)
2103
0
    {
2104
0
      g_set_error (error, G_REGEX_ERROR,
2105
0
                   G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2106
0
                   "Invalid BSR flags");
2107
0
      pcre2_compile_context_free (context);
2108
0
      return NULL;
2109
0
    }
2110
2111
  /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */
2112
16
  if (compile_options & PCRE2_UTF)
2113
4
    compile_options |= PCRE2_NO_UTF_CHECK;
2114
2115
16
  compile_options |= PCRE2_UCP;
2116
2117
  /* compile the pattern */
2118
16
  re = pcre2_compile ((PCRE2_SPTR8) pattern,
2119
16
                      PCRE2_ZERO_TERMINATED,
2120
16
                      compile_options,
2121
16
                      &errcode,
2122
16
                      &erroffset,
2123
16
                      context);
2124
16
  pcre2_compile_context_free (context);
2125
2126
  /* if the compilation failed, set the error member and return
2127
   * immediately */
2128
16
  if (re == NULL)
2129
0
    {
2130
0
      GError *tmp_error;
2131
0
      gchar *offset_str;
2132
0
      gchar *pcre2_errmsg = NULL;
2133
0
      int original_errcode;
2134
2135
      /* Translate the PCRE error code to GRegexError and use a translated
2136
       * error message if possible */
2137
0
      original_errcode = errcode;
2138
0
      translate_compile_error (&errcode, &errmsg);
2139
2140
0
      if (!errmsg)
2141
0
        {
2142
0
          errmsg = _("unknown error");
2143
0
          pcre2_errmsg = get_pcre2_error_string (original_errcode);
2144
0
        }
2145
2146
      /* PCRE uses byte offsets but we want to show character offsets */
2147
0
      erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
2148
2149
0
      offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset);
2150
0
      tmp_error = g_error_new (G_REGEX_ERROR, errcode,
2151
0
                               _("Error while compiling regular expression ‘%s’ "
2152
0
                                 "at char %s: %s"),
2153
0
                               pattern, offset_str,
2154
0
                               pcre2_errmsg ? pcre2_errmsg : errmsg);
2155
0
      g_propagate_error (error, tmp_error);
2156
0
      g_free (offset_str);
2157
0
      g_clear_pointer (&pcre2_errmsg, g_free);
2158
2159
0
      return NULL;
2160
0
    }
2161
2162
16
  return re;
2163
16
}
2164
2165
static uint32_t
2166
get_pcre2_inline_compile_options (pcre2_code *re,
2167
                                  uint32_t    compile_options)
2168
16
{
2169
16
  uint32_t pcre_compile_options;
2170
16
  uint32_t nonpcre_compile_options;
2171
2172
  /* For options set at the beginning of the pattern, pcre puts them into
2173
   * compile options, e.g. "(?i)foo" will make the pcre structure store
2174
   * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */
2175
16
  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
2176
16
  pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options);
2177
16
  compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK;
2178
16
  compile_options |= nonpcre_compile_options;
2179
2180
16
  if (!(compile_options & PCRE2_DUPNAMES))
2181
16
    {
2182
16
      uint32_t jchanged = 0;
2183
16
      pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged);
2184
16
      if (jchanged)
2185
0
        compile_options |= PCRE2_DUPNAMES;
2186
16
    }
2187
2188
16
  return compile_options;
2189
16
}
2190
2191
/**
2192
 * g_regex_get_pattern:
2193
 * @regex: a #GRegex structure
2194
 *
2195
 * Gets the pattern string associated with @regex, i.e. a copy of
2196
 * the string passed to g_regex_new().
2197
 *
2198
 * Returns: the pattern of @regex
2199
 *
2200
 * Since: 2.14
2201
 */
2202
const gchar *
2203
g_regex_get_pattern (const GRegex *regex)
2204
0
{
2205
0
  g_return_val_if_fail (regex != NULL, NULL);
2206
2207
0
  return regex->pattern;
2208
0
}
2209
2210
/**
2211
 * g_regex_get_max_backref:
2212
 * @regex: a #GRegex
2213
 *
2214
 * Returns the number of the highest back reference
2215
 * in the pattern, or 0 if the pattern does not contain
2216
 * back references.
2217
 *
2218
 * Returns: the number of the highest back reference
2219
 *
2220
 * Since: 2.14
2221
 */
2222
gint
2223
g_regex_get_max_backref (const GRegex *regex)
2224
0
{
2225
0
  uint32_t value;
2226
2227
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);
2228
2229
0
  return value;
2230
0
}
2231
2232
/**
2233
 * g_regex_get_capture_count:
2234
 * @regex: a #GRegex
2235
 *
2236
 * Returns the number of capturing subpatterns in the pattern.
2237
 *
2238
 * Returns: the number of capturing subpatterns
2239
 *
2240
 * Since: 2.14
2241
 */
2242
gint
2243
g_regex_get_capture_count (const GRegex *regex)
2244
0
{
2245
0
  uint32_t value;
2246
2247
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);
2248
2249
0
  return value;
2250
0
}
2251
2252
/**
2253
 * g_regex_get_has_cr_or_lf:
2254
 * @regex: a #GRegex structure
2255
 *
2256
 * Checks whether the pattern contains explicit CR or LF references.
2257
 *
2258
 * Returns: %TRUE if the pattern contains explicit CR or LF references
2259
 *
2260
 * Since: 2.34
2261
 */
2262
gboolean
2263
g_regex_get_has_cr_or_lf (const GRegex *regex)
2264
0
{
2265
0
  uint32_t value;
2266
2267
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);
2268
2269
0
  return !!value;
2270
0
}
2271
2272
/**
2273
 * g_regex_get_max_lookbehind:
2274
 * @regex: a #GRegex structure
2275
 *
2276
 * Gets the number of characters in the longest lookbehind assertion in the
2277
 * pattern. This information is useful when doing multi-segment matching using
2278
 * the partial matching facilities.
2279
 *
2280
 * Returns: the number of characters in the longest lookbehind assertion.
2281
 *
2282
 * Since: 2.38
2283
 */
2284
gint
2285
g_regex_get_max_lookbehind (const GRegex *regex)
2286
0
{
2287
0
  uint32_t max_lookbehind;
2288
2289
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,
2290
0
                      &max_lookbehind);
2291
2292
0
  return max_lookbehind;
2293
0
}
2294
2295
/**
2296
 * g_regex_get_compile_flags:
2297
 * @regex: a #GRegex
2298
 *
2299
 * Returns the compile options that @regex was created with.
2300
 *
2301
 * Depending on the version of PCRE that is used, this may or may not
2302
 * include flags set by option expressions such as `(?i)` found at the
2303
 * top-level within the compiled pattern.
2304
 *
2305
 * Returns: flags from #GRegexCompileFlags
2306
 *
2307
 * Since: 2.26
2308
 */
2309
GRegexCompileFlags
2310
g_regex_get_compile_flags (const GRegex *regex)
2311
0
{
2312
0
  GRegexCompileFlags extra_flags;
2313
0
  uint32_t info_value;
2314
2315
0
  g_return_val_if_fail (regex != NULL, 0);
2316
2317
  /* Preserve original G_REGEX_OPTIMIZE */
2318
0
  extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE);
2319
2320
  /* Also include the newline options */
2321
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value);
2322
0
  switch (info_value)
2323
0
    {
2324
0
    case PCRE2_NEWLINE_ANYCRLF:
2325
0
      extra_flags |= G_REGEX_NEWLINE_ANYCRLF;
2326
0
      break;
2327
0
    case PCRE2_NEWLINE_CRLF:
2328
0
      extra_flags |= G_REGEX_NEWLINE_CRLF;
2329
0
      break;
2330
0
    case PCRE2_NEWLINE_LF:
2331
0
      extra_flags |= G_REGEX_NEWLINE_LF;
2332
0
      break;
2333
0
    case PCRE2_NEWLINE_CR:
2334
0
      extra_flags |= G_REGEX_NEWLINE_CR;
2335
0
      break;
2336
0
    default:
2337
0
      break;
2338
0
    }
2339
2340
  /* Also include the bsr options */
2341
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value);
2342
0
  switch (info_value)
2343
0
    {
2344
0
    case PCRE2_BSR_ANYCRLF:
2345
0
      extra_flags |= G_REGEX_BSR_ANYCRLF;
2346
0
      break;
2347
0
    default:
2348
0
      break;
2349
0
    }
2350
2351
0
  return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags;
2352
0
}
2353
2354
/**
2355
 * g_regex_get_match_flags:
2356
 * @regex: a #GRegex
2357
 *
2358
 * Returns the match options that @regex was created with.
2359
 *
2360
 * Returns: flags from #GRegexMatchFlags
2361
 *
2362
 * Since: 2.26
2363
 */
2364
GRegexMatchFlags
2365
g_regex_get_match_flags (const GRegex *regex)
2366
0
{
2367
0
  uint32_t flags;
2368
2369
0
  g_return_val_if_fail (regex != NULL, 0);
2370
2371
0
  flags = g_regex_match_flags_from_pcre2 (regex->match_opts);
2372
0
  flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK);
2373
0
  flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF));
2374
2375
0
  return flags;
2376
0
}
2377
2378
/**
2379
 * g_regex_match_simple:
2380
 * @pattern: the regular expression
2381
 * @string: the string to scan for matches
2382
 * @compile_options: compile options for the regular expression, or 0
2383
 * @match_options: match options, or 0
2384
 *
2385
 * Scans for a match in @string for @pattern.
2386
 *
2387
 * This function is equivalent to g_regex_match() but it does not
2388
 * require to compile the pattern with g_regex_new(), avoiding some
2389
 * lines of code when you need just to do a match without extracting
2390
 * substrings, capture counts, and so on.
2391
 *
2392
 * If this function is to be called on the same @pattern more than
2393
 * once, it's more efficient to compile the pattern once with
2394
 * g_regex_new() and then use g_regex_match().
2395
 *
2396
 * Returns: %TRUE if the string matched, %FALSE otherwise
2397
 *
2398
 * Since: 2.14
2399
 */
2400
gboolean
2401
g_regex_match_simple (const gchar        *pattern,
2402
                      const gchar        *string,
2403
                      GRegexCompileFlags  compile_options,
2404
                      GRegexMatchFlags    match_options)
2405
0
{
2406
0
  GRegex *regex;
2407
0
  gboolean result;
2408
2409
0
  regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL);
2410
0
  if (!regex)
2411
0
    return FALSE;
2412
0
  result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
2413
0
  g_regex_unref (regex);
2414
0
  return result;
2415
0
}
2416
2417
/**
2418
 * g_regex_match:
2419
 * @regex: a #GRegex structure from g_regex_new()
2420
 * @string: the string to scan for matches
2421
 * @match_options: match options
2422
 * @match_info: (out) (optional): pointer to location where to store
2423
 *     the #GMatchInfo, or %NULL if you do not need it
2424
 *
2425
 * Scans for a match in @string for the pattern in @regex.
2426
 * The @match_options are combined with the match options specified
2427
 * when the @regex structure was created, letting you have more
2428
 * flexibility in reusing #GRegex structures.
2429
 *
2430
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2431
 *
2432
 * A #GMatchInfo structure, used to get information on the match,
2433
 * is stored in @match_info if not %NULL. Note that if @match_info
2434
 * is not %NULL then it is created even if the function returns %FALSE,
2435
 * i.e. you must free it regardless of whether the regular expression actually matched.
2436
 *
2437
 * To retrieve all the non-overlapping matches of the pattern in
2438
 * string you can use g_match_info_next().
2439
 *
2440
 * |[<!-- language="C" --> 
2441
 * static void
2442
 * print_uppercase_words (const gchar *string)
2443
 * {
2444
 *   // Print all uppercase-only words.
2445
 *   GRegex *regex;
2446
 *   GMatchInfo *match_info;
2447
 *  
2448
 *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2449
 *   g_regex_match (regex, string, 0, &match_info);
2450
 *   while (g_match_info_matches (match_info))
2451
 *     {
2452
 *       gchar *word = g_match_info_fetch (match_info, 0);
2453
 *       g_print ("Found: %s\n", word);
2454
 *       g_free (word);
2455
 *       g_match_info_next (match_info, NULL);
2456
 *     }
2457
 *   g_match_info_free (match_info);
2458
 *   g_regex_unref (regex);
2459
 * }
2460
 * ]|
2461
 *
2462
 * @string is not copied and is used in #GMatchInfo internally. If
2463
 * you use any #GMatchInfo method (except g_match_info_free()) after
2464
 * freeing or modifying @string then the behaviour is undefined.
2465
 *
2466
 * Returns: %TRUE if the string matched, %FALSE otherwise
2467
 *
2468
 * Since: 2.14
2469
 */
2470
gboolean
2471
g_regex_match (const GRegex      *regex,
2472
               const gchar       *string,
2473
               GRegexMatchFlags   match_options,
2474
               GMatchInfo       **match_info)
2475
131k
{
2476
131k
  return g_regex_match_full (regex, string, -1, 0, match_options,
2477
131k
                             match_info, NULL);
2478
131k
}
2479
2480
/**
2481
 * g_regex_match_full:
2482
 * @regex: a #GRegex structure from g_regex_new()
2483
 * @string: (array length=string_len): the string to scan for matches
2484
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2485
 * @start_position: starting index of the string to match, in bytes
2486
 * @match_options: match options
2487
 * @match_info: (out) (optional): pointer to location where to store
2488
 *     the #GMatchInfo, or %NULL if you do not need it
2489
 * @error: location to store the error occurring, or %NULL to ignore errors
2490
 *
2491
 * Scans for a match in @string for the pattern in @regex.
2492
 * The @match_options are combined with the match options specified
2493
 * when the @regex structure was created, letting you have more
2494
 * flexibility in reusing #GRegex structures.
2495
 *
2496
 * Setting @start_position differs from just passing over a shortened
2497
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2498
 * that begins with any kind of lookbehind assertion, such as "\b".
2499
 *
2500
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2501
 *
2502
 * A #GMatchInfo structure, used to get information on the match, is
2503
 * stored in @match_info if not %NULL. Note that if @match_info is
2504
 * not %NULL then it is created even if the function returns %FALSE,
2505
 * i.e. you must free it regardless of whether the regular expression actually
2506
 * matched.
2507
 *
2508
 * @string is not copied and is used in #GMatchInfo internally. If
2509
 * you use any #GMatchInfo method (except g_match_info_free()) after
2510
 * freeing or modifying @string then the behaviour is undefined.
2511
 *
2512
 * To retrieve all the non-overlapping matches of the pattern in
2513
 * string you can use g_match_info_next().
2514
 *
2515
 * |[<!-- language="C" --> 
2516
 * static void
2517
 * print_uppercase_words (const gchar *string)
2518
 * {
2519
 *   // Print all uppercase-only words.
2520
 *   GRegex *regex;
2521
 *   GMatchInfo *match_info;
2522
 *   GError *error = NULL;
2523
 *   
2524
 *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2525
 *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2526
 *   while (g_match_info_matches (match_info))
2527
 *     {
2528
 *       gchar *word = g_match_info_fetch (match_info, 0);
2529
 *       g_print ("Found: %s\n", word);
2530
 *       g_free (word);
2531
 *       g_match_info_next (match_info, &error);
2532
 *     }
2533
 *   g_match_info_free (match_info);
2534
 *   g_regex_unref (regex);
2535
 *   if (error != NULL)
2536
 *     {
2537
 *       g_printerr ("Error while matching: %s\n", error->message);
2538
 *       g_error_free (error);
2539
 *     }
2540
 * }
2541
 * ]|
2542
 *
2543
 * Returns: %TRUE if the string matched, %FALSE otherwise
2544
 *
2545
 * Since: 2.14
2546
 */
2547
gboolean
2548
g_regex_match_full (const GRegex      *regex,
2549
                    const gchar       *string,
2550
                    gssize             string_len,
2551
                    gint               start_position,
2552
                    GRegexMatchFlags   match_options,
2553
                    GMatchInfo       **match_info,
2554
                    GError           **error)
2555
131k
{
2556
131k
  GMatchInfo *info;
2557
131k
  gboolean match_ok;
2558
2559
131k
  g_return_val_if_fail (regex != NULL, FALSE);
2560
131k
  g_return_val_if_fail (string != NULL, FALSE);
2561
131k
  g_return_val_if_fail (start_position >= 0, FALSE);
2562
131k
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2563
131k
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2564
2565
131k
  info = match_info_new (regex, string, string_len, start_position,
2566
131k
                         match_options, FALSE);
2567
131k
  match_ok = g_match_info_next (info, error);
2568
131k
  if (match_info != NULL)
2569
0
    *match_info = info;
2570
131k
  else
2571
131k
    g_match_info_free (info);
2572
2573
131k
  return match_ok;
2574
131k
}
2575
2576
/**
2577
 * g_regex_match_all:
2578
 * @regex: a #GRegex structure from g_regex_new()
2579
 * @string: the string to scan for matches
2580
 * @match_options: match options
2581
 * @match_info: (out) (optional): pointer to location where to store
2582
 *     the #GMatchInfo, or %NULL if you do not need it
2583
 *
2584
 * Using the standard algorithm for regular expression matching only
2585
 * the longest match in the string is retrieved. This function uses
2586
 * a different algorithm so it can retrieve all the possible matches.
2587
 * For more documentation see g_regex_match_all_full().
2588
 *
2589
 * A #GMatchInfo structure, used to get information on the match, is
2590
 * stored in @match_info if not %NULL. Note that if @match_info is
2591
 * not %NULL then it is created even if the function returns %FALSE,
2592
 * i.e. you must free it regardless of whether the regular expression actually
2593
 * matched.
2594
 *
2595
 * @string is not copied and is used in #GMatchInfo internally. If
2596
 * you use any #GMatchInfo method (except g_match_info_free()) after
2597
 * freeing or modifying @string then the behaviour is undefined.
2598
 *
2599
 * Returns: %TRUE if the string matched, %FALSE otherwise
2600
 *
2601
 * Since: 2.14
2602
 */
2603
gboolean
2604
g_regex_match_all (const GRegex      *regex,
2605
                   const gchar       *string,
2606
                   GRegexMatchFlags   match_options,
2607
                   GMatchInfo       **match_info)
2608
0
{
2609
0
  return g_regex_match_all_full (regex, string, -1, 0, match_options,
2610
0
                                 match_info, NULL);
2611
0
}
2612
2613
/**
2614
 * g_regex_match_all_full:
2615
 * @regex: a #GRegex structure from g_regex_new()
2616
 * @string: (array length=string_len): the string to scan for matches
2617
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2618
 * @start_position: starting index of the string to match, in bytes
2619
 * @match_options: match options
2620
 * @match_info: (out) (optional): pointer to location where to store
2621
 *     the #GMatchInfo, or %NULL if you do not need it
2622
 * @error: location to store the error occurring, or %NULL to ignore errors
2623
 *
2624
 * Using the standard algorithm for regular expression matching only
2625
 * the longest match in the @string is retrieved, it is not possible
2626
 * to obtain all the available matches. For instance matching
2627
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
2628
 * you get `"<a> <b> <c>"`.
2629
 *
2630
 * This function uses a different algorithm (called DFA, i.e. deterministic
2631
 * finite automaton), so it can retrieve all the possible matches, all
2632
 * starting at the same point in the string. For instance matching
2633
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
2634
 * you would obtain three matches: `"<a> <b> <c>"`,
2635
 * `"<a> <b>"` and `"<a>"`.
2636
 *
2637
 * The number of matched strings is retrieved using
2638
 * g_match_info_get_match_count(). To obtain the matched strings and
2639
 * their position you can use, respectively, g_match_info_fetch() and
2640
 * g_match_info_fetch_pos(). Note that the strings are returned in
2641
 * reverse order of length; that is, the longest matching string is
2642
 * given first.
2643
 *
2644
 * Note that the DFA algorithm is slower than the standard one and it
2645
 * is not able to capture substrings, so backreferences do not work.
2646
 *
2647
 * Setting @start_position differs from just passing over a shortened
2648
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2649
 * that begins with any kind of lookbehind assertion, such as "\b".
2650
 *
2651
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2652
 *
2653
 * A #GMatchInfo structure, used to get information on the match, is
2654
 * stored in @match_info if not %NULL. Note that if @match_info is
2655
 * not %NULL then it is created even if the function returns %FALSE,
2656
 * i.e. you must free it regardless of whether the regular expression actually
2657
 * matched.
2658
 *
2659
 * @string is not copied and is used in #GMatchInfo internally. If
2660
 * you use any #GMatchInfo method (except g_match_info_free()) after
2661
 * freeing or modifying @string then the behaviour is undefined.
2662
 *
2663
 * Returns: %TRUE if the string matched, %FALSE otherwise
2664
 *
2665
 * Since: 2.14
2666
 */
2667
gboolean
2668
g_regex_match_all_full (const GRegex      *regex,
2669
                        const gchar       *string,
2670
                        gssize             string_len,
2671
                        gint               start_position,
2672
                        GRegexMatchFlags   match_options,
2673
                        GMatchInfo       **match_info,
2674
                        GError           **error)
2675
0
{
2676
0
  GMatchInfo *info;
2677
0
  gboolean done;
2678
0
  pcre2_code *pcre_re;
2679
0
  gboolean retval;
2680
0
  uint32_t newline_options;
2681
0
  uint32_t bsr_options;
2682
2683
0
  g_return_val_if_fail (regex != NULL, FALSE);
2684
0
  g_return_val_if_fail (string != NULL, FALSE);
2685
0
  g_return_val_if_fail (start_position >= 0, FALSE);
2686
0
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2687
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2688
2689
0
  newline_options = get_pcre2_newline_match_options (match_options);
2690
0
  if (!newline_options)
2691
0
    newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts);
2692
2693
0
  bsr_options = get_pcre2_bsr_match_options (match_options);
2694
0
  if (!bsr_options)
2695
0
    bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts);
2696
2697
  /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an
2698
   * optimization for normal regex matching, but results in omitting some
2699
   * shorter matches here, and an observable behaviour change.
2700
   *
2701
   * DFA matching is rather niche, and very rarely used according to
2702
   * codesearch.debian.net, so don't bother caching the recompiled RE. */
2703
0
  pcre_re = regex_compile (regex->pattern,
2704
0
                           regex->compile_opts | PCRE2_NO_AUTO_POSSESS,
2705
0
                           newline_options, bsr_options, error);
2706
0
  if (pcre_re == NULL)
2707
0
    return FALSE;
2708
2709
0
  info = match_info_new (regex, string, string_len, start_position,
2710
0
                         match_options, TRUE);
2711
2712
0
  done = FALSE;
2713
0
  while (!done)
2714
0
    {
2715
0
      done = TRUE;
2716
0
      info->matches = pcre2_dfa_match (pcre_re,
2717
0
                                       (PCRE2_SPTR8) info->string, info->string_len,
2718
0
                                       info->pos,
2719
0
                                       (regex->match_opts | info->match_opts),
2720
0
                                       info->match_data,
2721
0
                                       info->match_context,
2722
0
                                       info->workspace, info->n_workspace);
2723
0
      if (info->matches == PCRE2_ERROR_DFA_WSSIZE)
2724
0
        {
2725
          /* info->workspace is too small. */
2726
0
          info->n_workspace *= 2;
2727
0
          info->workspace = g_realloc_n (info->workspace,
2728
0
                                         info->n_workspace,
2729
0
                                         sizeof (gint));
2730
0
          done = FALSE;
2731
0
        }
2732
0
      else if (info->matches == 0)
2733
0
        {
2734
          /* info->offsets is too small. */
2735
0
          info->n_offsets *= 2;
2736
0
          info->offsets = g_realloc_n (info->offsets,
2737
0
                                       info->n_offsets,
2738
0
                                       sizeof (gint));
2739
0
          pcre2_match_data_free (info->match_data);
2740
0
          info->match_data = pcre2_match_data_create (info->n_offsets, NULL);
2741
0
          done = FALSE;
2742
0
        }
2743
0
      else if (IS_PCRE2_ERROR (info->matches))
2744
0
        {
2745
0
          gchar *error_msg = get_match_error_message (info->matches);
2746
2747
0
          g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
2748
0
                       _("Error while matching regular expression %s: %s"),
2749
0
                       regex->pattern, error_msg);
2750
0
          g_clear_pointer (&error_msg, g_free);
2751
0
        }
2752
0
      else if (info->matches != PCRE2_ERROR_NOMATCH)
2753
0
        {
2754
0
          if (!recalc_match_offsets (info, error))
2755
0
            info->matches = PCRE2_ERROR_NOMATCH;
2756
0
        }
2757
0
    }
2758
2759
0
  pcre2_code_free (pcre_re);
2760
2761
  /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only
2762
   * holds true for a single match, rather than matching all */
2763
2764
  /* set info->pos to -1 so that a call to g_match_info_next() fails. */
2765
0
  info->pos = -1;
2766
0
  retval = info->matches >= 0;
2767
2768
0
  if (match_info != NULL)
2769
0
    *match_info = info;
2770
0
  else
2771
0
    g_match_info_free (info);
2772
2773
0
  return retval;
2774
0
}
2775
2776
/**
2777
 * g_regex_get_string_number:
2778
 * @regex: #GRegex structure
2779
 * @name: name of the subexpression
2780
 *
2781
 * Retrieves the number of the subexpression named @name.
2782
 *
2783
 * Returns: The number of the subexpression or -1 if @name
2784
 *   does not exist
2785
 *
2786
 * Since: 2.14
2787
 */
2788
gint
2789
g_regex_get_string_number (const GRegex *regex,
2790
                           const gchar  *name)
2791
0
{
2792
0
  gint num;
2793
2794
0
  g_return_val_if_fail (regex != NULL, -1);
2795
0
  g_return_val_if_fail (name != NULL, -1);
2796
2797
0
  num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name);
2798
0
  if (num == PCRE2_ERROR_NOSUBSTRING)
2799
0
    num = -1;
2800
2801
0
  return num;
2802
0
}
2803
2804
/**
2805
 * g_regex_split_simple:
2806
 * @pattern: the regular expression
2807
 * @string: the string to scan for matches
2808
 * @compile_options: compile options for the regular expression, or 0
2809
 * @match_options: match options, or 0
2810
 *
2811
 * Breaks the string on the pattern, and returns an array of
2812
 * the tokens. If the pattern contains capturing parentheses,
2813
 * then the text for each of the substrings will also be returned.
2814
 * If the pattern does not match anywhere in the string, then the
2815
 * whole string is returned as the first token.
2816
 *
2817
 * This function is equivalent to g_regex_split() but it does
2818
 * not require to compile the pattern with g_regex_new(), avoiding
2819
 * some lines of code when you need just to do a split without
2820
 * extracting substrings, capture counts, and so on.
2821
 *
2822
 * If this function is to be called on the same @pattern more than
2823
 * once, it's more efficient to compile the pattern once with
2824
 * g_regex_new() and then use g_regex_split().
2825
 *
2826
 * As a special case, the result of splitting the empty string ""
2827
 * is an empty vector, not a vector containing a single string.
2828
 * The reason for this special case is that being able to represent
2829
 * an empty vector is typically more useful than consistent handling
2830
 * of empty elements. If you do need to represent empty elements,
2831
 * you'll need to check for the empty string before calling this
2832
 * function.
2833
 *
2834
 * A pattern that can match empty strings splits @string into
2835
 * separate characters wherever it matches the empty string between
2836
 * characters. For example splitting "ab c" using as a separator
2837
 * "\s*", you will get "a", "b" and "c".
2838
 *
2839
 * Returns: (transfer full): a %NULL-terminated array of strings. Free
2840
 * it using g_strfreev()
2841
 *
2842
 * Since: 2.14
2843
 **/
2844
gchar **
2845
g_regex_split_simple (const gchar        *pattern,
2846
                      const gchar        *string,
2847
                      GRegexCompileFlags  compile_options,
2848
                      GRegexMatchFlags    match_options)
2849
0
{
2850
0
  GRegex *regex;
2851
0
  gchar **result;
2852
2853
0
  regex = g_regex_new (pattern, compile_options, 0, NULL);
2854
0
  if (!regex)
2855
0
    return NULL;
2856
2857
0
  result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
2858
0
  g_regex_unref (regex);
2859
0
  return result;
2860
0
}
2861
2862
/**
2863
 * g_regex_split:
2864
 * @regex: a #GRegex structure
2865
 * @string: the string to split with the pattern
2866
 * @match_options: match time option flags
2867
 *
2868
 * Breaks the string on the pattern, and returns an array of the tokens.
2869
 * If the pattern contains capturing parentheses, then the text for each
2870
 * of the substrings will also be returned. If the pattern does not match
2871
 * anywhere in the string, then the whole string is returned as the first
2872
 * token.
2873
 *
2874
 * As a special case, the result of splitting the empty string "" is an
2875
 * empty vector, not a vector containing a single string. The reason for
2876
 * this special case is that being able to represent an empty vector is
2877
 * typically more useful than consistent handling of empty elements. If
2878
 * you do need to represent empty elements, you'll need to check for the
2879
 * empty string before calling this function.
2880
 *
2881
 * A pattern that can match empty strings splits @string into separate
2882
 * characters wherever it matches the empty string between characters.
2883
 * For example splitting "ab c" using as a separator "\s*", you will get
2884
 * "a", "b" and "c".
2885
 *
2886
 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2887
 * it using g_strfreev()
2888
 *
2889
 * Since: 2.14
2890
 **/
2891
gchar **
2892
g_regex_split (const GRegex     *regex,
2893
               const gchar      *string,
2894
               GRegexMatchFlags  match_options)
2895
0
{
2896
0
  return g_regex_split_full (regex, string, -1, 0,
2897
0
                             match_options, 0, NULL);
2898
0
}
2899
2900
/**
2901
 * g_regex_split_full:
2902
 * @regex: a #GRegex structure
2903
 * @string: (array length=string_len): the string to split with the pattern
2904
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2905
 * @start_position: starting index of the string to match, in bytes
2906
 * @match_options: match time option flags
2907
 * @max_tokens: the maximum number of tokens to split @string into.
2908
 *   If this is less than 1, the string is split completely
2909
 * @error: return location for a #GError
2910
 *
2911
 * Breaks the string on the pattern, and returns an array of the tokens.
2912
 * If the pattern contains capturing parentheses, then the text for each
2913
 * of the substrings will also be returned. If the pattern does not match
2914
 * anywhere in the string, then the whole string is returned as the first
2915
 * token.
2916
 *
2917
 * As a special case, the result of splitting the empty string "" is an
2918
 * empty vector, not a vector containing a single string. The reason for
2919
 * this special case is that being able to represent an empty vector is
2920
 * typically more useful than consistent handling of empty elements. If
2921
 * you do need to represent empty elements, you'll need to check for the
2922
 * empty string before calling this function.
2923
 *
2924
 * A pattern that can match empty strings splits @string into separate
2925
 * characters wherever it matches the empty string between characters.
2926
 * For example splitting "ab c" using as a separator "\s*", you will get
2927
 * "a", "b" and "c".
2928
 *
2929
 * Setting @start_position differs from just passing over a shortened
2930
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2931
 * that begins with any kind of lookbehind assertion, such as "\b".
2932
 *
2933
 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2934
 * it using g_strfreev()
2935
 *
2936
 * Since: 2.14
2937
 **/
2938
gchar **
2939
g_regex_split_full (const GRegex      *regex,
2940
                    const gchar       *string,
2941
                    gssize             string_len,
2942
                    gint               start_position,
2943
                    GRegexMatchFlags   match_options,
2944
                    gint               max_tokens,
2945
                    GError           **error)
2946
0
{
2947
0
  GError *tmp_error = NULL;
2948
0
  GMatchInfo *match_info;
2949
0
  GList *list, *last;
2950
0
  gint i;
2951
0
  gint token_count;
2952
0
  gboolean match_ok;
2953
  /* position of the last separator. */
2954
0
  gint last_separator_end;
2955
  /* was the last match 0 bytes long? */
2956
0
  gboolean last_match_is_empty;
2957
  /* the returned array of char **s */
2958
0
  gchar **string_list;
2959
2960
0
  g_return_val_if_fail (regex != NULL, NULL);
2961
0
  g_return_val_if_fail (string != NULL, NULL);
2962
0
  g_return_val_if_fail (start_position >= 0, NULL);
2963
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2964
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2965
2966
0
  if (max_tokens <= 0)
2967
0
    max_tokens = G_MAXINT;
2968
2969
0
  if (string_len < 0)
2970
0
    string_len = strlen (string);
2971
2972
  /* zero-length string */
2973
0
  if (string_len - start_position == 0)
2974
0
    return g_new0 (gchar *, 1);
2975
2976
0
  if (max_tokens == 1)
2977
0
    {
2978
0
      string_list = g_new0 (gchar *, 2);
2979
0
      string_list[0] = g_strndup (&string[start_position],
2980
0
                                  string_len - start_position);
2981
0
      return string_list;
2982
0
    }
2983
2984
0
  list = NULL;
2985
0
  token_count = 0;
2986
0
  last_separator_end = start_position;
2987
0
  last_match_is_empty = FALSE;
2988
2989
0
  match_ok = g_regex_match_full (regex, string, string_len, start_position,
2990
0
                                 match_options, &match_info, &tmp_error);
2991
2992
0
  while (tmp_error == NULL)
2993
0
    {
2994
0
      if (match_ok)
2995
0
        {
2996
0
          last_match_is_empty =
2997
0
                    (match_info->offsets[0] == match_info->offsets[1]);
2998
2999
          /* we need to skip empty separators at the same position of the end
3000
           * of another separator. e.g. the string is "a b" and the separator
3001
           * is " *", so from 1 to 2 we have a match and at position 2 we have
3002
           * an empty match. */
3003
0
          if (last_separator_end != match_info->offsets[1])
3004
0
            {
3005
0
              gchar *token;
3006
0
              gint match_count;
3007
3008
0
              token = g_strndup (string + last_separator_end,
3009
0
                                 match_info->offsets[0] - last_separator_end);
3010
0
              list = g_list_prepend (list, token);
3011
0
              token_count++;
3012
3013
              /* if there were substrings, these need to be added to
3014
               * the list. */
3015
0
              match_count = g_match_info_get_match_count (match_info);
3016
0
              if (match_count > 1)
3017
0
                {
3018
0
                  for (i = 1; i < match_count; i++)
3019
0
                    list = g_list_prepend (list, g_match_info_fetch (match_info, i));
3020
0
                }
3021
0
            }
3022
0
        }
3023
0
      else
3024
0
        {
3025
          /* if there was no match, copy to end of string. */
3026
0
          if (!last_match_is_empty)
3027
0
            {
3028
0
              gchar *token = g_strndup (string + last_separator_end,
3029
0
                                        match_info->string_len - last_separator_end);
3030
0
              list = g_list_prepend (list, token);
3031
0
            }
3032
          /* no more tokens, end the loop. */
3033
0
          break;
3034
0
        }
3035
3036
      /* -1 to leave room for the last part. */
3037
0
      if (token_count >= max_tokens - 1)
3038
0
        {
3039
          /* we have reached the maximum number of tokens, so we copy
3040
           * the remaining part of the string. */
3041
0
          if (last_match_is_empty)
3042
0
            {
3043
              /* the last match was empty, so we have moved one char
3044
               * after the real position to avoid empty matches at the
3045
               * same position. */
3046
0
              match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
3047
0
            }
3048
          /* the if is needed in the case we have terminated the available
3049
           * tokens, but we are at the end of the string, so there are no
3050
           * characters left to copy. */
3051
0
          if (string_len > match_info->pos)
3052
0
            {
3053
0
              gchar *token = g_strndup (string + match_info->pos,
3054
0
                                        string_len - match_info->pos);
3055
0
              list = g_list_prepend (list, token);
3056
0
            }
3057
          /* end the loop. */
3058
0
          break;
3059
0
        }
3060
3061
0
      last_separator_end = match_info->pos;
3062
0
      if (last_match_is_empty)
3063
        /* if the last match was empty, g_match_info_next() has moved
3064
         * forward to avoid infinite loops, but we still need to copy that
3065
         * character. */
3066
0
        last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
3067
3068
0
      match_ok = g_match_info_next (match_info, &tmp_error);
3069
0
    }
3070
0
  g_match_info_free (match_info);
3071
0
  if (tmp_error != NULL)
3072
0
    {
3073
0
      g_propagate_error (error, tmp_error);
3074
0
      g_list_free_full (list, g_free);
3075
0
      return NULL;
3076
0
    }
3077
3078
0
  string_list = g_new (gchar *, g_list_length (list) + 1);
3079
0
  i = 0;
3080
0
  for (last = g_list_last (list); last; last = g_list_previous (last))
3081
0
    string_list[i++] = last->data;
3082
0
  string_list[i] = NULL;
3083
0
  g_list_free (list);
3084
3085
0
  return string_list;
3086
0
}
3087
3088
/* Values stored in InterpolationData.type. The numeric values are the
 * same as the implicit ones (0..4); they are only spelled out here. */
enum
{
  REPL_TYPE_STRING             = 0,
  /* a single character kept in InterpolationData.c, e.g. from "\t" */
  REPL_TYPE_CHARACTER          = 1,
  REPL_TYPE_SYMBOLIC_REFERENCE = 2,
  REPL_TYPE_NUMERIC_REFERENCE  = 3,
  REPL_TYPE_CHANGE_CASE        = 4
};
3096
3097
/* Case-change bit flags; each basic value occupies its own bit so the
 * *_MASK values below can group them. */
typedef enum
{
  /* individual flags */
  CHANGE_CASE_NONE         = 1 << 0,
  CHANGE_CASE_UPPER        = 1 << 1,
  CHANGE_CASE_LOWER        = 1 << 2,
  CHANGE_CASE_UPPER_SINGLE = 1 << 3,
  CHANGE_CASE_LOWER_SINGLE = 1 << 4,

  /* either of the two *_SINGLE flags */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two lower-casing flags */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two upper-casing flags */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
3108
3109
struct _InterpolationData
3110
{
3111
  gchar     *text;
3112
  gint       type;
3113
  gint       num;
3114
  gchar      c;
3115
  ChangeCase change_case;
3116
};
3117
3118
static void
3119
free_interpolation_data (InterpolationData *data)
3120
0
{
3121
0
  g_free (data->text);
3122
0
  g_free (data);
3123
0
}
3124
3125
static const gchar *
3126
expand_escape (const gchar        *replacement,
3127
               const gchar        *p,
3128
               InterpolationData  *data,
3129
               GError            **error)
3130
0
{
3131
0
  const gchar *q, *r;
3132
0
  gint x, d, h, i;
3133
0
  const gchar *error_detail;
3134
0
  gint base = 0;
3135
0
  GError *tmp_error = NULL;
3136
3137
0
  p++;
3138
0
  switch (*p)
3139
0
    {
3140
0
    case 't':
3141
0
      p++;
3142
0
      data->c = '\t';
3143
0
      data->type = REPL_TYPE_CHARACTER;
3144
0
      break;
3145
0
    case 'n':
3146
0
      p++;
3147
0
      data->c = '\n';
3148
0
      data->type = REPL_TYPE_CHARACTER;
3149
0
      break;
3150
0
    case 'v':
3151
0
      p++;
3152
0
      data->c = '\v';
3153
0
      data->type = REPL_TYPE_CHARACTER;
3154
0
      break;
3155
0
    case 'r':
3156
0
      p++;
3157
0
      data->c = '\r';
3158
0
      data->type = REPL_TYPE_CHARACTER;
3159
0
      break;
3160
0
    case 'f':
3161
0
      p++;
3162
0
      data->c = '\f';
3163
0
      data->type = REPL_TYPE_CHARACTER;
3164
0
      break;
3165
0
    case 'a':
3166
0
      p++;
3167
0
      data->c = '\a';
3168
0
      data->type = REPL_TYPE_CHARACTER;
3169
0
      break;
3170
0
    case 'b':
3171
0
      p++;
3172
0
      data->c = '\b';
3173
0
      data->type = REPL_TYPE_CHARACTER;
3174
0
      break;
3175
0
    case '\\':
3176
0
      p++;
3177
0
      data->c = '\\';
3178
0
      data->type = REPL_TYPE_CHARACTER;
3179
0
      break;
3180
0
    case 'x':
3181
0
      p++;
3182
0
      x = 0;
3183
0
      if (*p == '{')
3184
0
        {
3185
0
          p++;
3186
0
          do
3187
0
            {
3188
0
              h = g_ascii_xdigit_value (*p);
3189
0
              if (h < 0)
3190
0
                {
3191
0
                  error_detail = _("hexadecimal digit or “}” expected");
3192
0
                  goto error;
3193
0
                }
3194
0
              x = x * 16 + h;
3195
0
              p++;
3196
0
            }
3197
0
          while (*p != '}');
3198
0
          p++;
3199
0
        }
3200
0
      else
3201
0
        {
3202
0
          for (i = 0; i < 2; i++)
3203
0
            {
3204
0
              h = g_ascii_xdigit_value (*p);
3205
0
              if (h < 0)
3206
0
                {
3207
0
                  error_detail = _("hexadecimal digit expected");
3208
0
                  goto error;
3209
0
                }
3210
0
              x = x * 16 + h;
3211
0
              p++;
3212
0
            }
3213
0
        }
3214
0
      data->type = REPL_TYPE_STRING;
3215
0
      data->text = g_new0 (gchar, 8);
3216
0
      g_unichar_to_utf8 (x, data->text);
3217
0
      break;
3218
0
    case 'l':
3219
0
      p++;
3220
0
      data->type = REPL_TYPE_CHANGE_CASE;
3221
0
      data->change_case = CHANGE_CASE_LOWER_SINGLE;
3222
0
      break;
3223
0
    case 'u':
3224
0
      p++;
3225
0
      data->type = REPL_TYPE_CHANGE_CASE;
3226
0
      data->change_case = CHANGE_CASE_UPPER_SINGLE;
3227
0
      break;
3228
0
    case 'L':
3229
0
      p++;
3230
0
      data->type = REPL_TYPE_CHANGE_CASE;
3231
0
      data->change_case = CHANGE_CASE_LOWER;
3232
0
      break;
3233
0
    case 'U':
3234
0
      p++;
3235
0
      data->type = REPL_TYPE_CHANGE_CASE;
3236
0
      data->change_case = CHANGE_CASE_UPPER;
3237
0
      break;
3238
0
    case 'E':
3239
0
      p++;
3240
0
      data->type = REPL_TYPE_CHANGE_CASE;
3241
0
      data->change_case = CHANGE_CASE_NONE;
3242
0
      break;
3243
0
    case 'g':
3244
0
      p++;
3245
0
      if (*p != '<')
3246
0
        {
3247
0
          error_detail = _("missing “<” in symbolic reference");
3248
0
          goto error;
3249
0
        }
3250
0
      q = p + 1;
3251
0
      do
3252
0
        {
3253
0
          p++;
3254
0
          if (!*p)
3255
0
            {
3256
0
              error_detail = _("unfinished symbolic reference");
3257
0
              goto error;
3258
0
            }
3259
0
        }
3260
0
      while (*p != '>');
3261
0
      if (p - q == 0)
3262
0
        {
3263
0
          error_detail = _("zero-length symbolic reference");
3264
0
          goto error;
3265
0
        }
3266
0
      if (g_ascii_isdigit (*q))
3267
0
        {
3268
0
          x = 0;
3269
0
          do
3270
0
            {
3271
0
              h = g_ascii_digit_value (*q);
3272
0
              if (h < 0)
3273
0
                {
3274
0
                  error_detail = _("digit expected");
3275
0
                  p = q;
3276
0
                  goto error;
3277
0
                }
3278
0
              x = x * 10 + h;
3279
0
              q++;
3280
0
            }
3281
0
          while (q != p);
3282
0
          data->num = x;
3283
0
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
3284
0
        }
3285
0
      else
3286
0
        {
3287
0
          r = q;
3288
0
          do
3289
0
            {
3290
0
              if (!g_ascii_isalnum (*r))
3291
0
                {
3292
0
                  error_detail = _("illegal symbolic reference");
3293
0
                  p = r;
3294
0
                  goto error;
3295
0
                }
3296
0
              r++;
3297
0
            }
3298
0
          while (r != p);
3299
0
          data->text = g_strndup (q, p - q);
3300
0
          data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
3301
0
        }
3302
0
      p++;
3303
0
      break;
3304
0
    case '0':
3305
      /* if \0 is followed by a number is an octal number representing a
3306
       * character, else it is a numeric reference. */
3307
0
      if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
3308
0
        {
3309
0
          base = 8;
3310
0
          p = g_utf8_next_char (p);
3311
0
        }
3312
0
      G_GNUC_FALLTHROUGH;
3313
0
    case '1':
3314
0
    case '2':
3315
0
    case '3':
3316
0
    case '4':
3317
0
    case '5':
3318
0
    case '6':
3319
0
    case '7':
3320
0
    case '8':
3321
0
    case '9':
3322
0
      x = 0;
3323
0
      d = 0;
3324
0
      for (i = 0; i < 3; i++)
3325
0
        {
3326
0
          h = g_ascii_digit_value (*p);
3327
0
          if (h < 0)
3328
0
            break;
3329
0
          if (h > 7)
3330
0
            {
3331
0
              if (base == 8)
3332
0
                break;
3333
0
              else
3334
0
                base = 10;
3335
0
            }
3336
0
          if (i == 2 && base == 10)
3337
0
            break;
3338
0
          x = x * 8 + h;
3339
0
          d = d * 10 + h;
3340
0
          p++;
3341
0
        }
3342
0
      if (base == 8 || i == 3)
3343
0
        {
3344
0
          data->type = REPL_TYPE_STRING;
3345
0
          data->text = g_new0 (gchar, 8);
3346
0
          g_unichar_to_utf8 (x, data->text);
3347
0
        }
3348
0
      else
3349
0
        {
3350
0
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
3351
0
          data->num = d;
3352
0
        }
3353
0
      break;
3354
0
    case 0:
3355
0
      error_detail = _("stray final “\\”");
3356
0
      goto error;
3357
0
      break;
3358
0
    default:
3359
0
      error_detail = _("unknown escape sequence");
3360
0
      goto error;
3361
0
    }
3362
3363
0
  return p;
3364
3365
0
 error:
3366
  /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
3367
0
  tmp_error = g_error_new (G_REGEX_ERROR,
3368
0
                           G_REGEX_ERROR_REPLACE,
3369
0
                           _("Error while parsing replacement "
3370
0
                             "text “%s” at char %lu: %s"),
3371
0
                           replacement,
3372
0
                           (gulong)(p - replacement),
3373
0
                           error_detail);
3374
0
  g_propagate_error (error, tmp_error);
3375
3376
0
  return NULL;
3377
0
}
3378
3379
static GList *
3380
split_replacement (const gchar  *replacement,
3381
                   GError      **error)
3382
0
{
3383
0
  GList *list = NULL;
3384
0
  InterpolationData *data;
3385
0
  const gchar *p, *start;
3386
3387
0
  start = p = replacement;
3388
0
  while (*p)
3389
0
    {
3390
0
      if (*p == '\\')
3391
0
        {
3392
0
          data = g_new0 (InterpolationData, 1);
3393
0
          start = p = expand_escape (replacement, p, data, error);
3394
0
          if (p == NULL)
3395
0
            {
3396
0
              g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3397
0
              free_interpolation_data (data);
3398
3399
0
              return NULL;
3400
0
            }
3401
0
          list = g_list_prepend (list, data);
3402
0
        }
3403
0
      else
3404
0
        {
3405
0
          p++;
3406
0
          if (*p == '\\' || *p == '\0')
3407
0
            {
3408
0
              if (p - start > 0)
3409
0
                {
3410
0
                  data = g_new0 (InterpolationData, 1);
3411
0
                  data->text = g_strndup (start, p - start);
3412
0
                  data->type = REPL_TYPE_STRING;
3413
0
                  list = g_list_prepend (list, data);
3414
0
                }
3415
0
            }
3416
0
        }
3417
0
    }
3418
3419
0
  return g_list_reverse (list);
3420
0
}
3421
3422
/* Convert the character c according to change_case: to lower case when
 * any CHANGE_CASE_LOWER_MASK bit is set, to upper case otherwise. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))
3427
3428
static void
3429
string_append (GString     *string,
3430
               const gchar *text,
3431
               ChangeCase  *change_case)
3432
0
{
3433
0
  gunichar c;
3434
3435
0
  if (text[0] == '\0')
3436
0
    return;
3437
3438
0
  if (*change_case == CHANGE_CASE_NONE)
3439
0
    {
3440
0
      g_string_append (string, text);
3441
0
    }
3442
0
  else if (*change_case & CHANGE_CASE_SINGLE_MASK)
3443
0
    {
3444
0
      c = g_utf8_get_char (text);
3445
0
      g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3446
0
      g_string_append (string, g_utf8_next_char (text));
3447
0
      *change_case = CHANGE_CASE_NONE;
3448
0
    }
3449
0
  else
3450
0
    {
3451
0
      while (*text != '\0')
3452
0
        {
3453
0
          c = g_utf8_get_char (text);
3454
0
          g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3455
0
          text = g_utf8_next_char (text);
3456
0
        }
3457
0
    }
3458
0
}
3459
3460
static gboolean
3461
interpolate_replacement (const GMatchInfo *match_info,
3462
                         GString          *result,
3463
                         gpointer          data)
3464
0
{
3465
0
  GList *list;
3466
0
  InterpolationData *idata;
3467
0
  gchar *match;
3468
0
  ChangeCase change_case = CHANGE_CASE_NONE;
3469
3470
0
  for (list = data; list; list = list->next)
3471
0
    {
3472
0
      idata = list->data;
3473
0
      switch (idata->type)
3474
0
        {
3475
0
        case REPL_TYPE_STRING:
3476
0
          string_append (result, idata->text, &change_case);
3477
0
          break;
3478
0
        case REPL_TYPE_CHARACTER:
3479
0
          g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
3480
0
          if (change_case & CHANGE_CASE_SINGLE_MASK)
3481
0
            change_case = CHANGE_CASE_NONE;
3482
0
          break;
3483
0
        case REPL_TYPE_NUMERIC_REFERENCE:
3484
0
          match = g_match_info_fetch (match_info, idata->num);
3485
0
          if (match)
3486
0
            {
3487
0
              string_append (result, match, &change_case);
3488
0
              g_free (match);
3489
0
            }
3490
0
          break;
3491
0
        case REPL_TYPE_SYMBOLIC_REFERENCE:
3492
0
          match = g_match_info_fetch_named (match_info, idata->text);
3493
0
          if (match)
3494
0
            {
3495
0
              string_append (result, match, &change_case);
3496
0
              g_free (match);
3497
0
            }
3498
0
          break;
3499
0
        case REPL_TYPE_CHANGE_CASE:
3500
0
          change_case = idata->change_case;
3501
0
          break;
3502
0
        }
3503
0
    }
3504
3505
0
  return FALSE;
3506
0
}
3507
3508
/* whether actual match_info is needed for replacement, i.e.
3509
 * whether there are references
3510
 */
3511
static gboolean
3512
interpolation_list_needs_match (GList *list)
3513
0
{
3514
0
  while (list != NULL)
3515
0
    {
3516
0
      InterpolationData *data = list->data;
3517
3518
0
      if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
3519
0
          data->type == REPL_TYPE_NUMERIC_REFERENCE)
3520
0
        {
3521
0
          return TRUE;
3522
0
        }
3523
3524
0
      list = list->next;
3525
0
    }
3526
3527
0
  return FALSE;
3528
0
}
3529
3530
/**
3531
 * g_regex_replace:
3532
 * @regex: a #GRegex structure
3533
 * @string: (array length=string_len): the string to perform matches against
3534
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3535
 * @start_position: starting index of the string to match, in bytes
3536
 * @replacement: text to replace each match with
3537
 * @match_options: options for the match
3538
 * @error: location to store the error occurring, or %NULL to ignore errors
3539
 *
3540
 * Replaces all occurrences of the pattern in @regex with the
3541
 * replacement text. Backreferences of the form `\number` or
3542
 * `\g<number>` in the replacement text are interpolated by the
3543
 * number-th captured subexpression of the match, `\g<name>` refers
3544
 * to the captured subexpression with the given name. `\0` refers
3545
 * to the complete match, but `\0` followed by a number is the octal
3546
 * representation of a character. To include a literal `\` in the
3547
 * replacement, write `\\\\`.
3548
 *
3549
 * There are also escapes that changes the case of the following text:
3550
 *
3551
 * - \l: Convert to lower case the next character
3552
 * - \u: Convert to upper case the next character
3553
 * - \L: Convert to lower case till \E
3554
 * - \U: Convert to upper case till \E
3555
 * - \E: End case modification
3556
 *
3557
 * If you do not need to use backreferences use g_regex_replace_literal().
3558
 *
3559
 * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was
3560
 * passed to g_regex_new(). If you want to use not UTF-8 encoded strings
3561
 * you can use g_regex_replace_literal().
3562
 *
3563
 * Setting @start_position differs from just passing over a shortened
3564
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that
3565
 * begins with any kind of lookbehind assertion, such as "\b".
3566
 *
3567
 * Returns: a newly allocated string containing the replacements
3568
 *
3569
 * Since: 2.14
3570
 */
3571
gchar *
3572
g_regex_replace (const GRegex      *regex,
3573
                 const gchar       *string,
3574
                 gssize             string_len,
3575
                 gint               start_position,
3576
                 const gchar       *replacement,
3577
                 GRegexMatchFlags   match_options,
3578
                 GError           **error)
3579
0
{
3580
0
  gchar *result;
3581
0
  GList *list;
3582
0
  GError *tmp_error = NULL;
3583
3584
0
  g_return_val_if_fail (regex != NULL, NULL);
3585
0
  g_return_val_if_fail (string != NULL, NULL);
3586
0
  g_return_val_if_fail (start_position >= 0, NULL);
3587
0
  g_return_val_if_fail (replacement != NULL, NULL);
3588
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
3589
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3590
3591
0
  list = split_replacement (replacement, &tmp_error);
3592
0
  if (tmp_error != NULL)
3593
0
    {
3594
0
      g_propagate_error (error, tmp_error);
3595
0
      return NULL;
3596
0
    }
3597
3598
0
  result = g_regex_replace_eval (regex,
3599
0
                                 string, string_len, start_position,
3600
0
                                 match_options,
3601
0
                                 interpolate_replacement,
3602
0
                                 (gpointer)list,
3603
0
                                 &tmp_error);
3604
0
  if (tmp_error != NULL)
3605
0
    g_propagate_error (error, tmp_error);
3606
3607
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3608
3609
0
  return result;
3610
0
}
3611
3612
static gboolean
3613
literal_replacement (const GMatchInfo *match_info,
3614
                     GString          *result,
3615
                     gpointer          data)
3616
0
{
3617
0
  g_string_append (result, data);
3618
0
  return FALSE;
3619
0
}
3620
3621
/**
3622
 * g_regex_replace_literal:
3623
 * @regex: a #GRegex structure
3624
 * @string: (array length=string_len): the string to perform matches against
3625
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3626
 * @start_position: starting index of the string to match, in bytes
3627
 * @replacement: text to replace each match with
3628
 * @match_options: options for the match
3629
 * @error: location to store the error occurring, or %NULL to ignore errors
3630
 *
3631
 * Replaces all occurrences of the pattern in @regex with the
3632
 * replacement text. @replacement is replaced literally, to
3633
 * include backreferences use g_regex_replace().
3634
 *
3635
 * Setting @start_position differs from just passing over a
3636
 * shortened string and setting %G_REGEX_MATCH_NOTBOL in the
3637
 * case of a pattern that begins with any kind of lookbehind
3638
 * assertion, such as "\b".
3639
 *
3640
 * Returns: a newly allocated string containing the replacements
3641
 *
3642
 * Since: 2.14
3643
 */
3644
gchar *
3645
g_regex_replace_literal (const GRegex      *regex,
3646
                         const gchar       *string,
3647
                         gssize             string_len,
3648
                         gint               start_position,
3649
                         const gchar       *replacement,
3650
                         GRegexMatchFlags   match_options,
3651
                         GError           **error)
3652
0
{
3653
0
  g_return_val_if_fail (replacement != NULL, NULL);
3654
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3655
3656
0
  return g_regex_replace_eval (regex,
3657
0
                               string, string_len, start_position,
3658
0
                               match_options,
3659
0
                               literal_replacement,
3660
0
                               (gpointer)replacement,
3661
0
                               error);
3662
0
}
3663
3664
/**
3665
 * g_regex_replace_eval:
3666
 * @regex: a #GRegex structure from g_regex_new()
3667
 * @string: (array length=string_len): string to perform matches against
3668
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3669
 * @start_position: starting index of the string to match, in bytes
3670
 * @match_options: options for the match
3671
 * @eval: (scope call): a function to call for each match
3672
 * @user_data: user data to pass to the function
3673
 * @error: location to store the error occurring, or %NULL to ignore errors
3674
 *
3675
 * Replaces occurrences of the pattern in regex with the output of
3676
 * @eval for that occurrence.
3677
 *
3678
 * Setting @start_position differs from just passing over a shortened
3679
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
3680
 * that begins with any kind of lookbehind assertion, such as "\b".
3681
 *
3682
 * The following example uses g_regex_replace_eval() to replace multiple
3683
 * strings at once:
3684
 * |[<!-- language="C" --> 
3685
 * static gboolean
3686
 * eval_cb (const GMatchInfo *info,
3687
 *          GString          *res,
3688
 *          gpointer          data)
3689
 * {
3690
 *   gchar *match;
3691
 *   gchar *r;
3692
 *
3693
 *    match = g_match_info_fetch (info, 0);
3694
 *    r = g_hash_table_lookup ((GHashTable *)data, match);
3695
 *    g_string_append (res, r);
3696
 *    g_free (match);
3697
 *
3698
 *    return FALSE;
3699
 * }
3700
 *
3701
 * ...
3702
 *
3703
 * GRegex *reg;
3704
 * GHashTable *h;
3705
 * gchar *res;
3706
 *
3707
 * h = g_hash_table_new (g_str_hash, g_str_equal);
3708
 *
3709
 * g_hash_table_insert (h, "1", "ONE");
3710
 * g_hash_table_insert (h, "2", "TWO");
3711
 * g_hash_table_insert (h, "3", "THREE");
3712
 * g_hash_table_insert (h, "4", "FOUR");
3713
 *
3714
 * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
3715
 * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3716
 * g_hash_table_destroy (h);
3717
 *
3718
 * ...
3719
 * ]|
3720
 *
3721
 * Returns: a newly allocated string containing the replacements
3722
 *
3723
 * Since: 2.14
3724
 */
3725
gchar *
3726
g_regex_replace_eval (const GRegex        *regex,
3727
                      const gchar         *string,
3728
                      gssize               string_len,
3729
                      gint                 start_position,
3730
                      GRegexMatchFlags     match_options,
3731
                      GRegexEvalCallback   eval,
3732
                      gpointer             user_data,
3733
                      GError             **error)
3734
0
{
3735
0
  GMatchInfo *match_info;
3736
0
  GString *result;
3737
0
  gint str_pos = 0;
3738
0
  gboolean done = FALSE;
3739
0
  GError *tmp_error = NULL;
3740
3741
0
  g_return_val_if_fail (regex != NULL, NULL);
3742
0
  g_return_val_if_fail (string != NULL, NULL);
3743
0
  g_return_val_if_fail (start_position >= 0, NULL);
3744
0
  g_return_val_if_fail (eval != NULL, NULL);
3745
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3746
3747
0
  if (string_len < 0)
3748
0
    string_len = strlen (string);
3749
3750
0
  result = g_string_sized_new (string_len);
3751
3752
  /* run down the string making matches. */
3753
0
  g_regex_match_full (regex, string, string_len, start_position,
3754
0
                      match_options, &match_info, &tmp_error);
3755
0
  while (!done && g_match_info_matches (match_info))
3756
0
    {
3757
0
      g_string_append_len (result,
3758
0
                           string + str_pos,
3759
0
                           match_info->offsets[0] - str_pos);
3760
0
      done = (*eval) (match_info, result, user_data);
3761
0
      str_pos = match_info->offsets[1];
3762
0
      g_match_info_next (match_info, &tmp_error);
3763
0
    }
3764
0
  g_match_info_free (match_info);
3765
0
  if (tmp_error != NULL)
3766
0
    {
3767
0
      g_propagate_error (error, tmp_error);
3768
0
      g_string_free (result, TRUE);
3769
0
      return NULL;
3770
0
    }
3771
3772
0
  g_string_append_len (result, string + str_pos, string_len - str_pos);
3773
0
  return g_string_free (result, FALSE);
3774
0
}
3775
3776
/**
3777
 * g_regex_check_replacement:
3778
 * @replacement: the replacement string
3779
 * @has_references: (out) (optional): location to store information about
3780
 *   references in @replacement or %NULL
3781
 * @error: location to store error
3782
 *
3783
 * Checks whether @replacement is a valid replacement string
3784
 * (see g_regex_replace()), i.e. that all escape sequences in
3785
 * it are valid.
3786
 *
3787
 * If @has_references is not %NULL then @replacement is checked
3788
 * for pattern references. For instance, replacement text 'foo\n'
3789
 * does not contain references and may be evaluated without information
3790
 * about actual match, but '\0\1' (whole match followed by first
3791
 * subpattern) requires valid #GMatchInfo object.
3792
 *
3793
 * Returns: whether @replacement is a valid replacement string
3794
 *
3795
 * Since: 2.14
3796
 */
3797
gboolean
3798
g_regex_check_replacement (const gchar  *replacement,
3799
                           gboolean     *has_references,
3800
                           GError      **error)
3801
0
{
3802
0
  GList *list;
3803
0
  GError *tmp = NULL;
3804
3805
0
  list = split_replacement (replacement, &tmp);
3806
3807
0
  if (tmp)
3808
0
  {
3809
0
    g_propagate_error (error, tmp);
3810
0
    return FALSE;
3811
0
  }
3812
3813
0
  if (has_references)
3814
0
    *has_references = interpolation_list_needs_match (list);
3815
3816
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3817
3818
0
  return TRUE;
3819
0
}
3820
3821
/**
3822
 * g_regex_escape_nul:
3823
 * @string: the string to escape
3824
 * @length: the length of @string
3825
 *
3826
 * Escapes the nul characters in @string to "\x00".  It can be used
3827
 * to compile a regex with embedded nul characters.
3828
 *
3829
 * For completeness, @length can be -1 for a nul-terminated string.
3830
 * In this case the output string will be of course equal to @string.
3831
 *
3832
 * Returns: a newly-allocated escaped string
3833
 *
3834
 * Since: 2.30
3835
 */
3836
gchar *
3837
g_regex_escape_nul (const gchar *string,
3838
                    gint         length)
3839
0
{
3840
0
  GString *escaped;
3841
0
  const gchar *p, *piece_start, *end;
3842
0
  gint backslashes;
3843
3844
0
  g_return_val_if_fail (string != NULL, NULL);
3845
3846
0
  if (length < 0)
3847
0
    return g_strdup (string);
3848
3849
0
  end = string + length;
3850
0
  p = piece_start = string;
3851
0
  escaped = g_string_sized_new (length + 1);
3852
3853
0
  backslashes = 0;
3854
0
  while (p < end)
3855
0
    {
3856
0
      switch (*p)
3857
0
        {
3858
0
        case '\0':
3859
0
          if (p != piece_start)
3860
0
            {
3861
              /* copy the previous piece. */
3862
0
              g_string_append_len (escaped, piece_start, p - piece_start);
3863
0
            }
3864
0
          if ((backslashes & 1) == 0)
3865
0
            g_string_append_c (escaped, '\\');
3866
0
          g_string_append_c (escaped, 'x');
3867
0
          g_string_append_c (escaped, '0');
3868
0
          g_string_append_c (escaped, '0');
3869
0
          piece_start = ++p;
3870
0
          backslashes = 0;
3871
0
          break;
3872
0
        case '\\':
3873
0
          backslashes++;
3874
0
          ++p;
3875
0
          break;
3876
0
        default:
3877
0
          backslashes = 0;
3878
0
          p = g_utf8_next_char (p);
3879
0
          break;
3880
0
        }
3881
0
    }
3882
3883
0
  if (piece_start < end)
3884
0
    g_string_append_len (escaped, piece_start, end - piece_start);
3885
3886
0
  return g_string_free (escaped, FALSE);
3887
0
}
3888
3889
/**
3890
 * g_regex_escape_string:
3891
 * @string: the string to escape
3892
 * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3893
 *
3894
 * Escapes the special characters used for regular expressions
3895
 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3896
 * function is useful to dynamically generate regular expressions.
3897
 *
3898
 * @string can contain nul characters that are replaced with "\0",
3899
 * in this case remember to specify the correct length of @string
3900
 * in @length.
3901
 *
3902
 * Returns: a newly-allocated escaped string
3903
 *
3904
 * Since: 2.14
3905
 */
3906
gchar *
3907
g_regex_escape_string (const gchar *string,
3908
                       gint         length)
3909
0
{
3910
0
  GString *escaped;
3911
0
  const char *p, *piece_start, *end;
3912
3913
0
  g_return_val_if_fail (string != NULL, NULL);
3914
3915
0
  if (length < 0)
3916
0
    length = strlen (string);
3917
3918
0
  end = string + length;
3919
0
  p = piece_start = string;
3920
0
  escaped = g_string_sized_new (length + 1);
3921
3922
0
  while (p < end)
3923
0
    {
3924
0
      switch (*p)
3925
0
        {
3926
0
        case '\0':
3927
0
        case '\\':
3928
0
        case '|':
3929
0
        case '(':
3930
0
        case ')':
3931
0
        case '[':
3932
0
        case ']':
3933
0
        case '{':
3934
0
        case '}':
3935
0
        case '^':
3936
0
        case '$':
3937
0
        case '*':
3938
0
        case '+':
3939
0
        case '?':
3940
0
        case '.':
3941
0
          if (p != piece_start)
3942
            /* copy the previous piece. */
3943
0
            g_string_append_len (escaped, piece_start, p - piece_start);
3944
0
          g_string_append_c (escaped, '\\');
3945
0
          if (*p == '\0')
3946
0
            g_string_append_c (escaped, '0');
3947
0
          else
3948
0
            g_string_append_c (escaped, *p);
3949
0
          piece_start = ++p;
3950
0
          break;
3951
0
        default:
3952
0
          p = g_utf8_next_char (p);
3953
0
          break;
3954
0
        }
3955
0
  }
3956
3957
0
  if (piece_start < end)
3958
0
    g_string_append_len (escaped, piece_start, end - piece_start);
3959
3960
0
  return g_string_free (escaped, FALSE);
3961
0
}