Coverage Report

Created: 2025-08-28 06:24

/src/glib/glib/gregex.c
Line
Count
Source (jump to first uncovered line)
1
/* GRegex -- regular expression API wrapper around PCRE.
2
 *
3
 * Copyright (C) 1999, 2000 Scott Wimer
4
 * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5
 * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
6
 * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com>
7
 *
8
 * SPDX-License-Identifier: LGPL-2.1-or-later
9
 *
10
 * This library is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * This library is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public License
21
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
22
 */
23
24
#include "config.h"
25
26
#include <stdint.h>
27
#include <string.h>
28
29
#define PCRE2_CODE_UNIT_WIDTH 8
30
#include <pcre2.h>
31
32
#include "gtypes.h"
33
#include "gregex.h"
34
#include "glibintl.h"
35
#include "glist.h"
36
#include "gmessages.h"
37
#include "gstrfuncs.h"
38
#include "gatomic.h"
39
#include "gtestutils.h"
40
#include "gthread.h"
41
42
/**
43
 * GRegex:
44
 *
45
 * A `GRegex` is a compiled form of a regular expression.
46
 * 
47
 * After instantiating a `GRegex`, you can use its methods to find matches
48
 * in a string, replace matches within a string, or split the string at matches.
49
 *
50
 * `GRegex` implements regular expression pattern matching using syntax and 
51
 * semantics (such as character classes, quantifiers, and capture groups) 
52
 * similar to Perl regular expressions. See the 
53
 * [PCRE documentation](man:pcre2pattern(3)) for details.
54
 *
55
 * A typical scenario for regex pattern matching is to check if a string 
56
 * matches a pattern. The following statements implement this scenario.
57
 * 
58
 * ``` { .c }
59
 * const char *regex_pattern = ".*GLib.*";
60
 * const char *string_to_search = "You will love the GLib implementation of regex";
61
 * g_autoptr(GMatchInfo) match_info = NULL;
62
 * g_autoptr(GRegex) regex = NULL;
63
 *
64
 * regex = g_regex_new (regex_pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
65
 * g_assert (regex != NULL);
66
 * 
67
 * if (g_regex_match (regex, string_to_search, G_REGEX_MATCH_DEFAULT, &match_info))
68
 *   {
69
 *     int start_pos, end_pos;
70
 *     g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos);
71
 *     g_print ("Match successful! Overall pattern matches bytes %d to %d\n", start_pos, end_pos);
72
 *   }
73
 * else
74
 *   {
75
 *     g_print ("No match!\n");
76
 *   }
77
 * ```
78
 * 
79
 * The constructor for `GRegex` includes two sets of bitmapped flags:
80
81
 * * [flags@GLib.RegexCompileFlags]—These flags 
82
 * control how GLib compiles the regex. There are options for case 
83
 * sensitivity, multiline, ignoring whitespace, etc.
84
 * * [flags@GLib.RegexMatchFlags]—These flags control 
85
 * `GRegex`’s matching behavior, such as anchoring and customizing definitions 
86
 * for newline characters.
87
 * 
88
 * Some regex patterns include backslash assertions, such as `\d` (digit) or 
89
 * `\D` (non-digit). The regex pattern must escape those backslashes. For 
90
 * example, the pattern `"\\d\\D"` matches a digit followed by a non-digit.
91
 *
92
 * GLib’s implementation of pattern matching includes a `start_position` 
93
 * argument for some of the match, replace, and split methods. Specifying 
94
 * a start position provides flexibility when you want to ignore the first 
95
 * _n_ characters of a string, but want to incorporate backslash assertions 
96
 * at character _n_ - 1. For example, a database field contains inconsistent
97
 * spelling for a job title: `healthcare provider` and `health-care provider`.
98
 * The database manager wants to make the spelling consistent by adding a 
99
 * hyphen when it is missing. The following regex pattern tests for the string 
100
 * `care` preceded by a non-word boundary character (instead of a hyphen) 
101
 * and followed by a space.
102
 *
103
 * ``` { .c }
104
 * const char *regex_pattern = "\\Bcare\\s";
105
 * ```
106
 *
107
 * An efficient way to match with this pattern is to start examining at 
108
 * `start_position` 6 in the string `healthcare` or `health-care`.
109
110
 * ``` { .c }
111
 * const char *regex_pattern = "\\Bcare\\s";
112
 * const char *string_to_search = "healthcare provider";
113
 * g_autoptr(GMatchInfo) match_info = NULL;
114
 * g_autoptr(GRegex) regex = NULL;
115
 *
116
 * regex = g_regex_new (
117
 *   regex_pattern,
118
 *   G_REGEX_DEFAULT,
119
 *   G_REGEX_MATCH_DEFAULT,
120
 *   NULL);
121
 * g_assert (regex != NULL);
122
 * 
123
 * g_regex_match_full (
124
 *   regex, 
125
 *   string_to_search, 
126
 *   -1,
127
 *   6, // position of 'c' in the test string.
128
 *   G_REGEX_MATCH_DEFAULT, 
129
 *   &match_info,
130
 *   NULL);
131
 * ```
132
 * 
133
 * The method [method@GLib.Regex.match_full] (and other methods implementing 
134
 * `start_pos`) allow for lookback before the start position to determine if 
135
 * the previous character satisfies an assertion.
136
 *
137
 * Unless you set the [flags@GLib.RegexCompileFlags.RAW] as one of 
138
 * the `GRegexCompileFlags`, all the strings passed to `GRegex` methods must 
139
 * be encoded in UTF-8. The lengths and the positions inside the strings are 
140
 * in bytes and not in characters, so, for instance, `\xc3\xa0` (i.e., `à`) 
141
 * is two bytes long but it is treated as a single character. If you set 
142
 * `G_REGEX_RAW`, the strings can be non-valid UTF-8 strings and a byte is 
143
 * treated as a character, so `\xc3\xa0` is two bytes and two characters long.
144
 *
145
 * Regarding line endings, `\n` matches a `\n` character, and `\r` matches 
146
 * a `\r` character. More generally, `\R` matches all typical line endings: 
147
 * CR + LF (`\r\n`), LF (linefeed, U+000A, `\n`), VT (vertical tab, U+000B, 
148
 * `\v`), FF (formfeed, U+000C, `\f`), CR (carriage return, U+000D, `\r`), 
149
 * NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph 
150
 * separator, U+2029).
151
 * 
152
 * The behaviour of the dot, circumflex, and dollar metacharacters is 
153
 * affected by newline characters. By default, `GRegex` matches any newline 
154
 * character matched by `\R`. You can limit the matched newline characters by 
155
 * specifying the [flags@GLib.RegexCompileFlags.NEWLINE_CR], 
155
 * [flags@GLib.RegexCompileFlags.NEWLINE_LF], and 
156
 * [flags@GLib.RegexCompileFlags.NEWLINE_CRLF] compile options, and 
158
 * with [flags@GLib.RegexMatchFlags.NEWLINE_ANY], 
159
 * [flags@GLib.RegexMatchFlags.NEWLINE_CR], 
160
 * [flags@GLib.RegexMatchFlags.NEWLINE_LF] and 
161
 * [flags@GLib.RegexMatchFlags.NEWLINE_CRLF] match options. 
162
 * These settings are also relevant when compiling a pattern if 
163
 * [flags@GLib.RegexCompileFlags.EXTENDED] is set and an unescaped 
164
 * `#` outside a character class is encountered. This indicates a comment 
165
 * that lasts until after the next newline.
166
 * 
167
 * Because `GRegex` does not modify its internal state between creation and 
168
 * destruction, you can create and modify the same `GRegex` instance from 
169
 * different threads. In contrast, [struct@GLib.MatchInfo] is not thread safe.
170
 * 
171
 * The regular expression low-level functionalities are obtained through
172
 * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel.
173
 *
174
 * Since: 2.14
175
 */
176
177
0
#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED       | \
178
0
                                   PCRE2_NO_UTF_CHECK   | \
179
0
                                   PCRE2_ENDANCHORED)
180
181
/* Mask of all the possible values for GRegexCompileFlags. */
182
0
#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT          | \
183
0
                              G_REGEX_CASELESS         | \
184
0
                              G_REGEX_MULTILINE        | \
185
0
                              G_REGEX_DOTALL           | \
186
0
                              G_REGEX_EXTENDED         | \
187
0
                              G_REGEX_ANCHORED         | \
188
0
                              G_REGEX_DOLLAR_ENDONLY   | \
189
0
                              G_REGEX_UNGREEDY         | \
190
0
                              G_REGEX_RAW              | \
191
0
                              G_REGEX_NO_AUTO_CAPTURE  | \
192
0
                              G_REGEX_OPTIMIZE         | \
193
0
                              G_REGEX_FIRSTLINE        | \
194
0
                              G_REGEX_DUPNAMES         | \
195
0
                              G_REGEX_NEWLINE_CR       | \
196
0
                              G_REGEX_NEWLINE_LF       | \
197
0
                              G_REGEX_NEWLINE_CRLF     | \
198
0
                              G_REGEX_NEWLINE_ANYCRLF  | \
199
0
                              G_REGEX_BSR_ANYCRLF)
200
201
0
#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS    | \
202
0
                                    PCRE2_ALT_BSUX             | \
203
0
                                    PCRE2_AUTO_CALLOUT         | \
204
0
                                    PCRE2_CASELESS             | \
205
0
                                    PCRE2_DOLLAR_ENDONLY       | \
206
0
                                    PCRE2_DOTALL               | \
207
0
                                    PCRE2_DUPNAMES             | \
208
0
                                    PCRE2_EXTENDED             | \
209
0
                                    PCRE2_FIRSTLINE            | \
210
0
                                    PCRE2_MATCH_UNSET_BACKREF  | \
211
0
                                    PCRE2_MULTILINE            | \
212
0
                                    PCRE2_NEVER_UCP            | \
213
0
                                    PCRE2_NEVER_UTF            | \
214
0
                                    PCRE2_NO_AUTO_CAPTURE      | \
215
0
                                    PCRE2_NO_AUTO_POSSESS      | \
216
0
                                    PCRE2_NO_DOTSTAR_ANCHOR    | \
217
0
                                    PCRE2_NO_START_OPTIMIZE    | \
218
0
                                    PCRE2_UCP                  | \
219
0
                                    PCRE2_UNGREEDY             | \
220
0
                                    PCRE2_UTF                  | \
221
0
                                    PCRE2_NEVER_BACKSLASH_C    | \
222
0
                                    PCRE2_ALT_CIRCUMFLEX       | \
223
0
                                    PCRE2_ALT_VERBNAMES        | \
224
0
                                    PCRE2_USE_OFFSET_LIMIT     | \
225
0
                                    PCRE2_EXTENDED_MORE        | \
226
0
                                    PCRE2_LITERAL              | \
227
0
                                    PCRE2_MATCH_INVALID_UTF    | \
228
0
                                    G_REGEX_PCRE_GENERIC_MASK)
229
230
0
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
231
232
/* Mask of all the possible values for GRegexMatchFlags. */
233
0
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT          | \
234
0
                            G_REGEX_MATCH_ANCHORED         | \
235
0
                            G_REGEX_MATCH_NOTBOL           | \
236
0
                            G_REGEX_MATCH_NOTEOL           | \
237
0
                            G_REGEX_MATCH_NOTEMPTY         | \
238
0
                            G_REGEX_MATCH_PARTIAL          | \
239
0
                            G_REGEX_MATCH_NEWLINE_CR       | \
240
0
                            G_REGEX_MATCH_NEWLINE_LF       | \
241
0
                            G_REGEX_MATCH_NEWLINE_CRLF     | \
242
0
                            G_REGEX_MATCH_NEWLINE_ANY      | \
243
0
                            G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
244
0
                            G_REGEX_MATCH_BSR_ANYCRLF      | \
245
0
                            G_REGEX_MATCH_BSR_ANY          | \
246
0
                            G_REGEX_MATCH_PARTIAL_SOFT     | \
247
0
                            G_REGEX_MATCH_PARTIAL_HARD     | \
248
0
                            G_REGEX_MATCH_NOTEMPTY_ATSTART)
249
250
0
#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL                      |\
251
0
                                  PCRE2_NOTEOL                      |\
252
0
                                  PCRE2_NOTEMPTY                    |\
253
0
                                  PCRE2_NOTEMPTY_ATSTART            |\
254
0
                                  PCRE2_PARTIAL_SOFT                |\
255
0
                                  PCRE2_PARTIAL_HARD                |\
256
0
                                  PCRE2_NO_JIT                      |\
257
0
                                  PCRE2_COPY_MATCHED_SUBJECT        |\
258
0
                                  G_REGEX_PCRE_GENERIC_MASK)
259
260
/* TODO: Support PCRE2_NEWLINE_NUL */
261
#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR |     \
262
                              PCRE2_NEWLINE_LF |     \
263
                              PCRE2_NEWLINE_CRLF |   \
264
                              PCRE2_NEWLINE_ANYCRLF)
265
266
/* Some match options are not supported when using JIT as stated in the
267
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
268
 *   https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
269
 */
270
0
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \
271
0
                                               PCRE2_ENDANCHORED)
272
273
0
#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR      | \
274
0
                                      G_REGEX_NEWLINE_LF      | \
275
0
                                      G_REGEX_NEWLINE_CRLF    | \
276
0
                                      G_REGEX_NEWLINE_ANYCRLF)
277
278
0
#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR      | \
279
0
                                    G_REGEX_MATCH_NEWLINE_LF      | \
280
0
                                    G_REGEX_MATCH_NEWLINE_CRLF    | \
281
0
                                    G_REGEX_MATCH_NEWLINE_ANY    | \
282
0
                                    G_REGEX_MATCH_NEWLINE_ANYCRLF)
283
284
/* if the string is in UTF-8 use g_utf8_ functions, else use
285
 * just +/- 1. */
286
0
#define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
287
0
                                ((s) + 1) : \
288
0
                                g_utf8_next_char (s))
289
0
#define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
290
0
                                ((s) - 1) : \
291
0
                                g_utf8_prev_char (s))
292
293
/* State of a matching operation: the regex and subject string involved, the
 * offsets produced so far, and the PCRE2 objects used while matching. */
struct _GMatchInfo
294
{
295
  gint ref_count;               /* the ref count (atomic) */
296
  GRegex *regex;                /* the regex */
297
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
298
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
299
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
300
  gint pos;                     /* position in the string where last match left off */
301
  uint32_t n_offsets;           /* number of offsets */
302
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
303
  gint *workspace;              /* workspace for pcre2_dfa_match() */
304
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
305
  const gchar *string;          /* string passed to the match function */
306
  gssize string_len;            /* length of string, in bytes */
307
  pcre2_match_context *match_context; /* PCRE2 match context */
308
  pcre2_match_data *match_data; /* PCRE2 match data block */
309
  pcre2_jit_stack *jit_stack;   /* PCRE2 JIT stack */
310
};
311
312
/* Outcome of trying to enable the PCRE2 JIT compiler for a GRegex. */
typedef enum
313
{
314
  JIT_STATUS_DEFAULT,           /* enablement was never tried */
315
  JIT_STATUS_ENABLED,           /* enablement was tried and succeeded */
316
  JIT_STATUS_DISABLED           /* enablement was tried and failed, so it should not be retried */
317
} JITStatus;
318
319
/* A compiled regular expression: the pattern text, its PCRE2 compiled form,
 * and the compile/match options in both their GRegex and PCRE2 encodings. */
struct _GRegex
320
{
321
  gint ref_count;               /* the ref count for the immutable part (atomic) */
322
  gchar *pattern;               /* the pattern */
323
  pcre2_code *pcre_re;          /* compiled form of the pattern */
324
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
325
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
326
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
327
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
328
  uint32_t jit_options;         /* options which were enabled for jit compiler */
329
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
330
  /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation,
331
   * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever
332
   * enabled for the current regex AND current set of jit_options.
333
   * JIT_STATUS_DEFAULT means enablement was never tried,
334
   * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it),
335
   * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again).
336
   */
337
};
338
339
/* TRUE if ret is an error code, FALSE otherwise. */
340
0
#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
341
342
typedef struct _InterpolationData InterpolationData;
343
static gboolean  interpolation_list_needs_match (GList *list);
344
static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
345
                                                 GString *result,
346
                                                 gpointer data);
347
static GList    *split_replacement              (const gchar *replacement,
348
                                                 GError **error);
349
static void      free_interpolation_data        (InterpolationData *data);
350
351
static uint32_t
352
get_pcre2_compile_options (GRegexCompileFlags compile_flags)
353
0
{
354
  /* Maps compile flags to pcre2 values */
355
0
  uint32_t pcre2_flags = 0;
356
357
0
  if (compile_flags & G_REGEX_CASELESS)
358
0
    pcre2_flags |= PCRE2_CASELESS;
359
0
  if (compile_flags & G_REGEX_MULTILINE)
360
0
    pcre2_flags |= PCRE2_MULTILINE;
361
0
  if (compile_flags & G_REGEX_DOTALL)
362
0
    pcre2_flags |= PCRE2_DOTALL;
363
0
  if (compile_flags & G_REGEX_EXTENDED)
364
0
    pcre2_flags |= PCRE2_EXTENDED;
365
0
  if (compile_flags & G_REGEX_ANCHORED)
366
0
    pcre2_flags |= PCRE2_ANCHORED;
367
0
  if (compile_flags & G_REGEX_DOLLAR_ENDONLY)
368
0
    pcre2_flags |= PCRE2_DOLLAR_ENDONLY;
369
0
  if (compile_flags & G_REGEX_UNGREEDY)
370
0
    pcre2_flags |= PCRE2_UNGREEDY;
371
0
  if (!(compile_flags & G_REGEX_RAW))
372
0
    pcre2_flags |= PCRE2_UTF;
373
0
  if (compile_flags & G_REGEX_NO_AUTO_CAPTURE)
374
0
    pcre2_flags |= PCRE2_NO_AUTO_CAPTURE;
375
0
  if (compile_flags & G_REGEX_FIRSTLINE)
376
0
    pcre2_flags |= PCRE2_FIRSTLINE;
377
0
  if (compile_flags & G_REGEX_DUPNAMES)
378
0
    pcre2_flags |= PCRE2_DUPNAMES;
379
380
0
  return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK;
381
0
}
382
383
static uint32_t
384
get_pcre2_match_options (GRegexMatchFlags   match_flags,
385
                         GRegexCompileFlags compile_flags)
386
0
{
387
  /* Maps match flags to pcre2 values */
388
0
  uint32_t pcre2_flags = 0;
389
390
0
  if (match_flags & G_REGEX_MATCH_ANCHORED)
391
0
    pcre2_flags |= PCRE2_ANCHORED;
392
0
  if (match_flags & G_REGEX_MATCH_NOTBOL)
393
0
    pcre2_flags |= PCRE2_NOTBOL;
394
0
  if (match_flags & G_REGEX_MATCH_NOTEOL)
395
0
    pcre2_flags |= PCRE2_NOTEOL;
396
0
  if (match_flags & G_REGEX_MATCH_NOTEMPTY)
397
0
    pcre2_flags |= PCRE2_NOTEMPTY;
398
0
  if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT)
399
0
    pcre2_flags |= PCRE2_PARTIAL_SOFT;
400
0
  if (match_flags & G_REGEX_MATCH_PARTIAL_HARD)
401
0
    pcre2_flags |= PCRE2_PARTIAL_HARD;
402
0
  if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART)
403
0
    pcre2_flags |= PCRE2_NOTEMPTY_ATSTART;
404
405
0
  if (compile_flags & G_REGEX_RAW)
406
0
    pcre2_flags |= PCRE2_NO_UTF_CHECK;
407
408
0
  return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK;
409
0
}
410
411
static GRegexCompileFlags
412
g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags)
413
0
{
414
0
  GRegexCompileFlags compile_flags = G_REGEX_DEFAULT;
415
416
0
  if (pcre2_flags & PCRE2_CASELESS)
417
0
    compile_flags |= G_REGEX_CASELESS;
418
0
  if (pcre2_flags & PCRE2_MULTILINE)
419
0
    compile_flags |= G_REGEX_MULTILINE;
420
0
  if (pcre2_flags & PCRE2_DOTALL)
421
0
    compile_flags |= G_REGEX_DOTALL;
422
0
  if (pcre2_flags & PCRE2_EXTENDED)
423
0
    compile_flags |= G_REGEX_EXTENDED;
424
0
  if (pcre2_flags & PCRE2_ANCHORED)
425
0
    compile_flags |= G_REGEX_ANCHORED;
426
0
  if (pcre2_flags & PCRE2_DOLLAR_ENDONLY)
427
0
    compile_flags |= G_REGEX_DOLLAR_ENDONLY;
428
0
  if (pcre2_flags & PCRE2_UNGREEDY)
429
0
    compile_flags |= G_REGEX_UNGREEDY;
430
0
  if (!(pcre2_flags & PCRE2_UTF))
431
0
    compile_flags |= G_REGEX_RAW;
432
0
  if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE)
433
0
    compile_flags |= G_REGEX_NO_AUTO_CAPTURE;
434
0
  if (pcre2_flags & PCRE2_FIRSTLINE)
435
0
    compile_flags |= G_REGEX_FIRSTLINE;
436
0
  if (pcre2_flags & PCRE2_DUPNAMES)
437
0
    compile_flags |= G_REGEX_DUPNAMES;
438
439
0
  return compile_flags & G_REGEX_COMPILE_MASK;
440
0
}
441
442
static GRegexMatchFlags
443
g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags)
444
0
{
445
0
  GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT;
446
447
0
  if (pcre2_flags & PCRE2_ANCHORED)
448
0
    match_flags |= G_REGEX_MATCH_ANCHORED;
449
0
  if (pcre2_flags & PCRE2_NOTBOL)
450
0
    match_flags |= G_REGEX_MATCH_NOTBOL;
451
0
  if (pcre2_flags & PCRE2_NOTEOL)
452
0
    match_flags |= G_REGEX_MATCH_NOTEOL;
453
0
  if (pcre2_flags & PCRE2_NOTEMPTY)
454
0
    match_flags |= G_REGEX_MATCH_NOTEMPTY;
455
0
  if (pcre2_flags & PCRE2_PARTIAL_SOFT)
456
0
    match_flags |= G_REGEX_MATCH_PARTIAL_SOFT;
457
0
  if (pcre2_flags & PCRE2_PARTIAL_HARD)
458
0
    match_flags |= G_REGEX_MATCH_PARTIAL_HARD;
459
0
  if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART)
460
0
    match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART;
461
462
0
  return (match_flags & G_REGEX_MATCH_MASK);
463
0
}
464
465
static uint32_t
466
get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags)
467
0
{
468
0
  compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK;
469
470
0
  switch (compile_flags)
471
0
    {
472
0
    case G_REGEX_NEWLINE_CR:
473
0
      return PCRE2_NEWLINE_CR;
474
0
    case G_REGEX_NEWLINE_LF:
475
0
      return PCRE2_NEWLINE_LF;
476
0
    case G_REGEX_NEWLINE_CRLF:
477
0
      return PCRE2_NEWLINE_CRLF;
478
0
    case G_REGEX_NEWLINE_ANYCRLF:
479
0
      return PCRE2_NEWLINE_ANYCRLF;
480
0
    default:
481
0
      if (compile_flags != 0)
482
0
        return 0;
483
484
0
      return PCRE2_NEWLINE_ANY;
485
0
    }
486
0
}
487
488
static uint32_t
489
get_pcre2_newline_match_options (GRegexMatchFlags match_flags)
490
0
{
491
0
  switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK)
492
0
    {
493
0
    case G_REGEX_MATCH_NEWLINE_CR:
494
0
      return PCRE2_NEWLINE_CR;
495
0
    case G_REGEX_MATCH_NEWLINE_LF:
496
0
      return PCRE2_NEWLINE_LF;
497
0
    case G_REGEX_MATCH_NEWLINE_CRLF:
498
0
      return PCRE2_NEWLINE_CRLF;
499
0
    case G_REGEX_MATCH_NEWLINE_ANY:
500
0
      return PCRE2_NEWLINE_ANY;
501
0
    case G_REGEX_MATCH_NEWLINE_ANYCRLF:
502
0
      return PCRE2_NEWLINE_ANYCRLF;
503
0
    default:
504
0
      return 0;
505
0
    }
506
0
}
507
508
static uint32_t
509
get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags)
510
0
{
511
0
  if (compile_flags & G_REGEX_BSR_ANYCRLF)
512
0
    return PCRE2_BSR_ANYCRLF;
513
514
0
  return PCRE2_BSR_UNICODE;
515
0
}
516
517
static uint32_t
518
get_pcre2_bsr_match_options (GRegexMatchFlags match_flags)
519
0
{
520
0
  if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF)
521
0
    return PCRE2_BSR_ANYCRLF;
522
523
0
  if (match_flags & G_REGEX_MATCH_BSR_ANY)
524
0
    return PCRE2_BSR_UNICODE;
525
526
0
  return 0;
527
0
}
528
529
static char *
530
get_pcre2_error_string (int errcode)
531
0
{
532
0
  PCRE2_UCHAR8 error_msg[2048];
533
0
  int err_length;
534
535
0
  err_length = pcre2_get_error_message (errcode, error_msg,
536
0
                                        G_N_ELEMENTS (error_msg));
537
538
0
  if (err_length <= 0)
539
0
    return NULL;
540
541
  /* The array is always filled with a trailing zero */
542
0
  g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg));
543
0
  return g_memdup2 (error_msg, err_length + 1);
544
0
}
545
546
static const gchar *
547
translate_match_error (gint errcode)
548
0
{
549
0
  switch (errcode)
550
0
    {
551
0
    case PCRE2_ERROR_NOMATCH:
552
      /* not an error */
553
0
      break;
554
0
    case PCRE2_ERROR_NULL:
555
      /* NULL argument, this should not happen in GRegex */
556
0
      g_critical ("A NULL argument was passed to PCRE");
557
0
      break;
558
0
    case PCRE2_ERROR_BADOPTION:
559
0
      return "bad options";
560
0
    case PCRE2_ERROR_BADMAGIC:
561
0
      return _("corrupted object");
562
0
    case PCRE2_ERROR_NOMEMORY:
563
0
      return _("out of memory");
564
0
    case PCRE2_ERROR_NOSUBSTRING:
565
      /* not used by pcre2_match() */
566
0
      break;
567
0
    case PCRE2_ERROR_MATCHLIMIT:
568
0
    case PCRE2_ERROR_CALLOUT:
569
      /* callouts are not implemented */
570
0
      break;
571
0
    case PCRE2_ERROR_BADUTFOFFSET:
572
      /* we do not check if strings are valid */
573
0
      break;
574
0
    case PCRE2_ERROR_PARTIAL:
575
      /* not an error */
576
0
      break;
577
0
    case PCRE2_ERROR_INTERNAL:
578
0
      return _("internal error");
579
0
    case PCRE2_ERROR_DFA_UITEM:
580
0
      return _("the pattern contains items not supported for partial matching");
581
0
    case PCRE2_ERROR_DFA_UCOND:
582
0
      return _("back references as conditions are not supported for partial matching");
583
0
    case PCRE2_ERROR_DFA_WSSIZE:
584
      /* handled expanding the workspace */
585
0
      break;
586
0
    case PCRE2_ERROR_DFA_RECURSE:
587
0
    case PCRE2_ERROR_RECURSIONLIMIT:
588
0
      return _("recursion limit reached");
589
0
    case PCRE2_ERROR_BADOFFSET:
590
0
      return _("bad offset");
591
0
    case PCRE2_ERROR_RECURSELOOP:
592
0
      return _("recursion loop");
593
0
    case PCRE2_ERROR_JIT_BADOPTION:
594
      /* should not happen in GRegex since we check modes before each match */
595
0
      return _("matching mode is requested that was not compiled for JIT");
596
0
    default:
597
0
      break;
598
0
    }
599
0
  return NULL;
600
0
}
601
602
/* Builds the message for a PCRE2 match-time error code.
 *
 * Sources are tried in order: the message from translate_match_error(),
 * then the text from get_pcre2_error_string(), and finally a generic
 * "unknown error" string. The first two fallbacks are reached only when the
 * previous source returned NULL. The result is either a g_strdup() copy or
 * the string returned by get_pcre2_error_string().
 */
static char *
get_match_error_message (int errcode)
{
  const char *translated = translate_match_error (errcode);
  char *pcre2_message;

  if (translated != NULL)
    return g_strdup (translated);

  pcre2_message = get_pcre2_error_string (errcode);

  return (pcre2_message != NULL) ? pcre2_message
                                 : g_strdup (_("unknown error"));
}
618
619
static void
620
translate_compile_error (gint *errcode, const gchar **errmsg)
621
0
{
622
  /* If errcode is known we put the translatable error message in
623
   * errmsg. If errcode is unknown we put the generic
624
   * G_REGEX_ERROR_COMPILE error code in errcode.
625
   * Note that there can be more PCRE errors with the same GRegexError
626
   * and that some PCRE errors are useless for us.
627
   */
628
0
  gint original_errcode = *errcode;
629
630
0
  *errcode = -1;
631
0
  *errmsg = NULL;
632
633
0
  switch (original_errcode)
634
0
    {
635
0
    case PCRE2_ERROR_END_BACKSLASH:
636
0
      *errcode = G_REGEX_ERROR_STRAY_BACKSLASH;
637
0
      *errmsg = _("\\ at end of pattern");
638
0
      break;
639
0
    case PCRE2_ERROR_END_BACKSLASH_C:
640
0
      *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR;
641
0
      *errmsg = _("\\c at end of pattern");
642
0
      break;
643
0
    case PCRE2_ERROR_UNKNOWN_ESCAPE:
644
0
    case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
645
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
646
0
      *errmsg = _("unrecognized character following \\");
647
0
      break;
648
0
    case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
649
0
      *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
650
0
      *errmsg = _("numbers out of order in {} quantifier");
651
0
      break;
652
0
    case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
653
0
      *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
654
0
      *errmsg = _("number too big in {} quantifier");
655
0
      break;
656
0
    case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
657
0
      *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
658
0
      *errmsg = _("missing terminating ] for character class");
659
0
      break;
660
0
    case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
661
0
      *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
662
0
      *errmsg = _("invalid escape sequence in character class");
663
0
      break;
664
0
    case PCRE2_ERROR_CLASS_RANGE_ORDER:
665
0
      *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
666
0
      *errmsg = _("range out of order in character class");
667
0
      break;
668
0
    case PCRE2_ERROR_QUANTIFIER_INVALID:
669
0
    case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
670
0
      *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT;
671
0
      *errmsg = _("nothing to repeat");
672
0
      break;
673
0
    case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
674
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
675
0
      *errmsg = _("unrecognized character after (? or (?-");
676
0
      break;
677
0
    case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
678
0
      *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
679
0
      *errmsg = _("POSIX named classes are supported only within a class");
680
0
      break;
681
0
    case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
682
0
      *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
683
0
      *errmsg = _("POSIX collating elements are not supported");
684
0
      break;
685
0
    case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
686
0
    case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
687
0
    case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
688
0
      *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
689
0
      *errmsg = _("missing terminating )");
690
0
      break;
691
0
    case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
692
0
      *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
693
0
      *errmsg = _("reference to non-existent subpattern");
694
0
      break;
695
0
    case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
696
0
      *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT;
697
0
      *errmsg = _("missing ) after comment");
698
0
      break;
699
0
    case PCRE2_ERROR_PATTERN_TOO_LARGE:
700
0
      *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
701
0
      *errmsg = _("regular expression is too large");
702
0
      break;
703
0
    case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
704
0
      *errcode = G_REGEX_ERROR_MALFORMED_CONDITION;
705
0
      *errmsg = _("malformed number or name after (?(");
706
0
      break;
707
0
    case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
708
0
      *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
709
0
      *errmsg = _("lookbehind assertion is not fixed length");
710
0
      break;
711
0
    case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
712
0
      *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
713
0
      *errmsg = _("conditional group contains more than two branches");
714
0
      break;
715
0
    case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
716
0
      *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED;
717
0
      *errmsg = _("assertion expected after (?(");
718
0
      break;
719
0
    case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
720
0
      *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
721
0
      *errmsg = _("a numbered reference must not be zero");
722
0
      break;
723
0
    case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
724
0
      *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
725
0
      *errmsg = _("unknown POSIX class name");
726
0
      break;
727
0
    case PCRE2_ERROR_CODE_POINT_TOO_BIG:
728
0
    case PCRE2_ERROR_INVALID_HEXADECIMAL:
729
0
      *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
730
0
      *errmsg = _("character value in \\x{...} sequence is too large");
731
0
      break;
732
0
    case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
733
0
      *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
734
0
      *errmsg = _("\\C not allowed in lookbehind assertion");
735
0
      break;
736
0
    case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
737
0
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
738
0
      *errmsg = _("missing terminator in subpattern name");
739
0
      break;
740
0
    case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
741
0
      *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
742
0
      *errmsg = _("two named subpatterns have the same name");
743
0
      break;
744
0
    case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
745
0
      *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY;
746
0
      *errmsg = _("malformed \\P or \\p sequence");
747
0
      break;
748
0
    case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
749
0
      *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY;
750
0
      *errmsg = _("unknown property name after \\P or \\p");
751
0
      break;
752
0
    case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
753
0
      *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
754
0
      *errmsg = _("subpattern name is too long (maximum 32 characters)");
755
0
      break;
756
0
    case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
757
0
      *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
758
0
      *errmsg = _("too many named subpatterns (maximum 10,000)");
759
0
      break;
760
0
    case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
761
0
      *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE;
762
0
      *errmsg = _("octal value is greater than \\377");
763
0
      break;
764
0
    case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
765
0
      *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
766
0
      *errmsg = _("DEFINE group contains more than one branch");
767
0
      break;
768
0
    case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
769
0
      *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
770
0
      *errmsg = _("inconsistent NEWLINE options");
771
0
      break;
772
0
    case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
773
0
      *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE;
774
0
      *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
775
0
                  "number, or by a plain number");
776
0
      break;
777
0
    case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
778
0
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
779
0
      *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
780
0
      break;
781
0
    case PCRE2_ERROR_VERB_UNKNOWN:
782
0
      *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
783
0
      *errmsg = _("(*VERB) not recognized");
784
0
      break;
785
0
    case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
786
0
      *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG;
787
0
      *errmsg = _("number is too big");
788
0
      break;
789
0
    case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
790
0
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
791
0
      *errmsg = _("missing subpattern name after (?&");
792
0
      break;
793
0
    case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
794
0
      *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
795
0
      *errmsg = _("different names for subpatterns of the same number are not allowed");
796
0
      break;
797
0
    case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
798
0
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
799
0
      *errmsg = _("(*MARK) must have an argument");
800
0
      break;
801
0
    case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
802
0
      *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR;
803
0
      *errmsg = _( "\\c must be followed by an ASCII character");
804
0
      break;
805
0
    case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
806
0
      *errcode = G_REGEX_ERROR_MISSING_NAME;
807
0
      *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
808
0
      break;
809
0
    case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
810
0
      *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
811
0
      *errmsg = _("\\N is not supported in a class");
812
0
      break;
813
0
    case PCRE2_ERROR_VERB_NAME_TOO_LONG:
814
0
      *errcode = G_REGEX_ERROR_NAME_TOO_LONG;
815
0
      *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
816
0
      break;
817
0
    case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
818
0
      *errcode = G_REGEX_ERROR_INTERNAL;
819
0
      *errmsg = _("code overflow");
820
0
      break;
821
0
    case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
822
0
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
823
0
      *errmsg = _("unrecognized character after (?P");
824
0
      break;
825
0
    case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
826
0
      *errcode = G_REGEX_ERROR_INTERNAL;
827
0
      *errmsg = _("overran compiling workspace");
828
0
      break;
829
0
    case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
830
0
      *errcode = G_REGEX_ERROR_INTERNAL;
831
0
      *errmsg = _("previously-checked referenced subpattern not found");
832
0
      break;
833
0
    case PCRE2_ERROR_HEAP_FAILED:
834
0
    case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
835
0
    case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
836
0
    case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
837
0
    case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
838
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
839
0
    case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
840
0
    case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
841
0
    case PCRE2_ERROR_UTF_IS_DISABLED:
842
0
    case PCRE2_ERROR_UCP_IS_DISABLED:
843
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
844
0
    case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
845
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE:
846
0
    case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
847
0
      *errcode = G_REGEX_ERROR_INTERNAL;
848
0
      break;
849
0
    case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
850
0
    case PCRE2_ERROR_CLASS_INVALID_RANGE:
851
0
    case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
852
0
    case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
853
0
    case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
854
0
    case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
855
0
    case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
856
0
    case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
857
0
    case PCRE2_ERROR_NULL_PATTERN:
858
0
    case PCRE2_ERROR_BAD_OPTIONS:
859
0
    case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
860
0
    case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
861
0
    case PCRE2_ERROR_INVALID_OCTAL:
862
0
    case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
863
0
    case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
864
0
    case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
865
0
    case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
866
0
    case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
867
0
    case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
868
0
    case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
869
0
    case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
870
0
    case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
871
0
    case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
872
0
    case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
873
0
    case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
874
0
    default:
875
0
      *errcode = G_REGEX_ERROR_COMPILE;
876
0
      break;
877
0
    }
878
879
0
  g_assert (*errcode != -1);
880
0
}
881
882
/* GMatchInfo */
883
884
static GMatchInfo *
885
match_info_new (const GRegex     *regex,
886
                const gchar      *string,
887
                gint              string_len,
888
                gint              start_position,
889
                GRegexMatchFlags  match_options,
890
                gboolean          is_dfa)
891
0
{
892
0
  GMatchInfo *match_info;
893
894
0
  if (string_len < 0)
895
0
    string_len = strlen (string);
896
897
0
  match_info = g_new0 (GMatchInfo, 1);
898
0
  match_info->ref_count = 1;
899
0
  match_info->regex = g_regex_ref ((GRegex *)regex);
900
0
  match_info->string = string;
901
0
  match_info->string_len = string_len;
902
0
  match_info->matches = PCRE2_ERROR_NOMATCH;
903
0
  match_info->pos = start_position;
904
0
  match_info->match_opts =
905
0
    get_pcre2_match_options (match_options, regex->orig_compile_opts);
906
907
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
908
0
                      &match_info->n_subpatterns);
909
910
0
  match_info->match_context = pcre2_match_context_create (NULL);
911
912
0
  if (is_dfa)
913
0
    {
914
      /* These values should be enough for most cases, if they are not
915
       * enough g_regex_match_all_full() will expand them. */
916
0
      match_info->n_workspace = 100;
917
0
      match_info->workspace = g_new (gint, match_info->n_workspace);
918
0
    }
919
920
0
  match_info->n_offsets = 2;
921
0
  match_info->offsets = g_new0 (gint, match_info->n_offsets);
922
  /* Set an invalid position for the previous match. */
923
0
  match_info->offsets[0] = -1;
924
0
  match_info->offsets[1] = -1;
925
926
0
  match_info->match_data = pcre2_match_data_create_from_pattern (
927
0
      match_info->regex->pcre_re,
928
0
      NULL);
929
930
0
  return match_info;
931
0
}
932
933
static gboolean
934
recalc_match_offsets (GMatchInfo *match_info,
935
                      GError     **error)
936
0
{
937
0
  PCRE2_SIZE *ovector;
938
0
  uint32_t ovector_size = 0;
939
0
  uint32_t pre_n_offset;
940
0
  uint32_t i;
941
942
0
  g_assert (!IS_PCRE2_ERROR (match_info->matches));
943
944
0
  if (match_info->matches == PCRE2_ERROR_PARTIAL)
945
0
    ovector_size = 1;
946
0
  else if (match_info->matches > 0)
947
0
    ovector_size = match_info->matches;
948
949
0
  g_assert (ovector_size != 0);
950
951
0
  if (pcre2_get_ovector_count (match_info->match_data) < ovector_size)
952
0
    {
953
0
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
954
0
                   _("Error while matching regular expression %s: %s"),
955
0
                   match_info->regex->pattern, _("code overflow"));
956
0
      return FALSE;
957
0
    }
958
959
0
  pre_n_offset = match_info->n_offsets;
960
0
  match_info->n_offsets = ovector_size * 2;
961
0
  ovector = pcre2_get_ovector_pointer (match_info->match_data);
962
963
0
  if (match_info->n_offsets != pre_n_offset)
964
0
    {
965
0
      match_info->offsets = g_realloc_n (match_info->offsets,
966
0
                                         match_info->n_offsets,
967
0
                                         sizeof (gint));
968
0
    }
969
970
0
  for (i = 0; i < match_info->n_offsets; i++)
971
0
    {
972
0
      match_info->offsets[i] = (int) ovector[i];
973
0
    }
974
975
0
  return TRUE;
976
0
}
977
978
static JITStatus
979
enable_jit_with_match_options (GMatchInfo  *match_info,
980
                               uint32_t  match_options)
981
0
{
982
0
  gint retval;
983
0
  uint32_t old_jit_options, new_jit_options;
984
985
0
  if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE))
986
0
    return JIT_STATUS_DISABLED;
987
988
0
  if (match_info->regex->jit_status == JIT_STATUS_DISABLED)
989
0
    return JIT_STATUS_DISABLED;
990
991
0
  if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS)
992
0
    return JIT_STATUS_DISABLED;
993
994
0
  old_jit_options = match_info->regex->jit_options;
995
0
  new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE;
996
0
  if (match_options & PCRE2_PARTIAL_HARD)
997
0
    new_jit_options |= PCRE2_JIT_PARTIAL_HARD;
998
0
  if (match_options & PCRE2_PARTIAL_SOFT)
999
0
    new_jit_options |= PCRE2_JIT_PARTIAL_SOFT;
1000
1001
  /* no new options enabled */
1002
0
  if (new_jit_options == old_jit_options)
1003
0
    {
1004
0
      g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT);
1005
0
      return match_info->regex->jit_status;
1006
0
    }
1007
1008
0
  retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options);
1009
0
  if (retval == 0)
1010
0
    {
1011
0
      match_info->regex->jit_status = JIT_STATUS_ENABLED;
1012
1013
0
      match_info->regex->jit_options = new_jit_options;
1014
      /* Set min stack size for JIT to 32KiB and max to 512KiB */
1015
0
      match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL);
1016
0
      pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack);
1017
0
    }
1018
0
  else
1019
0
    {
1020
0
      match_info->regex->jit_status = JIT_STATUS_DISABLED;
1021
1022
0
      switch (retval)
1023
0
        {
1024
0
        case PCRE2_ERROR_NOMEMORY:
1025
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1026
0
                   "but JIT was unable to allocate executable memory for the "
1027
0
                   "compiler. Falling back to interpretive code.");
1028
0
          break;
1029
0
        case PCRE2_ERROR_JIT_BADOPTION:
1030
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1031
0
                   "but JIT support is not available. Falling back to "
1032
0
                   "interpretive code.");
1033
0
          break;
1034
0
        default:
1035
0
          g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, "
1036
0
                   "but request for JIT support had unexpectedly failed (error %d). "
1037
0
                   "Falling back to interpretive code.",
1038
0
                   retval);
1039
0
          break;
1040
0
        }
1041
0
    }
1042
1043
0
  return match_info->regex->jit_status;
1044
1045
0
  g_assert_not_reached ();
1046
0
}
1047
1048
/**
1049
 * g_match_info_get_regex:
1050
 * @match_info: a #GMatchInfo
1051
 *
1052
 * Returns #GRegex object used in @match_info. It belongs to Glib
1053
 * and must not be freed. Use g_regex_ref() if you need to keep it
1054
 * after you free @match_info object.
1055
 *
1056
 * Returns: (transfer none): #GRegex object used in @match_info
1057
 *
1058
 * Since: 2.14
1059
 */
1060
GRegex *
1061
g_match_info_get_regex (const GMatchInfo *match_info)
1062
0
{
1063
0
  g_return_val_if_fail (match_info != NULL, NULL);
1064
0
  return match_info->regex;
1065
0
}
1066
1067
/**
1068
 * g_match_info_get_string:
1069
 * @match_info: a #GMatchInfo
1070
 *
1071
 * Returns the string searched with @match_info. This is the
1072
 * string passed to g_regex_match() or g_regex_replace() so
1073
 * you may not free it before calling this function.
1074
 *
1075
 * Returns: the string searched with @match_info
1076
 *
1077
 * Since: 2.14
1078
 */
1079
const gchar *
1080
g_match_info_get_string (const GMatchInfo *match_info)
1081
0
{
1082
0
  g_return_val_if_fail (match_info != NULL, NULL);
1083
0
  return match_info->string;
1084
0
}
1085
1086
/**
1087
 * g_match_info_ref:
1088
 * @match_info: a #GMatchInfo
1089
 *
1090
 * Increases reference count of @match_info by 1.
1091
 *
1092
 * Returns: @match_info
1093
 *
1094
 * Since: 2.30
1095
 */
1096
GMatchInfo       *
1097
g_match_info_ref (GMatchInfo *match_info)
1098
0
{
1099
0
  g_return_val_if_fail (match_info != NULL, NULL);
1100
0
  g_atomic_int_inc (&match_info->ref_count);
1101
0
  return match_info;
1102
0
}
1103
1104
/**
1105
 * g_match_info_unref:
1106
 * @match_info: a #GMatchInfo
1107
 *
1108
 * Decreases reference count of @match_info by 1. When reference count drops
1109
 * to zero, it frees all the memory associated with the match_info structure.
1110
 *
1111
 * Since: 2.30
1112
 */
1113
void
1114
g_match_info_unref (GMatchInfo *match_info)
1115
0
{
1116
0
  if (g_atomic_int_dec_and_test (&match_info->ref_count))
1117
0
    {
1118
0
      g_regex_unref (match_info->regex);
1119
0
      if (match_info->match_context)
1120
0
        pcre2_match_context_free (match_info->match_context);
1121
0
      if (match_info->jit_stack)
1122
0
        pcre2_jit_stack_free (match_info->jit_stack);
1123
0
      if (match_info->match_data)
1124
0
        pcre2_match_data_free (match_info->match_data);
1125
0
      g_free (match_info->offsets);
1126
0
      g_free (match_info->workspace);
1127
0
      g_free (match_info);
1128
0
    }
1129
0
}
1130
1131
/**
1132
 * g_match_info_free:
1133
 * @match_info: (nullable): a #GMatchInfo, or %NULL
1134
 *
1135
 * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
1136
 * nothing.
1137
 *
1138
 * Since: 2.14
1139
 */
1140
void
1141
g_match_info_free (GMatchInfo *match_info)
1142
690
{
1143
690
  if (match_info == NULL)
1144
690
    return;
1145
1146
0
  g_match_info_unref (match_info);
1147
0
}
1148
1149
/**
1150
 * g_match_info_next:
1151
 * @match_info: a #GMatchInfo structure
1152
 * @error: location to store the error occurring, or %NULL to ignore errors
1153
 *
1154
 * Scans for the next match using the same parameters of the previous
1155
 * call to g_regex_match_full() or g_regex_match() that returned
1156
 * @match_info.
1157
 *
1158
 * The match is done on the string passed to the match function, so you
1159
 * cannot free it before calling this function.
1160
 *
1161
 * Returns: %TRUE is the string matched, %FALSE otherwise
1162
 *
1163
 * Since: 2.14
1164
 */
1165
gboolean
1166
g_match_info_next (GMatchInfo  *match_info,
1167
                   GError     **error)
1168
0
{
1169
0
  JITStatus jit_status;
1170
0
  gint prev_match_start;
1171
0
  gint prev_match_end;
1172
0
  uint32_t opts;
1173
1174
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1175
0
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1176
0
  g_return_val_if_fail (match_info->pos >= 0, FALSE);
1177
1178
0
  prev_match_start = match_info->offsets[0];
1179
0
  prev_match_end = match_info->offsets[1];
1180
1181
0
  if (match_info->pos > match_info->string_len)
1182
0
    {
1183
      /* we have reached the end of the string */
1184
0
      match_info->pos = -1;
1185
0
      match_info->matches = PCRE2_ERROR_NOMATCH;
1186
0
      return FALSE;
1187
0
    }
1188
1189
0
  opts = match_info->regex->match_opts | match_info->match_opts;
1190
1191
0
  jit_status = enable_jit_with_match_options (match_info, opts);
1192
0
  if (jit_status == JIT_STATUS_ENABLED)
1193
0
    {
1194
0
      match_info->matches = pcre2_jit_match (match_info->regex->pcre_re,
1195
0
                                             (PCRE2_SPTR8) match_info->string,
1196
0
                                             match_info->string_len,
1197
0
                                             match_info->pos,
1198
0
                                             opts,
1199
0
                                             match_info->match_data,
1200
0
                                             match_info->match_context);
1201
      /* if the JIT stack limit was reached, fall back to non-JIT matching in
1202
       * the next conditional statement */
1203
0
      if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT)
1204
0
        {
1205
0
          g_debug ("PCRE2 JIT stack limit reached, falling back to "
1206
0
                   "non-optimized matching.");
1207
0
          opts |= PCRE2_NO_JIT;
1208
0
          jit_status = JIT_STATUS_DISABLED;
1209
0
        }
1210
0
    }
1211
1212
0
  if (jit_status != JIT_STATUS_ENABLED)
1213
0
    {
1214
0
      match_info->matches = pcre2_match (match_info->regex->pcre_re,
1215
0
                                         (PCRE2_SPTR8) match_info->string,
1216
0
                                         match_info->string_len,
1217
0
                                         match_info->pos,
1218
0
                                         opts,
1219
0
                                         match_info->match_data,
1220
0
                                         match_info->match_context);
1221
0
    }
1222
1223
0
  if (IS_PCRE2_ERROR (match_info->matches))
1224
0
    {
1225
0
      gchar *error_msg = get_match_error_message (match_info->matches);
1226
1227
0
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1228
0
                   _("Error while matching regular expression %s: %s"),
1229
0
                   match_info->regex->pattern, error_msg);
1230
0
      g_clear_pointer (&error_msg, g_free);
1231
0
      return FALSE;
1232
0
    }
1233
0
  else if (match_info->matches == 0)
1234
0
    {
1235
      /* info->offsets is too small. */
1236
0
      match_info->n_offsets *= 2;
1237
0
      match_info->offsets = g_realloc_n (match_info->offsets,
1238
0
                                         match_info->n_offsets,
1239
0
                                         sizeof (gint));
1240
1241
0
      pcre2_match_data_free (match_info->match_data);
1242
0
      match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL);
1243
1244
0
      return g_match_info_next (match_info, error);
1245
0
    }
1246
0
  else if (match_info->matches == PCRE2_ERROR_NOMATCH)
1247
0
    {
1248
      /* We're done with this match info */
1249
0
      match_info->pos = -1;
1250
0
      return FALSE;
1251
0
    }
1252
0
  else
1253
0
    if (!recalc_match_offsets (match_info, error))
1254
0
      return FALSE;
1255
1256
  /* avoid infinite loops if the pattern is an empty string or something
1257
   * equivalent */
1258
0
  if (match_info->pos == match_info->offsets[1])
1259
0
    {
1260
0
      if (match_info->pos > match_info->string_len)
1261
0
        {
1262
          /* we have reached the end of the string */
1263
0
          match_info->pos = -1;
1264
0
          match_info->matches = PCRE2_ERROR_NOMATCH;
1265
0
          return FALSE;
1266
0
        }
1267
1268
0
      match_info->pos = NEXT_CHAR (match_info->regex,
1269
0
                                   &match_info->string[match_info->pos]) -
1270
0
                                   match_info->string;
1271
0
    }
1272
0
  else
1273
0
    {
1274
0
      match_info->pos = match_info->offsets[1];
1275
0
    }
1276
1277
0
  g_assert (match_info->matches < 0 ||
1278
0
            (uint32_t) match_info->matches <= match_info->n_subpatterns + 1);
1279
1280
  /* it's possible to get two identical matches when we are matching
1281
   * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
1282
   * the string is "RegExTest" we have:
1283
   *  - search at position 0: match from 0 to 0
1284
   *  - search at position 1: match from 3 to 3
1285
   *  - search at position 3: match from 3 to 3 (duplicate)
1286
   *  - search at position 4: match from 5 to 5
1287
   *  - search at position 5: match from 5 to 5 (duplicate)
1288
   *  - search at position 6: no match -> stop
1289
   * so we have to ignore the duplicates.
1290
   * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
1291
0
  if (match_info->matches >= 0 &&
1292
0
      prev_match_start == match_info->offsets[0] &&
1293
0
      prev_match_end == match_info->offsets[1])
1294
0
    {
1295
      /* ignore this match and search the next one */
1296
0
      return g_match_info_next (match_info, error);
1297
0
    }
1298
1299
0
  return match_info->matches >= 0;
1300
0
}
1301
1302
/**
1303
 * g_match_info_matches:
1304
 * @match_info: a #GMatchInfo structure
1305
 *
1306
 * Returns whether the previous match operation succeeded.
1307
 *
1308
 * Returns: %TRUE if the previous match operation succeeded,
1309
 *   %FALSE otherwise
1310
 *
1311
 * Since: 2.14
1312
 */
1313
gboolean
1314
g_match_info_matches (const GMatchInfo *match_info)
1315
0
{
1316
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1317
1318
0
  return match_info->matches >= 0;
1319
0
}
1320
1321
/**
1322
 * g_match_info_get_match_count:
1323
 * @match_info: a #GMatchInfo structure
1324
 *
1325
 * Retrieves the number of matched substrings (including substring 0,
1326
 * that is the whole matched text), so 1 is returned if the pattern
1327
 * has no substrings in it and 0 is returned if the match failed.
1328
 *
1329
 * If the last match was obtained using the DFA algorithm, that is
1330
 * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
1331
 * count is not that of the number of capturing parentheses but that of
1332
 * the number of matched substrings.
1333
 *
1334
 * Returns: Number of matched substrings, or -1 if an error occurred
1335
 *
1336
 * Since: 2.14
1337
 */
1338
gint
1339
g_match_info_get_match_count (const GMatchInfo *match_info)
1340
0
{
1341
0
  g_return_val_if_fail (match_info, -1);
1342
1343
0
  if (match_info->matches == PCRE2_ERROR_NOMATCH)
1344
    /* no match */
1345
0
    return 0;
1346
0
  else if (match_info->matches < PCRE2_ERROR_NOMATCH)
1347
    /* error */
1348
0
    return -1;
1349
0
  else
1350
    /* match */
1351
0
    return match_info->matches;
1352
0
}
1353
1354
/**
1355
 * g_match_info_is_partial_match:
1356
 * @match_info: a #GMatchInfo structure
1357
 *
1358
 * Usually if the string passed to g_regex_match*() matches as far as
1359
 * it goes, but is too short to match the entire pattern, %FALSE is
1360
 * returned. There are circumstances where it might be helpful to
1361
 * distinguish this case from other cases in which there is no match.
1362
 *
1363
 * Consider, for example, an application where a human is required to
1364
 * type in data for a field with specific formatting requirements. An
1365
 * example might be a date in the form ddmmmyy, defined by the pattern
1366
 * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
1367
 * If the application sees the user’s keystrokes one by one, and can
1368
 * check that what has been typed so far is potentially valid, it is
1369
 * able to raise an error as soon as a mistake is made.
1370
 *
1371
 * GRegex supports the concept of partial matching by means of the
1372
 * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags.
1373
 * When they are used, the return code for
1374
 * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
1375
 * for a complete match, %FALSE otherwise. But, when these functions
1376
 * return %FALSE, you can check if the match was partial calling
1377
 * g_match_info_is_partial_match().
1378
 *
1379
 * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and
1380
 * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
1381
 * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
1382
 * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching
1383
 * stops at the partial match.
1384
 * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD
1385
 * are set, the latter takes precedence.
1386
 *
1387
 * There were formerly some restrictions on the pattern for partial matching.
1388
 * The restrictions no longer apply.
1389
 *
1390
 * See pcrepartial(3) for more information on partial matching.
1391
 *
1392
 * Returns: %TRUE if the match was partial, %FALSE otherwise
1393
 *
1394
 * Since: 2.14
1395
 */
1396
gboolean
1397
g_match_info_is_partial_match (const GMatchInfo *match_info)
1398
0
{
1399
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1400
1401
0
  return match_info->matches == PCRE2_ERROR_PARTIAL;
1402
0
}
1403
1404
/**
1405
 * g_match_info_expand_references:
1406
 * @match_info: (nullable): a #GMatchInfo or %NULL
1407
 * @string_to_expand: the string to expand
1408
 * @error: location to store the error occurring, or %NULL to ignore errors
1409
 *
1410
 * Returns a new string containing the text in @string_to_expand with
1411
 * references and escape sequences expanded. References refer to the last
1412
 * match done with @string against @regex and have the same syntax used by
1413
 * g_regex_replace().
1414
 *
1415
 * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was
1416
 * passed to g_regex_new().
1417
 *
1418
 * The backreferences are extracted from the string passed to the match
1419
 * function, so you cannot call this function after freeing the string.
1420
 *
1421
 * @match_info may be %NULL in which case @string_to_expand must not
1422
 * contain references. For instance "foo\n" does not refer to an actual
1423
 * pattern and '\n' merely will be replaced with \n character,
1424
 * while to expand "\0" (whole match) one needs the result of a match.
1425
 * Use g_regex_check_replacement() to find out whether @string_to_expand
1426
 * contains references.
1427
 *
1428
 * Returns: (nullable): the expanded string, or %NULL if an error occurred
1429
 *
1430
 * Since: 2.14
1431
 */
1432
gchar *
1433
g_match_info_expand_references (const GMatchInfo  *match_info,
1434
                                const gchar       *string_to_expand,
1435
                                GError           **error)
1436
0
{
1437
0
  GString *result;
1438
0
  GList *list;
1439
0
  GError *tmp_error = NULL;
1440
1441
0
  g_return_val_if_fail (string_to_expand != NULL, NULL);
1442
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1443
1444
0
  list = split_replacement (string_to_expand, &tmp_error);
1445
0
  if (tmp_error != NULL)
1446
0
    {
1447
0
      g_propagate_error (error, tmp_error);
1448
0
      return NULL;
1449
0
    }
1450
1451
0
  if (!match_info && interpolation_list_needs_match (list))
1452
0
    {
1453
0
      g_critical ("String '%s' contains references to the match, can't "
1454
0
                  "expand references without GMatchInfo object",
1455
0
                  string_to_expand);
1456
0
      return NULL;
1457
0
    }
1458
1459
0
  result = g_string_sized_new (strlen (string_to_expand));
1460
0
  interpolate_replacement (match_info, result, list);
1461
1462
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
1463
1464
0
  return g_string_free (result, FALSE);
1465
0
}
1466
1467
/**
1468
 * g_match_info_fetch:
1469
 * @match_info: #GMatchInfo structure
1470
 * @match_num: number of the sub expression
1471
 *
1472
 * Retrieves the text matching the @match_num'th capturing
1473
 * parentheses. 0 is the full text of the match, 1 is the first paren
1474
 * set, 2 the second, and so on.
1475
 *
1476
 * If @match_num is a valid sub pattern but it didn't match anything
1477
 * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
1478
 * string is returned.
1479
 *
1480
 * If the match was obtained using the DFA algorithm, that is using
1481
 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1482
 * string is not that of a set of parentheses but that of a matched
1483
 * substring. Substrings are matched in reverse order of length, so
1484
 * 0 is the longest match.
1485
 *
1486
 * The string is fetched from the string passed to the match function,
1487
 * so you cannot call this function after freeing the string.
1488
 *
1489
 * Returns: (nullable): The matched substring, or %NULL if an error
1490
 *     occurred. You have to free the string yourself
1491
 *
1492
 * Since: 2.14
1493
 */
1494
gchar *
1495
g_match_info_fetch (const GMatchInfo *match_info,
1496
                    gint              match_num)
1497
0
{
1498
0
  gchar *match = NULL;
1499
0
  gint start, end;
1500
1501
0
  g_return_val_if_fail (match_info != NULL, NULL);
1502
0
  g_return_val_if_fail (match_num >= 0, NULL);
1503
1504
  /* match_num does not exist or it didn't matched, i.e. matching "b"
1505
   * against "(a)?b" then group 0 is empty. */
1506
0
  if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
1507
0
    match = NULL;
1508
0
  else if (start == -1)
1509
0
    match = g_strdup ("");
1510
0
  else
1511
0
    match = g_strndup (&match_info->string[start], end - start);
1512
1513
0
  return match;
1514
0
}
1515
1516
/**
1517
 * g_match_info_fetch_pos:
1518
 * @match_info: #GMatchInfo structure
1519
 * @match_num: number of the capture parenthesis
1520
 * @start_pos: (out) (optional): pointer to location where to store
1521
 *     the start position, or %NULL
1522
 * @end_pos: (out) (optional): pointer to location where to store
1523
 *     the end position (the byte after the final byte of the match), or %NULL
1524
 *
1525
 * Returns the start and end positions (in bytes) of a successfully matching 
1526
 * capture parenthesis.
1527
 * 
1528
 * Valid values for @match_num are `0` for the full text of the match,
1529
 * `1` for the first paren set, `2` for the second, and so on.
1530
 *
1531
 * As @end_pos is set to the byte after the final byte of the match (on success),
1532
 * the length of the match can be calculated as `end_pos - start_pos`.
1533
 *
1534
 * As a best practice, initialize @start_pos and @end_pos to identifiable 
1535
 * values, such as `G_MAXINT`, so that you can test if 
1536
 * `g_match_info_fetch_pos()` actually changed the value for a given 
1537
 * capture parenthesis.
1538
 *
1539
 * The parameter @match_num corresponds to a matched capture parenthesis. The 
1540
 * actual value you use for @match_num depends on the method used to generate
1541
 * @match_info. The following sections describe those methods.
1542
 * 
1543
 * ## Methods Using Non-deterministic Finite Automata Matching
1544
 *
1545
 * The methods [method@GLib.Regex.match] and [method@GLib.Regex.match_full]
1546
 * return a [struct@GLib.MatchInfo] using traditional (greedy) pattern
1547
 * matching, also known as 
1548
 * [Non-deterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton)
1549
 * (NFA) matching. You pass the returned `GMatchInfo` from these methods to 
1550
 * `g_match_info_fetch_pos()` to determine the start and end positions 
1551
 * of capture parentheses. The values for @match_num correspond to the capture 
1552
 * parentheses in order, with `0` corresponding to the entire matched string.
1553
 * 
1554
 * @match_num can refer to a capture parenthesis with no match. For example, 
1555
 * the string `b` matches against the pattern `(a)?b`, but the capture
1556
 * parenthesis `(a)` has no match. In this case, `g_match_info_fetch_pos()`
1557
 * returns true and sets @start_pos and @end_pos to `-1` when called with
1558
 * `match_num` as `1` (for `(a)`).
1559
 *
1560
 * For an expanded example, a regex pattern is `(a)?(.*?)the (.*)`, 
1561
 * and a candidate string is `glib regexes are the best`. In this scenario 
1562
 * there are four capture parentheses numbered 0–3: an implicit one 
1563
 * for the entire string, and three explicitly declared in the regex pattern.
1564
 *
1565
 * Given this example, the following table describes the return values 
1566
 * from `g_match_info_fetch_pos()` for various values of @match_num.
1567
 *
1568
 * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos`
1569
 * ----------- | -------- | ------------ | -------------------- | ------------------
1570
 * 0 | Matches entire string | True | 0 | 25
1571
 * 1 | Does not match first character | True | -1 | -1
1572
 * 2 | All text before `the ` | True | 0 | 17
1573
 * 3 | All text after `the ` | True | 21 | 25
1574
 * 4 | Capture paren out of range | False | Unchanged | Unchanged
1575
 *
1576
 * The following code sample and output implements this example.
1577
 *
1578
 * ``` { .c }
1579
 * #include <glib.h>
1580
 *
1581
 * int
1582
 * main (int argc, char *argv[])
1583
 * {
1584
 *   g_autoptr(GError) local_error = NULL;
1585
 *   const char *regex_pattern = "(a)?(.*?)the (.*)";
1586
 *   const char *test_string = "glib regexes are the best";
1587
 *   g_autoptr(GRegex) regex = NULL;
1588
 *
1589
 *   regex = g_regex_new (regex_pattern,
1590
 *                        G_REGEX_DEFAULT,
1591
 *                        G_REGEX_MATCH_DEFAULT,
1592
 *                        &local_error);
1593
 *   if (regex == NULL)
1594
 *     {
1595
 *       g_printerr ("Error creating regex: %s\n", local_error->message);
1596
 *       return 1;
1597
 *     }
1598
 *
1599
 *   g_autoptr(GMatchInfo) match_info = NULL;
1600
 *   g_regex_match (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info);
1601
 *
1602
 *   int n_matched_strings = g_match_info_get_match_count (match_info);
1603
 *
1604
 *   // Print header line
1605
 *   g_print ("match_num Contents                  Return value returned start_pos returned end_pos\n");
1606
 *
1607
 *   // Iterate over each capture paren, including one that is out of range as a demonstration.
1608
 *   for (int match_num = 0; match_num <= n_matched_strings; match_num++)
1609
 *     {
1610
 *       gboolean found_match;
1611
 *       g_autofree char *paren_string = NULL;
1612
 *       int start_pos = G_MAXINT;
1613
 *       int end_pos = G_MAXINT;
1614
 *
1615
 *       found_match = g_match_info_fetch_pos (match_info,
1616
 *                                             match_num,
1617
 *                                             &start_pos,
1618
 *                                             &end_pos);
1619
 *
1620
 *       // If no match, display N/A as the found string.
1621
 *       if (start_pos == G_MAXINT || start_pos == -1)
1622
 *         paren_string = g_strdup ("N/A");
1623
 *       else
1624
 *         paren_string = g_strndup (test_string + start_pos, end_pos - start_pos);
1625
 *
1626
 *       g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos);
1627
 *     }
1628
 *
1629
 *   return 0;
1630
 * }
1631
 * ```
1632
 *
1633
 * ```
1634
 * match_num Contents                  Return value returned start_pos returned end_pos
1635
 * 0         glib regexes are the best 1            0                  25
1636
 * 1         N/A                       1            -1                 -1
1637
 * 2         glib regexes are          1            0                  17
1638
 * 3         best                      1            21                 25
1639
 * 4         N/A                       0            2147483647         2147483647
1640
 * ```
1641
 * ## Methods Using Deterministic Finite Automata Matching
1642
 *
1643
 * The methods [method@GLib.Regex.match_all] and 
1644
 * [method@GLib.Regex.match_all_full]
1645
 * return a `GMatchInfo` using
1646
 * [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton)
1647
 * (DFA) pattern matching. This algorithm detects overlapping matches. You pass
1648
 * the returned `GMatchInfo` from these methods to `g_match_info_fetch_pos()`
1649
 * to determine the start and end positions of each overlapping match. Use the 
1650
 * method [method@GLib.MatchInfo.get_match_count] to determine the number 
1651
 * of overlapping matches.
1652
 *
1653
 * For example, a regex pattern is `<.*>`, and a candidate string is 
1654
 * `<a> <b> <c>`. In this scenario there are three implicit capture 
1655
 * parentheses: one for the entire string, one for `<a> <b>`, and one for `<a>`.
1656
 *
1657
 * Given this example, the following table describes the return values from
1658
 * `g_match_info_fetch_pos()` for various values of @match_num.
1659
 *
1660
 * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos`
1661
 * ----------- | -------- | ------------ | -------------------- | ------------------
1662
 * 0 | Matches entire string | True | 0 | 11
1663
 * 1 | Matches `<a> <b>` | True | 0 | 7
1664
 * 2 | Matches `<a>` | True | 0 | 3
1665
 * 3 | Capture paren out of range | False | Unchanged | Unchanged
1666
 *
1667
 * The following code sample and output implements this example.
1668
 *
1669
 * ``` { .c }
1670
 * #include <glib.h>
1671
 *
1672
 * int
1673
 * main (int argc, char *argv[])
1674
 * {
1675
 *   g_autoptr(GError) local_error = NULL;
1676
 *   const char *regex_pattern = "<.*>";
1677
 *   const char *test_string = "<a> <b> <c>";
1678
 *   g_autoptr(GRegex) regex = NULL;
1679
 * 
1680
 *   regex = g_regex_new (regex_pattern,
1681
 *                        G_REGEX_DEFAULT,
1682
 *                        G_REGEX_MATCH_DEFAULT,
1683
 *                        &local_error);
1684
 *   if (regex == NULL)
1685
 *     {
1686
 *       g_printerr ("Error creating regex: %s\n", local_error->message);
1687
 *       return -1;
1688
 *     }
1689
 *
1690
 *   g_autoptr(GMatchInfo) match_info = NULL;
1691
 *   g_regex_match_all (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info);
1692
 *
1693
 *   int n_matched_strings = g_match_info_get_match_count (match_info);
1694
 *
1695
 *   // Print header line 
1696
 *   g_print ("match_num Contents                  Return value returned start_pos returned end_pos\n");
1697
 * 
1698
 *   // Iterate over each capture paren, including one that is out of range as a demonstration.
1699
 *   for (int match_num = 0; match_num <= n_matched_strings; match_num++)
1700
 *     {
1701
 *       gboolean found_match;
1702
 *       g_autofree char *paren_string = NULL;
1703
 *       int start_pos = G_MAXINT;
1704
 *       int end_pos = G_MAXINT;
1705
 *
1706
 *       found_match = g_match_info_fetch_pos (match_info, match_num, &start_pos, &end_pos);
1707
 *
1708
 *       // If no match, display N/A as the found string.
1709
 *       if (start_pos == G_MAXINT || start_pos == -1)
1710
 *         paren_string = g_strdup ("N/A");
1711
 *       else
1712
 *         paren_string = g_strndup (test_string + start_pos, end_pos - start_pos);
1713
 *
1714
 *       g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos);
1715
 *     }
1716
 *
1717
 *   return 0;
1718
 * }
1719
 * ```
1720
 *
1721
 * ```
1722
 * match_num Contents                  Return value returned start_pos returned end_pos
1723
 * 0         <a> <b> <c>               1            0                  11
1724
 * 1         <a> <b>                   1            0                  7
1725
 * 2         <a>                       1            0                  3
1726
 * 3         N/A                       0            2147483647         2147483647
1727
 * ```
1728
 *
1729
 * Returns: True if @match_num is within range, false otherwise. If
1730
 *   the capture paren has a match, @start_pos and @end_pos contain the 
1731
 *   start and end positions (in bytes) of the matching substring. If the 
1732
 *   capture paren has no match, @start_pos and @end_pos are `-1`. If 
1733
 *   @match_num is out of range, @start_pos and @end_pos are left unchanged.
1734
 *
1735
 * Since: 2.14
1736
 */
1737
gboolean
1738
g_match_info_fetch_pos (const GMatchInfo *match_info,
1739
                        gint              match_num,
1740
                        gint             *start_pos,
1741
                        gint             *end_pos)
1742
0
{
1743
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1744
0
  g_return_val_if_fail (match_num >= 0, FALSE);
1745
1746
  /* check whether there was an error */
1747
0
  if (match_info->matches < 0)
1748
0
    return FALSE;
1749
1750
  /* make sure the sub expression number they're requesting is less than
1751
   * the total number of sub expressions in the regex. When matching all
1752
   * (g_regex_match_all()), also compare against the number of matches */
1753
0
  if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches))
1754
0
    return FALSE;
1755
1756
0
  if (start_pos != NULL)
1757
0
    *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1;
1758
1759
0
  if (end_pos != NULL)
1760
0
    *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1;
1761
1762
0
  return TRUE;
1763
0
}
1764
1765
/*
1766
 * Returns number of first matched subpattern with name @name.
1767
 * There may be more than one in case when DUPNAMES is used,
1768
 * and not all subpatterns with that name match;
1769
 * pcre2_substring_number_from_name() does not work in that case.
1770
 */
1771
static gint
1772
get_matched_substring_number (const GMatchInfo *match_info,
1773
                              const gchar      *name)
1774
0
{
1775
0
  gint entrysize;
1776
0
  PCRE2_SPTR first, last;
1777
0
  guchar *entry;
1778
1779
0
  if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))
1780
0
    return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name);
1781
1782
  /* This code is analogous to code from pcre2_substring.c:
1783
   * pcre2_substring_get_byname() */
1784
0
  entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
1785
0
                                              (PCRE2_SPTR8) name,
1786
0
                                              &first,
1787
0
                                              &last);
1788
1789
0
  if (entrysize <= 0)
1790
0
    return entrysize;
1791
1792
0
  for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1793
0
    {
1794
0
      guint n = (entry[0] << 8) + entry[1];
1795
0
      if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0)
1796
0
        return n;
1797
0
    }
1798
1799
0
  return (first[0] << 8) + first[1];
1800
0
}
1801
1802
/**
1803
 * g_match_info_fetch_named:
1804
 * @match_info: #GMatchInfo structure
1805
 * @name: name of the subexpression
1806
 *
1807
 * Retrieves the text matching the capturing parentheses named @name.
1808
 *
1809
 * If @name is a valid sub pattern name but it didn't match anything
1810
 * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`)
1811
 * then an empty string is returned.
1812
 *
1813
 * The string is fetched from the string passed to the match function,
1814
 * so you cannot call this function after freeing the string.
1815
 *
1816
 * Returns: (nullable): The matched substring, or %NULL if an error
1817
 *     occurred. You have to free the string yourself
1818
 *
1819
 * Since: 2.14
1820
 */
1821
gchar *
1822
g_match_info_fetch_named (const GMatchInfo *match_info,
1823
                          const gchar      *name)
1824
0
{
1825
0
  gint num;
1826
1827
0
  g_return_val_if_fail (match_info != NULL, NULL);
1828
0
  g_return_val_if_fail (name != NULL, NULL);
1829
1830
0
  num = get_matched_substring_number (match_info, name);
1831
0
  if (num < 0)
1832
0
    return NULL;
1833
0
  else
1834
0
    return g_match_info_fetch (match_info, num);
1835
0
}
1836
1837
/**
1838
 * g_match_info_fetch_named_pos:
1839
 * @match_info: #GMatchInfo structure
1840
 * @name: name of the subexpression
1841
 * @start_pos: (out) (optional): pointer to location where to store
1842
 *     the start position, or %NULL
1843
 * @end_pos: (out) (optional): pointer to location where to store
1844
 *     the end position (the byte after the final byte of the match), or %NULL
1845
 *
1846
 * Retrieves the position in bytes of the capturing parentheses named @name.
1847
 *
1848
 * If @name is a valid sub pattern name but it didn't match anything
1849
 * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`)
1850
 * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1851
 *
1852
 * As @end_pos is set to the byte after the final byte of the match (on success),
1853
 * the length of the match can be calculated as `end_pos - start_pos`.
1854
 *
1855
 * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1856
 *     If the position cannot be fetched, @start_pos and @end_pos
1857
 *     are left unchanged.
1858
 *
1859
 * Since: 2.14
1860
 */
1861
gboolean
1862
g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1863
                              const gchar      *name,
1864
                              gint             *start_pos,
1865
                              gint             *end_pos)
1866
0
{
1867
0
  gint num;
1868
1869
0
  g_return_val_if_fail (match_info != NULL, FALSE);
1870
0
  g_return_val_if_fail (name != NULL, FALSE);
1871
1872
0
  num = get_matched_substring_number (match_info, name);
1873
0
  if (num < 0)
1874
0
    return FALSE;
1875
1876
0
  return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1877
0
}
1878
1879
/**
1880
 * g_match_info_fetch_all:
1881
 * @match_info: a #GMatchInfo structure
1882
 *
1883
 * Bundles up pointers to each of the matching substrings from a match
1884
 * and stores them in an array of gchar pointers. The first element in
1885
 * the returned array is the match number 0, i.e. the entire matched
1886
 * text.
1887
 *
1888
 * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1889
 * "b" against "(a)?b") then an empty string is inserted.
1890
 *
1891
 * If the last match was obtained using the DFA algorithm, that is using
1892
 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1893
 * strings are not that matched by sets of parentheses but that of the
1894
 * matched substring. Substrings are matched in reverse order of length,
1895
 * so the first one is the longest match.
1896
 *
1897
 * The strings are fetched from the string passed to the match function,
1898
 * so you cannot call this function after freeing the string.
1899
 *
1900
 * Returns: (transfer full): a %NULL-terminated array of gchar *
1901
 *     pointers.  It must be freed using g_strfreev(). If the previous
1902
 *     match failed %NULL is returned
1903
 *
1904
 * Since: 2.14
1905
 */
1906
gchar **
1907
g_match_info_fetch_all (const GMatchInfo *match_info)
1908
0
{
1909
0
  gchar **result;
1910
0
  gint i;
1911
1912
0
  g_return_val_if_fail (match_info != NULL, NULL);
1913
1914
0
  if (match_info->matches < 0)
1915
0
    return NULL;
1916
1917
0
  result = g_new (gchar *, match_info->matches + 1);
1918
0
  for (i = 0; i < match_info->matches; i++)
1919
0
    result[i] = g_match_info_fetch (match_info, i);
1920
0
  result[i] = NULL;
1921
1922
0
  return result;
1923
0
}
1924
1925
1926
/* GRegex */
1927
1928
/* Instantiates the quark helper for the string "g-regex-error-quark" under
 * the `g_regex_error` name prefix.
 * NOTE(review): presumably this backs the G_REGEX_ERROR domain passed to the
 * g_set_error()/g_error_new() calls in this file - confirm in gregex.h. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1929
1930
/**
1931
 * g_regex_ref:
1932
 * @regex: a #GRegex
1933
 *
1934
 * Increases reference count of @regex by 1.
1935
 *
1936
 * Returns: @regex
1937
 *
1938
 * Since: 2.14
1939
 */
1940
GRegex *
g_regex_ref (GRegex *regex)
{
  g_return_val_if_fail (regex != NULL, NULL);

  /* Take one more reference; the counter is bumped with an atomic op. */
  g_atomic_int_inc (&regex->ref_count);

  /* Returning the argument lets callers write `copy = g_regex_ref (re)`. */
  return regex;
}
1947
1948
/**
1949
 * g_regex_unref:
1950
 * @regex: a #GRegex
1951
 *
1952
 * Decreases reference count of @regex by 1. When reference count drops
1953
 * to zero, it frees all the memory associated with the regex structure.
1954
 *
1955
 * Since: 2.14
1956
 */
1957
void
1958
g_regex_unref (GRegex *regex)
1959
0
{
1960
0
  g_return_if_fail (regex != NULL);
1961
1962
0
  if (g_atomic_int_dec_and_test (&regex->ref_count))
1963
0
    {
1964
0
      g_free (regex->pattern);
1965
0
      if (regex->pcre_re != NULL)
1966
0
        pcre2_code_free (regex->pcre_re);
1967
0
      g_free (regex);
1968
0
    }
1969
0
}
1970
1971
static pcre2_code * regex_compile (const gchar  *pattern,
1972
                                   uint32_t      compile_options,
1973
                                   uint32_t      newline_options,
1974
                                   uint32_t      bsr_options,
1975
                                   GError      **error);
1976
1977
static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
1978
                                                  uint32_t    compile_options);
1979
1980
/**
1981
 * g_regex_new:
1982
 * @pattern: the regular expression
1983
 * @compile_options: compile options for the regular expression, or 0
1984
 * @match_options: match options for the regular expression, or 0
1985
 * @error: return location for a #GError
1986
 *
1987
 * Compiles the regular expression to an internal form, and does
1988
 * the initial setup of the #GRegex structure.
1989
 *
1990
 * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call
1991
 *   g_regex_unref() when you are done with it
1992
 *
1993
 * Since: 2.14
1994
 */
1995
GRegex *
1996
g_regex_new (const gchar         *pattern,
1997
             GRegexCompileFlags   compile_options,
1998
             GRegexMatchFlags     match_options,
1999
             GError             **error)
2000
0
{
2001
0
  GRegex *regex;
2002
0
  pcre2_code *re;
2003
0
  static gsize initialised = 0;
2004
0
  uint32_t pcre_compile_options;
2005
0
  uint32_t pcre_match_options;
2006
0
  uint32_t newline_options;
2007
0
  uint32_t bsr_options;
2008
2009
0
  g_return_val_if_fail (pattern != NULL, NULL);
2010
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2011
0
G_GNUC_BEGIN_IGNORE_DEPRECATIONS
2012
0
  g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK |
2013
0
                                             G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL);
2014
0
G_GNUC_END_IGNORE_DEPRECATIONS
2015
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2016
2017
0
  if (g_once_init_enter (&initialised))
2018
0
    {
2019
0
      int supports_utf8;
2020
2021
0
      pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8);
2022
0
      if (!supports_utf8)
2023
0
        g_critical (_("PCRE library is compiled without UTF8 support"));
2024
2025
0
      g_once_init_leave (&initialised, supports_utf8 ? 1 : 2);
2026
0
    }
2027
2028
0
  if (G_UNLIKELY (initialised != 1))
2029
0
    {
2030
0
      g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, 
2031
0
                           _("PCRE library is compiled with incompatible options"));
2032
0
      return NULL;
2033
0
    }
2034
2035
0
  pcre_compile_options = get_pcre2_compile_options (compile_options);
2036
0
  pcre_match_options = get_pcre2_match_options (match_options, compile_options);
2037
2038
0
  newline_options = get_pcre2_newline_match_options (match_options);
2039
0
  if (newline_options == 0)
2040
0
    newline_options = get_pcre2_newline_compile_options (compile_options);
2041
2042
0
  if (newline_options == 0)
2043
0
    {
2044
0
      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2045
0
                   "Invalid newline flags");
2046
0
      return NULL;
2047
0
    }
2048
2049
0
  bsr_options = get_pcre2_bsr_match_options (match_options);
2050
0
  if (!bsr_options)
2051
0
    bsr_options = get_pcre2_bsr_compile_options (compile_options);
2052
2053
0
  re = regex_compile (pattern, pcre_compile_options,
2054
0
                      newline_options, bsr_options, error);
2055
0
  if (re == NULL)
2056
0
    return NULL;
2057
2058
0
  pcre_compile_options |=
2059
0
    get_pcre2_inline_compile_options (re, pcre_compile_options);
2060
2061
0
  regex = g_new0 (GRegex, 1);
2062
0
  regex->ref_count = 1;
2063
0
  regex->pattern = g_strdup (pattern);
2064
0
  regex->pcre_re = re;
2065
0
  regex->compile_opts = pcre_compile_options;
2066
0
  regex->orig_compile_opts = compile_options;
2067
0
  regex->match_opts = pcre_match_options;
2068
0
  regex->orig_match_opts = match_options;
2069
2070
0
  return regex;
2071
0
}
2072
2073
static pcre2_code *
2074
regex_compile (const gchar  *pattern,
2075
               uint32_t      compile_options,
2076
               uint32_t      newline_options,
2077
               uint32_t      bsr_options,
2078
               GError      **error)
2079
0
{
2080
0
  pcre2_code *re;
2081
0
  pcre2_compile_context *context;
2082
0
  const gchar *errmsg;
2083
0
  PCRE2_SIZE erroffset;
2084
0
  gint errcode;
2085
2086
0
  context = pcre2_compile_context_create (NULL);
2087
2088
  /* set newline options */
2089
0
  if (pcre2_set_newline (context, newline_options) != 0)
2090
0
    {
2091
0
      g_set_error (error, G_REGEX_ERROR,
2092
0
                   G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2093
0
                   "Invalid newline flags");
2094
0
      pcre2_compile_context_free (context);
2095
0
      return NULL;
2096
0
    }
2097
2098
  /* set bsr options */
2099
0
  if (pcre2_set_bsr (context, bsr_options) != 0)
2100
0
    {
2101
0
      g_set_error (error, G_REGEX_ERROR,
2102
0
                   G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS,
2103
0
                   "Invalid BSR flags");
2104
0
      pcre2_compile_context_free (context);
2105
0
      return NULL;
2106
0
    }
2107
2108
  /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */
2109
0
  if (compile_options & PCRE2_UTF)
2110
0
    compile_options |= PCRE2_NO_UTF_CHECK;
2111
2112
0
  compile_options |= PCRE2_UCP;
2113
2114
  /* compile the pattern */
2115
0
  re = pcre2_compile ((PCRE2_SPTR8) pattern,
2116
0
                      PCRE2_ZERO_TERMINATED,
2117
0
                      compile_options,
2118
0
                      &errcode,
2119
0
                      &erroffset,
2120
0
                      context);
2121
0
  pcre2_compile_context_free (context);
2122
2123
  /* if the compilation failed, set the error member and return
2124
   * immediately */
2125
0
  if (re == NULL)
2126
0
    {
2127
0
      GError *tmp_error;
2128
0
      gchar *offset_str;
2129
0
      gchar *pcre2_errmsg = NULL;
2130
0
      int original_errcode;
2131
2132
      /* Translate the PCRE error code to GRegexError and use a translated
2133
       * error message if possible */
2134
0
      original_errcode = errcode;
2135
0
      translate_compile_error (&errcode, &errmsg);
2136
2137
0
      if (!errmsg)
2138
0
        {
2139
0
          errmsg = _("unknown error");
2140
0
          pcre2_errmsg = get_pcre2_error_string (original_errcode);
2141
0
        }
2142
2143
      /* PCRE uses byte offsets but we want to show character offsets */
2144
0
      erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
2145
2146
0
      offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset);
2147
0
      tmp_error = g_error_new (G_REGEX_ERROR, errcode,
2148
0
                               _("Error while compiling regular expression ‘%s’ "
2149
0
                                 "at char %s: %s"),
2150
0
                               pattern, offset_str,
2151
0
                               pcre2_errmsg ? pcre2_errmsg : errmsg);
2152
0
      g_propagate_error (error, tmp_error);
2153
0
      g_free (offset_str);
2154
0
      g_clear_pointer (&pcre2_errmsg, g_free);
2155
2156
0
      return NULL;
2157
0
    }
2158
2159
0
  return re;
2160
0
}
2161
2162
static uint32_t
2163
get_pcre2_inline_compile_options (pcre2_code *re,
2164
                                  uint32_t    compile_options)
2165
0
{
2166
0
  uint32_t pcre_compile_options;
2167
0
  uint32_t nonpcre_compile_options;
2168
2169
  /* For options set at the beginning of the pattern, pcre puts them into
2170
   * compile options, e.g. "(?i)foo" will make the pcre structure store
2171
   * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */
2172
0
  nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
2173
0
  pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options);
2174
0
  compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK;
2175
0
  compile_options |= nonpcre_compile_options;
2176
2177
0
  if (!(compile_options & PCRE2_DUPNAMES))
2178
0
    {
2179
0
      uint32_t jchanged = 0;
2180
0
      pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged);
2181
0
      if (jchanged)
2182
0
        compile_options |= PCRE2_DUPNAMES;
2183
0
    }
2184
2185
0
  return compile_options;
2186
0
}
2187
2188
/**
2189
 * g_regex_get_pattern:
2190
 * @regex: a #GRegex structure
2191
 *
2192
 * Gets the pattern string associated with @regex, i.e. a copy of
2193
 * the string passed to g_regex_new().
2194
 *
2195
 * Returns: the pattern of @regex
2196
 *
2197
 * Since: 2.14
2198
 */
2199
const gchar *
2200
g_regex_get_pattern (const GRegex *regex)
2201
0
{
2202
0
  g_return_val_if_fail (regex != NULL, NULL);
2203
2204
0
  return regex->pattern;
2205
0
}
2206
2207
/**
2208
 * g_regex_get_max_backref:
2209
 * @regex: a #GRegex
2210
 *
2211
 * Returns the number of the highest back reference
2212
 * in the pattern, or 0 if the pattern does not contain
2213
 * back references.
2214
 *
2215
 * Returns: the number of the highest back reference
2216
 *
2217
 * Since: 2.14
2218
 */
2219
gint
2220
g_regex_get_max_backref (const GRegex *regex)
2221
0
{
2222
0
  uint32_t value;
2223
2224
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);
2225
2226
0
  return value;
2227
0
}
2228
2229
/**
2230
 * g_regex_get_capture_count:
2231
 * @regex: a #GRegex
2232
 *
2233
 * Returns the number of capturing subpatterns in the pattern.
2234
 *
2235
 * Returns: the number of capturing subpatterns
2236
 *
2237
 * Since: 2.14
2238
 */
2239
gint
2240
g_regex_get_capture_count (const GRegex *regex)
2241
0
{
2242
0
  uint32_t value;
2243
2244
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);
2245
2246
0
  return value;
2247
0
}
2248
2249
/**
2250
 * g_regex_get_has_cr_or_lf:
2251
 * @regex: a #GRegex structure
2252
 *
2253
 * Checks whether the pattern contains explicit CR or LF references.
2254
 *
2255
 * Returns: %TRUE if the pattern contains explicit CR or LF references
2256
 *
2257
 * Since: 2.34
2258
 */
2259
gboolean
2260
g_regex_get_has_cr_or_lf (const GRegex *regex)
2261
0
{
2262
0
  uint32_t value;
2263
2264
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);
2265
2266
0
  return !!value;
2267
0
}
2268
2269
/**
2270
 * g_regex_get_max_lookbehind:
2271
 * @regex: a #GRegex structure
2272
 *
2273
 * Gets the number of characters in the longest lookbehind assertion in the
2274
 * pattern. This information is useful when doing multi-segment matching using
2275
 * the partial matching facilities.
2276
 *
2277
 * Returns: the number of characters in the longest lookbehind assertion.
2278
 *
2279
 * Since: 2.38
2280
 */
2281
gint
2282
g_regex_get_max_lookbehind (const GRegex *regex)
2283
0
{
2284
0
  uint32_t max_lookbehind;
2285
2286
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,
2287
0
                      &max_lookbehind);
2288
2289
0
  return max_lookbehind;
2290
0
}
2291
2292
/**
2293
 * g_regex_get_compile_flags:
2294
 * @regex: a #GRegex
2295
 *
2296
 * Returns the compile options that @regex was created with.
2297
 *
2298
 * Depending on the version of PCRE that is used, this may or may not
2299
 * include flags set by option expressions such as `(?i)` found at the
2300
 * top-level within the compiled pattern.
2301
 *
2302
 * Returns: flags from #GRegexCompileFlags
2303
 *
2304
 * Since: 2.26
2305
 */
2306
GRegexCompileFlags
2307
g_regex_get_compile_flags (const GRegex *regex)
2308
0
{
2309
0
  GRegexCompileFlags extra_flags;
2310
0
  uint32_t info_value;
2311
2312
0
  g_return_val_if_fail (regex != NULL, 0);
2313
2314
  /* Preserve original G_REGEX_OPTIMIZE */
2315
0
  extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE);
2316
2317
  /* Also include the newline options */
2318
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value);
2319
0
  switch (info_value)
2320
0
    {
2321
0
    case PCRE2_NEWLINE_ANYCRLF:
2322
0
      extra_flags |= G_REGEX_NEWLINE_ANYCRLF;
2323
0
      break;
2324
0
    case PCRE2_NEWLINE_CRLF:
2325
0
      extra_flags |= G_REGEX_NEWLINE_CRLF;
2326
0
      break;
2327
0
    case PCRE2_NEWLINE_LF:
2328
0
      extra_flags |= G_REGEX_NEWLINE_LF;
2329
0
      break;
2330
0
    case PCRE2_NEWLINE_CR:
2331
0
      extra_flags |= G_REGEX_NEWLINE_CR;
2332
0
      break;
2333
0
    default:
2334
0
      break;
2335
0
    }
2336
2337
  /* Also include the bsr options */
2338
0
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value);
2339
0
  switch (info_value)
2340
0
    {
2341
0
    case PCRE2_BSR_ANYCRLF:
2342
0
      extra_flags |= G_REGEX_BSR_ANYCRLF;
2343
0
      break;
2344
0
    default:
2345
0
      break;
2346
0
    }
2347
2348
0
  return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags;
2349
0
}
2350
2351
/**
2352
 * g_regex_get_match_flags:
2353
 * @regex: a #GRegex
2354
 *
2355
 * Returns the match options that @regex was created with.
2356
 *
2357
 * Returns: flags from #GRegexMatchFlags
2358
 *
2359
 * Since: 2.26
2360
 */
2361
GRegexMatchFlags
2362
g_regex_get_match_flags (const GRegex *regex)
2363
0
{
2364
0
  uint32_t flags;
2365
2366
0
  g_return_val_if_fail (regex != NULL, 0);
2367
2368
0
  flags = g_regex_match_flags_from_pcre2 (regex->match_opts);
2369
0
  flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK);
2370
0
  flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF));
2371
2372
0
  return flags;
2373
0
}
2374
2375
/**
2376
 * g_regex_match_simple:
2377
 * @pattern: the regular expression
2378
 * @string: the string to scan for matches
2379
 * @compile_options: compile options for the regular expression, or 0
2380
 * @match_options: match options, or 0
2381
 *
2382
 * Scans for a match in @string for @pattern.
2383
 *
2384
 * This function is equivalent to g_regex_match() but it does not
2385
 * require compiling the pattern with g_regex_new(), avoiding some
2386
 * lines of code when you need just to do a match without extracting
2387
 * substrings, capture counts, and so on.
2388
 *
2389
 * If this function is to be called on the same @pattern more than
2390
 * once, it's more efficient to compile the pattern once with
2391
 * g_regex_new() and then use g_regex_match().
2392
 *
2393
 * Returns: %TRUE if the string matched, %FALSE otherwise
2394
 *
2395
 * Since: 2.14
2396
 */
2397
gboolean
2398
g_regex_match_simple (const gchar        *pattern,
2399
                      const gchar        *string,
2400
                      GRegexCompileFlags  compile_options,
2401
                      GRegexMatchFlags    match_options)
2402
0
{
2403
0
  GRegex *regex;
2404
0
  gboolean result;
2405
2406
0
  regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL);
2407
0
  if (!regex)
2408
0
    return FALSE;
2409
0
  result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
2410
0
  g_regex_unref (regex);
2411
0
  return result;
2412
0
}
2413
2414
/**
2415
 * g_regex_match:
2416
 * @regex: a #GRegex structure from g_regex_new()
2417
 * @string: the string to scan for matches
2418
 * @match_options: match options
2419
 * @match_info: (out) (optional): pointer to location where to store
2420
 *     the #GMatchInfo, or %NULL if you do not need it
2421
 *
2422
 * Scans for a match in @string for the pattern in @regex.
2423
 * The @match_options are combined with the match options specified
2424
 * when the @regex structure was created, letting you have more
2425
 * flexibility in reusing #GRegex structures.
2426
 *
2427
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2428
 *
2429
 * A #GMatchInfo structure, used to get information on the match,
2430
 * is stored in @match_info if not %NULL. Note that if @match_info
2431
 * is not %NULL then it is created even if the function returns %FALSE,
2432
 * i.e. you must free it regardless of whether the regular expression actually matched.
2433
 *
2434
 * To retrieve all the non-overlapping matches of the pattern in
2435
 * string you can use g_match_info_next().
2436
 *
2437
 * |[<!-- language="C" --> 
2438
 * static void
2439
 * print_uppercase_words (const gchar *string)
2440
 * {
2441
 *   // Print all uppercase-only words.
2442
 *   GRegex *regex;
2443
 *   GMatchInfo *match_info;
2444
 *  
2445
 *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2446
 *   g_regex_match (regex, string, 0, &match_info);
2447
 *   while (g_match_info_matches (match_info))
2448
 *     {
2449
 *       gchar *word = g_match_info_fetch (match_info, 0);
2450
 *       g_print ("Found: %s\n", word);
2451
 *       g_free (word);
2452
 *       g_match_info_next (match_info, NULL);
2453
 *     }
2454
 *   g_match_info_free (match_info);
2455
 *   g_regex_unref (regex);
2456
 * }
2457
 * ]|
2458
 *
2459
 * @string is not copied and is used in #GMatchInfo internally. If
2460
 * you use any #GMatchInfo method (except g_match_info_free()) after
2461
 * freeing or modifying @string then the behaviour is undefined.
2462
 *
2463
 * Returns: %TRUE if the string matched, %FALSE otherwise
2464
 *
2465
 * Since: 2.14
2466
 */
2467
gboolean
2468
g_regex_match (const GRegex      *regex,
2469
               const gchar       *string,
2470
               GRegexMatchFlags   match_options,
2471
               GMatchInfo       **match_info)
2472
690
{
2473
690
  return g_regex_match_full (regex, string, -1, 0, match_options,
2474
690
                             match_info, NULL);
2475
690
}
2476
2477
/**
2478
 * g_regex_match_full:
2479
 * @regex: a #GRegex structure from g_regex_new()
2480
 * @string: (array length=string_len): the string to scan for matches
2481
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2482
 * @start_position: starting index of the string to match, in bytes
2483
 * @match_options: match options
2484
 * @match_info: (out) (optional): pointer to location where to store
2485
 *     the #GMatchInfo, or %NULL if you do not need it
2486
 * @error: location to store the error occurring, or %NULL to ignore errors
2487
 *
2488
 * Scans for a match in @string for the pattern in @regex.
2489
 * The @match_options are combined with the match options specified
2490
 * when the @regex structure was created, letting you have more
2491
 * flexibility in reusing #GRegex structures.
2492
 *
2493
 * Setting @start_position differs from just passing over a shortened
2494
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2495
 * that begins with any kind of lookbehind assertion, such as "\b".
2496
 *
2497
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2498
 *
2499
 * A #GMatchInfo structure, used to get information on the match, is
2500
 * stored in @match_info if not %NULL. Note that if @match_info is
2501
 * not %NULL then it is created even if the function returns %FALSE,
2502
 * i.e. you must free it regardless if regular expression actually
2503
 * matched.
2504
 *
2505
 * @string is not copied and is used in #GMatchInfo internally. If
2506
 * you use any #GMatchInfo method (except g_match_info_free()) after
2507
 * freeing or modifying @string then the behaviour is undefined.
2508
 *
2509
 * To retrieve all the non-overlapping matches of the pattern in
2510
 * string you can use g_match_info_next().
2511
 *
2512
 * |[<!-- language="C" --> 
2513
 * static void
2514
 * print_uppercase_words (const gchar *string)
2515
 * {
2516
 *   // Print all uppercase-only words.
2517
 *   GRegex *regex;
2518
 *   GMatchInfo *match_info;
2519
 *   GError *error = NULL;
2520
 *   
2521
 *   regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
2522
 *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2523
 *   while (g_match_info_matches (match_info))
2524
 *     {
2525
 *       gchar *word = g_match_info_fetch (match_info, 0);
2526
 *       g_print ("Found: %s\n", word);
2527
 *       g_free (word);
2528
 *       g_match_info_next (match_info, &error);
2529
 *     }
2530
 *   g_match_info_free (match_info);
2531
 *   g_regex_unref (regex);
2532
 *   if (error != NULL)
2533
 *     {
2534
 *       g_printerr ("Error while matching: %s\n", error->message);
2535
 *       g_error_free (error);
2536
 *     }
2537
 * }
2538
 * ]|
2539
 *
2540
 * Returns: %TRUE if the string matched, %FALSE otherwise
2541
 *
2542
 * Since: 2.14
2543
 */
2544
gboolean
2545
g_regex_match_full (const GRegex      *regex,
2546
                    const gchar       *string,
2547
                    gssize             string_len,
2548
                    gint               start_position,
2549
                    GRegexMatchFlags   match_options,
2550
                    GMatchInfo       **match_info,
2551
                    GError           **error)
2552
690
{
2553
690
  GMatchInfo *info;
2554
690
  gboolean match_ok;
2555
2556
690
  g_return_val_if_fail (regex != NULL, FALSE);
2557
690
  g_return_val_if_fail (string != NULL, FALSE);
2558
0
  g_return_val_if_fail (start_position >= 0, FALSE);
2559
0
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2560
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2561
2562
0
  info = match_info_new (regex, string, string_len, start_position,
2563
0
                         match_options, FALSE);
2564
0
  match_ok = g_match_info_next (info, error);
2565
0
  if (match_info != NULL)
2566
0
    *match_info = info;
2567
0
  else
2568
0
    g_match_info_free (info);
2569
2570
0
  return match_ok;
2571
0
}
2572
2573
/**
2574
 * g_regex_match_all:
2575
 * @regex: a #GRegex structure from g_regex_new()
2576
 * @string: the string to scan for matches
2577
 * @match_options: match options
2578
 * @match_info: (out) (optional): pointer to location where to store
2579
 *     the #GMatchInfo, or %NULL if you do not need it
2580
 *
2581
 * Using the standard algorithm for regular expression matching only
2582
 * the longest match in the string is retrieved. This function uses
2583
 * a different algorithm so it can retrieve all the possible matches.
2584
 * For more documentation see g_regex_match_all_full().
2585
 *
2586
 * A #GMatchInfo structure, used to get information on the match, is
2587
 * stored in @match_info if not %NULL. Note that if @match_info is
2588
 * not %NULL then it is created even if the function returns %FALSE,
2589
 * i.e. you must free it regardless if regular expression actually
2590
 * matched.
2591
 *
2592
 * @string is not copied and is used in #GMatchInfo internally. If
2593
 * you use any #GMatchInfo method (except g_match_info_free()) after
2594
 * freeing or modifying @string then the behaviour is undefined.
2595
 *
2596
 * Returns: %TRUE if the string matched, %FALSE otherwise
2597
 *
2598
 * Since: 2.14
2599
 */
2600
gboolean
2601
g_regex_match_all (const GRegex      *regex,
2602
                   const gchar       *string,
2603
                   GRegexMatchFlags   match_options,
2604
                   GMatchInfo       **match_info)
2605
0
{
2606
0
  return g_regex_match_all_full (regex, string, -1, 0, match_options,
2607
0
                                 match_info, NULL);
2608
0
}
2609
2610
/**
2611
 * g_regex_match_all_full:
2612
 * @regex: a #GRegex structure from g_regex_new()
2613
 * @string: (array length=string_len): the string to scan for matches
2614
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2615
 * @start_position: starting index of the string to match, in bytes
2616
 * @match_options: match options
2617
 * @match_info: (out) (optional): pointer to location where to store
2618
 *     the #GMatchInfo, or %NULL if you do not need it
2619
 * @error: location to store the error occurring, or %NULL to ignore errors
2620
 *
2621
 * Using the standard algorithm for regular expression matching only
2622
 * the longest match in the @string is retrieved, it is not possible
2623
 * to obtain all the available matches. For instance matching
2624
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
2625
 * you get `"<a> <b> <c>"`.
2626
 *
2627
 * This function uses a different algorithm (called DFA, i.e. deterministic
2628
 * finite automaton), so it can retrieve all the possible matches, all
2629
 * starting at the same point in the string. For instance matching
2630
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
2631
 * you would obtain three matches: `"<a> <b> <c>"`,
2632
 * `"<a> <b>"` and `"<a>"`.
2633
 *
2634
 * The number of matched strings is retrieved using
2635
 * g_match_info_get_match_count(). To obtain the matched strings and
2636
 * their position you can use, respectively, g_match_info_fetch() and
2637
 * g_match_info_fetch_pos(). Note that the strings are returned in
2638
 * reverse order of length; that is, the longest matching string is
2639
 * given first.
2640
 *
2641
 * Note that the DFA algorithm is slower than the standard one and it
2642
 * is not able to capture substrings, so backreferences do not work.
2643
 *
2644
 * Setting @start_position differs from just passing over a shortened
2645
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2646
 * that begins with any kind of lookbehind assertion, such as "\b".
2647
 *
2648
 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2649
 *
2650
 * A #GMatchInfo structure, used to get information on the match, is
2651
 * stored in @match_info if not %NULL. Note that if @match_info is
2652
 * not %NULL then it is created even if the function returns %FALSE,
2653
 * i.e. you must free it regardless if regular expression actually
2654
 * matched.
2655
 *
2656
 * @string is not copied and is used in #GMatchInfo internally. If
2657
 * you use any #GMatchInfo method (except g_match_info_free()) after
2658
 * freeing or modifying @string then the behaviour is undefined.
2659
 *
2660
 * Returns: %TRUE if the string matched, %FALSE otherwise
2661
 *
2662
 * Since: 2.14
2663
 */
2664
gboolean
2665
g_regex_match_all_full (const GRegex      *regex,
2666
                        const gchar       *string,
2667
                        gssize             string_len,
2668
                        gint               start_position,
2669
                        GRegexMatchFlags   match_options,
2670
                        GMatchInfo       **match_info,
2671
                        GError           **error)
2672
0
{
2673
0
  GMatchInfo *info;
2674
0
  gboolean done;
2675
0
  pcre2_code *pcre_re;
2676
0
  gboolean retval;
2677
0
  uint32_t newline_options;
2678
0
  uint32_t bsr_options;
2679
2680
0
  g_return_val_if_fail (regex != NULL, FALSE);
2681
0
  g_return_val_if_fail (string != NULL, FALSE);
2682
0
  g_return_val_if_fail (start_position >= 0, FALSE);
2683
0
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2684
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2685
2686
0
  newline_options = get_pcre2_newline_match_options (match_options);
2687
0
  if (!newline_options)
2688
0
    newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts);
2689
2690
0
  bsr_options = get_pcre2_bsr_match_options (match_options);
2691
0
  if (!bsr_options)
2692
0
    bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts);
2693
2694
  /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an
2695
   * optimization for normal regex matching, but results in omitting some
2696
   * shorter matches here, and an observable behaviour change.
2697
   *
2698
   * DFA matching is rather niche, and very rarely used according to
2699
   * codesearch.debian.net, so don't bother caching the recompiled RE. */
2700
0
  pcre_re = regex_compile (regex->pattern,
2701
0
                           regex->compile_opts | PCRE2_NO_AUTO_POSSESS,
2702
0
                           newline_options, bsr_options, error);
2703
0
  if (pcre_re == NULL)
2704
0
    return FALSE;
2705
2706
0
  info = match_info_new (regex, string, string_len, start_position,
2707
0
                         match_options, TRUE);
2708
2709
0
  done = FALSE;
2710
0
  while (!done)
2711
0
    {
2712
0
      done = TRUE;
2713
0
      info->matches = pcre2_dfa_match (pcre_re,
2714
0
                                       (PCRE2_SPTR8) info->string, info->string_len,
2715
0
                                       info->pos,
2716
0
                                       (regex->match_opts | info->match_opts),
2717
0
                                       info->match_data,
2718
0
                                       info->match_context,
2719
0
                                       info->workspace, info->n_workspace);
2720
0
      if (info->matches == PCRE2_ERROR_DFA_WSSIZE)
2721
0
        {
2722
          /* info->workspace is too small. */
2723
0
          info->n_workspace *= 2;
2724
0
          info->workspace = g_realloc_n (info->workspace,
2725
0
                                         info->n_workspace,
2726
0
                                         sizeof (gint));
2727
0
          done = FALSE;
2728
0
        }
2729
0
      else if (info->matches == 0)
2730
0
        {
2731
          /* info->offsets is too small. */
2732
0
          info->n_offsets *= 2;
2733
0
          info->offsets = g_realloc_n (info->offsets,
2734
0
                                       info->n_offsets,
2735
0
                                       sizeof (gint));
2736
0
          pcre2_match_data_free (info->match_data);
2737
0
          info->match_data = pcre2_match_data_create (info->n_offsets, NULL);
2738
0
          done = FALSE;
2739
0
        }
2740
0
      else if (IS_PCRE2_ERROR (info->matches))
2741
0
        {
2742
0
          gchar *error_msg = get_match_error_message (info->matches);
2743
2744
0
          g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
2745
0
                       _("Error while matching regular expression %s: %s"),
2746
0
                       regex->pattern, error_msg);
2747
0
          g_clear_pointer (&error_msg, g_free);
2748
0
        }
2749
0
      else if (info->matches != PCRE2_ERROR_NOMATCH)
2750
0
        {
2751
0
          if (!recalc_match_offsets (info, error))
2752
0
            info->matches = PCRE2_ERROR_NOMATCH;
2753
0
        }
2754
0
    }
2755
2756
0
  pcre2_code_free (pcre_re);
2757
2758
  /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only
2759
   * holds true for a single match, rather than matching all */
2760
2761
  /* set info->pos to -1 so that a call to g_match_info_next() fails. */
2762
0
  info->pos = -1;
2763
0
  retval = info->matches >= 0;
2764
2765
0
  if (match_info != NULL)
2766
0
    *match_info = info;
2767
0
  else
2768
0
    g_match_info_free (info);
2769
2770
0
  return retval;
2771
0
}
2772
2773
/**
2774
 * g_regex_get_string_number:
2775
 * @regex: #GRegex structure
2776
 * @name: name of the subexpression
2777
 *
2778
 * Retrieves the number of the subexpression named @name.
2779
 *
2780
 * Returns: The number of the subexpression or -1 if @name
2781
 *   does not exist
2782
 *
2783
 * Since: 2.14
2784
 */
2785
gint
2786
g_regex_get_string_number (const GRegex *regex,
2787
                           const gchar  *name)
2788
0
{
2789
0
  gint num;
2790
2791
0
  g_return_val_if_fail (regex != NULL, -1);
2792
0
  g_return_val_if_fail (name != NULL, -1);
2793
2794
0
  num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name);
2795
0
  if (num == PCRE2_ERROR_NOSUBSTRING)
2796
0
    num = -1;
2797
2798
0
  return num;
2799
0
}
2800
2801
/**
2802
 * g_regex_split_simple:
2803
 * @pattern: the regular expression
2804
 * @string: the string to scan for matches
2805
 * @compile_options: compile options for the regular expression, or 0
2806
 * @match_options: match options, or 0
2807
 *
2808
 * Breaks the string on the pattern, and returns an array of
2809
 * the tokens. If the pattern contains capturing parentheses,
2810
 * then the text for each of the substrings will also be returned.
2811
 * If the pattern does not match anywhere in the string, then the
2812
 * whole string is returned as the first token.
2813
 *
2814
 * This function is equivalent to g_regex_split() but it does
2815
 * not require compiling the pattern with g_regex_new(), avoiding
2816
 * some lines of code when you need just to do a split without
2817
 * extracting substrings, capture counts, and so on.
2818
 *
2819
 * If this function is to be called on the same @pattern more than
2820
 * once, it's more efficient to compile the pattern once with
2821
 * g_regex_new() and then use g_regex_split().
2822
 *
2823
 * As a special case, the result of splitting the empty string ""
2824
 * is an empty vector, not a vector containing a single string.
2825
 * The reason for this special case is that being able to represent
2826
 * an empty vector is typically more useful than consistent handling
2827
 * of empty elements. If you do need to represent empty elements,
2828
 * you'll need to check for the empty string before calling this
2829
 * function.
2830
 *
2831
 * A pattern that can match empty strings splits @string into
2832
 * separate characters wherever it matches the empty string between
2833
 * characters. For example splitting "ab c" using as a separator
2834
 * "\s*", you will get "a", "b" and "c".
2835
 *
2836
 * Returns: (transfer full): a %NULL-terminated array of strings. Free
2837
 * it using g_strfreev()
2838
 *
2839
 * Since: 2.14
2840
 **/
2841
gchar **
2842
g_regex_split_simple (const gchar        *pattern,
2843
                      const gchar        *string,
2844
                      GRegexCompileFlags  compile_options,
2845
                      GRegexMatchFlags    match_options)
2846
0
{
2847
0
  GRegex *regex;
2848
0
  gchar **result;
2849
2850
0
  regex = g_regex_new (pattern, compile_options, 0, NULL);
2851
0
  if (!regex)
2852
0
    return NULL;
2853
2854
0
  result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
2855
0
  g_regex_unref (regex);
2856
0
  return result;
2857
0
}
2858
2859
/**
2860
 * g_regex_split:
2861
 * @regex: a #GRegex structure
2862
 * @string: the string to split with the pattern
2863
 * @match_options: match time option flags
2864
 *
2865
 * Breaks the string on the pattern, and returns an array of the tokens.
2866
 * If the pattern contains capturing parentheses, then the text for each
2867
 * of the substrings will also be returned. If the pattern does not match
2868
 * anywhere in the string, then the whole string is returned as the first
2869
 * token.
2870
 *
2871
 * As a special case, the result of splitting the empty string "" is an
2872
 * empty vector, not a vector containing a single string. The reason for
2873
 * this special case is that being able to represent an empty vector is
2874
 * typically more useful than consistent handling of empty elements. If
2875
 * you do need to represent empty elements, you'll need to check for the
2876
 * empty string before calling this function.
2877
 *
2878
 * A pattern that can match empty strings splits @string into separate
2879
 * characters wherever it matches the empty string between characters.
2880
 * For example splitting "ab c" using as a separator "\s*", you will get
2881
 * "a", "b" and "c".
2882
 *
2883
 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2884
 * it using g_strfreev()
2885
 *
2886
 * Since: 2.14
2887
 **/
2888
gchar **
2889
g_regex_split (const GRegex     *regex,
2890
               const gchar      *string,
2891
               GRegexMatchFlags  match_options)
2892
0
{
2893
0
  return g_regex_split_full (regex, string, -1, 0,
2894
0
                             match_options, 0, NULL);
2895
0
}
2896
2897
/**
2898
 * g_regex_split_full:
2899
 * @regex: a #GRegex structure
2900
 * @string: (array length=string_len): the string to split with the pattern
2901
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2902
 * @start_position: starting index of the string to match, in bytes
2903
 * @match_options: match time option flags
2904
 * @max_tokens: the maximum number of tokens to split @string into.
2905
 *   If this is less than 1, the string is split completely
2906
 * @error: return location for a #GError
2907
 *
2908
 * Breaks the string on the pattern, and returns an array of the tokens.
2909
 * If the pattern contains capturing parentheses, then the text for each
2910
 * of the substrings will also be returned. If the pattern does not match
2911
 * anywhere in the string, then the whole string is returned as the first
2912
 * token.
2913
 *
2914
 * As a special case, the result of splitting the empty string "" is an
2915
 * empty vector, not a vector containing a single string. The reason for
2916
 * this special case is that being able to represent an empty vector is
2917
 * typically more useful than consistent handling of empty elements. If
2918
 * you do need to represent empty elements, you'll need to check for the
2919
 * empty string before calling this function.
2920
 *
2921
 * A pattern that can match empty strings splits @string into separate
2922
 * characters wherever it matches the empty string between characters.
2923
 * For example splitting "ab c" using as a separator "\s*", you will get
2924
 * "a", "b" and "c".
2925
 *
2926
 * Setting @start_position differs from just passing over a shortened
2927
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
2928
 * that begins with any kind of lookbehind assertion, such as "\b".
2929
 *
2930
 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2931
 * it using g_strfreev()
2932
 *
2933
 * Since: 2.14
2934
 **/
2935
gchar **
2936
g_regex_split_full (const GRegex      *regex,
2937
                    const gchar       *string,
2938
                    gssize             string_len,
2939
                    gint               start_position,
2940
                    GRegexMatchFlags   match_options,
2941
                    gint               max_tokens,
2942
                    GError           **error)
2943
0
{
2944
0
  GError *tmp_error = NULL;
2945
0
  GMatchInfo *match_info;
2946
0
  GList *list, *last;
2947
0
  gint i;
2948
0
  gint token_count;
2949
0
  gboolean match_ok;
2950
  /* position of the last separator. */
2951
0
  gint last_separator_end;
2952
  /* was the last match 0 bytes long? */
2953
0
  gboolean last_match_is_empty;
2954
  /* the returned array of char **s */
2955
0
  gchar **string_list;
2956
2957
0
  g_return_val_if_fail (regex != NULL, NULL);
2958
0
  g_return_val_if_fail (string != NULL, NULL);
2959
0
  g_return_val_if_fail (start_position >= 0, NULL);
2960
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2961
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2962
2963
0
  if (max_tokens <= 0)
2964
0
    max_tokens = G_MAXINT;
2965
2966
0
  if (string_len < 0)
2967
0
    string_len = strlen (string);
2968
2969
  /* zero-length string */
2970
0
  if (string_len - start_position == 0)
2971
0
    return g_new0 (gchar *, 1);
2972
2973
0
  if (max_tokens == 1)
2974
0
    {
2975
0
      string_list = g_new0 (gchar *, 2);
2976
0
      string_list[0] = g_strndup (&string[start_position],
2977
0
                                  string_len - start_position);
2978
0
      return string_list;
2979
0
    }
2980
2981
0
  list = NULL;
2982
0
  token_count = 0;
2983
0
  last_separator_end = start_position;
2984
0
  last_match_is_empty = FALSE;
2985
2986
0
  match_ok = g_regex_match_full (regex, string, string_len, start_position,
2987
0
                                 match_options, &match_info, &tmp_error);
2988
2989
0
  while (tmp_error == NULL)
2990
0
    {
2991
0
      if (match_ok)
2992
0
        {
2993
0
          last_match_is_empty =
2994
0
                    (match_info->offsets[0] == match_info->offsets[1]);
2995
2996
          /* we need to skip empty separators at the same position of the end
2997
           * of another separator. e.g. the string is "a b" and the separator
2998
           * is " *", so from 1 to 2 we have a match and at position 2 we have
2999
           * an empty match. */
3000
0
          if (last_separator_end != match_info->offsets[1])
3001
0
            {
3002
0
              gchar *token;
3003
0
              gint match_count;
3004
3005
0
              token = g_strndup (string + last_separator_end,
3006
0
                                 match_info->offsets[0] - last_separator_end);
3007
0
              list = g_list_prepend (list, token);
3008
0
              token_count++;
3009
3010
              /* if there were substrings, these need to be added to
3011
               * the list. */
3012
0
              match_count = g_match_info_get_match_count (match_info);
3013
0
              if (match_count > 1)
3014
0
                {
3015
0
                  for (i = 1; i < match_count; i++)
3016
0
                    list = g_list_prepend (list, g_match_info_fetch (match_info, i));
3017
0
                }
3018
0
            }
3019
0
        }
3020
0
      else
3021
0
        {
3022
          /* if there was no match, copy to end of string. */
3023
0
          if (!last_match_is_empty)
3024
0
            {
3025
0
              gchar *token = g_strndup (string + last_separator_end,
3026
0
                                        match_info->string_len - last_separator_end);
3027
0
              list = g_list_prepend (list, token);
3028
0
            }
3029
          /* no more tokens, end the loop. */
3030
0
          break;
3031
0
        }
3032
3033
      /* -1 to leave room for the last part. */
3034
0
      if (token_count >= max_tokens - 1)
3035
0
        {
3036
          /* we have reached the maximum number of tokens, so we copy
3037
           * the remaining part of the string. */
3038
0
          if (last_match_is_empty)
3039
0
            {
3040
              /* the last match was empty, so we have moved one char
3041
               * after the real position to avoid empty matches at the
3042
               * same position. */
3043
0
              match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
3044
0
            }
3045
          /* the if is needed in the case we have terminated the available
3046
           * tokens, but we are at the end of the string, so there are no
3047
           * characters left to copy. */
3048
0
          if (string_len > match_info->pos)
3049
0
            {
3050
0
              gchar *token = g_strndup (string + match_info->pos,
3051
0
                                        string_len - match_info->pos);
3052
0
              list = g_list_prepend (list, token);
3053
0
            }
3054
          /* end the loop. */
3055
0
          break;
3056
0
        }
3057
3058
0
      last_separator_end = match_info->pos;
3059
0
      if (last_match_is_empty)
3060
        /* if the last match was empty, g_match_info_next() has moved
3061
         * forward to avoid infinite loops, but we still need to copy that
3062
         * character. */
3063
0
        last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
3064
3065
0
      match_ok = g_match_info_next (match_info, &tmp_error);
3066
0
    }
3067
0
  g_match_info_free (match_info);
3068
0
  if (tmp_error != NULL)
3069
0
    {
3070
0
      g_propagate_error (error, tmp_error);
3071
0
      g_list_free_full (list, g_free);
3072
0
      return NULL;
3073
0
    }
3074
3075
0
  string_list = g_new (gchar *, g_list_length (list) + 1);
3076
0
  i = 0;
3077
0
  for (last = g_list_last (list); last; last = g_list_previous (last))
3078
0
    string_list[i++] = last->data;
3079
0
  string_list[i] = NULL;
3080
0
  g_list_free (list);
3081
3082
0
  return string_list;
3083
0
}
3084
3085
/* Kinds of replacement piece, stored in InterpolationData.type. */
enum
{
  REPL_TYPE_STRING             = 0,
  REPL_TYPE_CHARACTER          = 1,
  REPL_TYPE_SYMBOLIC_REFERENCE = 2,
  REPL_TYPE_NUMERIC_REFERENCE  = 3,
  REPL_TYPE_CHANGE_CASE        = 4
};
3093
3094
/* Case-conversion modes. Each mode is a distinct bit so that the *_MASK
 * values below can test for a group of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,
  CHANGE_CASE_UPPER        = 0x02,
  CHANGE_CASE_LOWER        = 0x04,
  CHANGE_CASE_UPPER_SINGLE = 0x08,
  CHANGE_CASE_LOWER_SINGLE = 0x10,
  /* either of the two *_SINGLE modes */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two lower-casing modes */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two upper-casing modes */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
3105
3106
struct _InterpolationData
3107
{
3108
  gchar     *text;
3109
  gint       type;
3110
  gint       num;
3111
  gchar      c;
3112
  ChangeCase change_case;
3113
};
3114
3115
static void
3116
free_interpolation_data (InterpolationData *data)
3117
0
{
3118
0
  g_free (data->text);
3119
0
  g_free (data);
3120
0
}
3121
3122
static const gchar *
3123
expand_escape (const gchar        *replacement,
3124
               const gchar        *p,
3125
               InterpolationData  *data,
3126
               GError            **error)
3127
0
{
3128
0
  const gchar *q, *r;
3129
0
  gint x, d, h, i;
3130
0
  const gchar *error_detail;
3131
0
  gint base = 0;
3132
0
  GError *tmp_error = NULL;
3133
3134
0
  p++;
3135
0
  switch (*p)
3136
0
    {
3137
0
    case 't':
3138
0
      p++;
3139
0
      data->c = '\t';
3140
0
      data->type = REPL_TYPE_CHARACTER;
3141
0
      break;
3142
0
    case 'n':
3143
0
      p++;
3144
0
      data->c = '\n';
3145
0
      data->type = REPL_TYPE_CHARACTER;
3146
0
      break;
3147
0
    case 'v':
3148
0
      p++;
3149
0
      data->c = '\v';
3150
0
      data->type = REPL_TYPE_CHARACTER;
3151
0
      break;
3152
0
    case 'r':
3153
0
      p++;
3154
0
      data->c = '\r';
3155
0
      data->type = REPL_TYPE_CHARACTER;
3156
0
      break;
3157
0
    case 'f':
3158
0
      p++;
3159
0
      data->c = '\f';
3160
0
      data->type = REPL_TYPE_CHARACTER;
3161
0
      break;
3162
0
    case 'a':
3163
0
      p++;
3164
0
      data->c = '\a';
3165
0
      data->type = REPL_TYPE_CHARACTER;
3166
0
      break;
3167
0
    case 'b':
3168
0
      p++;
3169
0
      data->c = '\b';
3170
0
      data->type = REPL_TYPE_CHARACTER;
3171
0
      break;
3172
0
    case '\\':
3173
0
      p++;
3174
0
      data->c = '\\';
3175
0
      data->type = REPL_TYPE_CHARACTER;
3176
0
      break;
3177
0
    case 'x':
3178
0
      p++;
3179
0
      x = 0;
3180
0
      if (*p == '{')
3181
0
        {
3182
0
          p++;
3183
0
          do
3184
0
            {
3185
0
              h = g_ascii_xdigit_value (*p);
3186
0
              if (h < 0)
3187
0
                {
3188
0
                  error_detail = _("hexadecimal digit or “}” expected");
3189
0
                  goto error;
3190
0
                }
3191
0
              x = x * 16 + h;
3192
0
              p++;
3193
0
            }
3194
0
          while (*p != '}');
3195
0
          p++;
3196
0
        }
3197
0
      else
3198
0
        {
3199
0
          for (i = 0; i < 2; i++)
3200
0
            {
3201
0
              h = g_ascii_xdigit_value (*p);
3202
0
              if (h < 0)
3203
0
                {
3204
0
                  error_detail = _("hexadecimal digit expected");
3205
0
                  goto error;
3206
0
                }
3207
0
              x = x * 16 + h;
3208
0
              p++;
3209
0
            }
3210
0
        }
3211
0
      data->type = REPL_TYPE_STRING;
3212
0
      data->text = g_new0 (gchar, 8);
3213
0
      g_unichar_to_utf8 (x, data->text);
3214
0
      break;
3215
0
    case 'l':
3216
0
      p++;
3217
0
      data->type = REPL_TYPE_CHANGE_CASE;
3218
0
      data->change_case = CHANGE_CASE_LOWER_SINGLE;
3219
0
      break;
3220
0
    case 'u':
3221
0
      p++;
3222
0
      data->type = REPL_TYPE_CHANGE_CASE;
3223
0
      data->change_case = CHANGE_CASE_UPPER_SINGLE;
3224
0
      break;
3225
0
    case 'L':
3226
0
      p++;
3227
0
      data->type = REPL_TYPE_CHANGE_CASE;
3228
0
      data->change_case = CHANGE_CASE_LOWER;
3229
0
      break;
3230
0
    case 'U':
3231
0
      p++;
3232
0
      data->type = REPL_TYPE_CHANGE_CASE;
3233
0
      data->change_case = CHANGE_CASE_UPPER;
3234
0
      break;
3235
0
    case 'E':
3236
0
      p++;
3237
0
      data->type = REPL_TYPE_CHANGE_CASE;
3238
0
      data->change_case = CHANGE_CASE_NONE;
3239
0
      break;
3240
0
    case 'g':
3241
0
      p++;
3242
0
      if (*p != '<')
3243
0
        {
3244
0
          error_detail = _("missing “<” in symbolic reference");
3245
0
          goto error;
3246
0
        }
3247
0
      q = p + 1;
3248
0
      do
3249
0
        {
3250
0
          p++;
3251
0
          if (!*p)
3252
0
            {
3253
0
              error_detail = _("unfinished symbolic reference");
3254
0
              goto error;
3255
0
            }
3256
0
        }
3257
0
      while (*p != '>');
3258
0
      if (p - q == 0)
3259
0
        {
3260
0
          error_detail = _("zero-length symbolic reference");
3261
0
          goto error;
3262
0
        }
3263
0
      if (g_ascii_isdigit (*q))
3264
0
        {
3265
0
          x = 0;
3266
0
          do
3267
0
            {
3268
0
              h = g_ascii_digit_value (*q);
3269
0
              if (h < 0)
3270
0
                {
3271
0
                  error_detail = _("digit expected");
3272
0
                  p = q;
3273
0
                  goto error;
3274
0
                }
3275
0
              x = x * 10 + h;
3276
0
              q++;
3277
0
            }
3278
0
          while (q != p);
3279
0
          data->num = x;
3280
0
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
3281
0
        }
3282
0
      else
3283
0
        {
3284
0
          r = q;
3285
0
          do
3286
0
            {
3287
0
              if (!g_ascii_isalnum (*r))
3288
0
                {
3289
0
                  error_detail = _("illegal symbolic reference");
3290
0
                  p = r;
3291
0
                  goto error;
3292
0
                }
3293
0
              r++;
3294
0
            }
3295
0
          while (r != p);
3296
0
          data->text = g_strndup (q, p - q);
3297
0
          data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
3298
0
        }
3299
0
      p++;
3300
0
      break;
3301
0
    case '0':
3302
      /* if \0 is followed by a number is an octal number representing a
3303
       * character, else it is a numeric reference. */
3304
0
      if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
3305
0
        {
3306
0
          base = 8;
3307
0
          p = g_utf8_next_char (p);
3308
0
        }
3309
0
      G_GNUC_FALLTHROUGH;
3310
0
    case '1':
3311
0
    case '2':
3312
0
    case '3':
3313
0
    case '4':
3314
0
    case '5':
3315
0
    case '6':
3316
0
    case '7':
3317
0
    case '8':
3318
0
    case '9':
3319
0
      x = 0;
3320
0
      d = 0;
3321
0
      for (i = 0; i < 3; i++)
3322
0
        {
3323
0
          h = g_ascii_digit_value (*p);
3324
0
          if (h < 0)
3325
0
            break;
3326
0
          if (h > 7)
3327
0
            {
3328
0
              if (base == 8)
3329
0
                break;
3330
0
              else
3331
0
                base = 10;
3332
0
            }
3333
0
          if (i == 2 && base == 10)
3334
0
            break;
3335
0
          x = x * 8 + h;
3336
0
          d = d * 10 + h;
3337
0
          p++;
3338
0
        }
3339
0
      if (base == 8 || i == 3)
3340
0
        {
3341
0
          data->type = REPL_TYPE_STRING;
3342
0
          data->text = g_new0 (gchar, 8);
3343
0
          g_unichar_to_utf8 (x, data->text);
3344
0
        }
3345
0
      else
3346
0
        {
3347
0
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
3348
0
          data->num = d;
3349
0
        }
3350
0
      break;
3351
0
    case 0:
3352
0
      error_detail = _("stray final “\\”");
3353
0
      goto error;
3354
0
      break;
3355
0
    default:
3356
0
      error_detail = _("unknown escape sequence");
3357
0
      goto error;
3358
0
    }
3359
3360
0
  return p;
3361
3362
0
 error:
3363
  /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
3364
0
  tmp_error = g_error_new (G_REGEX_ERROR,
3365
0
                           G_REGEX_ERROR_REPLACE,
3366
0
                           _("Error while parsing replacement "
3367
0
                             "text “%s” at char %lu: %s"),
3368
0
                           replacement,
3369
0
                           (gulong)(p - replacement),
3370
0
                           error_detail);
3371
0
  g_propagate_error (error, tmp_error);
3372
3373
0
  return NULL;
3374
0
}
3375
3376
static GList *
3377
split_replacement (const gchar  *replacement,
3378
                   GError      **error)
3379
0
{
3380
0
  GList *list = NULL;
3381
0
  InterpolationData *data;
3382
0
  const gchar *p, *start;
3383
3384
0
  start = p = replacement;
3385
0
  while (*p)
3386
0
    {
3387
0
      if (*p == '\\')
3388
0
        {
3389
0
          data = g_new0 (InterpolationData, 1);
3390
0
          start = p = expand_escape (replacement, p, data, error);
3391
0
          if (p == NULL)
3392
0
            {
3393
0
              g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3394
0
              free_interpolation_data (data);
3395
3396
0
              return NULL;
3397
0
            }
3398
0
          list = g_list_prepend (list, data);
3399
0
        }
3400
0
      else
3401
0
        {
3402
0
          p++;
3403
0
          if (*p == '\\' || *p == '\0')
3404
0
            {
3405
0
              if (p - start > 0)
3406
0
                {
3407
0
                  data = g_new0 (InterpolationData, 1);
3408
0
                  data->text = g_strndup (start, p - start);
3409
0
                  data->type = REPL_TYPE_STRING;
3410
0
                  list = g_list_prepend (list, data);
3411
0
                }
3412
0
            }
3413
0
        }
3414
0
    }
3415
3416
0
  return g_list_reverse (list);
3417
0
}
3418
3419
/* Converts the character c according to change_case: lower-cased when any
 * bit of CHANGE_CASE_LOWER_MASK is set, upper-cased otherwise. Note that
 * "otherwise" includes CHANGE_CASE_NONE. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
                ? g_unichar_tolower (c) \
                : g_unichar_toupper (c))
3424
3425
static void
3426
string_append (GString     *string,
3427
               const gchar *text,
3428
               ChangeCase  *change_case)
3429
0
{
3430
0
  gunichar c;
3431
3432
0
  if (text[0] == '\0')
3433
0
    return;
3434
3435
0
  if (*change_case == CHANGE_CASE_NONE)
3436
0
    {
3437
0
      g_string_append (string, text);
3438
0
    }
3439
0
  else if (*change_case & CHANGE_CASE_SINGLE_MASK)
3440
0
    {
3441
0
      c = g_utf8_get_char (text);
3442
0
      g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3443
0
      g_string_append (string, g_utf8_next_char (text));
3444
0
      *change_case = CHANGE_CASE_NONE;
3445
0
    }
3446
0
  else
3447
0
    {
3448
0
      while (*text != '\0')
3449
0
        {
3450
0
          c = g_utf8_get_char (text);
3451
0
          g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
3452
0
          text = g_utf8_next_char (text);
3453
0
        }
3454
0
    }
3455
0
}
3456
3457
static gboolean
3458
interpolate_replacement (const GMatchInfo *match_info,
3459
                         GString          *result,
3460
                         gpointer          data)
3461
0
{
3462
0
  GList *list;
3463
0
  InterpolationData *idata;
3464
0
  gchar *match;
3465
0
  ChangeCase change_case = CHANGE_CASE_NONE;
3466
3467
0
  for (list = data; list; list = list->next)
3468
0
    {
3469
0
      idata = list->data;
3470
0
      switch (idata->type)
3471
0
        {
3472
0
        case REPL_TYPE_STRING:
3473
0
          string_append (result, idata->text, &change_case);
3474
0
          break;
3475
0
        case REPL_TYPE_CHARACTER:
3476
0
          g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
3477
0
          if (change_case & CHANGE_CASE_SINGLE_MASK)
3478
0
            change_case = CHANGE_CASE_NONE;
3479
0
          break;
3480
0
        case REPL_TYPE_NUMERIC_REFERENCE:
3481
0
          match = g_match_info_fetch (match_info, idata->num);
3482
0
          if (match)
3483
0
            {
3484
0
              string_append (result, match, &change_case);
3485
0
              g_free (match);
3486
0
            }
3487
0
          break;
3488
0
        case REPL_TYPE_SYMBOLIC_REFERENCE:
3489
0
          match = g_match_info_fetch_named (match_info, idata->text);
3490
0
          if (match)
3491
0
            {
3492
0
              string_append (result, match, &change_case);
3493
0
              g_free (match);
3494
0
            }
3495
0
          break;
3496
0
        case REPL_TYPE_CHANGE_CASE:
3497
0
          change_case = idata->change_case;
3498
0
          break;
3499
0
        }
3500
0
    }
3501
3502
0
  return FALSE;
3503
0
}
3504
3505
/* whether actual match_info is needed for replacement, i.e.
3506
 * whether there are references
3507
 */
3508
static gboolean
3509
interpolation_list_needs_match (GList *list)
3510
0
{
3511
0
  while (list != NULL)
3512
0
    {
3513
0
      InterpolationData *data = list->data;
3514
3515
0
      if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
3516
0
          data->type == REPL_TYPE_NUMERIC_REFERENCE)
3517
0
        {
3518
0
          return TRUE;
3519
0
        }
3520
3521
0
      list = list->next;
3522
0
    }
3523
3524
0
  return FALSE;
3525
0
}
3526
3527
/**
3528
 * g_regex_replace:
3529
 * @regex: a #GRegex structure
3530
 * @string: (array length=string_len): the string to perform matches against
3531
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3532
 * @start_position: starting index of the string to match, in bytes
3533
 * @replacement: text to replace each match with
3534
 * @match_options: options for the match
3535
 * @error: location to store the error occurring, or %NULL to ignore errors
3536
 *
3537
 * Replaces all occurrences of the pattern in @regex with the
3538
 * replacement text. Backreferences of the form `\number` or
3539
 * `\g<number>` in the replacement text are interpolated by the
3540
 * number-th captured subexpression of the match, `\g<name>` refers
3541
 * to the captured subexpression with the given name. `\0` refers
3542
 * to the complete match, but `\0` followed by a number is the octal
3543
 * representation of a character. To include a literal `\` in the
3544
 * replacement, write `\\\\`.
3545
 *
3546
 * There are also escapes that change the case of the following text:
3547
 *
3548
 * - \l: Convert to lower case the next character
3549
 * - \u: Convert to upper case the next character
3550
 * - \L: Convert to lower case till \E
3551
 * - \U: Convert to upper case till \E
3552
 * - \E: End case modification
3553
 *
3554
 * If you do not need to use backreferences use g_regex_replace_literal().
3555
 *
3556
 * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was
3557
 * passed to g_regex_new(). If you want to use non-UTF-8 encoded strings
3558
 * you can use g_regex_replace_literal().
3559
 *
3560
 * Setting @start_position differs from just passing over a shortened
3561
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that
3562
 * begins with any kind of lookbehind assertion, such as "\b".
3563
 *
3564
 * Returns: a newly allocated string containing the replacements
3565
 *
3566
 * Since: 2.14
3567
 */
3568
gchar *
3569
g_regex_replace (const GRegex      *regex,
3570
                 const gchar       *string,
3571
                 gssize             string_len,
3572
                 gint               start_position,
3573
                 const gchar       *replacement,
3574
                 GRegexMatchFlags   match_options,
3575
                 GError           **error)
3576
0
{
3577
0
  gchar *result;
3578
0
  GList *list;
3579
0
  GError *tmp_error = NULL;
3580
3581
0
  g_return_val_if_fail (regex != NULL, NULL);
3582
0
  g_return_val_if_fail (string != NULL, NULL);
3583
0
  g_return_val_if_fail (start_position >= 0, NULL);
3584
0
  g_return_val_if_fail (replacement != NULL, NULL);
3585
0
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
3586
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3587
3588
0
  list = split_replacement (replacement, &tmp_error);
3589
0
  if (tmp_error != NULL)
3590
0
    {
3591
0
      g_propagate_error (error, tmp_error);
3592
0
      return NULL;
3593
0
    }
3594
3595
0
  result = g_regex_replace_eval (regex,
3596
0
                                 string, string_len, start_position,
3597
0
                                 match_options,
3598
0
                                 interpolate_replacement,
3599
0
                                 (gpointer)list,
3600
0
                                 &tmp_error);
3601
0
  if (tmp_error != NULL)
3602
0
    g_propagate_error (error, tmp_error);
3603
3604
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3605
3606
0
  return result;
3607
0
}
3608
3609
static gboolean
3610
literal_replacement (const GMatchInfo *match_info,
3611
                     GString          *result,
3612
                     gpointer          data)
3613
0
{
3614
0
  g_string_append (result, data);
3615
0
  return FALSE;
3616
0
}
3617
3618
/**
3619
 * g_regex_replace_literal:
3620
 * @regex: a #GRegex structure
3621
 * @string: (array length=string_len): the string to perform matches against
3622
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3623
 * @start_position: starting index of the string to match, in bytes
3624
 * @replacement: text to replace each match with
3625
 * @match_options: options for the match
3626
 * @error: location to store the error occurring, or %NULL to ignore errors
3627
 *
3628
 * Replaces all occurrences of the pattern in @regex with the
3629
 * replacement text. @replacement is replaced literally, to
3630
 * include backreferences use g_regex_replace().
3631
 *
3632
 * Setting @start_position differs from just passing over a
3633
 * shortened string and setting %G_REGEX_MATCH_NOTBOL in the
3634
 * case of a pattern that begins with any kind of lookbehind
3635
 * assertion, such as "\b".
3636
 *
3637
 * Returns: a newly allocated string containing the replacements
3638
 *
3639
 * Since: 2.14
3640
 */
3641
gchar *
3642
g_regex_replace_literal (const GRegex      *regex,
3643
                         const gchar       *string,
3644
                         gssize             string_len,
3645
                         gint               start_position,
3646
                         const gchar       *replacement,
3647
                         GRegexMatchFlags   match_options,
3648
                         GError           **error)
3649
0
{
3650
0
  g_return_val_if_fail (replacement != NULL, NULL);
3651
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3652
3653
0
  return g_regex_replace_eval (regex,
3654
0
                               string, string_len, start_position,
3655
0
                               match_options,
3656
0
                               literal_replacement,
3657
0
                               (gpointer)replacement,
3658
0
                               error);
3659
0
}
3660
3661
/**
3662
 * g_regex_replace_eval:
3663
 * @regex: a #GRegex structure from g_regex_new()
3664
 * @string: (array length=string_len): string to perform matches against
3665
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3666
 * @start_position: starting index of the string to match, in bytes
3667
 * @match_options: options for the match
3668
 * @eval: (scope call): a function to call for each match
3669
 * @user_data: user data to pass to the function
3670
 * @error: location to store the error occurring, or %NULL to ignore errors
3671
 *
3672
 * Replaces occurrences of the pattern in regex with the output of
3673
 * @eval for that occurrence.
3674
 *
3675
 * Setting @start_position differs from just passing over a shortened
3676
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
3677
 * that begins with any kind of lookbehind assertion, such as "\b".
3678
 *
3679
 * The following example uses g_regex_replace_eval() to replace multiple
3680
 * strings at once:
3681
 * |[<!-- language="C" --> 
3682
 * static gboolean
3683
 * eval_cb (const GMatchInfo *info,
3684
 *          GString          *res,
3685
 *          gpointer          data)
3686
 * {
3687
 *   gchar *match;
3688
 *   gchar *r;
3689
 *
3690
 *    match = g_match_info_fetch (info, 0);
3691
 *    r = g_hash_table_lookup ((GHashTable *)data, match);
3692
 *    g_string_append (res, r);
3693
 *    g_free (match);
3694
 *
3695
 *    return FALSE;
3696
 * }
3697
 *
3698
 * ...
3699
 *
3700
 * GRegex *reg;
3701
 * GHashTable *h;
3702
 * gchar *res;
3703
 *
3704
 * h = g_hash_table_new (g_str_hash, g_str_equal);
3705
 *
3706
 * g_hash_table_insert (h, "1", "ONE");
3707
 * g_hash_table_insert (h, "2", "TWO");
3708
 * g_hash_table_insert (h, "3", "THREE");
3709
 * g_hash_table_insert (h, "4", "FOUR");
3710
 *
3711
 * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL);
3712
 * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3713
 * g_hash_table_destroy (h);
3714
 *
3715
 * ...
3716
 * ]|
3717
 *
3718
 * Returns: a newly allocated string containing the replacements
3719
 *
3720
 * Since: 2.14
3721
 */
3722
gchar *
3723
g_regex_replace_eval (const GRegex        *regex,
3724
                      const gchar         *string,
3725
                      gssize               string_len,
3726
                      gint                 start_position,
3727
                      GRegexMatchFlags     match_options,
3728
                      GRegexEvalCallback   eval,
3729
                      gpointer             user_data,
3730
                      GError             **error)
3731
0
{
3732
0
  GMatchInfo *match_info;
3733
0
  GString *result;
3734
0
  gint str_pos = 0;
3735
0
  gboolean done = FALSE;
3736
0
  GError *tmp_error = NULL;
3737
3738
0
  g_return_val_if_fail (regex != NULL, NULL);
3739
0
  g_return_val_if_fail (string != NULL, NULL);
3740
0
  g_return_val_if_fail (start_position >= 0, NULL);
3741
0
  g_return_val_if_fail (eval != NULL, NULL);
3742
0
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3743
3744
0
  if (string_len < 0)
3745
0
    string_len = strlen (string);
3746
3747
0
  result = g_string_sized_new (string_len);
3748
3749
  /* run down the string making matches. */
3750
0
  g_regex_match_full (regex, string, string_len, start_position,
3751
0
                      match_options, &match_info, &tmp_error);
3752
0
  while (!done && g_match_info_matches (match_info))
3753
0
    {
3754
0
      g_string_append_len (result,
3755
0
                           string + str_pos,
3756
0
                           match_info->offsets[0] - str_pos);
3757
0
      done = (*eval) (match_info, result, user_data);
3758
0
      str_pos = match_info->offsets[1];
3759
0
      g_match_info_next (match_info, &tmp_error);
3760
0
    }
3761
0
  g_match_info_free (match_info);
3762
0
  if (tmp_error != NULL)
3763
0
    {
3764
0
      g_propagate_error (error, tmp_error);
3765
0
      g_string_free (result, TRUE);
3766
0
      return NULL;
3767
0
    }
3768
3769
0
  g_string_append_len (result, string + str_pos, string_len - str_pos);
3770
0
  return g_string_free (result, FALSE);
3771
0
}
3772
3773
/**
3774
 * g_regex_check_replacement:
3775
 * @replacement: the replacement string
3776
 * @has_references: (out) (optional): location to store information about
3777
 *   references in @replacement or %NULL
3778
 * @error: location to store error
3779
 *
3780
 * Checks whether @replacement is a valid replacement string
3781
 * (see g_regex_replace()), i.e. that all escape sequences in
3782
 * it are valid.
3783
 *
3784
 * If @has_references is not %NULL then @replacement is checked
3785
 * for pattern references. For instance, replacement text 'foo\n'
3786
 * does not contain references and may be evaluated without information
3787
 * about actual match, but '\0\1' (whole match followed by first
3788
 * subpattern) requires a valid #GMatchInfo object.
3789
 *
3790
 * Returns: whether @replacement is a valid replacement string
3791
 *
3792
 * Since: 2.14
3793
 */
3794
gboolean
3795
g_regex_check_replacement (const gchar  *replacement,
3796
                           gboolean     *has_references,
3797
                           GError      **error)
3798
0
{
3799
0
  GList *list;
3800
0
  GError *tmp = NULL;
3801
3802
0
  list = split_replacement (replacement, &tmp);
3803
3804
0
  if (tmp)
3805
0
  {
3806
0
    g_propagate_error (error, tmp);
3807
0
    return FALSE;
3808
0
  }
3809
3810
0
  if (has_references)
3811
0
    *has_references = interpolation_list_needs_match (list);
3812
3813
0
  g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3814
3815
0
  return TRUE;
3816
0
}
3817
3818
/**
3819
 * g_regex_escape_nul:
3820
 * @string: the string to escape
3821
 * @length: the length of @string
3822
 *
3823
 * Escapes the nul characters in @string to "\x00".  It can be used
3824
 * to compile a regex with embedded nul characters.
3825
 *
3826
 * For completeness, @length can be -1 for a nul-terminated string.
3827
 * In this case the output string will be of course equal to @string.
3828
 *
3829
 * Returns: a newly-allocated escaped string
3830
 *
3831
 * Since: 2.30
3832
 */
3833
gchar *
3834
g_regex_escape_nul (const gchar *string,
3835
                    gint         length)
3836
0
{
3837
0
  GString *escaped;
3838
0
  const gchar *p, *piece_start, *end;
3839
0
  gint backslashes;
3840
3841
0
  g_return_val_if_fail (string != NULL, NULL);
3842
3843
0
  if (length < 0)
3844
0
    return g_strdup (string);
3845
3846
0
  end = string + length;
3847
0
  p = piece_start = string;
3848
0
  escaped = g_string_sized_new (length + 1);
3849
3850
0
  backslashes = 0;
3851
0
  while (p < end)
3852
0
    {
3853
0
      switch (*p)
3854
0
        {
3855
0
        case '\0':
3856
0
          if (p != piece_start)
3857
0
            {
3858
              /* copy the previous piece. */
3859
0
              g_string_append_len (escaped, piece_start, p - piece_start);
3860
0
            }
3861
0
          if ((backslashes & 1) == 0)
3862
0
            g_string_append_c (escaped, '\\');
3863
0
          g_string_append_c (escaped, 'x');
3864
0
          g_string_append_c (escaped, '0');
3865
0
          g_string_append_c (escaped, '0');
3866
0
          piece_start = ++p;
3867
0
          backslashes = 0;
3868
0
          break;
3869
0
        case '\\':
3870
0
          backslashes++;
3871
0
          ++p;
3872
0
          break;
3873
0
        default:
3874
0
          backslashes = 0;
3875
0
          p = g_utf8_next_char (p);
3876
0
          break;
3877
0
        }
3878
0
    }
3879
3880
0
  if (piece_start < end)
3881
0
    g_string_append_len (escaped, piece_start, end - piece_start);
3882
3883
0
  return g_string_free (escaped, FALSE);
3884
0
}
3885
3886
/**
3887
 * g_regex_escape_string:
3888
 * @string: the string to escape
3889
 * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3890
 *
3891
 * Escapes the special characters used for regular expressions
3892
 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3893
 * function is useful to dynamically generate regular expressions.
3894
 *
3895
 * @string can contain nul characters that are replaced with "\0",
3896
 * in this case remember to specify the correct length of @string
3897
 * in @length.
3898
 *
3899
 * Returns: a newly-allocated escaped string
3900
 *
3901
 * Since: 2.14
3902
 */
3903
gchar *
3904
g_regex_escape_string (const gchar *string,
3905
                       gint         length)
3906
0
{
3907
0
  GString *escaped;
3908
0
  const char *p, *piece_start, *end;
3909
3910
0
  g_return_val_if_fail (string != NULL, NULL);
3911
3912
0
  if (length < 0)
3913
0
    length = strlen (string);
3914
3915
0
  end = string + length;
3916
0
  p = piece_start = string;
3917
0
  escaped = g_string_sized_new (length + 1);
3918
3919
0
  while (p < end)
3920
0
    {
3921
0
      switch (*p)
3922
0
        {
3923
0
        case '\0':
3924
0
        case '\\':
3925
0
        case '|':
3926
0
        case '(':
3927
0
        case ')':
3928
0
        case '[':
3929
0
        case ']':
3930
0
        case '{':
3931
0
        case '}':
3932
0
        case '^':
3933
0
        case '$':
3934
0
        case '*':
3935
0
        case '+':
3936
0
        case '?':
3937
0
        case '.':
3938
0
          if (p != piece_start)
3939
            /* copy the previous piece. */
3940
0
            g_string_append_len (escaped, piece_start, p - piece_start);
3941
0
          g_string_append_c (escaped, '\\');
3942
0
          if (*p == '\0')
3943
0
            g_string_append_c (escaped, '0');
3944
0
          else
3945
0
            g_string_append_c (escaped, *p);
3946
0
          piece_start = ++p;
3947
0
          break;
3948
0
        default:
3949
0
          p = g_utf8_next_char (p);
3950
0
          break;
3951
0
        }
3952
0
  }
3953
3954
0
  if (piece_start < end)
3955
0
    g_string_append_len (escaped, piece_start, end - piece_start);
3956
3957
0
  return g_string_free (escaped, FALSE);
3958
0
}