Line | Count | Source (jump to first uncovered line) |
1 | | /* GRegex -- regular expression API wrapper around PCRE. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Scott Wimer |
4 | | * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
5 | | * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
6 | | * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> |
7 | | * |
8 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
9 | | * |
10 | | * This library is free software; you can redistribute it and/or |
11 | | * modify it under the terms of the GNU Lesser General Public |
12 | | * License as published by the Free Software Foundation; either |
13 | | * version 2.1 of the License, or (at your option) any later version. |
14 | | * |
15 | | * This library is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | | * Lesser General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU Lesser General Public License |
21 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
22 | | */ |
23 | | |
24 | | #include "config.h" |
25 | | |
26 | | #include <stdint.h> |
27 | | #include <string.h> |
28 | | |
29 | | #define PCRE2_CODE_UNIT_WIDTH 8 |
30 | | #include <pcre2.h> |
31 | | |
32 | | #include "gtypes.h" |
33 | | #include "gregex.h" |
34 | | #include "glibintl.h" |
35 | | #include "glist.h" |
36 | | #include "gmessages.h" |
37 | | #include "gstrfuncs.h" |
38 | | #include "gatomic.h" |
39 | | #include "gtestutils.h" |
40 | | #include "gthread.h" |
41 | | |
42 | | /** |
43 | | * GRegex: |
44 | | * |
45 | | * A `GRegex` is a compiled form of a regular expression. |
46 | | * |
47 | | * After instantiating a `GRegex`, you can use its methods to find matches |
48 | | * in a string, replace matches within a string, or split the string at matches. |
49 | | * |
50 | | * `GRegex` implements regular expression pattern matching using syntax and |
51 | | * semantics (such as character classes, quantifiers, and capture groups) |
52 | | * similar to Perl regular expressions. See the |
53 | | * [PCRE documentation](man:pcre2pattern(3)) for details. |
54 | | * |
55 | | * A typical scenario for regex pattern matching is to check if a string |
56 | | * matches a pattern. The following statements implement this scenario. |
57 | | * |
58 | | * ``` { .c } |
59 | | * const char *regex_pattern = ".*GLib.*"; |
60 | | * const char *string_to_search = "You will love the GLib implementation of regex"; |
61 | | * g_autoptr(GMatchInfo) match_info = NULL; |
62 | | * g_autoptr(GRegex) regex = NULL; |
63 | | * |
64 | | * regex = g_regex_new (regex_pattern, G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
65 | | * g_assert (regex != NULL); |
66 | | * |
67 | | * if (g_regex_match (regex, string_to_search, G_REGEX_MATCH_DEFAULT, &match_info)) |
68 | | * { |
69 | | * int start_pos, end_pos; |
70 | | * g_match_info_fetch_pos (match_info, 0, &start_pos, &end_pos); |
71 | | * g_print ("Match successful! Overall pattern matches bytes %d to %d\n", start_pos, end_pos); |
72 | | * } |
73 | | * else |
74 | | * { |
75 | | * g_print ("No match!\n"); |
76 | | * } |
77 | | * ``` |
78 | | * |
79 | | * The constructor for `GRegex` includes two sets of bitmapped flags: |
80 | | |
81 | | * * [flags@GLib.RegexCompileFlags]—These flags |
82 | | * control how GLib compiles the regex. There are options for case |
83 | | * sensitivity, multiline, ignoring whitespace, etc. |
84 | | * * [flags@GLib.RegexMatchFlags]—These flags control |
85 | | * `GRegex`’s matching behavior, such as anchoring and customizing definitions |
86 | | * for newline characters. |
87 | | * |
88 | | * Some regex patterns include backslash assertions, such as `\d` (digit) or |
89 | | * `\D` (non-digit). The regex pattern must escape those backslashes. For |
90 | | * example, the pattern `"\\d\\D"` matches a digit followed by a non-digit. |
91 | | * |
92 | | * GLib’s implementation of pattern matching includes a `start_position` |
93 | | * argument for some of the match, replace, and split methods. Specifying |
94 | | * a start position provides flexibility when you want to ignore the first |
95 | | * _n_ characters of a string, but want to incorporate backslash assertions |
96 | | * at character _n_ - 1. For example, a database field contains inconsistent |
97 | | * spelling for a job title: `healthcare provider` and `health-care provider`. |
98 | | * The database manager wants to make the spelling consistent by adding a |
99 | | * hyphen when it is missing. The following regex pattern tests for the string |
100 | | * `care` preceded by a non-word boundary character (instead of a hyphen) |
101 | | * and followed by a space. |
102 | | * |
103 | | * ``` { .c } |
104 | | * const char *regex_pattern = "\\Bcare\\s"; |
105 | | * ``` |
106 | | * |
107 | | * An efficient way to match with this pattern is to start examining at |
108 | | * `start_position` 6 in the string `healthcare` or `health-care`. |
109 | | |
110 | | * ``` { .c } |
111 | | * const char *regex_pattern = "\\Bcare\\s"; |
112 | | * const char *string_to_search = "healthcare provider"; |
113 | | * g_autoptr(GMatchInfo) match_info = NULL; |
114 | | * g_autoptr(GRegex) regex = NULL; |
115 | | * |
116 | | * regex = g_regex_new ( |
117 | | * regex_pattern, |
118 | | * G_REGEX_DEFAULT, |
119 | | * G_REGEX_MATCH_DEFAULT, |
120 | | * NULL); |
121 | | * g_assert (regex != NULL); |
122 | | * |
123 | | * g_regex_match_full ( |
124 | | * regex, |
125 | | * string_to_search, |
126 | | * -1, |
127 | | * 6, // position of 'c' in the test string. |
128 | | * G_REGEX_MATCH_DEFAULT, |
129 | | * &match_info, |
130 | | * NULL); |
131 | | * ``` |
132 | | * |
133 | | * The method [method@GLib.Regex.match_full] (and other methods implementing |
134 | | * `start_pos`) allow for lookback before the start position to determine if |
135 | | * the previous character satisfies an assertion. |
136 | | * |
137 | | * Unless you set the [flags@GLib.RegexCompileFlags.RAW] as one of |
138 | | * the `GRegexCompileFlags`, all the strings passed to `GRegex` methods must |
139 | | * be encoded in UTF-8. The lengths and the positions inside the strings are |
140 | | * in bytes and not in characters, so, for instance, `\xc3\xa0` (i.e., `à`) |
141 | | * is two bytes long but it is treated as a single character. If you set |
142 | | * `G_REGEX_RAW`, the strings can be non-valid UTF-8 strings and a byte is |
143 | | * treated as a character, so `\xc3\xa0` is two bytes and two characters long. |
144 | | * |
145 | | * Regarding line endings, `\n` matches a `\n` character, and `\r` matches |
146 | | * a `\r` character. More generally, `\R` matches all typical line endings: |
147 | | * CR + LF (`\r\n`), LF (linefeed, U+000A, `\n`), VT (vertical tab, U+000B, |
148 | | * `\v`), FF (formfeed, U+000C, `\f`), CR (carriage return, U+000D, `\r`), |
149 | | * NEL (next line, U+0085), LS (line separator, U+2028), and PS (paragraph |
150 | | * separator, U+2029). |
151 | | * |
152 | | * The behaviour of the dot, circumflex, and dollar metacharacters is |
153 | | * affected by newline characters. By default, `GRegex` matches any newline |
154 | | * character matched by `\R`. You can limit the matched newline characters by |
155 | | * specifying the [flags@GLib.RegexCompileFlags.NEWLINE_CR], |
156 | | * [flags@GLib.RegexCompileFlags.NEWLINE_LF], and |
157 | | * [flags@GLib.RegexCompileFlags.NEWLINE_CRLF] compile options, and |
158 | | * with [flags@GLib.RegexMatchFlags.NEWLINE_ANY], |
159 | | * [flags@GLib.RegexMatchFlags.NEWLINE_CR], |
160 | | * [flags@GLib.RegexMatchFlags.NEWLINE_LF] and |
161 | | * [flags@GLib.RegexMatchFlags.NEWLINE_CRLF] match options. |
162 | | * These settings are also relevant when compiling a pattern if |
163 | | * [flags@GLib.RegexCompileFlags.EXTENDED] is set and an unescaped |
164 | | * `#` outside a character class is encountered. This indicates a comment |
165 | | * that lasts until after the next newline. |
166 | | * |
167 | | * Because `GRegex` does not modify its internal state between creation and |
168 | | * destruction, you can use the same `GRegex` instance from |
169 | | * different threads. In contrast, [struct@GLib.MatchInfo] is not thread safe. |
170 | | * |
171 | | * The regular expression low-level functionalities are obtained through |
172 | | * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel. |
173 | | * |
174 | | * Since: 2.14 |
175 | | */ |
176 | | /* PCRE2 option bits that are included in both G_REGEX_PCRE2_COMPILE_MASK and G_REGEX_PCRE2_MATCH_MASK. */ |
177 | 0 | #define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \ |
178 | 0 | PCRE2_NO_UTF_CHECK | \ |
179 | 0 | PCRE2_ENDANCHORED) |
180 | | |
181 | | /* Mask of all the possible values for GRegexCompileFlags; g_regex_compile_flags_from_pcre2() filters its result through it. */ |
182 | 0 | #define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \ |
183 | 0 | G_REGEX_CASELESS | \ |
184 | 0 | G_REGEX_MULTILINE | \ |
185 | 0 | G_REGEX_DOTALL | \ |
186 | 0 | G_REGEX_EXTENDED | \ |
187 | 0 | G_REGEX_ANCHORED | \ |
188 | 0 | G_REGEX_DOLLAR_ENDONLY | \ |
189 | 0 | G_REGEX_UNGREEDY | \ |
190 | 0 | G_REGEX_RAW | \ |
191 | 0 | G_REGEX_NO_AUTO_CAPTURE | \ |
192 | 0 | G_REGEX_OPTIMIZE | \ |
193 | 0 | G_REGEX_FIRSTLINE | \ |
194 | 0 | G_REGEX_DUPNAMES | \ |
195 | 0 | G_REGEX_NEWLINE_CR | \ |
196 | 0 | G_REGEX_NEWLINE_LF | \ |
197 | 0 | G_REGEX_NEWLINE_CRLF | \ |
198 | 0 | G_REGEX_NEWLINE_ANYCRLF | \ |
199 | 0 | G_REGEX_BSR_ANYCRLF) |
200 | | /* PCRE2 compile option bits that get_pcre2_compile_options() may return (its result is AND-ed with this mask). */ |
201 | 0 | #define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \ |
202 | 0 | PCRE2_ALT_BSUX | \ |
203 | 0 | PCRE2_AUTO_CALLOUT | \ |
204 | 0 | PCRE2_CASELESS | \ |
205 | 0 | PCRE2_DOLLAR_ENDONLY | \ |
206 | 0 | PCRE2_DOTALL | \ |
207 | 0 | PCRE2_DUPNAMES | \ |
208 | 0 | PCRE2_EXTENDED | \ |
209 | 0 | PCRE2_FIRSTLINE | \ |
210 | 0 | PCRE2_MATCH_UNSET_BACKREF | \ |
211 | 0 | PCRE2_MULTILINE | \ |
212 | 0 | PCRE2_NEVER_UCP | \ |
213 | 0 | PCRE2_NEVER_UTF | \ |
214 | 0 | PCRE2_NO_AUTO_CAPTURE | \ |
215 | 0 | PCRE2_NO_AUTO_POSSESS | \ |
216 | 0 | PCRE2_NO_DOTSTAR_ANCHOR | \ |
217 | 0 | PCRE2_NO_START_OPTIMIZE | \ |
218 | 0 | PCRE2_UCP | \ |
219 | 0 | PCRE2_UNGREEDY | \ |
220 | 0 | PCRE2_UTF | \ |
221 | 0 | PCRE2_NEVER_BACKSLASH_C | \ |
222 | 0 | PCRE2_ALT_CIRCUMFLEX | \ |
223 | 0 | PCRE2_ALT_VERBNAMES | \ |
224 | 0 | PCRE2_USE_OFFSET_LIMIT | \ |
225 | 0 | PCRE2_EXTENDED_MORE | \ |
226 | 0 | PCRE2_LITERAL | \ |
227 | 0 | PCRE2_MATCH_INVALID_UTF | \ |
228 | 0 | G_REGEX_PCRE_GENERIC_MASK) |
229 | | /* NOTE(review): presumably the PCRE2 compile bits with no directly named GRegexCompileFlags counterpart (PCRE2_UTF is implied by the absence of G_REGEX_RAW) — confirm against the users of this mask. */ |
230 | 0 | #define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF) |
231 | | |
232 | | /* Mask of all the possible values for GRegexMatchFlags; g_regex_match_flags_from_pcre2() filters its result through it. */ |
233 | 0 | #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \ |
234 | 0 | G_REGEX_MATCH_ANCHORED | \ |
235 | 0 | G_REGEX_MATCH_NOTBOL | \ |
236 | 0 | G_REGEX_MATCH_NOTEOL | \ |
237 | 0 | G_REGEX_MATCH_NOTEMPTY | \ |
238 | 0 | G_REGEX_MATCH_PARTIAL | \ |
239 | 0 | G_REGEX_MATCH_NEWLINE_CR | \ |
240 | 0 | G_REGEX_MATCH_NEWLINE_LF | \ |
241 | 0 | G_REGEX_MATCH_NEWLINE_CRLF | \ |
242 | 0 | G_REGEX_MATCH_NEWLINE_ANY | \ |
243 | 0 | G_REGEX_MATCH_NEWLINE_ANYCRLF | \ |
244 | 0 | G_REGEX_MATCH_BSR_ANYCRLF | \ |
245 | 0 | G_REGEX_MATCH_BSR_ANY | \ |
246 | 0 | G_REGEX_MATCH_PARTIAL_SOFT | \ |
247 | 0 | G_REGEX_MATCH_PARTIAL_HARD | \ |
248 | 0 | G_REGEX_MATCH_NOTEMPTY_ATSTART) |
249 | | /* PCRE2 match option bits that get_pcre2_match_options() may return (its result is AND-ed with this mask). */ |
250 | 0 | #define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\ |
251 | 0 | PCRE2_NOTEOL |\ |
252 | 0 | PCRE2_NOTEMPTY |\ |
253 | 0 | PCRE2_NOTEMPTY_ATSTART |\ |
254 | 0 | PCRE2_PARTIAL_SOFT |\ |
255 | 0 | PCRE2_PARTIAL_HARD |\ |
256 | 0 | PCRE2_NO_JIT |\ |
257 | 0 | PCRE2_COPY_MATCHED_SUBJECT |\ |
258 | 0 | G_REGEX_PCRE_GENERIC_MASK) |
259 | | |
260 | | /* PCRE2 newline conventions covered by this mask (PCRE2_NEWLINE_ANY is not listed). NOTE(review): the PCRE2_NEWLINE_* constants are small enumerated values in pcre2.h rather than independent bits, so confirm how this OR-ed value is used. TODO: Support PCRE2_NEWLINE_NUL */ |
261 | | #define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ |
262 | | PCRE2_NEWLINE_LF | \ |
263 | | PCRE2_NEWLINE_CRLF | \ |
264 | | PCRE2_NEWLINE_ANYCRLF) |
265 | | |
266 | | /* Match options that are not supported when using JIT, as stated in the |
267 | | * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section: |
268 | | * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5 |
269 | | */ |
270 | 0 | #define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \ |
271 | 0 | PCRE2_ENDANCHORED) |
272 | | /* The newline-selection bits of GRegexCompileFlags, as isolated by get_pcre2_newline_compile_options(). */ |
273 | 0 | #define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \ |
274 | 0 | G_REGEX_NEWLINE_LF | \ |
275 | 0 | G_REGEX_NEWLINE_CRLF | \ |
276 | 0 | G_REGEX_NEWLINE_ANYCRLF) |
277 | | /* The newline-selection bits of GRegexMatchFlags, as isolated by get_pcre2_newline_match_options(). */ |
278 | 0 | #define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \ |
279 | 0 | G_REGEX_MATCH_NEWLINE_LF | \ |
280 | 0 | G_REGEX_MATCH_NEWLINE_CRLF | \ |
281 | 0 | G_REGEX_MATCH_NEWLINE_ANY | \ |
282 | 0 | G_REGEX_MATCH_NEWLINE_ANYCRLF) |
283 | | |
284 | | /* If the string is in UTF-8 use the g_utf8_ functions, else just use +/- 1. compile_opts holds |
285 | | * pcre2 values, where a raw (G_REGEX_RAW) regex is one compiled without PCRE2_UTF. */ |
286 | 0 | #define NEXT_CHAR(re, s) (!((re)->compile_opts & PCRE2_UTF) ? \ |
287 | 0 | ((s) + 1) : \ |
288 | 0 | g_utf8_next_char (s)) |
289 | 0 | #define PREV_CHAR(re, s) (!((re)->compile_opts & PCRE2_UTF) ? \ |
290 | 0 | ((s) - 1) : \ |
291 | 0 | g_utf8_prev_char (s)) |
292 | | |
293 | | struct _GMatchInfo |
294 | | { |
295 | | gint ref_count; /* the ref count (atomic) */ |
296 | | GRegex *regex; /* the regex */ |
297 | | uint32_t match_opts; /* pcre match options used at match time on the regex */ |
298 | | gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */ |
299 | | uint32_t n_subpatterns; /* total number of sub patterns in the regex */ |
300 | | gint pos; /* position in the string where last match left off */ |
301 | | uint32_t n_offsets; /* number of offsets */ |
302 | | gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */ |
303 | | gint *workspace; /* workspace for pcre2_dfa_match() */ |
304 | | PCRE2_SIZE n_workspace; /* number of workspace elements */ |
305 | | const gchar *string; /* string passed to the match function */ |
306 | | gssize string_len; /* length of string, in bytes */ |
307 | | pcre2_match_context *match_context; |
308 | | pcre2_match_data *match_data; |
309 | | pcre2_jit_stack *jit_stack; |
310 | | }; |
311 | | |
312 | | typedef enum |
313 | | { |
314 | | JIT_STATUS_DEFAULT, /* JIT enablement was never tried */ |
315 | | JIT_STATUS_ENABLED, /* JIT enablement was tried and succeeded */ |
316 | | JIT_STATUS_DISABLED /* JIT enablement was tried and failed, so it should not be tried again */ |
317 | | } JITStatus; |
318 | | |
319 | | struct _GRegex |
320 | | { |
321 | | gint ref_count; /* the ref count for the immutable part (atomic) */ |
322 | | gchar *pattern; /* the pattern */ |
323 | | pcre2_code *pcre_re; /* compiled form of the pattern */ |
324 | | uint32_t compile_opts; /* options used at compile time on the pattern, pcre2 values */ |
325 | | GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */ |
326 | | uint32_t match_opts; /* pcre2 options used at match time on the regex */ |
327 | | GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */ |
328 | | uint32_t jit_options; /* options which were enabled for jit compiler */ |
329 | | JITStatus jit_status; /* indicates the status of jit compiler for this compiled regex */ |
330 | | /* NOTE: jit_status does _not_ record whether the JIT was used in the last invocation |
331 | | * (which may be affected by match_options or a JIT_STACK_LIMIT error), but whether it was ever |
332 | | * enabled for the current regex AND current set of jit_options. |
333 | | * JIT_STATUS_DEFAULT means enablement was never tried, |
334 | | * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it), |
335 | | * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again). |
336 | | */ |
337 | | }; |
338 | | |
339 | | /* TRUE if ret is an error code, i.e. it is below PCRE2_ERROR_NOMATCH and is not PCRE2_ERROR_PARTIAL; FALSE otherwise. */ |
340 | 0 | #define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL) |
341 | | /* Forward declarations of file-local (static) helpers and of the InterpolationData type they use. */ |
342 | | typedef struct _InterpolationData InterpolationData; |
343 | | static gboolean interpolation_list_needs_match (GList *list); |
344 | | static gboolean interpolate_replacement (const GMatchInfo *match_info, |
345 | | GString *result, |
346 | | gpointer data); |
347 | | static GList *split_replacement (const gchar *replacement, |
348 | | GError **error); |
349 | | static void free_interpolation_data (InterpolationData *data); |
350 | | |
351 | | static uint32_t |
352 | | get_pcre2_compile_options (GRegexCompileFlags compile_flags) |
353 | 0 | { |
354 | | /* Maps GRegexCompileFlags bits to PCRE2 compile option bits. PCRE2_UTF is added unless G_REGEX_RAW is set; flags without a branch below (e.g. the newline and BSR flags) are not translated here. */ |
355 | 0 | uint32_t pcre2_flags = 0; |
356 |

357 | 0 | if (compile_flags & G_REGEX_CASELESS) |
358 | 0 | pcre2_flags |= PCRE2_CASELESS; |
359 | 0 | if (compile_flags & G_REGEX_MULTILINE) |
360 | 0 | pcre2_flags |= PCRE2_MULTILINE; |
361 | 0 | if (compile_flags & G_REGEX_DOTALL) |
362 | 0 | pcre2_flags |= PCRE2_DOTALL; |
363 | 0 | if (compile_flags & G_REGEX_EXTENDED) |
364 | 0 | pcre2_flags |= PCRE2_EXTENDED; |
365 | 0 | if (compile_flags & G_REGEX_ANCHORED) |
366 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
367 | 0 | if (compile_flags & G_REGEX_DOLLAR_ENDONLY) |
368 | 0 | pcre2_flags |= PCRE2_DOLLAR_ENDONLY; |
369 | 0 | if (compile_flags & G_REGEX_UNGREEDY) |
370 | 0 | pcre2_flags |= PCRE2_UNGREEDY; |
371 | 0 | if (!(compile_flags & G_REGEX_RAW)) |
372 | 0 | pcre2_flags |= PCRE2_UTF; |
373 | 0 | if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) |
374 | 0 | pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; |
375 | 0 | if (compile_flags & G_REGEX_FIRSTLINE) |
376 | 0 | pcre2_flags |= PCRE2_FIRSTLINE; |
377 | 0 | if (compile_flags & G_REGEX_DUPNAMES) |
378 | 0 | pcre2_flags |= PCRE2_DUPNAMES; |
379 |

380 | 0 | return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; |
381 | 0 | } |
382 | | |
383 | | static uint32_t |
384 | | get_pcre2_match_options (GRegexMatchFlags match_flags, |
385 | | GRegexCompileFlags compile_flags) |
386 | 0 | { |
387 | | /* Maps GRegexMatchFlags bits to PCRE2 match option bits; PCRE2_NO_UTF_CHECK is added when compile_flags contains G_REGEX_RAW. G_REGEX_MATCH_PARTIAL has no branch of its own (NOTE(review): presumably an alias of G_REGEX_MATCH_PARTIAL_SOFT — confirm in gregex.h). */ |
388 | 0 | uint32_t pcre2_flags = 0; |
389 |

390 | 0 | if (match_flags & G_REGEX_MATCH_ANCHORED) |
391 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
392 | 0 | if (match_flags & G_REGEX_MATCH_NOTBOL) |
393 | 0 | pcre2_flags |= PCRE2_NOTBOL; |
394 | 0 | if (match_flags & G_REGEX_MATCH_NOTEOL) |
395 | 0 | pcre2_flags |= PCRE2_NOTEOL; |
396 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY) |
397 | 0 | pcre2_flags |= PCRE2_NOTEMPTY; |
398 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) |
399 | 0 | pcre2_flags |= PCRE2_PARTIAL_SOFT; |
400 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) |
401 | 0 | pcre2_flags |= PCRE2_PARTIAL_HARD; |
402 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) |
403 | 0 | pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; |
404 |

405 | 0 | if (compile_flags & G_REGEX_RAW) |
406 | 0 | pcre2_flags |= PCRE2_NO_UTF_CHECK; |
407 |

408 | 0 | return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; |
409 | 0 | } |
410 | | /* Rebuilds GRegexCompileFlags from PCRE2 compile option bits; G_REGEX_RAW is reported when PCRE2_UTF is absent. The result is limited to G_REGEX_COMPILE_MASK. */ |
411 | | static GRegexCompileFlags |
412 | | g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) |
413 | 0 | { |
414 | 0 | GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; |
415 |

416 | 0 | if (pcre2_flags & PCRE2_CASELESS) |
417 | 0 | compile_flags |= G_REGEX_CASELESS; |
418 | 0 | if (pcre2_flags & PCRE2_MULTILINE) |
419 | 0 | compile_flags |= G_REGEX_MULTILINE; |
420 | 0 | if (pcre2_flags & PCRE2_DOTALL) |
421 | 0 | compile_flags |= G_REGEX_DOTALL; |
422 | 0 | if (pcre2_flags & PCRE2_EXTENDED) |
423 | 0 | compile_flags |= G_REGEX_EXTENDED; |
424 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
425 | 0 | compile_flags |= G_REGEX_ANCHORED; |
426 | 0 | if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) |
427 | 0 | compile_flags |= G_REGEX_DOLLAR_ENDONLY; |
428 | 0 | if (pcre2_flags & PCRE2_UNGREEDY) |
429 | 0 | compile_flags |= G_REGEX_UNGREEDY; |
430 | 0 | if (!(pcre2_flags & PCRE2_UTF)) |
431 | 0 | compile_flags |= G_REGEX_RAW; |
432 | 0 | if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) |
433 | 0 | compile_flags |= G_REGEX_NO_AUTO_CAPTURE; |
434 | 0 | if (pcre2_flags & PCRE2_FIRSTLINE) |
435 | 0 | compile_flags |= G_REGEX_FIRSTLINE; |
436 | 0 | if (pcre2_flags & PCRE2_DUPNAMES) |
437 | 0 | compile_flags |= G_REGEX_DUPNAMES; |
438 |

439 | 0 | return compile_flags & G_REGEX_COMPILE_MASK; |
440 | 0 | } |
441 | | /* Rebuilds GRegexMatchFlags from PCRE2 match option bits; bits without a branch below (e.g. PCRE2_NO_UTF_CHECK) are dropped. The result is limited to G_REGEX_MATCH_MASK. */ |
442 | | static GRegexMatchFlags |
443 | | g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) |
444 | 0 | { |
445 | 0 | GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; |
446 |

447 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
448 | 0 | match_flags |= G_REGEX_MATCH_ANCHORED; |
449 | 0 | if (pcre2_flags & PCRE2_NOTBOL) |
450 | 0 | match_flags |= G_REGEX_MATCH_NOTBOL; |
451 | 0 | if (pcre2_flags & PCRE2_NOTEOL) |
452 | 0 | match_flags |= G_REGEX_MATCH_NOTEOL; |
453 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY) |
454 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY; |
455 | 0 | if (pcre2_flags & PCRE2_PARTIAL_SOFT) |
456 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; |
457 | 0 | if (pcre2_flags & PCRE2_PARTIAL_HARD) |
458 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_HARD; |
459 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) |
460 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; |
461 |

462 | 0 | return (match_flags & G_REGEX_MATCH_MASK); |
463 | 0 | } |
464 | | /* Returns the PCRE2_NEWLINE_* value for the newline bits of compile_flags: PCRE2_NEWLINE_ANY when none is set, 0 when the bits form a combination with no case below. */ |
465 | | static uint32_t |
466 | | get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) |
467 | 0 | { |
468 | 0 | compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; |
469 |

470 | 0 | switch (compile_flags) |
471 | 0 | { |
472 | 0 | case G_REGEX_NEWLINE_CR: |
473 | 0 | return PCRE2_NEWLINE_CR; |
474 | 0 | case G_REGEX_NEWLINE_LF: |
475 | 0 | return PCRE2_NEWLINE_LF; |
476 | 0 | case G_REGEX_NEWLINE_CRLF: |
477 | 0 | return PCRE2_NEWLINE_CRLF; |
478 | 0 | case G_REGEX_NEWLINE_ANYCRLF: |
479 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
480 | 0 | default: |
481 | 0 | if (compile_flags != 0) |
482 | 0 | return 0; |
483 | | |
484 | 0 | return PCRE2_NEWLINE_ANY; |
485 | 0 | } |
486 | 0 | } |
487 | | /* Returns the PCRE2_NEWLINE_* value for the newline bits of match_flags, or 0 when none is set or the bits form a combination with no case below. */ |
488 | | static uint32_t |
489 | | get_pcre2_newline_match_options (GRegexMatchFlags match_flags) |
490 | 0 | { |
491 | 0 | switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) |
492 | 0 | { |
493 | 0 | case G_REGEX_MATCH_NEWLINE_CR: |
494 | 0 | return PCRE2_NEWLINE_CR; |
495 | 0 | case G_REGEX_MATCH_NEWLINE_LF: |
496 | 0 | return PCRE2_NEWLINE_LF; |
497 | 0 | case G_REGEX_MATCH_NEWLINE_CRLF: |
498 | 0 | return PCRE2_NEWLINE_CRLF; |
499 | 0 | case G_REGEX_MATCH_NEWLINE_ANY: |
500 | 0 | return PCRE2_NEWLINE_ANY; |
501 | 0 | case G_REGEX_MATCH_NEWLINE_ANYCRLF: |
502 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
503 | 0 | default: |
504 | 0 | return 0; |
505 | 0 | } |
506 | 0 | } |
507 | | /* Returns PCRE2_BSR_ANYCRLF when G_REGEX_BSR_ANYCRLF is set in compile_flags, PCRE2_BSR_UNICODE otherwise. */ |
508 | | static uint32_t |
509 | | get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) |
510 | 0 | { |
511 | 0 | if (compile_flags & G_REGEX_BSR_ANYCRLF) |
512 | 0 | return PCRE2_BSR_ANYCRLF; |
513 | | |
514 | 0 | return PCRE2_BSR_UNICODE; |
515 | 0 | } |
516 | | /* Returns PCRE2_BSR_ANYCRLF or PCRE2_BSR_UNICODE for the corresponding BSR match flag (ANYCRLF is checked first), or 0 when neither flag is set. */ |
517 | | static uint32_t |
518 | | get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) |
519 | 0 | { |
520 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) |
521 | 0 | return PCRE2_BSR_ANYCRLF; |
522 | | |
523 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANY) |
524 | 0 | return PCRE2_BSR_UNICODE; |
525 | | |
526 | 0 | return 0; |
527 | 0 | } |
528 | | /* Returns a newly allocated copy of the PCRE2 message for errcode, or NULL when pcre2_get_error_message() reports a length <= 0. */ |
529 | | static char * |
530 | | get_pcre2_error_string (int errcode) |
531 | 0 | { |
532 | 0 | PCRE2_UCHAR8 error_msg[2048]; |
533 | 0 | int err_length; |
534 |

535 | 0 | err_length = pcre2_get_error_message (errcode, error_msg, |
536 | 0 | G_N_ELEMENTS (error_msg)); |
537 |

538 | 0 | if (err_length <= 0) |
539 | 0 | return NULL; |
540 | | |
541 | | /* The array is always filled with a trailing zero, so copy err_length + 1 bytes to include it */ |
542 | 0 | g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); |
543 | 0 | return g_memdup2 (error_msg, err_length + 1); |
544 | 0 | } |
545 | | /* Returns a constant message for the PCRE2 match error codes given one below, or NULL for codes that are not errors or have no message here. */ |
546 | | static const gchar * |
547 | | translate_match_error (gint errcode) |
548 | 0 | { |
549 | 0 | switch (errcode) |
550 | 0 | { |
551 | 0 | case PCRE2_ERROR_NOMATCH: |
552 | | /* not an error */ |
553 | 0 | break; |
554 | 0 | case PCRE2_ERROR_NULL: |
555 | | /* NULL argument, this should not happen in GRegex */ |
556 | 0 | g_critical ("A NULL argument was passed to PCRE"); |
557 | 0 | break; |
558 | 0 | case PCRE2_ERROR_BADOPTION: |
559 | 0 | return "bad options"; /* NOTE(review): the only message here not wrapped in _() — confirm this is intentional */ |
560 | 0 | case PCRE2_ERROR_BADMAGIC: |
561 | 0 | return _("corrupted object"); |
562 | 0 | case PCRE2_ERROR_NOMEMORY: |
563 | 0 | return _("out of memory"); |
564 | 0 | case PCRE2_ERROR_NOSUBSTRING: |
565 | | /* not used by pcre2_match() */ |
566 | 0 | break; |
567 | 0 | case PCRE2_ERROR_MATCHLIMIT: |
568 | 0 | case PCRE2_ERROR_CALLOUT: |
569 | | /* callouts are not implemented; PCRE2_ERROR_MATCHLIMIT shares this branch and gets no message here */ |
570 | 0 | break; |
571 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
572 | | /* we do not check if strings are valid */ |
573 | 0 | break; |
574 | 0 | case PCRE2_ERROR_PARTIAL: |
575 | | /* not an error */ |
576 | 0 | break; |
577 | 0 | case PCRE2_ERROR_INTERNAL: |
578 | 0 | return _("internal error"); |
579 | 0 | case PCRE2_ERROR_DFA_UITEM: |
580 | 0 | return _("the pattern contains items not supported for partial matching"); |
581 | 0 | case PCRE2_ERROR_DFA_UCOND: |
582 | 0 | return _("back references as conditions are not supported for partial matching"); |
583 | 0 | case PCRE2_ERROR_DFA_WSSIZE: |
584 | | /* handled expanding the workspace */ |
585 | 0 | break; |
586 | 0 | case PCRE2_ERROR_DFA_RECURSE: |
587 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
588 | 0 | return _("recursion limit reached"); |
589 | 0 | case PCRE2_ERROR_BADOFFSET: |
590 | 0 | return _("bad offset"); |
591 | 0 | case PCRE2_ERROR_RECURSELOOP: |
592 | 0 | return _("recursion loop"); |
593 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
594 | | /* should not happen in GRegex since we check modes before each match */ |
595 | 0 | return _("matching mode is requested that was not compiled for JIT"); |
596 | 0 | default: |
597 | 0 | break; |
598 | 0 | } |
599 | 0 | return NULL; |
600 | 0 | } |
601 | | /* Returns a newly allocated message for errcode: the one from translate_match_error() if any, else the one from get_pcre2_error_string(), else "unknown error". */ |
602 | | static char * |
603 | | get_match_error_message (int errcode) |
604 | 0 | { |
605 | 0 | const char *msg = translate_match_error (errcode); |
606 | 0 | char *error_string; |
607 |

608 | 0 | if (msg) |
609 | 0 | return g_strdup (msg); |
610 | | |
611 | 0 | error_string = get_pcre2_error_string (errcode); |
612 |

613 | 0 | if (error_string) |
614 | 0 | return error_string; |
615 | | |
616 | 0 | return g_strdup (_("unknown error")); |
617 | 0 | } |
618 | | |
619 | | static void |
620 | | translate_compile_error (gint *errcode, const gchar **errmsg) |
621 | 0 | { |
622 | | /* If errcode is known we put the translatable error message in |
623 | | * errmsg. If errcode is unknown we put the generic |
624 | | * G_REGEX_ERROR_COMPILE error code in errcode. |
625 | | * Note that there can be more PCRE errors with the same GRegexError |
626 | | * and that some PCRE errors are useless for us. |
627 | | */ |
628 | 0 | gint original_errcode = *errcode; |
629 | |
|
630 | 0 | *errcode = -1; |
631 | 0 | *errmsg = NULL; |
632 | |
|
633 | 0 | switch (original_errcode) |
634 | 0 | { |
635 | 0 | case PCRE2_ERROR_END_BACKSLASH: |
636 | 0 | *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; |
637 | 0 | *errmsg = _("\\ at end of pattern"); |
638 | 0 | break; |
639 | 0 | case PCRE2_ERROR_END_BACKSLASH_C: |
640 | 0 | *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; |
641 | 0 | *errmsg = _("\\c at end of pattern"); |
642 | 0 | break; |
643 | 0 | case PCRE2_ERROR_UNKNOWN_ESCAPE: |
644 | 0 | case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
645 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; |
646 | 0 | *errmsg = _("unrecognized character following \\"); |
647 | 0 | break; |
648 | 0 | case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
649 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; |
650 | 0 | *errmsg = _("numbers out of order in {} quantifier"); |
651 | 0 | break; |
652 | 0 | case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
653 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; |
654 | 0 | *errmsg = _("number too big in {} quantifier"); |
655 | 0 | break; |
656 | 0 | case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
657 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; |
658 | 0 | *errmsg = _("missing terminating ] for character class"); |
659 | 0 | break; |
660 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
661 | 0 | *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; |
662 | 0 | *errmsg = _("invalid escape sequence in character class"); |
663 | 0 | break; |
664 | 0 | case PCRE2_ERROR_CLASS_RANGE_ORDER: |
665 | 0 | *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; |
666 | 0 | *errmsg = _("range out of order in character class"); |
667 | 0 | break; |
668 | 0 | case PCRE2_ERROR_QUANTIFIER_INVALID: |
669 | 0 | case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: |
670 | 0 | *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; |
671 | 0 | *errmsg = _("nothing to repeat"); |
672 | 0 | break; |
673 | 0 | case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
674 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
675 | 0 | *errmsg = _("unrecognized character after (? or (?-"); |
676 | 0 | break; |
677 | 0 | case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
678 | 0 | *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; |
679 | 0 | *errmsg = _("POSIX named classes are supported only within a class"); |
680 | 0 | break; |
681 | 0 | case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
682 | 0 | *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; |
683 | 0 | *errmsg = _("POSIX collating elements are not supported"); |
684 | 0 | break; |
685 | 0 | case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
686 | 0 | case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
687 | 0 | case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
688 | 0 | *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; |
689 | 0 | *errmsg = _("missing terminating )"); |
690 | 0 | break; |
691 | 0 | case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
692 | 0 | *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; |
693 | 0 | *errmsg = _("reference to non-existent subpattern"); |
694 | 0 | break; |
695 | 0 | case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
696 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; |
697 | 0 | *errmsg = _("missing ) after comment"); |
698 | 0 | break; |
699 | 0 | case PCRE2_ERROR_PATTERN_TOO_LARGE: |
700 | 0 | *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; |
701 | 0 | *errmsg = _("regular expression is too large"); |
702 | 0 | break; |
703 | 0 | case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
704 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; |
705 | 0 | *errmsg = _("malformed number or name after (?("); |
706 | 0 | break; |
707 | 0 | case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
708 | 0 | *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; |
709 | 0 | *errmsg = _("lookbehind assertion is not fixed length"); |
710 | 0 | break; |
711 | 0 | case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
712 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; |
713 | 0 | *errmsg = _("conditional group contains more than two branches"); |
714 | 0 | break; |
715 | 0 | case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
716 | 0 | *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; |
717 | 0 | *errmsg = _("assertion expected after (?("); |
718 | 0 | break; |
719 | 0 | case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
720 | 0 | *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; |
721 | 0 | *errmsg = _("a numbered reference must not be zero"); |
722 | 0 | break; |
723 | 0 | case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
724 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; |
725 | 0 | *errmsg = _("unknown POSIX class name"); |
726 | 0 | break; |
727 | 0 | case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
728 | 0 | case PCRE2_ERROR_INVALID_HEXADECIMAL: |
729 | 0 | *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; |
730 | 0 | *errmsg = _("character value in \\x{...} sequence is too large"); |
731 | 0 | break; |
732 | 0 | case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
733 | 0 | *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; |
734 | 0 | *errmsg = _("\\C not allowed in lookbehind assertion"); |
735 | 0 | break; |
736 | 0 | case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
737 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; |
738 | 0 | *errmsg = _("missing terminator in subpattern name"); |
739 | 0 | break; |
740 | 0 | case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
741 | 0 | *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; |
742 | 0 | *errmsg = _("two named subpatterns have the same name"); |
743 | 0 | break; |
744 | 0 | case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
745 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; |
746 | 0 | *errmsg = _("malformed \\P or \\p sequence"); |
747 | 0 | break; |
748 | 0 | case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
749 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; |
750 | 0 | *errmsg = _("unknown property name after \\P or \\p"); |
751 | 0 | break; |
752 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
753 | 0 | *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; |
754 | 0 | *errmsg = _("subpattern name is too long (maximum 32 characters)"); |
755 | 0 | break; |
756 | 0 | case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
757 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; |
758 | 0 | *errmsg = _("too many named subpatterns (maximum 10,000)"); |
759 | 0 | break; |
760 | 0 | case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
761 | 0 | *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; |
762 | 0 | *errmsg = _("octal value is greater than \\377"); |
763 | 0 | break; |
764 | 0 | case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
765 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; |
766 | 0 | *errmsg = _("DEFINE group contains more than one branch"); |
767 | 0 | break; |
768 | 0 | case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: |
769 | 0 | *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; |
770 | 0 | *errmsg = _("inconsistent NEWLINE options"); |
771 | 0 | break; |
772 | 0 | case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
773 | 0 | *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; |
774 | 0 | *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " |
775 | 0 | "number, or by a plain number"); |
776 | 0 | break; |
777 | 0 | case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: |
778 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; |
779 | 0 | *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); |
780 | 0 | break; |
781 | 0 | case PCRE2_ERROR_VERB_UNKNOWN: |
782 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; |
783 | 0 | *errmsg = _("(*VERB) not recognized"); |
784 | 0 | break; |
785 | 0 | case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
786 | 0 | *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; |
787 | 0 | *errmsg = _("number is too big"); |
788 | 0 | break; |
789 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
790 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; |
791 | 0 | *errmsg = _("missing subpattern name after (?&"); |
792 | 0 | break; |
793 | 0 | case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
794 | 0 | *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; |
795 | 0 | *errmsg = _("different names for subpatterns of the same number are not allowed"); |
796 | 0 | break; |
797 | 0 | case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
798 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; |
799 | 0 | *errmsg = _("(*MARK) must have an argument"); |
800 | 0 | break; |
801 | 0 | case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
802 | 0 | *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; |
803 | 0 | *errmsg = _( "\\c must be followed by an ASCII character"); |
804 | 0 | break; |
805 | 0 | case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
806 | 0 | *errcode = G_REGEX_ERROR_MISSING_NAME; |
807 | 0 | *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); |
808 | 0 | break; |
809 | 0 | case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
810 | 0 | *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; |
811 | 0 | *errmsg = _("\\N is not supported in a class"); |
812 | 0 | break; |
813 | 0 | case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
814 | 0 | *errcode = G_REGEX_ERROR_NAME_TOO_LONG; |
815 | 0 | *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); |
816 | 0 | break; |
817 | 0 | case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: |
818 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
819 | 0 | *errmsg = _("code overflow"); |
820 | 0 | break; |
821 | 0 | case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
822 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
823 | 0 | *errmsg = _("unrecognized character after (?P"); |
824 | 0 | break; |
825 | 0 | case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: |
826 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
827 | 0 | *errmsg = _("overran compiling workspace"); |
828 | 0 | break; |
829 | 0 | case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: |
830 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
831 | 0 | *errmsg = _("previously-checked referenced subpattern not found"); |
832 | 0 | break; |
833 | 0 | case PCRE2_ERROR_HEAP_FAILED: |
834 | 0 | case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: |
835 | 0 | case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
836 | 0 | case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
837 | 0 | case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
838 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: |
839 | 0 | case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
840 | 0 | case PCRE2_ERROR_INTERNAL_STUDY_ERROR: |
841 | 0 | case PCRE2_ERROR_UTF_IS_DISABLED: |
842 | 0 | case PCRE2_ERROR_UCP_IS_DISABLED: |
843 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: |
844 | 0 | case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
845 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE: |
846 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: |
847 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
848 | 0 | break; |
849 | 0 | case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
850 | 0 | case PCRE2_ERROR_CLASS_INVALID_RANGE: |
851 | 0 | case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
852 | 0 | case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
853 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
854 | 0 | case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
855 | 0 | case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
856 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
857 | 0 | case PCRE2_ERROR_NULL_PATTERN: |
858 | 0 | case PCRE2_ERROR_BAD_OPTIONS: |
859 | 0 | case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
860 | 0 | case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
861 | 0 | case PCRE2_ERROR_INVALID_OCTAL: |
862 | 0 | case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
863 | 0 | case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
864 | 0 | case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
865 | 0 | case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
866 | 0 | case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
867 | 0 | case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
868 | 0 | case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
869 | 0 | case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
870 | 0 | case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
871 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
872 | 0 | case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
873 | 0 | case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
874 | 0 | default: |
875 | 0 | *errcode = G_REGEX_ERROR_COMPILE; |
876 | 0 | break; |
877 | 0 | } |
878 | | |
879 | 0 | g_assert (*errcode != -1); |
880 | 0 | } |
881 | | |
882 | | /* GMatchInfo */ |
883 | | |
884 | | static GMatchInfo * |
885 | | match_info_new (const GRegex *regex, |
886 | | const gchar *string, |
887 | | gint string_len, |
888 | | gint start_position, |
889 | | GRegexMatchFlags match_options, |
890 | | gboolean is_dfa) |
891 | 0 | { |
892 | 0 | GMatchInfo *match_info; |
893 | |
|
894 | 0 | if (string_len < 0) |
895 | 0 | string_len = strlen (string); |
896 | |
|
897 | 0 | match_info = g_new0 (GMatchInfo, 1); |
898 | 0 | match_info->ref_count = 1; |
899 | 0 | match_info->regex = g_regex_ref ((GRegex *)regex); |
900 | 0 | match_info->string = string; |
901 | 0 | match_info->string_len = string_len; |
902 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
903 | 0 | match_info->pos = start_position; |
904 | 0 | match_info->match_opts = |
905 | 0 | get_pcre2_match_options (match_options, regex->orig_compile_opts); |
906 | |
|
907 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, |
908 | 0 | &match_info->n_subpatterns); |
909 | |
|
910 | 0 | match_info->match_context = pcre2_match_context_create (NULL); |
911 | |
|
912 | 0 | if (is_dfa) |
913 | 0 | { |
914 | | /* These values should be enough for most cases, if they are not |
915 | | * enough g_regex_match_all_full() will expand them. */ |
916 | 0 | match_info->n_workspace = 100; |
917 | 0 | match_info->workspace = g_new (gint, match_info->n_workspace); |
918 | 0 | } |
919 | |
|
920 | 0 | match_info->n_offsets = 2; |
921 | 0 | match_info->offsets = g_new0 (gint, match_info->n_offsets); |
922 | | /* Set an invalid position for the previous match. */ |
923 | 0 | match_info->offsets[0] = -1; |
924 | 0 | match_info->offsets[1] = -1; |
925 | |
|
926 | 0 | match_info->match_data = pcre2_match_data_create_from_pattern ( |
927 | 0 | match_info->regex->pcre_re, |
928 | 0 | NULL); |
929 | |
|
930 | 0 | return match_info; |
931 | 0 | } |
932 | | |
933 | | static gboolean |
934 | | recalc_match_offsets (GMatchInfo *match_info, |
935 | | GError **error) |
936 | 0 | { |
937 | 0 | PCRE2_SIZE *ovector; |
938 | 0 | uint32_t ovector_size = 0; |
939 | 0 | uint32_t pre_n_offset; |
940 | 0 | uint32_t i; |
941 | |
|
942 | 0 | g_assert (!IS_PCRE2_ERROR (match_info->matches)); |
943 | | |
944 | 0 | if (match_info->matches == PCRE2_ERROR_PARTIAL) |
945 | 0 | ovector_size = 1; |
946 | 0 | else if (match_info->matches > 0) |
947 | 0 | ovector_size = match_info->matches; |
948 | |
|
949 | 0 | g_assert (ovector_size != 0); |
950 | | |
951 | 0 | if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) |
952 | 0 | { |
953 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
954 | 0 | _("Error while matching regular expression %s: %s"), |
955 | 0 | match_info->regex->pattern, _("code overflow")); |
956 | 0 | return FALSE; |
957 | 0 | } |
958 | | |
959 | 0 | pre_n_offset = match_info->n_offsets; |
960 | 0 | match_info->n_offsets = ovector_size * 2; |
961 | 0 | ovector = pcre2_get_ovector_pointer (match_info->match_data); |
962 | |
|
963 | 0 | if (match_info->n_offsets != pre_n_offset) |
964 | 0 | { |
965 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
966 | 0 | match_info->n_offsets, |
967 | 0 | sizeof (gint)); |
968 | 0 | } |
969 | |
|
970 | 0 | for (i = 0; i < match_info->n_offsets; i++) |
971 | 0 | { |
972 | 0 | match_info->offsets[i] = (int) ovector[i]; |
973 | 0 | } |
974 | |
|
975 | 0 | return TRUE; |
976 | 0 | } |
977 | | |
978 | | static JITStatus |
979 | | enable_jit_with_match_options (GMatchInfo *match_info, |
980 | | uint32_t match_options) |
981 | 0 | { |
982 | 0 | gint retval; |
983 | 0 | uint32_t old_jit_options, new_jit_options; |
984 | |
|
985 | 0 | if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE)) |
986 | 0 | return JIT_STATUS_DISABLED; |
987 | | |
988 | 0 | if (match_info->regex->jit_status == JIT_STATUS_DISABLED) |
989 | 0 | return JIT_STATUS_DISABLED; |
990 | | |
991 | 0 | if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) |
992 | 0 | return JIT_STATUS_DISABLED; |
993 | | |
994 | 0 | old_jit_options = match_info->regex->jit_options; |
995 | 0 | new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; |
996 | 0 | if (match_options & PCRE2_PARTIAL_HARD) |
997 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_HARD; |
998 | 0 | if (match_options & PCRE2_PARTIAL_SOFT) |
999 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; |
1000 | | |
1001 | | /* no new options enabled */ |
1002 | 0 | if (new_jit_options == old_jit_options) |
1003 | 0 | { |
1004 | 0 | g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT); |
1005 | 0 | return match_info->regex->jit_status; |
1006 | 0 | } |
1007 | | |
1008 | 0 | retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options); |
1009 | 0 | if (retval == 0) |
1010 | 0 | { |
1011 | 0 | match_info->regex->jit_status = JIT_STATUS_ENABLED; |
1012 | |
|
1013 | 0 | match_info->regex->jit_options = new_jit_options; |
1014 | | /* Set min stack size for JIT to 32KiB and max to 512KiB */ |
1015 | 0 | match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL); |
1016 | 0 | pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack); |
1017 | 0 | } |
1018 | 0 | else |
1019 | 0 | { |
1020 | 0 | match_info->regex->jit_status = JIT_STATUS_DISABLED; |
1021 | |
|
1022 | 0 | switch (retval) |
1023 | 0 | { |
1024 | 0 | case PCRE2_ERROR_NOMEMORY: |
1025 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
1026 | 0 | "but JIT was unable to allocate executable memory for the " |
1027 | 0 | "compiler. Falling back to interpretive code."); |
1028 | 0 | break; |
1029 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
1030 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
1031 | 0 | "but JIT support is not available. Falling back to " |
1032 | 0 | "interpretive code."); |
1033 | 0 | break; |
1034 | 0 | default: |
1035 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
1036 | 0 | "but request for JIT support had unexpectedly failed (error %d). " |
1037 | 0 | "Falling back to interpretive code.", |
1038 | 0 | retval); |
1039 | 0 | break; |
1040 | 0 | } |
1041 | 0 | } |
1042 | | |
1043 | 0 | return match_info->regex->jit_status; |
1044 | | |
1045 | 0 | g_assert_not_reached (); |
1046 | 0 | } |
1047 | | |
1048 | | /** |
1049 | | * g_match_info_get_regex: |
1050 | | * @match_info: a #GMatchInfo |
1051 | | * |
1052 | | * Returns #GRegex object used in @match_info. It belongs to Glib |
1053 | | * and must not be freed. Use g_regex_ref() if you need to keep it |
1054 | | * after you free @match_info object. |
1055 | | * |
1056 | | * Returns: (transfer none): #GRegex object used in @match_info |
1057 | | * |
1058 | | * Since: 2.14 |
1059 | | */ |
1060 | | GRegex * |
1061 | | g_match_info_get_regex (const GMatchInfo *match_info) |
1062 | 0 | { |
1063 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1064 | 0 | return match_info->regex; |
1065 | 0 | } |
1066 | | |
1067 | | /** |
1068 | | * g_match_info_get_string: |
1069 | | * @match_info: a #GMatchInfo |
1070 | | * |
1071 | | * Returns the string searched with @match_info. This is the |
1072 | | * string passed to g_regex_match() or g_regex_replace() so |
1073 | | * you may not free it before calling this function. |
1074 | | * |
1075 | | * Returns: the string searched with @match_info |
1076 | | * |
1077 | | * Since: 2.14 |
1078 | | */ |
1079 | | const gchar * |
1080 | | g_match_info_get_string (const GMatchInfo *match_info) |
1081 | 0 | { |
1082 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1083 | 0 | return match_info->string; |
1084 | 0 | } |
1085 | | |
1086 | | /** |
1087 | | * g_match_info_ref: |
1088 | | * @match_info: a #GMatchInfo |
1089 | | * |
1090 | | * Increases reference count of @match_info by 1. |
1091 | | * |
1092 | | * Returns: @match_info |
1093 | | * |
1094 | | * Since: 2.30 |
1095 | | */ |
1096 | | GMatchInfo * |
1097 | | g_match_info_ref (GMatchInfo *match_info) |
1098 | 0 | { |
1099 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1100 | 0 | g_atomic_int_inc (&match_info->ref_count); |
1101 | 0 | return match_info; |
1102 | 0 | } |
1103 | | |
1104 | | /** |
1105 | | * g_match_info_unref: |
1106 | | * @match_info: a #GMatchInfo |
1107 | | * |
1108 | | * Decreases reference count of @match_info by 1. When reference count drops |
1109 | | * to zero, it frees all the memory associated with the match_info structure. |
1110 | | * |
1111 | | * Since: 2.30 |
1112 | | */ |
1113 | | void |
1114 | | g_match_info_unref (GMatchInfo *match_info) |
1115 | 0 | { |
1116 | 0 | if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
1117 | 0 | { |
1118 | 0 | g_regex_unref (match_info->regex); |
1119 | 0 | if (match_info->match_context) |
1120 | 0 | pcre2_match_context_free (match_info->match_context); |
1121 | 0 | if (match_info->jit_stack) |
1122 | 0 | pcre2_jit_stack_free (match_info->jit_stack); |
1123 | 0 | if (match_info->match_data) |
1124 | 0 | pcre2_match_data_free (match_info->match_data); |
1125 | 0 | g_free (match_info->offsets); |
1126 | 0 | g_free (match_info->workspace); |
1127 | 0 | g_free (match_info); |
1128 | 0 | } |
1129 | 0 | } |
1130 | | |
1131 | | /** |
1132 | | * g_match_info_free: |
1133 | | * @match_info: (nullable): a #GMatchInfo, or %NULL |
1134 | | * |
1135 | | * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
1136 | | * nothing. |
1137 | | * |
1138 | | * Since: 2.14 |
1139 | | */ |
1140 | | void |
1141 | | g_match_info_free (GMatchInfo *match_info) |
1142 | 690 | { |
1143 | 690 | if (match_info == NULL) |
1144 | 690 | return; |
1145 | | |
1146 | 0 | g_match_info_unref (match_info); |
1147 | 0 | } |
1148 | | |
1149 | | /** |
1150 | | * g_match_info_next: |
1151 | | * @match_info: a #GMatchInfo structure |
1152 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1153 | | * |
1154 | | * Scans for the next match using the same parameters of the previous |
1155 | | * call to g_regex_match_full() or g_regex_match() that returned |
1156 | | * @match_info. |
1157 | | * |
1158 | | * The match is done on the string passed to the match function, so you |
1159 | | * cannot free it before calling this function. |
1160 | | * |
1161 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
1162 | | * |
1163 | | * Since: 2.14 |
1164 | | */ |
1165 | | gboolean |
1166 | | g_match_info_next (GMatchInfo *match_info, |
1167 | | GError **error) |
1168 | 0 | { |
1169 | 0 | JITStatus jit_status; |
1170 | 0 | gint prev_match_start; |
1171 | 0 | gint prev_match_end; |
1172 | 0 | uint32_t opts; |
1173 | |
|
1174 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1175 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
1176 | 0 | g_return_val_if_fail (match_info->pos >= 0, FALSE); |
1177 | | |
1178 | 0 | prev_match_start = match_info->offsets[0]; |
1179 | 0 | prev_match_end = match_info->offsets[1]; |
1180 | |
|
1181 | 0 | if (match_info->pos > match_info->string_len) |
1182 | 0 | { |
1183 | | /* we have reached the end of the string */ |
1184 | 0 | match_info->pos = -1; |
1185 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1186 | 0 | return FALSE; |
1187 | 0 | } |
1188 | | |
1189 | 0 | opts = match_info->regex->match_opts | match_info->match_opts; |
1190 | |
|
1191 | 0 | jit_status = enable_jit_with_match_options (match_info, opts); |
1192 | 0 | if (jit_status == JIT_STATUS_ENABLED) |
1193 | 0 | { |
1194 | 0 | match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, |
1195 | 0 | (PCRE2_SPTR8) match_info->string, |
1196 | 0 | match_info->string_len, |
1197 | 0 | match_info->pos, |
1198 | 0 | opts, |
1199 | 0 | match_info->match_data, |
1200 | 0 | match_info->match_context); |
1201 | | /* if the JIT stack limit was reached, fall back to non-JIT matching in |
1202 | | * the next conditional statement */ |
1203 | 0 | if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT) |
1204 | 0 | { |
1205 | 0 | g_debug ("PCRE2 JIT stack limit reached, falling back to " |
1206 | 0 | "non-optimized matching."); |
1207 | 0 | opts |= PCRE2_NO_JIT; |
1208 | 0 | jit_status = JIT_STATUS_DISABLED; |
1209 | 0 | } |
1210 | 0 | } |
1211 | |
|
1212 | 0 | if (jit_status != JIT_STATUS_ENABLED) |
1213 | 0 | { |
1214 | 0 | match_info->matches = pcre2_match (match_info->regex->pcre_re, |
1215 | 0 | (PCRE2_SPTR8) match_info->string, |
1216 | 0 | match_info->string_len, |
1217 | 0 | match_info->pos, |
1218 | 0 | opts, |
1219 | 0 | match_info->match_data, |
1220 | 0 | match_info->match_context); |
1221 | 0 | } |
1222 | |
|
1223 | 0 | if (IS_PCRE2_ERROR (match_info->matches)) |
1224 | 0 | { |
1225 | 0 | gchar *error_msg = get_match_error_message (match_info->matches); |
1226 | |
|
1227 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
1228 | 0 | _("Error while matching regular expression %s: %s"), |
1229 | 0 | match_info->regex->pattern, error_msg); |
1230 | 0 | g_clear_pointer (&error_msg, g_free); |
1231 | 0 | return FALSE; |
1232 | 0 | } |
1233 | 0 | else if (match_info->matches == 0) |
1234 | 0 | { |
1235 | | /* info->offsets is too small. */ |
1236 | 0 | match_info->n_offsets *= 2; |
1237 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
1238 | 0 | match_info->n_offsets, |
1239 | 0 | sizeof (gint)); |
1240 | |
|
1241 | 0 | pcre2_match_data_free (match_info->match_data); |
1242 | 0 | match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); |
1243 | |
|
1244 | 0 | return g_match_info_next (match_info, error); |
1245 | 0 | } |
1246 | 0 | else if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1247 | 0 | { |
1248 | | /* We're done with this match info */ |
1249 | 0 | match_info->pos = -1; |
1250 | 0 | return FALSE; |
1251 | 0 | } |
1252 | 0 | else |
1253 | 0 | if (!recalc_match_offsets (match_info, error)) |
1254 | 0 | return FALSE; |
1255 | | |
1256 | | /* avoid infinite loops if the pattern is an empty string or something |
1257 | | * equivalent */ |
1258 | 0 | if (match_info->pos == match_info->offsets[1]) |
1259 | 0 | { |
1260 | 0 | if (match_info->pos > match_info->string_len) |
1261 | 0 | { |
1262 | | /* we have reached the end of the string */ |
1263 | 0 | match_info->pos = -1; |
1264 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1265 | 0 | return FALSE; |
1266 | 0 | } |
1267 | | |
1268 | 0 | match_info->pos = NEXT_CHAR (match_info->regex, |
1269 | 0 | &match_info->string[match_info->pos]) - |
1270 | 0 | match_info->string; |
1271 | 0 | } |
1272 | 0 | else |
1273 | 0 | { |
1274 | 0 | match_info->pos = match_info->offsets[1]; |
1275 | 0 | } |
1276 | | |
1277 | 0 | g_assert (match_info->matches < 0 || |
1278 | 0 | (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); |
1279 | | |
1280 | | /* it's possible to get two identical matches when we are matching |
1281 | | * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and |
1282 | | * the string is "RegExTest" we have: |
1283 | | * - search at position 0: match from 0 to 0 |
1284 | | * - search at position 1: match from 3 to 3 |
1285 | | * - search at position 3: match from 3 to 3 (duplicate) |
1286 | | * - search at position 4: match from 5 to 5 |
1287 | | * - search at position 5: match from 5 to 5 (duplicate) |
1288 | | * - search at position 6: no match -> stop |
1289 | | * so we have to ignore the duplicates. |
1290 | | * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ |
1291 | 0 | if (match_info->matches >= 0 && |
1292 | 0 | prev_match_start == match_info->offsets[0] && |
1293 | 0 | prev_match_end == match_info->offsets[1]) |
1294 | 0 | { |
1295 | | /* ignore this match and search the next one */ |
1296 | 0 | return g_match_info_next (match_info, error); |
1297 | 0 | } |
1298 | | |
1299 | 0 | return match_info->matches >= 0; |
1300 | 0 | } |
1301 | | |
1302 | | /** |
1303 | | * g_match_info_matches: |
1304 | | * @match_info: a #GMatchInfo structure |
1305 | | * |
1306 | | * Returns whether the previous match operation succeeded. |
1307 | | * |
1308 | | * Returns: %TRUE if the previous match operation succeeded, |
1309 | | * %FALSE otherwise |
1310 | | * |
1311 | | * Since: 2.14 |
1312 | | */ |
1313 | | gboolean |
1314 | | g_match_info_matches (const GMatchInfo *match_info) |
1315 | 0 | { |
1316 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1317 | | |
1318 | 0 | return match_info->matches >= 0; |
1319 | 0 | } |
1320 | | |
1321 | | /** |
1322 | | * g_match_info_get_match_count: |
1323 | | * @match_info: a #GMatchInfo structure |
1324 | | * |
1325 | | * Retrieves the number of matched substrings (including substring 0, |
1326 | | * that is the whole matched text), so 1 is returned if the pattern |
1327 | | * has no substrings in it and 0 is returned if the match failed. |
1328 | | * |
1329 | | * If the last match was obtained using the DFA algorithm, that is |
1330 | | * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1331 | | * count is not that of the number of capturing parentheses but that of |
1332 | | * the number of matched substrings. |
1333 | | * |
1334 | | * Returns: Number of matched substrings, or -1 if an error occurred |
1335 | | * |
1336 | | * Since: 2.14 |
1337 | | */ |
1338 | | gint |
1339 | | g_match_info_get_match_count (const GMatchInfo *match_info) |
1340 | 0 | { |
1341 | 0 | g_return_val_if_fail (match_info, -1); |
1342 | | |
1343 | 0 | if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1344 | | /* no match */ |
1345 | 0 | return 0; |
1346 | 0 | else if (match_info->matches < PCRE2_ERROR_NOMATCH) |
1347 | | /* error */ |
1348 | 0 | return -1; |
1349 | 0 | else |
1350 | | /* match */ |
1351 | 0 | return match_info->matches; |
1352 | 0 | } |
1353 | | |
1354 | | /** |
1355 | | * g_match_info_is_partial_match: |
1356 | | * @match_info: a #GMatchInfo structure |
1357 | | * |
1358 | | * Usually if the string passed to g_regex_match*() matches as far as |
1359 | | * it goes, but is too short to match the entire pattern, %FALSE is |
1360 | | * returned. There are circumstances where it might be helpful to |
1361 | | * distinguish this case from other cases in which there is no match. |
1362 | | * |
1363 | | * Consider, for example, an application where a human is required to |
1364 | | * type in data for a field with specific formatting requirements. An |
1365 | | * example might be a date in the form ddmmmyy, defined by the pattern |
1366 | | * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
1367 | | * If the application sees the user’s keystrokes one by one, and can |
1368 | | * check that what has been typed so far is potentially valid, it is |
1369 | | * able to raise an error as soon as a mistake is made. |
1370 | | * |
1371 | | * GRegex supports the concept of partial matching by means of the |
1372 | | * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags. |
1373 | | * When they are used, the return code for |
1374 | | * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
1375 | | * for a complete match, %FALSE otherwise. But, when these functions |
1376 | | * return %FALSE, you can check if the match was partial calling |
1377 | | * g_match_info_is_partial_match(). |
1378 | | * |
1379 | | * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and |
1380 | | * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
1381 | | * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
1382 | | * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching |
1383 | | * stops at the partial match. |
1384 | | * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD |
1385 | | * are set, the latter takes precedence. |
1386 | | * |
1387 | | * There were formerly some restrictions on the pattern for partial matching. |
1388 | | * The restrictions no longer apply. |
1389 | | * |
1390 | | * See pcrepartial(3) for more information on partial matching. |
1391 | | * |
1392 | | * Returns: %TRUE if the match was partial, %FALSE otherwise |
1393 | | * |
1394 | | * Since: 2.14 |
1395 | | */ |
1396 | | gboolean |
1397 | | g_match_info_is_partial_match (const GMatchInfo *match_info) |
1398 | 0 | { |
1399 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1400 | | |
1401 | 0 | return match_info->matches == PCRE2_ERROR_PARTIAL; |
1402 | 0 | } |
1403 | | |
1404 | | /** |
1405 | | * g_match_info_expand_references: |
1406 | | * @match_info: (nullable): a #GMatchInfo or %NULL |
1407 | | * @string_to_expand: the string to expand |
1408 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1409 | | * |
1410 | | * Returns a new string containing the text in @string_to_expand with |
1411 | | * references and escape sequences expanded. References refer to the last |
1412 | | * match done with @string against @regex and have the same syntax used by |
1413 | | * g_regex_replace(). |
1414 | | * |
1415 | | * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was |
1416 | | * passed to g_regex_new(). |
1417 | | * |
1418 | | * The backreferences are extracted from the string passed to the match |
1419 | | * function, so you cannot call this function after freeing the string. |
1420 | | * |
1421 | | * @match_info may be %NULL in which case @string_to_expand must not |
1422 | | * contain references. For instance "foo\n" does not refer to an actual |
1423 | | * pattern and '\n' merely will be replaced with \n character, |
1424 | | * while to expand "\0" (whole match) one needs the result of a match. |
1425 | | * Use g_regex_check_replacement() to find out whether @string_to_expand |
1426 | | * contains references. |
1427 | | * |
1428 | | * Returns: (nullable): the expanded string, or %NULL if an error occurred |
1429 | | * |
1430 | | * Since: 2.14 |
1431 | | */ |
1432 | | gchar * |
1433 | | g_match_info_expand_references (const GMatchInfo *match_info, |
1434 | | const gchar *string_to_expand, |
1435 | | GError **error) |
1436 | 0 | { |
1437 | 0 | GString *result; |
1438 | 0 | GList *list; |
1439 | 0 | GError *tmp_error = NULL; |
1440 | |
|
1441 | 0 | g_return_val_if_fail (string_to_expand != NULL, NULL); |
1442 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1443 | | |
1444 | 0 | list = split_replacement (string_to_expand, &tmp_error); |
1445 | 0 | if (tmp_error != NULL) |
1446 | 0 | { |
1447 | 0 | g_propagate_error (error, tmp_error); |
1448 | 0 | return NULL; |
1449 | 0 | } |
1450 | | |
1451 | 0 | if (!match_info && interpolation_list_needs_match (list)) |
1452 | 0 | { |
1453 | 0 | g_critical ("String '%s' contains references to the match, can't " |
1454 | 0 | "expand references without GMatchInfo object", |
1455 | 0 | string_to_expand); |
1456 | 0 | return NULL; |
1457 | 0 | } |
1458 | | |
1459 | 0 | result = g_string_sized_new (strlen (string_to_expand)); |
1460 | 0 | interpolate_replacement (match_info, result, list); |
1461 | |
|
1462 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
1463 | |
|
1464 | 0 | return g_string_free (result, FALSE); |
1465 | 0 | } |
1466 | | |
1467 | | /** |
1468 | | * g_match_info_fetch: |
1469 | | * @match_info: #GMatchInfo structure |
1470 | | * @match_num: number of the sub expression |
1471 | | * |
1472 | | * Retrieves the text matching the @match_num'th capturing |
1473 | | * parentheses. 0 is the full text of the match, 1 is the first paren |
1474 | | * set, 2 the second, and so on. |
1475 | | * |
1476 | | * If @match_num is a valid sub pattern but it didn't match anything |
1477 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
1478 | | * string is returned. |
1479 | | * |
1480 | | * If the match was obtained using the DFA algorithm, that is using |
1481 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1482 | | * string is not that of a set of parentheses but that of a matched |
1483 | | * substring. Substrings are matched in reverse order of length, so |
1484 | | * 0 is the longest match. |
1485 | | * |
1486 | | * The string is fetched from the string passed to the match function, |
1487 | | * so you cannot call this function after freeing the string. |
1488 | | * |
1489 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1490 | | * occurred. You have to free the string yourself |
1491 | | * |
1492 | | * Since: 2.14 |
1493 | | */ |
1494 | | gchar * |
1495 | | g_match_info_fetch (const GMatchInfo *match_info, |
1496 | | gint match_num) |
1497 | 0 | { |
1498 | 0 | gchar *match = NULL; |
1499 | 0 | gint start, end; |
1500 | |
|
1501 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1502 | 0 | g_return_val_if_fail (match_num >= 0, NULL); |
1503 | | |
1504 | | /* match_num does not exist or it didn't matched, i.e. matching "b" |
1505 | | * against "(a)?b" then group 0 is empty. */ |
1506 | 0 | if (!g_match_info_fetch_pos (match_info, match_num, &start, &end)) |
1507 | 0 | match = NULL; |
1508 | 0 | else if (start == -1) |
1509 | 0 | match = g_strdup (""); |
1510 | 0 | else |
1511 | 0 | match = g_strndup (&match_info->string[start], end - start); |
1512 | |
|
1513 | 0 | return match; |
1514 | 0 | } |
1515 | | |
1516 | | /** |
1517 | | * g_match_info_fetch_pos: |
1518 | | * @match_info: #GMatchInfo structure |
1519 | | * @match_num: number of the capture parenthesis |
1520 | | * @start_pos: (out) (optional): pointer to location where to store |
1521 | | * the start position, or %NULL |
1522 | | * @end_pos: (out) (optional): pointer to location where to store |
1523 | | * the end position (the byte after the final byte of the match), or %NULL |
1524 | | * |
1525 | | * Returns the start and end positions (in bytes) of a successfully matching |
1526 | | * capture parenthesis. |
1527 | | * |
1528 | | * Valid values for @match_num are `0` for the full text of the match, |
1529 | | * `1` for the first paren set, `2` for the second, and so on. |
1530 | | * |
1531 | | * As @end_pos is set to the byte after the final byte of the match (on success), |
1532 | | * the length of the match can be calculated as `end_pos - start_pos`. |
1533 | | * |
1534 | | * As a best practice, initialize @start_pos and @end_pos to identifiable |
1535 | | * values, such as `G_MAXINT`, so that you can test if |
1536 | | * `g_match_info_fetch_pos()` actually changed the value for a given |
1537 | | * capture parenthesis. |
1538 | | * |
1539 | | * The parameter @match_num corresponds to a matched capture parenthesis. The |
1540 | | * actual value you use for @match_num depends on the method used to generate |
1541 | | * @match_info. The following sections describe those methods. |
1542 | | * |
1543 | | * ## Methods Using Non-deterministic Finite Automata Matching |
1544 | | * |
1545 | | * The methods [method@GLib.Regex.match] and [method@GLib.Regex.match_full] |
1546 | | * return a [struct@GLib.MatchInfo] using traditional (greedy) pattern |
1547 | | * matching, also known as |
1548 | | * [Non-deterministic Finite Automaton](https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton) |
1549 | | * (NFA) matching. You pass the returned `GMatchInfo` from these methods to |
1550 | | * `g_match_info_fetch_pos()` to determine the start and end positions |
1551 | | * of capture parentheses. The values for @match_num correspond to the capture |
1552 | | * parentheses in order, with `0` corresponding to the entire matched string. |
1553 | | * |
1554 | | * @match_num can refer to a capture parenthesis with no match. For example, |
1555 | | * the string `b` matches against the pattern `(a)?b`, but the capture |
1556 | | * parenthesis `(a)` has no match. In this case, `g_match_info_fetch_pos()` |
1557 | | * returns true and sets @start_pos and @end_pos to `-1` when called with |
1558 | | * `match_num` as `1` (for `(a)`). |
1559 | | * |
1560 | | * For an expanded example, a regex pattern is `(a)?(.*?)the (.*)`, |
1561 | | * and a candidate string is `glib regexes are the best`. In this scenario |
1562 | | * there are four capture parentheses numbered 0–3: an implicit one |
1563 | | * for the entire string, and three explicitly declared in the regex pattern. |
1564 | | * |
1565 | | * Given this example, the following table describes the return values |
1566 | | * from `g_match_info_fetch_pos()` for various values of @match_num. |
1567 | | * |
1568 | | * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos` |
1569 | | * ----------- | -------- | ------------ | -------------------- | ------------------ |
1570 | | * 0 | Matches entire string | True | 0 | 25 |
1571 | | * 1 | Does not match first character | True | -1 | -1 |
1572 | | * 2 | All text before `the ` | True | 0 | 17 |
1573 | | * 3 | All text after `the ` | True | 21 | 25 |
1574 | | * 4 | Capture paren out of range | False | Unchanged | Unchanged |
1575 | | * |
1576 | | * The following code sample and output implements this example. |
1577 | | * |
1578 | | * ``` { .c } |
1579 | | * #include <glib.h> |
1580 | | * |
1581 | | * int |
1582 | | * main (int argc, char *argv[]) |
1583 | | * { |
1584 | | * g_autoptr(GError) local_error = NULL; |
1585 | | * const char *regex_pattern = "(a)?(.*?)the (.*)"; |
1586 | | * const char *test_string = "glib regexes are the best"; |
1587 | | * g_autoptr(GRegex) regex = NULL; |
1588 | | * |
1589 | | * regex = g_regex_new (regex_pattern, |
1590 | | * G_REGEX_DEFAULT, |
1591 | | * G_REGEX_MATCH_DEFAULT, |
1592 | | * &local_error); |
1593 | | * if (regex == NULL) |
1594 | | * { |
1595 | | * g_printerr ("Error creating regex: %s\n", local_error->message); |
1596 | | * return 1; |
1597 | | * } |
1598 | | * |
1599 | | * g_autoptr(GMatchInfo) match_info = NULL; |
1600 | | * g_regex_match (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info); |
1601 | | * |
1602 | | * int n_matched_strings = g_match_info_get_match_count (match_info); |
1603 | | * |
1604 | | * // Print header line |
1605 | | * g_print ("match_num Contents Return value returned start_pos returned end_pos\n"); |
1606 | | * |
1607 | | * // Iterate over each capture paren, including one that is out of range as a demonstration. |
1608 | | * for (int match_num = 0; match_num <= n_matched_strings; match_num++) |
1609 | | * { |
1610 | | * gboolean found_match; |
1611 | | * g_autofree char *paren_string = NULL; |
1612 | | * int start_pos = G_MAXINT; |
1613 | | * int end_pos = G_MAXINT; |
1614 | | * |
1615 | | * found_match = g_match_info_fetch_pos (match_info, |
1616 | | * match_num, |
1617 | | * &start_pos, |
1618 | | * &end_pos); |
1619 | | * |
1620 | | * // If no match, display N/A as the found string. |
1621 | | * if (start_pos == G_MAXINT || start_pos == -1) |
1622 | | * paren_string = g_strdup ("N/A"); |
1623 | | * else |
1624 | | * paren_string = g_strndup (test_string + start_pos, end_pos - start_pos); |
1625 | | * |
1626 | | * g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos); |
1627 | | * } |
1628 | | * |
1629 | | * return 0; |
1630 | | * } |
1631 | | * ``` |
1632 | | * |
1633 | | * ``` |
1634 | | * match_num Contents Return value returned start_pos returned end_pos |
1635 | | * 0 glib regexes are the best 1 0 25 |
1636 | | * 1 N/A 1 -1 -1 |
1637 | | * 2 glib regexes are 1 0 17 |
1638 | | * 3 best 1 21 25 |
1639 | | * 4 N/A 0 2147483647 2147483647 |
1640 | | * ``` |
1641 | | * ## Methods Using Deterministic Finite Automata Matching |
1642 | | * |
1643 | | * The methods [method@GLib.Regex.match_all] and |
1644 | | * [method@GLib.Regex.match_all_full] |
1645 | | * return a `GMatchInfo` using |
1646 | | * [Deterministic Finite Automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) |
1647 | | * (DFA) pattern matching. This algorithm detects overlapping matches. You pass |
1648 | | * the returned `GMatchInfo` from these methods to `g_match_info_fetch_pos()` |
1649 | | * to determine the start and end positions of each overlapping match. Use the |
1650 | | * method [method@GLib.MatchInfo.get_match_count] to determine the number |
1651 | | * of overlapping matches. |
1652 | | * |
1653 | | * For example, a regex pattern is `<.*>`, and a candidate string is |
1654 | | * `<a> <b> <c>`. In this scenario there are three implicit capture |
1655 | | * parentheses: one for the entire string, one for `<a> <b>`, and one for `<a>`. |
1656 | | * |
1657 | | * Given this example, the following table describes the return values from |
1658 | | * `g_match_info_fetch_pos()` for various values of @match_num. |
1659 | | * |
1660 | | * `match_num` | Contents | Return value | Returned `start_pos` | Returned `end_pos` |
1661 | | * ----------- | -------- | ------------ | -------------------- | ------------------ |
1662 | | * 0 | Matches entire string | True | 0 | 11 |
1663 | | * 1 | Matches `<a> <b>` | True | 0 | 7 |
1664 | | * 2 | Matches `<a>` | True | 0 | 3 |
1665 | | * 3 | Capture paren out of range | False | Unchanged | Unchanged |
1666 | | * |
1667 | | * The following code sample and output implements this example. |
1668 | | * |
1669 | | * ``` { .c } |
1670 | | * #include <glib.h> |
1671 | | * |
1672 | | * int |
1673 | | * main (int argc, char *argv[]) |
1674 | | * { |
1675 | | * g_autoptr(GError) local_error = NULL; |
1676 | | * const char *regex_pattern = "<.*>"; |
1677 | | * const char *test_string = "<a> <b> <c>"; |
1678 | | * g_autoptr(GRegex) regex = NULL; |
1679 | | * |
1680 | | * regex = g_regex_new (regex_pattern, |
1681 | | * G_REGEX_DEFAULT, |
1682 | | * G_REGEX_MATCH_DEFAULT, |
1683 | | * &local_error); |
1684 | | * if (regex == NULL) |
1685 | | * { |
1686 | | * g_printerr ("Error creating regex: %s\n", local_error->message); |
1687 | | * return -1; |
1688 | | * } |
1689 | | * |
1690 | | * g_autoptr(GMatchInfo) match_info = NULL; |
1691 | | * g_regex_match_all (regex, test_string, G_REGEX_MATCH_DEFAULT, &match_info); |
1692 | | * |
1693 | | * int n_matched_strings = g_match_info_get_match_count (match_info); |
1694 | | * |
1695 | | * // Print header line |
1696 | | * g_print ("match_num Contents Return value returned start_pos returned end_pos\n"); |
1697 | | * |
1698 | | * // Iterate over each capture paren, including one that is out of range as a demonstration. |
1699 | | * for (int match_num = 0; match_num <= n_matched_strings; match_num++) |
1700 | | * { |
1701 | | * gboolean found_match; |
1702 | | * g_autofree char *paren_string = NULL; |
1703 | | * int start_pos = G_MAXINT; |
1704 | | * int end_pos = G_MAXINT; |
1705 | | * |
1706 | | * found_match = g_match_info_fetch_pos (match_info, match_num, &start_pos, &end_pos); |
1707 | | * |
1708 | | * // If no match, display N/A as the found string. |
1709 | | * if (start_pos == G_MAXINT || start_pos == -1) |
1710 | | * paren_string = g_strdup ("N/A"); |
1711 | | * else |
1712 | | * paren_string = g_strndup (test_string + start_pos, end_pos - start_pos); |
1713 | | * |
1714 | | * g_print ("%-9d %-25s %-12d %-18d %d\n", match_num, paren_string, found_match, start_pos, end_pos); |
1715 | | * } |
1716 | | * |
1717 | | * return 0; |
1718 | | * } |
1719 | | * ``` |
1720 | | * |
1721 | | * ``` |
1722 | | * match_num Contents Return value returned start_pos returned end_pos |
1723 | | * 0 <a> <b> <c> 1 0 11 |
1724 | | * 1 <a> <b> 1 0 7 |
1725 | | * 2 <a> 1 0 3 |
1726 | | * 3 N/A 0 2147483647 2147483647 |
1727 | | * ``` |
1728 | | * |
1729 | | * Returns: True if @match_num is within range, false otherwise. If |
1730 | | * the capture paren has a match, @start_pos and @end_pos contain the |
1731 | | * start and end positions (in bytes) of the matching substring. If the |
1732 | | * capture paren has no match, @start_pos and @end_pos are `-1`. If |
1733 | | * @match_num is out of range, @start_pos and @end_pos are left unchanged. |
1734 | | * |
1735 | | * Since: 2.14 |
1736 | | */ |
1737 | | gboolean |
1738 | | g_match_info_fetch_pos (const GMatchInfo *match_info, |
1739 | | gint match_num, |
1740 | | gint *start_pos, |
1741 | | gint *end_pos) |
1742 | 0 | { |
1743 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1744 | 0 | g_return_val_if_fail (match_num >= 0, FALSE); |
1745 | | |
1746 | | /* check whether there was an error */ |
1747 | 0 | if (match_info->matches < 0) |
1748 | 0 | return FALSE; |
1749 | | |
1750 | | /* make sure the sub expression number they're requesting is less than |
1751 | | * the total number of sub expressions in the regex. When matching all |
1752 | | * (g_regex_match_all()), also compare against the number of matches */ |
1753 | 0 | if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) |
1754 | 0 | return FALSE; |
1755 | | |
1756 | 0 | if (start_pos != NULL) |
1757 | 0 | *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; |
1758 | |
|
1759 | 0 | if (end_pos != NULL) |
1760 | 0 | *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; |
1761 | |
|
1762 | 0 | return TRUE; |
1763 | 0 | } |
1764 | | |
1765 | | /* |
1766 | | * Returns number of first matched subpattern with name @name. |
1767 | | * There may be more than one in case when DUPNAMES is used, |
1768 | | * and not all subpatterns with that name match; |
1769 | | * pcre2_substring_number_from_name() does not work in that case. |
1770 | | */ |
1771 | | static gint |
1772 | | get_matched_substring_number (const GMatchInfo *match_info, |
1773 | | const gchar *name) |
1774 | 0 | { |
1775 | 0 | gint entrysize; |
1776 | 0 | PCRE2_SPTR first, last; |
1777 | 0 | guchar *entry; |
1778 | |
|
1779 | 0 | if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) |
1780 | 0 | return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); |
1781 | | |
1782 | | /* This code is analogous to code from pcre2_substring.c: |
1783 | | * pcre2_substring_get_byname() */ |
1784 | 0 | entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, |
1785 | 0 | (PCRE2_SPTR8) name, |
1786 | 0 | &first, |
1787 | 0 | &last); |
1788 | |
|
1789 | 0 | if (entrysize <= 0) |
1790 | 0 | return entrysize; |
1791 | | |
1792 | 0 | for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
1793 | 0 | { |
1794 | 0 | guint n = (entry[0] << 8) + entry[1]; |
1795 | 0 | if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0) |
1796 | 0 | return n; |
1797 | 0 | } |
1798 | | |
1799 | 0 | return (first[0] << 8) + first[1]; |
1800 | 0 | } |
1801 | | |
1802 | | /** |
1803 | | * g_match_info_fetch_named: |
1804 | | * @match_info: #GMatchInfo structure |
1805 | | * @name: name of the subexpression |
1806 | | * |
1807 | | * Retrieves the text matching the capturing parentheses named @name. |
1808 | | * |
1809 | | * If @name is a valid sub pattern name but it didn't match anything |
1810 | | * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`) |
1811 | | * then an empty string is returned. |
1812 | | * |
1813 | | * The string is fetched from the string passed to the match function, |
1814 | | * so you cannot call this function after freeing the string. |
1815 | | * |
1816 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1817 | | * occurred. You have to free the string yourself |
1818 | | * |
1819 | | * Since: 2.14 |
1820 | | */ |
1821 | | gchar * |
1822 | | g_match_info_fetch_named (const GMatchInfo *match_info, |
1823 | | const gchar *name) |
1824 | 0 | { |
1825 | 0 | gint num; |
1826 | |
|
1827 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1828 | 0 | g_return_val_if_fail (name != NULL, NULL); |
1829 | | |
1830 | 0 | num = get_matched_substring_number (match_info, name); |
1831 | 0 | if (num < 0) |
1832 | 0 | return NULL; |
1833 | 0 | else |
1834 | 0 | return g_match_info_fetch (match_info, num); |
1835 | 0 | } |
1836 | | |
1837 | | /** |
1838 | | * g_match_info_fetch_named_pos: |
1839 | | * @match_info: #GMatchInfo structure |
1840 | | * @name: name of the subexpression |
1841 | | * @start_pos: (out) (optional): pointer to location where to store |
1842 | | * the start position, or %NULL |
1843 | | * @end_pos: (out) (optional): pointer to location where to store |
1844 | | * the end position (the byte after the final byte of the match), or %NULL |
1845 | | * |
1846 | | * Retrieves the position in bytes of the capturing parentheses named @name. |
1847 | | * |
1848 | | * If @name is a valid sub pattern name but it didn't match anything |
1849 | | * (e.g. sub pattern `"X"`, matching `"b"` against `"(?P<X>a)?b"`) |
1850 | | * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
1851 | | * |
1852 | | * As @end_pos is set to the byte after the final byte of the match (on success), |
1853 | | * the length of the match can be calculated as `end_pos - start_pos`. |
1854 | | * |
1855 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
1856 | | * If the position cannot be fetched, @start_pos and @end_pos |
1857 | | * are left unchanged. |
1858 | | * |
1859 | | * Since: 2.14 |
1860 | | */ |
1861 | | gboolean |
1862 | | g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
1863 | | const gchar *name, |
1864 | | gint *start_pos, |
1865 | | gint *end_pos) |
1866 | 0 | { |
1867 | 0 | gint num; |
1868 | |
|
1869 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1870 | 0 | g_return_val_if_fail (name != NULL, FALSE); |
1871 | | |
1872 | 0 | num = get_matched_substring_number (match_info, name); |
1873 | 0 | if (num < 0) |
1874 | 0 | return FALSE; |
1875 | | |
1876 | 0 | return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); |
1877 | 0 | } |
1878 | | |
1879 | | /** |
1880 | | * g_match_info_fetch_all: |
1881 | | * @match_info: a #GMatchInfo structure |
1882 | | * |
1883 | | * Bundles up pointers to each of the matching substrings from a match |
1884 | | * and stores them in an array of gchar pointers. The first element in |
1885 | | * the returned array is the match number 0, i.e. the entire matched |
1886 | | * text. |
1887 | | * |
1888 | | * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
1889 | | * "b" against "(a)?b") then an empty string is inserted. |
1890 | | * |
1891 | | * If the last match was obtained using the DFA algorithm, that is using |
1892 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1893 | | * strings are not that matched by sets of parentheses but that of the |
1894 | | * matched substring. Substrings are matched in reverse order of length, |
1895 | | * so the first one is the longest match. |
1896 | | * |
1897 | | * The strings are fetched from the string passed to the match function, |
1898 | | * so you cannot call this function after freeing the string. |
1899 | | * |
1900 | | * Returns: (transfer full): a %NULL-terminated array of gchar * |
1901 | | * pointers. It must be freed using g_strfreev(). If the previous |
1902 | | * match failed %NULL is returned |
1903 | | * |
1904 | | * Since: 2.14 |
1905 | | */ |
1906 | | gchar ** |
1907 | | g_match_info_fetch_all (const GMatchInfo *match_info) |
1908 | 0 | { |
1909 | 0 | gchar **result; |
1910 | 0 | gint i; |
1911 | |
|
1912 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1913 | | |
1914 | 0 | if (match_info->matches < 0) |
1915 | 0 | return NULL; |
1916 | | |
1917 | 0 | result = g_new (gchar *, match_info->matches + 1); |
1918 | 0 | for (i = 0; i < match_info->matches; i++) |
1919 | 0 | result[i] = g_match_info_fetch (match_info, i); |
1920 | 0 | result[i] = NULL; |
1921 | |
|
1922 | 0 | return result; |
1923 | 0 | } |
1924 | | |
1925 | | |
1926 | | /* GRegex */ |
1927 | | |
/* Instantiates the quark accessor named after the second argument for the
 * string "g-regex-error-quark".
 * NOTE(review): presumably this is what backs the G_REGEX_ERROR domain
 * used by the error-reporting code below -- confirm in gregex.h. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1929 | | |
1930 | | /** |
1931 | | * g_regex_ref: |
1932 | | * @regex: a #GRegex |
1933 | | * |
1934 | | * Increases reference count of @regex by 1. |
1935 | | * |
1936 | | * Returns: @regex |
1937 | | * |
1938 | | * Since: 2.14 |
1939 | | */ |
1940 | | GRegex * |
1941 | | g_regex_ref (GRegex *regex) |
1942 | 0 | { |
1943 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1944 | 0 | g_atomic_int_inc (®ex->ref_count); |
1945 | 0 | return regex; |
1946 | 0 | } |
1947 | | |
1948 | | /** |
1949 | | * g_regex_unref: |
1950 | | * @regex: a #GRegex |
1951 | | * |
1952 | | * Decreases reference count of @regex by 1. When reference count drops |
1953 | | * to zero, it frees all the memory associated with the regex structure. |
1954 | | * |
1955 | | * Since: 2.14 |
1956 | | */ |
1957 | | void |
1958 | | g_regex_unref (GRegex *regex) |
1959 | 0 | { |
1960 | 0 | g_return_if_fail (regex != NULL); |
1961 | | |
1962 | 0 | if (g_atomic_int_dec_and_test (®ex->ref_count)) |
1963 | 0 | { |
1964 | 0 | g_free (regex->pattern); |
1965 | 0 | if (regex->pcre_re != NULL) |
1966 | 0 | pcre2_code_free (regex->pcre_re); |
1967 | 0 | g_free (regex); |
1968 | 0 | } |
1969 | 0 | } |
1970 | | |
/* Forward declarations of file-local helpers that g_regex_new() calls
 * before their definitions appear further down in this file. */

/* Compiles @pattern with the given PCRE2 compile, newline and BSR options;
 * returns NULL and sets @error on failure. */
static pcre2_code * regex_compile (const gchar  *pattern,
                                   uint32_t      compile_options,
                                   uint32_t      newline_options,
                                   uint32_t      bsr_options,
                                   GError      **error);

/* Returns @compile_options augmented with the options that PCRE2 recorded
 * from the pattern itself (e.g. a leading "(?i)"). */
static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
                                                  uint32_t    compile_options);
1979 | | |
1980 | | /** |
1981 | | * g_regex_new: |
1982 | | * @pattern: the regular expression |
1983 | | * @compile_options: compile options for the regular expression, or 0 |
1984 | | * @match_options: match options for the regular expression, or 0 |
1985 | | * @error: return location for a #GError |
1986 | | * |
1987 | | * Compiles the regular expression to an internal form, and does |
1988 | | * the initial setup of the #GRegex structure. |
1989 | | * |
1990 | | * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
1991 | | * g_regex_unref() when you are done with it |
1992 | | * |
1993 | | * Since: 2.14 |
1994 | | */ |
1995 | | GRegex * |
1996 | | g_regex_new (const gchar *pattern, |
1997 | | GRegexCompileFlags compile_options, |
1998 | | GRegexMatchFlags match_options, |
1999 | | GError **error) |
2000 | 0 | { |
2001 | 0 | GRegex *regex; |
2002 | 0 | pcre2_code *re; |
2003 | 0 | static gsize initialised = 0; |
2004 | 0 | uint32_t pcre_compile_options; |
2005 | 0 | uint32_t pcre_match_options; |
2006 | 0 | uint32_t newline_options; |
2007 | 0 | uint32_t bsr_options; |
2008 | |
|
2009 | 0 | g_return_val_if_fail (pattern != NULL, NULL); |
2010 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2011 | 0 | G_GNUC_BEGIN_IGNORE_DEPRECATIONS |
2012 | 0 | g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | |
2013 | 0 | G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); |
2014 | 0 | G_GNUC_END_IGNORE_DEPRECATIONS |
2015 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2016 | | |
2017 | 0 | if (g_once_init_enter (&initialised)) |
2018 | 0 | { |
2019 | 0 | int supports_utf8; |
2020 | |
|
2021 | 0 | pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); |
2022 | 0 | if (!supports_utf8) |
2023 | 0 | g_critical (_("PCRE library is compiled without UTF8 support")); |
2024 | |
|
2025 | 0 | g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); |
2026 | 0 | } |
2027 | |
|
2028 | 0 | if (G_UNLIKELY (initialised != 1)) |
2029 | 0 | { |
2030 | 0 | g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, |
2031 | 0 | _("PCRE library is compiled with incompatible options")); |
2032 | 0 | return NULL; |
2033 | 0 | } |
2034 | | |
2035 | 0 | pcre_compile_options = get_pcre2_compile_options (compile_options); |
2036 | 0 | pcre_match_options = get_pcre2_match_options (match_options, compile_options); |
2037 | |
|
2038 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
2039 | 0 | if (newline_options == 0) |
2040 | 0 | newline_options = get_pcre2_newline_compile_options (compile_options); |
2041 | |
|
2042 | 0 | if (newline_options == 0) |
2043 | 0 | { |
2044 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
2045 | 0 | "Invalid newline flags"); |
2046 | 0 | return NULL; |
2047 | 0 | } |
2048 | | |
2049 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
2050 | 0 | if (!bsr_options) |
2051 | 0 | bsr_options = get_pcre2_bsr_compile_options (compile_options); |
2052 | |
|
2053 | 0 | re = regex_compile (pattern, pcre_compile_options, |
2054 | 0 | newline_options, bsr_options, error); |
2055 | 0 | if (re == NULL) |
2056 | 0 | return NULL; |
2057 | | |
2058 | 0 | pcre_compile_options |= |
2059 | 0 | get_pcre2_inline_compile_options (re, pcre_compile_options); |
2060 | |
|
2061 | 0 | regex = g_new0 (GRegex, 1); |
2062 | 0 | regex->ref_count = 1; |
2063 | 0 | regex->pattern = g_strdup (pattern); |
2064 | 0 | regex->pcre_re = re; |
2065 | 0 | regex->compile_opts = pcre_compile_options; |
2066 | 0 | regex->orig_compile_opts = compile_options; |
2067 | 0 | regex->match_opts = pcre_match_options; |
2068 | 0 | regex->orig_match_opts = match_options; |
2069 | |
|
2070 | 0 | return regex; |
2071 | 0 | } |
2072 | | |
2073 | | static pcre2_code * |
2074 | | regex_compile (const gchar *pattern, |
2075 | | uint32_t compile_options, |
2076 | | uint32_t newline_options, |
2077 | | uint32_t bsr_options, |
2078 | | GError **error) |
2079 | 0 | { |
2080 | 0 | pcre2_code *re; |
2081 | 0 | pcre2_compile_context *context; |
2082 | 0 | const gchar *errmsg; |
2083 | 0 | PCRE2_SIZE erroffset; |
2084 | 0 | gint errcode; |
2085 | |
|
2086 | 0 | context = pcre2_compile_context_create (NULL); |
2087 | | |
2088 | | /* set newline options */ |
2089 | 0 | if (pcre2_set_newline (context, newline_options) != 0) |
2090 | 0 | { |
2091 | 0 | g_set_error (error, G_REGEX_ERROR, |
2092 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
2093 | 0 | "Invalid newline flags"); |
2094 | 0 | pcre2_compile_context_free (context); |
2095 | 0 | return NULL; |
2096 | 0 | } |
2097 | | |
2098 | | /* set bsr options */ |
2099 | 0 | if (pcre2_set_bsr (context, bsr_options) != 0) |
2100 | 0 | { |
2101 | 0 | g_set_error (error, G_REGEX_ERROR, |
2102 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
2103 | 0 | "Invalid BSR flags"); |
2104 | 0 | pcre2_compile_context_free (context); |
2105 | 0 | return NULL; |
2106 | 0 | } |
2107 | | |
2108 | | /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ |
2109 | 0 | if (compile_options & PCRE2_UTF) |
2110 | 0 | compile_options |= PCRE2_NO_UTF_CHECK; |
2111 | |
|
2112 | 0 | compile_options |= PCRE2_UCP; |
2113 | | |
2114 | | /* compile the pattern */ |
2115 | 0 | re = pcre2_compile ((PCRE2_SPTR8) pattern, |
2116 | 0 | PCRE2_ZERO_TERMINATED, |
2117 | 0 | compile_options, |
2118 | 0 | &errcode, |
2119 | 0 | &erroffset, |
2120 | 0 | context); |
2121 | 0 | pcre2_compile_context_free (context); |
2122 | | |
2123 | | /* if the compilation failed, set the error member and return |
2124 | | * immediately */ |
2125 | 0 | if (re == NULL) |
2126 | 0 | { |
2127 | 0 | GError *tmp_error; |
2128 | 0 | gchar *offset_str; |
2129 | 0 | gchar *pcre2_errmsg = NULL; |
2130 | 0 | int original_errcode; |
2131 | | |
2132 | | /* Translate the PCRE error code to GRegexError and use a translated |
2133 | | * error message if possible */ |
2134 | 0 | original_errcode = errcode; |
2135 | 0 | translate_compile_error (&errcode, &errmsg); |
2136 | |
|
2137 | 0 | if (!errmsg) |
2138 | 0 | { |
2139 | 0 | errmsg = _("unknown error"); |
2140 | 0 | pcre2_errmsg = get_pcre2_error_string (original_errcode); |
2141 | 0 | } |
2142 | | |
2143 | | /* PCRE uses byte offsets but we want to show character offsets */ |
2144 | 0 | erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); |
2145 | |
|
2146 | 0 | offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); |
2147 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, errcode, |
2148 | 0 | _("Error while compiling regular expression ‘%s’ " |
2149 | 0 | "at char %s: %s"), |
2150 | 0 | pattern, offset_str, |
2151 | 0 | pcre2_errmsg ? pcre2_errmsg : errmsg); |
2152 | 0 | g_propagate_error (error, tmp_error); |
2153 | 0 | g_free (offset_str); |
2154 | 0 | g_clear_pointer (&pcre2_errmsg, g_free); |
2155 | |
|
2156 | 0 | return NULL; |
2157 | 0 | } |
2158 | | |
2159 | 0 | return re; |
2160 | 0 | } |
2161 | | |
2162 | | static uint32_t |
2163 | | get_pcre2_inline_compile_options (pcre2_code *re, |
2164 | | uint32_t compile_options) |
2165 | 0 | { |
2166 | 0 | uint32_t pcre_compile_options; |
2167 | 0 | uint32_t nonpcre_compile_options; |
2168 | | |
2169 | | /* For options set at the beginning of the pattern, pcre puts them into |
2170 | | * compile options, e.g. "(?i)foo" will make the pcre structure store |
2171 | | * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ |
2172 | 0 | nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
2173 | 0 | pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); |
2174 | 0 | compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; |
2175 | 0 | compile_options |= nonpcre_compile_options; |
2176 | |
|
2177 | 0 | if (!(compile_options & PCRE2_DUPNAMES)) |
2178 | 0 | { |
2179 | 0 | uint32_t jchanged = 0; |
2180 | 0 | pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); |
2181 | 0 | if (jchanged) |
2182 | 0 | compile_options |= PCRE2_DUPNAMES; |
2183 | 0 | } |
2184 | |
|
2185 | 0 | return compile_options; |
2186 | 0 | } |
2187 | | |
2188 | | /** |
2189 | | * g_regex_get_pattern: |
2190 | | * @regex: a #GRegex structure |
2191 | | * |
2192 | | * Gets the pattern string associated with @regex, i.e. a copy of |
2193 | | * the string passed to g_regex_new(). |
2194 | | * |
2195 | | * Returns: the pattern of @regex |
2196 | | * |
2197 | | * Since: 2.14 |
2198 | | */ |
2199 | | const gchar * |
2200 | | g_regex_get_pattern (const GRegex *regex) |
2201 | 0 | { |
2202 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
2203 | | |
2204 | 0 | return regex->pattern; |
2205 | 0 | } |
2206 | | |
2207 | | /** |
2208 | | * g_regex_get_max_backref: |
2209 | | * @regex: a #GRegex |
2210 | | * |
2211 | | * Returns the number of the highest back reference |
2212 | | * in the pattern, or 0 if the pattern does not contain |
2213 | | * back references. |
2214 | | * |
2215 | | * Returns: the number of the highest back reference |
2216 | | * |
2217 | | * Since: 2.14 |
2218 | | */ |
2219 | | gint |
2220 | | g_regex_get_max_backref (const GRegex *regex) |
2221 | 0 | { |
2222 | 0 | uint32_t value; |
2223 | |
|
2224 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); |
2225 | |
|
2226 | 0 | return value; |
2227 | 0 | } |
2228 | | |
2229 | | /** |
2230 | | * g_regex_get_capture_count: |
2231 | | * @regex: a #GRegex |
2232 | | * |
2233 | | * Returns the number of capturing subpatterns in the pattern. |
2234 | | * |
2235 | | * Returns: the number of capturing subpatterns |
2236 | | * |
2237 | | * Since: 2.14 |
2238 | | */ |
2239 | | gint |
2240 | | g_regex_get_capture_count (const GRegex *regex) |
2241 | 0 | { |
2242 | 0 | uint32_t value; |
2243 | |
|
2244 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); |
2245 | |
|
2246 | 0 | return value; |
2247 | 0 | } |
2248 | | |
2249 | | /** |
2250 | | * g_regex_get_has_cr_or_lf: |
2251 | | * @regex: a #GRegex structure |
2252 | | * |
2253 | | * Checks whether the pattern contains explicit CR or LF references. |
2254 | | * |
2255 | | * Returns: %TRUE if the pattern contains explicit CR or LF references |
2256 | | * |
2257 | | * Since: 2.34 |
2258 | | */ |
2259 | | gboolean |
2260 | | g_regex_get_has_cr_or_lf (const GRegex *regex) |
2261 | 0 | { |
2262 | 0 | uint32_t value; |
2263 | |
|
2264 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); |
2265 | |
|
2266 | 0 | return !!value; |
2267 | 0 | } |
2268 | | |
2269 | | /** |
2270 | | * g_regex_get_max_lookbehind: |
2271 | | * @regex: a #GRegex structure |
2272 | | * |
2273 | | * Gets the number of characters in the longest lookbehind assertion in the |
2274 | | * pattern. This information is useful when doing multi-segment matching using |
2275 | | * the partial matching facilities. |
2276 | | * |
2277 | | * Returns: the number of characters in the longest lookbehind assertion. |
2278 | | * |
2279 | | * Since: 2.38 |
2280 | | */ |
2281 | | gint |
2282 | | g_regex_get_max_lookbehind (const GRegex *regex) |
2283 | 0 | { |
2284 | 0 | uint32_t max_lookbehind; |
2285 | |
|
2286 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, |
2287 | 0 | &max_lookbehind); |
2288 | |
|
2289 | 0 | return max_lookbehind; |
2290 | 0 | } |
2291 | | |
2292 | | /** |
2293 | | * g_regex_get_compile_flags: |
2294 | | * @regex: a #GRegex |
2295 | | * |
2296 | | * Returns the compile options that @regex was created with. |
2297 | | * |
2298 | | * Depending on the version of PCRE that is used, this may or may not |
2299 | | * include flags set by option expressions such as `(?i)` found at the |
2300 | | * top-level within the compiled pattern. |
2301 | | * |
2302 | | * Returns: flags from #GRegexCompileFlags |
2303 | | * |
2304 | | * Since: 2.26 |
2305 | | */ |
2306 | | GRegexCompileFlags |
2307 | | g_regex_get_compile_flags (const GRegex *regex) |
2308 | 0 | { |
2309 | 0 | GRegexCompileFlags extra_flags; |
2310 | 0 | uint32_t info_value; |
2311 | |
|
2312 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2313 | | |
2314 | | /* Preserve original G_REGEX_OPTIMIZE */ |
2315 | 0 | extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); |
2316 | | |
2317 | | /* Also include the newline options */ |
2318 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); |
2319 | 0 | switch (info_value) |
2320 | 0 | { |
2321 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
2322 | 0 | extra_flags |= G_REGEX_NEWLINE_ANYCRLF; |
2323 | 0 | break; |
2324 | 0 | case PCRE2_NEWLINE_CRLF: |
2325 | 0 | extra_flags |= G_REGEX_NEWLINE_CRLF; |
2326 | 0 | break; |
2327 | 0 | case PCRE2_NEWLINE_LF: |
2328 | 0 | extra_flags |= G_REGEX_NEWLINE_LF; |
2329 | 0 | break; |
2330 | 0 | case PCRE2_NEWLINE_CR: |
2331 | 0 | extra_flags |= G_REGEX_NEWLINE_CR; |
2332 | 0 | break; |
2333 | 0 | default: |
2334 | 0 | break; |
2335 | 0 | } |
2336 | | |
2337 | | /* Also include the bsr options */ |
2338 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); |
2339 | 0 | switch (info_value) |
2340 | 0 | { |
2341 | 0 | case PCRE2_BSR_ANYCRLF: |
2342 | 0 | extra_flags |= G_REGEX_BSR_ANYCRLF; |
2343 | 0 | break; |
2344 | 0 | default: |
2345 | 0 | break; |
2346 | 0 | } |
2347 | | |
2348 | 0 | return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; |
2349 | 0 | } |
2350 | | |
2351 | | /** |
2352 | | * g_regex_get_match_flags: |
2353 | | * @regex: a #GRegex |
2354 | | * |
2355 | | * Returns the match options that @regex was created with. |
2356 | | * |
2357 | | * Returns: flags from #GRegexMatchFlags |
2358 | | * |
2359 | | * Since: 2.26 |
2360 | | */ |
2361 | | GRegexMatchFlags |
2362 | | g_regex_get_match_flags (const GRegex *regex) |
2363 | 0 | { |
2364 | 0 | uint32_t flags; |
2365 | |
|
2366 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2367 | | |
2368 | 0 | flags = g_regex_match_flags_from_pcre2 (regex->match_opts); |
2369 | 0 | flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); |
2370 | 0 | flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); |
2371 | |
|
2372 | 0 | return flags; |
2373 | 0 | } |
2374 | | |
2375 | | /** |
2376 | | * g_regex_match_simple: |
2377 | | * @pattern: the regular expression |
2378 | | * @string: the string to scan for matches |
2379 | | * @compile_options: compile options for the regular expression, or 0 |
2380 | | * @match_options: match options, or 0 |
2381 | | * |
2382 | | * Scans for a match in @string for @pattern. |
2383 | | * |
2384 | | * This function is equivalent to g_regex_match() but it does not |
2385 | | * require to compile the pattern with g_regex_new(), avoiding some |
2386 | | * lines of code when you need just to do a match without extracting |
2387 | | * substrings, capture counts, and so on. |
2388 | | * |
2389 | | * If this function is to be called on the same @pattern more than |
2390 | | * once, it's more efficient to compile the pattern once with |
2391 | | * g_regex_new() and then use g_regex_match(). |
2392 | | * |
2393 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2394 | | * |
2395 | | * Since: 2.14 |
2396 | | */ |
2397 | | gboolean |
2398 | | g_regex_match_simple (const gchar *pattern, |
2399 | | const gchar *string, |
2400 | | GRegexCompileFlags compile_options, |
2401 | | GRegexMatchFlags match_options) |
2402 | 0 | { |
2403 | 0 | GRegex *regex; |
2404 | 0 | gboolean result; |
2405 | |
|
2406 | 0 | regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); |
2407 | 0 | if (!regex) |
2408 | 0 | return FALSE; |
2409 | 0 | result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); |
2410 | 0 | g_regex_unref (regex); |
2411 | 0 | return result; |
2412 | 0 | } |
2413 | | |
2414 | | /** |
2415 | | * g_regex_match: |
2416 | | * @regex: a #GRegex structure from g_regex_new() |
2417 | | * @string: the string to scan for matches |
2418 | | * @match_options: match options |
2419 | | * @match_info: (out) (optional): pointer to location where to store |
2420 | | * the #GMatchInfo, or %NULL if you do not need it |
2421 | | * |
2422 | | * Scans for a match in @string for the pattern in @regex. |
2423 | | * The @match_options are combined with the match options specified |
2424 | | * when the @regex structure was created, letting you have more |
2425 | | * flexibility in reusing #GRegex structures. |
2426 | | * |
2427 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2428 | | * |
2429 | | * A #GMatchInfo structure, used to get information on the match, |
2430 | | * is stored in @match_info if not %NULL. Note that if @match_info |
2431 | | * is not %NULL then it is created even if the function returns %FALSE, |
2432 | | * i.e. you must free it regardless if regular expression actually matched. |
2433 | | * |
2434 | | * To retrieve all the non-overlapping matches of the pattern in |
2435 | | * string you can use g_match_info_next(). |
2436 | | * |
2437 | | * |[<!-- language="C" --> |
2438 | | * static void |
2439 | | * print_uppercase_words (const gchar *string) |
2440 | | * { |
2441 | | * // Print all uppercase-only words. |
2442 | | * GRegex *regex; |
2443 | | * GMatchInfo *match_info; |
2444 | | * |
2445 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2446 | | * g_regex_match (regex, string, 0, &match_info); |
2447 | | * while (g_match_info_matches (match_info)) |
2448 | | * { |
2449 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2450 | | * g_print ("Found: %s\n", word); |
2451 | | * g_free (word); |
2452 | | * g_match_info_next (match_info, NULL); |
2453 | | * } |
2454 | | * g_match_info_free (match_info); |
2455 | | * g_regex_unref (regex); |
2456 | | * } |
2457 | | * ]| |
2458 | | * |
2459 | | * @string is not copied and is used in #GMatchInfo internally. If |
2460 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2461 | | * freeing or modifying @string then the behaviour is undefined. |
2462 | | * |
2463 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2464 | | * |
2465 | | * Since: 2.14 |
2466 | | */ |
2467 | | gboolean |
2468 | | g_regex_match (const GRegex *regex, |
2469 | | const gchar *string, |
2470 | | GRegexMatchFlags match_options, |
2471 | | GMatchInfo **match_info) |
2472 | 690 | { |
2473 | 690 | return g_regex_match_full (regex, string, -1, 0, match_options, |
2474 | 690 | match_info, NULL); |
2475 | 690 | } |
2476 | | |
2477 | | /** |
2478 | | * g_regex_match_full: |
2479 | | * @regex: a #GRegex structure from g_regex_new() |
2480 | | * @string: (array length=string_len): the string to scan for matches |
2481 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2482 | | * @start_position: starting index of the string to match, in bytes |
2483 | | * @match_options: match options |
2484 | | * @match_info: (out) (optional): pointer to location where to store |
2485 | | * the #GMatchInfo, or %NULL if you do not need it |
2486 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2487 | | * |
2488 | | * Scans for a match in @string for the pattern in @regex. |
2489 | | * The @match_options are combined with the match options specified |
2490 | | * when the @regex structure was created, letting you have more |
2491 | | * flexibility in reusing #GRegex structures. |
2492 | | * |
2493 | | * Setting @start_position differs from just passing over a shortened |
2494 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2495 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2496 | | * |
2497 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2498 | | * |
2499 | | * A #GMatchInfo structure, used to get information on the match, is |
2500 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2501 | | * not %NULL then it is created even if the function returns %FALSE, |
2502 | | * i.e. you must free it regardless if regular expression actually |
2503 | | * matched. |
2504 | | * |
2505 | | * @string is not copied and is used in #GMatchInfo internally. If |
2506 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2507 | | * freeing or modifying @string then the behaviour is undefined. |
2508 | | * |
2509 | | * To retrieve all the non-overlapping matches of the pattern in |
2510 | | * string you can use g_match_info_next(). |
2511 | | * |
2512 | | * |[<!-- language="C" --> |
2513 | | * static void |
2514 | | * print_uppercase_words (const gchar *string) |
2515 | | * { |
2516 | | * // Print all uppercase-only words. |
2517 | | * GRegex *regex; |
2518 | | * GMatchInfo *match_info; |
2519 | | * GError *error = NULL; |
2520 | | * |
2521 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2522 | | * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
2523 | | * while (g_match_info_matches (match_info)) |
2524 | | * { |
2525 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2526 | | * g_print ("Found: %s\n", word); |
2527 | | * g_free (word); |
2528 | | * g_match_info_next (match_info, &error); |
2529 | | * } |
2530 | | * g_match_info_free (match_info); |
2531 | | * g_regex_unref (regex); |
2532 | | * if (error != NULL) |
2533 | | * { |
2534 | | * g_printerr ("Error while matching: %s\n", error->message); |
2535 | | * g_error_free (error); |
2536 | | * } |
2537 | | * } |
2538 | | * ]| |
2539 | | * |
2540 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2541 | | * |
2542 | | * Since: 2.14 |
2543 | | */ |
2544 | | gboolean |
2545 | | g_regex_match_full (const GRegex *regex, |
2546 | | const gchar *string, |
2547 | | gssize string_len, |
2548 | | gint start_position, |
2549 | | GRegexMatchFlags match_options, |
2550 | | GMatchInfo **match_info, |
2551 | | GError **error) |
2552 | 690 | { |
2553 | 690 | GMatchInfo *info; |
2554 | 690 | gboolean match_ok; |
2555 | | |
2556 | 690 | g_return_val_if_fail (regex != NULL, FALSE); |
2557 | 690 | g_return_val_if_fail (string != NULL, FALSE); |
2558 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2559 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2560 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2561 | | |
2562 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2563 | 0 | match_options, FALSE); |
2564 | 0 | match_ok = g_match_info_next (info, error); |
2565 | 0 | if (match_info != NULL) |
2566 | 0 | *match_info = info; |
2567 | 0 | else |
2568 | 0 | g_match_info_free (info); |
2569 | |
|
2570 | 0 | return match_ok; |
2571 | 0 | } |
2572 | | |
2573 | | /** |
2574 | | * g_regex_match_all: |
2575 | | * @regex: a #GRegex structure from g_regex_new() |
2576 | | * @string: the string to scan for matches |
2577 | | * @match_options: match options |
2578 | | * @match_info: (out) (optional): pointer to location where to store |
2579 | | * the #GMatchInfo, or %NULL if you do not need it |
2580 | | * |
2581 | | * Using the standard algorithm for regular expression matching only |
2582 | | * the longest match in the string is retrieved. This function uses |
2583 | | * a different algorithm so it can retrieve all the possible matches. |
2584 | | * For more documentation see g_regex_match_all_full(). |
2585 | | * |
2586 | | * A #GMatchInfo structure, used to get information on the match, is |
2587 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2588 | | * not %NULL then it is created even if the function returns %FALSE, |
2589 | | * i.e. you must free it regardless if regular expression actually |
2590 | | * matched. |
2591 | | * |
2592 | | * @string is not copied and is used in #GMatchInfo internally. If |
2593 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2594 | | * freeing or modifying @string then the behaviour is undefined. |
2595 | | * |
2596 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2597 | | * |
2598 | | * Since: 2.14 |
2599 | | */ |
2600 | | gboolean |
2601 | | g_regex_match_all (const GRegex *regex, |
2602 | | const gchar *string, |
2603 | | GRegexMatchFlags match_options, |
2604 | | GMatchInfo **match_info) |
2605 | 0 | { |
2606 | 0 | return g_regex_match_all_full (regex, string, -1, 0, match_options, |
2607 | 0 | match_info, NULL); |
2608 | 0 | } |
2609 | | |
2610 | | /** |
2611 | | * g_regex_match_all_full: |
2612 | | * @regex: a #GRegex structure from g_regex_new() |
2613 | | * @string: (array length=string_len): the string to scan for matches |
2614 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2615 | | * @start_position: starting index of the string to match, in bytes |
2616 | | * @match_options: match options |
2617 | | * @match_info: (out) (optional): pointer to location where to store |
2618 | | * the #GMatchInfo, or %NULL if you do not need it |
2619 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2620 | | * |
2621 | | * Using the standard algorithm for regular expression matching only |
2622 | | * the longest match in the @string is retrieved, it is not possible |
2623 | | * to obtain all the available matches. For instance matching |
2624 | | * `"<a> <b> <c>"` against the pattern `"<.*>"` |
2625 | | * you get `"<a> <b> <c>"`. |
2626 | | * |
2627 | | * This function uses a different algorithm (called DFA, i.e. deterministic |
2628 | | * finite automaton), so it can retrieve all the possible matches, all |
2629 | | * starting at the same point in the string. For instance matching |
2630 | | * `"<a> <b> <c>"` against the pattern `"<.*>"` |
2631 | | * you would obtain three matches: `"<a> <b> <c>"`, |
2632 | | * `"<a> <b>"` and `"<a>"`. |
2633 | | * |
2634 | | * The number of matched strings is retrieved using |
2635 | | * g_match_info_get_match_count(). To obtain the matched strings and |
2636 | | * their position you can use, respectively, g_match_info_fetch() and |
2637 | | * g_match_info_fetch_pos(). Note that the strings are returned in |
2638 | | * reverse order of length; that is, the longest matching string is |
2639 | | * given first. |
2640 | | * |
2641 | | * Note that the DFA algorithm is slower than the standard one and it |
2642 | | * is not able to capture substrings, so backreferences do not work. |
2643 | | * |
2644 | | * Setting @start_position differs from just passing over a shortened |
2645 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2646 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2647 | | * |
2648 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2649 | | * |
2650 | | * A #GMatchInfo structure, used to get information on the match, is |
2651 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2652 | | * not %NULL then it is created even if the function returns %FALSE, |
2653 | | * i.e. you must free it regardless if regular expression actually |
2654 | | * matched. |
2655 | | * |
2656 | | * @string is not copied and is used in #GMatchInfo internally. If |
2657 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2658 | | * freeing or modifying @string then the behaviour is undefined. |
2659 | | * |
2660 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2661 | | * |
2662 | | * Since: 2.14 |
2663 | | */ |
2664 | | gboolean |
2665 | | g_regex_match_all_full (const GRegex *regex, |
2666 | | const gchar *string, |
2667 | | gssize string_len, |
2668 | | gint start_position, |
2669 | | GRegexMatchFlags match_options, |
2670 | | GMatchInfo **match_info, |
2671 | | GError **error) |
2672 | 0 | { |
2673 | 0 | GMatchInfo *info; |
2674 | 0 | gboolean done; |
2675 | 0 | pcre2_code *pcre_re; |
2676 | 0 | gboolean retval; |
2677 | 0 | uint32_t newline_options; |
2678 | 0 | uint32_t bsr_options; |
2679 | |
|
2680 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2681 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2682 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2683 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2684 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2685 | | |
2686 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
2687 | 0 | if (!newline_options) |
2688 | 0 | newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); |
2689 | |
|
2690 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
2691 | 0 | if (!bsr_options) |
2692 | 0 | bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); |
2693 | | |
2694 | | /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an |
2695 | | * optimization for normal regex matching, but results in omitting some |
2696 | | * shorter matches here, and an observable behaviour change. |
2697 | | * |
2698 | | * DFA matching is rather niche, and very rarely used according to |
2699 | | * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
2700 | 0 | pcre_re = regex_compile (regex->pattern, |
2701 | 0 | regex->compile_opts | PCRE2_NO_AUTO_POSSESS, |
2702 | 0 | newline_options, bsr_options, error); |
2703 | 0 | if (pcre_re == NULL) |
2704 | 0 | return FALSE; |
2705 | | |
2706 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2707 | 0 | match_options, TRUE); |
2708 | |
|
2709 | 0 | done = FALSE; |
2710 | 0 | while (!done) |
2711 | 0 | { |
2712 | 0 | done = TRUE; |
2713 | 0 | info->matches = pcre2_dfa_match (pcre_re, |
2714 | 0 | (PCRE2_SPTR8) info->string, info->string_len, |
2715 | 0 | info->pos, |
2716 | 0 | (regex->match_opts | info->match_opts), |
2717 | 0 | info->match_data, |
2718 | 0 | info->match_context, |
2719 | 0 | info->workspace, info->n_workspace); |
2720 | 0 | if (info->matches == PCRE2_ERROR_DFA_WSSIZE) |
2721 | 0 | { |
2722 | | /* info->workspace is too small. */ |
2723 | 0 | info->n_workspace *= 2; |
2724 | 0 | info->workspace = g_realloc_n (info->workspace, |
2725 | 0 | info->n_workspace, |
2726 | 0 | sizeof (gint)); |
2727 | 0 | done = FALSE; |
2728 | 0 | } |
2729 | 0 | else if (info->matches == 0) |
2730 | 0 | { |
2731 | | /* info->offsets is too small. */ |
2732 | 0 | info->n_offsets *= 2; |
2733 | 0 | info->offsets = g_realloc_n (info->offsets, |
2734 | 0 | info->n_offsets, |
2735 | 0 | sizeof (gint)); |
2736 | 0 | pcre2_match_data_free (info->match_data); |
2737 | 0 | info->match_data = pcre2_match_data_create (info->n_offsets, NULL); |
2738 | 0 | done = FALSE; |
2739 | 0 | } |
2740 | 0 | else if (IS_PCRE2_ERROR (info->matches)) |
2741 | 0 | { |
2742 | 0 | gchar *error_msg = get_match_error_message (info->matches); |
2743 | |
|
2744 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
2745 | 0 | _("Error while matching regular expression %s: %s"), |
2746 | 0 | regex->pattern, error_msg); |
2747 | 0 | g_clear_pointer (&error_msg, g_free); |
2748 | 0 | } |
2749 | 0 | else if (info->matches != PCRE2_ERROR_NOMATCH) |
2750 | 0 | { |
2751 | 0 | if (!recalc_match_offsets (info, error)) |
2752 | 0 | info->matches = PCRE2_ERROR_NOMATCH; |
2753 | 0 | } |
2754 | 0 | } |
2755 | |
|
2756 | 0 | pcre2_code_free (pcre_re); |
2757 | | |
2758 | | /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only |
2759 | | * holds true for a single match, rather than matching all */ |
2760 | | |
2761 | | /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
2762 | 0 | info->pos = -1; |
2763 | 0 | retval = info->matches >= 0; |
2764 | |
|
2765 | 0 | if (match_info != NULL) |
2766 | 0 | *match_info = info; |
2767 | 0 | else |
2768 | 0 | g_match_info_free (info); |
2769 | |
|
2770 | 0 | return retval; |
2771 | 0 | } |
2772 | | |
2773 | | /** |
2774 | | * g_regex_get_string_number: |
2775 | | * @regex: #GRegex structure |
2776 | | * @name: name of the subexpression |
2777 | | * |
2778 | | * Retrieves the number of the subexpression named @name. |
2779 | | * |
2780 | | * Returns: The number of the subexpression or -1 if @name |
2781 | | * does not exists |
2782 | | * |
2783 | | * Since: 2.14 |
2784 | | */ |
2785 | | gint |
2786 | | g_regex_get_string_number (const GRegex *regex, |
2787 | | const gchar *name) |
2788 | 0 | { |
2789 | 0 | gint num; |
2790 | |
|
2791 | 0 | g_return_val_if_fail (regex != NULL, -1); |
2792 | 0 | g_return_val_if_fail (name != NULL, -1); |
2793 | | |
2794 | 0 | num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); |
2795 | 0 | if (num == PCRE2_ERROR_NOSUBSTRING) |
2796 | 0 | num = -1; |
2797 | |
|
2798 | 0 | return num; |
2799 | 0 | } |
2800 | | |
2801 | | /** |
2802 | | * g_regex_split_simple: |
2803 | | * @pattern: the regular expression |
2804 | | * @string: the string to scan for matches |
2805 | | * @compile_options: compile options for the regular expression, or 0 |
2806 | | * @match_options: match options, or 0 |
2807 | | * |
2808 | | * Breaks the string on the pattern, and returns an array of |
2809 | | * the tokens. If the pattern contains capturing parentheses, |
2810 | | * then the text for each of the substrings will also be returned. |
2811 | | * If the pattern does not match anywhere in the string, then the |
2812 | | * whole string is returned as the first token. |
2813 | | * |
2814 | | * This function is equivalent to g_regex_split() but it does |
2815 | | * not require to compile the pattern with g_regex_new(), avoiding |
2816 | | * some lines of code when you need just to do a split without |
2817 | | * extracting substrings, capture counts, and so on. |
2818 | | * |
2819 | | * If this function is to be called on the same @pattern more than |
2820 | | * once, it's more efficient to compile the pattern once with |
2821 | | * g_regex_new() and then use g_regex_split(). |
2822 | | * |
2823 | | * As a special case, the result of splitting the empty string "" |
2824 | | * is an empty vector, not a vector containing a single string. |
2825 | | * The reason for this special case is that being able to represent |
2826 | | * an empty vector is typically more useful than consistent handling |
2827 | | * of empty elements. If you do need to represent empty elements, |
2828 | | * you'll need to check for the empty string before calling this |
2829 | | * function. |
2830 | | * |
2831 | | * A pattern that can match empty strings splits @string into |
2832 | | * separate characters wherever it matches the empty string between |
2833 | | * characters. For example splitting "ab c" using as a separator |
2834 | | * "\s*", you will get "a", "b" and "c". |
2835 | | * |
2836 | | * Returns: (transfer full): a %NULL-terminated array of strings. Free |
2837 | | * it using g_strfreev() |
2838 | | * |
2839 | | * Since: 2.14 |
2840 | | **/ |
2841 | | gchar ** |
2842 | | g_regex_split_simple (const gchar *pattern, |
2843 | | const gchar *string, |
2844 | | GRegexCompileFlags compile_options, |
2845 | | GRegexMatchFlags match_options) |
2846 | 0 | { |
2847 | 0 | GRegex *regex; |
2848 | 0 | gchar **result; |
2849 | |
|
2850 | 0 | regex = g_regex_new (pattern, compile_options, 0, NULL); |
2851 | 0 | if (!regex) |
2852 | 0 | return NULL; |
2853 | | |
2854 | 0 | result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); |
2855 | 0 | g_regex_unref (regex); |
2856 | 0 | return result; |
2857 | 0 | } |
2858 | | |
2859 | | /** |
2860 | | * g_regex_split: |
2861 | | * @regex: a #GRegex structure |
2862 | | * @string: the string to split with the pattern |
2863 | | * @match_options: match time option flags |
2864 | | * |
2865 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2866 | | * If the pattern contains capturing parentheses, then the text for each |
2867 | | * of the substrings will also be returned. If the pattern does not match |
2868 | | * anywhere in the string, then the whole string is returned as the first |
2869 | | * token. |
2870 | | * |
2871 | | * As a special case, the result of splitting the empty string "" is an |
2872 | | * empty vector, not a vector containing a single string. The reason for |
2873 | | * this special case is that being able to represent an empty vector is |
2874 | | * typically more useful than consistent handling of empty elements. If |
2875 | | * you do need to represent empty elements, you'll need to check for the |
2876 | | * empty string before calling this function. |
2877 | | * |
2878 | | * A pattern that can match empty strings splits @string into separate |
2879 | | * characters wherever it matches the empty string between characters. |
2880 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2881 | | * "a", "b" and "c". |
2882 | | * |
2883 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2884 | | * it using g_strfreev() |
2885 | | * |
2886 | | * Since: 2.14 |
2887 | | **/ |
2888 | | gchar ** |
2889 | | g_regex_split (const GRegex *regex, |
2890 | | const gchar *string, |
2891 | | GRegexMatchFlags match_options) |
2892 | 0 | { |
2893 | 0 | return g_regex_split_full (regex, string, -1, 0, |
2894 | 0 | match_options, 0, NULL); |
2895 | 0 | } |
2896 | | |
2897 | | /** |
2898 | | * g_regex_split_full: |
2899 | | * @regex: a #GRegex structure |
2900 | | * @string: (array length=string_len): the string to split with the pattern |
2901 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2902 | | * @start_position: starting index of the string to match, in bytes |
2903 | | * @match_options: match time option flags |
2904 | | * @max_tokens: the maximum number of tokens to split @string into. |
2905 | | * If this is less than 1, the string is split completely |
2906 | | * @error: return location for a #GError |
2907 | | * |
2908 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2909 | | * If the pattern contains capturing parentheses, then the text for each |
2910 | | * of the substrings will also be returned. If the pattern does not match |
2911 | | * anywhere in the string, then the whole string is returned as the first |
2912 | | * token. |
2913 | | * |
2914 | | * As a special case, the result of splitting the empty string "" is an |
2915 | | * empty vector, not a vector containing a single string. The reason for |
2916 | | * this special case is that being able to represent an empty vector is |
2917 | | * typically more useful than consistent handling of empty elements. If |
2918 | | * you do need to represent empty elements, you'll need to check for the |
2919 | | * empty string before calling this function. |
2920 | | * |
2921 | | * A pattern that can match empty strings splits @string into separate |
2922 | | * characters wherever it matches the empty string between characters. |
2923 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2924 | | * "a", "b" and "c". |
2925 | | * |
2926 | | * Setting @start_position differs from just passing over a shortened |
2927 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2928 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2929 | | * |
2930 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2931 | | * it using g_strfreev() |
2932 | | * |
2933 | | * Since: 2.14 |
2934 | | **/ |
2935 | | gchar ** |
2936 | | g_regex_split_full (const GRegex *regex, |
2937 | | const gchar *string, |
2938 | | gssize string_len, |
2939 | | gint start_position, |
2940 | | GRegexMatchFlags match_options, |
2941 | | gint max_tokens, |
2942 | | GError **error) |
2943 | 0 | { |
2944 | 0 | GError *tmp_error = NULL; |
2945 | 0 | GMatchInfo *match_info; |
2946 | 0 | GList *list, *last; |
2947 | 0 | gint i; |
2948 | 0 | gint token_count; |
2949 | 0 | gboolean match_ok; |
2950 | | /* position of the last separator. */ |
2951 | 0 | gint last_separator_end; |
2952 | | /* was the last match 0 bytes long? */ |
2953 | 0 | gboolean last_match_is_empty; |
2954 | | /* the returned array of char **s */ |
2955 | 0 | gchar **string_list; |
2956 | |
|
2957 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
2958 | 0 | g_return_val_if_fail (string != NULL, NULL); |
2959 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
2960 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2961 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2962 | | |
2963 | 0 | if (max_tokens <= 0) |
2964 | 0 | max_tokens = G_MAXINT; |
2965 | |
|
2966 | 0 | if (string_len < 0) |
2967 | 0 | string_len = strlen (string); |
2968 | | |
2969 | | /* zero-length string */ |
2970 | 0 | if (string_len - start_position == 0) |
2971 | 0 | return g_new0 (gchar *, 1); |
2972 | | |
2973 | 0 | if (max_tokens == 1) |
2974 | 0 | { |
2975 | 0 | string_list = g_new0 (gchar *, 2); |
2976 | 0 | string_list[0] = g_strndup (&string[start_position], |
2977 | 0 | string_len - start_position); |
2978 | 0 | return string_list; |
2979 | 0 | } |
2980 | | |
2981 | 0 | list = NULL; |
2982 | 0 | token_count = 0; |
2983 | 0 | last_separator_end = start_position; |
2984 | 0 | last_match_is_empty = FALSE; |
2985 | |
|
2986 | 0 | match_ok = g_regex_match_full (regex, string, string_len, start_position, |
2987 | 0 | match_options, &match_info, &tmp_error); |
2988 | |
|
2989 | 0 | while (tmp_error == NULL) |
2990 | 0 | { |
2991 | 0 | if (match_ok) |
2992 | 0 | { |
2993 | 0 | last_match_is_empty = |
2994 | 0 | (match_info->offsets[0] == match_info->offsets[1]); |
2995 | | |
2996 | | /* we need to skip empty separators at the same position of the end |
2997 | | * of another separator. e.g. the string is "a b" and the separator |
2998 | | * is " *", so from 1 to 2 we have a match and at position 2 we have |
2999 | | * an empty match. */ |
3000 | 0 | if (last_separator_end != match_info->offsets[1]) |
3001 | 0 | { |
3002 | 0 | gchar *token; |
3003 | 0 | gint match_count; |
3004 | |
|
3005 | 0 | token = g_strndup (string + last_separator_end, |
3006 | 0 | match_info->offsets[0] - last_separator_end); |
3007 | 0 | list = g_list_prepend (list, token); |
3008 | 0 | token_count++; |
3009 | | |
3010 | | /* if there were substrings, these need to be added to |
3011 | | * the list. */ |
3012 | 0 | match_count = g_match_info_get_match_count (match_info); |
3013 | 0 | if (match_count > 1) |
3014 | 0 | { |
3015 | 0 | for (i = 1; i < match_count; i++) |
3016 | 0 | list = g_list_prepend (list, g_match_info_fetch (match_info, i)); |
3017 | 0 | } |
3018 | 0 | } |
3019 | 0 | } |
3020 | 0 | else |
3021 | 0 | { |
3022 | | /* if there was no match, copy to end of string. */ |
3023 | 0 | if (!last_match_is_empty) |
3024 | 0 | { |
3025 | 0 | gchar *token = g_strndup (string + last_separator_end, |
3026 | 0 | match_info->string_len - last_separator_end); |
3027 | 0 | list = g_list_prepend (list, token); |
3028 | 0 | } |
3029 | | /* no more tokens, end the loop. */ |
3030 | 0 | break; |
3031 | 0 | } |
3032 | | |
3033 | | /* -1 to leave room for the last part. */ |
3034 | 0 | if (token_count >= max_tokens - 1) |
3035 | 0 | { |
3036 | | /* we have reached the maximum number of tokens, so we copy |
3037 | | * the remaining part of the string. */ |
3038 | 0 | if (last_match_is_empty) |
3039 | 0 | { |
3040 | | /* the last match was empty, so we have moved one char |
3041 | | * after the real position to avoid empty matches at the |
3042 | | * same position. */ |
3043 | 0 | match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
3044 | 0 | } |
3045 | | /* the if is needed in the case we have terminated the available |
3046 | | * tokens, but we are at the end of the string, so there are no |
3047 | | * characters left to copy. */ |
3048 | 0 | if (string_len > match_info->pos) |
3049 | 0 | { |
3050 | 0 | gchar *token = g_strndup (string + match_info->pos, |
3051 | 0 | string_len - match_info->pos); |
3052 | 0 | list = g_list_prepend (list, token); |
3053 | 0 | } |
3054 | | /* end the loop. */ |
3055 | 0 | break; |
3056 | 0 | } |
3057 | | |
3058 | 0 | last_separator_end = match_info->pos; |
3059 | 0 | if (last_match_is_empty) |
3060 | | /* if the last match was empty, g_match_info_next() has moved |
3061 | | * forward to avoid infinite loops, but we still need to copy that |
3062 | | * character. */ |
3063 | 0 | last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
3064 | |
|
3065 | 0 | match_ok = g_match_info_next (match_info, &tmp_error); |
3066 | 0 | } |
3067 | 0 | g_match_info_free (match_info); |
3068 | 0 | if (tmp_error != NULL) |
3069 | 0 | { |
3070 | 0 | g_propagate_error (error, tmp_error); |
3071 | 0 | g_list_free_full (list, g_free); |
3072 | 0 | return NULL; |
3073 | 0 | } |
3074 | | |
3075 | 0 | string_list = g_new (gchar *, g_list_length (list) + 1); |
3076 | 0 | i = 0; |
3077 | 0 | for (last = g_list_last (list); last; last = g_list_previous (last)) |
3078 | 0 | string_list[i++] = last->data; |
3079 | 0 | string_list[i] = NULL; |
3080 | 0 | g_list_free (list); |
3081 | |
|
3082 | 0 | return string_list; |
3083 | 0 | } |
3084 | | |
/* Kinds of pieces a parsed replacement string is made of; the value is
 * stored in InterpolationData.type. */
enum
{
  REPL_TYPE_STRING = 0,          /* literal text, kept in InterpolationData.text */
  REPL_TYPE_CHARACTER,           /* single escaped character, kept in InterpolationData.c */
  REPL_TYPE_SYMBOLIC_REFERENCE,  /* \g<name>: group name kept in InterpolationData.text */
  REPL_TYPE_NUMERIC_REFERENCE,   /* \N or \g<N>: group number kept in InterpolationData.num */
  REPL_TYPE_CHANGE_CASE          /* \l \u \L \U \E: mode kept in InterpolationData.change_case */
};
3093 | | |
/* Case conversion mode applied to replacement text. Each basic mode is a
 * distinct bit so that the *_MASK values can test for a group of modes. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,  /* \E: leave the text untouched */
  CHANGE_CASE_UPPER        = 0x02,  /* \U: upper-case everything that follows */
  CHANGE_CASE_LOWER        = 0x04,  /* \L: lower-case everything that follows */
  CHANGE_CASE_UPPER_SINGLE = 0x08,  /* \u: upper-case the next character only */
  CHANGE_CASE_LOWER_SINGLE = 0x10,  /* \l: lower-case the next character only */

  /* modes that affect just one character */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
3105 | | |
3106 | | struct _InterpolationData |
3107 | | { |
3108 | | gchar *text; |
3109 | | gint type; |
3110 | | gint num; |
3111 | | gchar c; |
3112 | | ChangeCase change_case; |
3113 | | }; |
3114 | | |
3115 | | static void |
3116 | | free_interpolation_data (InterpolationData *data) |
3117 | 0 | { |
3118 | 0 | g_free (data->text); |
3119 | 0 | g_free (data); |
3120 | 0 | } |
3121 | | |
3122 | | static const gchar * |
3123 | | expand_escape (const gchar *replacement, |
3124 | | const gchar *p, |
3125 | | InterpolationData *data, |
3126 | | GError **error) |
3127 | 0 | { |
3128 | 0 | const gchar *q, *r; |
3129 | 0 | gint x, d, h, i; |
3130 | 0 | const gchar *error_detail; |
3131 | 0 | gint base = 0; |
3132 | 0 | GError *tmp_error = NULL; |
3133 | |
|
3134 | 0 | p++; |
3135 | 0 | switch (*p) |
3136 | 0 | { |
3137 | 0 | case 't': |
3138 | 0 | p++; |
3139 | 0 | data->c = '\t'; |
3140 | 0 | data->type = REPL_TYPE_CHARACTER; |
3141 | 0 | break; |
3142 | 0 | case 'n': |
3143 | 0 | p++; |
3144 | 0 | data->c = '\n'; |
3145 | 0 | data->type = REPL_TYPE_CHARACTER; |
3146 | 0 | break; |
3147 | 0 | case 'v': |
3148 | 0 | p++; |
3149 | 0 | data->c = '\v'; |
3150 | 0 | data->type = REPL_TYPE_CHARACTER; |
3151 | 0 | break; |
3152 | 0 | case 'r': |
3153 | 0 | p++; |
3154 | 0 | data->c = '\r'; |
3155 | 0 | data->type = REPL_TYPE_CHARACTER; |
3156 | 0 | break; |
3157 | 0 | case 'f': |
3158 | 0 | p++; |
3159 | 0 | data->c = '\f'; |
3160 | 0 | data->type = REPL_TYPE_CHARACTER; |
3161 | 0 | break; |
3162 | 0 | case 'a': |
3163 | 0 | p++; |
3164 | 0 | data->c = '\a'; |
3165 | 0 | data->type = REPL_TYPE_CHARACTER; |
3166 | 0 | break; |
3167 | 0 | case 'b': |
3168 | 0 | p++; |
3169 | 0 | data->c = '\b'; |
3170 | 0 | data->type = REPL_TYPE_CHARACTER; |
3171 | 0 | break; |
3172 | 0 | case '\\': |
3173 | 0 | p++; |
3174 | 0 | data->c = '\\'; |
3175 | 0 | data->type = REPL_TYPE_CHARACTER; |
3176 | 0 | break; |
3177 | 0 | case 'x': |
3178 | 0 | p++; |
3179 | 0 | x = 0; |
3180 | 0 | if (*p == '{') |
3181 | 0 | { |
3182 | 0 | p++; |
3183 | 0 | do |
3184 | 0 | { |
3185 | 0 | h = g_ascii_xdigit_value (*p); |
3186 | 0 | if (h < 0) |
3187 | 0 | { |
3188 | 0 | error_detail = _("hexadecimal digit or “}” expected"); |
3189 | 0 | goto error; |
3190 | 0 | } |
3191 | 0 | x = x * 16 + h; |
3192 | 0 | p++; |
3193 | 0 | } |
3194 | 0 | while (*p != '}'); |
3195 | 0 | p++; |
3196 | 0 | } |
3197 | 0 | else |
3198 | 0 | { |
3199 | 0 | for (i = 0; i < 2; i++) |
3200 | 0 | { |
3201 | 0 | h = g_ascii_xdigit_value (*p); |
3202 | 0 | if (h < 0) |
3203 | 0 | { |
3204 | 0 | error_detail = _("hexadecimal digit expected"); |
3205 | 0 | goto error; |
3206 | 0 | } |
3207 | 0 | x = x * 16 + h; |
3208 | 0 | p++; |
3209 | 0 | } |
3210 | 0 | } |
3211 | 0 | data->type = REPL_TYPE_STRING; |
3212 | 0 | data->text = g_new0 (gchar, 8); |
3213 | 0 | g_unichar_to_utf8 (x, data->text); |
3214 | 0 | break; |
3215 | 0 | case 'l': |
3216 | 0 | p++; |
3217 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
3218 | 0 | data->change_case = CHANGE_CASE_LOWER_SINGLE; |
3219 | 0 | break; |
3220 | 0 | case 'u': |
3221 | 0 | p++; |
3222 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
3223 | 0 | data->change_case = CHANGE_CASE_UPPER_SINGLE; |
3224 | 0 | break; |
3225 | 0 | case 'L': |
3226 | 0 | p++; |
3227 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
3228 | 0 | data->change_case = CHANGE_CASE_LOWER; |
3229 | 0 | break; |
3230 | 0 | case 'U': |
3231 | 0 | p++; |
3232 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
3233 | 0 | data->change_case = CHANGE_CASE_UPPER; |
3234 | 0 | break; |
3235 | 0 | case 'E': |
3236 | 0 | p++; |
3237 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
3238 | 0 | data->change_case = CHANGE_CASE_NONE; |
3239 | 0 | break; |
3240 | 0 | case 'g': |
3241 | 0 | p++; |
3242 | 0 | if (*p != '<') |
3243 | 0 | { |
3244 | 0 | error_detail = _("missing “<” in symbolic reference"); |
3245 | 0 | goto error; |
3246 | 0 | } |
3247 | 0 | q = p + 1; |
3248 | 0 | do |
3249 | 0 | { |
3250 | 0 | p++; |
3251 | 0 | if (!*p) |
3252 | 0 | { |
3253 | 0 | error_detail = _("unfinished symbolic reference"); |
3254 | 0 | goto error; |
3255 | 0 | } |
3256 | 0 | } |
3257 | 0 | while (*p != '>'); |
3258 | 0 | if (p - q == 0) |
3259 | 0 | { |
3260 | 0 | error_detail = _("zero-length symbolic reference"); |
3261 | 0 | goto error; |
3262 | 0 | } |
3263 | 0 | if (g_ascii_isdigit (*q)) |
3264 | 0 | { |
3265 | 0 | x = 0; |
3266 | 0 | do |
3267 | 0 | { |
3268 | 0 | h = g_ascii_digit_value (*q); |
3269 | 0 | if (h < 0) |
3270 | 0 | { |
3271 | 0 | error_detail = _("digit expected"); |
3272 | 0 | p = q; |
3273 | 0 | goto error; |
3274 | 0 | } |
3275 | 0 | x = x * 10 + h; |
3276 | 0 | q++; |
3277 | 0 | } |
3278 | 0 | while (q != p); |
3279 | 0 | data->num = x; |
3280 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3281 | 0 | } |
3282 | 0 | else |
3283 | 0 | { |
3284 | 0 | r = q; |
3285 | 0 | do |
3286 | 0 | { |
3287 | 0 | if (!g_ascii_isalnum (*r)) |
3288 | 0 | { |
3289 | 0 | error_detail = _("illegal symbolic reference"); |
3290 | 0 | p = r; |
3291 | 0 | goto error; |
3292 | 0 | } |
3293 | 0 | r++; |
3294 | 0 | } |
3295 | 0 | while (r != p); |
3296 | 0 | data->text = g_strndup (q, p - q); |
3297 | 0 | data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
3298 | 0 | } |
3299 | 0 | p++; |
3300 | 0 | break; |
3301 | 0 | case '0': |
3302 | | /* if \0 is followed by a number is an octal number representing a |
3303 | | * character, else it is a numeric reference. */ |
3304 | 0 | if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) |
3305 | 0 | { |
3306 | 0 | base = 8; |
3307 | 0 | p = g_utf8_next_char (p); |
3308 | 0 | } |
3309 | 0 | G_GNUC_FALLTHROUGH; |
3310 | 0 | case '1': |
3311 | 0 | case '2': |
3312 | 0 | case '3': |
3313 | 0 | case '4': |
3314 | 0 | case '5': |
3315 | 0 | case '6': |
3316 | 0 | case '7': |
3317 | 0 | case '8': |
3318 | 0 | case '9': |
3319 | 0 | x = 0; |
3320 | 0 | d = 0; |
3321 | 0 | for (i = 0; i < 3; i++) |
3322 | 0 | { |
3323 | 0 | h = g_ascii_digit_value (*p); |
3324 | 0 | if (h < 0) |
3325 | 0 | break; |
3326 | 0 | if (h > 7) |
3327 | 0 | { |
3328 | 0 | if (base == 8) |
3329 | 0 | break; |
3330 | 0 | else |
3331 | 0 | base = 10; |
3332 | 0 | } |
3333 | 0 | if (i == 2 && base == 10) |
3334 | 0 | break; |
3335 | 0 | x = x * 8 + h; |
3336 | 0 | d = d * 10 + h; |
3337 | 0 | p++; |
3338 | 0 | } |
3339 | 0 | if (base == 8 || i == 3) |
3340 | 0 | { |
3341 | 0 | data->type = REPL_TYPE_STRING; |
3342 | 0 | data->text = g_new0 (gchar, 8); |
3343 | 0 | g_unichar_to_utf8 (x, data->text); |
3344 | 0 | } |
3345 | 0 | else |
3346 | 0 | { |
3347 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3348 | 0 | data->num = d; |
3349 | 0 | } |
3350 | 0 | break; |
3351 | 0 | case 0: |
3352 | 0 | error_detail = _("stray final “\\”"); |
3353 | 0 | goto error; |
3354 | 0 | break; |
3355 | 0 | default: |
3356 | 0 | error_detail = _("unknown escape sequence"); |
3357 | 0 | goto error; |
3358 | 0 | } |
3359 | | |
3360 | 0 | return p; |
3361 | | |
3362 | 0 | error: |
3363 | | /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
3364 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, |
3365 | 0 | G_REGEX_ERROR_REPLACE, |
3366 | 0 | _("Error while parsing replacement " |
3367 | 0 | "text “%s” at char %lu: %s"), |
3368 | 0 | replacement, |
3369 | 0 | (gulong)(p - replacement), |
3370 | 0 | error_detail); |
3371 | 0 | g_propagate_error (error, tmp_error); |
3372 | |
|
3373 | 0 | return NULL; |
3374 | 0 | } |
3375 | | |
3376 | | static GList * |
3377 | | split_replacement (const gchar *replacement, |
3378 | | GError **error) |
3379 | 0 | { |
3380 | 0 | GList *list = NULL; |
3381 | 0 | InterpolationData *data; |
3382 | 0 | const gchar *p, *start; |
3383 | |
|
3384 | 0 | start = p = replacement; |
3385 | 0 | while (*p) |
3386 | 0 | { |
3387 | 0 | if (*p == '\\') |
3388 | 0 | { |
3389 | 0 | data = g_new0 (InterpolationData, 1); |
3390 | 0 | start = p = expand_escape (replacement, p, data, error); |
3391 | 0 | if (p == NULL) |
3392 | 0 | { |
3393 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3394 | 0 | free_interpolation_data (data); |
3395 | |
|
3396 | 0 | return NULL; |
3397 | 0 | } |
3398 | 0 | list = g_list_prepend (list, data); |
3399 | 0 | } |
3400 | 0 | else |
3401 | 0 | { |
3402 | 0 | p++; |
3403 | 0 | if (*p == '\\' || *p == '\0') |
3404 | 0 | { |
3405 | 0 | if (p - start > 0) |
3406 | 0 | { |
3407 | 0 | data = g_new0 (InterpolationData, 1); |
3408 | 0 | data->text = g_strndup (start, p - start); |
3409 | 0 | data->type = REPL_TYPE_STRING; |
3410 | 0 | list = g_list_prepend (list, data); |
3411 | 0 | } |
3412 | 0 | } |
3413 | 0 | } |
3414 | 0 | } |
3415 | | |
3416 | 0 | return g_list_reverse (list); |
3417 | 0 | } |
3418 | | |
/* Converts the code point c according to change_case: to lower case if
 * any of the CHANGE_CASE_LOWER_MASK bits is set, to upper case for every
 * other value (CHANGE_CASE_NONE included). */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))
3424 | | |
3425 | | static void |
3426 | | string_append (GString *string, |
3427 | | const gchar *text, |
3428 | | ChangeCase *change_case) |
3429 | 0 | { |
3430 | 0 | gunichar c; |
3431 | |
|
3432 | 0 | if (text[0] == '\0') |
3433 | 0 | return; |
3434 | | |
3435 | 0 | if (*change_case == CHANGE_CASE_NONE) |
3436 | 0 | { |
3437 | 0 | g_string_append (string, text); |
3438 | 0 | } |
3439 | 0 | else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
3440 | 0 | { |
3441 | 0 | c = g_utf8_get_char (text); |
3442 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3443 | 0 | g_string_append (string, g_utf8_next_char (text)); |
3444 | 0 | *change_case = CHANGE_CASE_NONE; |
3445 | 0 | } |
3446 | 0 | else |
3447 | 0 | { |
3448 | 0 | while (*text != '\0') |
3449 | 0 | { |
3450 | 0 | c = g_utf8_get_char (text); |
3451 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3452 | 0 | text = g_utf8_next_char (text); |
3453 | 0 | } |
3454 | 0 | } |
3455 | 0 | } |
3456 | | |
3457 | | static gboolean |
3458 | | interpolate_replacement (const GMatchInfo *match_info, |
3459 | | GString *result, |
3460 | | gpointer data) |
3461 | 0 | { |
3462 | 0 | GList *list; |
3463 | 0 | InterpolationData *idata; |
3464 | 0 | gchar *match; |
3465 | 0 | ChangeCase change_case = CHANGE_CASE_NONE; |
3466 | |
|
3467 | 0 | for (list = data; list; list = list->next) |
3468 | 0 | { |
3469 | 0 | idata = list->data; |
3470 | 0 | switch (idata->type) |
3471 | 0 | { |
3472 | 0 | case REPL_TYPE_STRING: |
3473 | 0 | string_append (result, idata->text, &change_case); |
3474 | 0 | break; |
3475 | 0 | case REPL_TYPE_CHARACTER: |
3476 | 0 | g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
3477 | 0 | if (change_case & CHANGE_CASE_SINGLE_MASK) |
3478 | 0 | change_case = CHANGE_CASE_NONE; |
3479 | 0 | break; |
3480 | 0 | case REPL_TYPE_NUMERIC_REFERENCE: |
3481 | 0 | match = g_match_info_fetch (match_info, idata->num); |
3482 | 0 | if (match) |
3483 | 0 | { |
3484 | 0 | string_append (result, match, &change_case); |
3485 | 0 | g_free (match); |
3486 | 0 | } |
3487 | 0 | break; |
3488 | 0 | case REPL_TYPE_SYMBOLIC_REFERENCE: |
3489 | 0 | match = g_match_info_fetch_named (match_info, idata->text); |
3490 | 0 | if (match) |
3491 | 0 | { |
3492 | 0 | string_append (result, match, &change_case); |
3493 | 0 | g_free (match); |
3494 | 0 | } |
3495 | 0 | break; |
3496 | 0 | case REPL_TYPE_CHANGE_CASE: |
3497 | 0 | change_case = idata->change_case; |
3498 | 0 | break; |
3499 | 0 | } |
3500 | 0 | } |
3501 | | |
3502 | 0 | return FALSE; |
3503 | 0 | } |
3504 | | |
3505 | | /* whether actual match_info is needed for replacement, i.e. |
3506 | | * whether there are references |
3507 | | */ |
3508 | | static gboolean |
3509 | | interpolation_list_needs_match (GList *list) |
3510 | 0 | { |
3511 | 0 | while (list != NULL) |
3512 | 0 | { |
3513 | 0 | InterpolationData *data = list->data; |
3514 | |
|
3515 | 0 | if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
3516 | 0 | data->type == REPL_TYPE_NUMERIC_REFERENCE) |
3517 | 0 | { |
3518 | 0 | return TRUE; |
3519 | 0 | } |
3520 | | |
3521 | 0 | list = list->next; |
3522 | 0 | } |
3523 | | |
3524 | 0 | return FALSE; |
3525 | 0 | } |
3526 | | |
3527 | | /** |
3528 | | * g_regex_replace: |
3529 | | * @regex: a #GRegex structure |
3530 | | * @string: (array length=string_len): the string to perform matches against |
3531 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3532 | | * @start_position: starting index of the string to match, in bytes |
3533 | | * @replacement: text to replace each match with |
3534 | | * @match_options: options for the match |
3535 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3536 | | * |
3537 | | * Replaces all occurrences of the pattern in @regex with the |
3538 | | * replacement text. Backreferences of the form `\number` or |
3539 | | * `\g<number>` in the replacement text are interpolated by the |
3540 | | * number-th captured subexpression of the match, `\g<name>` refers |
3541 | | * to the captured subexpression with the given name. `\0` refers |
3542 | | * to the complete match, but `\0` followed by a number is the octal |
3543 | | * representation of a character. To include a literal `\` in the |
3544 | | * replacement, write `\\\\`. |
3545 | | * |
3546 | | * There are also escapes that changes the case of the following text: |
3547 | | * |
3548 | | * - \l: Convert to lower case the next character |
3549 | | * - \u: Convert to upper case the next character |
3550 | | * - \L: Convert to lower case till \E |
3551 | | * - \U: Convert to upper case till \E |
3552 | | * - \E: End case modification |
3553 | | * |
3554 | | * If you do not need to use backreferences use g_regex_replace_literal(). |
3555 | | * |
3556 | | * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was |
3557 | | * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
3558 | | * you can use g_regex_replace_literal(). |
3559 | | * |
3560 | | * Setting @start_position differs from just passing over a shortened |
3561 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that |
3562 | | * begins with any kind of lookbehind assertion, such as "\b". |
3563 | | * |
3564 | | * Returns: a newly allocated string containing the replacements |
3565 | | * |
3566 | | * Since: 2.14 |
3567 | | */ |
3568 | | gchar * |
3569 | | g_regex_replace (const GRegex *regex, |
3570 | | const gchar *string, |
3571 | | gssize string_len, |
3572 | | gint start_position, |
3573 | | const gchar *replacement, |
3574 | | GRegexMatchFlags match_options, |
3575 | | GError **error) |
3576 | 0 | { |
3577 | 0 | gchar *result; |
3578 | 0 | GList *list; |
3579 | 0 | GError *tmp_error = NULL; |
3580 | |
|
3581 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3582 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3583 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3584 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3585 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
3586 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3587 | | |
3588 | 0 | list = split_replacement (replacement, &tmp_error); |
3589 | 0 | if (tmp_error != NULL) |
3590 | 0 | { |
3591 | 0 | g_propagate_error (error, tmp_error); |
3592 | 0 | return NULL; |
3593 | 0 | } |
3594 | | |
3595 | 0 | result = g_regex_replace_eval (regex, |
3596 | 0 | string, string_len, start_position, |
3597 | 0 | match_options, |
3598 | 0 | interpolate_replacement, |
3599 | 0 | (gpointer)list, |
3600 | 0 | &tmp_error); |
3601 | 0 | if (tmp_error != NULL) |
3602 | 0 | g_propagate_error (error, tmp_error); |
3603 | |
|
3604 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3605 | |
|
3606 | 0 | return result; |
3607 | 0 | } |
3608 | | |
3609 | | static gboolean |
3610 | | literal_replacement (const GMatchInfo *match_info, |
3611 | | GString *result, |
3612 | | gpointer data) |
3613 | 0 | { |
3614 | 0 | g_string_append (result, data); |
3615 | 0 | return FALSE; |
3616 | 0 | } |
3617 | | |
3618 | | /** |
3619 | | * g_regex_replace_literal: |
3620 | | * @regex: a #GRegex structure |
3621 | | * @string: (array length=string_len): the string to perform matches against |
3622 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3623 | | * @start_position: starting index of the string to match, in bytes |
3624 | | * @replacement: text to replace each match with |
3625 | | * @match_options: options for the match |
3626 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3627 | | * |
3628 | | * Replaces all occurrences of the pattern in @regex with the |
3629 | | * replacement text. @replacement is replaced literally, to |
3630 | | * include backreferences use g_regex_replace(). |
3631 | | * |
3632 | | * Setting @start_position differs from just passing over a |
3633 | | * shortened string and setting %G_REGEX_MATCH_NOTBOL in the |
3634 | | * case of a pattern that begins with any kind of lookbehind |
3635 | | * assertion, such as "\b". |
3636 | | * |
3637 | | * Returns: a newly allocated string containing the replacements |
3638 | | * |
3639 | | * Since: 2.14 |
3640 | | */ |
3641 | | gchar * |
3642 | | g_regex_replace_literal (const GRegex *regex, |
3643 | | const gchar *string, |
3644 | | gssize string_len, |
3645 | | gint start_position, |
3646 | | const gchar *replacement, |
3647 | | GRegexMatchFlags match_options, |
3648 | | GError **error) |
3649 | 0 | { |
3650 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3651 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3652 | | |
3653 | 0 | return g_regex_replace_eval (regex, |
3654 | 0 | string, string_len, start_position, |
3655 | 0 | match_options, |
3656 | 0 | literal_replacement, |
3657 | 0 | (gpointer)replacement, |
3658 | 0 | error); |
3659 | 0 | } |
3660 | | |
3661 | | /** |
3662 | | * g_regex_replace_eval: |
3663 | | * @regex: a #GRegex structure from g_regex_new() |
3664 | | * @string: (array length=string_len): string to perform matches against |
3665 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3666 | | * @start_position: starting index of the string to match, in bytes |
3667 | | * @match_options: options for the match |
3668 | | * @eval: (scope call): a function to call for each match |
3669 | | * @user_data: user data to pass to the function |
3670 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3671 | | * |
3672 | | * Replaces occurrences of the pattern in regex with the output of |
3673 | | * @eval for that occurrence. |
3674 | | * |
3675 | | * Setting @start_position differs from just passing over a shortened |
3676 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
3677 | | * that begins with any kind of lookbehind assertion, such as "\b". |
3678 | | * |
3679 | | * The following example uses g_regex_replace_eval() to replace multiple |
3680 | | * strings at once: |
3681 | | * |[<!-- language="C" --> |
3682 | | * static gboolean |
3683 | | * eval_cb (const GMatchInfo *info, |
3684 | | * GString *res, |
3685 | | * gpointer data) |
3686 | | * { |
3687 | | * gchar *match; |
3688 | | * gchar *r; |
3689 | | * |
3690 | | * match = g_match_info_fetch (info, 0); |
3691 | | * r = g_hash_table_lookup ((GHashTable *)data, match); |
3692 | | * g_string_append (res, r); |
3693 | | * g_free (match); |
3694 | | * |
3695 | | * return FALSE; |
3696 | | * } |
3697 | | * |
3698 | | * ... |
3699 | | * |
3700 | | * GRegex *reg; |
3701 | | * GHashTable *h; |
3702 | | * gchar *res; |
3703 | | * |
3704 | | * h = g_hash_table_new (g_str_hash, g_str_equal); |
3705 | | * |
3706 | | * g_hash_table_insert (h, "1", "ONE"); |
3707 | | * g_hash_table_insert (h, "2", "TWO"); |
3708 | | * g_hash_table_insert (h, "3", "THREE"); |
3709 | | * g_hash_table_insert (h, "4", "FOUR"); |
3710 | | * |
3711 | | * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
3712 | | * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); |
3713 | | * g_hash_table_destroy (h); |
3714 | | * |
3715 | | * ... |
3716 | | * ]| |
3717 | | * |
3718 | | * Returns: a newly allocated string containing the replacements |
3719 | | * |
3720 | | * Since: 2.14 |
3721 | | */ |
3722 | | gchar * |
3723 | | g_regex_replace_eval (const GRegex *regex, |
3724 | | const gchar *string, |
3725 | | gssize string_len, |
3726 | | gint start_position, |
3727 | | GRegexMatchFlags match_options, |
3728 | | GRegexEvalCallback eval, |
3729 | | gpointer user_data, |
3730 | | GError **error) |
3731 | 0 | { |
3732 | 0 | GMatchInfo *match_info; |
3733 | 0 | GString *result; |
3734 | 0 | gint str_pos = 0; |
3735 | 0 | gboolean done = FALSE; |
3736 | 0 | GError *tmp_error = NULL; |
3737 | |
|
3738 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3739 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3740 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3741 | 0 | g_return_val_if_fail (eval != NULL, NULL); |
3742 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3743 | | |
3744 | 0 | if (string_len < 0) |
3745 | 0 | string_len = strlen (string); |
3746 | |
|
3747 | 0 | result = g_string_sized_new (string_len); |
3748 | | |
3749 | | /* run down the string making matches. */ |
3750 | 0 | g_regex_match_full (regex, string, string_len, start_position, |
3751 | 0 | match_options, &match_info, &tmp_error); |
3752 | 0 | while (!done && g_match_info_matches (match_info)) |
3753 | 0 | { |
3754 | 0 | g_string_append_len (result, |
3755 | 0 | string + str_pos, |
3756 | 0 | match_info->offsets[0] - str_pos); |
3757 | 0 | done = (*eval) (match_info, result, user_data); |
3758 | 0 | str_pos = match_info->offsets[1]; |
3759 | 0 | g_match_info_next (match_info, &tmp_error); |
3760 | 0 | } |
3761 | 0 | g_match_info_free (match_info); |
3762 | 0 | if (tmp_error != NULL) |
3763 | 0 | { |
3764 | 0 | g_propagate_error (error, tmp_error); |
3765 | 0 | g_string_free (result, TRUE); |
3766 | 0 | return NULL; |
3767 | 0 | } |
3768 | | |
3769 | 0 | g_string_append_len (result, string + str_pos, string_len - str_pos); |
3770 | 0 | return g_string_free (result, FALSE); |
3771 | 0 | } |
3772 | | |
3773 | | /** |
3774 | | * g_regex_check_replacement: |
3775 | | * @replacement: the replacement string |
3776 | | * @has_references: (out) (optional): location to store information about |
3777 | | * references in @replacement or %NULL |
3778 | | * @error: location to store error |
3779 | | * |
3780 | | * Checks whether @replacement is a valid replacement string |
3781 | | * (see g_regex_replace()), i.e. that all escape sequences in |
3782 | | * it are valid. |
3783 | | * |
3784 | | * If @has_references is not %NULL then @replacement is checked |
3785 | | * for pattern references. For instance, replacement text 'foo\n' |
3786 | | * does not contain references and may be evaluated without information |
3787 | | * about actual match, but '\0\1' (whole match followed by first |
3788 | | * subpattern) requires valid #GMatchInfo object. |
3789 | | * |
3790 | | * Returns: whether @replacement is a valid replacement string |
3791 | | * |
3792 | | * Since: 2.14 |
3793 | | */ |
3794 | | gboolean |
3795 | | g_regex_check_replacement (const gchar *replacement, |
3796 | | gboolean *has_references, |
3797 | | GError **error) |
3798 | 0 | { |
3799 | 0 | GList *list; |
3800 | 0 | GError *tmp = NULL; |
3801 | |
|
3802 | 0 | list = split_replacement (replacement, &tmp); |
3803 | |
|
3804 | 0 | if (tmp) |
3805 | 0 | { |
3806 | 0 | g_propagate_error (error, tmp); |
3807 | 0 | return FALSE; |
3808 | 0 | } |
3809 | | |
3810 | 0 | if (has_references) |
3811 | 0 | *has_references = interpolation_list_needs_match (list); |
3812 | |
|
3813 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3814 | |
|
3815 | 0 | return TRUE; |
3816 | 0 | } |
3817 | | |
3818 | | /** |
3819 | | * g_regex_escape_nul: |
3820 | | * @string: the string to escape |
3821 | | * @length: the length of @string |
3822 | | * |
3823 | | * Escapes the nul characters in @string to "\x00". It can be used |
3824 | | * to compile a regex with embedded nul characters. |
3825 | | * |
3826 | | * For completeness, @length can be -1 for a nul-terminated string. |
3827 | | * In this case the output string will be of course equal to @string. |
3828 | | * |
3829 | | * Returns: a newly-allocated escaped string |
3830 | | * |
3831 | | * Since: 2.30 |
3832 | | */ |
3833 | | gchar * |
3834 | | g_regex_escape_nul (const gchar *string, |
3835 | | gint length) |
3836 | 0 | { |
3837 | 0 | GString *escaped; |
3838 | 0 | const gchar *p, *piece_start, *end; |
3839 | 0 | gint backslashes; |
3840 | |
|
3841 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3842 | | |
3843 | 0 | if (length < 0) |
3844 | 0 | return g_strdup (string); |
3845 | | |
3846 | 0 | end = string + length; |
3847 | 0 | p = piece_start = string; |
3848 | 0 | escaped = g_string_sized_new (length + 1); |
3849 | |
|
3850 | 0 | backslashes = 0; |
3851 | 0 | while (p < end) |
3852 | 0 | { |
3853 | 0 | switch (*p) |
3854 | 0 | { |
3855 | 0 | case '\0': |
3856 | 0 | if (p != piece_start) |
3857 | 0 | { |
3858 | | /* copy the previous piece. */ |
3859 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3860 | 0 | } |
3861 | 0 | if ((backslashes & 1) == 0) |
3862 | 0 | g_string_append_c (escaped, '\\'); |
3863 | 0 | g_string_append_c (escaped, 'x'); |
3864 | 0 | g_string_append_c (escaped, '0'); |
3865 | 0 | g_string_append_c (escaped, '0'); |
3866 | 0 | piece_start = ++p; |
3867 | 0 | backslashes = 0; |
3868 | 0 | break; |
3869 | 0 | case '\\': |
3870 | 0 | backslashes++; |
3871 | 0 | ++p; |
3872 | 0 | break; |
3873 | 0 | default: |
3874 | 0 | backslashes = 0; |
3875 | 0 | p = g_utf8_next_char (p); |
3876 | 0 | break; |
3877 | 0 | } |
3878 | 0 | } |
3879 | | |
3880 | 0 | if (piece_start < end) |
3881 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3882 | |
|
3883 | 0 | return g_string_free (escaped, FALSE); |
3884 | 0 | } |
3885 | | |
3886 | | /** |
3887 | | * g_regex_escape_string: |
3888 | | * @string: the string to escape |
3889 | | * @length: the length of @string, in bytes, or -1 if @string is nul-terminated |
3890 | | * |
3891 | | * Escapes the special characters used for regular expressions |
3892 | | * in @string, for instance "a.b*c" becomes "a\.b\*c". This |
3893 | | * function is useful to dynamically generate regular expressions. |
3894 | | * |
3895 | | * @string can contain nul characters that are replaced with "\0", |
3896 | | * in this case remember to specify the correct length of @string |
3897 | | * in @length. |
3898 | | * |
3899 | | * Returns: a newly-allocated escaped string |
3900 | | * |
3901 | | * Since: 2.14 |
3902 | | */ |
3903 | | gchar * |
3904 | | g_regex_escape_string (const gchar *string, |
3905 | | gint length) |
3906 | 0 | { |
3907 | 0 | GString *escaped; |
3908 | 0 | const char *p, *piece_start, *end; |
3909 | |
|
3910 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3911 | | |
3912 | 0 | if (length < 0) |
3913 | 0 | length = strlen (string); |
3914 | |
|
3915 | 0 | end = string + length; |
3916 | 0 | p = piece_start = string; |
3917 | 0 | escaped = g_string_sized_new (length + 1); |
3918 | |
|
3919 | 0 | while (p < end) |
3920 | 0 | { |
3921 | 0 | switch (*p) |
3922 | 0 | { |
3923 | 0 | case '\0': |
3924 | 0 | case '\\': |
3925 | 0 | case '|': |
3926 | 0 | case '(': |
3927 | 0 | case ')': |
3928 | 0 | case '[': |
3929 | 0 | case ']': |
3930 | 0 | case '{': |
3931 | 0 | case '}': |
3932 | 0 | case '^': |
3933 | 0 | case '$': |
3934 | 0 | case '*': |
3935 | 0 | case '+': |
3936 | 0 | case '?': |
3937 | 0 | case '.': |
3938 | 0 | if (p != piece_start) |
3939 | | /* copy the previous piece. */ |
3940 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3941 | 0 | g_string_append_c (escaped, '\\'); |
3942 | 0 | if (*p == '\0') |
3943 | 0 | g_string_append_c (escaped, '0'); |
3944 | 0 | else |
3945 | 0 | g_string_append_c (escaped, *p); |
3946 | 0 | piece_start = ++p; |
3947 | 0 | break; |
3948 | 0 | default: |
3949 | 0 | p = g_utf8_next_char (p); |
3950 | 0 | break; |
3951 | 0 | } |
3952 | 0 | } |
3953 | | |
3954 | 0 | if (piece_start < end) |
3955 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3956 | |
|
3957 | 0 | return g_string_free (escaped, FALSE); |
3958 | 0 | } |