Line | Count | Source (jump to first uncovered line) |
1 | | /* GRegex -- regular expression API wrapper around PCRE. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Scott Wimer |
4 | | * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
5 | | * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
6 | | * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> |
7 | | * |
8 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
9 | | * |
10 | | * This library is free software; you can redistribute it and/or |
11 | | * modify it under the terms of the GNU Lesser General Public |
12 | | * License as published by the Free Software Foundation; either |
13 | | * version 2.1 of the License, or (at your option) any later version. |
14 | | * |
15 | | * This library is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | | * Lesser General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU Lesser General Public License |
21 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
22 | | */ |
23 | | |
24 | | #include "config.h" |
25 | | |
26 | | #include <stdint.h> |
27 | | #include <string.h> |
28 | | |
29 | | #define PCRE2_CODE_UNIT_WIDTH 8 |
30 | | #include <pcre2.h> |
31 | | |
32 | | #include "gtypes.h" |
33 | | #include "gregex.h" |
34 | | #include "glibintl.h" |
35 | | #include "glist.h" |
36 | | #include "gmessages.h" |
37 | | #include "gstrfuncs.h" |
38 | | #include "gatomic.h" |
39 | | #include "gtestutils.h" |
40 | | #include "gthread.h" |
41 | | |
42 | | /** |
43 | | * SECTION:gregex |
44 | | * @title: Perl-compatible regular expressions |
45 | | * @short_description: matches strings against regular expressions |
46 | | * @see_also: [Regular expression syntax][glib-regex-syntax] |
47 | | * |
48 | | * The g_regex_*() functions implement regular |
49 | | * expression pattern matching using syntax and semantics similar to |
50 | | * Perl regular expression. |
51 | | * |
52 | | * Some functions accept a @start_position argument; setting it differs |
53 | | * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL |
54 | | * in the case of a pattern that begins with any kind of lookbehind assertion. |
55 | | * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" |
56 | | * in the middle of words. ("\B" matches only if the current position in the |
57 | | * subject is not a word boundary.) When applied to the string "Mississippi" |
58 | | * from the fourth byte, namely "issippi", it does not match, because "\B" is |
59 | | * always false at the start of the subject, which is deemed to be a word |
60 | | * boundary. However, if the entire string is passed, but with |
61 | | * @start_position set to 4, it finds the second occurrence of "iss" because |
62 | | * it is able to look behind the starting point to discover that it is |
63 | | * preceded by a letter. |
64 | | * |
65 | | * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed |
66 | | * to these functions must be encoded in UTF-8. The lengths and the positions |
67 | | * inside the strings are in bytes and not in characters, so, for instance, |
68 | | * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a |
69 | | * single character. If you set %G_REGEX_RAW the strings can be non-valid |
70 | | * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two |
71 | | * bytes and two characters long. |
72 | | * |
73 | | * When matching a pattern, "\n" matches only against a "\n" character in |
74 | | * the string, and "\r" matches only a "\r" character. To match any newline |
75 | | * sequence use "\R". This particular group matches either the two-character |
76 | | * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, |
77 | | * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), |
78 | | * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line |
79 | | * separator, U+2028), or PS (paragraph separator, U+2029). |
80 | | * |
81 | | * The behaviour of the dot, circumflex, and dollar metacharacters is |
82 | | * affected by newline characters; the default is to recognize any newline |
83 | | * character (the same characters recognized by "\R"). This can be changed |
84 | | * with %G_REGEX_NEWLINE_CR, %G_REGEX_NEWLINE_LF and %G_REGEX_NEWLINE_CRLF |
85 | | * compile options, and with %G_REGEX_MATCH_NEWLINE_ANY, |
86 | | * %G_REGEX_MATCH_NEWLINE_CR, %G_REGEX_MATCH_NEWLINE_LF and |
87 | | * %G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also |
88 | | * relevant when compiling a pattern if %G_REGEX_EXTENDED is set, and an |
89 | | * unescaped "#" outside a character class is encountered. This indicates |
90 | | * a comment that lasts until after the next newline. |
91 | | * |
92 | | * Creating and manipulating the same #GRegex structure from different |
93 | | * threads is not a problem as #GRegex does not modify its internal |
94 | | * state between creation and destruction, on the other hand #GMatchInfo |
95 | | * is not threadsafe. |
96 | | * |
97 | | * The regular expressions low-level functionalities are obtained through |
98 | | * the excellent |
99 | | * [PCRE](http://www.pcre.org/) |
100 | | * library written by Philip Hazel. |
101 | | */ |
102 | | |
/* PCRE2 option bits that are included in both G_REGEX_PCRE2_COMPILE_MASK
 * and G_REGEX_PCRE2_MATCH_MASK. */
#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \
                                   PCRE2_NO_UTF_CHECK | \
                                   PCRE2_ENDANCHORED)
106 | | |
/* Mask of all the possible values for GRegexCompileFlags.
 * g_regex_compile_flags_from_pcre2() applies it to the flags it returns. */
#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \
                              G_REGEX_CASELESS | \
                              G_REGEX_MULTILINE | \
                              G_REGEX_DOTALL | \
                              G_REGEX_EXTENDED | \
                              G_REGEX_ANCHORED | \
                              G_REGEX_DOLLAR_ENDONLY | \
                              G_REGEX_UNGREEDY | \
                              G_REGEX_RAW | \
                              G_REGEX_NO_AUTO_CAPTURE | \
                              G_REGEX_OPTIMIZE | \
                              G_REGEX_FIRSTLINE | \
                              G_REGEX_DUPNAMES | \
                              G_REGEX_NEWLINE_CR | \
                              G_REGEX_NEWLINE_LF | \
                              G_REGEX_NEWLINE_CRLF | \
                              G_REGEX_NEWLINE_ANYCRLF | \
                              G_REGEX_BSR_ANYCRLF)
126 | | |
/* PCRE2 option bits accepted at compile time.
 * get_pcre2_compile_options() applies this mask to the value it returns. */
#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \
                                    PCRE2_ALT_BSUX | \
                                    PCRE2_AUTO_CALLOUT | \
                                    PCRE2_CASELESS | \
                                    PCRE2_DOLLAR_ENDONLY | \
                                    PCRE2_DOTALL | \
                                    PCRE2_DUPNAMES | \
                                    PCRE2_EXTENDED | \
                                    PCRE2_FIRSTLINE | \
                                    PCRE2_MATCH_UNSET_BACKREF | \
                                    PCRE2_MULTILINE | \
                                    PCRE2_NEVER_UCP | \
                                    PCRE2_NEVER_UTF | \
                                    PCRE2_NO_AUTO_CAPTURE | \
                                    PCRE2_NO_AUTO_POSSESS | \
                                    PCRE2_NO_DOTSTAR_ANCHOR | \
                                    PCRE2_NO_START_OPTIMIZE | \
                                    PCRE2_UCP | \
                                    PCRE2_UNGREEDY | \
                                    PCRE2_UTF | \
                                    PCRE2_NEVER_BACKSLASH_C | \
                                    PCRE2_ALT_CIRCUMFLEX | \
                                    PCRE2_ALT_VERBNAMES | \
                                    PCRE2_USE_OFFSET_LIMIT | \
                                    PCRE2_EXTENDED_MORE | \
                                    PCRE2_LITERAL | \
                                    PCRE2_MATCH_INVALID_UTF | \
                                    G_REGEX_PCRE_GENERIC_MASK)

/* NOTE(review): the users of this mask are outside this part of the file;
 * presumably it marks PCRE2 bits with no direct GRegexCompileFlags
 * counterpart (PCRE2_UTF is derived from the absence of G_REGEX_RAW) --
 * confirm against the callers. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
157 | | |
/* Mask of all the possible values for GRegexMatchFlags.
 * g_regex_match_flags_from_pcre2() applies it to the flags it returns. */
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \
                            G_REGEX_MATCH_ANCHORED | \
                            G_REGEX_MATCH_NOTBOL | \
                            G_REGEX_MATCH_NOTEOL | \
                            G_REGEX_MATCH_NOTEMPTY | \
                            G_REGEX_MATCH_PARTIAL | \
                            G_REGEX_MATCH_NEWLINE_CR | \
                            G_REGEX_MATCH_NEWLINE_LF | \
                            G_REGEX_MATCH_NEWLINE_CRLF | \
                            G_REGEX_MATCH_NEWLINE_ANY | \
                            G_REGEX_MATCH_NEWLINE_ANYCRLF | \
                            G_REGEX_MATCH_BSR_ANYCRLF | \
                            G_REGEX_MATCH_BSR_ANY | \
                            G_REGEX_MATCH_PARTIAL_SOFT | \
                            G_REGEX_MATCH_PARTIAL_HARD | \
                            G_REGEX_MATCH_NOTEMPTY_ATSTART)
175 | | |
/* PCRE2 option bits accepted at match time.
 * get_pcre2_match_options() applies this mask to the value it returns. */
#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\
                                  PCRE2_NOTEOL |\
                                  PCRE2_NOTEMPTY |\
                                  PCRE2_NOTEMPTY_ATSTART |\
                                  PCRE2_PARTIAL_SOFT |\
                                  PCRE2_PARTIAL_HARD |\
                                  PCRE2_NO_JIT |\
                                  PCRE2_COPY_MATCHED_SUBJECT |\
                                  G_REGEX_PCRE_GENERIC_MASK)
185 | | |
/* Union of the PCRE2 newline conventions listed below.
 * TODO: Support PCRE2_NEWLINE_NUL */
#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \
                              PCRE2_NEWLINE_LF | \
                              PCRE2_NEWLINE_CRLF | \
                              PCRE2_NEWLINE_ANYCRLF)

/* Some match options are not supported when using JIT as stated in the
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
 * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
 */
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \
                                               PCRE2_ENDANCHORED)
198 | | |
/* GRegexCompileFlags bits that select a newline convention; used by
 * get_pcre2_newline_compile_options() to isolate the newline selection. */
#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \
                                      G_REGEX_NEWLINE_LF | \
                                      G_REGEX_NEWLINE_CRLF | \
                                      G_REGEX_NEWLINE_ANYCRLF)

/* GRegexMatchFlags bits that select a newline convention; used by
 * get_pcre2_newline_match_options() to isolate the newline selection. */
#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \
                                    G_REGEX_MATCH_NEWLINE_LF | \
                                    G_REGEX_MATCH_NEWLINE_CRLF | \
                                    G_REGEX_MATCH_NEWLINE_ANY | \
                                    G_REGEX_MATCH_NEWLINE_ANYCRLF)
209 | | |
/* Step one character forwards (NEXT_CHAR) or backwards (PREV_CHAR) in a
 * subject string: if the string is in UTF-8 use the g_utf8_ functions,
 * else just use +/- 1.
 *
 * (re)->compile_opts holds PCRE2 option bits (see struct _GRegex), so UTF-8
 * mode has to be detected through PCRE2_UTF. G_REGEX_RAW is a
 * GRegexCompileFlags value, not a PCRE2 option bit, and
 * get_pcre2_compile_options() expresses "raw" as the absence of PCRE2_UTF;
 * testing G_REGEX_RAW against compile_opts would inspect an unrelated bit.
 *
 * @re: a GRegex pointer
 * @s:  pointer into the subject string
 */
#define NEXT_CHAR(re, s) (!((re)->compile_opts & PCRE2_UTF) ? \
                          ((s) + 1) : \
                          g_utf8_next_char (s))
#define PREV_CHAR(re, s) (!((re)->compile_opts & PCRE2_UTF) ? \
                          ((s) - 1) : \
                          g_utf8_prev_char (s))
218 | | |
/* State kept for matching a GRegex against a string: the options used,
 * the offsets of the matched sub patterns, the DFA workspace and the
 * PCRE2 objects used to run the match. */
struct _GMatchInfo
{
  gint ref_count;               /* the ref count (atomic) */
  GRegex *regex;                /* the regex */
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
  gint pos;                     /* position in the string where last match left off */
  uint32_t n_offsets;           /* number of offsets */
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace;              /* workspace for pcre2_dfa_match() */
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
  const gchar *string;          /* string passed to the match function */
  gssize string_len;            /* length of string, in bytes */
  pcre2_match_context *match_context;
  pcre2_match_data *match_data;
  pcre2_jit_stack *jit_stack;
};
237 | | |
/* Outcome of trying to enable the JIT compiler for a compiled regex;
 * stored in the jit_status field of struct _GRegex. */
typedef enum
{
  JIT_STATUS_DEFAULT,   /* enablement was never tried */
  JIT_STATUS_ENABLED,   /* enablement was tried and successful */
  JIT_STATUS_DISABLED   /* enablement was tried and failed; do not try again */
} JITStatus;
244 | | |
/* A compiled regular expression. Options are stored twice: once as the
 * PCRE2 values (compile_opts, match_opts) and once as the GRegex values
 * they came from (orig_compile_opts, orig_match_opts). */
struct _GRegex
{
  gint ref_count;               /* the ref count for the immutable part (atomic) */
  gchar *pattern;               /* the pattern */
  pcre2_code *pcre_re;          /* compiled form of the pattern */
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
  uint32_t jit_options;         /* options which were enabled for jit compiler */
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
  /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation,
   * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever
   * enabled for the current regex AND current set of jit_options.
   * JIT_STATUS_DEFAULT means enablement was never tried,
   * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it),
   * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again).
   */
};
264 | | |
/* TRUE if ret is an error code, FALSE otherwise.
 * Values greater than or equal to PCRE2_ERROR_NOMATCH are not errors, and
 * neither is PCRE2_ERROR_PARTIAL. */
#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
267 | | |
/* Forward declarations for the replacement-interpolation helpers; their
 * definitions are not in this part of the file. */
typedef struct _InterpolationData InterpolationData;
static gboolean  interpolation_list_needs_match (GList *list);
static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
                                                 GString *result,
                                                 gpointer data);
static GList    *split_replacement              (const gchar *replacement,
                                                 GError **error);
static void      free_interpolation_data        (InterpolationData *data);
276 | | |
277 | | static uint32_t |
278 | | get_pcre2_compile_options (GRegexCompileFlags compile_flags) |
279 | 1 | { |
280 | | /* Maps compile flags to pcre2 values */ |
281 | 1 | uint32_t pcre2_flags = 0; |
282 | | |
283 | 1 | if (compile_flags & G_REGEX_CASELESS) |
284 | 0 | pcre2_flags |= PCRE2_CASELESS; |
285 | 1 | if (compile_flags & G_REGEX_MULTILINE) |
286 | 0 | pcre2_flags |= PCRE2_MULTILINE; |
287 | 1 | if (compile_flags & G_REGEX_DOTALL) |
288 | 0 | pcre2_flags |= PCRE2_DOTALL; |
289 | 1 | if (compile_flags & G_REGEX_EXTENDED) |
290 | 0 | pcre2_flags |= PCRE2_EXTENDED; |
291 | 1 | if (compile_flags & G_REGEX_ANCHORED) |
292 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
293 | 1 | if (compile_flags & G_REGEX_DOLLAR_ENDONLY) |
294 | 0 | pcre2_flags |= PCRE2_DOLLAR_ENDONLY; |
295 | 1 | if (compile_flags & G_REGEX_UNGREEDY) |
296 | 0 | pcre2_flags |= PCRE2_UNGREEDY; |
297 | 1 | if (!(compile_flags & G_REGEX_RAW)) |
298 | 1 | pcre2_flags |= PCRE2_UTF; |
299 | 1 | if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) |
300 | 0 | pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; |
301 | 1 | if (compile_flags & G_REGEX_FIRSTLINE) |
302 | 0 | pcre2_flags |= PCRE2_FIRSTLINE; |
303 | 1 | if (compile_flags & G_REGEX_DUPNAMES) |
304 | 0 | pcre2_flags |= PCRE2_DUPNAMES; |
305 | | |
306 | 1 | return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; |
307 | 1 | } |
308 | | |
309 | | static uint32_t |
310 | | get_pcre2_match_options (GRegexMatchFlags match_flags, |
311 | | GRegexCompileFlags compile_flags) |
312 | 3.19k | { |
313 | | /* Maps match flags to pcre2 values */ |
314 | 3.19k | uint32_t pcre2_flags = 0; |
315 | | |
316 | 3.19k | if (match_flags & G_REGEX_MATCH_ANCHORED) |
317 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
318 | 3.19k | if (match_flags & G_REGEX_MATCH_NOTBOL) |
319 | 0 | pcre2_flags |= PCRE2_NOTBOL; |
320 | 3.19k | if (match_flags & G_REGEX_MATCH_NOTEOL) |
321 | 0 | pcre2_flags |= PCRE2_NOTEOL; |
322 | 3.19k | if (match_flags & G_REGEX_MATCH_NOTEMPTY) |
323 | 0 | pcre2_flags |= PCRE2_NOTEMPTY; |
324 | 3.19k | if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) |
325 | 0 | pcre2_flags |= PCRE2_PARTIAL_SOFT; |
326 | 3.19k | if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) |
327 | 0 | pcre2_flags |= PCRE2_PARTIAL_HARD; |
328 | 3.19k | if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) |
329 | 0 | pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; |
330 | | |
331 | 3.19k | if (compile_flags & G_REGEX_RAW) |
332 | 0 | pcre2_flags |= PCRE2_NO_UTF_CHECK; |
333 | | |
334 | 3.19k | return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; |
335 | 3.19k | } |
336 | | |
337 | | static GRegexCompileFlags |
338 | | g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) |
339 | 0 | { |
340 | 0 | GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; |
341 | |
|
342 | 0 | if (pcre2_flags & PCRE2_CASELESS) |
343 | 0 | compile_flags |= G_REGEX_CASELESS; |
344 | 0 | if (pcre2_flags & PCRE2_MULTILINE) |
345 | 0 | compile_flags |= G_REGEX_MULTILINE; |
346 | 0 | if (pcre2_flags & PCRE2_DOTALL) |
347 | 0 | compile_flags |= G_REGEX_DOTALL; |
348 | 0 | if (pcre2_flags & PCRE2_EXTENDED) |
349 | 0 | compile_flags |= G_REGEX_EXTENDED; |
350 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
351 | 0 | compile_flags |= G_REGEX_ANCHORED; |
352 | 0 | if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) |
353 | 0 | compile_flags |= G_REGEX_DOLLAR_ENDONLY; |
354 | 0 | if (pcre2_flags & PCRE2_UNGREEDY) |
355 | 0 | compile_flags |= G_REGEX_UNGREEDY; |
356 | 0 | if (!(pcre2_flags & PCRE2_UTF)) |
357 | 0 | compile_flags |= G_REGEX_RAW; |
358 | 0 | if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) |
359 | 0 | compile_flags |= G_REGEX_NO_AUTO_CAPTURE; |
360 | 0 | if (pcre2_flags & PCRE2_FIRSTLINE) |
361 | 0 | compile_flags |= G_REGEX_FIRSTLINE; |
362 | 0 | if (pcre2_flags & PCRE2_DUPNAMES) |
363 | 0 | compile_flags |= G_REGEX_DUPNAMES; |
364 | |
|
365 | 0 | return compile_flags & G_REGEX_COMPILE_MASK; |
366 | 0 | } |
367 | | |
368 | | static GRegexMatchFlags |
369 | | g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) |
370 | 0 | { |
371 | 0 | GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; |
372 | |
|
373 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
374 | 0 | match_flags |= G_REGEX_MATCH_ANCHORED; |
375 | 0 | if (pcre2_flags & PCRE2_NOTBOL) |
376 | 0 | match_flags |= G_REGEX_MATCH_NOTBOL; |
377 | 0 | if (pcre2_flags & PCRE2_NOTEOL) |
378 | 0 | match_flags |= G_REGEX_MATCH_NOTEOL; |
379 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY) |
380 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY; |
381 | 0 | if (pcre2_flags & PCRE2_PARTIAL_SOFT) |
382 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; |
383 | 0 | if (pcre2_flags & PCRE2_PARTIAL_HARD) |
384 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_HARD; |
385 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) |
386 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; |
387 | |
|
388 | 0 | return (match_flags & G_REGEX_MATCH_MASK); |
389 | 0 | } |
390 | | |
391 | | static uint32_t |
392 | | get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) |
393 | 1 | { |
394 | 1 | compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; |
395 | | |
396 | 1 | switch (compile_flags) |
397 | 1 | { |
398 | 0 | case G_REGEX_NEWLINE_CR: |
399 | 0 | return PCRE2_NEWLINE_CR; |
400 | 0 | case G_REGEX_NEWLINE_LF: |
401 | 0 | return PCRE2_NEWLINE_LF; |
402 | 0 | case G_REGEX_NEWLINE_CRLF: |
403 | 0 | return PCRE2_NEWLINE_CRLF; |
404 | 0 | case G_REGEX_NEWLINE_ANYCRLF: |
405 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
406 | 1 | default: |
407 | 1 | if (compile_flags != 0) |
408 | 0 | return 0; |
409 | | |
410 | 1 | return PCRE2_NEWLINE_ANY; |
411 | 1 | } |
412 | 1 | } |
413 | | |
414 | | static uint32_t |
415 | | get_pcre2_newline_match_options (GRegexMatchFlags match_flags) |
416 | 1 | { |
417 | 1 | switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) |
418 | 1 | { |
419 | 0 | case G_REGEX_MATCH_NEWLINE_CR: |
420 | 0 | return PCRE2_NEWLINE_CR; |
421 | 0 | case G_REGEX_MATCH_NEWLINE_LF: |
422 | 0 | return PCRE2_NEWLINE_LF; |
423 | 0 | case G_REGEX_MATCH_NEWLINE_CRLF: |
424 | 0 | return PCRE2_NEWLINE_CRLF; |
425 | 0 | case G_REGEX_MATCH_NEWLINE_ANY: |
426 | 0 | return PCRE2_NEWLINE_ANY; |
427 | 0 | case G_REGEX_MATCH_NEWLINE_ANYCRLF: |
428 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
429 | 1 | default: |
430 | 1 | return 0; |
431 | 1 | } |
432 | 1 | } |
433 | | |
434 | | static uint32_t |
435 | | get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) |
436 | 1 | { |
437 | 1 | if (compile_flags & G_REGEX_BSR_ANYCRLF) |
438 | 0 | return PCRE2_BSR_ANYCRLF; |
439 | | |
440 | 1 | return PCRE2_BSR_UNICODE; |
441 | 1 | } |
442 | | |
443 | | static uint32_t |
444 | | get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) |
445 | 1 | { |
446 | 1 | if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) |
447 | 0 | return PCRE2_BSR_ANYCRLF; |
448 | | |
449 | 1 | if (match_flags & G_REGEX_MATCH_BSR_ANY) |
450 | 0 | return PCRE2_BSR_UNICODE; |
451 | | |
452 | 1 | return 0; |
453 | 1 | } |
454 | | |
455 | | static char * |
456 | | get_pcre2_error_string (int errcode) |
457 | 314 | { |
458 | 314 | PCRE2_UCHAR8 error_msg[2048]; |
459 | 314 | int err_length; |
460 | | |
461 | 314 | err_length = pcre2_get_error_message (errcode, error_msg, |
462 | 314 | G_N_ELEMENTS (error_msg)); |
463 | | |
464 | 314 | if (err_length <= 0) |
465 | 0 | return NULL; |
466 | | |
467 | | /* The array is always filled with a trailing zero */ |
468 | 314 | g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); |
469 | 314 | return g_memdup2 (error_msg, err_length + 1); |
470 | 314 | } |
471 | | |
472 | | static const gchar * |
473 | | translate_match_error (gint errcode) |
474 | 314 | { |
475 | 314 | switch (errcode) |
476 | 314 | { |
477 | 0 | case PCRE2_ERROR_NOMATCH: |
478 | | /* not an error */ |
479 | 0 | break; |
480 | 0 | case PCRE2_ERROR_NULL: |
481 | | /* NULL argument, this should not happen in GRegex */ |
482 | 0 | g_critical ("A NULL argument was passed to PCRE"); |
483 | 0 | break; |
484 | 0 | case PCRE2_ERROR_BADOPTION: |
485 | 0 | return "bad options"; |
486 | 0 | case PCRE2_ERROR_BADMAGIC: |
487 | 0 | return _("corrupted object"); |
488 | 0 | case PCRE2_ERROR_NOMEMORY: |
489 | 0 | return _("out of memory"); |
490 | 0 | case PCRE2_ERROR_NOSUBSTRING: |
491 | | /* not used by pcre2_match() */ |
492 | 0 | break; |
493 | 0 | case PCRE2_ERROR_MATCHLIMIT: |
494 | 0 | case PCRE2_ERROR_CALLOUT: |
495 | | /* callouts are not implemented */ |
496 | 0 | break; |
497 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
498 | | /* we do not check if strings are valid */ |
499 | 0 | break; |
500 | 0 | case PCRE2_ERROR_PARTIAL: |
501 | | /* not an error */ |
502 | 0 | break; |
503 | 0 | case PCRE2_ERROR_INTERNAL: |
504 | 0 | return _("internal error"); |
505 | 0 | case PCRE2_ERROR_DFA_UITEM: |
506 | 0 | return _("the pattern contains items not supported for partial matching"); |
507 | 0 | case PCRE2_ERROR_DFA_UCOND: |
508 | 0 | return _("back references as conditions are not supported for partial matching"); |
509 | 0 | case PCRE2_ERROR_DFA_WSSIZE: |
510 | | /* handled expanding the workspace */ |
511 | 0 | break; |
512 | 0 | case PCRE2_ERROR_DFA_RECURSE: |
513 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
514 | 0 | return _("recursion limit reached"); |
515 | 0 | case PCRE2_ERROR_BADOFFSET: |
516 | 0 | return _("bad offset"); |
517 | 0 | case PCRE2_ERROR_RECURSELOOP: |
518 | 0 | return _("recursion loop"); |
519 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
520 | | /* should not happen in GRegex since we check modes before each match */ |
521 | 0 | return _("matching mode is requested that was not compiled for JIT"); |
522 | 314 | default: |
523 | 314 | break; |
524 | 314 | } |
525 | 314 | return NULL; |
526 | 314 | } |
527 | | |
/* Builds the message for a PCRE2 match error.
 *
 * Tries, in order: the translated text from translate_match_error(), the
 * text supplied by PCRE2 itself, and finally a generic "unknown error".
 * The returned string is always newly allocated.
 */
static char *
get_match_error_message (int errcode)
{
  const char *translated = translate_match_error (errcode);
  char *pcre2_text;

  if (translated != NULL)
    return g_strdup (translated);

  pcre2_text = get_pcre2_error_string (errcode);

  return pcre2_text != NULL ? pcre2_text : g_strdup (_("unknown error"));
}
544 | | |
545 | | static void |
546 | | translate_compile_error (gint *errcode, const gchar **errmsg) |
547 | 0 | { |
548 | | /* If errcode is known we put the translatable error message in |
549 | | * errmsg. If errcode is unknown we put the generic |
550 | | * G_REGEX_ERROR_COMPILE error code in errcode. |
551 | | * Note that there can be more PCRE errors with the same GRegexError |
552 | | * and that some PCRE errors are useless for us. |
553 | | */ |
554 | 0 | gint original_errcode = *errcode; |
555 | |
|
556 | 0 | *errcode = -1; |
557 | 0 | *errmsg = NULL; |
558 | |
|
559 | 0 | switch (original_errcode) |
560 | 0 | { |
561 | 0 | case PCRE2_ERROR_END_BACKSLASH: |
562 | 0 | *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; |
563 | 0 | *errmsg = _("\\ at end of pattern"); |
564 | 0 | break; |
565 | 0 | case PCRE2_ERROR_END_BACKSLASH_C: |
566 | 0 | *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; |
567 | 0 | *errmsg = _("\\c at end of pattern"); |
568 | 0 | break; |
569 | 0 | case PCRE2_ERROR_UNKNOWN_ESCAPE: |
570 | 0 | case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
571 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; |
572 | 0 | *errmsg = _("unrecognized character following \\"); |
573 | 0 | break; |
574 | 0 | case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
575 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; |
576 | 0 | *errmsg = _("numbers out of order in {} quantifier"); |
577 | 0 | break; |
578 | 0 | case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
579 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; |
580 | 0 | *errmsg = _("number too big in {} quantifier"); |
581 | 0 | break; |
582 | 0 | case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
583 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; |
584 | 0 | *errmsg = _("missing terminating ] for character class"); |
585 | 0 | break; |
586 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
587 | 0 | *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; |
588 | 0 | *errmsg = _("invalid escape sequence in character class"); |
589 | 0 | break; |
590 | 0 | case PCRE2_ERROR_CLASS_RANGE_ORDER: |
591 | 0 | *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; |
592 | 0 | *errmsg = _("range out of order in character class"); |
593 | 0 | break; |
594 | 0 | case PCRE2_ERROR_QUANTIFIER_INVALID: |
595 | 0 | case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: |
596 | 0 | *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; |
597 | 0 | *errmsg = _("nothing to repeat"); |
598 | 0 | break; |
599 | 0 | case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
600 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
601 | 0 | *errmsg = _("unrecognized character after (? or (?-"); |
602 | 0 | break; |
603 | 0 | case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
604 | 0 | *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; |
605 | 0 | *errmsg = _("POSIX named classes are supported only within a class"); |
606 | 0 | break; |
607 | 0 | case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
608 | 0 | *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; |
609 | 0 | *errmsg = _("POSIX collating elements are not supported"); |
610 | 0 | break; |
611 | 0 | case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
612 | 0 | case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
613 | 0 | case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
614 | 0 | *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; |
615 | 0 | *errmsg = _("missing terminating )"); |
616 | 0 | break; |
617 | 0 | case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
618 | 0 | *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; |
619 | 0 | *errmsg = _("reference to non-existent subpattern"); |
620 | 0 | break; |
621 | 0 | case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
622 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; |
623 | 0 | *errmsg = _("missing ) after comment"); |
624 | 0 | break; |
625 | 0 | case PCRE2_ERROR_PATTERN_TOO_LARGE: |
626 | 0 | *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; |
627 | 0 | *errmsg = _("regular expression is too large"); |
628 | 0 | break; |
629 | 0 | case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
630 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; |
631 | 0 | *errmsg = _("malformed number or name after (?("); |
632 | 0 | break; |
633 | 0 | case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
634 | 0 | *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; |
635 | 0 | *errmsg = _("lookbehind assertion is not fixed length"); |
636 | 0 | break; |
637 | 0 | case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
638 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; |
639 | 0 | *errmsg = _("conditional group contains more than two branches"); |
640 | 0 | break; |
641 | 0 | case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
642 | 0 | *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; |
643 | 0 | *errmsg = _("assertion expected after (?("); |
644 | 0 | break; |
645 | 0 | case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
646 | 0 | *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; |
647 | 0 | *errmsg = _("a numbered reference must not be zero"); |
648 | 0 | break; |
649 | 0 | case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
650 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; |
651 | 0 | *errmsg = _("unknown POSIX class name"); |
652 | 0 | break; |
653 | 0 | case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
654 | 0 | case PCRE2_ERROR_INVALID_HEXADECIMAL: |
655 | 0 | *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; |
656 | 0 | *errmsg = _("character value in \\x{...} sequence is too large"); |
657 | 0 | break; |
658 | 0 | case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
659 | 0 | *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; |
660 | 0 | *errmsg = _("\\C not allowed in lookbehind assertion"); |
661 | 0 | break; |
662 | 0 | case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
663 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; |
664 | 0 | *errmsg = _("missing terminator in subpattern name"); |
665 | 0 | break; |
666 | 0 | case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
667 | 0 | *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; |
668 | 0 | *errmsg = _("two named subpatterns have the same name"); |
669 | 0 | break; |
670 | 0 | case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
671 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; |
672 | 0 | *errmsg = _("malformed \\P or \\p sequence"); |
673 | 0 | break; |
674 | 0 | case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
675 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; |
676 | 0 | *errmsg = _("unknown property name after \\P or \\p"); |
677 | 0 | break; |
678 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
679 | 0 | *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; |
680 | 0 | *errmsg = _("subpattern name is too long (maximum 32 characters)"); |
681 | 0 | break; |
682 | 0 | case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
683 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; |
684 | 0 | *errmsg = _("too many named subpatterns (maximum 10,000)"); |
685 | 0 | break; |
686 | 0 | case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
687 | 0 | *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; |
688 | 0 | *errmsg = _("octal value is greater than \\377"); |
689 | 0 | break; |
690 | 0 | case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
691 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; |
692 | 0 | *errmsg = _("DEFINE group contains more than one branch"); |
693 | 0 | break; |
694 | 0 | case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: |
695 | 0 | *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; |
696 | 0 | *errmsg = _("inconsistent NEWLINE options"); |
697 | 0 | break; |
698 | 0 | case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
699 | 0 | *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; |
700 | 0 | *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " |
701 | 0 | "number, or by a plain number"); |
702 | 0 | break; |
703 | 0 | case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: |
704 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; |
705 | 0 | *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); |
706 | 0 | break; |
707 | 0 | case PCRE2_ERROR_VERB_UNKNOWN: |
708 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; |
709 | 0 | *errmsg = _("(*VERB) not recognized"); |
710 | 0 | break; |
711 | 0 | case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
712 | 0 | *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; |
713 | 0 | *errmsg = _("number is too big"); |
714 | 0 | break; |
715 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
716 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; |
717 | 0 | *errmsg = _("missing subpattern name after (?&"); |
718 | 0 | break; |
719 | 0 | case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
720 | 0 | *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; |
721 | 0 | *errmsg = _("different names for subpatterns of the same number are not allowed"); |
722 | 0 | break; |
723 | 0 | case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
724 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; |
725 | 0 | *errmsg = _("(*MARK) must have an argument"); |
726 | 0 | break; |
727 | 0 | case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
728 | 0 | *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; |
729 | 0 | *errmsg = _( "\\c must be followed by an ASCII character"); |
730 | 0 | break; |
731 | 0 | case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
732 | 0 | *errcode = G_REGEX_ERROR_MISSING_NAME; |
733 | 0 | *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); |
734 | 0 | break; |
735 | 0 | case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
736 | 0 | *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; |
737 | 0 | *errmsg = _("\\N is not supported in a class"); |
738 | 0 | break; |
739 | 0 | case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
740 | 0 | *errcode = G_REGEX_ERROR_NAME_TOO_LONG; |
741 | 0 | *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); |
742 | 0 | break; |
743 | 0 | case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: |
744 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
745 | 0 | *errmsg = _("code overflow"); |
746 | 0 | break; |
747 | 0 | case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
748 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
749 | 0 | *errmsg = _("unrecognized character after (?P"); |
750 | 0 | break; |
751 | 0 | case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: |
752 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
753 | 0 | *errmsg = _("overran compiling workspace"); |
754 | 0 | break; |
755 | 0 | case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: |
756 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
757 | 0 | *errmsg = _("previously-checked referenced subpattern not found"); |
758 | 0 | break; |
759 | 0 | case PCRE2_ERROR_HEAP_FAILED: |
760 | 0 | case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: |
761 | 0 | case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
762 | 0 | case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
763 | 0 | case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
764 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: |
765 | 0 | case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
766 | 0 | case PCRE2_ERROR_INTERNAL_STUDY_ERROR: |
767 | 0 | case PCRE2_ERROR_UTF_IS_DISABLED: |
768 | 0 | case PCRE2_ERROR_UCP_IS_DISABLED: |
769 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: |
770 | 0 | case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
771 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE: |
772 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: |
773 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
774 | 0 | break; |
775 | 0 | case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
776 | 0 | case PCRE2_ERROR_CLASS_INVALID_RANGE: |
777 | 0 | case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
778 | 0 | case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
779 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
780 | 0 | case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
781 | 0 | case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
782 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
783 | 0 | case PCRE2_ERROR_NULL_PATTERN: |
784 | 0 | case PCRE2_ERROR_BAD_OPTIONS: |
785 | 0 | case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
786 | 0 | case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
787 | 0 | case PCRE2_ERROR_INVALID_OCTAL: |
788 | 0 | case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
789 | 0 | case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
790 | 0 | case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
791 | 0 | case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
792 | 0 | case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
793 | 0 | case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
794 | 0 | case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
795 | 0 | case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
796 | 0 | case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
797 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
798 | 0 | case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
799 | 0 | case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
800 | 0 | default: |
801 | 0 | *errcode = G_REGEX_ERROR_COMPILE; |
802 | 0 | break; |
803 | 0 | } |
804 | | |
805 | 0 | g_assert (*errcode != -1); |
806 | 0 | } |
807 | | |
808 | | /* GMatchInfo */ |
809 | | |
810 | | static GMatchInfo * |
811 | | match_info_new (const GRegex *regex, |
812 | | const gchar *string, |
813 | | gint string_len, |
814 | | gint start_position, |
815 | | GRegexMatchFlags match_options, |
816 | | gboolean is_dfa) |
817 | 3.19k | { |
818 | 3.19k | GMatchInfo *match_info; |
819 | | |
820 | 3.19k | if (string_len < 0) |
821 | 3.19k | string_len = strlen (string); |
822 | | |
823 | 3.19k | match_info = g_new0 (GMatchInfo, 1); |
824 | 3.19k | match_info->ref_count = 1; |
825 | 3.19k | match_info->regex = g_regex_ref ((GRegex *)regex); |
826 | 3.19k | match_info->string = string; |
827 | 3.19k | match_info->string_len = string_len; |
828 | 3.19k | match_info->matches = PCRE2_ERROR_NOMATCH; |
829 | 3.19k | match_info->pos = start_position; |
830 | 3.19k | match_info->match_opts = |
831 | 3.19k | get_pcre2_match_options (match_options, regex->orig_compile_opts); |
832 | | |
833 | 3.19k | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, |
834 | 3.19k | &match_info->n_subpatterns); |
835 | | |
836 | 3.19k | match_info->match_context = pcre2_match_context_create (NULL); |
837 | | |
838 | 3.19k | if (is_dfa) |
839 | 0 | { |
840 | | /* These values should be enough for most cases, if they are not |
841 | | * enough g_regex_match_all_full() will expand them. */ |
842 | 0 | match_info->n_workspace = 100; |
843 | 0 | match_info->workspace = g_new (gint, match_info->n_workspace); |
844 | 0 | } |
845 | | |
846 | 3.19k | match_info->n_offsets = 2; |
847 | 3.19k | match_info->offsets = g_new0 (gint, match_info->n_offsets); |
848 | | /* Set an invalid position for the previous match. */ |
849 | 3.19k | match_info->offsets[0] = -1; |
850 | 3.19k | match_info->offsets[1] = -1; |
851 | | |
852 | 3.19k | match_info->match_data = pcre2_match_data_create_from_pattern ( |
853 | 3.19k | match_info->regex->pcre_re, |
854 | 3.19k | NULL); |
855 | | |
856 | 3.19k | return match_info; |
857 | 3.19k | } |
858 | | |
859 | | static gboolean |
860 | | recalc_match_offsets (GMatchInfo *match_info, |
861 | | GError **error) |
862 | 194 | { |
863 | 194 | PCRE2_SIZE *ovector; |
864 | 194 | uint32_t ovector_size = 0; |
865 | 194 | uint32_t pre_n_offset; |
866 | 194 | uint32_t i; |
867 | | |
868 | 194 | g_assert (!IS_PCRE2_ERROR (match_info->matches)); |
869 | | |
870 | 194 | if (match_info->matches == PCRE2_ERROR_PARTIAL) |
871 | 0 | ovector_size = 1; |
872 | 194 | else if (match_info->matches > 0) |
873 | 194 | ovector_size = match_info->matches; |
874 | | |
875 | 194 | g_assert (ovector_size != 0); |
876 | | |
877 | 194 | if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) |
878 | 0 | { |
879 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
880 | 0 | _("Error while matching regular expression %s: %s"), |
881 | 0 | match_info->regex->pattern, _("code overflow")); |
882 | 0 | return FALSE; |
883 | 0 | } |
884 | | |
885 | 194 | pre_n_offset = match_info->n_offsets; |
886 | 194 | match_info->n_offsets = ovector_size * 2; |
887 | 194 | ovector = pcre2_get_ovector_pointer (match_info->match_data); |
888 | | |
889 | 194 | if (match_info->n_offsets != pre_n_offset) |
890 | 194 | { |
891 | 194 | match_info->offsets = g_realloc_n (match_info->offsets, |
892 | 194 | match_info->n_offsets, |
893 | 194 | sizeof (gint)); |
894 | 194 | } |
895 | | |
896 | 970 | for (i = 0; i < match_info->n_offsets; i++) |
897 | 776 | { |
898 | 776 | match_info->offsets[i] = (int) ovector[i]; |
899 | 776 | } |
900 | | |
901 | 194 | return TRUE; |
902 | 194 | } |
903 | | |
904 | | static JITStatus |
905 | | enable_jit_with_match_options (GMatchInfo *match_info, |
906 | | uint32_t match_options) |
907 | 3.19k | { |
908 | 3.19k | gint retval; |
909 | 3.19k | uint32_t old_jit_options, new_jit_options; |
910 | | |
911 | 3.19k | if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE)) |
912 | 3.19k | return JIT_STATUS_DISABLED; |
913 | | |
914 | 0 | if (match_info->regex->jit_status == JIT_STATUS_DISABLED) |
915 | 0 | return JIT_STATUS_DISABLED; |
916 | | |
917 | 0 | if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) |
918 | 0 | return JIT_STATUS_DISABLED; |
919 | | |
920 | 0 | old_jit_options = match_info->regex->jit_options; |
921 | 0 | new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; |
922 | 0 | if (match_options & PCRE2_PARTIAL_HARD) |
923 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_HARD; |
924 | 0 | if (match_options & PCRE2_PARTIAL_SOFT) |
925 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; |
926 | | |
927 | | /* no new options enabled */ |
928 | 0 | if (new_jit_options == old_jit_options) |
929 | 0 | { |
930 | 0 | g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT); |
931 | 0 | return match_info->regex->jit_status; |
932 | 0 | } |
933 | | |
934 | 0 | retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options); |
935 | 0 | if (retval == 0) |
936 | 0 | { |
937 | 0 | match_info->regex->jit_status = JIT_STATUS_ENABLED; |
938 | |
|
939 | 0 | match_info->regex->jit_options = new_jit_options; |
940 | | /* Set min stack size for JIT to 32KiB and max to 512KiB */ |
941 | 0 | match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL); |
942 | 0 | pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack); |
943 | 0 | } |
944 | 0 | else |
945 | 0 | { |
946 | 0 | match_info->regex->jit_status = JIT_STATUS_DISABLED; |
947 | |
|
948 | 0 | switch (retval) |
949 | 0 | { |
950 | 0 | case PCRE2_ERROR_NOMEMORY: |
951 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
952 | 0 | "but JIT was unable to allocate executable memory for the " |
953 | 0 | "compiler. Falling back to interpretive code."); |
954 | 0 | break; |
955 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
956 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
957 | 0 | "but JIT support is not available. Falling back to " |
958 | 0 | "interpretive code."); |
959 | 0 | break; |
960 | 0 | default: |
961 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
962 | 0 | "but request for JIT support had unexpectedly failed (error %d). " |
963 | 0 | "Falling back to interpretive code.", |
964 | 0 | retval); |
965 | 0 | break; |
966 | 0 | } |
967 | 0 | } |
968 | | |
969 | 0 | return match_info->regex->jit_status; |
970 | | |
971 | 0 | g_assert_not_reached (); |
972 | 0 | } |
973 | | |
974 | | /** |
975 | | * g_match_info_get_regex: |
976 | | * @match_info: a #GMatchInfo |
977 | | * |
978 | | * Returns #GRegex object used in @match_info. It belongs to Glib |
979 | | * and must not be freed. Use g_regex_ref() if you need to keep it |
980 | | * after you free @match_info object. |
981 | | * |
982 | | * Returns: (transfer none): #GRegex object used in @match_info |
983 | | * |
984 | | * Since: 2.14 |
985 | | */ |
986 | | GRegex * |
987 | | g_match_info_get_regex (const GMatchInfo *match_info) |
988 | 0 | { |
989 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
990 | 0 | return match_info->regex; |
991 | 0 | } |
992 | | |
993 | | /** |
994 | | * g_match_info_get_string: |
995 | | * @match_info: a #GMatchInfo |
996 | | * |
997 | | * Returns the string searched with @match_info. This is the |
998 | | * string passed to g_regex_match() or g_regex_replace() so |
999 | | * you may not free it before calling this function. |
1000 | | * |
1001 | | * Returns: the string searched with @match_info |
1002 | | * |
1003 | | * Since: 2.14 |
1004 | | */ |
1005 | | const gchar * |
1006 | | g_match_info_get_string (const GMatchInfo *match_info) |
1007 | 0 | { |
1008 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1009 | 0 | return match_info->string; |
1010 | 0 | } |
1011 | | |
1012 | | /** |
1013 | | * g_match_info_ref: |
1014 | | * @match_info: a #GMatchInfo |
1015 | | * |
1016 | | * Increases reference count of @match_info by 1. |
1017 | | * |
1018 | | * Returns: @match_info |
1019 | | * |
1020 | | * Since: 2.30 |
1021 | | */ |
1022 | | GMatchInfo * |
1023 | | g_match_info_ref (GMatchInfo *match_info) |
1024 | 0 | { |
1025 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1026 | 0 | g_atomic_int_inc (&match_info->ref_count); |
1027 | 0 | return match_info; |
1028 | 0 | } |
1029 | | |
1030 | | /** |
1031 | | * g_match_info_unref: |
1032 | | * @match_info: a #GMatchInfo |
1033 | | * |
1034 | | * Decreases reference count of @match_info by 1. When reference count drops |
1035 | | * to zero, it frees all the memory associated with the match_info structure. |
1036 | | * |
1037 | | * Since: 2.30 |
1038 | | */ |
1039 | | void |
1040 | | g_match_info_unref (GMatchInfo *match_info) |
1041 | 3.19k | { |
1042 | 3.19k | if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
1043 | 3.19k | { |
1044 | 3.19k | g_regex_unref (match_info->regex); |
1045 | 3.19k | if (match_info->match_context) |
1046 | 3.19k | pcre2_match_context_free (match_info->match_context); |
1047 | 3.19k | if (match_info->jit_stack) |
1048 | 0 | pcre2_jit_stack_free (match_info->jit_stack); |
1049 | 3.19k | if (match_info->match_data) |
1050 | 3.19k | pcre2_match_data_free (match_info->match_data); |
1051 | 3.19k | g_free (match_info->offsets); |
1052 | 3.19k | g_free (match_info->workspace); |
1053 | 3.19k | g_free (match_info); |
1054 | 3.19k | } |
1055 | 3.19k | } |
1056 | | |
1057 | | /** |
1058 | | * g_match_info_free: |
1059 | | * @match_info: (nullable): a #GMatchInfo, or %NULL |
1060 | | * |
1061 | | * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
1062 | | * nothing. |
1063 | | * |
1064 | | * Since: 2.14 |
1065 | | */ |
1066 | | void |
1067 | | g_match_info_free (GMatchInfo *match_info) |
1068 | 0 | { |
1069 | 0 | if (match_info == NULL) |
1070 | 0 | return; |
1071 | | |
1072 | 0 | g_match_info_unref (match_info); |
1073 | 0 | } |
1074 | | |
/**
 * g_match_info_next:
 * @match_info: a #GMatchInfo structure
 * @error: location to store the error occurring, or %NULL to ignore errors
 *
 * Scans for the next match using the same parameters of the previous
 * call to g_regex_match_full() or g_regex_match() that returned
 * @match_info.
 *
 * The match is done on the string passed to the match function, so you
 * cannot free it before calling this function.
 *
 * Once the end of the string has been passed or no further match is
 * found, the internal position is set to -1; calling this function
 * again after that fails its precondition check and returns %FALSE.
 *
 * Returns: %TRUE if the string matched, %FALSE otherwise
 *
 * Since: 2.14
 */
gboolean
g_match_info_next (GMatchInfo  *match_info,
                   GError     **error)
{
  JITStatus jit_status;
  gint prev_match_start;
  gint prev_match_end;
  uint32_t opts;

  g_return_val_if_fail (match_info != NULL, FALSE);
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
  g_return_val_if_fail (match_info->pos >= 0, FALSE);

  /* Remember the bounds of the previous match so that an identical
   * (duplicate) match can be detected and skipped at the end. */
  prev_match_start = match_info->offsets[0];
  prev_match_end = match_info->offsets[1];

  if (match_info->pos > match_info->string_len)
    {
      /* we have reached the end of the string */
      match_info->pos = -1;
      match_info->matches = PCRE2_ERROR_NOMATCH;
      return FALSE;
    }

  /* Combine the options stored on the regex with those of this match. */
  opts = match_info->regex->match_opts | match_info->match_opts;

  jit_status = enable_jit_with_match_options (match_info, opts);
  if (jit_status == JIT_STATUS_ENABLED)
    {
      match_info->matches = pcre2_jit_match (match_info->regex->pcre_re,
                                             (PCRE2_SPTR8) match_info->string,
                                             match_info->string_len,
                                             match_info->pos,
                                             opts,
                                             match_info->match_data,
                                             match_info->match_context);
      /* if the JIT stack limit was reached, fall back to non-JIT matching in
       * the next conditional statement */
      if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT)
        {
          g_debug ("PCRE2 JIT stack limit reached, falling back to "
                   "non-optimized matching.");
          opts |= PCRE2_NO_JIT;
          jit_status = JIT_STATUS_DISABLED;
        }
    }

  /* Taken both when JIT was never enabled and when the JIT attempt above
   * ran out of stack. */
  if (jit_status != JIT_STATUS_ENABLED)
    {
      match_info->matches = pcre2_match (match_info->regex->pcre_re,
                                         (PCRE2_SPTR8) match_info->string,
                                         match_info->string_len,
                                         match_info->pos,
                                         opts,
                                         match_info->match_data,
                                         match_info->match_context);
    }

  /* NOTE(review): IS_PCRE2_ERROR is defined outside this chunk; the
   * branches below rely on it being false for PCRE2_ERROR_NOMATCH and
   * PCRE2_ERROR_PARTIAL -- confirm against its definition. */
  if (IS_PCRE2_ERROR (match_info->matches))
    {
      gchar *error_msg = get_match_error_message (match_info->matches);

      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
                   _("Error while matching regular expression %s: %s"),
                   match_info->regex->pattern, error_msg);
      g_clear_pointer (&error_msg, g_free);
      return FALSE;
    }
  else if (match_info->matches == 0)
    {
      /* info->offsets is too small. */
      match_info->n_offsets *= 2;
      match_info->offsets = g_realloc_n (match_info->offsets,
                                         match_info->n_offsets,
                                         sizeof (gint));

      /* Replace the match data with a larger one and retry the same
       * search; match_info->pos has not been modified yet. */
      pcre2_match_data_free (match_info->match_data);
      match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL);

      return g_match_info_next (match_info, error);
    }
  else if (match_info->matches == PCRE2_ERROR_NOMATCH)
    {
      /* We're done with this match info */
      match_info->pos = -1;
      return FALSE;
    }
  else
    /* Any remaining result: copy the offsets out of the match data. */
    if (!recalc_match_offsets (match_info, error))
      return FALSE;

  /* avoid infinite loops if the pattern is an empty string or something
   * equivalent */
  if (match_info->pos == match_info->offsets[1])
    {
      if (match_info->pos > match_info->string_len)
        {
          /* we have reached the end of the string */
          match_info->pos = -1;
          match_info->matches = PCRE2_ERROR_NOMATCH;
          return FALSE;
        }

      /* The match ended where the search started: step over one
       * character so the next search begins further on. */
      match_info->pos = NEXT_CHAR (match_info->regex,
                                   &match_info->string[match_info->pos]) -
                                   match_info->string;
    }
  else
    {
      /* Continue the next search right after the end of this match. */
      match_info->pos = match_info->offsets[1];
    }

  g_assert (match_info->matches < 0 ||
            (uint32_t) match_info->matches <= match_info->n_subpatterns + 1);

  /* it's possible to get two identical matches when we are matching
   * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
   * the string is "RegExTest" we have:
   * - search at position 0: match from 0 to 0
   * - search at position 1: match from 3 to 3
   * - search at position 3: match from 3 to 3 (duplicate)
   * - search at position 4: match from 5 to 5
   * - search at position 5: match from 5 to 5 (duplicate)
   * - search at position 6: no match -> stop
   * so we have to ignore the duplicates.
   * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
  if (match_info->matches >= 0 &&
      prev_match_start == match_info->offsets[0] &&
      prev_match_end == match_info->offsets[1])
    {
      /* ignore this match and search the next one */
      return g_match_info_next (match_info, error);
    }

  /* A negative value here (a partial match, see the branch above) is
   * reported as "no match". */
  return match_info->matches >= 0;
}
1227 | | |
1228 | | /** |
1229 | | * g_match_info_matches: |
1230 | | * @match_info: a #GMatchInfo structure |
1231 | | * |
1232 | | * Returns whether the previous match operation succeeded. |
1233 | | * |
1234 | | * Returns: %TRUE if the previous match operation succeeded, |
1235 | | * %FALSE otherwise |
1236 | | * |
1237 | | * Since: 2.14 |
1238 | | */ |
1239 | | gboolean |
1240 | | g_match_info_matches (const GMatchInfo *match_info) |
1241 | 0 | { |
1242 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1243 | | |
1244 | 0 | return match_info->matches >= 0; |
1245 | 0 | } |
1246 | | |
1247 | | /** |
1248 | | * g_match_info_get_match_count: |
1249 | | * @match_info: a #GMatchInfo structure |
1250 | | * |
1251 | | * Retrieves the number of matched substrings (including substring 0, |
1252 | | * that is the whole matched text), so 1 is returned if the pattern |
1253 | | * has no substrings in it and 0 is returned if the match failed. |
1254 | | * |
1255 | | * If the last match was obtained using the DFA algorithm, that is |
1256 | | * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1257 | | * count is not that of the number of capturing parentheses but that of |
1258 | | * the number of matched substrings. |
1259 | | * |
1260 | | * Returns: Number of matched substrings, or -1 if an error occurred |
1261 | | * |
1262 | | * Since: 2.14 |
1263 | | */ |
1264 | | gint |
1265 | | g_match_info_get_match_count (const GMatchInfo *match_info) |
1266 | 0 | { |
1267 | 0 | g_return_val_if_fail (match_info, -1); |
1268 | | |
1269 | 0 | if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1270 | | /* no match */ |
1271 | 0 | return 0; |
1272 | 0 | else if (match_info->matches < PCRE2_ERROR_NOMATCH) |
1273 | | /* error */ |
1274 | 0 | return -1; |
1275 | 0 | else |
1276 | | /* match */ |
1277 | 0 | return match_info->matches; |
1278 | 0 | } |
1279 | | |
1280 | | /** |
1281 | | * g_match_info_is_partial_match: |
1282 | | * @match_info: a #GMatchInfo structure |
1283 | | * |
1284 | | * Usually if the string passed to g_regex_match*() matches as far as |
1285 | | * it goes, but is too short to match the entire pattern, %FALSE is |
1286 | | * returned. There are circumstances where it might be helpful to |
1287 | | * distinguish this case from other cases in which there is no match. |
1288 | | * |
1289 | | * Consider, for example, an application where a human is required to |
1290 | | * type in data for a field with specific formatting requirements. An |
1291 | | * example might be a date in the form ddmmmyy, defined by the pattern |
1292 | | * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
1293 | | * If the application sees the user’s keystrokes one by one, and can |
1294 | | * check that what has been typed so far is potentially valid, it is |
1295 | | * able to raise an error as soon as a mistake is made. |
1296 | | * |
1297 | | * GRegex supports the concept of partial matching by means of the |
1298 | | * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags. |
1299 | | * When they are used, the return code for |
1300 | | * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
1301 | | * for a complete match, %FALSE otherwise. But, when these functions |
1302 | | * return %FALSE, you can check if the match was partial calling |
1303 | | * g_match_info_is_partial_match(). |
1304 | | * |
1305 | | * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and |
1306 | | * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
1307 | | * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
1308 | | * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching |
1309 | | * stops at the partial match. |
1310 | | * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD |
1311 | | * are set, the latter takes precedence. |
1312 | | * |
1313 | | * There were formerly some restrictions on the pattern for partial matching. |
1314 | | * The restrictions no longer apply. |
1315 | | * |
1316 | | * See pcrepartial(3) for more information on partial matching. |
1317 | | * |
1318 | | * Returns: %TRUE if the match was partial, %FALSE otherwise |
1319 | | * |
1320 | | * Since: 2.14 |
1321 | | */ |
1322 | | gboolean |
1323 | | g_match_info_is_partial_match (const GMatchInfo *match_info) |
1324 | 0 | { |
1325 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1326 | | |
1327 | 0 | return match_info->matches == PCRE2_ERROR_PARTIAL; |
1328 | 0 | } |
1329 | | |
1330 | | /** |
1331 | | * g_match_info_expand_references: |
1332 | | * @match_info: (nullable): a #GMatchInfo or %NULL |
1333 | | * @string_to_expand: the string to expand |
1334 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1335 | | * |
1336 | | * Returns a new string containing the text in @string_to_expand with |
1337 | | * references and escape sequences expanded. References refer to the last |
1338 | | * match done with @string against @regex and have the same syntax used by |
1339 | | * g_regex_replace(). |
1340 | | * |
1341 | | * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was |
1342 | | * passed to g_regex_new(). |
1343 | | * |
1344 | | * The backreferences are extracted from the string passed to the match |
1345 | | * function, so you cannot call this function after freeing the string. |
1346 | | * |
1347 | | * @match_info may be %NULL in which case @string_to_expand must not |
1348 | | * contain references. For instance "foo\n" does not refer to an actual |
1349 | | * pattern and '\n' merely will be replaced with \n character, |
1350 | | * while to expand "\0" (whole match) one needs the result of a match. |
1351 | | * Use g_regex_check_replacement() to find out whether @string_to_expand |
1352 | | * contains references. |
1353 | | * |
1354 | | * Returns: (nullable): the expanded string, or %NULL if an error occurred |
1355 | | * |
1356 | | * Since: 2.14 |
1357 | | */ |
1358 | | gchar * |
1359 | | g_match_info_expand_references (const GMatchInfo *match_info, |
1360 | | const gchar *string_to_expand, |
1361 | | GError **error) |
1362 | 0 | { |
1363 | 0 | GString *result; |
1364 | 0 | GList *list; |
1365 | 0 | GError *tmp_error = NULL; |
1366 | |
|
1367 | 0 | g_return_val_if_fail (string_to_expand != NULL, NULL); |
1368 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1369 | | |
1370 | 0 | list = split_replacement (string_to_expand, &tmp_error); |
1371 | 0 | if (tmp_error != NULL) |
1372 | 0 | { |
1373 | 0 | g_propagate_error (error, tmp_error); |
1374 | 0 | return NULL; |
1375 | 0 | } |
1376 | | |
1377 | 0 | if (!match_info && interpolation_list_needs_match (list)) |
1378 | 0 | { |
1379 | 0 | g_critical ("String '%s' contains references to the match, can't " |
1380 | 0 | "expand references without GMatchInfo object", |
1381 | 0 | string_to_expand); |
1382 | 0 | return NULL; |
1383 | 0 | } |
1384 | | |
1385 | 0 | result = g_string_sized_new (strlen (string_to_expand)); |
1386 | 0 | interpolate_replacement (match_info, result, list); |
1387 | |
|
1388 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
1389 | |
|
1390 | 0 | return g_string_free (result, FALSE); |
1391 | 0 | } |
1392 | | |
1393 | | /** |
1394 | | * g_match_info_fetch: |
1395 | | * @match_info: #GMatchInfo structure |
1396 | | * @match_num: number of the sub expression |
1397 | | * |
1398 | | * Retrieves the text matching the @match_num'th capturing |
1399 | | * parentheses. 0 is the full text of the match, 1 is the first paren |
1400 | | * set, 2 the second, and so on. |
1401 | | * |
1402 | | * If @match_num is a valid sub pattern but it didn't match anything |
1403 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
1404 | | * string is returned. |
1405 | | * |
1406 | | * If the match was obtained using the DFA algorithm, that is using |
1407 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1408 | | * string is not that of a set of parentheses but that of a matched |
1409 | | * substring. Substrings are matched in reverse order of length, so |
1410 | | * 0 is the longest match. |
1411 | | * |
1412 | | * The string is fetched from the string passed to the match function, |
1413 | | * so you cannot call this function after freeing the string. |
1414 | | * |
1415 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1416 | | * occurred. You have to free the string yourself |
1417 | | * |
1418 | | * Since: 2.14 |
1419 | | */ |
1420 | | gchar * |
1421 | | g_match_info_fetch (const GMatchInfo *match_info, |
1422 | | gint match_num) |
1423 | 194 | { |
1424 | 194 | gchar *match = NULL; |
1425 | 194 | gint start, end; |
1426 | | |
1427 | 194 | g_return_val_if_fail (match_info != NULL, NULL); |
1428 | 194 | g_return_val_if_fail (match_num >= 0, NULL); |
1429 | | |
1430 | | /* match_num does not exist or it didn't matched, i.e. matching "b" |
1431 | | * against "(a)?b" then group 0 is empty. */ |
1432 | 194 | if (!g_match_info_fetch_pos (match_info, match_num, &start, &end)) |
1433 | 0 | match = NULL; |
1434 | 194 | else if (start == -1) |
1435 | 0 | match = g_strdup (""); |
1436 | 194 | else |
1437 | 194 | match = g_strndup (&match_info->string[start], end - start); |
1438 | | |
1439 | 194 | return match; |
1440 | 194 | } |
1441 | | |
1442 | | /** |
1443 | | * g_match_info_fetch_pos: |
1444 | | * @match_info: #GMatchInfo structure |
1445 | | * @match_num: number of the sub expression |
1446 | | * @start_pos: (out) (optional): pointer to location where to store |
1447 | | * the start position, or %NULL |
1448 | | * @end_pos: (out) (optional): pointer to location where to store |
1449 | | * the end position, or %NULL |
1450 | | * |
1451 | | * Retrieves the position in bytes of the @match_num'th capturing |
1452 | | * parentheses. 0 is the full text of the match, 1 is the first |
1453 | | * paren set, 2 the second, and so on. |
1454 | | * |
1455 | | * If @match_num is a valid sub pattern but it didn't match anything |
1456 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos |
1457 | | * and @end_pos are set to -1 and %TRUE is returned. |
1458 | | * |
1459 | | * If the match was obtained using the DFA algorithm, that is using |
1460 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1461 | | * position is not that of a set of parentheses but that of a matched |
1462 | | * substring. Substrings are matched in reverse order of length, so |
1463 | | * 0 is the longest match. |
1464 | | * |
1465 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. If |
1466 | | * the position cannot be fetched, @start_pos and @end_pos are left |
1467 | | * unchanged |
1468 | | * |
1469 | | * Since: 2.14 |
1470 | | */ |
1471 | | gboolean |
1472 | | g_match_info_fetch_pos (const GMatchInfo *match_info, |
1473 | | gint match_num, |
1474 | | gint *start_pos, |
1475 | | gint *end_pos) |
1476 | 194 | { |
1477 | 194 | g_return_val_if_fail (match_info != NULL, FALSE); |
1478 | 194 | g_return_val_if_fail (match_num >= 0, FALSE); |
1479 | | |
1480 | | /* check whether there was an error */ |
1481 | 194 | if (match_info->matches < 0) |
1482 | 0 | return FALSE; |
1483 | | |
1484 | | /* make sure the sub expression number they're requesting is less than |
1485 | | * the total number of sub expressions in the regex. When matching all |
1486 | | * (g_regex_match_all()), also compare against the number of matches */ |
1487 | 194 | if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) |
1488 | 0 | return FALSE; |
1489 | | |
1490 | 194 | if (start_pos != NULL) |
1491 | 194 | *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; |
1492 | | |
1493 | 194 | if (end_pos != NULL) |
1494 | 194 | *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; |
1495 | | |
1496 | 194 | return TRUE; |
1497 | 194 | } |
1498 | | |
1499 | | /* |
1500 | | * Returns number of first matched subpattern with name @name. |
1501 | | * There may be more than one in case when DUPNAMES is used, |
1502 | | * and not all subpatterns with that name match; |
1503 | | * pcre2_substring_number_from_name() does not work in that case. |
1504 | | */ |
1505 | | static gint |
1506 | | get_matched_substring_number (const GMatchInfo *match_info, |
1507 | | const gchar *name) |
1508 | 0 | { |
1509 | 0 | gint entrysize; |
1510 | 0 | PCRE2_SPTR first, last; |
1511 | 0 | guchar *entry; |
1512 | |
|
1513 | 0 | if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) |
1514 | 0 | return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); |
1515 | | |
1516 | | /* This code is analogous to code from pcre2_substring.c: |
1517 | | * pcre2_substring_get_byname() */ |
1518 | 0 | entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, |
1519 | 0 | (PCRE2_SPTR8) name, |
1520 | 0 | &first, |
1521 | 0 | &last); |
1522 | |
|
1523 | 0 | if (entrysize <= 0) |
1524 | 0 | return entrysize; |
1525 | | |
1526 | 0 | for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
1527 | 0 | { |
1528 | 0 | guint n = (entry[0] << 8) + entry[1]; |
1529 | 0 | if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0) |
1530 | 0 | return n; |
1531 | 0 | } |
1532 | | |
1533 | 0 | return (first[0] << 8) + first[1]; |
1534 | 0 | } |
1535 | | |
1536 | | /** |
1537 | | * g_match_info_fetch_named: |
1538 | | * @match_info: #GMatchInfo structure |
1539 | | * @name: name of the subexpression |
1540 | | * |
1541 | | * Retrieves the text matching the capturing parentheses named @name. |
1542 | | * |
1543 | | * If @name is a valid sub pattern name but it didn't match anything |
1544 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1545 | | * then an empty string is returned. |
1546 | | * |
1547 | | * The string is fetched from the string passed to the match function, |
1548 | | * so you cannot call this function after freeing the string. |
1549 | | * |
1550 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1551 | | * occurred. You have to free the string yourself |
1552 | | * |
1553 | | * Since: 2.14 |
1554 | | */ |
1555 | | gchar * |
1556 | | g_match_info_fetch_named (const GMatchInfo *match_info, |
1557 | | const gchar *name) |
1558 | 0 | { |
1559 | 0 | gint num; |
1560 | |
|
1561 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1562 | 0 | g_return_val_if_fail (name != NULL, NULL); |
1563 | | |
1564 | 0 | num = get_matched_substring_number (match_info, name); |
1565 | 0 | if (num < 0) |
1566 | 0 | return NULL; |
1567 | 0 | else |
1568 | 0 | return g_match_info_fetch (match_info, num); |
1569 | 0 | } |
1570 | | |
1571 | | /** |
1572 | | * g_match_info_fetch_named_pos: |
1573 | | * @match_info: #GMatchInfo structure |
1574 | | * @name: name of the subexpression |
1575 | | * @start_pos: (out) (optional): pointer to location where to store |
1576 | | * the start position, or %NULL |
1577 | | * @end_pos: (out) (optional): pointer to location where to store |
1578 | | * the end position, or %NULL |
1579 | | * |
1580 | | * Retrieves the position in bytes of the capturing parentheses named @name. |
1581 | | * |
1582 | | * If @name is a valid sub pattern name but it didn't match anything |
1583 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1584 | | * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
1585 | | * |
1586 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
1587 | | * If the position cannot be fetched, @start_pos and @end_pos |
1588 | | * are left unchanged. |
1589 | | * |
1590 | | * Since: 2.14 |
1591 | | */ |
1592 | | gboolean |
1593 | | g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
1594 | | const gchar *name, |
1595 | | gint *start_pos, |
1596 | | gint *end_pos) |
1597 | 0 | { |
1598 | 0 | gint num; |
1599 | |
|
1600 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1601 | 0 | g_return_val_if_fail (name != NULL, FALSE); |
1602 | | |
1603 | 0 | num = get_matched_substring_number (match_info, name); |
1604 | 0 | if (num < 0) |
1605 | 0 | return FALSE; |
1606 | | |
1607 | 0 | return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); |
1608 | 0 | } |
1609 | | |
1610 | | /** |
1611 | | * g_match_info_fetch_all: |
1612 | | * @match_info: a #GMatchInfo structure |
1613 | | * |
1614 | | * Bundles up pointers to each of the matching substrings from a match |
1615 | | * and stores them in an array of gchar pointers. The first element in |
1616 | | * the returned array is the match number 0, i.e. the entire matched |
1617 | | * text. |
1618 | | * |
1619 | | * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
1620 | | * "b" against "(a)?b") then an empty string is inserted. |
1621 | | * |
1622 | | * If the last match was obtained using the DFA algorithm, that is using |
1623 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1624 | | * strings are not that matched by sets of parentheses but that of the |
1625 | | * matched substring. Substrings are matched in reverse order of length, |
1626 | | * so the first one is the longest match. |
1627 | | * |
1628 | | * The strings are fetched from the string passed to the match function, |
1629 | | * so you cannot call this function after freeing the string. |
1630 | | * |
1631 | | * Returns: (transfer full): a %NULL-terminated array of gchar * |
1632 | | * pointers. It must be freed using g_strfreev(). If the previous |
1633 | | * match failed %NULL is returned |
1634 | | * |
1635 | | * Since: 2.14 |
1636 | | */ |
1637 | | gchar ** |
1638 | | g_match_info_fetch_all (const GMatchInfo *match_info) |
1639 | 0 | { |
1640 | 0 | gchar **result; |
1641 | 0 | gint i; |
1642 | |
|
1643 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1644 | | |
1645 | 0 | if (match_info->matches < 0) |
1646 | 0 | return NULL; |
1647 | | |
1648 | 0 | result = g_new (gchar *, match_info->matches + 1); |
1649 | 0 | for (i = 0; i < match_info->matches; i++) |
1650 | 0 | result[i] = g_match_info_fetch (match_info, i); |
1651 | 0 | result[i] = NULL; |
1652 | |
|
1653 | 0 | return result; |
1654 | 0 | } |
1655 | | |
1656 | | |
1657 | | /* GRegex */ |
1658 | | |
/* Error domain used by the GRegex code in this file (see the G_REGEX_ERROR
 * uses below). NOTE(review): presumably expands to g_regex_error_quark()
 * for the "g-regex-error-quark" string -- confirm against G_DEFINE_QUARK. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1660 | | |
1661 | | /** |
1662 | | * g_regex_ref: |
1663 | | * @regex: a #GRegex |
1664 | | * |
1665 | | * Increases reference count of @regex by 1. |
1666 | | * |
1667 | | * Returns: @regex |
1668 | | * |
1669 | | * Since: 2.14 |
1670 | | */ |
1671 | | GRegex * |
1672 | | g_regex_ref (GRegex *regex) |
1673 | 3.19k | { |
1674 | 3.19k | g_return_val_if_fail (regex != NULL, NULL); |
1675 | 3.19k | g_atomic_int_inc (®ex->ref_count); |
1676 | 3.19k | return regex; |
1677 | 3.19k | } |
1678 | | |
1679 | | /** |
1680 | | * g_regex_unref: |
1681 | | * @regex: a #GRegex |
1682 | | * |
1683 | | * Decreases reference count of @regex by 1. When reference count drops |
1684 | | * to zero, it frees all the memory associated with the regex structure. |
1685 | | * |
1686 | | * Since: 2.14 |
1687 | | */ |
1688 | | void |
1689 | | g_regex_unref (GRegex *regex) |
1690 | 3.19k | { |
1691 | 3.19k | g_return_if_fail (regex != NULL); |
1692 | | |
1693 | 3.19k | if (g_atomic_int_dec_and_test (®ex->ref_count)) |
1694 | 0 | { |
1695 | 0 | g_free (regex->pattern); |
1696 | 0 | if (regex->pcre_re != NULL) |
1697 | 0 | pcre2_code_free (regex->pcre_re); |
1698 | 0 | g_free (regex); |
1699 | 0 | } |
1700 | 3.19k | } |
1701 | | |
/* Forward declaration: regex_compile() is defined after g_regex_new(),
 * which calls it to build the pcre2_code for a pattern. */
static pcre2_code * regex_compile (const gchar *pattern,
                                   uint32_t     compile_options,
                                   uint32_t     newline_options,
                                   uint32_t     bsr_options,
                                   GError     **error);

/* Forward declaration: defined after regex_compile(); g_regex_new() ORs its
 * result into the compile options it stores in the GRegex. */
static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
                                                  uint32_t    compile_options);
1710 | | |
1711 | | /** |
1712 | | * g_regex_new: |
1713 | | * @pattern: the regular expression |
1714 | | * @compile_options: compile options for the regular expression, or 0 |
1715 | | * @match_options: match options for the regular expression, or 0 |
1716 | | * @error: return location for a #GError |
1717 | | * |
1718 | | * Compiles the regular expression to an internal form, and does |
1719 | | * the initial setup of the #GRegex structure. |
1720 | | * |
1721 | | * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
1722 | | * g_regex_unref() when you are done with it |
1723 | | * |
1724 | | * Since: 2.14 |
1725 | | */ |
1726 | | GRegex * |
1727 | | g_regex_new (const gchar *pattern, |
1728 | | GRegexCompileFlags compile_options, |
1729 | | GRegexMatchFlags match_options, |
1730 | | GError **error) |
1731 | 1 | { |
1732 | 1 | GRegex *regex; |
1733 | 1 | pcre2_code *re; |
1734 | 1 | static gsize initialised = 0; |
1735 | 1 | uint32_t pcre_compile_options; |
1736 | 1 | uint32_t pcre_match_options; |
1737 | 1 | uint32_t newline_options; |
1738 | 1 | uint32_t bsr_options; |
1739 | | |
1740 | 1 | g_return_val_if_fail (pattern != NULL, NULL); |
1741 | 1 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1742 | 1 | G_GNUC_BEGIN_IGNORE_DEPRECATIONS |
1743 | 1 | g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | |
1744 | 1 | G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); |
1745 | 1 | G_GNUC_END_IGNORE_DEPRECATIONS |
1746 | 1 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
1747 | | |
1748 | 1 | if (g_once_init_enter (&initialised)) |
1749 | 1 | { |
1750 | 1 | int supports_utf8; |
1751 | | |
1752 | 1 | pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); |
1753 | 1 | if (!supports_utf8) |
1754 | 0 | g_critical (_("PCRE library is compiled without UTF8 support")); |
1755 | | |
1756 | 1 | g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); |
1757 | 1 | } |
1758 | | |
1759 | 1 | if (G_UNLIKELY (initialised != 1)) |
1760 | 0 | { |
1761 | 0 | g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, |
1762 | 0 | _("PCRE library is compiled with incompatible options")); |
1763 | 0 | return NULL; |
1764 | 0 | } |
1765 | | |
1766 | 1 | pcre_compile_options = get_pcre2_compile_options (compile_options); |
1767 | 1 | pcre_match_options = get_pcre2_match_options (match_options, compile_options); |
1768 | | |
1769 | 1 | newline_options = get_pcre2_newline_match_options (match_options); |
1770 | 1 | if (newline_options == 0) |
1771 | 1 | newline_options = get_pcre2_newline_compile_options (compile_options); |
1772 | | |
1773 | 1 | if (newline_options == 0) |
1774 | 0 | { |
1775 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1776 | 0 | "Invalid newline flags"); |
1777 | 0 | return NULL; |
1778 | 0 | } |
1779 | | |
1780 | 1 | bsr_options = get_pcre2_bsr_match_options (match_options); |
1781 | 1 | if (!bsr_options) |
1782 | 1 | bsr_options = get_pcre2_bsr_compile_options (compile_options); |
1783 | | |
1784 | 1 | re = regex_compile (pattern, pcre_compile_options, |
1785 | 1 | newline_options, bsr_options, error); |
1786 | 1 | if (re == NULL) |
1787 | 0 | return NULL; |
1788 | | |
1789 | 1 | pcre_compile_options |= |
1790 | 1 | get_pcre2_inline_compile_options (re, pcre_compile_options); |
1791 | | |
1792 | 1 | regex = g_new0 (GRegex, 1); |
1793 | 1 | regex->ref_count = 1; |
1794 | 1 | regex->pattern = g_strdup (pattern); |
1795 | 1 | regex->pcre_re = re; |
1796 | 1 | regex->compile_opts = pcre_compile_options; |
1797 | 1 | regex->orig_compile_opts = compile_options; |
1798 | 1 | regex->match_opts = pcre_match_options; |
1799 | 1 | regex->orig_match_opts = match_options; |
1800 | | |
1801 | 1 | return regex; |
1802 | 1 | } |
1803 | | |
1804 | | static pcre2_code * |
1805 | | regex_compile (const gchar *pattern, |
1806 | | uint32_t compile_options, |
1807 | | uint32_t newline_options, |
1808 | | uint32_t bsr_options, |
1809 | | GError **error) |
1810 | 1 | { |
1811 | 1 | pcre2_code *re; |
1812 | 1 | pcre2_compile_context *context; |
1813 | 1 | const gchar *errmsg; |
1814 | 1 | PCRE2_SIZE erroffset; |
1815 | 1 | gint errcode; |
1816 | | |
1817 | 1 | context = pcre2_compile_context_create (NULL); |
1818 | | |
1819 | | /* set newline options */ |
1820 | 1 | if (pcre2_set_newline (context, newline_options) != 0) |
1821 | 0 | { |
1822 | 0 | g_set_error (error, G_REGEX_ERROR, |
1823 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1824 | 0 | "Invalid newline flags"); |
1825 | 0 | pcre2_compile_context_free (context); |
1826 | 0 | return NULL; |
1827 | 0 | } |
1828 | | |
1829 | | /* set bsr options */ |
1830 | 1 | if (pcre2_set_bsr (context, bsr_options) != 0) |
1831 | 0 | { |
1832 | 0 | g_set_error (error, G_REGEX_ERROR, |
1833 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1834 | 0 | "Invalid BSR flags"); |
1835 | 0 | pcre2_compile_context_free (context); |
1836 | 0 | return NULL; |
1837 | 0 | } |
1838 | | |
1839 | | /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ |
1840 | 1 | if (compile_options & PCRE2_UTF) |
1841 | 1 | compile_options |= PCRE2_NO_UTF_CHECK; |
1842 | | |
1843 | 1 | compile_options |= PCRE2_UCP; |
1844 | | |
1845 | | /* compile the pattern */ |
1846 | 1 | re = pcre2_compile ((PCRE2_SPTR8) pattern, |
1847 | 1 | PCRE2_ZERO_TERMINATED, |
1848 | 1 | compile_options, |
1849 | 1 | &errcode, |
1850 | 1 | &erroffset, |
1851 | 1 | context); |
1852 | 1 | pcre2_compile_context_free (context); |
1853 | | |
1854 | | /* if the compilation failed, set the error member and return |
1855 | | * immediately */ |
1856 | 1 | if (re == NULL) |
1857 | 0 | { |
1858 | 0 | GError *tmp_error; |
1859 | 0 | gchar *offset_str; |
1860 | 0 | gchar *pcre2_errmsg = NULL; |
1861 | 0 | int original_errcode; |
1862 | | |
1863 | | /* Translate the PCRE error code to GRegexError and use a translated |
1864 | | * error message if possible */ |
1865 | 0 | original_errcode = errcode; |
1866 | 0 | translate_compile_error (&errcode, &errmsg); |
1867 | |
|
1868 | 0 | if (!errmsg) |
1869 | 0 | { |
1870 | 0 | errmsg = _("unknown error"); |
1871 | 0 | pcre2_errmsg = get_pcre2_error_string (original_errcode); |
1872 | 0 | } |
1873 | | |
1874 | | /* PCRE uses byte offsets but we want to show character offsets */ |
1875 | 0 | erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); |
1876 | |
|
1877 | 0 | offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); |
1878 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, errcode, |
1879 | 0 | _("Error while compiling regular expression ‘%s’ " |
1880 | 0 | "at char %s: %s"), |
1881 | 0 | pattern, offset_str, |
1882 | 0 | pcre2_errmsg ? pcre2_errmsg : errmsg); |
1883 | 0 | g_propagate_error (error, tmp_error); |
1884 | 0 | g_free (offset_str); |
1885 | 0 | g_clear_pointer (&pcre2_errmsg, g_free); |
1886 | |
|
1887 | 0 | return NULL; |
1888 | 0 | } |
1889 | | |
1890 | 1 | return re; |
1891 | 1 | } |
1892 | | |
1893 | | static uint32_t |
1894 | | get_pcre2_inline_compile_options (pcre2_code *re, |
1895 | | uint32_t compile_options) |
1896 | 1 | { |
1897 | 1 | uint32_t pcre_compile_options; |
1898 | 1 | uint32_t nonpcre_compile_options; |
1899 | | |
1900 | | /* For options set at the beginning of the pattern, pcre puts them into |
1901 | | * compile options, e.g. "(?i)foo" will make the pcre structure store |
1902 | | * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ |
1903 | 1 | nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
1904 | 1 | pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); |
1905 | 1 | compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; |
1906 | 1 | compile_options |= nonpcre_compile_options; |
1907 | | |
1908 | 1 | if (!(compile_options & PCRE2_DUPNAMES)) |
1909 | 1 | { |
1910 | 1 | uint32_t jchanged = 0; |
1911 | 1 | pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); |
1912 | 1 | if (jchanged) |
1913 | 0 | compile_options |= PCRE2_DUPNAMES; |
1914 | 1 | } |
1915 | | |
1916 | 1 | return compile_options; |
1917 | 1 | } |
1918 | | |
1919 | | /** |
1920 | | * g_regex_get_pattern: |
1921 | | * @regex: a #GRegex structure |
1922 | | * |
1923 | | * Gets the pattern string associated with @regex, i.e. a copy of |
1924 | | * the string passed to g_regex_new(). |
1925 | | * |
1926 | | * Returns: the pattern of @regex |
1927 | | * |
1928 | | * Since: 2.14 |
1929 | | */ |
1930 | | const gchar * |
1931 | | g_regex_get_pattern (const GRegex *regex) |
1932 | 0 | { |
1933 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1934 | | |
1935 | 0 | return regex->pattern; |
1936 | 0 | } |
1937 | | |
1938 | | /** |
1939 | | * g_regex_get_max_backref: |
1940 | | * @regex: a #GRegex |
1941 | | * |
1942 | | * Returns the number of the highest back reference |
1943 | | * in the pattern, or 0 if the pattern does not contain |
1944 | | * back references. |
1945 | | * |
1946 | | * Returns: the number of the highest back reference |
1947 | | * |
1948 | | * Since: 2.14 |
1949 | | */ |
1950 | | gint |
1951 | | g_regex_get_max_backref (const GRegex *regex) |
1952 | 0 | { |
1953 | 0 | uint32_t value; |
1954 | |
|
1955 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); |
1956 | |
|
1957 | 0 | return value; |
1958 | 0 | } |
1959 | | |
1960 | | /** |
1961 | | * g_regex_get_capture_count: |
1962 | | * @regex: a #GRegex |
1963 | | * |
1964 | | * Returns the number of capturing subpatterns in the pattern. |
1965 | | * |
1966 | | * Returns: the number of capturing subpatterns |
1967 | | * |
1968 | | * Since: 2.14 |
1969 | | */ |
1970 | | gint |
1971 | | g_regex_get_capture_count (const GRegex *regex) |
1972 | 0 | { |
1973 | 0 | uint32_t value; |
1974 | |
|
1975 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); |
1976 | |
|
1977 | 0 | return value; |
1978 | 0 | } |
1979 | | |
1980 | | /** |
1981 | | * g_regex_get_has_cr_or_lf: |
1982 | | * @regex: a #GRegex structure |
1983 | | * |
1984 | | * Checks whether the pattern contains explicit CR or LF references. |
1985 | | * |
1986 | | * Returns: %TRUE if the pattern contains explicit CR or LF references |
1987 | | * |
1988 | | * Since: 2.34 |
1989 | | */ |
1990 | | gboolean |
1991 | | g_regex_get_has_cr_or_lf (const GRegex *regex) |
1992 | 0 | { |
1993 | 0 | uint32_t value; |
1994 | |
|
1995 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); |
1996 | |
|
1997 | 0 | return !!value; |
1998 | 0 | } |
1999 | | |
2000 | | /** |
2001 | | * g_regex_get_max_lookbehind: |
2002 | | * @regex: a #GRegex structure |
2003 | | * |
2004 | | * Gets the number of characters in the longest lookbehind assertion in the |
2005 | | * pattern. This information is useful when doing multi-segment matching using |
2006 | | * the partial matching facilities. |
2007 | | * |
2008 | | * Returns: the number of characters in the longest lookbehind assertion. |
2009 | | * |
2010 | | * Since: 2.38 |
2011 | | */ |
2012 | | gint |
2013 | | g_regex_get_max_lookbehind (const GRegex *regex) |
2014 | 0 | { |
2015 | 0 | uint32_t max_lookbehind; |
2016 | |
|
2017 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, |
2018 | 0 | &max_lookbehind); |
2019 | |
|
2020 | 0 | return max_lookbehind; |
2021 | 0 | } |
2022 | | |
2023 | | /** |
2024 | | * g_regex_get_compile_flags: |
2025 | | * @regex: a #GRegex |
2026 | | * |
2027 | | * Returns the compile options that @regex was created with. |
2028 | | * |
2029 | | * Depending on the version of PCRE that is used, this may or may not |
2030 | | * include flags set by option expressions such as `(?i)` found at the |
2031 | | * top-level within the compiled pattern. |
2032 | | * |
2033 | | * Returns: flags from #GRegexCompileFlags |
2034 | | * |
2035 | | * Since: 2.26 |
2036 | | */ |
2037 | | GRegexCompileFlags |
2038 | | g_regex_get_compile_flags (const GRegex *regex) |
2039 | 0 | { |
2040 | 0 | GRegexCompileFlags extra_flags; |
2041 | 0 | uint32_t info_value; |
2042 | |
|
2043 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2044 | | |
2045 | | /* Preserve original G_REGEX_OPTIMIZE */ |
2046 | 0 | extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); |
2047 | | |
2048 | | /* Also include the newline options */ |
2049 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); |
2050 | 0 | switch (info_value) |
2051 | 0 | { |
2052 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
2053 | 0 | extra_flags |= G_REGEX_NEWLINE_ANYCRLF; |
2054 | 0 | break; |
2055 | 0 | case PCRE2_NEWLINE_CRLF: |
2056 | 0 | extra_flags |= G_REGEX_NEWLINE_CRLF; |
2057 | 0 | break; |
2058 | 0 | case PCRE2_NEWLINE_LF: |
2059 | 0 | extra_flags |= G_REGEX_NEWLINE_LF; |
2060 | 0 | break; |
2061 | 0 | case PCRE2_NEWLINE_CR: |
2062 | 0 | extra_flags |= G_REGEX_NEWLINE_CR; |
2063 | 0 | break; |
2064 | 0 | default: |
2065 | 0 | break; |
2066 | 0 | } |
2067 | | |
2068 | | /* Also include the bsr options */ |
2069 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); |
2070 | 0 | switch (info_value) |
2071 | 0 | { |
2072 | 0 | case PCRE2_BSR_ANYCRLF: |
2073 | 0 | extra_flags |= G_REGEX_BSR_ANYCRLF; |
2074 | 0 | break; |
2075 | 0 | default: |
2076 | 0 | break; |
2077 | 0 | } |
2078 | | |
2079 | 0 | return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; |
2080 | 0 | } |
2081 | | |
2082 | | /** |
2083 | | * g_regex_get_match_flags: |
2084 | | * @regex: a #GRegex |
2085 | | * |
2086 | | * Returns the match options that @regex was created with. |
2087 | | * |
2088 | | * Returns: flags from #GRegexMatchFlags |
2089 | | * |
2090 | | * Since: 2.26 |
2091 | | */ |
2092 | | GRegexMatchFlags |
2093 | | g_regex_get_match_flags (const GRegex *regex) |
2094 | 0 | { |
2095 | 0 | uint32_t flags; |
2096 | |
|
2097 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2098 | | |
2099 | 0 | flags = g_regex_match_flags_from_pcre2 (regex->match_opts); |
2100 | 0 | flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); |
2101 | 0 | flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); |
2102 | |
|
2103 | 0 | return flags; |
2104 | 0 | } |
2105 | | |
2106 | | /** |
2107 | | * g_regex_match_simple: |
2108 | | * @pattern: the regular expression |
2109 | | * @string: the string to scan for matches |
2110 | | * @compile_options: compile options for the regular expression, or 0 |
2111 | | * @match_options: match options, or 0 |
2112 | | * |
2113 | | * Scans for a match in @string for @pattern. |
2114 | | * |
2115 | | * This function is equivalent to g_regex_match() but it does not |
2116 | | * require to compile the pattern with g_regex_new(), avoiding some |
2117 | | * lines of code when you need just to do a match without extracting |
2118 | | * substrings, capture counts, and so on. |
2119 | | * |
2120 | | * If this function is to be called on the same @pattern more than |
2121 | | * once, it's more efficient to compile the pattern once with |
2122 | | * g_regex_new() and then use g_regex_match(). |
2123 | | * |
2124 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2125 | | * |
2126 | | * Since: 2.14 |
2127 | | */ |
2128 | | gboolean |
2129 | | g_regex_match_simple (const gchar *pattern, |
2130 | | const gchar *string, |
2131 | | GRegexCompileFlags compile_options, |
2132 | | GRegexMatchFlags match_options) |
2133 | 0 | { |
2134 | 0 | GRegex *regex; |
2135 | 0 | gboolean result; |
2136 | |
|
2137 | 0 | regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); |
2138 | 0 | if (!regex) |
2139 | 0 | return FALSE; |
2140 | 0 | result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); |
2141 | 0 | g_regex_unref (regex); |
2142 | 0 | return result; |
2143 | 0 | } |
2144 | | |
2145 | | /** |
2146 | | * g_regex_match: |
2147 | | * @regex: a #GRegex structure from g_regex_new() |
2148 | | * @string: the string to scan for matches |
2149 | | * @match_options: match options |
2150 | | * @match_info: (out) (optional): pointer to location where to store |
2151 | | * the #GMatchInfo, or %NULL if you do not need it |
2152 | | * |
2153 | | * Scans for a match in @string for the pattern in @regex. |
2154 | | * The @match_options are combined with the match options specified |
2155 | | * when the @regex structure was created, letting you have more |
2156 | | * flexibility in reusing #GRegex structures. |
2157 | | * |
2158 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2159 | | * |
2160 | | * A #GMatchInfo structure, used to get information on the match, |
2161 | | * is stored in @match_info if not %NULL. Note that if @match_info |
2162 | | * is not %NULL then it is created even if the function returns %FALSE, |
2163 | | * i.e. you must free it regardless if regular expression actually matched. |
2164 | | * |
2165 | | * To retrieve all the non-overlapping matches of the pattern in |
2166 | | * string you can use g_match_info_next(). |
2167 | | * |
2168 | | * |[<!-- language="C" --> |
2169 | | * static void |
2170 | | * print_uppercase_words (const gchar *string) |
2171 | | * { |
2172 | | * // Print all uppercase-only words. |
2173 | | * GRegex *regex; |
2174 | | * GMatchInfo *match_info; |
2175 | | * |
2176 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2177 | | * g_regex_match (regex, string, 0, &match_info); |
2178 | | * while (g_match_info_matches (match_info)) |
2179 | | * { |
2180 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2181 | | * g_print ("Found: %s\n", word); |
2182 | | * g_free (word); |
2183 | | * g_match_info_next (match_info, NULL); |
2184 | | * } |
2185 | | * g_match_info_free (match_info); |
2186 | | * g_regex_unref (regex); |
2187 | | * } |
2188 | | * ]| |
2189 | | * |
2190 | | * @string is not copied and is used in #GMatchInfo internally. If |
2191 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2192 | | * freeing or modifying @string then the behaviour is undefined. |
2193 | | * |
2194 | | * Returns: %TRUE if the string matched, %FALSE otherwise
2195 | | * |
2196 | | * Since: 2.14 |
2197 | | */ |
2198 | | gboolean |
2199 | | g_regex_match (const GRegex *regex, |
2200 | | const gchar *string, |
2201 | | GRegexMatchFlags match_options, |
2202 | | GMatchInfo **match_info) |
2203 | 3.19k | { |
2204 | 3.19k | return g_regex_match_full (regex, string, -1, 0, match_options, |
2205 | 3.19k | match_info, NULL); |
2206 | 3.19k | } |
2207 | | |
2208 | | /** |
2209 | | * g_regex_match_full: |
2210 | | * @regex: a #GRegex structure from g_regex_new() |
2211 | | * @string: (array length=string_len): the string to scan for matches |
2212 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2213 | | * @start_position: starting index of the string to match, in bytes |
2214 | | * @match_options: match options |
2215 | | * @match_info: (out) (optional): pointer to location where to store |
2216 | | * the #GMatchInfo, or %NULL if you do not need it |
2217 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2218 | | * |
2219 | | * Scans for a match in @string for the pattern in @regex. |
2220 | | * The @match_options are combined with the match options specified |
2221 | | * when the @regex structure was created, letting you have more |
2222 | | * flexibility in reusing #GRegex structures. |
2223 | | * |
2224 | | * Setting @start_position differs from just passing over a shortened |
2225 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2226 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2227 | | * |
2228 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2229 | | * |
2230 | | * A #GMatchInfo structure, used to get information on the match, is |
2231 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2232 | | * not %NULL then it is created even if the function returns %FALSE, |
2233 | | * i.e. you must free it regardless if regular expression actually |
2234 | | * matched. |
2235 | | * |
2236 | | * @string is not copied and is used in #GMatchInfo internally. If |
2237 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2238 | | * freeing or modifying @string then the behaviour is undefined. |
2239 | | * |
2240 | | * To retrieve all the non-overlapping matches of the pattern in |
2241 | | * string you can use g_match_info_next(). |
2242 | | * |
2243 | | * |[<!-- language="C" --> |
2244 | | * static void |
2245 | | * print_uppercase_words (const gchar *string) |
2246 | | * { |
2247 | | * // Print all uppercase-only words. |
2248 | | * GRegex *regex; |
2249 | | * GMatchInfo *match_info; |
2250 | | * GError *error = NULL; |
2251 | | * |
2252 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2253 | | * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
2254 | | * while (g_match_info_matches (match_info)) |
2255 | | * { |
2256 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2257 | | * g_print ("Found: %s\n", word); |
2258 | | * g_free (word); |
2259 | | * g_match_info_next (match_info, &error); |
2260 | | * } |
2261 | | * g_match_info_free (match_info); |
2262 | | * g_regex_unref (regex); |
2263 | | * if (error != NULL) |
2264 | | * { |
2265 | | * g_printerr ("Error while matching: %s\n", error->message); |
2266 | | * g_error_free (error); |
2267 | | * } |
2268 | | * } |
2269 | | * ]| |
2270 | | * |
2271 | | * Returns: %TRUE if the string matched, %FALSE otherwise
2272 | | * |
2273 | | * Since: 2.14 |
2274 | | */ |
2275 | | gboolean |
2276 | | g_regex_match_full (const GRegex *regex, |
2277 | | const gchar *string, |
2278 | | gssize string_len, |
2279 | | gint start_position, |
2280 | | GRegexMatchFlags match_options, |
2281 | | GMatchInfo **match_info, |
2282 | | GError **error) |
2283 | 3.19k | { |
2284 | 3.19k | GMatchInfo *info; |
2285 | 3.19k | gboolean match_ok; |
2286 | | |
2287 | 3.19k | g_return_val_if_fail (regex != NULL, FALSE); |
2288 | 3.19k | g_return_val_if_fail (string != NULL, FALSE); |
2289 | 3.19k | g_return_val_if_fail (start_position >= 0, FALSE); |
2290 | 3.19k | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2291 | 3.19k | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2292 | | |
2293 | 3.19k | info = match_info_new (regex, string, string_len, start_position, |
2294 | 3.19k | match_options, FALSE); |
2295 | 3.19k | match_ok = g_match_info_next (info, error); |
2296 | 3.19k | if (match_info != NULL) |
2297 | 3.19k | *match_info = info; |
2298 | 0 | else |
2299 | 0 | g_match_info_free (info); |
2300 | | |
2301 | 3.19k | return match_ok; |
2302 | 3.19k | } |
2303 | | |
2304 | | /** |
2305 | | * g_regex_match_all: |
2306 | | * @regex: a #GRegex structure from g_regex_new() |
2307 | | * @string: the string to scan for matches |
2308 | | * @match_options: match options |
2309 | | * @match_info: (out) (optional): pointer to location where to store |
2310 | | * the #GMatchInfo, or %NULL if you do not need it |
2311 | | * |
2312 | | * Using the standard algorithm for regular expression matching only |
2313 | | * the longest match in the string is retrieved. This function uses |
2314 | | * a different algorithm so it can retrieve all the possible matches. |
2315 | | * For more documentation see g_regex_match_all_full(). |
2316 | | * |
2317 | | * A #GMatchInfo structure, used to get information on the match, is |
2318 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2319 | | * not %NULL then it is created even if the function returns %FALSE, |
2320 | | * i.e. you must free it regardless if regular expression actually |
2321 | | * matched. |
2322 | | * |
2323 | | * @string is not copied and is used in #GMatchInfo internally. If |
2324 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2325 | | * freeing or modifying @string then the behaviour is undefined. |
2326 | | * |
2327 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2328 | | * |
2329 | | * Since: 2.14 |
2330 | | */ |
2331 | | gboolean |
2332 | | g_regex_match_all (const GRegex *regex, |
2333 | | const gchar *string, |
2334 | | GRegexMatchFlags match_options, |
2335 | | GMatchInfo **match_info) |
2336 | 0 | { |
2337 | 0 | return g_regex_match_all_full (regex, string, -1, 0, match_options, |
2338 | 0 | match_info, NULL); |
2339 | 0 | } |
2340 | | |
2341 | | /** |
2342 | | * g_regex_match_all_full: |
2343 | | * @regex: a #GRegex structure from g_regex_new() |
2344 | | * @string: (array length=string_len): the string to scan for matches |
2345 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2346 | | * @start_position: starting index of the string to match, in bytes |
2347 | | * @match_options: match options |
2348 | | * @match_info: (out) (optional): pointer to location where to store |
2349 | | * the #GMatchInfo, or %NULL if you do not need it |
2350 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2351 | | * |
2352 | | * Using the standard algorithm for regular expression matching only |
2353 | | * the longest match in the @string is retrieved, it is not possible |
2354 | | * to obtain all the available matches. For instance matching |
2355 | | * "<a> <b> <c>" against the pattern "<.*>" |
2356 | | * you get "<a> <b> <c>". |
2357 | | * |
2358 | | * This function uses a different algorithm (called DFA, i.e. deterministic |
2359 | | * finite automaton), so it can retrieve all the possible matches, all |
2360 | | * starting at the same point in the string. For instance matching |
2361 | | * "<a> <b> <c>" against the pattern "<.*>" |
2362 | | * you would obtain three matches: "<a> <b> <c>", |
2363 | | * "<a> <b>" and "<a>". |
2364 | | * |
2365 | | * The number of matched strings is retrieved using |
2366 | | * g_match_info_get_match_count(). To obtain the matched strings and |
2367 | | * their position you can use, respectively, g_match_info_fetch() and |
2368 | | * g_match_info_fetch_pos(). Note that the strings are returned in |
2369 | | * reverse order of length; that is, the longest matching string is |
2370 | | * given first. |
2371 | | * |
2372 | | * Note that the DFA algorithm is slower than the standard one and it |
2373 | | * is not able to capture substrings, so backreferences do not work. |
2374 | | * |
2375 | | * Setting @start_position differs from just passing over a shortened |
2376 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2377 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2378 | | * |
2379 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2380 | | * |
2381 | | * A #GMatchInfo structure, used to get information on the match, is |
2382 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2383 | | * not %NULL then it is created even if the function returns %FALSE, |
2384 | | * i.e. you must free it regardless if regular expression actually |
2385 | | * matched. |
2386 | | * |
2387 | | * @string is not copied and is used in #GMatchInfo internally. If |
2388 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2389 | | * freeing or modifying @string then the behaviour is undefined. |
2390 | | * |
2391 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2392 | | * |
2393 | | * Since: 2.14 |
2394 | | */ |
2395 | | gboolean |
2396 | | g_regex_match_all_full (const GRegex *regex, |
2397 | | const gchar *string, |
2398 | | gssize string_len, |
2399 | | gint start_position, |
2400 | | GRegexMatchFlags match_options, |
2401 | | GMatchInfo **match_info, |
2402 | | GError **error) |
2403 | 0 | { |
2404 | 0 | GMatchInfo *info; |
2405 | 0 | gboolean done; |
2406 | 0 | pcre2_code *pcre_re; |
2407 | 0 | gboolean retval; |
2408 | 0 | uint32_t newline_options; |
2409 | 0 | uint32_t bsr_options; |
2410 | |
|
2411 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2412 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2413 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2414 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2415 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2416 | | |
2417 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
2418 | 0 | if (!newline_options) |
2419 | 0 | newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); |
2420 | |
|
2421 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
2422 | 0 | if (!bsr_options) |
2423 | 0 | bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); |
2424 | | |
2425 | | /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an |
2426 | | * optimization for normal regex matching, but results in omitting some |
2427 | | * shorter matches here, and an observable behaviour change. |
2428 | | * |
2429 | | * DFA matching is rather niche, and very rarely used according to |
2430 | | * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
2431 | 0 | pcre_re = regex_compile (regex->pattern, |
2432 | 0 | regex->compile_opts | PCRE2_NO_AUTO_POSSESS, |
2433 | 0 | newline_options, bsr_options, error); |
2434 | 0 | if (pcre_re == NULL) |
2435 | 0 | return FALSE; |
2436 | | |
2437 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2438 | 0 | match_options, TRUE); |
2439 | |
|
2440 | 0 | done = FALSE; |
2441 | 0 | while (!done) |
2442 | 0 | { |
2443 | 0 | done = TRUE; |
2444 | 0 | info->matches = pcre2_dfa_match (pcre_re, |
2445 | 0 | (PCRE2_SPTR8) info->string, info->string_len, |
2446 | 0 | info->pos, |
2447 | 0 | (regex->match_opts | info->match_opts), |
2448 | 0 | info->match_data, |
2449 | 0 | info->match_context, |
2450 | 0 | info->workspace, info->n_workspace); |
2451 | 0 | if (info->matches == PCRE2_ERROR_DFA_WSSIZE) |
2452 | 0 | { |
2453 | | /* info->workspace is too small. */ |
2454 | 0 | info->n_workspace *= 2; |
2455 | 0 | info->workspace = g_realloc_n (info->workspace, |
2456 | 0 | info->n_workspace, |
2457 | 0 | sizeof (gint)); |
2458 | 0 | done = FALSE; |
2459 | 0 | } |
2460 | 0 | else if (info->matches == 0) |
2461 | 0 | { |
2462 | | /* info->offsets is too small. */ |
2463 | 0 | info->n_offsets *= 2; |
2464 | 0 | info->offsets = g_realloc_n (info->offsets, |
2465 | 0 | info->n_offsets, |
2466 | 0 | sizeof (gint)); |
2467 | 0 | pcre2_match_data_free (info->match_data); |
2468 | 0 | info->match_data = pcre2_match_data_create (info->n_offsets, NULL); |
2469 | 0 | done = FALSE; |
2470 | 0 | } |
2471 | 0 | else if (IS_PCRE2_ERROR (info->matches)) |
2472 | 0 | { |
2473 | 0 | gchar *error_msg = get_match_error_message (info->matches); |
2474 | |
|
2475 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
2476 | 0 | _("Error while matching regular expression %s: %s"), |
2477 | 0 | regex->pattern, error_msg); |
2478 | 0 | g_clear_pointer (&error_msg, g_free); |
2479 | 0 | } |
2480 | 0 | else if (info->matches != PCRE2_ERROR_NOMATCH) |
2481 | 0 | { |
2482 | 0 | if (!recalc_match_offsets (info, error)) |
2483 | 0 | info->matches = PCRE2_ERROR_NOMATCH; |
2484 | 0 | } |
2485 | 0 | } |
2486 | |
|
2487 | 0 | pcre2_code_free (pcre_re); |
2488 | | |
2489 | | /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only |
2490 | | * holds true for a single match, rather than matching all */ |
2491 | | |
2492 | | /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
2493 | 0 | info->pos = -1; |
2494 | 0 | retval = info->matches >= 0; |
2495 | |
|
2496 | 0 | if (match_info != NULL) |
2497 | 0 | *match_info = info; |
2498 | 0 | else |
2499 | 0 | g_match_info_free (info); |
2500 | |
|
2501 | 0 | return retval; |
2502 | 0 | } |
2503 | | |
2504 | | /** |
2505 | | * g_regex_get_string_number: |
2506 | | * @regex: #GRegex structure |
2507 | | * @name: name of the subexpression |
2508 | | * |
2509 | | * Retrieves the number of the subexpression named @name. |
2510 | | * |
2511 | | * Returns: The number of the subexpression or -1 if @name |
2512 | | * does not exist |
2513 | | * |
2514 | | * Since: 2.14 |
2515 | | */ |
2516 | | gint |
2517 | | g_regex_get_string_number (const GRegex *regex, |
2518 | | const gchar *name) |
2519 | 0 | { |
2520 | 0 | gint num; |
2521 | |
|
2522 | 0 | g_return_val_if_fail (regex != NULL, -1); |
2523 | 0 | g_return_val_if_fail (name != NULL, -1); |
2524 | | |
2525 | 0 | num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); |
2526 | 0 | if (num == PCRE2_ERROR_NOSUBSTRING) |
2527 | 0 | num = -1; |
2528 | |
|
2529 | 0 | return num; |
2530 | 0 | } |
2531 | | |
2532 | | /** |
2533 | | * g_regex_split_simple: |
2534 | | * @pattern: the regular expression |
2535 | | * @string: the string to scan for matches |
2536 | | * @compile_options: compile options for the regular expression, or 0 |
2537 | | * @match_options: match options, or 0 |
2538 | | * |
2539 | | * Breaks the string on the pattern, and returns an array of |
2540 | | * the tokens. If the pattern contains capturing parentheses, |
2541 | | * then the text for each of the substrings will also be returned. |
2542 | | * If the pattern does not match anywhere in the string, then the |
2543 | | * whole string is returned as the first token. |
2544 | | * |
2545 | | * This function is equivalent to g_regex_split() but it does |
2546 | | * not require you to compile the pattern with g_regex_new(), avoiding |
2547 | | * some lines of code when you just need to do a split without |
2548 | | * extracting substrings, capture counts, and so on. |
2549 | | * |
2550 | | * If this function is to be called on the same @pattern more than |
2551 | | * once, it's more efficient to compile the pattern once with |
2552 | | * g_regex_new() and then use g_regex_split(). |
2553 | | * |
2554 | | * As a special case, the result of splitting the empty string "" |
2555 | | * is an empty vector, not a vector containing a single string. |
2556 | | * The reason for this special case is that being able to represent |
2557 | | * an empty vector is typically more useful than consistent handling |
2558 | | * of empty elements. If you do need to represent empty elements, |
2559 | | * you'll need to check for the empty string before calling this |
2560 | | * function. |
2561 | | * |
2562 | | * A pattern that can match empty strings splits @string into |
2563 | | * separate characters wherever it matches the empty string between |
2564 | | * characters. For example splitting "ab c" using as a separator |
2565 | | * "\s*", you will get "a", "b" and "c". |
2566 | | * |
2567 | | * Returns: (transfer full): a %NULL-terminated array of strings. Free |
2568 | | * it using g_strfreev() |
2569 | | * |
2570 | | * Since: 2.14 |
2571 | | **/ |
2572 | | gchar ** |
2573 | | g_regex_split_simple (const gchar *pattern, |
2574 | | const gchar *string, |
2575 | | GRegexCompileFlags compile_options, |
2576 | | GRegexMatchFlags match_options) |
2577 | 0 | { |
2578 | 0 | GRegex *regex; |
2579 | 0 | gchar **result; |
2580 | |
|
2581 | 0 | regex = g_regex_new (pattern, compile_options, 0, NULL); |
2582 | 0 | if (!regex) |
2583 | 0 | return NULL; |
2584 | | |
2585 | 0 | result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); |
2586 | 0 | g_regex_unref (regex); |
2587 | 0 | return result; |
2588 | 0 | } |
2589 | | |
2590 | | /** |
2591 | | * g_regex_split: |
2592 | | * @regex: a #GRegex structure |
2593 | | * @string: the string to split with the pattern |
2594 | | * @match_options: match time option flags |
2595 | | * |
2596 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2597 | | * If the pattern contains capturing parentheses, then the text for each |
2598 | | * of the substrings will also be returned. If the pattern does not match |
2599 | | * anywhere in the string, then the whole string is returned as the first |
2600 | | * token. |
2601 | | * |
2602 | | * As a special case, the result of splitting the empty string "" is an |
2603 | | * empty vector, not a vector containing a single string. The reason for |
2604 | | * this special case is that being able to represent an empty vector is |
2605 | | * typically more useful than consistent handling of empty elements. If |
2606 | | * you do need to represent empty elements, you'll need to check for the |
2607 | | * empty string before calling this function. |
2608 | | * |
2609 | | * A pattern that can match empty strings splits @string into separate |
2610 | | * characters wherever it matches the empty string between characters. |
2611 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2612 | | * "a", "b" and "c". |
2613 | | * |
2614 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2615 | | * it using g_strfreev() |
2616 | | * |
2617 | | * Since: 2.14 |
2618 | | **/ |
2619 | | gchar ** |
2620 | | g_regex_split (const GRegex *regex, |
2621 | | const gchar *string, |
2622 | | GRegexMatchFlags match_options) |
2623 | 0 | { |
2624 | 0 | return g_regex_split_full (regex, string, -1, 0, |
2625 | 0 | match_options, 0, NULL); |
2626 | 0 | } |
2627 | | |
2628 | | /** |
2629 | | * g_regex_split_full: |
2630 | | * @regex: a #GRegex structure |
2631 | | * @string: (array length=string_len): the string to split with the pattern |
2632 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2633 | | * @start_position: starting index of the string to match, in bytes |
2634 | | * @match_options: match time option flags |
2635 | | * @max_tokens: the maximum number of tokens to split @string into. |
2636 | | * If this is less than 1, the string is split completely |
2637 | | * @error: return location for a #GError |
2638 | | * |
2639 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2640 | | * If the pattern contains capturing parentheses, then the text for each |
2641 | | * of the substrings will also be returned. If the pattern does not match |
2642 | | * anywhere in the string, then the whole string is returned as the first |
2643 | | * token. |
2644 | | * |
2645 | | * As a special case, the result of splitting the empty string "" is an |
2646 | | * empty vector, not a vector containing a single string. The reason for |
2647 | | * this special case is that being able to represent an empty vector is |
2648 | | * typically more useful than consistent handling of empty elements. If |
2649 | | * you do need to represent empty elements, you'll need to check for the |
2650 | | * empty string before calling this function. |
2651 | | * |
2652 | | * A pattern that can match empty strings splits @string into separate |
2653 | | * characters wherever it matches the empty string between characters. |
2654 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2655 | | * "a", "b" and "c". |
2656 | | * |
2657 | | * Setting @start_position differs from just passing over a shortened |
2658 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2659 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2660 | | * |
2661 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2662 | | * it using g_strfreev() |
2663 | | * |
2664 | | * Since: 2.14 |
2665 | | **/ |
2666 | | gchar ** |
2667 | | g_regex_split_full (const GRegex *regex, |
2668 | | const gchar *string, |
2669 | | gssize string_len, |
2670 | | gint start_position, |
2671 | | GRegexMatchFlags match_options, |
2672 | | gint max_tokens, |
2673 | | GError **error) |
2674 | 0 | { |
2675 | 0 | GError *tmp_error = NULL; |
2676 | 0 | GMatchInfo *match_info; |
2677 | 0 | GList *list, *last; |
2678 | 0 | gint i; |
2679 | 0 | gint token_count; |
2680 | 0 | gboolean match_ok; |
2681 | | /* position of the last separator. */ |
2682 | 0 | gint last_separator_end; |
2683 | | /* was the last match 0 bytes long? */ |
2684 | 0 | gboolean last_match_is_empty; |
2685 | | /* the returned array of char **s */ |
2686 | 0 | gchar **string_list; |
2687 | |
|
2688 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
2689 | 0 | g_return_val_if_fail (string != NULL, NULL); |
2690 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
2691 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2692 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2693 | | |
2694 | 0 | if (max_tokens <= 0) |
2695 | 0 | max_tokens = G_MAXINT; |
2696 | |
|
2697 | 0 | if (string_len < 0) |
2698 | 0 | string_len = strlen (string); |
2699 | | |
2700 | | /* zero-length string */ |
2701 | 0 | if (string_len - start_position == 0) |
2702 | 0 | return g_new0 (gchar *, 1); |
2703 | | |
2704 | 0 | if (max_tokens == 1) |
2705 | 0 | { |
2706 | 0 | string_list = g_new0 (gchar *, 2); |
2707 | 0 | string_list[0] = g_strndup (&string[start_position], |
2708 | 0 | string_len - start_position); |
2709 | 0 | return string_list; |
2710 | 0 | } |
2711 | | |
2712 | 0 | list = NULL; |
2713 | 0 | token_count = 0; |
2714 | 0 | last_separator_end = start_position; |
2715 | 0 | last_match_is_empty = FALSE; |
2716 | |
|
2717 | 0 | match_ok = g_regex_match_full (regex, string, string_len, start_position, |
2718 | 0 | match_options, &match_info, &tmp_error); |
2719 | |
|
2720 | 0 | while (tmp_error == NULL) |
2721 | 0 | { |
2722 | 0 | if (match_ok) |
2723 | 0 | { |
2724 | 0 | last_match_is_empty = |
2725 | 0 | (match_info->offsets[0] == match_info->offsets[1]); |
2726 | | |
2727 | | /* we need to skip empty separators at the same position of the end |
2728 | | * of another separator. e.g. the string is "a b" and the separator |
2729 | | * is " *", so from 1 to 2 we have a match and at position 2 we have |
2730 | | * an empty match. */ |
2731 | 0 | if (last_separator_end != match_info->offsets[1]) |
2732 | 0 | { |
2733 | 0 | gchar *token; |
2734 | 0 | gint match_count; |
2735 | |
|
2736 | 0 | token = g_strndup (string + last_separator_end, |
2737 | 0 | match_info->offsets[0] - last_separator_end); |
2738 | 0 | list = g_list_prepend (list, token); |
2739 | 0 | token_count++; |
2740 | | |
2741 | | /* if there were substrings, these need to be added to |
2742 | | * the list. */ |
2743 | 0 | match_count = g_match_info_get_match_count (match_info); |
2744 | 0 | if (match_count > 1) |
2745 | 0 | { |
2746 | 0 | for (i = 1; i < match_count; i++) |
2747 | 0 | list = g_list_prepend (list, g_match_info_fetch (match_info, i)); |
2748 | 0 | } |
2749 | 0 | } |
2750 | 0 | } |
2751 | 0 | else |
2752 | 0 | { |
2753 | | /* if there was no match, copy to end of string. */ |
2754 | 0 | if (!last_match_is_empty) |
2755 | 0 | { |
2756 | 0 | gchar *token = g_strndup (string + last_separator_end, |
2757 | 0 | match_info->string_len - last_separator_end); |
2758 | 0 | list = g_list_prepend (list, token); |
2759 | 0 | } |
2760 | | /* no more tokens, end the loop. */ |
2761 | 0 | break; |
2762 | 0 | } |
2763 | | |
2764 | | /* -1 to leave room for the last part. */ |
2765 | 0 | if (token_count >= max_tokens - 1) |
2766 | 0 | { |
2767 | | /* we have reached the maximum number of tokens, so we copy |
2768 | | * the remaining part of the string. */ |
2769 | 0 | if (last_match_is_empty) |
2770 | 0 | { |
2771 | | /* the last match was empty, so we have moved one char |
2772 | | * after the real position to avoid empty matches at the |
2773 | | * same position. */ |
2774 | 0 | match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
2775 | 0 | } |
2776 | | /* the if is needed in the case we have terminated the available |
2777 | | * tokens, but we are at the end of the string, so there are no |
2778 | | * characters left to copy. */ |
2779 | 0 | if (string_len > match_info->pos) |
2780 | 0 | { |
2781 | 0 | gchar *token = g_strndup (string + match_info->pos, |
2782 | 0 | string_len - match_info->pos); |
2783 | 0 | list = g_list_prepend (list, token); |
2784 | 0 | } |
2785 | | /* end the loop. */ |
2786 | 0 | break; |
2787 | 0 | } |
2788 | | |
2789 | 0 | last_separator_end = match_info->pos; |
2790 | 0 | if (last_match_is_empty) |
2791 | | /* if the last match was empty, g_match_info_next() has moved |
2792 | | * forward to avoid infinite loops, but we still need to copy that |
2793 | | * character. */ |
2794 | 0 | last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
2795 | |
|
2796 | 0 | match_ok = g_match_info_next (match_info, &tmp_error); |
2797 | 0 | } |
2798 | 0 | g_match_info_free (match_info); |
2799 | 0 | if (tmp_error != NULL) |
2800 | 0 | { |
2801 | 0 | g_propagate_error (error, tmp_error); |
2802 | 0 | g_list_free_full (list, g_free); |
2803 | 0 | return NULL; |
2804 | 0 | } |
2805 | | |
2806 | 0 | string_list = g_new (gchar *, g_list_length (list) + 1); |
2807 | 0 | i = 0; |
2808 | 0 | for (last = g_list_last (list); last; last = g_list_previous (last)) |
2809 | 0 | string_list[i++] = last->data; |
2810 | 0 | string_list[i] = NULL; |
2811 | 0 | g_list_free (list); |
2812 | |
|
2813 | 0 | return string_list; |
2814 | 0 | } |
2815 | | |
/* Kind of one parsed piece of a replacement string; stored in
 * InterpolationData.type and dispatched on by interpolate_replacement(). */
enum
{
  REPL_TYPE_STRING = 0,             /* literal text, in InterpolationData.text */
  REPL_TYPE_CHARACTER = 1,          /* single escaped character, in .c */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* named group reference, name in .text */
  REPL_TYPE_NUMERIC_REFERENCE = 3,  /* numbered group reference, number in .num */
  REPL_TYPE_CHANGE_CASE = 4         /* case-conversion switch, mode in .change_case */
};
2824 | | |
/* Case-conversion state used while building a replacement.  Each mode is a
 * distinct bit so that the *_MASK values can test for a family of modes. */
typedef enum
{
  /* the five modes, as selected by \E, \U, \L, \u and \l respectively */
  CHANGE_CASE_NONE         = 1 << 0,
  CHANGE_CASE_UPPER        = 1 << 1,
  CHANGE_CASE_LOWER        = 1 << 2,
  CHANGE_CASE_UPPER_SINGLE = 1 << 3,
  CHANGE_CASE_LOWER_SINGLE = 1 << 4,

  /* modes that affect only the next character */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
2836 | | |
2837 | | struct _InterpolationData /* one parsed piece of a replacement string; `type` says which of the other fields is meaningful */ |
2838 | | { |
2839 | | gchar *text; /* heap string freed by free_interpolation_data(): literal text (REPL_TYPE_STRING) or group name (REPL_TYPE_SYMBOLIC_REFERENCE) */ |
2840 | | gint type; /* one of the REPL_TYPE_* values */ |
2841 | | gint num; /* group number, for REPL_TYPE_NUMERIC_REFERENCE */ |
2842 | | gchar c; /* the character, for REPL_TYPE_CHARACTER */ |
2843 | | ChangeCase change_case; /* the mode to switch to, for REPL_TYPE_CHANGE_CASE */ |
2844 | | }; |
2845 | | |
2846 | | static void |
2847 | | free_interpolation_data (InterpolationData *data) |
2848 | 0 | { |
2849 | 0 | g_free (data->text); |
2850 | 0 | g_free (data); |
2851 | 0 | } |
2852 | | |
2853 | | static const gchar * |
2854 | | expand_escape (const gchar *replacement, |
2855 | | const gchar *p, |
2856 | | InterpolationData *data, |
2857 | | GError **error) |
2858 | 0 | { |
2859 | 0 | const gchar *q, *r; |
2860 | 0 | gint x, d, h, i; |
2861 | 0 | const gchar *error_detail; |
2862 | 0 | gint base = 0; |
2863 | 0 | GError *tmp_error = NULL; |
2864 | |
|
2865 | 0 | p++; |
2866 | 0 | switch (*p) |
2867 | 0 | { |
2868 | 0 | case 't': |
2869 | 0 | p++; |
2870 | 0 | data->c = '\t'; |
2871 | 0 | data->type = REPL_TYPE_CHARACTER; |
2872 | 0 | break; |
2873 | 0 | case 'n': |
2874 | 0 | p++; |
2875 | 0 | data->c = '\n'; |
2876 | 0 | data->type = REPL_TYPE_CHARACTER; |
2877 | 0 | break; |
2878 | 0 | case 'v': |
2879 | 0 | p++; |
2880 | 0 | data->c = '\v'; |
2881 | 0 | data->type = REPL_TYPE_CHARACTER; |
2882 | 0 | break; |
2883 | 0 | case 'r': |
2884 | 0 | p++; |
2885 | 0 | data->c = '\r'; |
2886 | 0 | data->type = REPL_TYPE_CHARACTER; |
2887 | 0 | break; |
2888 | 0 | case 'f': |
2889 | 0 | p++; |
2890 | 0 | data->c = '\f'; |
2891 | 0 | data->type = REPL_TYPE_CHARACTER; |
2892 | 0 | break; |
2893 | 0 | case 'a': |
2894 | 0 | p++; |
2895 | 0 | data->c = '\a'; |
2896 | 0 | data->type = REPL_TYPE_CHARACTER; |
2897 | 0 | break; |
2898 | 0 | case 'b': |
2899 | 0 | p++; |
2900 | 0 | data->c = '\b'; |
2901 | 0 | data->type = REPL_TYPE_CHARACTER; |
2902 | 0 | break; |
2903 | 0 | case '\\': |
2904 | 0 | p++; |
2905 | 0 | data->c = '\\'; |
2906 | 0 | data->type = REPL_TYPE_CHARACTER; |
2907 | 0 | break; |
2908 | 0 | case 'x': |
2909 | 0 | p++; |
2910 | 0 | x = 0; |
2911 | 0 | if (*p == '{') |
2912 | 0 | { |
2913 | 0 | p++; |
2914 | 0 | do |
2915 | 0 | { |
2916 | 0 | h = g_ascii_xdigit_value (*p); |
2917 | 0 | if (h < 0) |
2918 | 0 | { |
2919 | 0 | error_detail = _("hexadecimal digit or “}” expected"); |
2920 | 0 | goto error; |
2921 | 0 | } |
2922 | 0 | x = x * 16 + h; |
2923 | 0 | p++; |
2924 | 0 | } |
2925 | 0 | while (*p != '}'); |
2926 | 0 | p++; |
2927 | 0 | } |
2928 | 0 | else |
2929 | 0 | { |
2930 | 0 | for (i = 0; i < 2; i++) |
2931 | 0 | { |
2932 | 0 | h = g_ascii_xdigit_value (*p); |
2933 | 0 | if (h < 0) |
2934 | 0 | { |
2935 | 0 | error_detail = _("hexadecimal digit expected"); |
2936 | 0 | goto error; |
2937 | 0 | } |
2938 | 0 | x = x * 16 + h; |
2939 | 0 | p++; |
2940 | 0 | } |
2941 | 0 | } |
2942 | 0 | data->type = REPL_TYPE_STRING; |
2943 | 0 | data->text = g_new0 (gchar, 8); |
2944 | 0 | g_unichar_to_utf8 (x, data->text); |
2945 | 0 | break; |
2946 | 0 | case 'l': |
2947 | 0 | p++; |
2948 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2949 | 0 | data->change_case = CHANGE_CASE_LOWER_SINGLE; |
2950 | 0 | break; |
2951 | 0 | case 'u': |
2952 | 0 | p++; |
2953 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2954 | 0 | data->change_case = CHANGE_CASE_UPPER_SINGLE; |
2955 | 0 | break; |
2956 | 0 | case 'L': |
2957 | 0 | p++; |
2958 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2959 | 0 | data->change_case = CHANGE_CASE_LOWER; |
2960 | 0 | break; |
2961 | 0 | case 'U': |
2962 | 0 | p++; |
2963 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2964 | 0 | data->change_case = CHANGE_CASE_UPPER; |
2965 | 0 | break; |
2966 | 0 | case 'E': |
2967 | 0 | p++; |
2968 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2969 | 0 | data->change_case = CHANGE_CASE_NONE; |
2970 | 0 | break; |
2971 | 0 | case 'g': |
2972 | 0 | p++; |
2973 | 0 | if (*p != '<') |
2974 | 0 | { |
2975 | 0 | error_detail = _("missing “<” in symbolic reference"); |
2976 | 0 | goto error; |
2977 | 0 | } |
2978 | 0 | q = p + 1; |
2979 | 0 | do |
2980 | 0 | { |
2981 | 0 | p++; |
2982 | 0 | if (!*p) |
2983 | 0 | { |
2984 | 0 | error_detail = _("unfinished symbolic reference"); |
2985 | 0 | goto error; |
2986 | 0 | } |
2987 | 0 | } |
2988 | 0 | while (*p != '>'); |
2989 | 0 | if (p - q == 0) |
2990 | 0 | { |
2991 | 0 | error_detail = _("zero-length symbolic reference"); |
2992 | 0 | goto error; |
2993 | 0 | } |
2994 | 0 | if (g_ascii_isdigit (*q)) |
2995 | 0 | { |
2996 | 0 | x = 0; |
2997 | 0 | do |
2998 | 0 | { |
2999 | 0 | h = g_ascii_digit_value (*q); |
3000 | 0 | if (h < 0) |
3001 | 0 | { |
3002 | 0 | error_detail = _("digit expected"); |
3003 | 0 | p = q; |
3004 | 0 | goto error; |
3005 | 0 | } |
3006 | 0 | x = x * 10 + h; |
3007 | 0 | q++; |
3008 | 0 | } |
3009 | 0 | while (q != p); |
3010 | 0 | data->num = x; |
3011 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3012 | 0 | } |
3013 | 0 | else |
3014 | 0 | { |
3015 | 0 | r = q; |
3016 | 0 | do |
3017 | 0 | { |
3018 | 0 | if (!g_ascii_isalnum (*r)) |
3019 | 0 | { |
3020 | 0 | error_detail = _("illegal symbolic reference"); |
3021 | 0 | p = r; |
3022 | 0 | goto error; |
3023 | 0 | } |
3024 | 0 | r++; |
3025 | 0 | } |
3026 | 0 | while (r != p); |
3027 | 0 | data->text = g_strndup (q, p - q); |
3028 | 0 | data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
3029 | 0 | } |
3030 | 0 | p++; |
3031 | 0 | break; |
3032 | 0 | case '0': |
3033 | | /* if \0 is followed by a number is an octal number representing a |
3034 | | * character, else it is a numeric reference. */ |
3035 | 0 | if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) |
3036 | 0 | { |
3037 | 0 | base = 8; |
3038 | 0 | p = g_utf8_next_char (p); |
3039 | 0 | } |
3040 | 0 | G_GNUC_FALLTHROUGH; |
3041 | 0 | case '1': |
3042 | 0 | case '2': |
3043 | 0 | case '3': |
3044 | 0 | case '4': |
3045 | 0 | case '5': |
3046 | 0 | case '6': |
3047 | 0 | case '7': |
3048 | 0 | case '8': |
3049 | 0 | case '9': |
3050 | 0 | x = 0; |
3051 | 0 | d = 0; |
3052 | 0 | for (i = 0; i < 3; i++) |
3053 | 0 | { |
3054 | 0 | h = g_ascii_digit_value (*p); |
3055 | 0 | if (h < 0) |
3056 | 0 | break; |
3057 | 0 | if (h > 7) |
3058 | 0 | { |
3059 | 0 | if (base == 8) |
3060 | 0 | break; |
3061 | 0 | else |
3062 | 0 | base = 10; |
3063 | 0 | } |
3064 | 0 | if (i == 2 && base == 10) |
3065 | 0 | break; |
3066 | 0 | x = x * 8 + h; |
3067 | 0 | d = d * 10 + h; |
3068 | 0 | p++; |
3069 | 0 | } |
3070 | 0 | if (base == 8 || i == 3) |
3071 | 0 | { |
3072 | 0 | data->type = REPL_TYPE_STRING; |
3073 | 0 | data->text = g_new0 (gchar, 8); |
3074 | 0 | g_unichar_to_utf8 (x, data->text); |
3075 | 0 | } |
3076 | 0 | else |
3077 | 0 | { |
3078 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3079 | 0 | data->num = d; |
3080 | 0 | } |
3081 | 0 | break; |
3082 | 0 | case 0: |
3083 | 0 | error_detail = _("stray final “\\”"); |
3084 | 0 | goto error; |
3085 | 0 | break; |
3086 | 0 | default: |
3087 | 0 | error_detail = _("unknown escape sequence"); |
3088 | 0 | goto error; |
3089 | 0 | } |
3090 | | |
3091 | 0 | return p; |
3092 | | |
3093 | 0 | error: |
3094 | | /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
3095 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, |
3096 | 0 | G_REGEX_ERROR_REPLACE, |
3097 | 0 | _("Error while parsing replacement " |
3098 | 0 | "text “%s” at char %lu: %s"), |
3099 | 0 | replacement, |
3100 | 0 | (gulong)(p - replacement), |
3101 | 0 | error_detail); |
3102 | 0 | g_propagate_error (error, tmp_error); |
3103 | |
|
3104 | 0 | return NULL; |
3105 | 0 | } |
3106 | | |
3107 | | static GList * |
3108 | | split_replacement (const gchar *replacement, |
3109 | | GError **error) |
3110 | 0 | { |
3111 | 0 | GList *list = NULL; |
3112 | 0 | InterpolationData *data; |
3113 | 0 | const gchar *p, *start; |
3114 | |
|
3115 | 0 | start = p = replacement; |
3116 | 0 | while (*p) |
3117 | 0 | { |
3118 | 0 | if (*p == '\\') |
3119 | 0 | { |
3120 | 0 | data = g_new0 (InterpolationData, 1); |
3121 | 0 | start = p = expand_escape (replacement, p, data, error); |
3122 | 0 | if (p == NULL) |
3123 | 0 | { |
3124 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3125 | 0 | free_interpolation_data (data); |
3126 | |
|
3127 | 0 | return NULL; |
3128 | 0 | } |
3129 | 0 | list = g_list_prepend (list, data); |
3130 | 0 | } |
3131 | 0 | else |
3132 | 0 | { |
3133 | 0 | p++; |
3134 | 0 | if (*p == '\\' || *p == '\0') |
3135 | 0 | { |
3136 | 0 | if (p - start > 0) |
3137 | 0 | { |
3138 | 0 | data = g_new0 (InterpolationData, 1); |
3139 | 0 | data->text = g_strndup (start, p - start); |
3140 | 0 | data->type = REPL_TYPE_STRING; |
3141 | 0 | list = g_list_prepend (list, data); |
3142 | 0 | } |
3143 | 0 | } |
3144 | 0 | } |
3145 | 0 | } |
3146 | | |
3147 | 0 | return g_list_reverse (list); |
3148 | 0 | } |
3149 | | |
3150 | | /* Change the case of c based on change_case: lower-cases c when any CHANGE_CASE_LOWER_MASK bit is set; any other value, including CHANGE_CASE_NONE, upper-cases it. */ |
3151 | | #define CHANGE_CASE(c, change_case) \ |
3152 | 0 | (((change_case) & CHANGE_CASE_LOWER_MASK) ? \ |
3153 | 0 | g_unichar_tolower (c) : \ |
3154 | 0 | g_unichar_toupper (c)) |
3155 | | |
3156 | | static void |
3157 | | string_append (GString *string, |
3158 | | const gchar *text, |
3159 | | ChangeCase *change_case) |
3160 | 0 | { |
3161 | 0 | gunichar c; |
3162 | |
|
3163 | 0 | if (text[0] == '\0') |
3164 | 0 | return; |
3165 | | |
3166 | 0 | if (*change_case == CHANGE_CASE_NONE) |
3167 | 0 | { |
3168 | 0 | g_string_append (string, text); |
3169 | 0 | } |
3170 | 0 | else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
3171 | 0 | { |
3172 | 0 | c = g_utf8_get_char (text); |
3173 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3174 | 0 | g_string_append (string, g_utf8_next_char (text)); |
3175 | 0 | *change_case = CHANGE_CASE_NONE; |
3176 | 0 | } |
3177 | 0 | else |
3178 | 0 | { |
3179 | 0 | while (*text != '\0') |
3180 | 0 | { |
3181 | 0 | c = g_utf8_get_char (text); |
3182 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3183 | 0 | text = g_utf8_next_char (text); |
3184 | 0 | } |
3185 | 0 | } |
3186 | 0 | } |
3187 | | |
3188 | | static gboolean |
3189 | | interpolate_replacement (const GMatchInfo *match_info, |
3190 | | GString *result, |
3191 | | gpointer data) |
3192 | 0 | { |
3193 | 0 | GList *list; |
3194 | 0 | InterpolationData *idata; |
3195 | 0 | gchar *match; |
3196 | 0 | ChangeCase change_case = CHANGE_CASE_NONE; |
3197 | |
|
3198 | 0 | for (list = data; list; list = list->next) |
3199 | 0 | { |
3200 | 0 | idata = list->data; |
3201 | 0 | switch (idata->type) |
3202 | 0 | { |
3203 | 0 | case REPL_TYPE_STRING: |
3204 | 0 | string_append (result, idata->text, &change_case); |
3205 | 0 | break; |
3206 | 0 | case REPL_TYPE_CHARACTER: |
3207 | 0 | g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
3208 | 0 | if (change_case & CHANGE_CASE_SINGLE_MASK) |
3209 | 0 | change_case = CHANGE_CASE_NONE; |
3210 | 0 | break; |
3211 | 0 | case REPL_TYPE_NUMERIC_REFERENCE: |
3212 | 0 | match = g_match_info_fetch (match_info, idata->num); |
3213 | 0 | if (match) |
3214 | 0 | { |
3215 | 0 | string_append (result, match, &change_case); |
3216 | 0 | g_free (match); |
3217 | 0 | } |
3218 | 0 | break; |
3219 | 0 | case REPL_TYPE_SYMBOLIC_REFERENCE: |
3220 | 0 | match = g_match_info_fetch_named (match_info, idata->text); |
3221 | 0 | if (match) |
3222 | 0 | { |
3223 | 0 | string_append (result, match, &change_case); |
3224 | 0 | g_free (match); |
3225 | 0 | } |
3226 | 0 | break; |
3227 | 0 | case REPL_TYPE_CHANGE_CASE: |
3228 | 0 | change_case = idata->change_case; |
3229 | 0 | break; |
3230 | 0 | } |
3231 | 0 | } |
3232 | | |
3233 | 0 | return FALSE; |
3234 | 0 | } |
3235 | | |
3236 | | /* whether actual match_info is needed for replacement, i.e. |
3237 | | * whether there are references |
3238 | | */ |
3239 | | static gboolean |
3240 | | interpolation_list_needs_match (GList *list) |
3241 | 0 | { |
3242 | 0 | while (list != NULL) |
3243 | 0 | { |
3244 | 0 | InterpolationData *data = list->data; |
3245 | |
|
3246 | 0 | if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
3247 | 0 | data->type == REPL_TYPE_NUMERIC_REFERENCE) |
3248 | 0 | { |
3249 | 0 | return TRUE; |
3250 | 0 | } |
3251 | | |
3252 | 0 | list = list->next; |
3253 | 0 | } |
3254 | | |
3255 | 0 | return FALSE; |
3256 | 0 | } |
3257 | | |
3258 | | /** |
3259 | | * g_regex_replace: |
3260 | | * @regex: a #GRegex structure |
3261 | | * @string: (array length=string_len): the string to perform matches against |
3262 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3263 | | * @start_position: starting index of the string to match, in bytes |
3264 | | * @replacement: text to replace each match with |
3265 | | * @match_options: options for the match |
3266 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3267 | | * |
3268 | | * Replaces all occurrences of the pattern in @regex with the |
3269 | | * replacement text. Backreferences of the form '\number' or |
3270 | | * '\g<number>' in the replacement text are interpolated by the |
3271 | | * number-th captured subexpression of the match, '\g<name>' refers |
3272 | | * to the captured subexpression with the given name. '\0' refers |
3273 | | * to the complete match, but '\0' followed by a number is the octal |
3274 | | * representation of a character. To include a literal '\' in the |
3275 | | * replacement, write '\\\\'. |
3276 | | * |
3277 | | * There are also escapes that changes the case of the following text: |
3278 | | * |
3279 | | * - \l: Convert to lower case the next character |
3280 | | * - \u: Convert to upper case the next character |
3281 | | * - \L: Convert to lower case till \E |
3282 | | * - \U: Convert to upper case till \E |
3283 | | * - \E: End case modification |
3284 | | * |
3285 | | * If you do not need to use backreferences use g_regex_replace_literal(). |
3286 | | * |
3287 | | * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was |
3288 | | * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
3289 | | * you can use g_regex_replace_literal(). |
3290 | | * |
3291 | | * Setting @start_position differs from just passing over a shortened |
3292 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that |
3293 | | * begins with any kind of lookbehind assertion, such as "\b". |
3294 | | * |
3295 | | * Returns: a newly allocated string containing the replacements |
3296 | | * |
3297 | | * Since: 2.14 |
3298 | | */ |
3299 | | gchar * |
3300 | | g_regex_replace (const GRegex *regex, |
3301 | | const gchar *string, |
3302 | | gssize string_len, |
3303 | | gint start_position, |
3304 | | const gchar *replacement, |
3305 | | GRegexMatchFlags match_options, |
3306 | | GError **error) |
3307 | 0 | { |
3308 | 0 | gchar *result; |
3309 | 0 | GList *list; |
3310 | 0 | GError *tmp_error = NULL; |
3311 | |
|
3312 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3313 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3314 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3315 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3316 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
3317 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3318 | | |
3319 | 0 | list = split_replacement (replacement, &tmp_error); |
3320 | 0 | if (tmp_error != NULL) |
3321 | 0 | { |
3322 | 0 | g_propagate_error (error, tmp_error); |
3323 | 0 | return NULL; |
3324 | 0 | } |
3325 | | |
3326 | 0 | result = g_regex_replace_eval (regex, |
3327 | 0 | string, string_len, start_position, |
3328 | 0 | match_options, |
3329 | 0 | interpolate_replacement, |
3330 | 0 | (gpointer)list, |
3331 | 0 | &tmp_error); |
3332 | 0 | if (tmp_error != NULL) |
3333 | 0 | g_propagate_error (error, tmp_error); |
3334 | |
|
3335 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3336 | |
|
3337 | 0 | return result; |
3338 | 0 | } |
3339 | | |
3340 | | static gboolean |
3341 | | literal_replacement (const GMatchInfo *match_info, |
3342 | | GString *result, |
3343 | | gpointer data) |
3344 | 0 | { |
3345 | 0 | g_string_append (result, data); |
3346 | 0 | return FALSE; |
3347 | 0 | } |
3348 | | |
3349 | | /** |
3350 | | * g_regex_replace_literal: |
3351 | | * @regex: a #GRegex structure |
3352 | | * @string: (array length=string_len): the string to perform matches against |
3353 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3354 | | * @start_position: starting index of the string to match, in bytes |
3355 | | * @replacement: text to replace each match with |
3356 | | * @match_options: options for the match |
3357 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3358 | | * |
3359 | | * Replaces all occurrences of the pattern in @regex with the |
3360 | | * replacement text. @replacement is replaced literally, to |
3361 | | * include backreferences use g_regex_replace(). |
3362 | | * |
3363 | | * Setting @start_position differs from just passing over a |
3364 | | * shortened string and setting %G_REGEX_MATCH_NOTBOL in the |
3365 | | * case of a pattern that begins with any kind of lookbehind |
3366 | | * assertion, such as "\b". |
3367 | | * |
3368 | | * Returns: a newly allocated string containing the replacements |
3369 | | * |
3370 | | * Since: 2.14 |
3371 | | */ |
3372 | | gchar * |
3373 | | g_regex_replace_literal (const GRegex *regex, |
3374 | | const gchar *string, |
3375 | | gssize string_len, |
3376 | | gint start_position, |
3377 | | const gchar *replacement, |
3378 | | GRegexMatchFlags match_options, |
3379 | | GError **error) |
3380 | 0 | { |
3381 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3382 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3383 | | |
3384 | 0 | return g_regex_replace_eval (regex, |
3385 | 0 | string, string_len, start_position, |
3386 | 0 | match_options, |
3387 | 0 | literal_replacement, |
3388 | 0 | (gpointer)replacement, |
3389 | 0 | error); |
3390 | 0 | } |
3391 | | |
3392 | | /** |
3393 | | * g_regex_replace_eval: |
3394 | | * @regex: a #GRegex structure from g_regex_new() |
3395 | | * @string: (array length=string_len): string to perform matches against |
3396 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3397 | | * @start_position: starting index of the string to match, in bytes |
3398 | | * @match_options: options for the match |
3399 | | * @eval: a function to call for each match |
3400 | | * @user_data: user data to pass to the function |
3401 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3402 | | * |
3403 | | * Replaces occurrences of the pattern in regex with the output of |
3404 | | * @eval for that occurrence. |
3405 | | * |
3406 | | * Setting @start_position differs from just passing over a shortened |
3407 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
3408 | | * that begins with any kind of lookbehind assertion, such as "\b". |
3409 | | * |
3410 | | * The following example uses g_regex_replace_eval() to replace multiple |
3411 | | * strings at once: |
3412 | | * |[<!-- language="C" --> |
3413 | | * static gboolean |
3414 | | * eval_cb (const GMatchInfo *info, |
3415 | | * GString *res, |
3416 | | * gpointer data) |
3417 | | * { |
3418 | | * gchar *match; |
3419 | | * gchar *r; |
3420 | | * |
3421 | | * match = g_match_info_fetch (info, 0); |
3422 | | * r = g_hash_table_lookup ((GHashTable *)data, match); |
3423 | | * g_string_append (res, r); |
3424 | | * g_free (match); |
3425 | | * |
3426 | | * return FALSE; |
3427 | | * } |
3428 | | * |
3429 | | * ... |
3430 | | * |
3431 | | * GRegex *reg; |
3432 | | * GHashTable *h; |
3433 | | * gchar *res; |
3434 | | * |
3435 | | * h = g_hash_table_new (g_str_hash, g_str_equal); |
3436 | | * |
3437 | | * g_hash_table_insert (h, "1", "ONE"); |
3438 | | * g_hash_table_insert (h, "2", "TWO"); |
3439 | | * g_hash_table_insert (h, "3", "THREE"); |
3440 | | * g_hash_table_insert (h, "4", "FOUR"); |
3441 | | * |
3442 | | * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
3443 | | * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); |
3444 | | * g_hash_table_destroy (h); |
3445 | | * |
3446 | | * ... |
3447 | | * ]| |
3448 | | * |
3449 | | * Returns: a newly allocated string containing the replacements |
3450 | | * |
3451 | | * Since: 2.14 |
3452 | | */ |
3453 | | gchar * |
3454 | | g_regex_replace_eval (const GRegex *regex, |
3455 | | const gchar *string, |
3456 | | gssize string_len, |
3457 | | gint start_position, |
3458 | | GRegexMatchFlags match_options, |
3459 | | GRegexEvalCallback eval, |
3460 | | gpointer user_data, |
3461 | | GError **error) |
3462 | 0 | { |
3463 | 0 | GMatchInfo *match_info; |
3464 | 0 | GString *result; |
3465 | 0 | gint str_pos = 0; |
3466 | 0 | gboolean done = FALSE; |
3467 | 0 | GError *tmp_error = NULL; |
3468 | |
|
3469 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3470 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3471 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3472 | 0 | g_return_val_if_fail (eval != NULL, NULL); |
3473 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3474 | | |
3475 | 0 | if (string_len < 0) |
3476 | 0 | string_len = strlen (string); |
3477 | |
|
3478 | 0 | result = g_string_sized_new (string_len); |
3479 | | |
3480 | | /* run down the string making matches. */ |
3481 | 0 | g_regex_match_full (regex, string, string_len, start_position, |
3482 | 0 | match_options, &match_info, &tmp_error); |
3483 | 0 | while (!done && g_match_info_matches (match_info)) |
3484 | 0 | { |
3485 | 0 | g_string_append_len (result, |
3486 | 0 | string + str_pos, |
3487 | 0 | match_info->offsets[0] - str_pos); |
3488 | 0 | done = (*eval) (match_info, result, user_data); |
3489 | 0 | str_pos = match_info->offsets[1]; |
3490 | 0 | g_match_info_next (match_info, &tmp_error); |
3491 | 0 | } |
3492 | 0 | g_match_info_free (match_info); |
3493 | 0 | if (tmp_error != NULL) |
3494 | 0 | { |
3495 | 0 | g_propagate_error (error, tmp_error); |
3496 | 0 | g_string_free (result, TRUE); |
3497 | 0 | return NULL; |
3498 | 0 | } |
3499 | | |
3500 | 0 | g_string_append_len (result, string + str_pos, string_len - str_pos); |
3501 | 0 | return g_string_free (result, FALSE); |
3502 | 0 | } |
3503 | | |
3504 | | /** |
3505 | | * g_regex_check_replacement: |
3506 | | * @replacement: the replacement string |
3507 | | * @has_references: (out) (optional): location to store information about |
3508 | | * references in @replacement or %NULL |
3509 | | * @error: location to store error |
3510 | | * |
3511 | | * Checks whether @replacement is a valid replacement string |
3512 | | * (see g_regex_replace()), i.e. that all escape sequences in |
3513 | | * it are valid. |
3514 | | * |
3515 | | * If @has_references is not %NULL then @replacement is checked |
3516 | | * for pattern references. For instance, replacement text 'foo\n' |
3517 | | * does not contain references and may be evaluated without information |
3518 | | * about actual match, but '\0\1' (whole match followed by first |
3519 | | * subpattern) requires valid #GMatchInfo object. |
3520 | | * |
3521 | | * Returns: whether @replacement is a valid replacement string |
3522 | | * |
3523 | | * Since: 2.14 |
3524 | | */ |
3525 | | gboolean |
3526 | | g_regex_check_replacement (const gchar *replacement, |
3527 | | gboolean *has_references, |
3528 | | GError **error) |
3529 | 0 | { |
3530 | 0 | GList *list; |
3531 | 0 | GError *tmp = NULL; |
3532 | |
|
3533 | 0 | list = split_replacement (replacement, &tmp); |
3534 | |
|
3535 | 0 | if (tmp) |
3536 | 0 | { |
3537 | 0 | g_propagate_error (error, tmp); |
3538 | 0 | return FALSE; |
3539 | 0 | } |
3540 | | |
3541 | 0 | if (has_references) |
3542 | 0 | *has_references = interpolation_list_needs_match (list); |
3543 | |
|
3544 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3545 | |
|
3546 | 0 | return TRUE; |
3547 | 0 | } |
3548 | | |
3549 | | /** |
3550 | | * g_regex_escape_nul: |
3551 | | * @string: the string to escape |
3552 | | * @length: the length of @string |
3553 | | * |
3554 | | * Escapes the nul characters in @string to "\x00". It can be used |
3555 | | * to compile a regex with embedded nul characters. |
3556 | | * |
3557 | | * For completeness, @length can be -1 for a nul-terminated string. |
3558 | | * In this case the output string will be of course equal to @string. |
3559 | | * |
3560 | | * Returns: a newly-allocated escaped string |
3561 | | * |
3562 | | * Since: 2.30 |
3563 | | */ |
3564 | | gchar * |
3565 | | g_regex_escape_nul (const gchar *string, |
3566 | | gint length) |
3567 | 0 | { |
3568 | 0 | GString *escaped; |
3569 | 0 | const gchar *p, *piece_start, *end; |
3570 | 0 | gint backslashes; |
3571 | |
|
3572 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3573 | | |
3574 | 0 | if (length < 0) |
3575 | 0 | return g_strdup (string); |
3576 | | |
3577 | 0 | end = string + length; |
3578 | 0 | p = piece_start = string; |
3579 | 0 | escaped = g_string_sized_new (length + 1); |
3580 | |
|
3581 | 0 | backslashes = 0; |
3582 | 0 | while (p < end) |
3583 | 0 | { |
3584 | 0 | switch (*p) |
3585 | 0 | { |
3586 | 0 | case '\0': |
3587 | 0 | if (p != piece_start) |
3588 | 0 | { |
3589 | | /* copy the previous piece. */ |
3590 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3591 | 0 | } |
3592 | 0 | if ((backslashes & 1) == 0) |
3593 | 0 | g_string_append_c (escaped, '\\'); |
3594 | 0 | g_string_append_c (escaped, 'x'); |
3595 | 0 | g_string_append_c (escaped, '0'); |
3596 | 0 | g_string_append_c (escaped, '0'); |
3597 | 0 | piece_start = ++p; |
3598 | 0 | backslashes = 0; |
3599 | 0 | break; |
3600 | 0 | case '\\': |
3601 | 0 | backslashes++; |
3602 | 0 | ++p; |
3603 | 0 | break; |
3604 | 0 | default: |
3605 | 0 | backslashes = 0; |
3606 | 0 | p = g_utf8_next_char (p); |
3607 | 0 | break; |
3608 | 0 | } |
3609 | 0 | } |
3610 | | |
3611 | 0 | if (piece_start < end) |
3612 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3613 | |
|
3614 | 0 | return g_string_free (escaped, FALSE); |
3615 | 0 | } |
3616 | | |
3617 | | /** |
3618 | | * g_regex_escape_string: |
3619 | | * @string: the string to escape |
3620 | | * @length: the length of @string, in bytes, or -1 if @string is nul-terminated |
3621 | | * |
3622 | | * Escapes the special characters used for regular expressions |
3623 | | * in @string, for instance "a.b*c" becomes "a\.b\*c". This |
3624 | | * function is useful to dynamically generate regular expressions. |
3625 | | * |
3626 | | * @string can contain nul characters that are replaced with "\0", |
3627 | | * in this case remember to specify the correct length of @string |
3628 | | * in @length. |
3629 | | * |
3630 | | * Returns: a newly-allocated escaped string |
3631 | | * |
3632 | | * Since: 2.14 |
3633 | | */ |
3634 | | gchar * |
3635 | | g_regex_escape_string (const gchar *string, |
3636 | | gint length) |
3637 | 0 | { |
3638 | 0 | GString *escaped; |
3639 | 0 | const char *p, *piece_start, *end; |
3640 | |
|
3641 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3642 | | |
3643 | 0 | if (length < 0) |
3644 | 0 | length = strlen (string); |
3645 | |
|
3646 | 0 | end = string + length; |
3647 | 0 | p = piece_start = string; |
3648 | 0 | escaped = g_string_sized_new (length + 1); |
3649 | |
|
3650 | 0 | while (p < end) |
3651 | 0 | { |
3652 | 0 | switch (*p) |
3653 | 0 | { |
3654 | 0 | case '\0': |
3655 | 0 | case '\\': |
3656 | 0 | case '|': |
3657 | 0 | case '(': |
3658 | 0 | case ')': |
3659 | 0 | case '[': |
3660 | 0 | case ']': |
3661 | 0 | case '{': |
3662 | 0 | case '}': |
3663 | 0 | case '^': |
3664 | 0 | case '$': |
3665 | 0 | case '*': |
3666 | 0 | case '+': |
3667 | 0 | case '?': |
3668 | 0 | case '.': |
3669 | 0 | if (p != piece_start) |
3670 | | /* copy the previous piece. */ |
3671 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3672 | 0 | g_string_append_c (escaped, '\\'); |
3673 | 0 | if (*p == '\0') |
3674 | 0 | g_string_append_c (escaped, '0'); |
3675 | 0 | else |
3676 | 0 | g_string_append_c (escaped, *p); |
3677 | 0 | piece_start = ++p; |
3678 | 0 | break; |
3679 | 0 | default: |
3680 | 0 | p = g_utf8_next_char (p); |
3681 | 0 | break; |
3682 | 0 | } |
3683 | 0 | } |
3684 | | |
3685 | 0 | if (piece_start < end) |
3686 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3687 | |
|
3688 | 0 | return g_string_free (escaped, FALSE); |
3689 | 0 | } |