/src/tinysparql/subprojects/glib-2.80.3/glib/gregex.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* GRegex -- regular expression API wrapper around PCRE. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Scott Wimer |
4 | | * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
5 | | * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
6 | | * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> |
7 | | * |
8 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
9 | | * |
10 | | * This library is free software; you can redistribute it and/or |
11 | | * modify it under the terms of the GNU Lesser General Public |
12 | | * License as published by the Free Software Foundation; either |
13 | | * version 2.1 of the License, or (at your option) any later version. |
14 | | * |
15 | | * This library is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | | * Lesser General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU Lesser General Public License |
21 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
22 | | */ |
23 | | |
24 | | #include "config.h" |
25 | | |
26 | | #include <stdint.h> |
27 | | #include <string.h> |
28 | | |
29 | | #define PCRE2_CODE_UNIT_WIDTH 8 |
30 | | #include <pcre2.h> |
31 | | |
32 | | #include "gtypes.h" |
33 | | #include "gregex.h" |
34 | | #include "glibintl.h" |
35 | | #include "glist.h" |
36 | | #include "gmessages.h" |
37 | | #include "gstrfuncs.h" |
38 | | #include "gatomic.h" |
39 | | #include "gtestutils.h" |
40 | | #include "gthread.h" |
41 | | |
42 | | /** |
43 | | * GRegex: |
44 | | * |
45 | | * A `GRegex` is the "compiled" form of a regular expression pattern. |
46 | | * |
47 | | * `GRegex` implements regular expression pattern matching using syntax and |
48 | | * semantics similar to Perl regular expression. See the |
49 | | * [PCRE documentation](man:pcrepattern(3)) for the syntax definition. |
50 | | * |
51 | | * Some functions accept a @start_position argument, setting it differs |
52 | | * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL |
53 | | * in the case of a pattern that begins with any kind of lookbehind assertion. |
54 | | * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" |
55 | | * in the middle of words. ("\B" matches only if the current position in the |
56 | | * subject is not a word boundary.) When applied to the string "Mississippi" |
57 | | * from the fourth byte, namely "issippi", it does not match, because "\B" is |
58 | | * always false at the start of the subject, which is deemed to be a word |
59 | | * boundary. However, if the entire string is passed, but with |
60 | | * @start_position set to 4, it finds the second occurrence of "iss" because |
61 | | * it is able to look behind the starting point to discover that it is |
62 | | * preceded by a letter. |
63 | | * |
64 | | * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed |
65 | | * to these functions must be encoded in UTF-8. The lengths and the positions |
66 | | * inside the strings are in bytes and not in characters, so, for instance, |
67 | | * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a |
68 | | * single character. If you set %G_REGEX_RAW the strings can be non-valid |
69 | | * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two |
70 | | * bytes and two characters long. |
71 | | * |
72 | | * When matching a pattern, "\n" matches only against a "\n" character in |
73 | | * the string, and "\r" matches only a "\r" character. To match any newline |
74 | | * sequence use "\R". This particular group matches either the two-character |
75 | | * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, |
76 | | * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), |
77 | | * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line |
78 | | * separator, U+2028), or PS (paragraph separator, U+2029). |
79 | | * |
80 | | * The behaviour of the dot, circumflex, and dollar metacharacters are |
81 | | * affected by newline characters, the default is to recognize any newline |
82 | | * character (the same characters recognized by "\R"). This can be changed |
83 | | * with `G_REGEX_NEWLINE_CR`, `G_REGEX_NEWLINE_LF` and `G_REGEX_NEWLINE_CRLF` |
84 | | * compile options, and with `G_REGEX_MATCH_NEWLINE_ANY`, |
85 | | * `G_REGEX_MATCH_NEWLINE_CR`, `G_REGEX_MATCH_NEWLINE_LF` and |
86 | | * `G_REGEX_MATCH_NEWLINE_CRLF` match options. These settings are also |
87 | | * relevant when compiling a pattern if `G_REGEX_EXTENDED` is set, and an |
88 | | * unescaped "#" outside a character class is encountered. This indicates |
89 | | * a comment that lasts until after the next newline. |
90 | | * |
91 | | * Creating and manipulating the same `GRegex` structure from different |
92 | | * threads is not a problem as `GRegex` does not modify its internal |
93 | | * state between creation and destruction, on the other hand `GMatchInfo` |
94 | | * is not threadsafe. |
95 | | * |
96 | | * The regular expressions low-level functionalities are obtained through |
97 | | * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel. |
98 | | * |
99 | | * Since: 2.14 |
100 | | */ |
101 | | |
/* PCRE2 option bits that are accepted in both G_REGEX_PCRE2_COMPILE_MASK
 * and G_REGEX_PCRE2_MATCH_MASK. */
#define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED     | \
                                   PCRE2_NO_UTF_CHECK | \
                                   PCRE2_ENDANCHORED)

/* Mask of all the possible values for GRegexCompileFlags.
 * Used to strip unknown bits when converting PCRE2 options back to
 * GRegexCompileFlags. */
#define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT          | \
                              G_REGEX_CASELESS         | \
                              G_REGEX_MULTILINE        | \
                              G_REGEX_DOTALL           | \
                              G_REGEX_EXTENDED         | \
                              G_REGEX_ANCHORED         | \
                              G_REGEX_DOLLAR_ENDONLY   | \
                              G_REGEX_UNGREEDY         | \
                              G_REGEX_RAW              | \
                              G_REGEX_NO_AUTO_CAPTURE  | \
                              G_REGEX_OPTIMIZE         | \
                              G_REGEX_FIRSTLINE        | \
                              G_REGEX_DUPNAMES         | \
                              G_REGEX_NEWLINE_CR       | \
                              G_REGEX_NEWLINE_LF       | \
                              G_REGEX_NEWLINE_CRLF     | \
                              G_REGEX_NEWLINE_ANYCRLF  | \
                              G_REGEX_BSR_ANYCRLF)

/* Mask of the PCRE2 compile option bits; applied to the result of
 * get_pcre2_compile_options() so that only these bits are returned. */
#define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS    | \
                                    PCRE2_ALT_BSUX             | \
                                    PCRE2_AUTO_CALLOUT         | \
                                    PCRE2_CASELESS             | \
                                    PCRE2_DOLLAR_ENDONLY       | \
                                    PCRE2_DOTALL               | \
                                    PCRE2_DUPNAMES             | \
                                    PCRE2_EXTENDED             | \
                                    PCRE2_FIRSTLINE            | \
                                    PCRE2_MATCH_UNSET_BACKREF  | \
                                    PCRE2_MULTILINE            | \
                                    PCRE2_NEVER_UCP            | \
                                    PCRE2_NEVER_UTF            | \
                                    PCRE2_NO_AUTO_CAPTURE      | \
                                    PCRE2_NO_AUTO_POSSESS      | \
                                    PCRE2_NO_DOTSTAR_ANCHOR    | \
                                    PCRE2_NO_START_OPTIMIZE    | \
                                    PCRE2_UCP                  | \
                                    PCRE2_UNGREEDY             | \
                                    PCRE2_UTF                  | \
                                    PCRE2_NEVER_BACKSLASH_C    | \
                                    PCRE2_ALT_CIRCUMFLEX       | \
                                    PCRE2_ALT_VERBNAMES        | \
                                    PCRE2_USE_OFFSET_LIMIT     | \
                                    PCRE2_EXTENDED_MORE        | \
                                    PCRE2_LITERAL              | \
                                    PCRE2_MATCH_INVALID_UTF    | \
                                    G_REGEX_PCRE_GENERIC_MASK)

/* NOTE(review): presumably the PCRE2 compile bits that GRegex derives on
 * its own (PCRE2_UTF follows from the absence of G_REGEX_RAW) rather than
 * taking straight from the caller -- confirm at the use site. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
156 | | |
/* Mask of all the possible values for GRegexMatchFlags.
 * Used to strip unknown bits when converting PCRE2 options back to
 * GRegexMatchFlags. */
#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT          | \
                            G_REGEX_MATCH_ANCHORED         | \
                            G_REGEX_MATCH_NOTBOL           | \
                            G_REGEX_MATCH_NOTEOL           | \
                            G_REGEX_MATCH_NOTEMPTY         | \
                            G_REGEX_MATCH_PARTIAL          | \
                            G_REGEX_MATCH_NEWLINE_CR       | \
                            G_REGEX_MATCH_NEWLINE_LF       | \
                            G_REGEX_MATCH_NEWLINE_CRLF     | \
                            G_REGEX_MATCH_NEWLINE_ANY      | \
                            G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
                            G_REGEX_MATCH_BSR_ANYCRLF      | \
                            G_REGEX_MATCH_BSR_ANY          | \
                            G_REGEX_MATCH_PARTIAL_SOFT     | \
                            G_REGEX_MATCH_PARTIAL_HARD     | \
                            G_REGEX_MATCH_NOTEMPTY_ATSTART)

/* Mask of the PCRE2 match option bits; applied to the result of
 * get_pcre2_match_options() so that only these bits are returned. */
#define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL                |\
                                  PCRE2_NOTEOL                |\
                                  PCRE2_NOTEMPTY              |\
                                  PCRE2_NOTEMPTY_ATSTART      |\
                                  PCRE2_PARTIAL_SOFT          |\
                                  PCRE2_PARTIAL_HARD          |\
                                  PCRE2_NO_JIT                |\
                                  PCRE2_COPY_MATCHED_SUBJECT  |\
                                  G_REGEX_PCRE_GENERIC_MASK)
184 | | |
/* OR of the PCRE2 newline conventions GRegex can select.
 * TODO: Support PCRE2_NEWLINE_NUL */
#define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR     | \
                              PCRE2_NEWLINE_LF     | \
                              PCRE2_NEWLINE_CRLF   | \
                              PCRE2_NEWLINE_ANYCRLF)

/* Some match options are not supported when using JIT as stated in the
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
 * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
 */
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \
                                               PCRE2_ENDANCHORED)

/* All newline-related bits of GRegexCompileFlags; used by
 * get_pcre2_newline_compile_options() to isolate the newline selection. */
#define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR      | \
                                      G_REGEX_NEWLINE_LF      | \
                                      G_REGEX_NEWLINE_CRLF    | \
                                      G_REGEX_NEWLINE_ANYCRLF)

/* All newline-related bits of GRegexMatchFlags; used by
 * get_pcre2_newline_match_options() to isolate the newline selection. */
#define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR      | \
                                    G_REGEX_MATCH_NEWLINE_LF      | \
                                    G_REGEX_MATCH_NEWLINE_CRLF    | \
                                    G_REGEX_MATCH_NEWLINE_ANY     | \
                                    G_REGEX_MATCH_NEWLINE_ANYCRLF)
208 | | |
/* Step a string pointer @s forwards (NEXT_CHAR) or backwards (PREV_CHAR)
 * by one character of the regex @re: if the string is in UTF-8 use the
 * g_utf8_ functions, else (G_REGEX_RAW set in compile_opts) just use +/- 1.
 * Only one branch of the conditional is expanded at run time, so @s is
 * evaluated once. */
#define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
                                ((s) + 1) : \
                                g_utf8_next_char (s))
#define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
                                ((s) - 1) : \
                                g_utf8_prev_char (s))
217 | | |
/* Private layout of GMatchInfo: the regex and subject string being matched
 * plus the PCRE2 bookkeeping (offsets, workspace, match data) for that
 * match. */
struct _GMatchInfo
{
  gint ref_count;               /* the ref count (atomic) */
  GRegex *regex;                /* the regex */
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
  gint pos;                     /* position in the string where last match left off */
  uint32_t n_offsets;           /* number of offsets */
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace;              /* workspace for pcre2_dfa_match() */
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
  const gchar *string;          /* string passed to the match function */
  gssize string_len;            /* length of string, in bytes */
  pcre2_match_context *match_context; /* PCRE2 match context for this match */
  pcre2_match_data *match_data;       /* PCRE2 match data block for this match */
  pcre2_jit_stack *jit_stack;         /* PCRE2 JIT stack; NOTE(review): presumably NULL unless JIT needed one -- confirm */
};
236 | | |
/* Whether JIT compilation has been attempted for a GRegex with its current
 * set of JIT options (stored in the jit_status field of struct _GRegex). */
typedef enum
{
  JIT_STATUS_DEFAULT,   /* enablement was never tried */
  JIT_STATUS_ENABLED,   /* enablement was tried and succeeded */
  JIT_STATUS_DISABLED   /* enablement was tried and failed; do not retry */
} JITStatus;
243 | | |
/* Private layout of GRegex: the pattern text, its compiled PCRE2 form and
 * the compile/match options, each kept both as PCRE2 bits and as the
 * original GRegex flag values. */
struct _GRegex
{
  gint ref_count;               /* the ref count for the immutable part (atomic) */
  gchar *pattern;               /* the pattern */
  pcre2_code *pcre_re;          /* compiled form of the pattern */
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
  uint32_t jit_options;         /* options which were enabled for jit compiler */
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
  /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation,
   * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever
   * enabled for the current regex AND current set of jit_options.
   * JIT_STATUS_DEFAULT means enablement was never tried,
   * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it),
   * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again).
   */
};
263 | | |
/* TRUE if ret is an error code, FALSE otherwise.
 * A value counts as an error when it is below PCRE2_ERROR_NOMATCH and is
 * not PCRE2_ERROR_PARTIAL; "no match" and "partial match" are therefore
 * not treated as errors. Note that ret is expanded twice. */
#define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
266 | | |
267 | | typedef struct _InterpolationData InterpolationData; |
268 | | static gboolean interpolation_list_needs_match (GList *list); |
269 | | static gboolean interpolate_replacement (const GMatchInfo *match_info, |
270 | | GString *result, |
271 | | gpointer data); |
272 | | static GList *split_replacement (const gchar *replacement, |
273 | | GError **error); |
274 | | static void free_interpolation_data (InterpolationData *data); |
275 | | |
276 | | static uint32_t |
277 | | get_pcre2_compile_options (GRegexCompileFlags compile_flags) |
278 | 0 | { |
279 | | /* Maps compile flags to pcre2 values */ |
280 | 0 | uint32_t pcre2_flags = 0; |
281 | |
|
282 | 0 | if (compile_flags & G_REGEX_CASELESS) |
283 | 0 | pcre2_flags |= PCRE2_CASELESS; |
284 | 0 | if (compile_flags & G_REGEX_MULTILINE) |
285 | 0 | pcre2_flags |= PCRE2_MULTILINE; |
286 | 0 | if (compile_flags & G_REGEX_DOTALL) |
287 | 0 | pcre2_flags |= PCRE2_DOTALL; |
288 | 0 | if (compile_flags & G_REGEX_EXTENDED) |
289 | 0 | pcre2_flags |= PCRE2_EXTENDED; |
290 | 0 | if (compile_flags & G_REGEX_ANCHORED) |
291 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
292 | 0 | if (compile_flags & G_REGEX_DOLLAR_ENDONLY) |
293 | 0 | pcre2_flags |= PCRE2_DOLLAR_ENDONLY; |
294 | 0 | if (compile_flags & G_REGEX_UNGREEDY) |
295 | 0 | pcre2_flags |= PCRE2_UNGREEDY; |
296 | 0 | if (!(compile_flags & G_REGEX_RAW)) |
297 | 0 | pcre2_flags |= PCRE2_UTF; |
298 | 0 | if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) |
299 | 0 | pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; |
300 | 0 | if (compile_flags & G_REGEX_FIRSTLINE) |
301 | 0 | pcre2_flags |= PCRE2_FIRSTLINE; |
302 | 0 | if (compile_flags & G_REGEX_DUPNAMES) |
303 | 0 | pcre2_flags |= PCRE2_DUPNAMES; |
304 | |
|
305 | 0 | return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; |
306 | 0 | } |
307 | | |
308 | | static uint32_t |
309 | | get_pcre2_match_options (GRegexMatchFlags match_flags, |
310 | | GRegexCompileFlags compile_flags) |
311 | 0 | { |
312 | | /* Maps match flags to pcre2 values */ |
313 | 0 | uint32_t pcre2_flags = 0; |
314 | |
|
315 | 0 | if (match_flags & G_REGEX_MATCH_ANCHORED) |
316 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
317 | 0 | if (match_flags & G_REGEX_MATCH_NOTBOL) |
318 | 0 | pcre2_flags |= PCRE2_NOTBOL; |
319 | 0 | if (match_flags & G_REGEX_MATCH_NOTEOL) |
320 | 0 | pcre2_flags |= PCRE2_NOTEOL; |
321 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY) |
322 | 0 | pcre2_flags |= PCRE2_NOTEMPTY; |
323 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) |
324 | 0 | pcre2_flags |= PCRE2_PARTIAL_SOFT; |
325 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) |
326 | 0 | pcre2_flags |= PCRE2_PARTIAL_HARD; |
327 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) |
328 | 0 | pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; |
329 | |
|
330 | 0 | if (compile_flags & G_REGEX_RAW) |
331 | 0 | pcre2_flags |= PCRE2_NO_UTF_CHECK; |
332 | |
|
333 | 0 | return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; |
334 | 0 | } |
335 | | |
336 | | static GRegexCompileFlags |
337 | | g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) |
338 | 0 | { |
339 | 0 | GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; |
340 | |
|
341 | 0 | if (pcre2_flags & PCRE2_CASELESS) |
342 | 0 | compile_flags |= G_REGEX_CASELESS; |
343 | 0 | if (pcre2_flags & PCRE2_MULTILINE) |
344 | 0 | compile_flags |= G_REGEX_MULTILINE; |
345 | 0 | if (pcre2_flags & PCRE2_DOTALL) |
346 | 0 | compile_flags |= G_REGEX_DOTALL; |
347 | 0 | if (pcre2_flags & PCRE2_EXTENDED) |
348 | 0 | compile_flags |= G_REGEX_EXTENDED; |
349 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
350 | 0 | compile_flags |= G_REGEX_ANCHORED; |
351 | 0 | if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) |
352 | 0 | compile_flags |= G_REGEX_DOLLAR_ENDONLY; |
353 | 0 | if (pcre2_flags & PCRE2_UNGREEDY) |
354 | 0 | compile_flags |= G_REGEX_UNGREEDY; |
355 | 0 | if (!(pcre2_flags & PCRE2_UTF)) |
356 | 0 | compile_flags |= G_REGEX_RAW; |
357 | 0 | if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) |
358 | 0 | compile_flags |= G_REGEX_NO_AUTO_CAPTURE; |
359 | 0 | if (pcre2_flags & PCRE2_FIRSTLINE) |
360 | 0 | compile_flags |= G_REGEX_FIRSTLINE; |
361 | 0 | if (pcre2_flags & PCRE2_DUPNAMES) |
362 | 0 | compile_flags |= G_REGEX_DUPNAMES; |
363 | |
|
364 | 0 | return compile_flags & G_REGEX_COMPILE_MASK; |
365 | 0 | } |
366 | | |
367 | | static GRegexMatchFlags |
368 | | g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) |
369 | 0 | { |
370 | 0 | GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; |
371 | |
|
372 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
373 | 0 | match_flags |= G_REGEX_MATCH_ANCHORED; |
374 | 0 | if (pcre2_flags & PCRE2_NOTBOL) |
375 | 0 | match_flags |= G_REGEX_MATCH_NOTBOL; |
376 | 0 | if (pcre2_flags & PCRE2_NOTEOL) |
377 | 0 | match_flags |= G_REGEX_MATCH_NOTEOL; |
378 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY) |
379 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY; |
380 | 0 | if (pcre2_flags & PCRE2_PARTIAL_SOFT) |
381 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; |
382 | 0 | if (pcre2_flags & PCRE2_PARTIAL_HARD) |
383 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_HARD; |
384 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) |
385 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; |
386 | |
|
387 | 0 | return (match_flags & G_REGEX_MATCH_MASK); |
388 | 0 | } |
389 | | |
390 | | static uint32_t |
391 | | get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) |
392 | 0 | { |
393 | 0 | compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; |
394 | |
|
395 | 0 | switch (compile_flags) |
396 | 0 | { |
397 | 0 | case G_REGEX_NEWLINE_CR: |
398 | 0 | return PCRE2_NEWLINE_CR; |
399 | 0 | case G_REGEX_NEWLINE_LF: |
400 | 0 | return PCRE2_NEWLINE_LF; |
401 | 0 | case G_REGEX_NEWLINE_CRLF: |
402 | 0 | return PCRE2_NEWLINE_CRLF; |
403 | 0 | case G_REGEX_NEWLINE_ANYCRLF: |
404 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
405 | 0 | default: |
406 | 0 | if (compile_flags != 0) |
407 | 0 | return 0; |
408 | | |
409 | 0 | return PCRE2_NEWLINE_ANY; |
410 | 0 | } |
411 | 0 | } |
412 | | |
413 | | static uint32_t |
414 | | get_pcre2_newline_match_options (GRegexMatchFlags match_flags) |
415 | 0 | { |
416 | 0 | switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) |
417 | 0 | { |
418 | 0 | case G_REGEX_MATCH_NEWLINE_CR: |
419 | 0 | return PCRE2_NEWLINE_CR; |
420 | 0 | case G_REGEX_MATCH_NEWLINE_LF: |
421 | 0 | return PCRE2_NEWLINE_LF; |
422 | 0 | case G_REGEX_MATCH_NEWLINE_CRLF: |
423 | 0 | return PCRE2_NEWLINE_CRLF; |
424 | 0 | case G_REGEX_MATCH_NEWLINE_ANY: |
425 | 0 | return PCRE2_NEWLINE_ANY; |
426 | 0 | case G_REGEX_MATCH_NEWLINE_ANYCRLF: |
427 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
428 | 0 | default: |
429 | 0 | return 0; |
430 | 0 | } |
431 | 0 | } |
432 | | |
433 | | static uint32_t |
434 | | get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) |
435 | 0 | { |
436 | 0 | if (compile_flags & G_REGEX_BSR_ANYCRLF) |
437 | 0 | return PCRE2_BSR_ANYCRLF; |
438 | | |
439 | 0 | return PCRE2_BSR_UNICODE; |
440 | 0 | } |
441 | | |
442 | | static uint32_t |
443 | | get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) |
444 | 0 | { |
445 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) |
446 | 0 | return PCRE2_BSR_ANYCRLF; |
447 | | |
448 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANY) |
449 | 0 | return PCRE2_BSR_UNICODE; |
450 | | |
451 | 0 | return 0; |
452 | 0 | } |
453 | | |
454 | | static char * |
455 | | get_pcre2_error_string (int errcode) |
456 | 0 | { |
457 | 0 | PCRE2_UCHAR8 error_msg[2048]; |
458 | 0 | int err_length; |
459 | |
|
460 | 0 | err_length = pcre2_get_error_message (errcode, error_msg, |
461 | 0 | G_N_ELEMENTS (error_msg)); |
462 | |
|
463 | 0 | if (err_length <= 0) |
464 | 0 | return NULL; |
465 | | |
466 | | /* The array is always filled with a trailing zero */ |
467 | 0 | g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); |
468 | 0 | return g_memdup2 (error_msg, err_length + 1); |
469 | 0 | } |
470 | | |
471 | | static const gchar * |
472 | | translate_match_error (gint errcode) |
473 | 0 | { |
474 | 0 | switch (errcode) |
475 | 0 | { |
476 | 0 | case PCRE2_ERROR_NOMATCH: |
477 | | /* not an error */ |
478 | 0 | break; |
479 | 0 | case PCRE2_ERROR_NULL: |
480 | | /* NULL argument, this should not happen in GRegex */ |
481 | 0 | g_critical ("A NULL argument was passed to PCRE"); |
482 | 0 | break; |
483 | 0 | case PCRE2_ERROR_BADOPTION: |
484 | 0 | return "bad options"; |
485 | 0 | case PCRE2_ERROR_BADMAGIC: |
486 | 0 | return _("corrupted object"); |
487 | 0 | case PCRE2_ERROR_NOMEMORY: |
488 | 0 | return _("out of memory"); |
489 | 0 | case PCRE2_ERROR_NOSUBSTRING: |
490 | | /* not used by pcre2_match() */ |
491 | 0 | break; |
492 | 0 | case PCRE2_ERROR_MATCHLIMIT: |
493 | 0 | case PCRE2_ERROR_CALLOUT: |
494 | | /* callouts are not implemented */ |
495 | 0 | break; |
496 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
497 | | /* we do not check if strings are valid */ |
498 | 0 | break; |
499 | 0 | case PCRE2_ERROR_PARTIAL: |
500 | | /* not an error */ |
501 | 0 | break; |
502 | 0 | case PCRE2_ERROR_INTERNAL: |
503 | 0 | return _("internal error"); |
504 | 0 | case PCRE2_ERROR_DFA_UITEM: |
505 | 0 | return _("the pattern contains items not supported for partial matching"); |
506 | 0 | case PCRE2_ERROR_DFA_UCOND: |
507 | 0 | return _("back references as conditions are not supported for partial matching"); |
508 | 0 | case PCRE2_ERROR_DFA_WSSIZE: |
509 | | /* handled expanding the workspace */ |
510 | 0 | break; |
511 | 0 | case PCRE2_ERROR_DFA_RECURSE: |
512 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
513 | 0 | return _("recursion limit reached"); |
514 | 0 | case PCRE2_ERROR_BADOFFSET: |
515 | 0 | return _("bad offset"); |
516 | 0 | case PCRE2_ERROR_RECURSELOOP: |
517 | 0 | return _("recursion loop"); |
518 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
519 | | /* should not happen in GRegex since we check modes before each match */ |
520 | 0 | return _("matching mode is requested that was not compiled for JIT"); |
521 | 0 | default: |
522 | 0 | break; |
523 | 0 | } |
524 | 0 | return NULL; |
525 | 0 | } |
526 | | |
527 | | static char * |
528 | | get_match_error_message (int errcode) |
529 | 0 | { |
530 | 0 | const char *msg = translate_match_error (errcode); |
531 | 0 | char *error_string; |
532 | |
|
533 | 0 | if (msg) |
534 | 0 | return g_strdup (msg); |
535 | | |
536 | 0 | error_string = get_pcre2_error_string (errcode); |
537 | |
|
538 | 0 | if (error_string) |
539 | 0 | return error_string; |
540 | | |
541 | 0 | return g_strdup (_("unknown error")); |
542 | 0 | } |
543 | | |
544 | | static void |
545 | | translate_compile_error (gint *errcode, const gchar **errmsg) |
546 | 0 | { |
547 | | /* If errcode is known we put the translatable error message in |
548 | | * errmsg. If errcode is unknown we put the generic |
549 | | * G_REGEX_ERROR_COMPILE error code in errcode. |
550 | | * Note that there can be more PCRE errors with the same GRegexError |
551 | | * and that some PCRE errors are useless for us. |
552 | | */ |
553 | 0 | gint original_errcode = *errcode; |
554 | |
|
555 | 0 | *errcode = -1; |
556 | 0 | *errmsg = NULL; |
557 | |
|
558 | 0 | switch (original_errcode) |
559 | 0 | { |
560 | 0 | case PCRE2_ERROR_END_BACKSLASH: |
561 | 0 | *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; |
562 | 0 | *errmsg = _("\\ at end of pattern"); |
563 | 0 | break; |
564 | 0 | case PCRE2_ERROR_END_BACKSLASH_C: |
565 | 0 | *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; |
566 | 0 | *errmsg = _("\\c at end of pattern"); |
567 | 0 | break; |
568 | 0 | case PCRE2_ERROR_UNKNOWN_ESCAPE: |
569 | 0 | case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
570 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; |
571 | 0 | *errmsg = _("unrecognized character following \\"); |
572 | 0 | break; |
573 | 0 | case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
574 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; |
575 | 0 | *errmsg = _("numbers out of order in {} quantifier"); |
576 | 0 | break; |
577 | 0 | case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
578 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; |
579 | 0 | *errmsg = _("number too big in {} quantifier"); |
580 | 0 | break; |
581 | 0 | case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
582 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; |
583 | 0 | *errmsg = _("missing terminating ] for character class"); |
584 | 0 | break; |
585 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
586 | 0 | *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; |
587 | 0 | *errmsg = _("invalid escape sequence in character class"); |
588 | 0 | break; |
589 | 0 | case PCRE2_ERROR_CLASS_RANGE_ORDER: |
590 | 0 | *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; |
591 | 0 | *errmsg = _("range out of order in character class"); |
592 | 0 | break; |
593 | 0 | case PCRE2_ERROR_QUANTIFIER_INVALID: |
594 | 0 | case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: |
595 | 0 | *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; |
596 | 0 | *errmsg = _("nothing to repeat"); |
597 | 0 | break; |
598 | 0 | case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
599 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
600 | 0 | *errmsg = _("unrecognized character after (? or (?-"); |
601 | 0 | break; |
602 | 0 | case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
603 | 0 | *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; |
604 | 0 | *errmsg = _("POSIX named classes are supported only within a class"); |
605 | 0 | break; |
606 | 0 | case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
607 | 0 | *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; |
608 | 0 | *errmsg = _("POSIX collating elements are not supported"); |
609 | 0 | break; |
610 | 0 | case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
611 | 0 | case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
612 | 0 | case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
613 | 0 | *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; |
614 | 0 | *errmsg = _("missing terminating )"); |
615 | 0 | break; |
616 | 0 | case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
617 | 0 | *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; |
618 | 0 | *errmsg = _("reference to non-existent subpattern"); |
619 | 0 | break; |
620 | 0 | case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
621 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; |
622 | 0 | *errmsg = _("missing ) after comment"); |
623 | 0 | break; |
624 | 0 | case PCRE2_ERROR_PATTERN_TOO_LARGE: |
625 | 0 | *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; |
626 | 0 | *errmsg = _("regular expression is too large"); |
627 | 0 | break; |
628 | 0 | case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
629 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; |
630 | 0 | *errmsg = _("malformed number or name after (?("); |
631 | 0 | break; |
632 | 0 | case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
633 | 0 | *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; |
634 | 0 | *errmsg = _("lookbehind assertion is not fixed length"); |
635 | 0 | break; |
636 | 0 | case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
637 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; |
638 | 0 | *errmsg = _("conditional group contains more than two branches"); |
639 | 0 | break; |
640 | 0 | case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
641 | 0 | *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; |
642 | 0 | *errmsg = _("assertion expected after (?("); |
643 | 0 | break; |
644 | 0 | case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
645 | 0 | *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; |
646 | 0 | *errmsg = _("a numbered reference must not be zero"); |
647 | 0 | break; |
648 | 0 | case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
649 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; |
650 | 0 | *errmsg = _("unknown POSIX class name"); |
651 | 0 | break; |
652 | 0 | case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
653 | 0 | case PCRE2_ERROR_INVALID_HEXADECIMAL: |
654 | 0 | *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; |
655 | 0 | *errmsg = _("character value in \\x{...} sequence is too large"); |
656 | 0 | break; |
657 | 0 | case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
658 | 0 | *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; |
659 | 0 | *errmsg = _("\\C not allowed in lookbehind assertion"); |
660 | 0 | break; |
661 | 0 | case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
662 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; |
663 | 0 | *errmsg = _("missing terminator in subpattern name"); |
664 | 0 | break; |
665 | 0 | case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
666 | 0 | *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; |
667 | 0 | *errmsg = _("two named subpatterns have the same name"); |
668 | 0 | break; |
669 | 0 | case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
670 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; |
671 | 0 | *errmsg = _("malformed \\P or \\p sequence"); |
672 | 0 | break; |
673 | 0 | case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
674 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; |
675 | 0 | *errmsg = _("unknown property name after \\P or \\p"); |
676 | 0 | break; |
677 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
678 | 0 | *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; |
679 | 0 | *errmsg = _("subpattern name is too long (maximum 32 characters)"); |
680 | 0 | break; |
681 | 0 | case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
682 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; |
683 | 0 | *errmsg = _("too many named subpatterns (maximum 10,000)"); |
684 | 0 | break; |
685 | 0 | case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
686 | 0 | *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; |
687 | 0 | *errmsg = _("octal value is greater than \\377"); |
688 | 0 | break; |
689 | 0 | case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
690 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; |
691 | 0 | *errmsg = _("DEFINE group contains more than one branch"); |
692 | 0 | break; |
693 | 0 | case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: |
694 | 0 | *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; |
695 | 0 | *errmsg = _("inconsistent NEWLINE options"); |
696 | 0 | break; |
697 | 0 | case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
698 | 0 | *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; |
699 | 0 | *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " |
700 | 0 | "number, or by a plain number"); |
701 | 0 | break; |
702 | 0 | case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: |
703 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; |
704 | 0 | *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); |
705 | 0 | break; |
706 | 0 | case PCRE2_ERROR_VERB_UNKNOWN: |
707 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; |
708 | 0 | *errmsg = _("(*VERB) not recognized"); |
709 | 0 | break; |
710 | 0 | case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
711 | 0 | *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; |
712 | 0 | *errmsg = _("number is too big"); |
713 | 0 | break; |
714 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
715 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; |
716 | 0 | *errmsg = _("missing subpattern name after (?&"); |
717 | 0 | break; |
718 | 0 | case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
719 | 0 | *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; |
720 | 0 | *errmsg = _("different names for subpatterns of the same number are not allowed"); |
721 | 0 | break; |
722 | 0 | case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
723 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; |
724 | 0 | *errmsg = _("(*MARK) must have an argument"); |
725 | 0 | break; |
726 | 0 | case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
727 | 0 | *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; |
728 | 0 | *errmsg = _( "\\c must be followed by an ASCII character"); |
729 | 0 | break; |
730 | 0 | case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
731 | 0 | *errcode = G_REGEX_ERROR_MISSING_NAME; |
732 | 0 | *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); |
733 | 0 | break; |
734 | 0 | case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
735 | 0 | *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; |
736 | 0 | *errmsg = _("\\N is not supported in a class"); |
737 | 0 | break; |
738 | 0 | case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
739 | 0 | *errcode = G_REGEX_ERROR_NAME_TOO_LONG; |
740 | 0 | *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); |
741 | 0 | break; |
742 | 0 | case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: |
743 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
744 | 0 | *errmsg = _("code overflow"); |
745 | 0 | break; |
746 | 0 | case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
747 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
748 | 0 | *errmsg = _("unrecognized character after (?P"); |
749 | 0 | break; |
750 | 0 | case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: |
751 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
752 | 0 | *errmsg = _("overran compiling workspace"); |
753 | 0 | break; |
754 | 0 | case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: |
755 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
756 | 0 | *errmsg = _("previously-checked referenced subpattern not found"); |
757 | 0 | break; |
758 | 0 | case PCRE2_ERROR_HEAP_FAILED: |
759 | 0 | case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: |
760 | 0 | case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
761 | 0 | case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
762 | 0 | case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
763 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: |
764 | 0 | case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
765 | 0 | case PCRE2_ERROR_INTERNAL_STUDY_ERROR: |
766 | 0 | case PCRE2_ERROR_UTF_IS_DISABLED: |
767 | 0 | case PCRE2_ERROR_UCP_IS_DISABLED: |
768 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: |
769 | 0 | case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
770 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE: |
771 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: |
772 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
773 | 0 | break; |
774 | 0 | case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
775 | 0 | case PCRE2_ERROR_CLASS_INVALID_RANGE: |
776 | 0 | case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
777 | 0 | case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
778 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
779 | 0 | case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
780 | 0 | case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
781 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
782 | 0 | case PCRE2_ERROR_NULL_PATTERN: |
783 | 0 | case PCRE2_ERROR_BAD_OPTIONS: |
784 | 0 | case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
785 | 0 | case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
786 | 0 | case PCRE2_ERROR_INVALID_OCTAL: |
787 | 0 | case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
788 | 0 | case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
789 | 0 | case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
790 | 0 | case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
791 | 0 | case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
792 | 0 | case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
793 | 0 | case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
794 | 0 | case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
795 | 0 | case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
796 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
797 | 0 | case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
798 | 0 | case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
799 | 0 | default: |
800 | 0 | *errcode = G_REGEX_ERROR_COMPILE; |
801 | 0 | break; |
802 | 0 | } |
803 | | |
804 | 0 | g_assert (*errcode != -1); |
805 | 0 | } |
806 | | |
807 | | /* GMatchInfo */ |
808 | | |
809 | | static GMatchInfo * |
810 | | match_info_new (const GRegex *regex, |
811 | | const gchar *string, |
812 | | gint string_len, |
813 | | gint start_position, |
814 | | GRegexMatchFlags match_options, |
815 | | gboolean is_dfa) |
816 | 0 | { |
817 | 0 | GMatchInfo *match_info; |
818 | |
|
819 | 0 | if (string_len < 0) |
820 | 0 | string_len = strlen (string); |
821 | |
|
822 | 0 | match_info = g_new0 (GMatchInfo, 1); |
823 | 0 | match_info->ref_count = 1; |
824 | 0 | match_info->regex = g_regex_ref ((GRegex *)regex); |
825 | 0 | match_info->string = string; |
826 | 0 | match_info->string_len = string_len; |
827 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
828 | 0 | match_info->pos = start_position; |
829 | 0 | match_info->match_opts = |
830 | 0 | get_pcre2_match_options (match_options, regex->orig_compile_opts); |
831 | |
|
832 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, |
833 | 0 | &match_info->n_subpatterns); |
834 | |
|
835 | 0 | match_info->match_context = pcre2_match_context_create (NULL); |
836 | |
|
837 | 0 | if (is_dfa) |
838 | 0 | { |
839 | | /* These values should be enough for most cases, if they are not |
840 | | * enough g_regex_match_all_full() will expand them. */ |
841 | 0 | match_info->n_workspace = 100; |
842 | 0 | match_info->workspace = g_new (gint, match_info->n_workspace); |
843 | 0 | } |
844 | |
|
845 | 0 | match_info->n_offsets = 2; |
846 | 0 | match_info->offsets = g_new0 (gint, match_info->n_offsets); |
847 | | /* Set an invalid position for the previous match. */ |
848 | 0 | match_info->offsets[0] = -1; |
849 | 0 | match_info->offsets[1] = -1; |
850 | |
|
851 | 0 | match_info->match_data = pcre2_match_data_create_from_pattern ( |
852 | 0 | match_info->regex->pcre_re, |
853 | 0 | NULL); |
854 | |
|
855 | 0 | return match_info; |
856 | 0 | } |
857 | | |
858 | | static gboolean |
859 | | recalc_match_offsets (GMatchInfo *match_info, |
860 | | GError **error) |
861 | 0 | { |
862 | 0 | PCRE2_SIZE *ovector; |
863 | 0 | uint32_t ovector_size = 0; |
864 | 0 | uint32_t pre_n_offset; |
865 | 0 | uint32_t i; |
866 | |
|
867 | 0 | g_assert (!IS_PCRE2_ERROR (match_info->matches)); |
868 | | |
869 | 0 | if (match_info->matches == PCRE2_ERROR_PARTIAL) |
870 | 0 | ovector_size = 1; |
871 | 0 | else if (match_info->matches > 0) |
872 | 0 | ovector_size = match_info->matches; |
873 | |
|
874 | 0 | g_assert (ovector_size != 0); |
875 | | |
876 | 0 | if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) |
877 | 0 | { |
878 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
879 | 0 | _("Error while matching regular expression %s: %s"), |
880 | 0 | match_info->regex->pattern, _("code overflow")); |
881 | 0 | return FALSE; |
882 | 0 | } |
883 | | |
884 | 0 | pre_n_offset = match_info->n_offsets; |
885 | 0 | match_info->n_offsets = ovector_size * 2; |
886 | 0 | ovector = pcre2_get_ovector_pointer (match_info->match_data); |
887 | |
|
888 | 0 | if (match_info->n_offsets != pre_n_offset) |
889 | 0 | { |
890 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
891 | 0 | match_info->n_offsets, |
892 | 0 | sizeof (gint)); |
893 | 0 | } |
894 | |
|
895 | 0 | for (i = 0; i < match_info->n_offsets; i++) |
896 | 0 | { |
897 | 0 | match_info->offsets[i] = (int) ovector[i]; |
898 | 0 | } |
899 | |
|
900 | 0 | return TRUE; |
901 | 0 | } |
902 | | |
903 | | static JITStatus |
904 | | enable_jit_with_match_options (GMatchInfo *match_info, |
905 | | uint32_t match_options) |
906 | 0 | { |
907 | 0 | gint retval; |
908 | 0 | uint32_t old_jit_options, new_jit_options; |
909 | |
|
910 | 0 | if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE)) |
911 | 0 | return JIT_STATUS_DISABLED; |
912 | | |
913 | 0 | if (match_info->regex->jit_status == JIT_STATUS_DISABLED) |
914 | 0 | return JIT_STATUS_DISABLED; |
915 | | |
916 | 0 | if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) |
917 | 0 | return JIT_STATUS_DISABLED; |
918 | | |
919 | 0 | old_jit_options = match_info->regex->jit_options; |
920 | 0 | new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; |
921 | 0 | if (match_options & PCRE2_PARTIAL_HARD) |
922 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_HARD; |
923 | 0 | if (match_options & PCRE2_PARTIAL_SOFT) |
924 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; |
925 | | |
926 | | /* no new options enabled */ |
927 | 0 | if (new_jit_options == old_jit_options) |
928 | 0 | { |
929 | 0 | g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT); |
930 | 0 | return match_info->regex->jit_status; |
931 | 0 | } |
932 | | |
933 | 0 | retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options); |
934 | 0 | if (retval == 0) |
935 | 0 | { |
936 | 0 | match_info->regex->jit_status = JIT_STATUS_ENABLED; |
937 | |
|
938 | 0 | match_info->regex->jit_options = new_jit_options; |
939 | | /* Set min stack size for JIT to 32KiB and max to 512KiB */ |
940 | 0 | match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL); |
941 | 0 | pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack); |
942 | 0 | } |
943 | 0 | else |
944 | 0 | { |
945 | 0 | match_info->regex->jit_status = JIT_STATUS_DISABLED; |
946 | |
|
947 | 0 | switch (retval) |
948 | 0 | { |
949 | 0 | case PCRE2_ERROR_NOMEMORY: |
950 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
951 | 0 | "but JIT was unable to allocate executable memory for the " |
952 | 0 | "compiler. Falling back to interpretive code."); |
953 | 0 | break; |
954 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
955 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
956 | 0 | "but JIT support is not available. Falling back to " |
957 | 0 | "interpretive code."); |
958 | 0 | break; |
959 | 0 | default: |
960 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
961 | 0 | "but request for JIT support had unexpectedly failed (error %d). " |
962 | 0 | "Falling back to interpretive code.", |
963 | 0 | retval); |
964 | 0 | break; |
965 | 0 | } |
966 | 0 | } |
967 | | |
968 | 0 | return match_info->regex->jit_status; |
969 | | |
970 | 0 | g_assert_not_reached (); |
971 | 0 | } |
972 | | |
973 | | /** |
974 | | * g_match_info_get_regex: |
975 | | * @match_info: a #GMatchInfo |
976 | | * |
977 | | * Returns #GRegex object used in @match_info. It belongs to GLib |
978 | | * and must not be freed. Use g_regex_ref() if you need to keep it |
979 | | * after you free @match_info object. |
980 | | * |
981 | | * Returns: (transfer none): #GRegex object used in @match_info |
982 | | * |
983 | | * Since: 2.14 |
984 | | */ |
985 | | GRegex * |
986 | | g_match_info_get_regex (const GMatchInfo *match_info) |
987 | 0 | { |
988 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
989 | 0 | return match_info->regex; |
990 | 0 | } |
991 | | |
992 | | /** |
993 | | * g_match_info_get_string: |
994 | | * @match_info: a #GMatchInfo |
995 | | * |
996 | | * Returns the string searched with @match_info. This is the |
997 | | * string passed to g_regex_match() or g_regex_replace() so |
998 | | * you may not free it before calling this function. |
999 | | * |
1000 | | * Returns: the string searched with @match_info |
1001 | | * |
1002 | | * Since: 2.14 |
1003 | | */ |
1004 | | const gchar * |
1005 | | g_match_info_get_string (const GMatchInfo *match_info) |
1006 | 0 | { |
1007 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1008 | 0 | return match_info->string; |
1009 | 0 | } |
1010 | | |
1011 | | /** |
1012 | | * g_match_info_ref: |
1013 | | * @match_info: a #GMatchInfo |
1014 | | * |
1015 | | * Increases reference count of @match_info by 1. |
1016 | | * |
1017 | | * Returns: @match_info |
1018 | | * |
1019 | | * Since: 2.30 |
1020 | | */ |
1021 | | GMatchInfo * |
1022 | | g_match_info_ref (GMatchInfo *match_info) |
1023 | 0 | { |
1024 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1025 | 0 | g_atomic_int_inc (&match_info->ref_count); |
1026 | 0 | return match_info; |
1027 | 0 | } |
1028 | | |
1029 | | /** |
1030 | | * g_match_info_unref: |
1031 | | * @match_info: a #GMatchInfo |
1032 | | * |
1033 | | * Decreases reference count of @match_info by 1. When reference count drops |
1034 | | * to zero, it frees all the memory associated with the match_info structure. |
1035 | | * |
1036 | | * Since: 2.30 |
1037 | | */ |
1038 | | void |
1039 | | g_match_info_unref (GMatchInfo *match_info) |
1040 | 0 | { |
1041 | 0 | if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
1042 | 0 | { |
1043 | 0 | g_regex_unref (match_info->regex); |
1044 | 0 | if (match_info->match_context) |
1045 | 0 | pcre2_match_context_free (match_info->match_context); |
1046 | 0 | if (match_info->jit_stack) |
1047 | 0 | pcre2_jit_stack_free (match_info->jit_stack); |
1048 | 0 | if (match_info->match_data) |
1049 | 0 | pcre2_match_data_free (match_info->match_data); |
1050 | 0 | g_free (match_info->offsets); |
1051 | 0 | g_free (match_info->workspace); |
1052 | 0 | g_free (match_info); |
1053 | 0 | } |
1054 | 0 | } |
1055 | | |
1056 | | /** |
1057 | | * g_match_info_free: |
1058 | | * @match_info: (nullable): a #GMatchInfo, or %NULL |
1059 | | * |
1060 | | * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
1061 | | * nothing. |
1062 | | * |
1063 | | * Since: 2.14 |
1064 | | */ |
1065 | | void |
1066 | | g_match_info_free (GMatchInfo *match_info) |
1067 | 0 | { |
1068 | 0 | if (match_info == NULL) |
1069 | 0 | return; |
1070 | | |
1071 | 0 | g_match_info_unref (match_info); |
1072 | 0 | } |
1073 | | |
1074 | | /** |
1075 | | * g_match_info_next: |
1076 | | * @match_info: a #GMatchInfo structure |
1077 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1078 | | * |
1079 | | * Scans for the next match using the same parameters of the previous |
1080 | | * call to g_regex_match_full() or g_regex_match() that returned |
1081 | | * @match_info. |
1082 | | * |
1083 | | * The match is done on the string passed to the match function, so you |
1084 | | * cannot free it before calling this function. |
1085 | | * |
1086 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
1087 | | * |
1088 | | * Since: 2.14 |
1089 | | */ |
1090 | | gboolean |
1091 | | g_match_info_next (GMatchInfo *match_info, |
1092 | | GError **error) |
1093 | 0 | { |
1094 | 0 | JITStatus jit_status; |
1095 | 0 | gint prev_match_start; |
1096 | 0 | gint prev_match_end; |
1097 | 0 | uint32_t opts; |
1098 | |
|
1099 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1100 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
1101 | 0 | g_return_val_if_fail (match_info->pos >= 0, FALSE); |
1102 | | |
1103 | 0 | prev_match_start = match_info->offsets[0]; |
1104 | 0 | prev_match_end = match_info->offsets[1]; |
1105 | |
|
1106 | 0 | if (match_info->pos > match_info->string_len) |
1107 | 0 | { |
1108 | | /* we have reached the end of the string */ |
1109 | 0 | match_info->pos = -1; |
1110 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1111 | 0 | return FALSE; |
1112 | 0 | } |
1113 | | |
1114 | 0 | opts = match_info->regex->match_opts | match_info->match_opts; |
1115 | |
|
1116 | 0 | jit_status = enable_jit_with_match_options (match_info, opts); |
1117 | 0 | if (jit_status == JIT_STATUS_ENABLED) |
1118 | 0 | { |
1119 | 0 | match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, |
1120 | 0 | (PCRE2_SPTR8) match_info->string, |
1121 | 0 | match_info->string_len, |
1122 | 0 | match_info->pos, |
1123 | 0 | opts, |
1124 | 0 | match_info->match_data, |
1125 | 0 | match_info->match_context); |
1126 | | /* if the JIT stack limit was reached, fall back to non-JIT matching in |
1127 | | * the next conditional statement */ |
1128 | 0 | if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT) |
1129 | 0 | { |
1130 | 0 | g_debug ("PCRE2 JIT stack limit reached, falling back to " |
1131 | 0 | "non-optimized matching."); |
1132 | 0 | opts |= PCRE2_NO_JIT; |
1133 | 0 | jit_status = JIT_STATUS_DISABLED; |
1134 | 0 | } |
1135 | 0 | } |
1136 | |
|
1137 | 0 | if (jit_status != JIT_STATUS_ENABLED) |
1138 | 0 | { |
1139 | 0 | match_info->matches = pcre2_match (match_info->regex->pcre_re, |
1140 | 0 | (PCRE2_SPTR8) match_info->string, |
1141 | 0 | match_info->string_len, |
1142 | 0 | match_info->pos, |
1143 | 0 | opts, |
1144 | 0 | match_info->match_data, |
1145 | 0 | match_info->match_context); |
1146 | 0 | } |
1147 | |
|
1148 | 0 | if (IS_PCRE2_ERROR (match_info->matches)) |
1149 | 0 | { |
1150 | 0 | gchar *error_msg = get_match_error_message (match_info->matches); |
1151 | |
|
1152 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
1153 | 0 | _("Error while matching regular expression %s: %s"), |
1154 | 0 | match_info->regex->pattern, error_msg); |
1155 | 0 | g_clear_pointer (&error_msg, g_free); |
1156 | 0 | return FALSE; |
1157 | 0 | } |
1158 | 0 | else if (match_info->matches == 0) |
1159 | 0 | { |
1160 | | /* info->offsets is too small. */ |
1161 | 0 | match_info->n_offsets *= 2; |
1162 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
1163 | 0 | match_info->n_offsets, |
1164 | 0 | sizeof (gint)); |
1165 | |
|
1166 | 0 | pcre2_match_data_free (match_info->match_data); |
1167 | 0 | match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); |
1168 | |
|
1169 | 0 | return g_match_info_next (match_info, error); |
1170 | 0 | } |
1171 | 0 | else if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1172 | 0 | { |
1173 | | /* We're done with this match info */ |
1174 | 0 | match_info->pos = -1; |
1175 | 0 | return FALSE; |
1176 | 0 | } |
1177 | 0 | else |
1178 | 0 | if (!recalc_match_offsets (match_info, error)) |
1179 | 0 | return FALSE; |
1180 | | |
1181 | | /* avoid infinite loops if the pattern is an empty string or something |
1182 | | * equivalent */ |
1183 | 0 | if (match_info->pos == match_info->offsets[1]) |
1184 | 0 | { |
1185 | 0 | if (match_info->pos > match_info->string_len) |
1186 | 0 | { |
1187 | | /* we have reached the end of the string */ |
1188 | 0 | match_info->pos = -1; |
1189 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1190 | 0 | return FALSE; |
1191 | 0 | } |
1192 | | |
1193 | 0 | match_info->pos = NEXT_CHAR (match_info->regex, |
1194 | 0 | &match_info->string[match_info->pos]) - |
1195 | 0 | match_info->string; |
1196 | 0 | } |
1197 | 0 | else |
1198 | 0 | { |
1199 | 0 | match_info->pos = match_info->offsets[1]; |
1200 | 0 | } |
1201 | | |
1202 | 0 | g_assert (match_info->matches < 0 || |
1203 | 0 | (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); |
1204 | | |
1205 | | /* it's possible to get two identical matches when we are matching |
1206 | | * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and |
1207 | | * the string is "RegExTest" we have: |
1208 | | * - search at position 0: match from 0 to 0 |
1209 | | * - search at position 1: match from 3 to 3 |
1210 | | * - search at position 3: match from 3 to 3 (duplicate) |
1211 | | * - search at position 4: match from 5 to 5 |
1212 | | * - search at position 5: match from 5 to 5 (duplicate) |
1213 | | * - search at position 6: no match -> stop |
1214 | | * so we have to ignore the duplicates. |
1215 | | * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ |
1216 | 0 | if (match_info->matches >= 0 && |
1217 | 0 | prev_match_start == match_info->offsets[0] && |
1218 | 0 | prev_match_end == match_info->offsets[1]) |
1219 | 0 | { |
1220 | | /* ignore this match and search the next one */ |
1221 | 0 | return g_match_info_next (match_info, error); |
1222 | 0 | } |
1223 | | |
1224 | 0 | return match_info->matches >= 0; |
1225 | 0 | } |
1226 | | |
1227 | | /** |
1228 | | * g_match_info_matches: |
1229 | | * @match_info: a #GMatchInfo structure |
1230 | | * |
1231 | | * Returns whether the previous match operation succeeded. |
1232 | | * |
1233 | | * Returns: %TRUE if the previous match operation succeeded, |
1234 | | * %FALSE otherwise |
1235 | | * |
1236 | | * Since: 2.14 |
1237 | | */ |
1238 | | gboolean |
1239 | | g_match_info_matches (const GMatchInfo *match_info) |
1240 | 0 | { |
1241 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1242 | | |
1243 | 0 | return match_info->matches >= 0; |
1244 | 0 | } |
1245 | | |
1246 | | /** |
1247 | | * g_match_info_get_match_count: |
1248 | | * @match_info: a #GMatchInfo structure |
1249 | | * |
1250 | | * Retrieves the number of matched substrings (including substring 0, |
1251 | | * that is the whole matched text), so 1 is returned if the pattern |
1252 | | * has no substrings in it and 0 is returned if the match failed. |
1253 | | * |
1254 | | * If the last match was obtained using the DFA algorithm, that is |
1255 | | * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1256 | | * count is not that of the number of capturing parentheses but that of |
1257 | | * the number of matched substrings. |
1258 | | * |
1259 | | * Returns: Number of matched substrings, or -1 if an error occurred |
1260 | | * |
1261 | | * Since: 2.14 |
1262 | | */ |
1263 | | gint |
1264 | | g_match_info_get_match_count (const GMatchInfo *match_info) |
1265 | 0 | { |
1266 | 0 | g_return_val_if_fail (match_info, -1); |
1267 | | |
1268 | 0 | if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1269 | | /* no match */ |
1270 | 0 | return 0; |
1271 | 0 | else if (match_info->matches < PCRE2_ERROR_NOMATCH) |
1272 | | /* error */ |
1273 | 0 | return -1; |
1274 | 0 | else |
1275 | | /* match */ |
1276 | 0 | return match_info->matches; |
1277 | 0 | } |
1278 | | |
1279 | | /** |
1280 | | * g_match_info_is_partial_match: |
1281 | | * @match_info: a #GMatchInfo structure |
1282 | | * |
1283 | | * Usually if the string passed to g_regex_match*() matches as far as |
1284 | | * it goes, but is too short to match the entire pattern, %FALSE is |
1285 | | * returned. There are circumstances where it might be helpful to |
1286 | | * distinguish this case from other cases in which there is no match. |
1287 | | * |
1288 | | * Consider, for example, an application where a human is required to |
1289 | | * type in data for a field with specific formatting requirements. An |
1290 | | * example might be a date in the form ddmmmyy, defined by the pattern |
1291 | | * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
1292 | | * If the application sees the user’s keystrokes one by one, and can |
1293 | | * check that what has been typed so far is potentially valid, it is |
1294 | | * able to raise an error as soon as a mistake is made. |
1295 | | * |
1296 | | * GRegex supports the concept of partial matching by means of the |
1297 | | * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags. |
1298 | | * When they are used, the return code for |
1299 | | * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
1300 | | * for a complete match, %FALSE otherwise. But, when these functions |
1301 | | * return %FALSE, you can check if the match was partial calling |
1302 | | * g_match_info_is_partial_match(). |
1303 | | * |
1304 | | * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and |
1305 | | * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
1306 | | * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
1307 | | * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching |
1308 | | * stops at the partial match. |
1309 | | * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD |
1310 | | * are set, the latter takes precedence. |
1311 | | * |
1312 | | * There were formerly some restrictions on the pattern for partial matching. |
1313 | | * The restrictions no longer apply. |
1314 | | * |
1315 | | * See pcre2partial(3) for more information on partial matching. |
1316 | | * |
1317 | | * Returns: %TRUE if the match was partial, %FALSE otherwise |
1318 | | * |
1319 | | * Since: 2.14 |
1320 | | */ |
1321 | | gboolean |
1322 | | g_match_info_is_partial_match (const GMatchInfo *match_info) |
1323 | 0 | { |
1324 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1325 | | |
1326 | 0 | return match_info->matches == PCRE2_ERROR_PARTIAL; |
1327 | 0 | } |
1328 | | |
1329 | | /** |
1330 | | * g_match_info_expand_references: |
1331 | | * @match_info: (nullable): a #GMatchInfo or %NULL |
1332 | | * @string_to_expand: the string to expand |
1333 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1334 | | * |
1335 | | * Returns a new string containing the text in @string_to_expand with |
1336 | | * references and escape sequences expanded. References refer to the last |
1337 | | * match done with @string against @regex and have the same syntax used by |
1338 | | * g_regex_replace(). |
1339 | | * |
1340 | | * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was |
1341 | | * passed to g_regex_new(). |
1342 | | * |
1343 | | * The backreferences are extracted from the string passed to the match |
1344 | | * function, so you cannot call this function after freeing the string. |
1345 | | * |
1346 | | * @match_info may be %NULL in which case @string_to_expand must not |
1347 | | * contain references. For instance "foo\n" does not refer to an actual |
1348 | | * pattern and '\n' merely will be replaced with \n character, |
1349 | | * while to expand "\0" (whole match) one needs the result of a match. |
1350 | | * Use g_regex_check_replacement() to find out whether @string_to_expand |
1351 | | * contains references. |
1352 | | * |
1353 | | * Returns: (nullable): the expanded string, or %NULL if an error occurred |
1354 | | * |
1355 | | * Since: 2.14 |
1356 | | */ |
1357 | | gchar * |
1358 | | g_match_info_expand_references (const GMatchInfo *match_info, |
1359 | | const gchar *string_to_expand, |
1360 | | GError **error) |
1361 | 0 | { |
1362 | 0 | GString *result; |
1363 | 0 | GList *list; |
1364 | 0 | GError *tmp_error = NULL; |
1365 | |
|
1366 | 0 | g_return_val_if_fail (string_to_expand != NULL, NULL); |
1367 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1368 | | |
1369 | 0 | list = split_replacement (string_to_expand, &tmp_error); |
1370 | 0 | if (tmp_error != NULL) |
1371 | 0 | { |
1372 | 0 | g_propagate_error (error, tmp_error); |
1373 | 0 | return NULL; |
1374 | 0 | } |
1375 | | |
1376 | 0 | if (!match_info && interpolation_list_needs_match (list)) |
1377 | 0 | { |
1378 | 0 | g_critical ("String '%s' contains references to the match, can't " |
1379 | 0 | "expand references without GMatchInfo object", |
1380 | 0 | string_to_expand); |
1381 | 0 | return NULL; |
1382 | 0 | } |
1383 | | |
1384 | 0 | result = g_string_sized_new (strlen (string_to_expand)); |
1385 | 0 | interpolate_replacement (match_info, result, list); |
1386 | |
|
1387 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
1388 | |
|
1389 | 0 | return g_string_free (result, FALSE); |
1390 | 0 | } |
1391 | | |
1392 | | /** |
1393 | | * g_match_info_fetch: |
1394 | | * @match_info: #GMatchInfo structure |
1395 | | * @match_num: number of the sub expression |
1396 | | * |
1397 | | * Retrieves the text matching the @match_num'th capturing |
1398 | | * parentheses. 0 is the full text of the match, 1 is the first paren |
1399 | | * set, 2 the second, and so on. |
1400 | | * |
1401 | | * If @match_num is a valid sub pattern but it didn't match anything |
1402 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
1403 | | * string is returned. |
1404 | | * |
1405 | | * If the match was obtained using the DFA algorithm, that is using |
1406 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1407 | | * string is not that of a set of parentheses but that of a matched |
1408 | | * substring. Substrings are matched in reverse order of length, so |
1409 | | * 0 is the longest match. |
1410 | | * |
1411 | | * The string is fetched from the string passed to the match function, |
1412 | | * so you cannot call this function after freeing the string. |
1413 | | * |
1414 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1415 | | * occurred. You have to free the string yourself |
1416 | | * |
1417 | | * Since: 2.14 |
1418 | | */ |
1419 | | gchar * |
1420 | | g_match_info_fetch (const GMatchInfo *match_info, |
1421 | | gint match_num) |
1422 | 0 | { |
1423 | 0 | gchar *match = NULL; |
1424 | 0 | gint start, end; |
1425 | |
|
1426 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1427 | 0 | g_return_val_if_fail (match_num >= 0, NULL); |
1428 | | |
1429 | | /* match_num does not exist or it didn't matched, i.e. matching "b" |
1430 | | * against "(a)?b" then group 0 is empty. */ |
1431 | 0 | if (!g_match_info_fetch_pos (match_info, match_num, &start, &end)) |
1432 | 0 | match = NULL; |
1433 | 0 | else if (start == -1) |
1434 | 0 | match = g_strdup (""); |
1435 | 0 | else |
1436 | 0 | match = g_strndup (&match_info->string[start], end - start); |
1437 | |
|
1438 | 0 | return match; |
1439 | 0 | } |
1440 | | |
1441 | | /** |
1442 | | * g_match_info_fetch_pos: |
1443 | | * @match_info: #GMatchInfo structure |
1444 | | * @match_num: number of the sub expression |
1445 | | * @start_pos: (out) (optional): pointer to location where to store |
1446 | | * the start position, or %NULL |
1447 | | * @end_pos: (out) (optional): pointer to location where to store |
1448 | | * the end position, or %NULL |
1449 | | * |
1450 | | * Retrieves the position in bytes of the @match_num'th capturing |
1451 | | * parentheses. 0 is the full text of the match, 1 is the first |
1452 | | * paren set, 2 the second, and so on. |
1453 | | * |
1454 | | * If @match_num is a valid sub pattern but it didn't match anything |
1455 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos |
1456 | | * and @end_pos are set to -1 and %TRUE is returned. |
1457 | | * |
1458 | | * If the match was obtained using the DFA algorithm, that is using |
1459 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1460 | | * position is not that of a set of parentheses but that of a matched |
1461 | | * substring. Substrings are matched in reverse order of length, so |
1462 | | * 0 is the longest match. |
1463 | | * |
1464 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. If |
1465 | | * the position cannot be fetched, @start_pos and @end_pos are left |
1466 | | * unchanged |
1467 | | * |
1468 | | * Since: 2.14 |
1469 | | */ |
1470 | | gboolean |
1471 | | g_match_info_fetch_pos (const GMatchInfo *match_info, |
1472 | | gint match_num, |
1473 | | gint *start_pos, |
1474 | | gint *end_pos) |
1475 | 0 | { |
1476 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1477 | 0 | g_return_val_if_fail (match_num >= 0, FALSE); |
1478 | | |
1479 | | /* check whether there was an error */ |
1480 | 0 | if (match_info->matches < 0) |
1481 | 0 | return FALSE; |
1482 | | |
1483 | | /* make sure the sub expression number they're requesting is less than |
1484 | | * the total number of sub expressions in the regex. When matching all |
1485 | | * (g_regex_match_all()), also compare against the number of matches */ |
1486 | 0 | if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) |
1487 | 0 | return FALSE; |
1488 | | |
1489 | 0 | if (start_pos != NULL) |
1490 | 0 | *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; |
1491 | |
|
1492 | 0 | if (end_pos != NULL) |
1493 | 0 | *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; |
1494 | |
|
1495 | 0 | return TRUE; |
1496 | 0 | } |
1497 | | |
1498 | | /* |
1499 | | * Returns number of first matched subpattern with name @name. |
1500 | | * There may be more than one in case when DUPNAMES is used, |
1501 | | * and not all subpatterns with that name match; |
1502 | | * pcre2_substring_number_from_name() does not work in that case. |
1503 | | */ |
1504 | | static gint |
1505 | | get_matched_substring_number (const GMatchInfo *match_info, |
1506 | | const gchar *name) |
1507 | 0 | { |
1508 | 0 | gint entrysize; |
1509 | 0 | PCRE2_SPTR first, last; |
1510 | 0 | guchar *entry; |
1511 | |
|
1512 | 0 | if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) |
1513 | 0 | return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); |
1514 | | |
1515 | | /* This code is analogous to code from pcre2_substring.c: |
1516 | | * pcre2_substring_get_byname() */ |
1517 | 0 | entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, |
1518 | 0 | (PCRE2_SPTR8) name, |
1519 | 0 | &first, |
1520 | 0 | &last); |
1521 | |
|
1522 | 0 | if (entrysize <= 0) |
1523 | 0 | return entrysize; |
1524 | | |
1525 | 0 | for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
1526 | 0 | { |
1527 | 0 | guint n = (entry[0] << 8) + entry[1]; |
1528 | 0 | if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0) |
1529 | 0 | return n; |
1530 | 0 | } |
1531 | | |
1532 | 0 | return (first[0] << 8) + first[1]; |
1533 | 0 | } |
1534 | | |
1535 | | /** |
1536 | | * g_match_info_fetch_named: |
1537 | | * @match_info: #GMatchInfo structure |
1538 | | * @name: name of the subexpression |
1539 | | * |
1540 | | * Retrieves the text matching the capturing parentheses named @name. |
1541 | | * |
1542 | | * If @name is a valid sub pattern name but it didn't match anything |
1543 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1544 | | * then an empty string is returned. |
1545 | | * |
1546 | | * The string is fetched from the string passed to the match function, |
1547 | | * so you cannot call this function after freeing the string. |
1548 | | * |
1549 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1550 | | * occurred. You have to free the string yourself |
1551 | | * |
1552 | | * Since: 2.14 |
1553 | | */ |
1554 | | gchar * |
1555 | | g_match_info_fetch_named (const GMatchInfo *match_info, |
1556 | | const gchar *name) |
1557 | 0 | { |
1558 | 0 | gint num; |
1559 | |
|
1560 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1561 | 0 | g_return_val_if_fail (name != NULL, NULL); |
1562 | | |
1563 | 0 | num = get_matched_substring_number (match_info, name); |
1564 | 0 | if (num < 0) |
1565 | 0 | return NULL; |
1566 | 0 | else |
1567 | 0 | return g_match_info_fetch (match_info, num); |
1568 | 0 | } |
1569 | | |
1570 | | /** |
1571 | | * g_match_info_fetch_named_pos: |
1572 | | * @match_info: #GMatchInfo structure |
1573 | | * @name: name of the subexpression |
1574 | | * @start_pos: (out) (optional): pointer to location where to store |
1575 | | * the start position, or %NULL |
1576 | | * @end_pos: (out) (optional): pointer to location where to store |
1577 | | * the end position, or %NULL |
1578 | | * |
1579 | | * Retrieves the position in bytes of the capturing parentheses named @name. |
1580 | | * |
1581 | | * If @name is a valid sub pattern name but it didn't match anything |
1582 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1583 | | * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
1584 | | * |
1585 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
1586 | | * If the position cannot be fetched, @start_pos and @end_pos |
1587 | | * are left unchanged. |
1588 | | * |
1589 | | * Since: 2.14 |
1590 | | */ |
1591 | | gboolean |
1592 | | g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
1593 | | const gchar *name, |
1594 | | gint *start_pos, |
1595 | | gint *end_pos) |
1596 | 0 | { |
1597 | 0 | gint num; |
1598 | |
|
1599 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1600 | 0 | g_return_val_if_fail (name != NULL, FALSE); |
1601 | | |
1602 | 0 | num = get_matched_substring_number (match_info, name); |
1603 | 0 | if (num < 0) |
1604 | 0 | return FALSE; |
1605 | | |
1606 | 0 | return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); |
1607 | 0 | } |
1608 | | |
1609 | | /** |
1610 | | * g_match_info_fetch_all: |
1611 | | * @match_info: a #GMatchInfo structure |
1612 | | * |
1613 | | * Bundles up pointers to each of the matching substrings from a match |
1614 | | * and stores them in an array of gchar pointers. The first element in |
1615 | | * the returned array is the match number 0, i.e. the entire matched |
1616 | | * text. |
1617 | | * |
1618 | | * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
1619 | | * "b" against "(a)?b") then an empty string is inserted. |
1620 | | * |
1621 | | * If the last match was obtained using the DFA algorithm, that is using |
1622 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1623 | | * strings are not that matched by sets of parentheses but that of the |
1624 | | * matched substring. Substrings are matched in reverse order of length, |
1625 | | * so the first one is the longest match. |
1626 | | * |
1627 | | * The strings are fetched from the string passed to the match function, |
1628 | | * so you cannot call this function after freeing the string. |
1629 | | * |
1630 | | * Returns: (transfer full): a %NULL-terminated array of gchar * |
1631 | | * pointers. It must be freed using g_strfreev(). If the previous |
1632 | | * match failed %NULL is returned |
1633 | | * |
1634 | | * Since: 2.14 |
1635 | | */ |
1636 | | gchar ** |
1637 | | g_match_info_fetch_all (const GMatchInfo *match_info) |
1638 | 0 | { |
1639 | 0 | gchar **result; |
1640 | 0 | gint i; |
1641 | |
|
1642 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1643 | | |
1644 | 0 | if (match_info->matches < 0) |
1645 | 0 | return NULL; |
1646 | | |
1647 | 0 | result = g_new (gchar *, match_info->matches + 1); |
1648 | 0 | for (i = 0; i < match_info->matches; i++) |
1649 | 0 | result[i] = g_match_info_fetch (match_info, i); |
1650 | 0 | result[i] = NULL; |
1651 | |
|
1652 | 0 | return result; |
1653 | 0 | } |
1654 | | |
1655 | | |
1656 | | /* GRegex */ |
1657 | | |
1658 | | G_DEFINE_QUARK (g-regex-error-quark, g_regex_error) |
1659 | | |
1660 | | /** |
1661 | | * g_regex_ref: |
1662 | | * @regex: a #GRegex |
1663 | | * |
1664 | | * Increases reference count of @regex by 1. |
1665 | | * |
1666 | | * Returns: @regex |
1667 | | * |
1668 | | * Since: 2.14 |
1669 | | */ |
1670 | | GRegex * |
1671 | | g_regex_ref (GRegex *regex) |
1672 | 0 | { |
1673 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1674 | 0 | g_atomic_int_inc (&regex->ref_count); |
1675 | 0 | return regex; |
1676 | 0 | } |
1677 | | |
1678 | | /** |
1679 | | * g_regex_unref: |
1680 | | * @regex: a #GRegex |
1681 | | * |
1682 | | * Decreases reference count of @regex by 1. When reference count drops |
1683 | | * to zero, it frees all the memory associated with the regex structure. |
1684 | | * |
1685 | | * Since: 2.14 |
1686 | | */ |
1687 | | void |
1688 | | g_regex_unref (GRegex *regex) |
1689 | 0 | { |
1690 | 0 | g_return_if_fail (regex != NULL); |
1691 | | |
1692 | 0 | if (g_atomic_int_dec_and_test (&regex->ref_count)) |
1693 | 0 | { |
1694 | 0 | g_free (regex->pattern); |
1695 | 0 | if (regex->pcre_re != NULL) |
1696 | 0 | pcre2_code_free (regex->pcre_re); |
1697 | 0 | g_free (regex); |
1698 | 0 | } |
1699 | 0 | } |
1700 | | |
1701 | | static pcre2_code * regex_compile (const gchar *pattern, |
1702 | | uint32_t compile_options, |
1703 | | uint32_t newline_options, |
1704 | | uint32_t bsr_options, |
1705 | | GError **error); |
1706 | | |
1707 | | static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, |
1708 | | uint32_t compile_options); |
1709 | | |
1710 | | /** |
1711 | | * g_regex_new: |
1712 | | * @pattern: the regular expression |
1713 | | * @compile_options: compile options for the regular expression, or 0 |
1714 | | * @match_options: match options for the regular expression, or 0 |
1715 | | * @error: return location for a #GError |
1716 | | * |
1717 | | * Compiles the regular expression to an internal form, and does |
1718 | | * the initial setup of the #GRegex structure. |
1719 | | * |
1720 | | * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
1721 | | * g_regex_unref() when you are done with it |
1722 | | * |
1723 | | * Since: 2.14 |
1724 | | */ |
1725 | | GRegex * |
1726 | | g_regex_new (const gchar *pattern, |
1727 | | GRegexCompileFlags compile_options, |
1728 | | GRegexMatchFlags match_options, |
1729 | | GError **error) |
1730 | 0 | { |
1731 | 0 | GRegex *regex; |
1732 | 0 | pcre2_code *re; |
1733 | 0 | static gsize initialised = 0; |
1734 | 0 | uint32_t pcre_compile_options; |
1735 | 0 | uint32_t pcre_match_options; |
1736 | 0 | uint32_t newline_options; |
1737 | 0 | uint32_t bsr_options; |
1738 | |
|
1739 | 0 | g_return_val_if_fail (pattern != NULL, NULL); |
1740 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1741 | 0 | G_GNUC_BEGIN_IGNORE_DEPRECATIONS |
1742 | 0 | g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | |
1743 | 0 | G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); |
1744 | 0 | G_GNUC_END_IGNORE_DEPRECATIONS |
1745 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
1746 | | |
1747 | 0 | if (g_once_init_enter (&initialised)) |
1748 | 0 | { |
1749 | 0 | int supports_utf8; |
1750 | |
|
1751 | 0 | pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); |
1752 | 0 | if (!supports_utf8) |
1753 | 0 | g_critical (_("PCRE library is compiled without UTF8 support")); |
1754 | |
|
1755 | 0 | g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); |
1756 | 0 | } |
1757 | |
|
1758 | 0 | if (G_UNLIKELY (initialised != 1)) |
1759 | 0 | { |
1760 | 0 | g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, |
1761 | 0 | _("PCRE library is compiled with incompatible options")); |
1762 | 0 | return NULL; |
1763 | 0 | } |
1764 | | |
1765 | 0 | pcre_compile_options = get_pcre2_compile_options (compile_options); |
1766 | 0 | pcre_match_options = get_pcre2_match_options (match_options, compile_options); |
1767 | |
|
1768 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
1769 | 0 | if (newline_options == 0) |
1770 | 0 | newline_options = get_pcre2_newline_compile_options (compile_options); |
1771 | |
|
1772 | 0 | if (newline_options == 0) |
1773 | 0 | { |
1774 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1775 | 0 | "Invalid newline flags"); |
1776 | 0 | return NULL; |
1777 | 0 | } |
1778 | | |
1779 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
1780 | 0 | if (!bsr_options) |
1781 | 0 | bsr_options = get_pcre2_bsr_compile_options (compile_options); |
1782 | |
|
1783 | 0 | re = regex_compile (pattern, pcre_compile_options, |
1784 | 0 | newline_options, bsr_options, error); |
1785 | 0 | if (re == NULL) |
1786 | 0 | return NULL; |
1787 | | |
1788 | 0 | pcre_compile_options |= |
1789 | 0 | get_pcre2_inline_compile_options (re, pcre_compile_options); |
1790 | |
|
1791 | 0 | regex = g_new0 (GRegex, 1); |
1792 | 0 | regex->ref_count = 1; |
1793 | 0 | regex->pattern = g_strdup (pattern); |
1794 | 0 | regex->pcre_re = re; |
1795 | 0 | regex->compile_opts = pcre_compile_options; |
1796 | 0 | regex->orig_compile_opts = compile_options; |
1797 | 0 | regex->match_opts = pcre_match_options; |
1798 | 0 | regex->orig_match_opts = match_options; |
1799 | |
|
1800 | 0 | return regex; |
1801 | 0 | } |
1802 | | |
1803 | | static pcre2_code * |
1804 | | regex_compile (const gchar *pattern, |
1805 | | uint32_t compile_options, |
1806 | | uint32_t newline_options, |
1807 | | uint32_t bsr_options, |
1808 | | GError **error) |
1809 | 0 | { |
1810 | 0 | pcre2_code *re; |
1811 | 0 | pcre2_compile_context *context; |
1812 | 0 | const gchar *errmsg; |
1813 | 0 | PCRE2_SIZE erroffset; |
1814 | 0 | gint errcode; |
1815 | |
|
1816 | 0 | context = pcre2_compile_context_create (NULL); |
1817 | | |
1818 | | /* set newline options */ |
1819 | 0 | if (pcre2_set_newline (context, newline_options) != 0) |
1820 | 0 | { |
1821 | 0 | g_set_error (error, G_REGEX_ERROR, |
1822 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1823 | 0 | "Invalid newline flags"); |
1824 | 0 | pcre2_compile_context_free (context); |
1825 | 0 | return NULL; |
1826 | 0 | } |
1827 | | |
1828 | | /* set bsr options */ |
1829 | 0 | if (pcre2_set_bsr (context, bsr_options) != 0) |
1830 | 0 | { |
1831 | 0 | g_set_error (error, G_REGEX_ERROR, |
1832 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1833 | 0 | "Invalid BSR flags"); |
1834 | 0 | pcre2_compile_context_free (context); |
1835 | 0 | return NULL; |
1836 | 0 | } |
1837 | | |
1838 | | /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ |
1839 | 0 | if (compile_options & PCRE2_UTF) |
1840 | 0 | compile_options |= PCRE2_NO_UTF_CHECK; |
1841 | |
|
1842 | 0 | compile_options |= PCRE2_UCP; |
1843 | | |
1844 | | /* compile the pattern */ |
1845 | 0 | re = pcre2_compile ((PCRE2_SPTR8) pattern, |
1846 | 0 | PCRE2_ZERO_TERMINATED, |
1847 | 0 | compile_options, |
1848 | 0 | &errcode, |
1849 | 0 | &erroffset, |
1850 | 0 | context); |
1851 | 0 | pcre2_compile_context_free (context); |
1852 | | |
1853 | | /* if the compilation failed, set the error member and return |
1854 | | * immediately */ |
1855 | 0 | if (re == NULL) |
1856 | 0 | { |
1857 | 0 | GError *tmp_error; |
1858 | 0 | gchar *offset_str; |
1859 | 0 | gchar *pcre2_errmsg = NULL; |
1860 | 0 | int original_errcode; |
1861 | | |
1862 | | /* Translate the PCRE error code to GRegexError and use a translated |
1863 | | * error message if possible */ |
1864 | 0 | original_errcode = errcode; |
1865 | 0 | translate_compile_error (&errcode, &errmsg); |
1866 | |
|
1867 | 0 | if (!errmsg) |
1868 | 0 | { |
1869 | 0 | errmsg = _("unknown error"); |
1870 | 0 | pcre2_errmsg = get_pcre2_error_string (original_errcode); |
1871 | 0 | } |
1872 | | |
1873 | | /* PCRE uses byte offsets but we want to show character offsets */ |
1874 | 0 | erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); |
1875 | |
|
1876 | 0 | offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); |
1877 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, errcode, |
1878 | 0 | _("Error while compiling regular expression ‘%s’ " |
1879 | 0 | "at char %s: %s"), |
1880 | 0 | pattern, offset_str, |
1881 | 0 | pcre2_errmsg ? pcre2_errmsg : errmsg); |
1882 | 0 | g_propagate_error (error, tmp_error); |
1883 | 0 | g_free (offset_str); |
1884 | 0 | g_clear_pointer (&pcre2_errmsg, g_free); |
1885 | |
|
1886 | 0 | return NULL; |
1887 | 0 | } |
1888 | | |
1889 | 0 | return re; |
1890 | 0 | } |
1891 | | |
1892 | | static uint32_t |
1893 | | get_pcre2_inline_compile_options (pcre2_code *re, |
1894 | | uint32_t compile_options) |
1895 | 0 | { |
1896 | 0 | uint32_t pcre_compile_options; |
1897 | 0 | uint32_t nonpcre_compile_options; |
1898 | | |
1899 | | /* For options set at the beginning of the pattern, pcre puts them into |
1900 | | * compile options, e.g. "(?i)foo" will make the pcre structure store |
1901 | | * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ |
1902 | 0 | nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
1903 | 0 | pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); |
1904 | 0 | compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; |
1905 | 0 | compile_options |= nonpcre_compile_options; |
1906 | |
|
1907 | 0 | if (!(compile_options & PCRE2_DUPNAMES)) |
1908 | 0 | { |
1909 | 0 | uint32_t jchanged = 0; |
1910 | 0 | pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); |
1911 | 0 | if (jchanged) |
1912 | 0 | compile_options |= PCRE2_DUPNAMES; |
1913 | 0 | } |
1914 | |
|
1915 | 0 | return compile_options; |
1916 | 0 | } |
1917 | | |
1918 | | /** |
1919 | | * g_regex_get_pattern: |
1920 | | * @regex: a #GRegex structure |
1921 | | * |
1922 | | * Gets the pattern string associated with @regex, i.e. a copy of |
1923 | | * the string passed to g_regex_new(). |
1924 | | * |
1925 | | * Returns: the pattern of @regex |
1926 | | * |
1927 | | * Since: 2.14 |
1928 | | */ |
1929 | | const gchar * |
1930 | | g_regex_get_pattern (const GRegex *regex) |
1931 | 0 | { |
1932 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1933 | | |
1934 | 0 | return regex->pattern; |
1935 | 0 | } |
1936 | | |
1937 | | /** |
1938 | | * g_regex_get_max_backref: |
1939 | | * @regex: a #GRegex |
1940 | | * |
1941 | | * Returns the number of the highest back reference |
1942 | | * in the pattern, or 0 if the pattern does not contain |
1943 | | * back references. |
1944 | | * |
1945 | | * Returns: the number of the highest back reference |
1946 | | * |
1947 | | * Since: 2.14 |
1948 | | */ |
1949 | | gint |
1950 | | g_regex_get_max_backref (const GRegex *regex) |
1951 | 0 | { |
1952 | 0 | uint32_t value; |
1953 | |
|
1954 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); |
1955 | |
|
1956 | 0 | return value; |
1957 | 0 | } |
1958 | | |
1959 | | /** |
1960 | | * g_regex_get_capture_count: |
1961 | | * @regex: a #GRegex |
1962 | | * |
1963 | | * Returns the number of capturing subpatterns in the pattern. |
1964 | | * |
1965 | | * Returns: the number of capturing subpatterns |
1966 | | * |
1967 | | * Since: 2.14 |
1968 | | */ |
1969 | | gint |
1970 | | g_regex_get_capture_count (const GRegex *regex) |
1971 | 0 | { |
1972 | 0 | uint32_t value; |
1973 | |
|
1974 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); |
1975 | |
|
1976 | 0 | return value; |
1977 | 0 | } |
1978 | | |
1979 | | /** |
1980 | | * g_regex_get_has_cr_or_lf: |
1981 | | * @regex: a #GRegex structure |
1982 | | * |
1983 | | * Checks whether the pattern contains explicit CR or LF references. |
1984 | | * |
1985 | | * Returns: %TRUE if the pattern contains explicit CR or LF references |
1986 | | * |
1987 | | * Since: 2.34 |
1988 | | */ |
1989 | | gboolean |
1990 | | g_regex_get_has_cr_or_lf (const GRegex *regex) |
1991 | 0 | { |
1992 | 0 | uint32_t value; |
1993 | |
|
1994 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); |
1995 | |
|
1996 | 0 | return !!value; |
1997 | 0 | } |
1998 | | |
1999 | | /** |
2000 | | * g_regex_get_max_lookbehind: |
2001 | | * @regex: a #GRegex structure |
2002 | | * |
2003 | | * Gets the number of characters in the longest lookbehind assertion in the |
2004 | | * pattern. This information is useful when doing multi-segment matching using |
2005 | | * the partial matching facilities. |
2006 | | * |
2007 | | * Returns: the number of characters in the longest lookbehind assertion. |
2008 | | * |
2009 | | * Since: 2.38 |
2010 | | */ |
2011 | | gint |
2012 | | g_regex_get_max_lookbehind (const GRegex *regex) |
2013 | 0 | { |
2014 | 0 | uint32_t max_lookbehind; |
2015 | |
|
2016 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, |
2017 | 0 | &max_lookbehind); |
2018 | |
|
2019 | 0 | return max_lookbehind; |
2020 | 0 | } |
2021 | | |
2022 | | /** |
2023 | | * g_regex_get_compile_flags: |
2024 | | * @regex: a #GRegex |
2025 | | * |
2026 | | * Returns the compile options that @regex was created with. |
2027 | | * |
2028 | | * Depending on the version of PCRE that is used, this may or may not |
2029 | | * include flags set by option expressions such as `(?i)` found at the |
2030 | | * top-level within the compiled pattern. |
2031 | | * |
2032 | | * Returns: flags from #GRegexCompileFlags |
2033 | | * |
2034 | | * Since: 2.26 |
2035 | | */ |
2036 | | GRegexCompileFlags |
2037 | | g_regex_get_compile_flags (const GRegex *regex) |
2038 | 0 | { |
2039 | 0 | GRegexCompileFlags extra_flags; |
2040 | 0 | uint32_t info_value; |
2041 | |
|
2042 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2043 | | |
2044 | | /* Preserve original G_REGEX_OPTIMIZE */ |
2045 | 0 | extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); |
2046 | | |
2047 | | /* Also include the newline options */ |
2048 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); |
2049 | 0 | switch (info_value) |
2050 | 0 | { |
2051 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
2052 | 0 | extra_flags |= G_REGEX_NEWLINE_ANYCRLF; |
2053 | 0 | break; |
2054 | 0 | case PCRE2_NEWLINE_CRLF: |
2055 | 0 | extra_flags |= G_REGEX_NEWLINE_CRLF; |
2056 | 0 | break; |
2057 | 0 | case PCRE2_NEWLINE_LF: |
2058 | 0 | extra_flags |= G_REGEX_NEWLINE_LF; |
2059 | 0 | break; |
2060 | 0 | case PCRE2_NEWLINE_CR: |
2061 | 0 | extra_flags |= G_REGEX_NEWLINE_CR; |
2062 | 0 | break; |
2063 | 0 | default: |
2064 | 0 | break; |
2065 | 0 | } |
2066 | | |
2067 | | /* Also include the bsr options */ |
2068 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); |
2069 | 0 | switch (info_value) |
2070 | 0 | { |
2071 | 0 | case PCRE2_BSR_ANYCRLF: |
2072 | 0 | extra_flags |= G_REGEX_BSR_ANYCRLF; |
2073 | 0 | break; |
2074 | 0 | default: |
2075 | 0 | break; |
2076 | 0 | } |
2077 | | |
2078 | 0 | return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; |
2079 | 0 | } |
2080 | | |
2081 | | /** |
2082 | | * g_regex_get_match_flags: |
2083 | | * @regex: a #GRegex |
2084 | | * |
2085 | | * Returns the match options that @regex was created with. |
2086 | | * |
2087 | | * Returns: flags from #GRegexMatchFlags |
2088 | | * |
2089 | | * Since: 2.26 |
2090 | | */ |
2091 | | GRegexMatchFlags |
2092 | | g_regex_get_match_flags (const GRegex *regex) |
2093 | 0 | { |
2094 | 0 | uint32_t flags; |
2095 | |
|
2096 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2097 | | |
2098 | 0 | flags = g_regex_match_flags_from_pcre2 (regex->match_opts); |
2099 | 0 | flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); |
2100 | 0 | flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); |
2101 | |
|
2102 | 0 | return flags; |
2103 | 0 | } |
2104 | | |
2105 | | /** |
2106 | | * g_regex_match_simple: |
2107 | | * @pattern: the regular expression |
2108 | | * @string: the string to scan for matches |
2109 | | * @compile_options: compile options for the regular expression, or 0 |
2110 | | * @match_options: match options, or 0 |
2111 | | * |
2112 | | * Scans for a match in @string for @pattern. |
2113 | | * |
2114 | | * This function is equivalent to g_regex_match() but it does not |
2115 | | * require to compile the pattern with g_regex_new(), avoiding some |
2116 | | * lines of code when you need just to do a match without extracting |
2117 | | * substrings, capture counts, and so on. |
2118 | | * |
2119 | | * If this function is to be called on the same @pattern more than |
2120 | | * once, it's more efficient to compile the pattern once with |
2121 | | * g_regex_new() and then use g_regex_match(). |
2122 | | * |
2123 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2124 | | * |
2125 | | * Since: 2.14 |
2126 | | */ |
2127 | | gboolean |
2128 | | g_regex_match_simple (const gchar *pattern, |
2129 | | const gchar *string, |
2130 | | GRegexCompileFlags compile_options, |
2131 | | GRegexMatchFlags match_options) |
2132 | 0 | { |
2133 | 0 | GRegex *regex; |
2134 | 0 | gboolean result; |
2135 | |
|
2136 | 0 | regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); |
2137 | 0 | if (!regex) |
2138 | 0 | return FALSE; |
2139 | 0 | result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); |
2140 | 0 | g_regex_unref (regex); |
2141 | 0 | return result; |
2142 | 0 | } |
2143 | | |
2144 | | /** |
2145 | | * g_regex_match: |
2146 | | * @regex: a #GRegex structure from g_regex_new() |
2147 | | * @string: the string to scan for matches |
2148 | | * @match_options: match options |
2149 | | * @match_info: (out) (optional): pointer to location where to store |
2150 | | * the #GMatchInfo, or %NULL if you do not need it |
2151 | | * |
2152 | | * Scans for a match in @string for the pattern in @regex. |
2153 | | * The @match_options are combined with the match options specified |
2154 | | * when the @regex structure was created, letting you have more |
2155 | | * flexibility in reusing #GRegex structures. |
2156 | | * |
2157 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2158 | | * |
2159 | | * A #GMatchInfo structure, used to get information on the match, |
2160 | | * is stored in @match_info if not %NULL. Note that if @match_info |
2161 | | * is not %NULL then it is created even if the function returns %FALSE, |
2162 | | * i.e. you must free it regardless of whether the regular expression actually matched. |
2163 | | * |
2164 | | * To retrieve all the non-overlapping matches of the pattern in |
2165 | | * string you can use g_match_info_next(). |
2166 | | * |
2167 | | * |[<!-- language="C" --> |
2168 | | * static void |
2169 | | * print_uppercase_words (const gchar *string) |
2170 | | * { |
2171 | | * // Print all uppercase-only words. |
2172 | | * GRegex *regex; |
2173 | | * GMatchInfo *match_info; |
2174 | | * |
2175 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2176 | | * g_regex_match (regex, string, 0, &match_info); |
2177 | | * while (g_match_info_matches (match_info)) |
2178 | | * { |
2179 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2180 | | * g_print ("Found: %s\n", word); |
2181 | | * g_free (word); |
2182 | | * g_match_info_next (match_info, NULL); |
2183 | | * } |
2184 | | * g_match_info_free (match_info); |
2185 | | * g_regex_unref (regex); |
2186 | | * } |
2187 | | * ]| |
2188 | | * |
2189 | | * @string is not copied and is used in #GMatchInfo internally. If |
2190 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2191 | | * freeing or modifying @string then the behaviour is undefined. |
2192 | | * |
2193 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2194 | | * |
2195 | | * Since: 2.14 |
2196 | | */ |
2197 | | gboolean |
2198 | | g_regex_match (const GRegex *regex, |
2199 | | const gchar *string, |
2200 | | GRegexMatchFlags match_options, |
2201 | | GMatchInfo **match_info) |
2202 | 0 | { |
2203 | 0 | return g_regex_match_full (regex, string, -1, 0, match_options, |
2204 | 0 | match_info, NULL); |
2205 | 0 | } |
2206 | | |
2207 | | /** |
2208 | | * g_regex_match_full: |
2209 | | * @regex: a #GRegex structure from g_regex_new() |
2210 | | * @string: (array length=string_len): the string to scan for matches |
2211 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2212 | | * @start_position: starting index of the string to match, in bytes |
2213 | | * @match_options: match options |
2214 | | * @match_info: (out) (optional): pointer to location where to store |
2215 | | * the #GMatchInfo, or %NULL if you do not need it |
2216 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2217 | | * |
2218 | | * Scans for a match in @string for the pattern in @regex. |
2219 | | * The @match_options are combined with the match options specified |
2220 | | * when the @regex structure was created, letting you have more |
2221 | | * flexibility in reusing #GRegex structures. |
2222 | | * |
2223 | | * Setting @start_position differs from just passing over a shortened |
2224 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2225 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2226 | | * |
2227 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2228 | | * |
2229 | | * A #GMatchInfo structure, used to get information on the match, is |
2230 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2231 | | * not %NULL then it is created even if the function returns %FALSE, |
2232 | | * i.e. you must free it regardless of whether the regular expression |
2233 | | * actually matched. |
2234 | | * |
2235 | | * @string is not copied and is used in #GMatchInfo internally. If |
2236 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2237 | | * freeing or modifying @string then the behaviour is undefined. |
2238 | | * |
2239 | | * To retrieve all the non-overlapping matches of the pattern in |
2240 | | * string you can use g_match_info_next(). |
2241 | | * |
2242 | | * |[<!-- language="C" --> |
2243 | | * static void |
2244 | | * print_uppercase_words (const gchar *string) |
2245 | | * { |
2246 | | * // Print all uppercase-only words. |
2247 | | * GRegex *regex; |
2248 | | * GMatchInfo *match_info; |
2249 | | * GError *error = NULL; |
2250 | | * |
2251 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2252 | | * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
2253 | | * while (g_match_info_matches (match_info)) |
2254 | | * { |
2255 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2256 | | * g_print ("Found: %s\n", word); |
2257 | | * g_free (word); |
2258 | | * g_match_info_next (match_info, &error); |
2259 | | * } |
2260 | | * g_match_info_free (match_info); |
2261 | | * g_regex_unref (regex); |
2262 | | * if (error != NULL) |
2263 | | * { |
2264 | | * g_printerr ("Error while matching: %s\n", error->message); |
2265 | | * g_error_free (error); |
2266 | | * } |
2267 | | * } |
2268 | | * ]| |
2269 | | * |
2270 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2271 | | * |
2272 | | * Since: 2.14 |
2273 | | */ |
2274 | | gboolean |
2275 | | g_regex_match_full (const GRegex *regex, |
2276 | | const gchar *string, |
2277 | | gssize string_len, |
2278 | | gint start_position, |
2279 | | GRegexMatchFlags match_options, |
2280 | | GMatchInfo **match_info, |
2281 | | GError **error) |
2282 | 0 | { |
2283 | 0 | GMatchInfo *info; |
2284 | 0 | gboolean match_ok; |
2285 | |
|
2286 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2287 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2288 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2289 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2290 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2291 | | |
2292 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2293 | 0 | match_options, FALSE); |
2294 | 0 | match_ok = g_match_info_next (info, error); |
2295 | 0 | if (match_info != NULL) |
2296 | 0 | *match_info = info; |
2297 | 0 | else |
2298 | 0 | g_match_info_free (info); |
2299 | |
|
2300 | 0 | return match_ok; |
2301 | 0 | } |
2302 | | |
2303 | | /** |
2304 | | * g_regex_match_all: |
2305 | | * @regex: a #GRegex structure from g_regex_new() |
2306 | | * @string: the string to scan for matches |
2307 | | * @match_options: match options |
2308 | | * @match_info: (out) (optional): pointer to location where to store |
2309 | | * the #GMatchInfo, or %NULL if you do not need it |
2310 | | * |
2311 | | * Using the standard algorithm for regular expression matching only |
2312 | | * the longest match in the string is retrieved. This function uses |
2313 | | * a different algorithm so it can retrieve all the possible matches. |
2314 | | * For more documentation see g_regex_match_all_full(). |
2315 | | * |
2316 | | * A #GMatchInfo structure, used to get information on the match, is |
2317 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2318 | | * not %NULL then it is created even if the function returns %FALSE, |
2319 | | * i.e. you must free it regardless of whether the regular expression |
2320 | | * actually matched. |
2321 | | * |
2322 | | * @string is not copied and is used in #GMatchInfo internally. If |
2323 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2324 | | * freeing or modifying @string then the behaviour is undefined. |
2325 | | * |
2326 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2327 | | * |
2328 | | * Since: 2.14 |
2329 | | */ |
2330 | | gboolean |
2331 | | g_regex_match_all (const GRegex *regex, |
2332 | | const gchar *string, |
2333 | | GRegexMatchFlags match_options, |
2334 | | GMatchInfo **match_info) |
2335 | 0 | { |
2336 | 0 | return g_regex_match_all_full (regex, string, -1, 0, match_options, |
2337 | 0 | match_info, NULL); |
2338 | 0 | } |
2339 | | |
2340 | | /** |
2341 | | * g_regex_match_all_full: |
2342 | | * @regex: a #GRegex structure from g_regex_new() |
2343 | | * @string: (array length=string_len): the string to scan for matches |
2344 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2345 | | * @start_position: starting index of the string to match, in bytes |
2346 | | * @match_options: match options |
2347 | | * @match_info: (out) (optional): pointer to location where to store |
2348 | | * the #GMatchInfo, or %NULL if you do not need it |
2349 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2350 | | * |
2351 | | * Using the standard algorithm for regular expression matching only |
2352 | | * the longest match in the @string is retrieved, it is not possible |
2353 | | * to obtain all the available matches. For instance matching |
2354 | | * "<a> <b> <c>" against the pattern "<.*>" |
2355 | | * you get "<a> <b> <c>". |
2356 | | * |
2357 | | * This function uses a different algorithm (called DFA, i.e. deterministic |
2358 | | * finite automaton), so it can retrieve all the possible matches, all |
2359 | | * starting at the same point in the string. For instance matching |
2360 | | * "<a> <b> <c>" against the pattern "<.*>" |
2361 | | * you would obtain three matches: "<a> <b> <c>", |
2362 | | * "<a> <b>" and "<a>". |
2363 | | * |
2364 | | * The number of matched strings is retrieved using |
2365 | | * g_match_info_get_match_count(). To obtain the matched strings and |
2366 | | * their position you can use, respectively, g_match_info_fetch() and |
2367 | | * g_match_info_fetch_pos(). Note that the strings are returned in |
2368 | | * reverse order of length; that is, the longest matching string is |
2369 | | * given first. |
2370 | | * |
2371 | | * Note that the DFA algorithm is slower than the standard one and it |
2372 | | * is not able to capture substrings, so backreferences do not work. |
2373 | | * |
2374 | | * Setting @start_position differs from just passing over a shortened |
2375 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2376 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2377 | | * |
2378 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2379 | | * |
2380 | | * A #GMatchInfo structure, used to get information on the match, is |
2381 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2382 | | * not %NULL then it is created even if the function returns %FALSE, |
2383 | | * i.e. you must free it regardless if regular expression actually |
2384 | | * matched. |
2385 | | * |
2386 | | * @string is not copied and is used in #GMatchInfo internally. If |
2387 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2388 | | * freeing or modifying @string then the behaviour is undefined. |
2389 | | * |
2390 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2391 | | * |
2392 | | * Since: 2.14 |
2393 | | */ |
2394 | | gboolean |
2395 | | g_regex_match_all_full (const GRegex *regex, |
2396 | | const gchar *string, |
2397 | | gssize string_len, |
2398 | | gint start_position, |
2399 | | GRegexMatchFlags match_options, |
2400 | | GMatchInfo **match_info, |
2401 | | GError **error) |
2402 | 0 | { |
2403 | 0 | GMatchInfo *info; |
2404 | 0 | gboolean done; |
2405 | 0 | pcre2_code *pcre_re; |
2406 | 0 | gboolean retval; |
2407 | 0 | uint32_t newline_options; |
2408 | 0 | uint32_t bsr_options; |
2409 | |
|
2410 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2411 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2412 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2413 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2414 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2415 | | |
2416 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
2417 | 0 | if (!newline_options) |
2418 | 0 | newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); |
2419 | |
|
2420 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
2421 | 0 | if (!bsr_options) |
2422 | 0 | bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); |
2423 | | |
2424 | | /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an |
2425 | | * optimization for normal regex matching, but results in omitting some |
2426 | | * shorter matches here, and an observable behaviour change. |
2427 | | * |
2428 | | * DFA matching is rather niche, and very rarely used according to |
2429 | | * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
2430 | 0 | pcre_re = regex_compile (regex->pattern, |
2431 | 0 | regex->compile_opts | PCRE2_NO_AUTO_POSSESS, |
2432 | 0 | newline_options, bsr_options, error); |
2433 | 0 | if (pcre_re == NULL) |
2434 | 0 | return FALSE; |
2435 | | |
2436 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2437 | 0 | match_options, TRUE); |
2438 | |
|
2439 | 0 | done = FALSE; |
2440 | 0 | while (!done) |
2441 | 0 | { |
2442 | 0 | done = TRUE; |
2443 | 0 | info->matches = pcre2_dfa_match (pcre_re, |
2444 | 0 | (PCRE2_SPTR8) info->string, info->string_len, |
2445 | 0 | info->pos, |
2446 | 0 | (regex->match_opts | info->match_opts), |
2447 | 0 | info->match_data, |
2448 | 0 | info->match_context, |
2449 | 0 | info->workspace, info->n_workspace); |
2450 | 0 | if (info->matches == PCRE2_ERROR_DFA_WSSIZE) |
2451 | 0 | { |
2452 | | /* info->workspace is too small. */ |
2453 | 0 | info->n_workspace *= 2; |
2454 | 0 | info->workspace = g_realloc_n (info->workspace, |
2455 | 0 | info->n_workspace, |
2456 | 0 | sizeof (gint)); |
2457 | 0 | done = FALSE; |
2458 | 0 | } |
2459 | 0 | else if (info->matches == 0) |
2460 | 0 | { |
2461 | | /* info->offsets is too small. */ |
2462 | 0 | info->n_offsets *= 2; |
2463 | 0 | info->offsets = g_realloc_n (info->offsets, |
2464 | 0 | info->n_offsets, |
2465 | 0 | sizeof (gint)); |
2466 | 0 | pcre2_match_data_free (info->match_data); |
2467 | 0 | info->match_data = pcre2_match_data_create (info->n_offsets, NULL); |
2468 | 0 | done = FALSE; |
2469 | 0 | } |
2470 | 0 | else if (IS_PCRE2_ERROR (info->matches)) |
2471 | 0 | { |
2472 | 0 | gchar *error_msg = get_match_error_message (info->matches); |
2473 | |
|
2474 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
2475 | 0 | _("Error while matching regular expression %s: %s"), |
2476 | 0 | regex->pattern, error_msg); |
2477 | 0 | g_clear_pointer (&error_msg, g_free); |
2478 | 0 | } |
2479 | 0 | else if (info->matches != PCRE2_ERROR_NOMATCH) |
2480 | 0 | { |
2481 | 0 | if (!recalc_match_offsets (info, error)) |
2482 | 0 | info->matches = PCRE2_ERROR_NOMATCH; |
2483 | 0 | } |
2484 | 0 | } |
2485 | |
|
2486 | 0 | pcre2_code_free (pcre_re); |
2487 | | |
2488 | | /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only |
2489 | | * holds true for a single match, rather than matching all */ |
2490 | | |
2491 | | /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
2492 | 0 | info->pos = -1; |
2493 | 0 | retval = info->matches >= 0; |
2494 | |
|
2495 | 0 | if (match_info != NULL) |
2496 | 0 | *match_info = info; |
2497 | 0 | else |
2498 | 0 | g_match_info_free (info); |
2499 | |
|
2500 | 0 | return retval; |
2501 | 0 | } |
2502 | | |
2503 | | /** |
2504 | | * g_regex_get_string_number: |
2505 | | * @regex: #GRegex structure |
2506 | | * @name: name of the subexpression |
2507 | | * |
2508 | | * Retrieves the number of the subexpression named @name. |
2509 | | * |
2510 | | * Returns: The number of the subexpression or -1 if @name |
2511 | | * does not exist |
2512 | | * |
2513 | | * Since: 2.14 |
2514 | | */ |
2515 | | gint |
2516 | | g_regex_get_string_number (const GRegex *regex, |
2517 | | const gchar *name) |
2518 | 0 | { |
2519 | 0 | gint num; |
2520 | |
|
2521 | 0 | g_return_val_if_fail (regex != NULL, -1); |
2522 | 0 | g_return_val_if_fail (name != NULL, -1); |
2523 | | |
2524 | 0 | num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); |
2525 | 0 | if (num == PCRE2_ERROR_NOSUBSTRING) |
2526 | 0 | num = -1; |
2527 | |
|
2528 | 0 | return num; |
2529 | 0 | } |
2530 | | |
2531 | | /** |
2532 | | * g_regex_split_simple: |
2533 | | * @pattern: the regular expression |
2534 | | * @string: the string to scan for matches |
2535 | | * @compile_options: compile options for the regular expression, or 0 |
2536 | | * @match_options: match options, or 0 |
2537 | | * |
2538 | | * Breaks the string on the pattern, and returns an array of |
2539 | | * the tokens. If the pattern contains capturing parentheses, |
2540 | | * then the text for each of the substrings will also be returned. |
2541 | | * If the pattern does not match anywhere in the string, then the |
2542 | | * whole string is returned as the first token. |
2543 | | * |
2544 | | * This function is equivalent to g_regex_split() but it does |
2545 | | * not require to compile the pattern with g_regex_new(), avoiding |
2546 | | * some lines of code when you need just to do a split without |
2547 | | * extracting substrings, capture counts, and so on. |
2548 | | * |
2549 | | * If this function is to be called on the same @pattern more than |
2550 | | * once, it's more efficient to compile the pattern once with |
2551 | | * g_regex_new() and then use g_regex_split(). |
2552 | | * |
2553 | | * As a special case, the result of splitting the empty string "" |
2554 | | * is an empty vector, not a vector containing a single string. |
2555 | | * The reason for this special case is that being able to represent |
2556 | | * an empty vector is typically more useful than consistent handling |
2557 | | * of empty elements. If you do need to represent empty elements, |
2558 | | * you'll need to check for the empty string before calling this |
2559 | | * function. |
2560 | | * |
2561 | | * A pattern that can match empty strings splits @string into |
2562 | | * separate characters wherever it matches the empty string between |
2563 | | * characters. For example splitting "ab c" using as a separator |
2564 | | * "\s*", you will get "a", "b" and "c". |
2565 | | * |
2566 | | * Returns: (transfer full): a %NULL-terminated array of strings. Free |
2567 | | * it using g_strfreev() |
2568 | | * |
2569 | | * Since: 2.14 |
2570 | | **/ |
2571 | | gchar ** |
2572 | | g_regex_split_simple (const gchar *pattern, |
2573 | | const gchar *string, |
2574 | | GRegexCompileFlags compile_options, |
2575 | | GRegexMatchFlags match_options) |
2576 | 0 | { |
2577 | 0 | GRegex *regex; |
2578 | 0 | gchar **result; |
2579 | |
|
2580 | 0 | regex = g_regex_new (pattern, compile_options, 0, NULL); |
2581 | 0 | if (!regex) |
2582 | 0 | return NULL; |
2583 | | |
2584 | 0 | result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); |
2585 | 0 | g_regex_unref (regex); |
2586 | 0 | return result; |
2587 | 0 | } |
2588 | | |
2589 | | /** |
2590 | | * g_regex_split: |
2591 | | * @regex: a #GRegex structure |
2592 | | * @string: the string to split with the pattern |
2593 | | * @match_options: match time option flags |
2594 | | * |
2595 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2596 | | * If the pattern contains capturing parentheses, then the text for each |
2597 | | * of the substrings will also be returned. If the pattern does not match |
2598 | | * anywhere in the string, then the whole string is returned as the first |
2599 | | * token. |
2600 | | * |
2601 | | * As a special case, the result of splitting the empty string "" is an |
2602 | | * empty vector, not a vector containing a single string. The reason for |
2603 | | * this special case is that being able to represent an empty vector is |
2604 | | * typically more useful than consistent handling of empty elements. If |
2605 | | * you do need to represent empty elements, you'll need to check for the |
2606 | | * empty string before calling this function. |
2607 | | * |
2608 | | * A pattern that can match empty strings splits @string into separate |
2609 | | * characters wherever it matches the empty string between characters. |
2610 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2611 | | * "a", "b" and "c". |
2612 | | * |
2613 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2614 | | * it using g_strfreev() |
2615 | | * |
2616 | | * Since: 2.14 |
2617 | | **/ |
2618 | | gchar ** |
2619 | | g_regex_split (const GRegex *regex, |
2620 | | const gchar *string, |
2621 | | GRegexMatchFlags match_options) |
2622 | 0 | { |
2623 | 0 | return g_regex_split_full (regex, string, -1, 0, |
2624 | 0 | match_options, 0, NULL); |
2625 | 0 | } |
2626 | | |
2627 | | /** |
2628 | | * g_regex_split_full: |
2629 | | * @regex: a #GRegex structure |
2630 | | * @string: (array length=string_len): the string to split with the pattern |
2631 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2632 | | * @start_position: starting index of the string to match, in bytes |
2633 | | * @match_options: match time option flags |
2634 | | * @max_tokens: the maximum number of tokens to split @string into. |
2635 | | * If this is less than 1, the string is split completely |
2636 | | * @error: return location for a #GError |
2637 | | * |
2638 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2639 | | * If the pattern contains capturing parentheses, then the text for each |
2640 | | * of the substrings will also be returned. If the pattern does not match |
2641 | | * anywhere in the string, then the whole string is returned as the first |
2642 | | * token. |
2643 | | * |
2644 | | * As a special case, the result of splitting the empty string "" is an |
2645 | | * empty vector, not a vector containing a single string. The reason for |
2646 | | * this special case is that being able to represent an empty vector is |
2647 | | * typically more useful than consistent handling of empty elements. If |
2648 | | * you do need to represent empty elements, you'll need to check for the |
2649 | | * empty string before calling this function. |
2650 | | * |
2651 | | * A pattern that can match empty strings splits @string into separate |
2652 | | * characters wherever it matches the empty string between characters. |
2653 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2654 | | * "a", "b" and "c". |
2655 | | * |
2656 | | * Setting @start_position differs from just passing over a shortened |
2657 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2658 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2659 | | * |
2660 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2661 | | * it using g_strfreev() |
2662 | | * |
2663 | | * Since: 2.14 |
2664 | | **/ |
2665 | | gchar ** |
2666 | | g_regex_split_full (const GRegex *regex, |
2667 | | const gchar *string, |
2668 | | gssize string_len, |
2669 | | gint start_position, |
2670 | | GRegexMatchFlags match_options, |
2671 | | gint max_tokens, |
2672 | | GError **error) |
2673 | 0 | { |
2674 | 0 | GError *tmp_error = NULL; |
2675 | 0 | GMatchInfo *match_info; |
2676 | 0 | GList *list, *last; |
2677 | 0 | gint i; |
2678 | 0 | gint token_count; |
2679 | 0 | gboolean match_ok; |
2680 | | /* position of the last separator. */ |
2681 | 0 | gint last_separator_end; |
2682 | | /* was the last match 0 bytes long? */ |
2683 | 0 | gboolean last_match_is_empty; |
2684 | | /* the returned array of char **s */ |
2685 | 0 | gchar **string_list; |
2686 | |
|
2687 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
2688 | 0 | g_return_val_if_fail (string != NULL, NULL); |
2689 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
2690 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2691 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2692 | | |
2693 | 0 | if (max_tokens <= 0) |
2694 | 0 | max_tokens = G_MAXINT; |
2695 | |
|
2696 | 0 | if (string_len < 0) |
2697 | 0 | string_len = strlen (string); |
2698 | | |
2699 | | /* zero-length string */ |
2700 | 0 | if (string_len - start_position == 0) |
2701 | 0 | return g_new0 (gchar *, 1); |
2702 | | |
2703 | 0 | if (max_tokens == 1) |
2704 | 0 | { |
2705 | 0 | string_list = g_new0 (gchar *, 2); |
2706 | 0 | string_list[0] = g_strndup (&string[start_position], |
2707 | 0 | string_len - start_position); |
2708 | 0 | return string_list; |
2709 | 0 | } |
2710 | | |
2711 | 0 | list = NULL; |
2712 | 0 | token_count = 0; |
2713 | 0 | last_separator_end = start_position; |
2714 | 0 | last_match_is_empty = FALSE; |
2715 | |
|
2716 | 0 | match_ok = g_regex_match_full (regex, string, string_len, start_position, |
2717 | 0 | match_options, &match_info, &tmp_error); |
2718 | |
|
2719 | 0 | while (tmp_error == NULL) |
2720 | 0 | { |
2721 | 0 | if (match_ok) |
2722 | 0 | { |
2723 | 0 | last_match_is_empty = |
2724 | 0 | (match_info->offsets[0] == match_info->offsets[1]); |
2725 | | |
2726 | | /* we need to skip empty separators at the same position of the end |
2727 | | * of another separator. e.g. the string is "a b" and the separator |
2728 | | * is " *", so from 1 to 2 we have a match and at position 2 we have |
2729 | | * an empty match. */ |
2730 | 0 | if (last_separator_end != match_info->offsets[1]) |
2731 | 0 | { |
2732 | 0 | gchar *token; |
2733 | 0 | gint match_count; |
2734 | |
|
2735 | 0 | token = g_strndup (string + last_separator_end, |
2736 | 0 | match_info->offsets[0] - last_separator_end); |
2737 | 0 | list = g_list_prepend (list, token); |
2738 | 0 | token_count++; |
2739 | | |
2740 | | /* if there were substrings, these need to be added to |
2741 | | * the list. */ |
2742 | 0 | match_count = g_match_info_get_match_count (match_info); |
2743 | 0 | if (match_count > 1) |
2744 | 0 | { |
2745 | 0 | for (i = 1; i < match_count; i++) |
2746 | 0 | list = g_list_prepend (list, g_match_info_fetch (match_info, i)); |
2747 | 0 | } |
2748 | 0 | } |
2749 | 0 | } |
2750 | 0 | else |
2751 | 0 | { |
2752 | | /* if there was no match, copy to end of string. */ |
2753 | 0 | if (!last_match_is_empty) |
2754 | 0 | { |
2755 | 0 | gchar *token = g_strndup (string + last_separator_end, |
2756 | 0 | match_info->string_len - last_separator_end); |
2757 | 0 | list = g_list_prepend (list, token); |
2758 | 0 | } |
2759 | | /* no more tokens, end the loop. */ |
2760 | 0 | break; |
2761 | 0 | } |
2762 | | |
2763 | | /* -1 to leave room for the last part. */ |
2764 | 0 | if (token_count >= max_tokens - 1) |
2765 | 0 | { |
2766 | | /* we have reached the maximum number of tokens, so we copy |
2767 | | * the remaining part of the string. */ |
2768 | 0 | if (last_match_is_empty) |
2769 | 0 | { |
2770 | | /* the last match was empty, so we have moved one char |
2771 | | * after the real position to avoid empty matches at the |
2772 | | * same position. */ |
2773 | 0 | match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
2774 | 0 | } |
2775 | | /* the if is needed in the case we have terminated the available |
2776 | | * tokens, but we are at the end of the string, so there are no |
2777 | | * characters left to copy. */ |
2778 | 0 | if (string_len > match_info->pos) |
2779 | 0 | { |
2780 | 0 | gchar *token = g_strndup (string + match_info->pos, |
2781 | 0 | string_len - match_info->pos); |
2782 | 0 | list = g_list_prepend (list, token); |
2783 | 0 | } |
2784 | | /* end the loop. */ |
2785 | 0 | break; |
2786 | 0 | } |
2787 | | |
2788 | 0 | last_separator_end = match_info->pos; |
2789 | 0 | if (last_match_is_empty) |
2790 | | /* if the last match was empty, g_match_info_next() has moved |
2791 | | * forward to avoid infinite loops, but we still need to copy that |
2792 | | * character. */ |
2793 | 0 | last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
2794 | |
|
2795 | 0 | match_ok = g_match_info_next (match_info, &tmp_error); |
2796 | 0 | } |
2797 | 0 | g_match_info_free (match_info); |
2798 | 0 | if (tmp_error != NULL) |
2799 | 0 | { |
2800 | 0 | g_propagate_error (error, tmp_error); |
2801 | 0 | g_list_free_full (list, g_free); |
2802 | 0 | return NULL; |
2803 | 0 | } |
2804 | | |
2805 | 0 | string_list = g_new (gchar *, g_list_length (list) + 1); |
2806 | 0 | i = 0; |
2807 | 0 | for (last = g_list_last (list); last; last = g_list_previous (last)) |
2808 | 0 | string_list[i++] = last->data; |
2809 | 0 | string_list[i] = NULL; |
2810 | 0 | g_list_free (list); |
2811 | |
|
2812 | 0 | return string_list; |
2813 | 0 | } |
2814 | | |
/* Kind of one parsed piece of a replacement string; stored in
 * InterpolationData.type. */
enum
{
  REPL_TYPE_STRING = 0,             /* text to append, taken from InterpolationData.text */
  REPL_TYPE_CHARACTER = 1,          /* single character, taken from InterpolationData.c */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* named group reference; the name is in .text */
  REPL_TYPE_NUMERIC_REFERENCE = 3,  /* numbered group reference; the number is in .num */
  REPL_TYPE_CHANGE_CASE = 4         /* switches the case mode to .change_case */
};
2823 | | |
/* Case-conversion mode applied while interpolating a replacement.  The
 * values are distinct bits so that the *_MASK members can test for a group
 * of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 1 << 0, /* \E: leave text as it is */
  CHANGE_CASE_UPPER        = 1 << 1, /* \U: upper-case following text */
  CHANGE_CASE_LOWER        = 1 << 2, /* \L: lower-case following text */
  CHANGE_CASE_UPPER_SINGLE = 1 << 3, /* \u: upper-case the next character only */
  CHANGE_CASE_LOWER_SINGLE = 1 << 4, /* \l: lower-case the next character only */

  /* either of the one-character modes */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* either of the lower-casing modes */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* either of the upper-casing modes */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2835 | | |
2836 | | struct _InterpolationData |
2837 | | { |
2838 | | gchar *text; |
2839 | | gint type; |
2840 | | gint num; |
2841 | | gchar c; |
2842 | | ChangeCase change_case; |
2843 | | }; |
2844 | | |
2845 | | static void |
2846 | | free_interpolation_data (InterpolationData *data) |
2847 | 0 | { |
2848 | 0 | g_free (data->text); |
2849 | 0 | g_free (data); |
2850 | 0 | } |
2851 | | |
2852 | | static const gchar * |
2853 | | expand_escape (const gchar *replacement, |
2854 | | const gchar *p, |
2855 | | InterpolationData *data, |
2856 | | GError **error) |
2857 | 0 | { |
2858 | 0 | const gchar *q, *r; |
2859 | 0 | gint x, d, h, i; |
2860 | 0 | const gchar *error_detail; |
2861 | 0 | gint base = 0; |
2862 | 0 | GError *tmp_error = NULL; |
2863 | |
|
2864 | 0 | p++; |
2865 | 0 | switch (*p) |
2866 | 0 | { |
2867 | 0 | case 't': |
2868 | 0 | p++; |
2869 | 0 | data->c = '\t'; |
2870 | 0 | data->type = REPL_TYPE_CHARACTER; |
2871 | 0 | break; |
2872 | 0 | case 'n': |
2873 | 0 | p++; |
2874 | 0 | data->c = '\n'; |
2875 | 0 | data->type = REPL_TYPE_CHARACTER; |
2876 | 0 | break; |
2877 | 0 | case 'v': |
2878 | 0 | p++; |
2879 | 0 | data->c = '\v'; |
2880 | 0 | data->type = REPL_TYPE_CHARACTER; |
2881 | 0 | break; |
2882 | 0 | case 'r': |
2883 | 0 | p++; |
2884 | 0 | data->c = '\r'; |
2885 | 0 | data->type = REPL_TYPE_CHARACTER; |
2886 | 0 | break; |
2887 | 0 | case 'f': |
2888 | 0 | p++; |
2889 | 0 | data->c = '\f'; |
2890 | 0 | data->type = REPL_TYPE_CHARACTER; |
2891 | 0 | break; |
2892 | 0 | case 'a': |
2893 | 0 | p++; |
2894 | 0 | data->c = '\a'; |
2895 | 0 | data->type = REPL_TYPE_CHARACTER; |
2896 | 0 | break; |
2897 | 0 | case 'b': |
2898 | 0 | p++; |
2899 | 0 | data->c = '\b'; |
2900 | 0 | data->type = REPL_TYPE_CHARACTER; |
2901 | 0 | break; |
2902 | 0 | case '\\': |
2903 | 0 | p++; |
2904 | 0 | data->c = '\\'; |
2905 | 0 | data->type = REPL_TYPE_CHARACTER; |
2906 | 0 | break; |
2907 | 0 | case 'x': |
2908 | 0 | p++; |
2909 | 0 | x = 0; |
2910 | 0 | if (*p == '{') |
2911 | 0 | { |
2912 | 0 | p++; |
2913 | 0 | do |
2914 | 0 | { |
2915 | 0 | h = g_ascii_xdigit_value (*p); |
2916 | 0 | if (h < 0) |
2917 | 0 | { |
2918 | 0 | error_detail = _("hexadecimal digit or “}” expected"); |
2919 | 0 | goto error; |
2920 | 0 | } |
2921 | 0 | x = x * 16 + h; |
2922 | 0 | p++; |
2923 | 0 | } |
2924 | 0 | while (*p != '}'); |
2925 | 0 | p++; |
2926 | 0 | } |
2927 | 0 | else |
2928 | 0 | { |
2929 | 0 | for (i = 0; i < 2; i++) |
2930 | 0 | { |
2931 | 0 | h = g_ascii_xdigit_value (*p); |
2932 | 0 | if (h < 0) |
2933 | 0 | { |
2934 | 0 | error_detail = _("hexadecimal digit expected"); |
2935 | 0 | goto error; |
2936 | 0 | } |
2937 | 0 | x = x * 16 + h; |
2938 | 0 | p++; |
2939 | 0 | } |
2940 | 0 | } |
2941 | 0 | data->type = REPL_TYPE_STRING; |
2942 | 0 | data->text = g_new0 (gchar, 8); |
2943 | 0 | g_unichar_to_utf8 (x, data->text); |
2944 | 0 | break; |
2945 | 0 | case 'l': |
2946 | 0 | p++; |
2947 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2948 | 0 | data->change_case = CHANGE_CASE_LOWER_SINGLE; |
2949 | 0 | break; |
2950 | 0 | case 'u': |
2951 | 0 | p++; |
2952 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2953 | 0 | data->change_case = CHANGE_CASE_UPPER_SINGLE; |
2954 | 0 | break; |
2955 | 0 | case 'L': |
2956 | 0 | p++; |
2957 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2958 | 0 | data->change_case = CHANGE_CASE_LOWER; |
2959 | 0 | break; |
2960 | 0 | case 'U': |
2961 | 0 | p++; |
2962 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2963 | 0 | data->change_case = CHANGE_CASE_UPPER; |
2964 | 0 | break; |
2965 | 0 | case 'E': |
2966 | 0 | p++; |
2967 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2968 | 0 | data->change_case = CHANGE_CASE_NONE; |
2969 | 0 | break; |
2970 | 0 | case 'g': |
2971 | 0 | p++; |
2972 | 0 | if (*p != '<') |
2973 | 0 | { |
2974 | 0 | error_detail = _("missing “<” in symbolic reference"); |
2975 | 0 | goto error; |
2976 | 0 | } |
2977 | 0 | q = p + 1; |
2978 | 0 | do |
2979 | 0 | { |
2980 | 0 | p++; |
2981 | 0 | if (!*p) |
2982 | 0 | { |
2983 | 0 | error_detail = _("unfinished symbolic reference"); |
2984 | 0 | goto error; |
2985 | 0 | } |
2986 | 0 | } |
2987 | 0 | while (*p != '>'); |
2988 | 0 | if (p - q == 0) |
2989 | 0 | { |
2990 | 0 | error_detail = _("zero-length symbolic reference"); |
2991 | 0 | goto error; |
2992 | 0 | } |
2993 | 0 | if (g_ascii_isdigit (*q)) |
2994 | 0 | { |
2995 | 0 | x = 0; |
2996 | 0 | do |
2997 | 0 | { |
2998 | 0 | h = g_ascii_digit_value (*q); |
2999 | 0 | if (h < 0) |
3000 | 0 | { |
3001 | 0 | error_detail = _("digit expected"); |
3002 | 0 | p = q; |
3003 | 0 | goto error; |
3004 | 0 | } |
3005 | 0 | x = x * 10 + h; |
3006 | 0 | q++; |
3007 | 0 | } |
3008 | 0 | while (q != p); |
3009 | 0 | data->num = x; |
3010 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3011 | 0 | } |
3012 | 0 | else |
3013 | 0 | { |
3014 | 0 | r = q; |
3015 | 0 | do |
3016 | 0 | { |
3017 | 0 | if (!g_ascii_isalnum (*r)) |
3018 | 0 | { |
3019 | 0 | error_detail = _("illegal symbolic reference"); |
3020 | 0 | p = r; |
3021 | 0 | goto error; |
3022 | 0 | } |
3023 | 0 | r++; |
3024 | 0 | } |
3025 | 0 | while (r != p); |
3026 | 0 | data->text = g_strndup (q, p - q); |
3027 | 0 | data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
3028 | 0 | } |
3029 | 0 | p++; |
3030 | 0 | break; |
3031 | 0 | case '0': |
3032 | | /* if \0 is followed by a number is an octal number representing a |
3033 | | * character, else it is a numeric reference. */ |
3034 | 0 | if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) |
3035 | 0 | { |
3036 | 0 | base = 8; |
3037 | 0 | p = g_utf8_next_char (p); |
3038 | 0 | } |
3039 | 0 | G_GNUC_FALLTHROUGH; |
3040 | 0 | case '1': |
3041 | 0 | case '2': |
3042 | 0 | case '3': |
3043 | 0 | case '4': |
3044 | 0 | case '5': |
3045 | 0 | case '6': |
3046 | 0 | case '7': |
3047 | 0 | case '8': |
3048 | 0 | case '9': |
3049 | 0 | x = 0; |
3050 | 0 | d = 0; |
3051 | 0 | for (i = 0; i < 3; i++) |
3052 | 0 | { |
3053 | 0 | h = g_ascii_digit_value (*p); |
3054 | 0 | if (h < 0) |
3055 | 0 | break; |
3056 | 0 | if (h > 7) |
3057 | 0 | { |
3058 | 0 | if (base == 8) |
3059 | 0 | break; |
3060 | 0 | else |
3061 | 0 | base = 10; |
3062 | 0 | } |
3063 | 0 | if (i == 2 && base == 10) |
3064 | 0 | break; |
3065 | 0 | x = x * 8 + h; |
3066 | 0 | d = d * 10 + h; |
3067 | 0 | p++; |
3068 | 0 | } |
3069 | 0 | if (base == 8 || i == 3) |
3070 | 0 | { |
3071 | 0 | data->type = REPL_TYPE_STRING; |
3072 | 0 | data->text = g_new0 (gchar, 8); |
3073 | 0 | g_unichar_to_utf8 (x, data->text); |
3074 | 0 | } |
3075 | 0 | else |
3076 | 0 | { |
3077 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3078 | 0 | data->num = d; |
3079 | 0 | } |
3080 | 0 | break; |
3081 | 0 | case 0: |
3082 | 0 | error_detail = _("stray final “\\”"); |
3083 | 0 | goto error; |
3084 | 0 | break; |
3085 | 0 | default: |
3086 | 0 | error_detail = _("unknown escape sequence"); |
3087 | 0 | goto error; |
3088 | 0 | } |
3089 | | |
3090 | 0 | return p; |
3091 | | |
3092 | 0 | error: |
3093 | | /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
3094 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, |
3095 | 0 | G_REGEX_ERROR_REPLACE, |
3096 | 0 | _("Error while parsing replacement " |
3097 | 0 | "text “%s” at char %lu: %s"), |
3098 | 0 | replacement, |
3099 | 0 | (gulong)(p - replacement), |
3100 | 0 | error_detail); |
3101 | 0 | g_propagate_error (error, tmp_error); |
3102 | |
|
3103 | 0 | return NULL; |
3104 | 0 | } |
3105 | | |
3106 | | static GList * |
3107 | | split_replacement (const gchar *replacement, |
3108 | | GError **error) |
3109 | 0 | { |
3110 | 0 | GList *list = NULL; |
3111 | 0 | InterpolationData *data; |
3112 | 0 | const gchar *p, *start; |
3113 | |
|
3114 | 0 | start = p = replacement; |
3115 | 0 | while (*p) |
3116 | 0 | { |
3117 | 0 | if (*p == '\\') |
3118 | 0 | { |
3119 | 0 | data = g_new0 (InterpolationData, 1); |
3120 | 0 | start = p = expand_escape (replacement, p, data, error); |
3121 | 0 | if (p == NULL) |
3122 | 0 | { |
3123 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3124 | 0 | free_interpolation_data (data); |
3125 | |
|
3126 | 0 | return NULL; |
3127 | 0 | } |
3128 | 0 | list = g_list_prepend (list, data); |
3129 | 0 | } |
3130 | 0 | else |
3131 | 0 | { |
3132 | 0 | p++; |
3133 | 0 | if (*p == '\\' || *p == '\0') |
3134 | 0 | { |
3135 | 0 | if (p - start > 0) |
3136 | 0 | { |
3137 | 0 | data = g_new0 (InterpolationData, 1); |
3138 | 0 | data->text = g_strndup (start, p - start); |
3139 | 0 | data->type = REPL_TYPE_STRING; |
3140 | 0 | list = g_list_prepend (list, data); |
3141 | 0 | } |
3142 | 0 | } |
3143 | 0 | } |
3144 | 0 | } |
3145 | | |
3146 | 0 | return g_list_reverse (list); |
3147 | 0 | } |
3148 | | |
/* Converts the case of the character c according to change_case: lower-case
 * when any CHANGE_CASE_LOWER_MASK bit is set, upper-case for every other
 * value (note that this includes CHANGE_CASE_NONE, so callers that want
 * "no change" must not go through this macro). */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))
3154 | | |
3155 | | static void |
3156 | | string_append (GString *string, |
3157 | | const gchar *text, |
3158 | | ChangeCase *change_case) |
3159 | 0 | { |
3160 | 0 | gunichar c; |
3161 | |
|
3162 | 0 | if (text[0] == '\0') |
3163 | 0 | return; |
3164 | | |
3165 | 0 | if (*change_case == CHANGE_CASE_NONE) |
3166 | 0 | { |
3167 | 0 | g_string_append (string, text); |
3168 | 0 | } |
3169 | 0 | else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
3170 | 0 | { |
3171 | 0 | c = g_utf8_get_char (text); |
3172 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3173 | 0 | g_string_append (string, g_utf8_next_char (text)); |
3174 | 0 | *change_case = CHANGE_CASE_NONE; |
3175 | 0 | } |
3176 | 0 | else |
3177 | 0 | { |
3178 | 0 | while (*text != '\0') |
3179 | 0 | { |
3180 | 0 | c = g_utf8_get_char (text); |
3181 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3182 | 0 | text = g_utf8_next_char (text); |
3183 | 0 | } |
3184 | 0 | } |
3185 | 0 | } |
3186 | | |
3187 | | static gboolean |
3188 | | interpolate_replacement (const GMatchInfo *match_info, |
3189 | | GString *result, |
3190 | | gpointer data) |
3191 | 0 | { |
3192 | 0 | GList *list; |
3193 | 0 | InterpolationData *idata; |
3194 | 0 | gchar *match; |
3195 | 0 | ChangeCase change_case = CHANGE_CASE_NONE; |
3196 | |
|
3197 | 0 | for (list = data; list; list = list->next) |
3198 | 0 | { |
3199 | 0 | idata = list->data; |
3200 | 0 | switch (idata->type) |
3201 | 0 | { |
3202 | 0 | case REPL_TYPE_STRING: |
3203 | 0 | string_append (result, idata->text, &change_case); |
3204 | 0 | break; |
3205 | 0 | case REPL_TYPE_CHARACTER: |
3206 | 0 | g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
3207 | 0 | if (change_case & CHANGE_CASE_SINGLE_MASK) |
3208 | 0 | change_case = CHANGE_CASE_NONE; |
3209 | 0 | break; |
3210 | 0 | case REPL_TYPE_NUMERIC_REFERENCE: |
3211 | 0 | match = g_match_info_fetch (match_info, idata->num); |
3212 | 0 | if (match) |
3213 | 0 | { |
3214 | 0 | string_append (result, match, &change_case); |
3215 | 0 | g_free (match); |
3216 | 0 | } |
3217 | 0 | break; |
3218 | 0 | case REPL_TYPE_SYMBOLIC_REFERENCE: |
3219 | 0 | match = g_match_info_fetch_named (match_info, idata->text); |
3220 | 0 | if (match) |
3221 | 0 | { |
3222 | 0 | string_append (result, match, &change_case); |
3223 | 0 | g_free (match); |
3224 | 0 | } |
3225 | 0 | break; |
3226 | 0 | case REPL_TYPE_CHANGE_CASE: |
3227 | 0 | change_case = idata->change_case; |
3228 | 0 | break; |
3229 | 0 | } |
3230 | 0 | } |
3231 | | |
3232 | 0 | return FALSE; |
3233 | 0 | } |
3234 | | |
3235 | | /* whether actual match_info is needed for replacement, i.e. |
3236 | | * whether there are references |
3237 | | */ |
3238 | | static gboolean |
3239 | | interpolation_list_needs_match (GList *list) |
3240 | 0 | { |
3241 | 0 | while (list != NULL) |
3242 | 0 | { |
3243 | 0 | InterpolationData *data = list->data; |
3244 | |
|
3245 | 0 | if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
3246 | 0 | data->type == REPL_TYPE_NUMERIC_REFERENCE) |
3247 | 0 | { |
3248 | 0 | return TRUE; |
3249 | 0 | } |
3250 | | |
3251 | 0 | list = list->next; |
3252 | 0 | } |
3253 | | |
3254 | 0 | return FALSE; |
3255 | 0 | } |
3256 | | |
3257 | | /** |
3258 | | * g_regex_replace: |
3259 | | * @regex: a #GRegex structure |
3260 | | * @string: (array length=string_len): the string to perform matches against |
3261 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3262 | | * @start_position: starting index of the string to match, in bytes |
3263 | | * @replacement: text to replace each match with |
3264 | | * @match_options: options for the match |
3265 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3266 | | * |
3267 | | * Replaces all occurrences of the pattern in @regex with the |
3268 | | * replacement text. Backreferences of the form '\number' or |
3269 | | * '\g<number>' in the replacement text are interpolated by the |
3270 | | * number-th captured subexpression of the match, '\g<name>' refers |
3271 | | * to the captured subexpression with the given name. '\0' refers |
3272 | | * to the complete match, but '\0' followed by a number is the octal |
3273 | | * representation of a character. To include a literal '\' in the |
3274 | | * replacement, write '\\\\'. |
3275 | | * |
3276 | | * There are also escapes that changes the case of the following text: |
3277 | | * |
3278 | | * - \l: Convert to lower case the next character |
3279 | | * - \u: Convert to upper case the next character |
3280 | | * - \L: Convert to lower case till \E |
3281 | | * - \U: Convert to upper case till \E |
3282 | | * - \E: End case modification |
3283 | | * |
3284 | | * If you do not need to use backreferences use g_regex_replace_literal(). |
3285 | | * |
3286 | | * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was |
3287 | | * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
3288 | | * you can use g_regex_replace_literal(). |
3289 | | * |
3290 | | * Setting @start_position differs from just passing over a shortened |
3291 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that |
3292 | | * begins with any kind of lookbehind assertion, such as "\b". |
3293 | | * |
3294 | | * Returns: a newly allocated string containing the replacements |
3295 | | * |
3296 | | * Since: 2.14 |
3297 | | */ |
3298 | | gchar * |
3299 | | g_regex_replace (const GRegex *regex, |
3300 | | const gchar *string, |
3301 | | gssize string_len, |
3302 | | gint start_position, |
3303 | | const gchar *replacement, |
3304 | | GRegexMatchFlags match_options, |
3305 | | GError **error) |
3306 | 0 | { |
3307 | 0 | gchar *result; |
3308 | 0 | GList *list; |
3309 | 0 | GError *tmp_error = NULL; |
3310 | |
|
3311 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3312 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3313 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3314 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3315 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
3316 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3317 | | |
3318 | 0 | list = split_replacement (replacement, &tmp_error); |
3319 | 0 | if (tmp_error != NULL) |
3320 | 0 | { |
3321 | 0 | g_propagate_error (error, tmp_error); |
3322 | 0 | return NULL; |
3323 | 0 | } |
3324 | | |
3325 | 0 | result = g_regex_replace_eval (regex, |
3326 | 0 | string, string_len, start_position, |
3327 | 0 | match_options, |
3328 | 0 | interpolate_replacement, |
3329 | 0 | (gpointer)list, |
3330 | 0 | &tmp_error); |
3331 | 0 | if (tmp_error != NULL) |
3332 | 0 | g_propagate_error (error, tmp_error); |
3333 | |
|
3334 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3335 | |
|
3336 | 0 | return result; |
3337 | 0 | } |
3338 | | |
3339 | | static gboolean |
3340 | | literal_replacement (const GMatchInfo *match_info, |
3341 | | GString *result, |
3342 | | gpointer data) |
3343 | 0 | { |
3344 | 0 | g_string_append (result, data); |
3345 | 0 | return FALSE; |
3346 | 0 | } |
3347 | | |
3348 | | /** |
3349 | | * g_regex_replace_literal: |
3350 | | * @regex: a #GRegex structure |
3351 | | * @string: (array length=string_len): the string to perform matches against |
3352 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3353 | | * @start_position: starting index of the string to match, in bytes |
3354 | | * @replacement: text to replace each match with |
3355 | | * @match_options: options for the match |
3356 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3357 | | * |
3358 | | * Replaces all occurrences of the pattern in @regex with the |
3359 | | * replacement text. @replacement is replaced literally, to |
3360 | | * include backreferences use g_regex_replace(). |
3361 | | * |
3362 | | * Setting @start_position differs from just passing over a |
3363 | | * shortened string and setting %G_REGEX_MATCH_NOTBOL in the |
3364 | | * case of a pattern that begins with any kind of lookbehind |
3365 | | * assertion, such as "\b". |
3366 | | * |
3367 | | * Returns: a newly allocated string containing the replacements |
3368 | | * |
3369 | | * Since: 2.14 |
3370 | | */ |
3371 | | gchar * |
3372 | | g_regex_replace_literal (const GRegex *regex, |
3373 | | const gchar *string, |
3374 | | gssize string_len, |
3375 | | gint start_position, |
3376 | | const gchar *replacement, |
3377 | | GRegexMatchFlags match_options, |
3378 | | GError **error) |
3379 | 0 | { |
3380 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3381 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3382 | | |
3383 | 0 | return g_regex_replace_eval (regex, |
3384 | 0 | string, string_len, start_position, |
3385 | 0 | match_options, |
3386 | 0 | literal_replacement, |
3387 | 0 | (gpointer)replacement, |
3388 | 0 | error); |
3389 | 0 | } |
3390 | | |
3391 | | /** |
3392 | | * g_regex_replace_eval: |
3393 | | * @regex: a #GRegex structure from g_regex_new() |
3394 | | * @string: (array length=string_len): string to perform matches against |
3395 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3396 | | * @start_position: starting index of the string to match, in bytes |
3397 | | * @match_options: options for the match |
3398 | | * @eval: (scope call): a function to call for each match |
3399 | | * @user_data: user data to pass to the function |
3400 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3401 | | * |
3402 | | * Replaces occurrences of the pattern in regex with the output of |
3403 | | * @eval for that occurrence. |
3404 | | * |
3405 | | * Setting @start_position differs from just passing over a shortened |
3406 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
3407 | | * that begins with any kind of lookbehind assertion, such as "\b". |
3408 | | * |
3409 | | * The following example uses g_regex_replace_eval() to replace multiple |
3410 | | * strings at once: |
3411 | | * |[<!-- language="C" --> |
3412 | | * static gboolean |
3413 | | * eval_cb (const GMatchInfo *info, |
3414 | | * GString *res, |
3415 | | * gpointer data) |
3416 | | * { |
3417 | | * gchar *match; |
3418 | | * gchar *r; |
3419 | | * |
3420 | | * match = g_match_info_fetch (info, 0); |
3421 | | * r = g_hash_table_lookup ((GHashTable *)data, match); |
3422 | | * g_string_append (res, r); |
3423 | | * g_free (match); |
3424 | | * |
3425 | | * return FALSE; |
3426 | | * } |
3427 | | * |
3428 | | * ... |
3429 | | * |
3430 | | * GRegex *reg; |
3431 | | * GHashTable *h; |
3432 | | * gchar *res; |
3433 | | * |
3434 | | * h = g_hash_table_new (g_str_hash, g_str_equal); |
3435 | | * |
3436 | | * g_hash_table_insert (h, "1", "ONE"); |
3437 | | * g_hash_table_insert (h, "2", "TWO"); |
3438 | | * g_hash_table_insert (h, "3", "THREE"); |
3439 | | * g_hash_table_insert (h, "4", "FOUR"); |
3440 | | * |
3441 | | * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
3442 | | * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); |
3443 | | * g_hash_table_destroy (h); |
3444 | | * |
3445 | | * ... |
3446 | | * ]| |
3447 | | * |
3448 | | * Returns: a newly allocated string containing the replacements |
3449 | | * |
3450 | | * Since: 2.14 |
3451 | | */ |
3452 | | gchar * |
3453 | | g_regex_replace_eval (const GRegex *regex, |
3454 | | const gchar *string, |
3455 | | gssize string_len, |
3456 | | gint start_position, |
3457 | | GRegexMatchFlags match_options, |
3458 | | GRegexEvalCallback eval, |
3459 | | gpointer user_data, |
3460 | | GError **error) |
3461 | 0 | { |
3462 | 0 | GMatchInfo *match_info; |
3463 | 0 | GString *result; |
3464 | 0 | gint str_pos = 0; |
3465 | 0 | gboolean done = FALSE; |
3466 | 0 | GError *tmp_error = NULL; |
3467 | |
|
3468 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3469 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3470 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3471 | 0 | g_return_val_if_fail (eval != NULL, NULL); |
3472 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3473 | | |
3474 | 0 | if (string_len < 0) |
3475 | 0 | string_len = strlen (string); |
3476 | |
|
3477 | 0 | result = g_string_sized_new (string_len); |
3478 | | |
3479 | | /* run down the string making matches. */ |
3480 | 0 | g_regex_match_full (regex, string, string_len, start_position, |
3481 | 0 | match_options, &match_info, &tmp_error); |
3482 | 0 | while (!done && g_match_info_matches (match_info)) |
3483 | 0 | { |
3484 | 0 | g_string_append_len (result, |
3485 | 0 | string + str_pos, |
3486 | 0 | match_info->offsets[0] - str_pos); |
3487 | 0 | done = (*eval) (match_info, result, user_data); |
3488 | 0 | str_pos = match_info->offsets[1]; |
3489 | 0 | g_match_info_next (match_info, &tmp_error); |
3490 | 0 | } |
3491 | 0 | g_match_info_free (match_info); |
3492 | 0 | if (tmp_error != NULL) |
3493 | 0 | { |
3494 | 0 | g_propagate_error (error, tmp_error); |
3495 | 0 | g_string_free (result, TRUE); |
3496 | 0 | return NULL; |
3497 | 0 | } |
3498 | | |
3499 | 0 | g_string_append_len (result, string + str_pos, string_len - str_pos); |
3500 | 0 | return g_string_free (result, FALSE); |
3501 | 0 | } |
3502 | | |
3503 | | /** |
3504 | | * g_regex_check_replacement: |
3505 | | * @replacement: the replacement string |
3506 | | * @has_references: (out) (optional): location to store information about |
3507 | | * references in @replacement or %NULL |
3508 | | * @error: location to store error |
3509 | | * |
3510 | | * Checks whether @replacement is a valid replacement string |
3511 | | * (see g_regex_replace()), i.e. that all escape sequences in |
3512 | | * it are valid. |
3513 | | * |
3514 | | * If @has_references is not %NULL then @replacement is checked |
3515 | | * for pattern references. For instance, replacement text 'foo\n' |
3516 | | * does not contain references and may be evaluated without information |
3517 | | * about actual match, but '\0\1' (whole match followed by first |
3518 | | * subpattern) requires valid #GMatchInfo object. |
3519 | | * |
3520 | | * Returns: whether @replacement is a valid replacement string |
3521 | | * |
3522 | | * Since: 2.14 |
3523 | | */ |
3524 | | gboolean |
3525 | | g_regex_check_replacement (const gchar *replacement, |
3526 | | gboolean *has_references, |
3527 | | GError **error) |
3528 | 0 | { |
3529 | 0 | GList *list; |
3530 | 0 | GError *tmp = NULL; |
3531 | |
|
3532 | 0 | list = split_replacement (replacement, &tmp); |
3533 | |
|
3534 | 0 | if (tmp) |
3535 | 0 | { |
3536 | 0 | g_propagate_error (error, tmp); |
3537 | 0 | return FALSE; |
3538 | 0 | } |
3539 | | |
3540 | 0 | if (has_references) |
3541 | 0 | *has_references = interpolation_list_needs_match (list); |
3542 | |
|
3543 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3544 | |
|
3545 | 0 | return TRUE; |
3546 | 0 | } |
3547 | | |
3548 | | /** |
3549 | | * g_regex_escape_nul: |
3550 | | * @string: the string to escape |
3551 | | * @length: the length of @string |
3552 | | * |
3553 | | * Escapes the nul characters in @string to "\x00". It can be used |
3554 | | * to compile a regex with embedded nul characters. |
3555 | | * |
3556 | | * For completeness, @length can be -1 for a nul-terminated string. |
3557 | | * In this case the output string will be of course equal to @string. |
3558 | | * |
3559 | | * Returns: a newly-allocated escaped string |
3560 | | * |
3561 | | * Since: 2.30 |
3562 | | */ |
3563 | | gchar * |
3564 | | g_regex_escape_nul (const gchar *string, |
3565 | | gint length) |
3566 | 0 | { |
3567 | 0 | GString *escaped; |
3568 | 0 | const gchar *p, *piece_start, *end; |
3569 | 0 | gint backslashes; |
3570 | |
|
3571 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3572 | | |
3573 | 0 | if (length < 0) |
3574 | 0 | return g_strdup (string); |
3575 | | |
3576 | 0 | end = string + length; |
3577 | 0 | p = piece_start = string; |
3578 | 0 | escaped = g_string_sized_new (length + 1); |
3579 | |
|
3580 | 0 | backslashes = 0; |
3581 | 0 | while (p < end) |
3582 | 0 | { |
3583 | 0 | switch (*p) |
3584 | 0 | { |
3585 | 0 | case '\0': |
3586 | 0 | if (p != piece_start) |
3587 | 0 | { |
3588 | | /* copy the previous piece. */ |
3589 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3590 | 0 | } |
3591 | 0 | if ((backslashes & 1) == 0) |
3592 | 0 | g_string_append_c (escaped, '\\'); |
3593 | 0 | g_string_append_c (escaped, 'x'); |
3594 | 0 | g_string_append_c (escaped, '0'); |
3595 | 0 | g_string_append_c (escaped, '0'); |
3596 | 0 | piece_start = ++p; |
3597 | 0 | backslashes = 0; |
3598 | 0 | break; |
3599 | 0 | case '\\': |
3600 | 0 | backslashes++; |
3601 | 0 | ++p; |
3602 | 0 | break; |
3603 | 0 | default: |
3604 | 0 | backslashes = 0; |
3605 | 0 | p = g_utf8_next_char (p); |
3606 | 0 | break; |
3607 | 0 | } |
3608 | 0 | } |
3609 | | |
3610 | 0 | if (piece_start < end) |
3611 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3612 | |
|
3613 | 0 | return g_string_free (escaped, FALSE); |
3614 | 0 | } |
3615 | | |
3616 | | /** |
3617 | | * g_regex_escape_string: |
3618 | | * @string: the string to escape |
3619 | | * @length: the length of @string, in bytes, or -1 if @string is nul-terminated |
3620 | | * |
3621 | | * Escapes the special characters used for regular expressions |
3622 | | * in @string, for instance "a.b*c" becomes "a\.b\*c". This |
3623 | | * function is useful to dynamically generate regular expressions. |
3624 | | * |
3625 | | * @string can contain nul characters that are replaced with "\0", |
3626 | | * in this case remember to specify the correct length of @string |
3627 | | * in @length. |
3628 | | * |
3629 | | * Returns: a newly-allocated escaped string |
3630 | | * |
3631 | | * Since: 2.14 |
3632 | | */ |
3633 | | gchar * |
3634 | | g_regex_escape_string (const gchar *string, |
3635 | | gint length) |
3636 | 0 | { |
3637 | 0 | GString *escaped; |
3638 | 0 | const char *p, *piece_start, *end; |
3639 | |
|
3640 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3641 | | |
3642 | 0 | if (length < 0) |
3643 | 0 | length = strlen (string); |
3644 | |
|
3645 | 0 | end = string + length; |
3646 | 0 | p = piece_start = string; |
3647 | 0 | escaped = g_string_sized_new (length + 1); |
3648 | |
|
3649 | 0 | while (p < end) |
3650 | 0 | { |
3651 | 0 | switch (*p) |
3652 | 0 | { |
3653 | 0 | case '\0': |
3654 | 0 | case '\\': |
3655 | 0 | case '|': |
3656 | 0 | case '(': |
3657 | 0 | case ')': |
3658 | 0 | case '[': |
3659 | 0 | case ']': |
3660 | 0 | case '{': |
3661 | 0 | case '}': |
3662 | 0 | case '^': |
3663 | 0 | case '$': |
3664 | 0 | case '*': |
3665 | 0 | case '+': |
3666 | 0 | case '?': |
3667 | 0 | case '.': |
3668 | 0 | if (p != piece_start) |
3669 | | /* copy the previous piece. */ |
3670 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3671 | 0 | g_string_append_c (escaped, '\\'); |
3672 | 0 | if (*p == '\0') |
3673 | 0 | g_string_append_c (escaped, '0'); |
3674 | 0 | else |
3675 | 0 | g_string_append_c (escaped, *p); |
3676 | 0 | piece_start = ++p; |
3677 | 0 | break; |
3678 | 0 | default: |
3679 | 0 | p = g_utf8_next_char (p); |
3680 | 0 | break; |
3681 | 0 | } |
3682 | 0 | } |
3683 | | |
3684 | 0 | if (piece_start < end) |
3685 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3686 | |
|
3687 | 0 | return g_string_free (escaped, FALSE); |
3688 | 0 | } |