/src/irssi/subprojects/glib-2.74.3/glib/gregex.c
Line | Count | Source |
1 | | /* GRegex -- regular expression API wrapper around PCRE. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Scott Wimer |
4 | | * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
5 | | * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
6 | | * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> |
7 | | * |
8 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
9 | | * |
10 | | * This library is free software; you can redistribute it and/or |
11 | | * modify it under the terms of the GNU Lesser General Public |
12 | | * License as published by the Free Software Foundation; either |
13 | | * version 2.1 of the License, or (at your option) any later version. |
14 | | * |
15 | | * This library is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | | * Lesser General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU Lesser General Public License |
21 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
22 | | */ |
23 | | |
24 | | #include "config.h" |
25 | | |
26 | | #include <stdint.h> |
27 | | #include <string.h> |
28 | | |
29 | | #define PCRE2_CODE_UNIT_WIDTH 8 |
30 | | #include <pcre2.h> |
31 | | |
32 | | #include "gtypes.h" |
33 | | #include "gregex.h" |
34 | | #include "glibintl.h" |
35 | | #include "glist.h" |
36 | | #include "gmessages.h" |
37 | | #include "gstrfuncs.h" |
38 | | #include "gatomic.h" |
39 | | #include "gtestutils.h" |
40 | | #include "gthread.h" |
41 | | |
42 | | /** |
43 | | * SECTION:gregex |
44 | | * @title: Perl-compatible regular expressions |
45 | | * @short_description: matches strings against regular expressions |
46 | | * @see_also: [Regular expression syntax][glib-regex-syntax] |
47 | | * |
48 | | * The g_regex_*() functions implement regular |
49 | | * expression pattern matching using syntax and semantics similar to |
50 | | * Perl regular expressions. |
51 | | * |
52 | | * Some functions accept a @start_position argument, setting it differs |
53 | | * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL |
54 | | * in the case of a pattern that begins with any kind of lookbehind assertion. |
55 | | * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" |
56 | | * in the middle of words. ("\B" matches only if the current position in the |
57 | | * subject is not a word boundary.) When applied to the string "Mississipi" |
58 | | * from the fourth byte, namely "issipi", it does not match, because "\B" is |
59 | | * always false at the start of the subject, which is deemed to be a word |
60 | | * boundary. However, if the entire string is passed, but with |
61 | | * @start_position set to 4, it finds the second occurrence of "iss" because |
62 | | * it is able to look behind the starting point to discover that it is |
63 | | * preceded by a letter. |
64 | | * |
65 | | * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed |
66 | | * to these functions must be encoded in UTF-8. The lengths and the positions |
67 | | * inside the strings are in bytes and not in characters, so, for instance, |
68 | | * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a |
69 | | * single character. If you set %G_REGEX_RAW the strings can be non-valid |
70 | | * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two |
71 | | * bytes and two characters long. |
72 | | * |
73 | | * When matching a pattern, "\n" matches only against a "\n" character in |
74 | | * the string, and "\r" matches only a "\r" character. To match any newline |
75 | | * sequence use "\R". This particular group matches either the two-character |
76 | | * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, |
77 | | * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), |
78 | | * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line |
79 | | * separator, U+2028), or PS (paragraph separator, U+2029). |
80 | | * |
81 | | * The behaviour of the dot, circumflex, and dollar metacharacters are |
82 | | * affected by newline characters, the default is to recognize any newline |
83 | | * character (the same characters recognized by "\R"). This can be changed |
84 | | * with %G_REGEX_NEWLINE_CR, %G_REGEX_NEWLINE_LF and %G_REGEX_NEWLINE_CRLF |
85 | | * compile options, and with %G_REGEX_MATCH_NEWLINE_ANY, |
86 | | * %G_REGEX_MATCH_NEWLINE_CR, %G_REGEX_MATCH_NEWLINE_LF and |
87 | | * %G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also |
88 | | * relevant when compiling a pattern if %G_REGEX_EXTENDED is set, and an |
89 | | * unescaped "#" outside a character class is encountered. This indicates |
90 | | * a comment that lasts until after the next newline. |
91 | | * |
92 | | * Creating and manipulating the same #GRegex structure from different |
93 | | * threads is not a problem as #GRegex does not modify its internal |
94 | | * state between creation and destruction, on the other hand #GMatchInfo |
95 | | * is not threadsafe. |
96 | | * |
97 | | * The regular expressions low-level functionalities are obtained through |
98 | | * the excellent |
99 | | * [PCRE](http://www.pcre.org/) |
100 | | * library written by Philip Hazel. |
101 | | */ |
102 | | |
/* PCRE2 option bits that are part of both G_REGEX_PCRE2_COMPILE_MASK and
 * G_REGEX_PCRE2_MATCH_MASK. */
#define G_REGEX_PCRE_GENERIC_MASK \
  (PCRE2_ANCHORED | PCRE2_NO_UTF_CHECK | PCRE2_ENDANCHORED)
106 | | |
/* Mask of all the possible values for GRegexCompileFlags.
 * g_regex_compile_flags_from_pcre2() filters its result through it. */
#define G_REGEX_COMPILE_MASK \
  (G_REGEX_DEFAULT | G_REGEX_CASELESS | G_REGEX_MULTILINE | \
   G_REGEX_DOTALL | G_REGEX_EXTENDED | G_REGEX_ANCHORED | \
   G_REGEX_DOLLAR_ENDONLY | G_REGEX_UNGREEDY | G_REGEX_RAW | \
   G_REGEX_NO_AUTO_CAPTURE | G_REGEX_OPTIMIZE | G_REGEX_FIRSTLINE | \
   G_REGEX_DUPNAMES | G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF | \
   G_REGEX_NEWLINE_CRLF | G_REGEX_NEWLINE_ANYCRLF | G_REGEX_BSR_ANYCRLF)
126 | | |
/* PCRE2 compile option bits that get_pcre2_compile_options() lets through;
 * any other bit is stripped from its return value. */
#define G_REGEX_PCRE2_COMPILE_MASK \
  (PCRE2_ALLOW_EMPTY_CLASS | PCRE2_ALT_BSUX | PCRE2_AUTO_CALLOUT | \
   PCRE2_CASELESS | PCRE2_DOLLAR_ENDONLY | PCRE2_DOTALL | \
   PCRE2_DUPNAMES | PCRE2_EXTENDED | PCRE2_FIRSTLINE | \
   PCRE2_MATCH_UNSET_BACKREF | PCRE2_MULTILINE | PCRE2_NEVER_UCP | \
   PCRE2_NEVER_UTF | PCRE2_NO_AUTO_CAPTURE | PCRE2_NO_AUTO_POSSESS | \
   PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_START_OPTIMIZE | PCRE2_UCP | \
   PCRE2_UNGREEDY | PCRE2_UTF | PCRE2_NEVER_BACKSLASH_C | \
   PCRE2_ALT_CIRCUMFLEX | PCRE2_ALT_VERBNAMES | PCRE2_USE_OFFSET_LIMIT | \
   PCRE2_EXTENDED_MORE | PCRE2_LITERAL | PCRE2_MATCH_INVALID_UTF | \
   G_REGEX_PCRE_GENERIC_MASK)

/* NOTE(review): the users of this mask are outside this excerpt; it consists
 * of the single PCRE2_UTF bit. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF)
157 | | |
/* Mask of all the possible values for GRegexMatchFlags.
 * g_regex_match_flags_from_pcre2() filters its result through it. */
#define G_REGEX_MATCH_MASK \
  (G_REGEX_MATCH_DEFAULT | G_REGEX_MATCH_ANCHORED | \
   G_REGEX_MATCH_NOTBOL | G_REGEX_MATCH_NOTEOL | \
   G_REGEX_MATCH_NOTEMPTY | G_REGEX_MATCH_PARTIAL | \
   G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF | \
   G_REGEX_MATCH_NEWLINE_CRLF | G_REGEX_MATCH_NEWLINE_ANY | \
   G_REGEX_MATCH_NEWLINE_ANYCRLF | G_REGEX_MATCH_BSR_ANYCRLF | \
   G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_PARTIAL_SOFT | \
   G_REGEX_MATCH_PARTIAL_HARD | G_REGEX_MATCH_NOTEMPTY_ATSTART)
175 | | |
/* PCRE2 match option bits that get_pcre2_match_options() lets through;
 * any other bit is stripped from its return value. */
#define G_REGEX_PCRE2_MATCH_MASK \
  (PCRE2_NOTBOL | PCRE2_NOTEOL | PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | PCRE2_PARTIAL_SOFT | PCRE2_PARTIAL_HARD | \
   PCRE2_NO_JIT | PCRE2_COPY_MATCHED_SUBJECT | \
   G_REGEX_PCRE_GENERIC_MASK)
185 | | |
/* Union of the explicit PCRE2 newline conventions handled here.
 * TODO: Support PCRE2_NEWLINE_NUL */
#define G_REGEX_NEWLINE_MASK \
  (PCRE2_NEWLINE_CR | PCRE2_NEWLINE_LF | \
   PCRE2_NEWLINE_CRLF | PCRE2_NEWLINE_ANYCRLF)
191 | | |
/* Match options that cannot be used together with JIT, as stated in the
 * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section:
 * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5
 */
#define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS \
  (PCRE2_ANCHORED | PCRE2_ENDANCHORED)
198 | | |
/* GRegexCompileFlags bits that select a newline convention; used by
 * get_pcre2_newline_compile_options() to isolate the newline selection. */
#define G_REGEX_COMPILE_NEWLINE_MASK \
  (G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF | \
   G_REGEX_NEWLINE_CRLF | G_REGEX_NEWLINE_ANYCRLF)

/* GRegexMatchFlags bits that select a newline convention; used by
 * get_pcre2_newline_match_options() to isolate the newline selection. */
#define G_REGEX_MATCH_NEWLINE_MASK \
  (G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF | \
   G_REGEX_MATCH_NEWLINE_CRLF | G_REGEX_MATCH_NEWLINE_ANY | \
   G_REGEX_MATCH_NEWLINE_ANYCRLF)
209 | | |
/* Step one character forwards (NEXT_CHAR) or backwards (PREV_CHAR) in a
 * subject string: if the regex is in UTF-8 mode use the g_utf8_ functions,
 * else just use +/- 1.
 *
 * (re)->compile_opts stores PCRE2 option bits (see struct _GRegex), so UTF-8
 * mode has to be detected with PCRE2_UTF. Testing that field against the
 * GRegexCompileFlags value G_REGEX_RAW would inspect an unrelated PCRE2
 * option bit. get_pcre2_compile_options() sets PCRE2_UTF exactly when
 * G_REGEX_RAW is not requested, so PCRE2_UTF set means "not raw".
 *
 * @re: a GRegex pointer; evaluated once.
 * @s:  pointer into the subject string; evaluated once.
 */
#define NEXT_CHAR(re, s) (((re)->compile_opts & PCRE2_UTF) ? \
                                g_utf8_next_char (s) : \
                                ((s) + 1))
#define PREV_CHAR(re, s) (((re)->compile_opts & PCRE2_UTF) ? \
                                g_utf8_prev_char (s) : \
                                ((s) - 1))
218 | | |
/* State of one matching operation performed with a GRegex on a string. */
struct _GMatchInfo
{
  gint ref_count;               /* the ref count (atomic) */
  GRegex *regex;                /* the regex */
  uint32_t match_opts;          /* pcre match options used at match time on the regex */
  gint matches;                 /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
  uint32_t n_subpatterns;       /* total number of sub patterns in the regex */
  gint pos;                     /* position in the string where last match left off */
  uint32_t n_offsets;           /* number of offsets */
  gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace;              /* workspace for pcre2_dfa_match() */
  PCRE2_SIZE n_workspace;       /* number of workspace elements */
  const gchar *string;          /* string passed to the match function */
  gssize string_len;            /* length of string, in bytes */
  pcre2_match_context *match_context; /* PCRE2 match context handle */
  pcre2_match_data *match_data;       /* PCRE2 match data handle */
};
236 | | |
/* JIT compiler state of a compiled regex; this is the type of the
 * jit_status field of struct _GRegex. */
typedef enum
{
  JIT_STATUS_DEFAULT,
  JIT_STATUS_ENABLED,
  JIT_STATUS_DISABLED
} JITStatus;
243 | | |
/* A compiled regular expression.
 *
 * The compile and match options are each stored twice: once as PCRE2 option
 * bits (compile_opts, match_opts) and once as the GRegex flag values
 * (orig_compile_opts, orig_match_opts). The two representations use
 * different bit assignments and must not be mixed in a bitwise test. */
struct _GRegex
{
  gint ref_count;               /* the ref count for the immutable part (atomic) */
  gchar *pattern;               /* the pattern */
  pcre2_code *pcre_re;          /* compiled form of the pattern */
  uint32_t compile_opts;        /* options used at compile time on the pattern, pcre2 values */
  GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */
  uint32_t match_opts;          /* pcre2 options used at match time on the regex */
  GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */
  uint32_t jit_options;         /* options which were enabled for jit compiler */
  JITStatus jit_status;         /* indicates the status of jit compiler for this compiled regex */
};
256 | | |
/* TRUE if ret is an error code, FALSE otherwise.
 * Values at or above PCRE2_ERROR_NOMATCH, and PCRE2_ERROR_PARTIAL itself,
 * are not treated as errors. */
#define IS_PCRE2_ERROR(ret) \
  (!((ret) >= PCRE2_ERROR_NOMATCH || (ret) == PCRE2_ERROR_PARTIAL))
259 | | |
/* Forward declarations for the replacement-string helpers. They are static,
 * so their definitions live elsewhere in this translation unit (outside
 * this excerpt). */
typedef struct _InterpolationData InterpolationData;
static gboolean  interpolation_list_needs_match (GList *list);
static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
                                                 GString *result,
                                                 gpointer data);
static GList    *split_replacement              (const gchar *replacement,
                                                 GError **error);
static void      free_interpolation_data        (InterpolationData *data);
268 | | |
269 | | static uint32_t |
270 | | get_pcre2_compile_options (GRegexCompileFlags compile_flags) |
271 | 0 | { |
272 | | /* Maps compile flags to pcre2 values */ |
273 | 0 | uint32_t pcre2_flags = 0; |
274 | |
|
275 | 0 | if (compile_flags & G_REGEX_CASELESS) |
276 | 0 | pcre2_flags |= PCRE2_CASELESS; |
277 | 0 | if (compile_flags & G_REGEX_MULTILINE) |
278 | 0 | pcre2_flags |= PCRE2_MULTILINE; |
279 | 0 | if (compile_flags & G_REGEX_DOTALL) |
280 | 0 | pcre2_flags |= PCRE2_DOTALL; |
281 | 0 | if (compile_flags & G_REGEX_EXTENDED) |
282 | 0 | pcre2_flags |= PCRE2_EXTENDED; |
283 | 0 | if (compile_flags & G_REGEX_ANCHORED) |
284 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
285 | 0 | if (compile_flags & G_REGEX_DOLLAR_ENDONLY) |
286 | 0 | pcre2_flags |= PCRE2_DOLLAR_ENDONLY; |
287 | 0 | if (compile_flags & G_REGEX_UNGREEDY) |
288 | 0 | pcre2_flags |= PCRE2_UNGREEDY; |
289 | 0 | if (!(compile_flags & G_REGEX_RAW)) |
290 | 0 | pcre2_flags |= PCRE2_UTF; |
291 | 0 | if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) |
292 | 0 | pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; |
293 | 0 | if (compile_flags & G_REGEX_FIRSTLINE) |
294 | 0 | pcre2_flags |= PCRE2_FIRSTLINE; |
295 | 0 | if (compile_flags & G_REGEX_DUPNAMES) |
296 | 0 | pcre2_flags |= PCRE2_DUPNAMES; |
297 | |
|
298 | 0 | return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; |
299 | 0 | } |
300 | | |
301 | | static uint32_t |
302 | | get_pcre2_match_options (GRegexMatchFlags match_flags, |
303 | | GRegexCompileFlags compile_flags) |
304 | 0 | { |
305 | | /* Maps match flags to pcre2 values */ |
306 | 0 | uint32_t pcre2_flags = 0; |
307 | |
|
308 | 0 | if (match_flags & G_REGEX_MATCH_ANCHORED) |
309 | 0 | pcre2_flags |= PCRE2_ANCHORED; |
310 | 0 | if (match_flags & G_REGEX_MATCH_NOTBOL) |
311 | 0 | pcre2_flags |= PCRE2_NOTBOL; |
312 | 0 | if (match_flags & G_REGEX_MATCH_NOTEOL) |
313 | 0 | pcre2_flags |= PCRE2_NOTEOL; |
314 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY) |
315 | 0 | pcre2_flags |= PCRE2_NOTEMPTY; |
316 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) |
317 | 0 | pcre2_flags |= PCRE2_PARTIAL_SOFT; |
318 | 0 | if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) |
319 | 0 | pcre2_flags |= PCRE2_PARTIAL_HARD; |
320 | 0 | if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) |
321 | 0 | pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; |
322 | |
|
323 | 0 | if (compile_flags & G_REGEX_RAW) |
324 | 0 | pcre2_flags |= PCRE2_NO_UTF_CHECK; |
325 | |
|
326 | 0 | return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; |
327 | 0 | } |
328 | | |
329 | | static GRegexCompileFlags |
330 | | g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) |
331 | 0 | { |
332 | 0 | GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; |
333 | |
|
334 | 0 | if (pcre2_flags & PCRE2_CASELESS) |
335 | 0 | compile_flags |= G_REGEX_CASELESS; |
336 | 0 | if (pcre2_flags & PCRE2_MULTILINE) |
337 | 0 | compile_flags |= G_REGEX_MULTILINE; |
338 | 0 | if (pcre2_flags & PCRE2_DOTALL) |
339 | 0 | compile_flags |= G_REGEX_DOTALL; |
340 | 0 | if (pcre2_flags & PCRE2_EXTENDED) |
341 | 0 | compile_flags |= G_REGEX_EXTENDED; |
342 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
343 | 0 | compile_flags |= G_REGEX_ANCHORED; |
344 | 0 | if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) |
345 | 0 | compile_flags |= G_REGEX_DOLLAR_ENDONLY; |
346 | 0 | if (pcre2_flags & PCRE2_UNGREEDY) |
347 | 0 | compile_flags |= G_REGEX_UNGREEDY; |
348 | 0 | if (!(pcre2_flags & PCRE2_UTF)) |
349 | 0 | compile_flags |= G_REGEX_RAW; |
350 | 0 | if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) |
351 | 0 | compile_flags |= G_REGEX_NO_AUTO_CAPTURE; |
352 | 0 | if (pcre2_flags & PCRE2_FIRSTLINE) |
353 | 0 | compile_flags |= G_REGEX_FIRSTLINE; |
354 | 0 | if (pcre2_flags & PCRE2_DUPNAMES) |
355 | 0 | compile_flags |= G_REGEX_DUPNAMES; |
356 | |
|
357 | 0 | return compile_flags & G_REGEX_COMPILE_MASK; |
358 | 0 | } |
359 | | |
360 | | static GRegexMatchFlags |
361 | | g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) |
362 | 0 | { |
363 | 0 | GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; |
364 | |
|
365 | 0 | if (pcre2_flags & PCRE2_ANCHORED) |
366 | 0 | match_flags |= G_REGEX_MATCH_ANCHORED; |
367 | 0 | if (pcre2_flags & PCRE2_NOTBOL) |
368 | 0 | match_flags |= G_REGEX_MATCH_NOTBOL; |
369 | 0 | if (pcre2_flags & PCRE2_NOTEOL) |
370 | 0 | match_flags |= G_REGEX_MATCH_NOTEOL; |
371 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY) |
372 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY; |
373 | 0 | if (pcre2_flags & PCRE2_PARTIAL_SOFT) |
374 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; |
375 | 0 | if (pcre2_flags & PCRE2_PARTIAL_HARD) |
376 | 0 | match_flags |= G_REGEX_MATCH_PARTIAL_HARD; |
377 | 0 | if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) |
378 | 0 | match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; |
379 | |
|
380 | 0 | return (match_flags & G_REGEX_MATCH_MASK); |
381 | 0 | } |
382 | | |
383 | | static uint32_t |
384 | | get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) |
385 | 0 | { |
386 | 0 | compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; |
387 | |
|
388 | 0 | switch (compile_flags) |
389 | 0 | { |
390 | 0 | case G_REGEX_NEWLINE_CR: |
391 | 0 | return PCRE2_NEWLINE_CR; |
392 | 0 | case G_REGEX_NEWLINE_LF: |
393 | 0 | return PCRE2_NEWLINE_LF; |
394 | 0 | case G_REGEX_NEWLINE_CRLF: |
395 | 0 | return PCRE2_NEWLINE_CRLF; |
396 | 0 | case G_REGEX_NEWLINE_ANYCRLF: |
397 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
398 | 0 | default: |
399 | 0 | if (compile_flags != 0) |
400 | 0 | return 0; |
401 | | |
402 | 0 | return PCRE2_NEWLINE_ANY; |
403 | 0 | } |
404 | 0 | } |
405 | | |
406 | | static uint32_t |
407 | | get_pcre2_newline_match_options (GRegexMatchFlags match_flags) |
408 | 0 | { |
409 | 0 | switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) |
410 | 0 | { |
411 | 0 | case G_REGEX_MATCH_NEWLINE_CR: |
412 | 0 | return PCRE2_NEWLINE_CR; |
413 | 0 | case G_REGEX_MATCH_NEWLINE_LF: |
414 | 0 | return PCRE2_NEWLINE_LF; |
415 | 0 | case G_REGEX_MATCH_NEWLINE_CRLF: |
416 | 0 | return PCRE2_NEWLINE_CRLF; |
417 | 0 | case G_REGEX_MATCH_NEWLINE_ANY: |
418 | 0 | return PCRE2_NEWLINE_ANY; |
419 | 0 | case G_REGEX_MATCH_NEWLINE_ANYCRLF: |
420 | 0 | return PCRE2_NEWLINE_ANYCRLF; |
421 | 0 | default: |
422 | 0 | return 0; |
423 | 0 | } |
424 | 0 | } |
425 | | |
426 | | static uint32_t |
427 | | get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) |
428 | 0 | { |
429 | 0 | if (compile_flags & G_REGEX_BSR_ANYCRLF) |
430 | 0 | return PCRE2_BSR_ANYCRLF; |
431 | | |
432 | 0 | return PCRE2_BSR_UNICODE; |
433 | 0 | } |
434 | | |
435 | | static uint32_t |
436 | | get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) |
437 | 0 | { |
438 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) |
439 | 0 | return PCRE2_BSR_ANYCRLF; |
440 | | |
441 | 0 | if (match_flags & G_REGEX_MATCH_BSR_ANY) |
442 | 0 | return PCRE2_BSR_UNICODE; |
443 | | |
444 | 0 | return 0; |
445 | 0 | } |
446 | | |
447 | | static char * |
448 | | get_pcre2_error_string (int errcode) |
449 | 0 | { |
450 | 0 | PCRE2_UCHAR8 error_msg[2048]; |
451 | 0 | int err_length; |
452 | |
|
453 | 0 | err_length = pcre2_get_error_message (errcode, error_msg, |
454 | 0 | G_N_ELEMENTS (error_msg)); |
455 | |
|
456 | 0 | if (err_length <= 0) |
457 | 0 | return NULL; |
458 | | |
459 | | /* The array is always filled with a trailing zero */ |
460 | 0 | g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); |
461 | 0 | return g_memdup2 (error_msg, err_length + 1); |
462 | 0 | } |
463 | | |
464 | | static const gchar * |
465 | | translate_match_error (gint errcode) |
466 | 0 | { |
467 | 0 | switch (errcode) |
468 | 0 | { |
469 | 0 | case PCRE2_ERROR_NOMATCH: |
470 | | /* not an error */ |
471 | 0 | break; |
472 | 0 | case PCRE2_ERROR_NULL: |
473 | | /* NULL argument, this should not happen in GRegex */ |
474 | 0 | g_critical ("A NULL argument was passed to PCRE"); |
475 | 0 | break; |
476 | 0 | case PCRE2_ERROR_BADOPTION: |
477 | 0 | return "bad options"; |
478 | 0 | case PCRE2_ERROR_BADMAGIC: |
479 | 0 | return _("corrupted object"); |
480 | 0 | case PCRE2_ERROR_NOMEMORY: |
481 | 0 | return _("out of memory"); |
482 | 0 | case PCRE2_ERROR_NOSUBSTRING: |
483 | | /* not used by pcre2_match() */ |
484 | 0 | break; |
485 | 0 | case PCRE2_ERROR_MATCHLIMIT: |
486 | 0 | case PCRE2_ERROR_JIT_STACKLIMIT: |
487 | 0 | return _("backtracking limit reached"); |
488 | 0 | case PCRE2_ERROR_CALLOUT: |
489 | | /* callouts are not implemented */ |
490 | 0 | break; |
491 | 0 | case PCRE2_ERROR_BADUTFOFFSET: |
492 | | /* we do not check if strings are valid */ |
493 | 0 | break; |
494 | 0 | case PCRE2_ERROR_PARTIAL: |
495 | | /* not an error */ |
496 | 0 | break; |
497 | 0 | case PCRE2_ERROR_INTERNAL: |
498 | 0 | return _("internal error"); |
499 | 0 | case PCRE2_ERROR_DFA_UITEM: |
500 | 0 | return _("the pattern contains items not supported for partial matching"); |
501 | 0 | case PCRE2_ERROR_DFA_UCOND: |
502 | 0 | return _("back references as conditions are not supported for partial matching"); |
503 | 0 | case PCRE2_ERROR_DFA_WSSIZE: |
504 | | /* handled expanding the workspace */ |
505 | 0 | break; |
506 | 0 | case PCRE2_ERROR_DFA_RECURSE: |
507 | 0 | case PCRE2_ERROR_RECURSIONLIMIT: |
508 | 0 | return _("recursion limit reached"); |
509 | 0 | case PCRE2_ERROR_BADOFFSET: |
510 | 0 | return _("bad offset"); |
511 | 0 | case PCRE2_ERROR_RECURSELOOP: |
512 | 0 | return _("recursion loop"); |
513 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
514 | | /* should not happen in GRegex since we check modes before each match */ |
515 | 0 | return _("matching mode is requested that was not compiled for JIT"); |
516 | 0 | default: |
517 | 0 | break; |
518 | 0 | } |
519 | 0 | return NULL; |
520 | 0 | } |
521 | | |
/*
 * get_match_error_message:
 * @errcode: a PCRE2 match error code
 *
 * Builds a message for @errcode, trying in order: the message from
 * translate_match_error(), the text from get_pcre2_error_string(), and
 * finally the generic "unknown error".
 *
 * Returns: a newly allocated string, never NULL
 */
static char *
get_match_error_message (int errcode)
{
  const char *translated = translate_match_error (errcode);
  char *pcre2_text;

  if (translated != NULL)
    return g_strdup (translated);

  pcre2_text = get_pcre2_error_string (errcode);

  return pcre2_text != NULL ? pcre2_text : g_strdup (_("unknown error"));
}
538 | | |
539 | | static void |
540 | | translate_compile_error (gint *errcode, const gchar **errmsg) |
541 | 0 | { |
542 | | /* If errcode is known we put the translatable error message in |
543 | | * errmsg. If errcode is unknown we put the generic |
544 | | * G_REGEX_ERROR_COMPILE error code in errcode. |
545 | | * Note that there can be more PCRE errors with the same GRegexError |
546 | | * and that some PCRE errors are useless for us. |
547 | | */ |
548 | 0 | gint original_errcode = *errcode; |
549 | |
|
550 | 0 | *errcode = -1; |
551 | 0 | *errmsg = NULL; |
552 | |
|
553 | 0 | switch (original_errcode) |
554 | 0 | { |
555 | 0 | case PCRE2_ERROR_END_BACKSLASH: |
556 | 0 | *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; |
557 | 0 | *errmsg = _("\\ at end of pattern"); |
558 | 0 | break; |
559 | 0 | case PCRE2_ERROR_END_BACKSLASH_C: |
560 | 0 | *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; |
561 | 0 | *errmsg = _("\\c at end of pattern"); |
562 | 0 | break; |
563 | 0 | case PCRE2_ERROR_UNKNOWN_ESCAPE: |
564 | 0 | case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
565 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; |
566 | 0 | *errmsg = _("unrecognized character following \\"); |
567 | 0 | break; |
568 | 0 | case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
569 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; |
570 | 0 | *errmsg = _("numbers out of order in {} quantifier"); |
571 | 0 | break; |
572 | 0 | case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
573 | 0 | *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; |
574 | 0 | *errmsg = _("number too big in {} quantifier"); |
575 | 0 | break; |
576 | 0 | case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
577 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; |
578 | 0 | *errmsg = _("missing terminating ] for character class"); |
579 | 0 | break; |
580 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
581 | 0 | *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; |
582 | 0 | *errmsg = _("invalid escape sequence in character class"); |
583 | 0 | break; |
584 | 0 | case PCRE2_ERROR_CLASS_RANGE_ORDER: |
585 | 0 | *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; |
586 | 0 | *errmsg = _("range out of order in character class"); |
587 | 0 | break; |
588 | 0 | case PCRE2_ERROR_QUANTIFIER_INVALID: |
589 | 0 | case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: |
590 | 0 | *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; |
591 | 0 | *errmsg = _("nothing to repeat"); |
592 | 0 | break; |
593 | 0 | case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
594 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
595 | 0 | *errmsg = _("unrecognized character after (? or (?-"); |
596 | 0 | break; |
597 | 0 | case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
598 | 0 | *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; |
599 | 0 | *errmsg = _("POSIX named classes are supported only within a class"); |
600 | 0 | break; |
601 | 0 | case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
602 | 0 | *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; |
603 | 0 | *errmsg = _("POSIX collating elements are not supported"); |
604 | 0 | break; |
605 | 0 | case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
606 | 0 | case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
607 | 0 | case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
608 | 0 | *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; |
609 | 0 | *errmsg = _("missing terminating )"); |
610 | 0 | break; |
611 | 0 | case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
612 | 0 | *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; |
613 | 0 | *errmsg = _("reference to non-existent subpattern"); |
614 | 0 | break; |
615 | 0 | case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
616 | 0 | *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; |
617 | 0 | *errmsg = _("missing ) after comment"); |
618 | 0 | break; |
619 | 0 | case PCRE2_ERROR_PATTERN_TOO_LARGE: |
620 | 0 | *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; |
621 | 0 | *errmsg = _("regular expression is too large"); |
622 | 0 | break; |
623 | 0 | case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
624 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; |
625 | 0 | *errmsg = _("malformed number or name after (?("); |
626 | 0 | break; |
627 | 0 | case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
628 | 0 | *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; |
629 | 0 | *errmsg = _("lookbehind assertion is not fixed length"); |
630 | 0 | break; |
631 | 0 | case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
632 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; |
633 | 0 | *errmsg = _("conditional group contains more than two branches"); |
634 | 0 | break; |
635 | 0 | case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
636 | 0 | *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; |
637 | 0 | *errmsg = _("assertion expected after (?("); |
638 | 0 | break; |
639 | 0 | case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
640 | 0 | *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; |
641 | 0 | *errmsg = _("a numbered reference must not be zero"); |
642 | 0 | break; |
643 | 0 | case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
644 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; |
645 | 0 | *errmsg = _("unknown POSIX class name"); |
646 | 0 | break; |
647 | 0 | case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
648 | 0 | case PCRE2_ERROR_INVALID_HEXADECIMAL: |
649 | 0 | *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; |
650 | 0 | *errmsg = _("character value in \\x{...} sequence is too large"); |
651 | 0 | break; |
652 | 0 | case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
653 | 0 | *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; |
654 | 0 | *errmsg = _("\\C not allowed in lookbehind assertion"); |
655 | 0 | break; |
656 | 0 | case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
657 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; |
658 | 0 | *errmsg = _("missing terminator in subpattern name"); |
659 | 0 | break; |
660 | 0 | case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
661 | 0 | *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; |
662 | 0 | *errmsg = _("two named subpatterns have the same name"); |
663 | 0 | break; |
664 | 0 | case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
665 | 0 | *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; |
666 | 0 | *errmsg = _("malformed \\P or \\p sequence"); |
667 | 0 | break; |
668 | 0 | case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
669 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; |
670 | 0 | *errmsg = _("unknown property name after \\P or \\p"); |
671 | 0 | break; |
672 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
673 | 0 | *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; |
674 | 0 | *errmsg = _("subpattern name is too long (maximum 32 characters)"); |
675 | 0 | break; |
676 | 0 | case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
677 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; |
678 | 0 | *errmsg = _("too many named subpatterns (maximum 10,000)"); |
679 | 0 | break; |
680 | 0 | case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
681 | 0 | *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; |
682 | 0 | *errmsg = _("octal value is greater than \\377"); |
683 | 0 | break; |
684 | 0 | case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
685 | 0 | *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; |
686 | 0 | *errmsg = _("DEFINE group contains more than one branch"); |
687 | 0 | break; |
688 | 0 | case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: |
689 | 0 | *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; |
690 | 0 | *errmsg = _("inconsistent NEWLINE options"); |
691 | 0 | break; |
692 | 0 | case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
693 | 0 | *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; |
694 | 0 | *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " |
695 | 0 | "number, or by a plain number"); |
696 | 0 | break; |
697 | 0 | case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: |
698 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; |
699 | 0 | *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); |
700 | 0 | break; |
701 | 0 | case PCRE2_ERROR_VERB_UNKNOWN: |
702 | 0 | *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; |
703 | 0 | *errmsg = _("(*VERB) not recognized"); |
704 | 0 | break; |
705 | 0 | case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
706 | 0 | *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; |
707 | 0 | *errmsg = _("number is too big"); |
708 | 0 | break; |
709 | 0 | case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
710 | 0 | *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; |
711 | 0 | *errmsg = _("missing subpattern name after (?&"); |
712 | 0 | break; |
713 | 0 | case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
714 | 0 | *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; |
715 | 0 | *errmsg = _("different names for subpatterns of the same number are not allowed"); |
716 | 0 | break; |
717 | 0 | case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
718 | 0 | *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; |
719 | 0 | *errmsg = _("(*MARK) must have an argument"); |
720 | 0 | break; |
721 | 0 | case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
722 | 0 | *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; |
723 | 0 | *errmsg = _( "\\c must be followed by an ASCII character"); |
724 | 0 | break; |
725 | 0 | case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
726 | 0 | *errcode = G_REGEX_ERROR_MISSING_NAME; |
727 | 0 | *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); |
728 | 0 | break; |
729 | 0 | case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
730 | 0 | *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; |
731 | 0 | *errmsg = _("\\N is not supported in a class"); |
732 | 0 | break; |
733 | 0 | case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
734 | 0 | *errcode = G_REGEX_ERROR_NAME_TOO_LONG; |
735 | 0 | *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); |
736 | 0 | break; |
737 | 0 | case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: |
738 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
739 | 0 | *errmsg = _("code overflow"); |
740 | 0 | break; |
741 | 0 | case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
742 | 0 | *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
743 | 0 | *errmsg = _("unrecognized character after (?P"); |
744 | 0 | break; |
745 | 0 | case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: |
746 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
747 | 0 | *errmsg = _("overran compiling workspace"); |
748 | 0 | break; |
749 | 0 | case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: |
750 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
751 | 0 | *errmsg = _("previously-checked referenced subpattern not found"); |
752 | 0 | break; |
753 | 0 | case PCRE2_ERROR_HEAP_FAILED: |
754 | 0 | case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: |
755 | 0 | case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
756 | 0 | case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
757 | 0 | case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
758 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: |
759 | 0 | case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
760 | 0 | case PCRE2_ERROR_INTERNAL_STUDY_ERROR: |
761 | 0 | case PCRE2_ERROR_UTF_IS_DISABLED: |
762 | 0 | case PCRE2_ERROR_UCP_IS_DISABLED: |
763 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: |
764 | 0 | case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
765 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE: |
766 | 0 | case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: |
767 | 0 | *errcode = G_REGEX_ERROR_INTERNAL; |
768 | 0 | break; |
769 | 0 | case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
770 | 0 | case PCRE2_ERROR_CLASS_INVALID_RANGE: |
771 | 0 | case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
772 | 0 | case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
773 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
774 | 0 | case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
775 | 0 | case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
776 | 0 | case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
777 | 0 | case PCRE2_ERROR_NULL_PATTERN: |
778 | 0 | case PCRE2_ERROR_BAD_OPTIONS: |
779 | 0 | case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
780 | 0 | case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
781 | 0 | case PCRE2_ERROR_INVALID_OCTAL: |
782 | 0 | case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
783 | 0 | case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
784 | 0 | case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
785 | 0 | case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
786 | 0 | case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
787 | 0 | case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
788 | 0 | case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
789 | 0 | case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
790 | 0 | case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
791 | 0 | case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
792 | 0 | case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
793 | 0 | case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
794 | 0 | default: |
795 | 0 | *errcode = G_REGEX_ERROR_COMPILE; |
796 | 0 | break; |
797 | 0 | } |
798 | | |
799 | 0 | g_assert (*errcode != -1); |
800 | 0 | } |
801 | | |
802 | | /* GMatchInfo */ |
803 | | |
804 | | static GMatchInfo * |
805 | | match_info_new (const GRegex *regex, |
806 | | const gchar *string, |
807 | | gint string_len, |
808 | | gint start_position, |
809 | | GRegexMatchFlags match_options, |
810 | | gboolean is_dfa) |
811 | 0 | { |
812 | 0 | GMatchInfo *match_info; |
813 | |
|
814 | 0 | if (string_len < 0) |
815 | 0 | string_len = strlen (string); |
816 | |
|
817 | 0 | match_info = g_new0 (GMatchInfo, 1); |
818 | 0 | match_info->ref_count = 1; |
819 | 0 | match_info->regex = g_regex_ref ((GRegex *)regex); |
820 | 0 | match_info->string = string; |
821 | 0 | match_info->string_len = string_len; |
822 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
823 | 0 | match_info->pos = start_position; |
824 | 0 | match_info->match_opts = |
825 | 0 | get_pcre2_match_options (match_options, regex->orig_compile_opts); |
826 | |
|
827 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, |
828 | 0 | &match_info->n_subpatterns); |
829 | |
|
830 | 0 | match_info->match_context = pcre2_match_context_create (NULL); |
831 | |
|
832 | 0 | if (is_dfa) |
833 | 0 | { |
834 | | /* These values should be enough for most cases, if they are not |
835 | | * enough g_regex_match_all_full() will expand them. */ |
836 | 0 | match_info->n_workspace = 100; |
837 | 0 | match_info->workspace = g_new (gint, match_info->n_workspace); |
838 | 0 | } |
839 | |
|
840 | 0 | match_info->n_offsets = 2; |
841 | 0 | match_info->offsets = g_new0 (gint, match_info->n_offsets); |
842 | | /* Set an invalid position for the previous match. */ |
843 | 0 | match_info->offsets[0] = -1; |
844 | 0 | match_info->offsets[1] = -1; |
845 | |
|
846 | 0 | match_info->match_data = pcre2_match_data_create_from_pattern ( |
847 | 0 | match_info->regex->pcre_re, |
848 | 0 | NULL); |
849 | |
|
850 | 0 | return match_info; |
851 | 0 | } |
852 | | |
853 | | static gboolean |
854 | | recalc_match_offsets (GMatchInfo *match_info, |
855 | | GError **error) |
856 | 0 | { |
857 | 0 | PCRE2_SIZE *ovector; |
858 | 0 | uint32_t ovector_size = 0; |
859 | 0 | uint32_t pre_n_offset; |
860 | 0 | uint32_t i; |
861 | |
|
862 | 0 | g_assert (!IS_PCRE2_ERROR (match_info->matches)); |
863 | | |
864 | 0 | if (match_info->matches == PCRE2_ERROR_PARTIAL) |
865 | 0 | ovector_size = 1; |
866 | 0 | else if (match_info->matches > 0) |
867 | 0 | ovector_size = match_info->matches; |
868 | |
|
869 | 0 | g_assert (ovector_size != 0); |
870 | | |
871 | 0 | if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) |
872 | 0 | { |
873 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
874 | 0 | _("Error while matching regular expression %s: %s"), |
875 | 0 | match_info->regex->pattern, _("code overflow")); |
876 | 0 | return FALSE; |
877 | 0 | } |
878 | | |
879 | 0 | pre_n_offset = match_info->n_offsets; |
880 | 0 | match_info->n_offsets = ovector_size * 2; |
881 | 0 | ovector = pcre2_get_ovector_pointer (match_info->match_data); |
882 | |
|
883 | 0 | if (match_info->n_offsets != pre_n_offset) |
884 | 0 | { |
885 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
886 | 0 | match_info->n_offsets, |
887 | 0 | sizeof (gint)); |
888 | 0 | } |
889 | |
|
890 | 0 | for (i = 0; i < match_info->n_offsets; i++) |
891 | 0 | { |
892 | 0 | match_info->offsets[i] = (int) ovector[i]; |
893 | 0 | } |
894 | |
|
895 | 0 | return TRUE; |
896 | 0 | } |
897 | | |
898 | | static JITStatus |
899 | | enable_jit_with_match_options (GRegex *regex, |
900 | | uint32_t match_options) |
901 | 0 | { |
902 | 0 | gint retval; |
903 | 0 | uint32_t old_jit_options, new_jit_options; |
904 | |
|
905 | 0 | if (!(regex->orig_compile_opts & G_REGEX_OPTIMIZE)) |
906 | 0 | return JIT_STATUS_DISABLED; |
907 | | |
908 | 0 | if (regex->jit_status == JIT_STATUS_DISABLED) |
909 | 0 | return JIT_STATUS_DISABLED; |
910 | | |
911 | 0 | if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) |
912 | 0 | return JIT_STATUS_DISABLED; |
913 | | |
914 | 0 | old_jit_options = regex->jit_options; |
915 | 0 | new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; |
916 | 0 | if (match_options & PCRE2_PARTIAL_HARD) |
917 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_HARD; |
918 | 0 | if (match_options & PCRE2_PARTIAL_SOFT) |
919 | 0 | new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; |
920 | | |
921 | | /* no new options enabled */ |
922 | 0 | if (new_jit_options == old_jit_options) |
923 | 0 | return regex->jit_status; |
924 | | |
925 | 0 | retval = pcre2_jit_compile (regex->pcre_re, new_jit_options); |
926 | 0 | switch (retval) |
927 | 0 | { |
928 | 0 | case 0: /* JIT enabled successfully */ |
929 | 0 | regex->jit_options = new_jit_options; |
930 | 0 | return JIT_STATUS_ENABLED; |
931 | 0 | case PCRE2_ERROR_NOMEMORY: |
932 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
933 | 0 | "but JIT was unable to allocate executable memory for the " |
934 | 0 | "compiler. Falling back to interpretive code."); |
935 | 0 | return JIT_STATUS_DISABLED; |
936 | 0 | case PCRE2_ERROR_JIT_BADOPTION: |
937 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
938 | 0 | "but JIT support is not available. Falling back to " |
939 | 0 | "interpretive code."); |
940 | 0 | return JIT_STATUS_DISABLED; |
941 | 0 | break; |
942 | 0 | default: |
943 | 0 | g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
944 | 0 | "but request for JIT support had unexpectedly failed (error %d). " |
945 | 0 | "Falling back to interpretive code.", retval); |
946 | 0 | return JIT_STATUS_DISABLED; |
947 | 0 | break; |
948 | 0 | } |
949 | | |
950 | 0 | return regex->jit_status; |
951 | 0 | } |
952 | | |
953 | | /** |
954 | | * g_match_info_get_regex: |
955 | | * @match_info: a #GMatchInfo |
956 | | * |
957 | | * Returns #GRegex object used in @match_info. It belongs to Glib |
958 | | * and must not be freed. Use g_regex_ref() if you need to keep it |
959 | | * after you free @match_info object. |
960 | | * |
961 | | * Returns: (transfer none): #GRegex object used in @match_info |
962 | | * |
963 | | * Since: 2.14 |
964 | | */ |
965 | | GRegex * |
966 | | g_match_info_get_regex (const GMatchInfo *match_info) |
967 | 0 | { |
968 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
969 | 0 | return match_info->regex; |
970 | 0 | } |
971 | | |
972 | | /** |
973 | | * g_match_info_get_string: |
974 | | * @match_info: a #GMatchInfo |
975 | | * |
976 | | * Returns the string searched with @match_info. This is the |
977 | | * string passed to g_regex_match() or g_regex_replace() so |
978 | | * you may not free it before calling this function. |
979 | | * |
980 | | * Returns: the string searched with @match_info |
981 | | * |
982 | | * Since: 2.14 |
983 | | */ |
984 | | const gchar * |
985 | | g_match_info_get_string (const GMatchInfo *match_info) |
986 | 0 | { |
987 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
988 | 0 | return match_info->string; |
989 | 0 | } |
990 | | |
991 | | /** |
992 | | * g_match_info_ref: |
993 | | * @match_info: a #GMatchInfo |
994 | | * |
995 | | * Increases reference count of @match_info by 1. |
996 | | * |
997 | | * Returns: @match_info |
998 | | * |
999 | | * Since: 2.30 |
1000 | | */ |
1001 | | GMatchInfo * |
1002 | | g_match_info_ref (GMatchInfo *match_info) |
1003 | 0 | { |
1004 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1005 | 0 | g_atomic_int_inc (&match_info->ref_count); |
1006 | 0 | return match_info; |
1007 | 0 | } |
1008 | | |
1009 | | /** |
1010 | | * g_match_info_unref: |
1011 | | * @match_info: a #GMatchInfo |
1012 | | * |
1013 | | * Decreases reference count of @match_info by 1. When reference count drops |
1014 | | * to zero, it frees all the memory associated with the match_info structure. |
1015 | | * |
1016 | | * Since: 2.30 |
1017 | | */ |
1018 | | void |
1019 | | g_match_info_unref (GMatchInfo *match_info) |
1020 | 0 | { |
1021 | 0 | if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
1022 | 0 | { |
1023 | 0 | g_regex_unref (match_info->regex); |
1024 | 0 | if (match_info->match_context) |
1025 | 0 | pcre2_match_context_free (match_info->match_context); |
1026 | 0 | if (match_info->match_data) |
1027 | 0 | pcre2_match_data_free (match_info->match_data); |
1028 | 0 | g_free (match_info->offsets); |
1029 | 0 | g_free (match_info->workspace); |
1030 | 0 | g_free (match_info); |
1031 | 0 | } |
1032 | 0 | } |
1033 | | |
1034 | | /** |
1035 | | * g_match_info_free: |
1036 | | * @match_info: (nullable): a #GMatchInfo, or %NULL |
1037 | | * |
1038 | | * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
1039 | | * nothing. |
1040 | | * |
1041 | | * Since: 2.14 |
1042 | | */ |
1043 | | void |
1044 | | g_match_info_free (GMatchInfo *match_info) |
1045 | 0 | { |
1046 | 0 | if (match_info == NULL) |
1047 | 0 | return; |
1048 | | |
1049 | 0 | g_match_info_unref (match_info); |
1050 | 0 | } |
1051 | | |
1052 | | /** |
1053 | | * g_match_info_next: |
1054 | | * @match_info: a #GMatchInfo structure |
1055 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1056 | | * |
1057 | | * Scans for the next match using the same parameters of the previous |
1058 | | * call to g_regex_match_full() or g_regex_match() that returned |
1059 | | * @match_info. |
1060 | | * |
1061 | | * The match is done on the string passed to the match function, so you |
1062 | | * cannot free it before calling this function. |
1063 | | * |
1064 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
1065 | | * |
1066 | | * Since: 2.14 |
1067 | | */ |
1068 | | gboolean |
1069 | | g_match_info_next (GMatchInfo *match_info, |
1070 | | GError **error) |
1071 | 0 | { |
1072 | 0 | JITStatus jit_status; |
1073 | 0 | gint prev_match_start; |
1074 | 0 | gint prev_match_end; |
1075 | 0 | uint32_t opts; |
1076 | |
|
1077 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1078 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
1079 | 0 | g_return_val_if_fail (match_info->pos >= 0, FALSE); |
1080 | | |
1081 | 0 | prev_match_start = match_info->offsets[0]; |
1082 | 0 | prev_match_end = match_info->offsets[1]; |
1083 | |
|
1084 | 0 | if (match_info->pos > match_info->string_len) |
1085 | 0 | { |
1086 | | /* we have reached the end of the string */ |
1087 | 0 | match_info->pos = -1; |
1088 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1089 | 0 | return FALSE; |
1090 | 0 | } |
1091 | | |
1092 | 0 | opts = match_info->regex->match_opts | match_info->match_opts; |
1093 | |
|
1094 | 0 | jit_status = enable_jit_with_match_options (match_info->regex, opts); |
1095 | 0 | if (jit_status == JIT_STATUS_ENABLED) |
1096 | 0 | { |
1097 | 0 | match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, |
1098 | 0 | (PCRE2_SPTR8) match_info->string, |
1099 | 0 | match_info->string_len, |
1100 | 0 | match_info->pos, |
1101 | 0 | opts, |
1102 | 0 | match_info->match_data, |
1103 | 0 | match_info->match_context); |
1104 | 0 | } |
1105 | 0 | else |
1106 | 0 | { |
1107 | 0 | match_info->matches = pcre2_match (match_info->regex->pcre_re, |
1108 | 0 | (PCRE2_SPTR8) match_info->string, |
1109 | 0 | match_info->string_len, |
1110 | 0 | match_info->pos, |
1111 | 0 | opts, |
1112 | 0 | match_info->match_data, |
1113 | 0 | match_info->match_context); |
1114 | 0 | } |
1115 | |
|
1116 | 0 | if (IS_PCRE2_ERROR (match_info->matches)) |
1117 | 0 | { |
1118 | 0 | gchar *error_msg = get_match_error_message (match_info->matches); |
1119 | |
|
1120 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
1121 | 0 | _("Error while matching regular expression %s: %s"), |
1122 | 0 | match_info->regex->pattern, error_msg); |
1123 | 0 | g_clear_pointer (&error_msg, g_free); |
1124 | 0 | return FALSE; |
1125 | 0 | } |
1126 | 0 | else if (match_info->matches == 0) |
1127 | 0 | { |
1128 | | /* info->offsets is too small. */ |
1129 | 0 | match_info->n_offsets *= 2; |
1130 | 0 | match_info->offsets = g_realloc_n (match_info->offsets, |
1131 | 0 | match_info->n_offsets, |
1132 | 0 | sizeof (gint)); |
1133 | |
|
1134 | 0 | pcre2_match_data_free (match_info->match_data); |
1135 | 0 | match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); |
1136 | |
|
1137 | 0 | return g_match_info_next (match_info, error); |
1138 | 0 | } |
1139 | 0 | else if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1140 | 0 | { |
1141 | | /* We're done with this match info */ |
1142 | 0 | match_info->pos = -1; |
1143 | 0 | return FALSE; |
1144 | 0 | } |
1145 | 0 | else |
1146 | 0 | if (!recalc_match_offsets (match_info, error)) |
1147 | 0 | return FALSE; |
1148 | | |
1149 | | /* avoid infinite loops if the pattern is an empty string or something |
1150 | | * equivalent */ |
1151 | 0 | if (match_info->pos == match_info->offsets[1]) |
1152 | 0 | { |
1153 | 0 | if (match_info->pos > match_info->string_len) |
1154 | 0 | { |
1155 | | /* we have reached the end of the string */ |
1156 | 0 | match_info->pos = -1; |
1157 | 0 | match_info->matches = PCRE2_ERROR_NOMATCH; |
1158 | 0 | return FALSE; |
1159 | 0 | } |
1160 | | |
1161 | 0 | match_info->pos = NEXT_CHAR (match_info->regex, |
1162 | 0 | &match_info->string[match_info->pos]) - |
1163 | 0 | match_info->string; |
1164 | 0 | } |
1165 | 0 | else |
1166 | 0 | { |
1167 | 0 | match_info->pos = match_info->offsets[1]; |
1168 | 0 | } |
1169 | | |
1170 | 0 | g_assert (match_info->matches < 0 || |
1171 | 0 | (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); |
1172 | | |
1173 | | /* it's possible to get two identical matches when we are matching |
1174 | | * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and |
1175 | | * the string is "RegExTest" we have: |
1176 | | * - search at position 0: match from 0 to 0 |
1177 | | * - search at position 1: match from 3 to 3 |
1178 | | * - search at position 3: match from 3 to 3 (duplicate) |
1179 | | * - search at position 4: match from 5 to 5 |
1180 | | * - search at position 5: match from 5 to 5 (duplicate) |
1181 | | * - search at position 6: no match -> stop |
1182 | | * so we have to ignore the duplicates. |
1183 | | * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ |
1184 | 0 | if (match_info->matches >= 0 && |
1185 | 0 | prev_match_start == match_info->offsets[0] && |
1186 | 0 | prev_match_end == match_info->offsets[1]) |
1187 | 0 | { |
1188 | | /* ignore this match and search the next one */ |
1189 | 0 | return g_match_info_next (match_info, error); |
1190 | 0 | } |
1191 | | |
1192 | 0 | return match_info->matches >= 0; |
1193 | 0 | } |
1194 | | |
1195 | | /** |
1196 | | * g_match_info_matches: |
1197 | | * @match_info: a #GMatchInfo structure |
1198 | | * |
1199 | | * Returns whether the previous match operation succeeded. |
1200 | | * |
1201 | | * Returns: %TRUE if the previous match operation succeeded, |
1202 | | * %FALSE otherwise |
1203 | | * |
1204 | | * Since: 2.14 |
1205 | | */ |
1206 | | gboolean |
1207 | | g_match_info_matches (const GMatchInfo *match_info) |
1208 | 0 | { |
1209 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1210 | | |
1211 | 0 | return match_info->matches >= 0; |
1212 | 0 | } |
1213 | | |
1214 | | /** |
1215 | | * g_match_info_get_match_count: |
1216 | | * @match_info: a #GMatchInfo structure |
1217 | | * |
1218 | | * Retrieves the number of matched substrings (including substring 0, |
1219 | | * that is the whole matched text), so 1 is returned if the pattern |
1220 | | * has no substrings in it and 0 is returned if the match failed. |
1221 | | * |
1222 | | * If the last match was obtained using the DFA algorithm, that is |
1223 | | * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1224 | | * count is not that of the number of capturing parentheses but that of |
1225 | | * the number of matched substrings. |
1226 | | * |
1227 | | * Returns: Number of matched substrings, or -1 if an error occurred |
1228 | | * |
1229 | | * Since: 2.14 |
1230 | | */ |
1231 | | gint |
1232 | | g_match_info_get_match_count (const GMatchInfo *match_info) |
1233 | 0 | { |
1234 | 0 | g_return_val_if_fail (match_info, -1); |
1235 | | |
1236 | 0 | if (match_info->matches == PCRE2_ERROR_NOMATCH) |
1237 | | /* no match */ |
1238 | 0 | return 0; |
1239 | 0 | else if (match_info->matches < PCRE2_ERROR_NOMATCH) |
1240 | | /* error */ |
1241 | 0 | return -1; |
1242 | 0 | else |
1243 | | /* match */ |
1244 | 0 | return match_info->matches; |
1245 | 0 | } |
1246 | | |
1247 | | /** |
1248 | | * g_match_info_is_partial_match: |
1249 | | * @match_info: a #GMatchInfo structure |
1250 | | * |
1251 | | * Usually if the string passed to g_regex_match*() matches as far as |
1252 | | * it goes, but is too short to match the entire pattern, %FALSE is |
1253 | | * returned. There are circumstances where it might be helpful to |
1254 | | * distinguish this case from other cases in which there is no match. |
1255 | | * |
1256 | | * Consider, for example, an application where a human is required to |
1257 | | * type in data for a field with specific formatting requirements. An |
1258 | | * example might be a date in the form ddmmmyy, defined by the pattern |
1259 | | * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
1260 | | * If the application sees the user’s keystrokes one by one, and can |
1261 | | * check that what has been typed so far is potentially valid, it is |
1262 | | * able to raise an error as soon as a mistake is made. |
1263 | | * |
1264 | | * GRegex supports the concept of partial matching by means of the |
1265 | | * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags. |
1266 | | * When they are used, the return code for |
1267 | | * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
1268 | | * for a complete match, %FALSE otherwise. But, when these functions |
1269 | | * return %FALSE, you can check if the match was partial calling |
1270 | | * g_match_info_is_partial_match(). |
1271 | | * |
1272 | | * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and |
1273 | | * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
1274 | | * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
1275 | | * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching |
1276 | | * stops at the partial match. |
1277 | | * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD |
1278 | | * are set, the latter takes precedence. |
1279 | | * |
1280 | | * There were formerly some restrictions on the pattern for partial matching. |
1281 | | * The restrictions no longer apply. |
1282 | | * |
1283 | | * See pcrepartial(3) for more information on partial matching. |
1284 | | * |
1285 | | * Returns: %TRUE if the match was partial, %FALSE otherwise |
1286 | | * |
1287 | | * Since: 2.14 |
1288 | | */ |
1289 | | gboolean |
1290 | | g_match_info_is_partial_match (const GMatchInfo *match_info) |
1291 | 0 | { |
1292 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1293 | | |
1294 | 0 | return match_info->matches == PCRE2_ERROR_PARTIAL; |
1295 | 0 | } |
1296 | | |
1297 | | /** |
1298 | | * g_match_info_expand_references: |
1299 | | * @match_info: (nullable): a #GMatchInfo or %NULL |
1300 | | * @string_to_expand: the string to expand |
1301 | | * @error: location to store the error occurring, or %NULL to ignore errors |
1302 | | * |
1303 | | * Returns a new string containing the text in @string_to_expand with |
1304 | | * references and escape sequences expanded. References refer to the last |
1305 | | * match done with @string against @regex and have the same syntax used by |
1306 | | * g_regex_replace(). |
1307 | | * |
1308 | | * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was |
1309 | | * passed to g_regex_new(). |
1310 | | * |
1311 | | * The backreferences are extracted from the string passed to the match |
1312 | | * function, so you cannot call this function after freeing the string. |
1313 | | * |
1314 | | * @match_info may be %NULL in which case @string_to_expand must not |
1315 | | * contain references. For instance "foo\n" does not refer to an actual |
1316 | | * pattern and '\n' merely will be replaced with \n character, |
1317 | | * while to expand "\0" (whole match) one needs the result of a match. |
1318 | | * Use g_regex_check_replacement() to find out whether @string_to_expand |
1319 | | * contains references. |
1320 | | * |
1321 | | * Returns: (nullable): the expanded string, or %NULL if an error occurred |
1322 | | * |
1323 | | * Since: 2.14 |
1324 | | */ |
1325 | | gchar * |
1326 | | g_match_info_expand_references (const GMatchInfo *match_info, |
1327 | | const gchar *string_to_expand, |
1328 | | GError **error) |
1329 | 0 | { |
1330 | 0 | GString *result; |
1331 | 0 | GList *list; |
1332 | 0 | GError *tmp_error = NULL; |
1333 | |
|
1334 | 0 | g_return_val_if_fail (string_to_expand != NULL, NULL); |
1335 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1336 | | |
1337 | 0 | list = split_replacement (string_to_expand, &tmp_error); |
1338 | 0 | if (tmp_error != NULL) |
1339 | 0 | { |
1340 | 0 | g_propagate_error (error, tmp_error); |
1341 | 0 | return NULL; |
1342 | 0 | } |
1343 | | |
1344 | 0 | if (!match_info && interpolation_list_needs_match (list)) |
1345 | 0 | { |
1346 | 0 | g_critical ("String '%s' contains references to the match, can't " |
1347 | 0 | "expand references without GMatchInfo object", |
1348 | 0 | string_to_expand); |
1349 | 0 | return NULL; |
1350 | 0 | } |
1351 | | |
1352 | 0 | result = g_string_sized_new (strlen (string_to_expand)); |
1353 | 0 | interpolate_replacement (match_info, result, list); |
1354 | |
|
1355 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
1356 | |
|
1357 | 0 | return g_string_free (result, FALSE); |
1358 | 0 | } |
1359 | | |
1360 | | /** |
1361 | | * g_match_info_fetch: |
1362 | | * @match_info: #GMatchInfo structure |
1363 | | * @match_num: number of the sub expression |
1364 | | * |
1365 | | * Retrieves the text matching the @match_num'th capturing |
1366 | | * parentheses. 0 is the full text of the match, 1 is the first paren |
1367 | | * set, 2 the second, and so on. |
1368 | | * |
1369 | | * If @match_num is a valid sub pattern but it didn't match anything |
1370 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
1371 | | * string is returned. |
1372 | | * |
1373 | | * If the match was obtained using the DFA algorithm, that is using |
1374 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1375 | | * string is not that of a set of parentheses but that of a matched |
1376 | | * substring. Substrings are matched in reverse order of length, so |
1377 | | * 0 is the longest match. |
1378 | | * |
1379 | | * The string is fetched from the string passed to the match function, |
1380 | | * so you cannot call this function after freeing the string. |
1381 | | * |
1382 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1383 | | * occurred. You have to free the string yourself |
1384 | | * |
1385 | | * Since: 2.14 |
1386 | | */ |
1387 | | gchar * |
1388 | | g_match_info_fetch (const GMatchInfo *match_info, |
1389 | | gint match_num) |
1390 | 0 | { |
1391 | 0 | gchar *match = NULL; |
1392 | 0 | gint start, end; |
1393 | |
|
1394 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1395 | 0 | g_return_val_if_fail (match_num >= 0, NULL); |
1396 | | |
1397 | | /* match_num does not exist or it didn't matched, i.e. matching "b" |
1398 | | * against "(a)?b" then group 0 is empty. */ |
1399 | 0 | if (!g_match_info_fetch_pos (match_info, match_num, &start, &end)) |
1400 | 0 | match = NULL; |
1401 | 0 | else if (start == -1) |
1402 | 0 | match = g_strdup (""); |
1403 | 0 | else |
1404 | 0 | match = g_strndup (&match_info->string[start], end - start); |
1405 | |
|
1406 | 0 | return match; |
1407 | 0 | } |
1408 | | |
1409 | | /** |
1410 | | * g_match_info_fetch_pos: |
1411 | | * @match_info: #GMatchInfo structure |
1412 | | * @match_num: number of the sub expression |
1413 | | * @start_pos: (out) (optional): pointer to location where to store |
1414 | | * the start position, or %NULL |
1415 | | * @end_pos: (out) (optional): pointer to location where to store |
1416 | | * the end position, or %NULL |
1417 | | * |
1418 | | * Retrieves the position in bytes of the @match_num'th capturing |
1419 | | * parentheses. 0 is the full text of the match, 1 is the first |
1420 | | * paren set, 2 the second, and so on. |
1421 | | * |
1422 | | * If @match_num is a valid sub pattern but it didn't match anything |
1423 | | * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos |
1424 | | * and @end_pos are set to -1 and %TRUE is returned. |
1425 | | * |
1426 | | * If the match was obtained using the DFA algorithm, that is using |
1427 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1428 | | * position is not that of a set of parentheses but that of a matched |
1429 | | * substring. Substrings are matched in reverse order of length, so |
1430 | | * 0 is the longest match. |
1431 | | * |
1432 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. If |
1433 | | * the position cannot be fetched, @start_pos and @end_pos are left |
1434 | | * unchanged |
1435 | | * |
1436 | | * Since: 2.14 |
1437 | | */ |
1438 | | gboolean |
1439 | | g_match_info_fetch_pos (const GMatchInfo *match_info, |
1440 | | gint match_num, |
1441 | | gint *start_pos, |
1442 | | gint *end_pos) |
1443 | 0 | { |
1444 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1445 | 0 | g_return_val_if_fail (match_num >= 0, FALSE); |
1446 | | |
1447 | | /* check whether there was an error */ |
1448 | 0 | if (match_info->matches < 0) |
1449 | 0 | return FALSE; |
1450 | | |
1451 | | /* make sure the sub expression number they're requesting is less than |
1452 | | * the total number of sub expressions in the regex. When matching all |
1453 | | * (g_regex_match_all()), also compare against the number of matches */ |
1454 | 0 | if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) |
1455 | 0 | return FALSE; |
1456 | | |
1457 | 0 | if (start_pos != NULL) |
1458 | 0 | *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; |
1459 | |
|
1460 | 0 | if (end_pos != NULL) |
1461 | 0 | *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; |
1462 | |
|
1463 | 0 | return TRUE; |
1464 | 0 | } |
1465 | | |
1466 | | /* |
1467 | | * Returns number of first matched subpattern with name @name. |
1468 | | * There may be more than one in case when DUPNAMES is used, |
1469 | | * and not all subpatterns with that name match; |
1470 | | * pcre2_substring_number_from_name() does not work in that case. |
1471 | | */ |
1472 | | static gint |
1473 | | get_matched_substring_number (const GMatchInfo *match_info, |
1474 | | const gchar *name) |
1475 | 0 | { |
1476 | 0 | gint entrysize; |
1477 | 0 | PCRE2_SPTR first, last; |
1478 | 0 | guchar *entry; |
1479 | |
|
1480 | 0 | if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) |
1481 | 0 | return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); |
1482 | | |
1483 | | /* This code is analogous to code from pcre2_substring.c: |
1484 | | * pcre2_substring_get_byname() */ |
1485 | 0 | entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, |
1486 | 0 | (PCRE2_SPTR8) name, |
1487 | 0 | &first, |
1488 | 0 | &last); |
1489 | |
|
1490 | 0 | if (entrysize <= 0) |
1491 | 0 | return entrysize; |
1492 | | |
1493 | 0 | for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
1494 | 0 | { |
1495 | 0 | gint n = (entry[0] << 8) + entry[1]; |
1496 | 0 | if (match_info->offsets[n*2] >= 0) |
1497 | 0 | return n; |
1498 | 0 | } |
1499 | | |
1500 | 0 | return (first[0] << 8) + first[1]; |
1501 | 0 | } |
1502 | | |
1503 | | /** |
1504 | | * g_match_info_fetch_named: |
1505 | | * @match_info: #GMatchInfo structure |
1506 | | * @name: name of the subexpression |
1507 | | * |
1508 | | * Retrieves the text matching the capturing parentheses named @name. |
1509 | | * |
1510 | | * If @name is a valid sub pattern name but it didn't match anything |
1511 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1512 | | * then an empty string is returned. |
1513 | | * |
1514 | | * The string is fetched from the string passed to the match function, |
1515 | | * so you cannot call this function after freeing the string. |
1516 | | * |
1517 | | * Returns: (nullable): The matched substring, or %NULL if an error |
1518 | | * occurred. You have to free the string yourself |
1519 | | * |
1520 | | * Since: 2.14 |
1521 | | */ |
1522 | | gchar * |
1523 | | g_match_info_fetch_named (const GMatchInfo *match_info, |
1524 | | const gchar *name) |
1525 | 0 | { |
1526 | 0 | gint num; |
1527 | |
|
1528 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1529 | 0 | g_return_val_if_fail (name != NULL, NULL); |
1530 | | |
1531 | 0 | num = get_matched_substring_number (match_info, name); |
1532 | 0 | if (num < 0) |
1533 | 0 | return NULL; |
1534 | 0 | else |
1535 | 0 | return g_match_info_fetch (match_info, num); |
1536 | 0 | } |
1537 | | |
1538 | | /** |
1539 | | * g_match_info_fetch_named_pos: |
1540 | | * @match_info: #GMatchInfo structure |
1541 | | * @name: name of the subexpression |
1542 | | * @start_pos: (out) (optional): pointer to location where to store |
1543 | | * the start position, or %NULL |
1544 | | * @end_pos: (out) (optional): pointer to location where to store |
1545 | | * the end position, or %NULL |
1546 | | * |
1547 | | * Retrieves the position in bytes of the capturing parentheses named @name. |
1548 | | * |
1549 | | * If @name is a valid sub pattern name but it didn't match anything |
1550 | | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1551 | | * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
1552 | | * |
1553 | | * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
1554 | | * If the position cannot be fetched, @start_pos and @end_pos |
1555 | | * are left unchanged. |
1556 | | * |
1557 | | * Since: 2.14 |
1558 | | */ |
1559 | | gboolean |
1560 | | g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
1561 | | const gchar *name, |
1562 | | gint *start_pos, |
1563 | | gint *end_pos) |
1564 | 0 | { |
1565 | 0 | gint num; |
1566 | |
|
1567 | 0 | g_return_val_if_fail (match_info != NULL, FALSE); |
1568 | 0 | g_return_val_if_fail (name != NULL, FALSE); |
1569 | | |
1570 | 0 | num = get_matched_substring_number (match_info, name); |
1571 | 0 | if (num < 0) |
1572 | 0 | return FALSE; |
1573 | | |
1574 | 0 | return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); |
1575 | 0 | } |
1576 | | |
1577 | | /** |
1578 | | * g_match_info_fetch_all: |
1579 | | * @match_info: a #GMatchInfo structure |
1580 | | * |
1581 | | * Bundles up pointers to each of the matching substrings from a match |
1582 | | * and stores them in an array of gchar pointers. The first element in |
1583 | | * the returned array is the match number 0, i.e. the entire matched |
1584 | | * text. |
1585 | | * |
1586 | | * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
1587 | | * "b" against "(a)?b") then an empty string is inserted. |
1588 | | * |
1589 | | * If the last match was obtained using the DFA algorithm, that is using |
1590 | | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1591 | | * strings are not that matched by sets of parentheses but that of the |
1592 | | * matched substring. Substrings are matched in reverse order of length, |
1593 | | * so the first one is the longest match. |
1594 | | * |
1595 | | * The strings are fetched from the string passed to the match function, |
1596 | | * so you cannot call this function after freeing the string. |
1597 | | * |
1598 | | * Returns: (transfer full): a %NULL-terminated array of gchar * |
1599 | | * pointers. It must be freed using g_strfreev(). If the previous |
1600 | | * match failed %NULL is returned |
1601 | | * |
1602 | | * Since: 2.14 |
1603 | | */ |
1604 | | gchar ** |
1605 | | g_match_info_fetch_all (const GMatchInfo *match_info) |
1606 | 0 | { |
1607 | 0 | gchar **result; |
1608 | 0 | gint i; |
1609 | |
|
1610 | 0 | g_return_val_if_fail (match_info != NULL, NULL); |
1611 | | |
1612 | 0 | if (match_info->matches < 0) |
1613 | 0 | return NULL; |
1614 | | |
1615 | 0 | result = g_new (gchar *, match_info->matches + 1); |
1616 | 0 | for (i = 0; i < match_info->matches; i++) |
1617 | 0 | result[i] = g_match_info_fetch (match_info, i); |
1618 | 0 | result[i] = NULL; |
1619 | |
|
1620 | 0 | return result; |
1621 | 0 | } |
1622 | | |
1623 | | |
1624 | | /* GRegex */ |
1625 | | |
/* Defines the g_regex_error_quark() function for the quark string
 * "g-regex-error-quark". NOTE(review): this is presumably what the
 * G_REGEX_ERROR domain used by the functions below expands to - confirm
 * against the header. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1627 | | |
1628 | | /** |
1629 | | * g_regex_ref: |
1630 | | * @regex: a #GRegex |
1631 | | * |
1632 | | * Increases reference count of @regex by 1. |
1633 | | * |
1634 | | * Returns: @regex |
1635 | | * |
1636 | | * Since: 2.14 |
1637 | | */ |
1638 | | GRegex * |
1639 | | g_regex_ref (GRegex *regex) |
1640 | 0 | { |
1641 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1642 | 0 | g_atomic_int_inc (®ex->ref_count); |
1643 | 0 | return regex; |
1644 | 0 | } |
1645 | | |
1646 | | /** |
1647 | | * g_regex_unref: |
1648 | | * @regex: a #GRegex |
1649 | | * |
1650 | | * Decreases reference count of @regex by 1. When reference count drops |
1651 | | * to zero, it frees all the memory associated with the regex structure. |
1652 | | * |
1653 | | * Since: 2.14 |
1654 | | */ |
1655 | | void |
1656 | | g_regex_unref (GRegex *regex) |
1657 | 0 | { |
1658 | 0 | g_return_if_fail (regex != NULL); |
1659 | | |
1660 | 0 | if (g_atomic_int_dec_and_test (®ex->ref_count)) |
1661 | 0 | { |
1662 | 0 | g_free (regex->pattern); |
1663 | 0 | if (regex->pcre_re != NULL) |
1664 | 0 | pcre2_code_free (regex->pcre_re); |
1665 | 0 | g_free (regex); |
1666 | 0 | } |
1667 | 0 | } |
1668 | | |
/* Forward declaration: compiles @pattern with the given PCRE2 compile,
 * newline and BSR options; defined after g_regex_new(), which calls it. */
static pcre2_code * regex_compile (const gchar *pattern,
                                   uint32_t     compile_options,
                                   uint32_t     newline_options,
                                   uint32_t     bsr_options,
                                   GError     **error);

/* Forward declaration: merges the options PCRE2 recorded for the compiled
 * pattern @re into @compile_options; defined after regex_compile(). */
static uint32_t get_pcre2_inline_compile_options (pcre2_code *re,
                                                  uint32_t    compile_options);
1677 | | |
1678 | | /** |
1679 | | * g_regex_new: |
1680 | | * @pattern: the regular expression |
1681 | | * @compile_options: compile options for the regular expression, or 0 |
1682 | | * @match_options: match options for the regular expression, or 0 |
1683 | | * @error: return location for a #GError |
1684 | | * |
1685 | | * Compiles the regular expression to an internal form, and does |
1686 | | * the initial setup of the #GRegex structure. |
1687 | | * |
1688 | | * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
1689 | | * g_regex_unref() when you are done with it |
1690 | | * |
1691 | | * Since: 2.14 |
1692 | | */ |
1693 | | GRegex * |
1694 | | g_regex_new (const gchar *pattern, |
1695 | | GRegexCompileFlags compile_options, |
1696 | | GRegexMatchFlags match_options, |
1697 | | GError **error) |
1698 | 0 | { |
1699 | 0 | GRegex *regex; |
1700 | 0 | pcre2_code *re; |
1701 | 0 | static gsize initialised = 0; |
1702 | 0 | uint32_t pcre_compile_options; |
1703 | 0 | uint32_t pcre_match_options; |
1704 | 0 | uint32_t newline_options; |
1705 | 0 | uint32_t bsr_options; |
1706 | |
|
1707 | 0 | g_return_val_if_fail (pattern != NULL, NULL); |
1708 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1709 | 0 | G_GNUC_BEGIN_IGNORE_DEPRECATIONS |
1710 | 0 | g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | |
1711 | 0 | G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); |
1712 | 0 | G_GNUC_END_IGNORE_DEPRECATIONS |
1713 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
1714 | | |
1715 | 0 | if (g_once_init_enter (&initialised)) |
1716 | 0 | { |
1717 | 0 | int supports_utf8; |
1718 | |
|
1719 | 0 | pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); |
1720 | 0 | if (!supports_utf8) |
1721 | 0 | g_critical (_("PCRE library is compiled without UTF8 support")); |
1722 | |
|
1723 | 0 | g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); |
1724 | 0 | } |
1725 | |
|
1726 | 0 | if (G_UNLIKELY (initialised != 1)) |
1727 | 0 | { |
1728 | 0 | g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, |
1729 | 0 | _("PCRE library is compiled with incompatible options")); |
1730 | 0 | return NULL; |
1731 | 0 | } |
1732 | | |
1733 | 0 | pcre_compile_options = get_pcre2_compile_options (compile_options); |
1734 | 0 | pcre_match_options = get_pcre2_match_options (match_options, compile_options); |
1735 | |
|
1736 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
1737 | 0 | if (newline_options == 0) |
1738 | 0 | newline_options = get_pcre2_newline_compile_options (compile_options); |
1739 | |
|
1740 | 0 | if (newline_options == 0) |
1741 | 0 | { |
1742 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1743 | 0 | "Invalid newline flags"); |
1744 | 0 | return NULL; |
1745 | 0 | } |
1746 | | |
1747 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
1748 | 0 | if (!bsr_options) |
1749 | 0 | bsr_options = get_pcre2_bsr_compile_options (compile_options); |
1750 | |
|
1751 | 0 | re = regex_compile (pattern, pcre_compile_options, |
1752 | 0 | newline_options, bsr_options, error); |
1753 | 0 | if (re == NULL) |
1754 | 0 | return NULL; |
1755 | | |
1756 | 0 | pcre_compile_options |= |
1757 | 0 | get_pcre2_inline_compile_options (re, pcre_compile_options); |
1758 | |
|
1759 | 0 | regex = g_new0 (GRegex, 1); |
1760 | 0 | regex->ref_count = 1; |
1761 | 0 | regex->pattern = g_strdup (pattern); |
1762 | 0 | regex->pcre_re = re; |
1763 | 0 | regex->compile_opts = pcre_compile_options; |
1764 | 0 | regex->orig_compile_opts = compile_options; |
1765 | 0 | regex->match_opts = pcre_match_options; |
1766 | 0 | regex->orig_match_opts = match_options; |
1767 | 0 | regex->jit_status = enable_jit_with_match_options (regex, regex->match_opts); |
1768 | |
|
1769 | 0 | return regex; |
1770 | 0 | } |
1771 | | |
1772 | | static pcre2_code * |
1773 | | regex_compile (const gchar *pattern, |
1774 | | uint32_t compile_options, |
1775 | | uint32_t newline_options, |
1776 | | uint32_t bsr_options, |
1777 | | GError **error) |
1778 | 0 | { |
1779 | 0 | pcre2_code *re; |
1780 | 0 | pcre2_compile_context *context; |
1781 | 0 | const gchar *errmsg; |
1782 | 0 | PCRE2_SIZE erroffset; |
1783 | 0 | gint errcode; |
1784 | |
|
1785 | 0 | context = pcre2_compile_context_create (NULL); |
1786 | | |
1787 | | /* set newline options */ |
1788 | 0 | if (pcre2_set_newline (context, newline_options) != 0) |
1789 | 0 | { |
1790 | 0 | g_set_error (error, G_REGEX_ERROR, |
1791 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1792 | 0 | "Invalid newline flags"); |
1793 | 0 | pcre2_compile_context_free (context); |
1794 | 0 | return NULL; |
1795 | 0 | } |
1796 | | |
1797 | | /* set bsr options */ |
1798 | 0 | if (pcre2_set_bsr (context, bsr_options) != 0) |
1799 | 0 | { |
1800 | 0 | g_set_error (error, G_REGEX_ERROR, |
1801 | 0 | G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
1802 | 0 | "Invalid BSR flags"); |
1803 | 0 | pcre2_compile_context_free (context); |
1804 | 0 | return NULL; |
1805 | 0 | } |
1806 | | |
1807 | | /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ |
1808 | 0 | if (compile_options & PCRE2_UTF) |
1809 | 0 | compile_options |= PCRE2_NO_UTF_CHECK; |
1810 | |
|
1811 | 0 | compile_options |= PCRE2_UCP; |
1812 | | |
1813 | | /* compile the pattern */ |
1814 | 0 | re = pcre2_compile ((PCRE2_SPTR8) pattern, |
1815 | 0 | PCRE2_ZERO_TERMINATED, |
1816 | 0 | compile_options, |
1817 | 0 | &errcode, |
1818 | 0 | &erroffset, |
1819 | 0 | context); |
1820 | 0 | pcre2_compile_context_free (context); |
1821 | | |
1822 | | /* if the compilation failed, set the error member and return |
1823 | | * immediately */ |
1824 | 0 | if (re == NULL) |
1825 | 0 | { |
1826 | 0 | GError *tmp_error; |
1827 | 0 | gchar *offset_str; |
1828 | 0 | gchar *pcre2_errmsg = NULL; |
1829 | 0 | int original_errcode; |
1830 | | |
1831 | | /* Translate the PCRE error code to GRegexError and use a translated |
1832 | | * error message if possible */ |
1833 | 0 | original_errcode = errcode; |
1834 | 0 | translate_compile_error (&errcode, &errmsg); |
1835 | |
|
1836 | 0 | if (!errmsg) |
1837 | 0 | { |
1838 | 0 | errmsg = _("unknown error"); |
1839 | 0 | pcre2_errmsg = get_pcre2_error_string (original_errcode); |
1840 | 0 | } |
1841 | | |
1842 | | /* PCRE uses byte offsets but we want to show character offsets */ |
1843 | 0 | erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); |
1844 | |
|
1845 | 0 | offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); |
1846 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, errcode, |
1847 | 0 | _("Error while compiling regular expression ‘%s’ " |
1848 | 0 | "at char %s: %s"), |
1849 | 0 | pattern, offset_str, |
1850 | 0 | pcre2_errmsg ? pcre2_errmsg : errmsg); |
1851 | 0 | g_propagate_error (error, tmp_error); |
1852 | 0 | g_free (offset_str); |
1853 | 0 | g_clear_pointer (&pcre2_errmsg, g_free); |
1854 | |
|
1855 | 0 | return NULL; |
1856 | 0 | } |
1857 | | |
1858 | 0 | return re; |
1859 | 0 | } |
1860 | | |
1861 | | static uint32_t |
1862 | | get_pcre2_inline_compile_options (pcre2_code *re, |
1863 | | uint32_t compile_options) |
1864 | 0 | { |
1865 | 0 | uint32_t pcre_compile_options; |
1866 | 0 | uint32_t nonpcre_compile_options; |
1867 | | |
1868 | | /* For options set at the beginning of the pattern, pcre puts them into |
1869 | | * compile options, e.g. "(?i)foo" will make the pcre structure store |
1870 | | * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ |
1871 | 0 | nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
1872 | 0 | pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); |
1873 | 0 | compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; |
1874 | 0 | compile_options |= nonpcre_compile_options; |
1875 | |
|
1876 | 0 | if (!(compile_options & PCRE2_DUPNAMES)) |
1877 | 0 | { |
1878 | 0 | uint32_t jchanged = 0; |
1879 | 0 | pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); |
1880 | 0 | if (jchanged) |
1881 | 0 | compile_options |= PCRE2_DUPNAMES; |
1882 | 0 | } |
1883 | |
|
1884 | 0 | return compile_options; |
1885 | 0 | } |
1886 | | |
1887 | | /** |
1888 | | * g_regex_get_pattern: |
1889 | | * @regex: a #GRegex structure |
1890 | | * |
1891 | | * Gets the pattern string associated with @regex, i.e. a copy of |
1892 | | * the string passed to g_regex_new(). |
1893 | | * |
1894 | | * Returns: the pattern of @regex |
1895 | | * |
1896 | | * Since: 2.14 |
1897 | | */ |
1898 | | const gchar * |
1899 | | g_regex_get_pattern (const GRegex *regex) |
1900 | 0 | { |
1901 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
1902 | | |
1903 | 0 | return regex->pattern; |
1904 | 0 | } |
1905 | | |
1906 | | /** |
1907 | | * g_regex_get_max_backref: |
1908 | | * @regex: a #GRegex |
1909 | | * |
1910 | | * Returns the number of the highest back reference |
1911 | | * in the pattern, or 0 if the pattern does not contain |
1912 | | * back references. |
1913 | | * |
1914 | | * Returns: the number of the highest back reference |
1915 | | * |
1916 | | * Since: 2.14 |
1917 | | */ |
1918 | | gint |
1919 | | g_regex_get_max_backref (const GRegex *regex) |
1920 | 0 | { |
1921 | 0 | uint32_t value; |
1922 | |
|
1923 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); |
1924 | |
|
1925 | 0 | return value; |
1926 | 0 | } |
1927 | | |
1928 | | /** |
1929 | | * g_regex_get_capture_count: |
1930 | | * @regex: a #GRegex |
1931 | | * |
1932 | | * Returns the number of capturing subpatterns in the pattern. |
1933 | | * |
1934 | | * Returns: the number of capturing subpatterns |
1935 | | * |
1936 | | * Since: 2.14 |
1937 | | */ |
1938 | | gint |
1939 | | g_regex_get_capture_count (const GRegex *regex) |
1940 | 0 | { |
1941 | 0 | uint32_t value; |
1942 | |
|
1943 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); |
1944 | |
|
1945 | 0 | return value; |
1946 | 0 | } |
1947 | | |
1948 | | /** |
1949 | | * g_regex_get_has_cr_or_lf: |
1950 | | * @regex: a #GRegex structure |
1951 | | * |
1952 | | * Checks whether the pattern contains explicit CR or LF references. |
1953 | | * |
1954 | | * Returns: %TRUE if the pattern contains explicit CR or LF references |
1955 | | * |
1956 | | * Since: 2.34 |
1957 | | */ |
1958 | | gboolean |
1959 | | g_regex_get_has_cr_or_lf (const GRegex *regex) |
1960 | 0 | { |
1961 | 0 | uint32_t value; |
1962 | |
|
1963 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); |
1964 | |
|
1965 | 0 | return !!value; |
1966 | 0 | } |
1967 | | |
1968 | | /** |
1969 | | * g_regex_get_max_lookbehind: |
1970 | | * @regex: a #GRegex structure |
1971 | | * |
1972 | | * Gets the number of characters in the longest lookbehind assertion in the |
1973 | | * pattern. This information is useful when doing multi-segment matching using |
1974 | | * the partial matching facilities. |
1975 | | * |
1976 | | * Returns: the number of characters in the longest lookbehind assertion. |
1977 | | * |
1978 | | * Since: 2.38 |
1979 | | */ |
1980 | | gint |
1981 | | g_regex_get_max_lookbehind (const GRegex *regex) |
1982 | 0 | { |
1983 | 0 | uint32_t max_lookbehind; |
1984 | |
|
1985 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, |
1986 | 0 | &max_lookbehind); |
1987 | |
|
1988 | 0 | return max_lookbehind; |
1989 | 0 | } |
1990 | | |
1991 | | /** |
1992 | | * g_regex_get_compile_flags: |
1993 | | * @regex: a #GRegex |
1994 | | * |
1995 | | * Returns the compile options that @regex was created with. |
1996 | | * |
1997 | | * Depending on the version of PCRE that is used, this may or may not |
1998 | | * include flags set by option expressions such as `(?i)` found at the |
1999 | | * top-level within the compiled pattern. |
2000 | | * |
2001 | | * Returns: flags from #GRegexCompileFlags |
2002 | | * |
2003 | | * Since: 2.26 |
2004 | | */ |
2005 | | GRegexCompileFlags |
2006 | | g_regex_get_compile_flags (const GRegex *regex) |
2007 | 0 | { |
2008 | 0 | GRegexCompileFlags extra_flags; |
2009 | 0 | uint32_t info_value; |
2010 | |
|
2011 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2012 | | |
2013 | | /* Preserve original G_REGEX_OPTIMIZE */ |
2014 | 0 | extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); |
2015 | | |
2016 | | /* Also include the newline options */ |
2017 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); |
2018 | 0 | switch (info_value) |
2019 | 0 | { |
2020 | 0 | case PCRE2_NEWLINE_ANYCRLF: |
2021 | 0 | extra_flags |= G_REGEX_NEWLINE_ANYCRLF; |
2022 | 0 | break; |
2023 | 0 | case PCRE2_NEWLINE_CRLF: |
2024 | 0 | extra_flags |= G_REGEX_NEWLINE_CRLF; |
2025 | 0 | break; |
2026 | 0 | case PCRE2_NEWLINE_LF: |
2027 | 0 | extra_flags |= G_REGEX_NEWLINE_LF; |
2028 | 0 | break; |
2029 | 0 | case PCRE2_NEWLINE_CR: |
2030 | 0 | extra_flags |= G_REGEX_NEWLINE_CR; |
2031 | 0 | break; |
2032 | 0 | default: |
2033 | 0 | break; |
2034 | 0 | } |
2035 | | |
2036 | | /* Also include the bsr options */ |
2037 | 0 | pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); |
2038 | 0 | switch (info_value) |
2039 | 0 | { |
2040 | 0 | case PCRE2_BSR_ANYCRLF: |
2041 | 0 | extra_flags |= G_REGEX_BSR_ANYCRLF; |
2042 | 0 | break; |
2043 | 0 | default: |
2044 | 0 | break; |
2045 | 0 | } |
2046 | | |
2047 | 0 | return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; |
2048 | 0 | } |
2049 | | |
2050 | | /** |
2051 | | * g_regex_get_match_flags: |
2052 | | * @regex: a #GRegex |
2053 | | * |
2054 | | * Returns the match options that @regex was created with. |
2055 | | * |
2056 | | * Returns: flags from #GRegexMatchFlags |
2057 | | * |
2058 | | * Since: 2.26 |
2059 | | */ |
2060 | | GRegexMatchFlags |
2061 | | g_regex_get_match_flags (const GRegex *regex) |
2062 | 0 | { |
2063 | 0 | uint32_t flags; |
2064 | |
|
2065 | 0 | g_return_val_if_fail (regex != NULL, 0); |
2066 | | |
2067 | 0 | flags = g_regex_match_flags_from_pcre2 (regex->match_opts); |
2068 | 0 | flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); |
2069 | 0 | flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); |
2070 | |
|
2071 | 0 | return flags; |
2072 | 0 | } |
2073 | | |
2074 | | /** |
2075 | | * g_regex_match_simple: |
2076 | | * @pattern: the regular expression |
2077 | | * @string: the string to scan for matches |
2078 | | * @compile_options: compile options for the regular expression, or 0 |
2079 | | * @match_options: match options, or 0 |
2080 | | * |
2081 | | * Scans for a match in @string for @pattern. |
2082 | | * |
2083 | | * This function is equivalent to g_regex_match() but it does not |
2084 | | * require to compile the pattern with g_regex_new(), avoiding some |
2085 | | * lines of code when you need just to do a match without extracting |
2086 | | * substrings, capture counts, and so on. |
2087 | | * |
2088 | | * If this function is to be called on the same @pattern more than |
2089 | | * once, it's more efficient to compile the pattern once with |
2090 | | * g_regex_new() and then use g_regex_match(). |
2091 | | * |
2092 | | * Returns: %TRUE if the string matched, %FALSE otherwise |
2093 | | * |
2094 | | * Since: 2.14 |
2095 | | */ |
2096 | | gboolean |
2097 | | g_regex_match_simple (const gchar *pattern, |
2098 | | const gchar *string, |
2099 | | GRegexCompileFlags compile_options, |
2100 | | GRegexMatchFlags match_options) |
2101 | 0 | { |
2102 | 0 | GRegex *regex; |
2103 | 0 | gboolean result; |
2104 | |
|
2105 | 0 | regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); |
2106 | 0 | if (!regex) |
2107 | 0 | return FALSE; |
2108 | 0 | result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); |
2109 | 0 | g_regex_unref (regex); |
2110 | 0 | return result; |
2111 | 0 | } |
2112 | | |
2113 | | /** |
2114 | | * g_regex_match: |
2115 | | * @regex: a #GRegex structure from g_regex_new() |
2116 | | * @string: the string to scan for matches |
2117 | | * @match_options: match options |
2118 | | * @match_info: (out) (optional): pointer to location where to store |
2119 | | * the #GMatchInfo, or %NULL if you do not need it |
2120 | | * |
2121 | | * Scans for a match in @string for the pattern in @regex. |
2122 | | * The @match_options are combined with the match options specified |
2123 | | * when the @regex structure was created, letting you have more |
2124 | | * flexibility in reusing #GRegex structures. |
2125 | | * |
2126 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2127 | | * |
2128 | | * A #GMatchInfo structure, used to get information on the match, |
2129 | | * is stored in @match_info if not %NULL. Note that if @match_info |
2130 | | * is not %NULL then it is created even if the function returns %FALSE, |
2131 | | * i.e. you must free it regardless if regular expression actually matched. |
2132 | | * |
2133 | | * To retrieve all the non-overlapping matches of the pattern in |
2134 | | * string you can use g_match_info_next(). |
2135 | | * |
2136 | | * |[<!-- language="C" --> |
2137 | | * static void |
2138 | | * print_uppercase_words (const gchar *string) |
2139 | | * { |
2140 | | * // Print all uppercase-only words. |
2141 | | * GRegex *regex; |
2142 | | * GMatchInfo *match_info; |
2143 | | * |
2144 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2145 | | * g_regex_match (regex, string, 0, &match_info); |
2146 | | * while (g_match_info_matches (match_info)) |
2147 | | * { |
2148 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2149 | | * g_print ("Found: %s\n", word); |
2150 | | * g_free (word); |
2151 | | * g_match_info_next (match_info, NULL); |
2152 | | * } |
2153 | | * g_match_info_free (match_info); |
2154 | | * g_regex_unref (regex); |
2155 | | * } |
2156 | | * ]| |
2157 | | * |
2158 | | * @string is not copied and is used in #GMatchInfo internally. If |
2159 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2160 | | * freeing or modifying @string then the behaviour is undefined. |
2161 | | * |
2162 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2163 | | * |
2164 | | * Since: 2.14 |
2165 | | */ |
2166 | | gboolean |
2167 | | g_regex_match (const GRegex *regex, |
2168 | | const gchar *string, |
2169 | | GRegexMatchFlags match_options, |
2170 | | GMatchInfo **match_info) |
2171 | 0 | { |
2172 | 0 | return g_regex_match_full (regex, string, -1, 0, match_options, |
2173 | 0 | match_info, NULL); |
2174 | 0 | } |
2175 | | |
2176 | | /** |
2177 | | * g_regex_match_full: |
2178 | | * @regex: a #GRegex structure from g_regex_new() |
2179 | | * @string: (array length=string_len): the string to scan for matches |
2180 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2181 | | * @start_position: starting index of the string to match, in bytes |
2182 | | * @match_options: match options |
2183 | | * @match_info: (out) (optional): pointer to location where to store |
2184 | | * the #GMatchInfo, or %NULL if you do not need it |
2185 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2186 | | * |
2187 | | * Scans for a match in @string for the pattern in @regex. |
2188 | | * The @match_options are combined with the match options specified |
2189 | | * when the @regex structure was created, letting you have more |
2190 | | * flexibility in reusing #GRegex structures. |
2191 | | * |
2192 | | * Setting @start_position differs from just passing over a shortened |
2193 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2194 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2195 | | * |
2196 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2197 | | * |
2198 | | * A #GMatchInfo structure, used to get information on the match, is |
2199 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2200 | | * not %NULL then it is created even if the function returns %FALSE, |
2201 | | * i.e. you must free it regardless if regular expression actually |
2202 | | * matched. |
2203 | | * |
2204 | | * @string is not copied and is used in #GMatchInfo internally. If |
2205 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2206 | | * freeing or modifying @string then the behaviour is undefined. |
2207 | | * |
2208 | | * To retrieve all the non-overlapping matches of the pattern in |
2209 | | * string you can use g_match_info_next(). |
2210 | | * |
2211 | | * |[<!-- language="C" --> |
2212 | | * static void |
2213 | | * print_uppercase_words (const gchar *string) |
2214 | | * { |
2215 | | * // Print all uppercase-only words. |
2216 | | * GRegex *regex; |
2217 | | * GMatchInfo *match_info; |
2218 | | * GError *error = NULL; |
2219 | | * |
2220 | | * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
2221 | | * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
2222 | | * while (g_match_info_matches (match_info)) |
2223 | | * { |
2224 | | * gchar *word = g_match_info_fetch (match_info, 0); |
2225 | | * g_print ("Found: %s\n", word); |
2226 | | * g_free (word); |
2227 | | * g_match_info_next (match_info, &error); |
2228 | | * } |
2229 | | * g_match_info_free (match_info); |
2230 | | * g_regex_unref (regex); |
2231 | | * if (error != NULL) |
2232 | | * { |
2233 | | * g_printerr ("Error while matching: %s\n", error->message); |
2234 | | * g_error_free (error); |
2235 | | * } |
2236 | | * } |
2237 | | * ]| |
2238 | | * |
2239 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2240 | | * |
2241 | | * Since: 2.14 |
2242 | | */ |
2243 | | gboolean |
2244 | | g_regex_match_full (const GRegex *regex, |
2245 | | const gchar *string, |
2246 | | gssize string_len, |
2247 | | gint start_position, |
2248 | | GRegexMatchFlags match_options, |
2249 | | GMatchInfo **match_info, |
2250 | | GError **error) |
2251 | 0 | { |
2252 | 0 | GMatchInfo *info; |
2253 | 0 | gboolean match_ok; |
2254 | |
|
2255 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2256 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2257 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2258 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2259 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2260 | | |
2261 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2262 | 0 | match_options, FALSE); |
2263 | 0 | match_ok = g_match_info_next (info, error); |
2264 | 0 | if (match_info != NULL) |
2265 | 0 | *match_info = info; |
2266 | 0 | else |
2267 | 0 | g_match_info_free (info); |
2268 | |
|
2269 | 0 | return match_ok; |
2270 | 0 | } |
2271 | | |
2272 | | /** |
2273 | | * g_regex_match_all: |
2274 | | * @regex: a #GRegex structure from g_regex_new() |
2275 | | * @string: the string to scan for matches |
2276 | | * @match_options: match options |
2277 | | * @match_info: (out) (optional): pointer to location where to store |
2278 | | * the #GMatchInfo, or %NULL if you do not need it |
2279 | | * |
2280 | | * Using the standard algorithm for regular expression matching only |
2281 | | * the longest match in the string is retrieved. This function uses |
2282 | | * a different algorithm so it can retrieve all the possible matches. |
2283 | | * For more documentation see g_regex_match_all_full(). |
2284 | | * |
2285 | | * A #GMatchInfo structure, used to get information on the match, is |
2286 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2287 | | * not %NULL then it is created even if the function returns %FALSE, |
2288 | | * i.e. you must free it regardless if regular expression actually |
2289 | | * matched. |
2290 | | * |
2291 | | * @string is not copied and is used in #GMatchInfo internally. If |
2292 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2293 | | * freeing or modifying @string then the behaviour is undefined. |
2294 | | * |
2295 | | * Returns: %TRUE is the string matched, %FALSE otherwise |
2296 | | * |
2297 | | * Since: 2.14 |
2298 | | */ |
2299 | | gboolean |
2300 | | g_regex_match_all (const GRegex *regex, |
2301 | | const gchar *string, |
2302 | | GRegexMatchFlags match_options, |
2303 | | GMatchInfo **match_info) |
2304 | 0 | { |
2305 | 0 | return g_regex_match_all_full (regex, string, -1, 0, match_options, |
2306 | 0 | match_info, NULL); |
2307 | 0 | } |
2308 | | |
2309 | | /** |
2310 | | * g_regex_match_all_full: |
2311 | | * @regex: a #GRegex structure from g_regex_new() |
2312 | | * @string: (array length=string_len): the string to scan for matches |
2313 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2314 | | * @start_position: starting index of the string to match, in bytes |
2315 | | * @match_options: match options |
2316 | | * @match_info: (out) (optional): pointer to location where to store |
2317 | | * the #GMatchInfo, or %NULL if you do not need it |
2318 | | * @error: location to store the error occurring, or %NULL to ignore errors |
2319 | | * |
2320 | | * Using the standard algorithm for regular expression matching only |
2321 | | * the longest match in the @string is retrieved, it is not possible |
2322 | | * to obtain all the available matches. For instance matching |
2323 | | * "<a> <b> <c>" against the pattern "<.*>" |
2324 | | * you get "<a> <b> <c>". |
2325 | | * |
2326 | | * This function uses a different algorithm (called DFA, i.e. deterministic |
2327 | | * finite automaton), so it can retrieve all the possible matches, all |
2328 | | * starting at the same point in the string. For instance matching |
2329 | | * "<a> <b> <c>" against the pattern "<.*>" |
2330 | | * you would obtain three matches: "<a> <b> <c>", |
2331 | | * "<a> <b>" and "<a>". |
2332 | | * |
2333 | | * The number of matched strings is retrieved using |
2334 | | * g_match_info_get_match_count(). To obtain the matched strings and |
2335 | | * their position you can use, respectively, g_match_info_fetch() and |
2336 | | * g_match_info_fetch_pos(). Note that the strings are returned in |
2337 | | * reverse order of length; that is, the longest matching string is |
2338 | | * given first. |
2339 | | * |
2340 | | * Note that the DFA algorithm is slower than the standard one and it |
2341 | | * is not able to capture substrings, so backreferences do not work. |
2342 | | * |
2343 | | * Setting @start_position differs from just passing over a shortened |
2344 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2345 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2346 | | * |
2347 | | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
2348 | | * |
2349 | | * A #GMatchInfo structure, used to get information on the match, is |
2350 | | * stored in @match_info if not %NULL. Note that if @match_info is |
2351 | | * not %NULL then it is created even if the function returns %FALSE, |
2352 | | * i.e. you must free it regardless if regular expression actually |
2353 | | * matched. |
2354 | | * |
2355 | | * @string is not copied and is used in #GMatchInfo internally. If |
2356 | | * you use any #GMatchInfo method (except g_match_info_free()) after |
2357 | | * freeing or modifying @string then the behaviour is undefined. |
2358 | | * |
2359 | | * Returns: %TRUE if the string matched, %FALSE otherwise
2360 | | * |
2361 | | * Since: 2.14 |
2362 | | */ |
2363 | | gboolean |
2364 | | g_regex_match_all_full (const GRegex *regex, |
2365 | | const gchar *string, |
2366 | | gssize string_len, |
2367 | | gint start_position, |
2368 | | GRegexMatchFlags match_options, |
2369 | | GMatchInfo **match_info, |
2370 | | GError **error) |
2371 | 0 | { |
2372 | 0 | GMatchInfo *info; |
2373 | 0 | gboolean done; |
2374 | 0 | pcre2_code *pcre_re; |
2375 | 0 | gboolean retval; |
2376 | 0 | uint32_t newline_options; |
2377 | 0 | uint32_t bsr_options; |
2378 | |
|
2379 | 0 | g_return_val_if_fail (regex != NULL, FALSE); |
2380 | 0 | g_return_val_if_fail (string != NULL, FALSE); |
2381 | 0 | g_return_val_if_fail (start_position >= 0, FALSE); |
2382 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
2383 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
2384 | | |
2385 | 0 | newline_options = get_pcre2_newline_match_options (match_options); |
2386 | 0 | if (!newline_options) |
2387 | 0 | newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); |
2388 | |
|
2389 | 0 | bsr_options = get_pcre2_bsr_match_options (match_options); |
2390 | 0 | if (!bsr_options) |
2391 | 0 | bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); |
2392 | | |
2393 | | /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an |
2394 | | * optimization for normal regex matching, but results in omitting some |
2395 | | * shorter matches here, and an observable behaviour change. |
2396 | | * |
2397 | | * DFA matching is rather niche, and very rarely used according to |
2398 | | * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
2399 | 0 | pcre_re = regex_compile (regex->pattern, |
2400 | 0 | regex->compile_opts | PCRE2_NO_AUTO_POSSESS, |
2401 | 0 | newline_options, bsr_options, error); |
2402 | 0 | if (pcre_re == NULL) |
2403 | 0 | return FALSE; |
2404 | | |
2405 | 0 | info = match_info_new (regex, string, string_len, start_position, |
2406 | 0 | match_options, TRUE); |
2407 | |
|
2408 | 0 | done = FALSE; |
2409 | 0 | while (!done) |
2410 | 0 | { |
2411 | 0 | done = TRUE; |
2412 | 0 | info->matches = pcre2_dfa_match (pcre_re, |
2413 | 0 | (PCRE2_SPTR8) info->string, info->string_len, |
2414 | 0 | info->pos, |
2415 | 0 | (regex->match_opts | info->match_opts), |
2416 | 0 | info->match_data, |
2417 | 0 | info->match_context, |
2418 | 0 | info->workspace, info->n_workspace); |
2419 | 0 | if (info->matches == PCRE2_ERROR_DFA_WSSIZE) |
2420 | 0 | { |
2421 | | /* info->workspace is too small. */ |
2422 | 0 | info->n_workspace *= 2; |
2423 | 0 | info->workspace = g_realloc_n (info->workspace, |
2424 | 0 | info->n_workspace, |
2425 | 0 | sizeof (gint)); |
2426 | 0 | done = FALSE; |
2427 | 0 | } |
2428 | 0 | else if (info->matches == 0) |
2429 | 0 | { |
2430 | | /* info->offsets is too small. */ |
2431 | 0 | info->n_offsets *= 2; |
2432 | 0 | info->offsets = g_realloc_n (info->offsets, |
2433 | 0 | info->n_offsets, |
2434 | 0 | sizeof (gint)); |
2435 | 0 | pcre2_match_data_free (info->match_data); |
2436 | 0 | info->match_data = pcre2_match_data_create (info->n_offsets, NULL); |
2437 | 0 | done = FALSE; |
2438 | 0 | } |
2439 | 0 | else if (IS_PCRE2_ERROR (info->matches)) |
2440 | 0 | { |
2441 | 0 | gchar *error_msg = get_match_error_message (info->matches); |
2442 | |
|
2443 | 0 | g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
2444 | 0 | _("Error while matching regular expression %s: %s"), |
2445 | 0 | regex->pattern, error_msg); |
2446 | 0 | g_clear_pointer (&error_msg, g_free); |
2447 | 0 | } |
2448 | 0 | else if (info->matches != PCRE2_ERROR_NOMATCH) |
2449 | 0 | { |
2450 | 0 | if (!recalc_match_offsets (info, error)) |
2451 | 0 | info->matches = PCRE2_ERROR_NOMATCH; |
2452 | 0 | } |
2453 | 0 | } |
2454 | |
|
2455 | 0 | pcre2_code_free (pcre_re); |
2456 | | |
2457 | | /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only |
2458 | | * holds true for a single match, rather than matching all */ |
2459 | | |
2460 | | /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
2461 | 0 | info->pos = -1; |
2462 | 0 | retval = info->matches >= 0; |
2463 | |
|
2464 | 0 | if (match_info != NULL) |
2465 | 0 | *match_info = info; |
2466 | 0 | else |
2467 | 0 | g_match_info_free (info); |
2468 | |
|
2469 | 0 | return retval; |
2470 | 0 | } |
2471 | | |
2472 | | /** |
2473 | | * g_regex_get_string_number: |
2474 | | * @regex: #GRegex structure |
2475 | | * @name: name of the subexpression |
2476 | | * |
2477 | | * Retrieves the number of the subexpression named @name. |
2478 | | * |
2479 | | * Returns: The number of the subexpression or -1 if @name
2480 | | * does not exist
2481 | | * |
2482 | | * Since: 2.14 |
2483 | | */ |
2484 | | gint |
2485 | | g_regex_get_string_number (const GRegex *regex, |
2486 | | const gchar *name) |
2487 | 0 | { |
2488 | 0 | gint num; |
2489 | |
|
2490 | 0 | g_return_val_if_fail (regex != NULL, -1); |
2491 | 0 | g_return_val_if_fail (name != NULL, -1); |
2492 | | |
2493 | 0 | num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); |
2494 | 0 | if (num == PCRE2_ERROR_NOSUBSTRING) |
2495 | 0 | num = -1; |
2496 | |
|
2497 | 0 | return num; |
2498 | 0 | } |
2499 | | |
2500 | | /** |
2501 | | * g_regex_split_simple: |
2502 | | * @pattern: the regular expression |
2503 | | * @string: the string to scan for matches |
2504 | | * @compile_options: compile options for the regular expression, or 0 |
2505 | | * @match_options: match options, or 0 |
2506 | | * |
2507 | | * Breaks the string on the pattern, and returns an array of |
2508 | | * the tokens. If the pattern contains capturing parentheses, |
2509 | | * then the text for each of the substrings will also be returned. |
2510 | | * If the pattern does not match anywhere in the string, then the |
2511 | | * whole string is returned as the first token. |
2512 | | * |
2513 | | * This function is equivalent to g_regex_split() but it does |
2514 | | * not require to compile the pattern with g_regex_new(), avoiding |
2515 | | * some lines of code when you need just to do a split without |
2516 | | * extracting substrings, capture counts, and so on. |
2517 | | * |
2518 | | * If this function is to be called on the same @pattern more than |
2519 | | * once, it's more efficient to compile the pattern once with |
2520 | | * g_regex_new() and then use g_regex_split(). |
2521 | | * |
2522 | | * As a special case, the result of splitting the empty string "" |
2523 | | * is an empty vector, not a vector containing a single string. |
2524 | | * The reason for this special case is that being able to represent |
2525 | | * an empty vector is typically more useful than consistent handling |
2526 | | * of empty elements. If you do need to represent empty elements, |
2527 | | * you'll need to check for the empty string before calling this |
2528 | | * function. |
2529 | | * |
2530 | | * A pattern that can match empty strings splits @string into |
2531 | | * separate characters wherever it matches the empty string between |
2532 | | * characters. For example splitting "ab c" using as a separator |
2533 | | * "\s*", you will get "a", "b" and "c". |
2534 | | * |
2535 | | * Returns: (transfer full): a %NULL-terminated array of strings. Free |
2536 | | * it using g_strfreev() |
2537 | | * |
2538 | | * Since: 2.14 |
2539 | | **/ |
2540 | | gchar ** |
2541 | | g_regex_split_simple (const gchar *pattern, |
2542 | | const gchar *string, |
2543 | | GRegexCompileFlags compile_options, |
2544 | | GRegexMatchFlags match_options) |
2545 | 0 | { |
2546 | 0 | GRegex *regex; |
2547 | 0 | gchar **result; |
2548 | |
|
2549 | 0 | regex = g_regex_new (pattern, compile_options, 0, NULL); |
2550 | 0 | if (!regex) |
2551 | 0 | return NULL; |
2552 | | |
2553 | 0 | result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); |
2554 | 0 | g_regex_unref (regex); |
2555 | 0 | return result; |
2556 | 0 | } |
2557 | | |
2558 | | /** |
2559 | | * g_regex_split: |
2560 | | * @regex: a #GRegex structure |
2561 | | * @string: the string to split with the pattern |
2562 | | * @match_options: match time option flags |
2563 | | * |
2564 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2565 | | * If the pattern contains capturing parentheses, then the text for each |
2566 | | * of the substrings will also be returned. If the pattern does not match |
2567 | | * anywhere in the string, then the whole string is returned as the first |
2568 | | * token. |
2569 | | * |
2570 | | * As a special case, the result of splitting the empty string "" is an |
2571 | | * empty vector, not a vector containing a single string. The reason for |
2572 | | * this special case is that being able to represent an empty vector is |
2573 | | * typically more useful than consistent handling of empty elements. If |
2574 | | * you do need to represent empty elements, you'll need to check for the |
2575 | | * empty string before calling this function. |
2576 | | * |
2577 | | * A pattern that can match empty strings splits @string into separate |
2578 | | * characters wherever it matches the empty string between characters. |
2579 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2580 | | * "a", "b" and "c". |
2581 | | * |
2582 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2583 | | * it using g_strfreev() |
2584 | | * |
2585 | | * Since: 2.14 |
2586 | | **/ |
2587 | | gchar ** |
2588 | | g_regex_split (const GRegex *regex, |
2589 | | const gchar *string, |
2590 | | GRegexMatchFlags match_options) |
2591 | 0 | { |
2592 | 0 | return g_regex_split_full (regex, string, -1, 0, |
2593 | 0 | match_options, 0, NULL); |
2594 | 0 | } |
2595 | | |
2596 | | /** |
2597 | | * g_regex_split_full: |
2598 | | * @regex: a #GRegex structure |
2599 | | * @string: (array length=string_len): the string to split with the pattern |
2600 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2601 | | * @start_position: starting index of the string to match, in bytes |
2602 | | * @match_options: match time option flags |
2603 | | * @max_tokens: the maximum number of tokens to split @string into. |
2604 | | * If this is less than 1, the string is split completely |
2605 | | * @error: return location for a #GError |
2606 | | * |
2607 | | * Breaks the string on the pattern, and returns an array of the tokens. |
2608 | | * If the pattern contains capturing parentheses, then the text for each |
2609 | | * of the substrings will also be returned. If the pattern does not match |
2610 | | * anywhere in the string, then the whole string is returned as the first |
2611 | | * token. |
2612 | | * |
2613 | | * As a special case, the result of splitting the empty string "" is an |
2614 | | * empty vector, not a vector containing a single string. The reason for |
2615 | | * this special case is that being able to represent an empty vector is |
2616 | | * typically more useful than consistent handling of empty elements. If |
2617 | | * you do need to represent empty elements, you'll need to check for the |
2618 | | * empty string before calling this function. |
2619 | | * |
2620 | | * A pattern that can match empty strings splits @string into separate |
2621 | | * characters wherever it matches the empty string between characters. |
2622 | | * For example splitting "ab c" using as a separator "\s*", you will get |
2623 | | * "a", "b" and "c". |
2624 | | * |
2625 | | * Setting @start_position differs from just passing over a shortened |
2626 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
2627 | | * that begins with any kind of lookbehind assertion, such as "\b". |
2628 | | * |
2629 | | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2630 | | * it using g_strfreev() |
2631 | | * |
2632 | | * Since: 2.14 |
2633 | | **/ |
2634 | | gchar ** |
2635 | | g_regex_split_full (const GRegex *regex, |
2636 | | const gchar *string, |
2637 | | gssize string_len, |
2638 | | gint start_position, |
2639 | | GRegexMatchFlags match_options, |
2640 | | gint max_tokens, |
2641 | | GError **error) |
2642 | 0 | { |
2643 | 0 | GError *tmp_error = NULL; |
2644 | 0 | GMatchInfo *match_info; |
2645 | 0 | GList *list, *last; |
2646 | 0 | gint i; |
2647 | 0 | gint token_count; |
2648 | 0 | gboolean match_ok; |
2649 | | /* position of the last separator. */ |
2650 | 0 | gint last_separator_end; |
2651 | | /* was the last match 0 bytes long? */ |
2652 | 0 | gboolean last_match_is_empty; |
2653 | | /* the returned array of char **s */ |
2654 | 0 | gchar **string_list; |
2655 | |
|
2656 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
2657 | 0 | g_return_val_if_fail (string != NULL, NULL); |
2658 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
2659 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2660 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2661 | | |
2662 | 0 | if (max_tokens <= 0) |
2663 | 0 | max_tokens = G_MAXINT; |
2664 | |
|
2665 | 0 | if (string_len < 0) |
2666 | 0 | string_len = strlen (string); |
2667 | | |
2668 | | /* zero-length string */ |
2669 | 0 | if (string_len - start_position == 0) |
2670 | 0 | return g_new0 (gchar *, 1); |
2671 | | |
2672 | 0 | if (max_tokens == 1) |
2673 | 0 | { |
2674 | 0 | string_list = g_new0 (gchar *, 2); |
2675 | 0 | string_list[0] = g_strndup (&string[start_position], |
2676 | 0 | string_len - start_position); |
2677 | 0 | return string_list; |
2678 | 0 | } |
2679 | | |
2680 | 0 | list = NULL; |
2681 | 0 | token_count = 0; |
2682 | 0 | last_separator_end = start_position; |
2683 | 0 | last_match_is_empty = FALSE; |
2684 | |
|
2685 | 0 | match_ok = g_regex_match_full (regex, string, string_len, start_position, |
2686 | 0 | match_options, &match_info, &tmp_error); |
2687 | |
|
2688 | 0 | while (tmp_error == NULL) |
2689 | 0 | { |
2690 | 0 | if (match_ok) |
2691 | 0 | { |
2692 | 0 | last_match_is_empty = |
2693 | 0 | (match_info->offsets[0] == match_info->offsets[1]); |
2694 | | |
2695 | | /* we need to skip empty separators at the same position of the end |
2696 | | * of another separator. e.g. the string is "a b" and the separator |
2697 | | * is " *", so from 1 to 2 we have a match and at position 2 we have |
2698 | | * an empty match. */ |
2699 | 0 | if (last_separator_end != match_info->offsets[1]) |
2700 | 0 | { |
2701 | 0 | gchar *token; |
2702 | 0 | gint match_count; |
2703 | |
|
2704 | 0 | token = g_strndup (string + last_separator_end, |
2705 | 0 | match_info->offsets[0] - last_separator_end); |
2706 | 0 | list = g_list_prepend (list, token); |
2707 | 0 | token_count++; |
2708 | | |
2709 | | /* if there were substrings, these need to be added to |
2710 | | * the list. */ |
2711 | 0 | match_count = g_match_info_get_match_count (match_info); |
2712 | 0 | if (match_count > 1) |
2713 | 0 | { |
2714 | 0 | for (i = 1; i < match_count; i++) |
2715 | 0 | list = g_list_prepend (list, g_match_info_fetch (match_info, i)); |
2716 | 0 | } |
2717 | 0 | } |
2718 | 0 | } |
2719 | 0 | else |
2720 | 0 | { |
2721 | | /* if there was no match, copy to end of string. */ |
2722 | 0 | if (!last_match_is_empty) |
2723 | 0 | { |
2724 | 0 | gchar *token = g_strndup (string + last_separator_end, |
2725 | 0 | match_info->string_len - last_separator_end); |
2726 | 0 | list = g_list_prepend (list, token); |
2727 | 0 | } |
2728 | | /* no more tokens, end the loop. */ |
2729 | 0 | break; |
2730 | 0 | } |
2731 | | |
2732 | | /* -1 to leave room for the last part. */ |
2733 | 0 | if (token_count >= max_tokens - 1) |
2734 | 0 | { |
2735 | | /* we have reached the maximum number of tokens, so we copy |
2736 | | * the remaining part of the string. */ |
2737 | 0 | if (last_match_is_empty) |
2738 | 0 | { |
2739 | | /* the last match was empty, so we have moved one char |
2740 | | * after the real position to avoid empty matches at the |
2741 | | * same position. */ |
2742 | 0 | match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
2743 | 0 | } |
2744 | | /* the if is needed in the case we have terminated the available |
2745 | | * tokens, but we are at the end of the string, so there are no |
2746 | | * characters left to copy. */ |
2747 | 0 | if (string_len > match_info->pos) |
2748 | 0 | { |
2749 | 0 | gchar *token = g_strndup (string + match_info->pos, |
2750 | 0 | string_len - match_info->pos); |
2751 | 0 | list = g_list_prepend (list, token); |
2752 | 0 | } |
2753 | | /* end the loop. */ |
2754 | 0 | break; |
2755 | 0 | } |
2756 | | |
2757 | 0 | last_separator_end = match_info->pos; |
2758 | 0 | if (last_match_is_empty) |
2759 | | /* if the last match was empty, g_match_info_next() has moved |
2760 | | * forward to avoid infinite loops, but we still need to copy that |
2761 | | * character. */ |
2762 | 0 | last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
2763 | |
|
2764 | 0 | match_ok = g_match_info_next (match_info, &tmp_error); |
2765 | 0 | } |
2766 | 0 | g_match_info_free (match_info); |
2767 | 0 | if (tmp_error != NULL) |
2768 | 0 | { |
2769 | 0 | g_propagate_error (error, tmp_error); |
2770 | 0 | g_list_free_full (list, g_free); |
2771 | 0 | return NULL; |
2772 | 0 | } |
2773 | | |
2774 | 0 | string_list = g_new (gchar *, g_list_length (list) + 1); |
2775 | 0 | i = 0; |
2776 | 0 | for (last = g_list_last (list); last; last = g_list_previous (last)) |
2777 | 0 | string_list[i++] = last->data; |
2778 | 0 | string_list[i] = NULL; |
2779 | 0 | g_list_free (list); |
2780 | |
|
2781 | 0 | return string_list; |
2782 | 0 | } |
2783 | | |
/* Kinds of item a parsed replacement string is made of; stored in
 * InterpolationData.type. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text */
  REPL_TYPE_CHARACTER          = 1, /* single character from an escape such as \n */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* \g<name> */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* \number or \g<number> */
  REPL_TYPE_CHANGE_CASE        = 4  /* \l, \u, \L, \U or \E */
};
2792 | | |
/* Case-conversion state used while interpolating a replacement string.
 * Each mode is a distinct bit so that groups of modes can be tested with
 * the *_MASK values. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* \E: no conversion */
  CHANGE_CASE_UPPER        = 0x02, /* \U */
  CHANGE_CASE_LOWER        = 0x04, /* \L */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* \u */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* \l */

  /* either of the single-character modes */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* either of the lower-casing modes */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* either of the upper-casing modes */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2804 | | |
2805 | | struct _InterpolationData |
2806 | | { |
2807 | | gchar *text; |
2808 | | gint type; |
2809 | | gint num; |
2810 | | gchar c; |
2811 | | ChangeCase change_case; |
2812 | | }; |
2813 | | |
2814 | | static void |
2815 | | free_interpolation_data (InterpolationData *data) |
2816 | 0 | { |
2817 | 0 | g_free (data->text); |
2818 | 0 | g_free (data); |
2819 | 0 | } |
2820 | | |
2821 | | static const gchar * |
2822 | | expand_escape (const gchar *replacement, |
2823 | | const gchar *p, |
2824 | | InterpolationData *data, |
2825 | | GError **error) |
2826 | 0 | { |
2827 | 0 | const gchar *q, *r; |
2828 | 0 | gint x, d, h, i; |
2829 | 0 | const gchar *error_detail; |
2830 | 0 | gint base = 0; |
2831 | 0 | GError *tmp_error = NULL; |
2832 | |
|
2833 | 0 | p++; |
2834 | 0 | switch (*p) |
2835 | 0 | { |
2836 | 0 | case 't': |
2837 | 0 | p++; |
2838 | 0 | data->c = '\t'; |
2839 | 0 | data->type = REPL_TYPE_CHARACTER; |
2840 | 0 | break; |
2841 | 0 | case 'n': |
2842 | 0 | p++; |
2843 | 0 | data->c = '\n'; |
2844 | 0 | data->type = REPL_TYPE_CHARACTER; |
2845 | 0 | break; |
2846 | 0 | case 'v': |
2847 | 0 | p++; |
2848 | 0 | data->c = '\v'; |
2849 | 0 | data->type = REPL_TYPE_CHARACTER; |
2850 | 0 | break; |
2851 | 0 | case 'r': |
2852 | 0 | p++; |
2853 | 0 | data->c = '\r'; |
2854 | 0 | data->type = REPL_TYPE_CHARACTER; |
2855 | 0 | break; |
2856 | 0 | case 'f': |
2857 | 0 | p++; |
2858 | 0 | data->c = '\f'; |
2859 | 0 | data->type = REPL_TYPE_CHARACTER; |
2860 | 0 | break; |
2861 | 0 | case 'a': |
2862 | 0 | p++; |
2863 | 0 | data->c = '\a'; |
2864 | 0 | data->type = REPL_TYPE_CHARACTER; |
2865 | 0 | break; |
2866 | 0 | case 'b': |
2867 | 0 | p++; |
2868 | 0 | data->c = '\b'; |
2869 | 0 | data->type = REPL_TYPE_CHARACTER; |
2870 | 0 | break; |
2871 | 0 | case '\\': |
2872 | 0 | p++; |
2873 | 0 | data->c = '\\'; |
2874 | 0 | data->type = REPL_TYPE_CHARACTER; |
2875 | 0 | break; |
2876 | 0 | case 'x': |
2877 | 0 | p++; |
2878 | 0 | x = 0; |
2879 | 0 | if (*p == '{') |
2880 | 0 | { |
2881 | 0 | p++; |
2882 | 0 | do |
2883 | 0 | { |
2884 | 0 | h = g_ascii_xdigit_value (*p); |
2885 | 0 | if (h < 0) |
2886 | 0 | { |
2887 | 0 | error_detail = _("hexadecimal digit or “}” expected"); |
2888 | 0 | goto error; |
2889 | 0 | } |
2890 | 0 | x = x * 16 + h; |
2891 | 0 | p++; |
2892 | 0 | } |
2893 | 0 | while (*p != '}'); |
2894 | 0 | p++; |
2895 | 0 | } |
2896 | 0 | else |
2897 | 0 | { |
2898 | 0 | for (i = 0; i < 2; i++) |
2899 | 0 | { |
2900 | 0 | h = g_ascii_xdigit_value (*p); |
2901 | 0 | if (h < 0) |
2902 | 0 | { |
2903 | 0 | error_detail = _("hexadecimal digit expected"); |
2904 | 0 | goto error; |
2905 | 0 | } |
2906 | 0 | x = x * 16 + h; |
2907 | 0 | p++; |
2908 | 0 | } |
2909 | 0 | } |
2910 | 0 | data->type = REPL_TYPE_STRING; |
2911 | 0 | data->text = g_new0 (gchar, 8); |
2912 | 0 | g_unichar_to_utf8 (x, data->text); |
2913 | 0 | break; |
2914 | 0 | case 'l': |
2915 | 0 | p++; |
2916 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2917 | 0 | data->change_case = CHANGE_CASE_LOWER_SINGLE; |
2918 | 0 | break; |
2919 | 0 | case 'u': |
2920 | 0 | p++; |
2921 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2922 | 0 | data->change_case = CHANGE_CASE_UPPER_SINGLE; |
2923 | 0 | break; |
2924 | 0 | case 'L': |
2925 | 0 | p++; |
2926 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2927 | 0 | data->change_case = CHANGE_CASE_LOWER; |
2928 | 0 | break; |
2929 | 0 | case 'U': |
2930 | 0 | p++; |
2931 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2932 | 0 | data->change_case = CHANGE_CASE_UPPER; |
2933 | 0 | break; |
2934 | 0 | case 'E': |
2935 | 0 | p++; |
2936 | 0 | data->type = REPL_TYPE_CHANGE_CASE; |
2937 | 0 | data->change_case = CHANGE_CASE_NONE; |
2938 | 0 | break; |
2939 | 0 | case 'g': |
2940 | 0 | p++; |
2941 | 0 | if (*p != '<') |
2942 | 0 | { |
2943 | 0 | error_detail = _("missing “<” in symbolic reference"); |
2944 | 0 | goto error; |
2945 | 0 | } |
2946 | 0 | q = p + 1; |
2947 | 0 | do |
2948 | 0 | { |
2949 | 0 | p++; |
2950 | 0 | if (!*p) |
2951 | 0 | { |
2952 | 0 | error_detail = _("unfinished symbolic reference"); |
2953 | 0 | goto error; |
2954 | 0 | } |
2955 | 0 | } |
2956 | 0 | while (*p != '>'); |
2957 | 0 | if (p - q == 0) |
2958 | 0 | { |
2959 | 0 | error_detail = _("zero-length symbolic reference"); |
2960 | 0 | goto error; |
2961 | 0 | } |
2962 | 0 | if (g_ascii_isdigit (*q)) |
2963 | 0 | { |
2964 | 0 | x = 0; |
2965 | 0 | do |
2966 | 0 | { |
2967 | 0 | h = g_ascii_digit_value (*q); |
2968 | 0 | if (h < 0) |
2969 | 0 | { |
2970 | 0 | error_detail = _("digit expected"); |
2971 | 0 | p = q; |
2972 | 0 | goto error; |
2973 | 0 | } |
2974 | 0 | x = x * 10 + h; |
2975 | 0 | q++; |
2976 | 0 | } |
2977 | 0 | while (q != p); |
2978 | 0 | data->num = x; |
2979 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
2980 | 0 | } |
2981 | 0 | else |
2982 | 0 | { |
2983 | 0 | r = q; |
2984 | 0 | do |
2985 | 0 | { |
2986 | 0 | if (!g_ascii_isalnum (*r)) |
2987 | 0 | { |
2988 | 0 | error_detail = _("illegal symbolic reference"); |
2989 | 0 | p = r; |
2990 | 0 | goto error; |
2991 | 0 | } |
2992 | 0 | r++; |
2993 | 0 | } |
2994 | 0 | while (r != p); |
2995 | 0 | data->text = g_strndup (q, p - q); |
2996 | 0 | data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
2997 | 0 | } |
2998 | 0 | p++; |
2999 | 0 | break; |
3000 | 0 | case '0': |
3001 | | /* if \0 is followed by a number is an octal number representing a |
3002 | | * character, else it is a numeric reference. */ |
3003 | 0 | if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) |
3004 | 0 | { |
3005 | 0 | base = 8; |
3006 | 0 | p = g_utf8_next_char (p); |
3007 | 0 | } |
3008 | 0 | G_GNUC_FALLTHROUGH; |
3009 | 0 | case '1': |
3010 | 0 | case '2': |
3011 | 0 | case '3': |
3012 | 0 | case '4': |
3013 | 0 | case '5': |
3014 | 0 | case '6': |
3015 | 0 | case '7': |
3016 | 0 | case '8': |
3017 | 0 | case '9': |
3018 | 0 | x = 0; |
3019 | 0 | d = 0; |
3020 | 0 | for (i = 0; i < 3; i++) |
3021 | 0 | { |
3022 | 0 | h = g_ascii_digit_value (*p); |
3023 | 0 | if (h < 0) |
3024 | 0 | break; |
3025 | 0 | if (h > 7) |
3026 | 0 | { |
3027 | 0 | if (base == 8) |
3028 | 0 | break; |
3029 | 0 | else |
3030 | 0 | base = 10; |
3031 | 0 | } |
3032 | 0 | if (i == 2 && base == 10) |
3033 | 0 | break; |
3034 | 0 | x = x * 8 + h; |
3035 | 0 | d = d * 10 + h; |
3036 | 0 | p++; |
3037 | 0 | } |
3038 | 0 | if (base == 8 || i == 3) |
3039 | 0 | { |
3040 | 0 | data->type = REPL_TYPE_STRING; |
3041 | 0 | data->text = g_new0 (gchar, 8); |
3042 | 0 | g_unichar_to_utf8 (x, data->text); |
3043 | 0 | } |
3044 | 0 | else |
3045 | 0 | { |
3046 | 0 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
3047 | 0 | data->num = d; |
3048 | 0 | } |
3049 | 0 | break; |
3050 | 0 | case 0: |
3051 | 0 | error_detail = _("stray final “\\”"); |
3052 | 0 | goto error; |
3053 | 0 | break; |
3054 | 0 | default: |
3055 | 0 | error_detail = _("unknown escape sequence"); |
3056 | 0 | goto error; |
3057 | 0 | } |
3058 | | |
3059 | 0 | return p; |
3060 | | |
3061 | 0 | error: |
3062 | | /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
3063 | 0 | tmp_error = g_error_new (G_REGEX_ERROR, |
3064 | 0 | G_REGEX_ERROR_REPLACE, |
3065 | 0 | _("Error while parsing replacement " |
3066 | 0 | "text “%s” at char %lu: %s"), |
3067 | 0 | replacement, |
3068 | 0 | (gulong)(p - replacement), |
3069 | 0 | error_detail); |
3070 | 0 | g_propagate_error (error, tmp_error); |
3071 | |
|
3072 | 0 | return NULL; |
3073 | 0 | } |
3074 | | |
3075 | | static GList * |
3076 | | split_replacement (const gchar *replacement, |
3077 | | GError **error) |
3078 | 0 | { |
3079 | 0 | GList *list = NULL; |
3080 | 0 | InterpolationData *data; |
3081 | 0 | const gchar *p, *start; |
3082 | |
|
3083 | 0 | start = p = replacement; |
3084 | 0 | while (*p) |
3085 | 0 | { |
3086 | 0 | if (*p == '\\') |
3087 | 0 | { |
3088 | 0 | data = g_new0 (InterpolationData, 1); |
3089 | 0 | start = p = expand_escape (replacement, p, data, error); |
3090 | 0 | if (p == NULL) |
3091 | 0 | { |
3092 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3093 | 0 | free_interpolation_data (data); |
3094 | |
|
3095 | 0 | return NULL; |
3096 | 0 | } |
3097 | 0 | list = g_list_prepend (list, data); |
3098 | 0 | } |
3099 | 0 | else |
3100 | 0 | { |
3101 | 0 | p++; |
3102 | 0 | if (*p == '\\' || *p == '\0') |
3103 | 0 | { |
3104 | 0 | if (p - start > 0) |
3105 | 0 | { |
3106 | 0 | data = g_new0 (InterpolationData, 1); |
3107 | 0 | data->text = g_strndup (start, p - start); |
3108 | 0 | data->type = REPL_TYPE_STRING; |
3109 | 0 | list = g_list_prepend (list, data); |
3110 | 0 | } |
3111 | 0 | } |
3112 | 0 | } |
3113 | 0 | } |
3114 | | |
3115 | 0 | return g_list_reverse (list); |
3116 | 0 | } |
3117 | | |
/* Converts the code point c according to change_case: to lower case when
 * one of the lower-casing bits (CHANGE_CASE_LOWER_MASK) is set, to upper
 * case for every other value -- CHANGE_CASE_NONE included, so callers must
 * not use this macro when no conversion is wanted. */
#define CHANGE_CASE(c, change_case) \
  ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
     ? g_unichar_tolower (c) \
     : g_unichar_toupper (c))
3123 | | |
3124 | | static void |
3125 | | string_append (GString *string, |
3126 | | const gchar *text, |
3127 | | ChangeCase *change_case) |
3128 | 0 | { |
3129 | 0 | gunichar c; |
3130 | |
|
3131 | 0 | if (text[0] == '\0') |
3132 | 0 | return; |
3133 | | |
3134 | 0 | if (*change_case == CHANGE_CASE_NONE) |
3135 | 0 | { |
3136 | 0 | g_string_append (string, text); |
3137 | 0 | } |
3138 | 0 | else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
3139 | 0 | { |
3140 | 0 | c = g_utf8_get_char (text); |
3141 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3142 | 0 | g_string_append (string, g_utf8_next_char (text)); |
3143 | 0 | *change_case = CHANGE_CASE_NONE; |
3144 | 0 | } |
3145 | 0 | else |
3146 | 0 | { |
3147 | 0 | while (*text != '\0') |
3148 | 0 | { |
3149 | 0 | c = g_utf8_get_char (text); |
3150 | 0 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
3151 | 0 | text = g_utf8_next_char (text); |
3152 | 0 | } |
3153 | 0 | } |
3154 | 0 | } |
3155 | | |
3156 | | static gboolean |
3157 | | interpolate_replacement (const GMatchInfo *match_info, |
3158 | | GString *result, |
3159 | | gpointer data) |
3160 | 0 | { |
3161 | 0 | GList *list; |
3162 | 0 | InterpolationData *idata; |
3163 | 0 | gchar *match; |
3164 | 0 | ChangeCase change_case = CHANGE_CASE_NONE; |
3165 | |
|
3166 | 0 | for (list = data; list; list = list->next) |
3167 | 0 | { |
3168 | 0 | idata = list->data; |
3169 | 0 | switch (idata->type) |
3170 | 0 | { |
3171 | 0 | case REPL_TYPE_STRING: |
3172 | 0 | string_append (result, idata->text, &change_case); |
3173 | 0 | break; |
3174 | 0 | case REPL_TYPE_CHARACTER: |
3175 | 0 | g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
3176 | 0 | if (change_case & CHANGE_CASE_SINGLE_MASK) |
3177 | 0 | change_case = CHANGE_CASE_NONE; |
3178 | 0 | break; |
3179 | 0 | case REPL_TYPE_NUMERIC_REFERENCE: |
3180 | 0 | match = g_match_info_fetch (match_info, idata->num); |
3181 | 0 | if (match) |
3182 | 0 | { |
3183 | 0 | string_append (result, match, &change_case); |
3184 | 0 | g_free (match); |
3185 | 0 | } |
3186 | 0 | break; |
3187 | 0 | case REPL_TYPE_SYMBOLIC_REFERENCE: |
3188 | 0 | match = g_match_info_fetch_named (match_info, idata->text); |
3189 | 0 | if (match) |
3190 | 0 | { |
3191 | 0 | string_append (result, match, &change_case); |
3192 | 0 | g_free (match); |
3193 | 0 | } |
3194 | 0 | break; |
3195 | 0 | case REPL_TYPE_CHANGE_CASE: |
3196 | 0 | change_case = idata->change_case; |
3197 | 0 | break; |
3198 | 0 | } |
3199 | 0 | } |
3200 | | |
3201 | 0 | return FALSE; |
3202 | 0 | } |
3203 | | |
3204 | | /* whether actual match_info is needed for replacement, i.e. |
3205 | | * whether there are references |
3206 | | */ |
3207 | | static gboolean |
3208 | | interpolation_list_needs_match (GList *list) |
3209 | 0 | { |
3210 | 0 | while (list != NULL) |
3211 | 0 | { |
3212 | 0 | InterpolationData *data = list->data; |
3213 | |
|
3214 | 0 | if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
3215 | 0 | data->type == REPL_TYPE_NUMERIC_REFERENCE) |
3216 | 0 | { |
3217 | 0 | return TRUE; |
3218 | 0 | } |
3219 | | |
3220 | 0 | list = list->next; |
3221 | 0 | } |
3222 | | |
3223 | 0 | return FALSE; |
3224 | 0 | } |
3225 | | |
3226 | | /** |
3227 | | * g_regex_replace: |
3228 | | * @regex: a #GRegex structure |
3229 | | * @string: (array length=string_len): the string to perform matches against |
3230 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3231 | | * @start_position: starting index of the string to match, in bytes |
3232 | | * @replacement: text to replace each match with |
3233 | | * @match_options: options for the match |
3234 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3235 | | * |
3236 | | * Replaces all occurrences of the pattern in @regex with the |
3237 | | * replacement text. Backreferences of the form '\number' or |
3238 | | * '\g<number>' in the replacement text are interpolated by the |
3239 | | * number-th captured subexpression of the match, '\g<name>' refers |
3240 | | * to the captured subexpression with the given name. '\0' refers |
3241 | | * to the complete match, but '\0' followed by a number is the octal |
3242 | | * representation of a character. To include a literal '\' in the |
3243 | | * replacement, write '\\\\'. |
3244 | | * |
3245 | | * There are also escapes that changes the case of the following text: |
3246 | | * |
3247 | | * - \l: Convert to lower case the next character |
3248 | | * - \u: Convert to upper case the next character |
3249 | | * - \L: Convert to lower case till \E |
3250 | | * - \U: Convert to upper case till \E |
3251 | | * - \E: End case modification |
3252 | | * |
3253 | | * If you do not need to use backreferences use g_regex_replace_literal(). |
3254 | | * |
3255 | | * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was |
3256 | | * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
3257 | | * you can use g_regex_replace_literal(). |
3258 | | * |
3259 | | * Setting @start_position differs from just passing over a shortened |
3260 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that |
3261 | | * begins with any kind of lookbehind assertion, such as "\b". |
3262 | | * |
3263 | | * Returns: a newly allocated string containing the replacements |
3264 | | * |
3265 | | * Since: 2.14 |
3266 | | */ |
3267 | | gchar * |
3268 | | g_regex_replace (const GRegex *regex, |
3269 | | const gchar *string, |
3270 | | gssize string_len, |
3271 | | gint start_position, |
3272 | | const gchar *replacement, |
3273 | | GRegexMatchFlags match_options, |
3274 | | GError **error) |
3275 | 0 | { |
3276 | 0 | gchar *result; |
3277 | 0 | GList *list; |
3278 | 0 | GError *tmp_error = NULL; |
3279 | |
|
3280 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3281 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3282 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3283 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3284 | 0 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
3285 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3286 | | |
3287 | 0 | list = split_replacement (replacement, &tmp_error); |
3288 | 0 | if (tmp_error != NULL) |
3289 | 0 | { |
3290 | 0 | g_propagate_error (error, tmp_error); |
3291 | 0 | return NULL; |
3292 | 0 | } |
3293 | | |
3294 | 0 | result = g_regex_replace_eval (regex, |
3295 | 0 | string, string_len, start_position, |
3296 | 0 | match_options, |
3297 | 0 | interpolate_replacement, |
3298 | 0 | (gpointer)list, |
3299 | 0 | &tmp_error); |
3300 | 0 | if (tmp_error != NULL) |
3301 | 0 | g_propagate_error (error, tmp_error); |
3302 | |
|
3303 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3304 | |
|
3305 | 0 | return result; |
3306 | 0 | } |
3307 | | |
3308 | | static gboolean |
3309 | | literal_replacement (const GMatchInfo *match_info, |
3310 | | GString *result, |
3311 | | gpointer data) |
3312 | 0 | { |
3313 | 0 | g_string_append (result, data); |
3314 | 0 | return FALSE; |
3315 | 0 | } |
3316 | | |
3317 | | /** |
3318 | | * g_regex_replace_literal: |
3319 | | * @regex: a #GRegex structure |
3320 | | * @string: (array length=string_len): the string to perform matches against |
3321 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3322 | | * @start_position: starting index of the string to match, in bytes |
3323 | | * @replacement: text to replace each match with |
3324 | | * @match_options: options for the match |
3325 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3326 | | * |
3327 | | * Replaces all occurrences of the pattern in @regex with the |
3328 | | * replacement text. @replacement is replaced literally, to |
3329 | | * include backreferences use g_regex_replace(). |
3330 | | * |
3331 | | * Setting @start_position differs from just passing over a |
3332 | | * shortened string and setting %G_REGEX_MATCH_NOTBOL in the |
3333 | | * case of a pattern that begins with any kind of lookbehind |
3334 | | * assertion, such as "\b". |
3335 | | * |
3336 | | * Returns: a newly allocated string containing the replacements |
3337 | | * |
3338 | | * Since: 2.14 |
3339 | | */ |
3340 | | gchar * |
3341 | | g_regex_replace_literal (const GRegex *regex, |
3342 | | const gchar *string, |
3343 | | gssize string_len, |
3344 | | gint start_position, |
3345 | | const gchar *replacement, |
3346 | | GRegexMatchFlags match_options, |
3347 | | GError **error) |
3348 | 0 | { |
3349 | 0 | g_return_val_if_fail (replacement != NULL, NULL); |
3350 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3351 | | |
3352 | 0 | return g_regex_replace_eval (regex, |
3353 | 0 | string, string_len, start_position, |
3354 | 0 | match_options, |
3355 | 0 | literal_replacement, |
3356 | 0 | (gpointer)replacement, |
3357 | 0 | error); |
3358 | 0 | } |
3359 | | |
3360 | | /** |
3361 | | * g_regex_replace_eval: |
3362 | | * @regex: a #GRegex structure from g_regex_new() |
3363 | | * @string: (array length=string_len): string to perform matches against |
3364 | | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
3365 | | * @start_position: starting index of the string to match, in bytes |
3366 | | * @match_options: options for the match |
3367 | | * @eval: a function to call for each match |
3368 | | * @user_data: user data to pass to the function |
3369 | | * @error: location to store the error occurring, or %NULL to ignore errors |
3370 | | * |
3371 | | * Replaces occurrences of the pattern in regex with the output of |
3372 | | * @eval for that occurrence. |
3373 | | * |
3374 | | * Setting @start_position differs from just passing over a shortened |
3375 | | * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
3376 | | * that begins with any kind of lookbehind assertion, such as "\b". |
3377 | | * |
3378 | | * The following example uses g_regex_replace_eval() to replace multiple |
3379 | | * strings at once: |
3380 | | * |[<!-- language="C" --> |
3381 | | * static gboolean |
3382 | | * eval_cb (const GMatchInfo *info, |
3383 | | * GString *res, |
3384 | | * gpointer data) |
3385 | | * { |
3386 | | * gchar *match; |
3387 | | * gchar *r; |
3388 | | * |
3389 | | * match = g_match_info_fetch (info, 0); |
3390 | | * r = g_hash_table_lookup ((GHashTable *)data, match); |
3391 | | * g_string_append (res, r); |
3392 | | * g_free (match); |
3393 | | * |
3394 | | * return FALSE; |
3395 | | * } |
3396 | | * |
3397 | | * ... |
3398 | | * |
3399 | | * GRegex *reg; |
3400 | | * GHashTable *h; |
3401 | | * gchar *res; |
3402 | | * |
3403 | | * h = g_hash_table_new (g_str_hash, g_str_equal); |
3404 | | * |
3405 | | * g_hash_table_insert (h, "1", "ONE"); |
3406 | | * g_hash_table_insert (h, "2", "TWO"); |
3407 | | * g_hash_table_insert (h, "3", "THREE"); |
3408 | | * g_hash_table_insert (h, "4", "FOUR"); |
3409 | | * |
3410 | | * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
3411 | | * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); |
3412 | | * g_hash_table_destroy (h); |
3413 | | * |
3414 | | * ... |
3415 | | * ]| |
3416 | | * |
3417 | | * Returns: a newly allocated string containing the replacements |
3418 | | * |
3419 | | * Since: 2.14 |
3420 | | */ |
3421 | | gchar * |
3422 | | g_regex_replace_eval (const GRegex *regex, |
3423 | | const gchar *string, |
3424 | | gssize string_len, |
3425 | | gint start_position, |
3426 | | GRegexMatchFlags match_options, |
3427 | | GRegexEvalCallback eval, |
3428 | | gpointer user_data, |
3429 | | GError **error) |
3430 | 0 | { |
3431 | 0 | GMatchInfo *match_info; |
3432 | 0 | GString *result; |
3433 | 0 | gint str_pos = 0; |
3434 | 0 | gboolean done = FALSE; |
3435 | 0 | GError *tmp_error = NULL; |
3436 | |
|
3437 | 0 | g_return_val_if_fail (regex != NULL, NULL); |
3438 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3439 | 0 | g_return_val_if_fail (start_position >= 0, NULL); |
3440 | 0 | g_return_val_if_fail (eval != NULL, NULL); |
3441 | 0 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
3442 | | |
3443 | 0 | if (string_len < 0) |
3444 | 0 | string_len = strlen (string); |
3445 | |
|
3446 | 0 | result = g_string_sized_new (string_len); |
3447 | | |
3448 | | /* run down the string making matches. */ |
3449 | 0 | g_regex_match_full (regex, string, string_len, start_position, |
3450 | 0 | match_options, &match_info, &tmp_error); |
3451 | 0 | while (!done && g_match_info_matches (match_info)) |
3452 | 0 | { |
3453 | 0 | g_string_append_len (result, |
3454 | 0 | string + str_pos, |
3455 | 0 | match_info->offsets[0] - str_pos); |
3456 | 0 | done = (*eval) (match_info, result, user_data); |
3457 | 0 | str_pos = match_info->offsets[1]; |
3458 | 0 | g_match_info_next (match_info, &tmp_error); |
3459 | 0 | } |
3460 | 0 | g_match_info_free (match_info); |
3461 | 0 | if (tmp_error != NULL) |
3462 | 0 | { |
3463 | 0 | g_propagate_error (error, tmp_error); |
3464 | 0 | g_string_free (result, TRUE); |
3465 | 0 | return NULL; |
3466 | 0 | } |
3467 | | |
3468 | 0 | g_string_append_len (result, string + str_pos, string_len - str_pos); |
3469 | 0 | return g_string_free (result, FALSE); |
3470 | 0 | } |
3471 | | |
3472 | | /** |
3473 | | * g_regex_check_replacement: |
3474 | | * @replacement: the replacement string |
3475 | | * @has_references: (out) (optional): location to store information about |
3476 | | * references in @replacement or %NULL |
3477 | | * @error: location to store error |
3478 | | * |
3479 | | * Checks whether @replacement is a valid replacement string |
3480 | | * (see g_regex_replace()), i.e. that all escape sequences in |
3481 | | * it are valid. |
3482 | | * |
3483 | | * If @has_references is not %NULL then @replacement is checked |
3484 | | * for pattern references. For instance, replacement text 'foo\n' |
3485 | | * does not contain references and may be evaluated without information |
3486 | | * about actual match, but '\0\1' (whole match followed by first |
3487 | | * subpattern) requires valid #GMatchInfo object. |
3488 | | * |
3489 | | * Returns: whether @replacement is a valid replacement string |
3490 | | * |
3491 | | * Since: 2.14 |
3492 | | */ |
3493 | | gboolean |
3494 | | g_regex_check_replacement (const gchar *replacement, |
3495 | | gboolean *has_references, |
3496 | | GError **error) |
3497 | 0 | { |
3498 | 0 | GList *list; |
3499 | 0 | GError *tmp = NULL; |
3500 | |
|
3501 | 0 | list = split_replacement (replacement, &tmp); |
3502 | |
|
3503 | 0 | if (tmp) |
3504 | 0 | { |
3505 | 0 | g_propagate_error (error, tmp); |
3506 | 0 | return FALSE; |
3507 | 0 | } |
3508 | | |
3509 | 0 | if (has_references) |
3510 | 0 | *has_references = interpolation_list_needs_match (list); |
3511 | |
|
3512 | 0 | g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
3513 | |
|
3514 | 0 | return TRUE; |
3515 | 0 | } |
3516 | | |
3517 | | /** |
3518 | | * g_regex_escape_nul: |
3519 | | * @string: the string to escape |
3520 | | * @length: the length of @string |
3521 | | * |
3522 | | * Escapes the nul characters in @string to "\x00". It can be used |
3523 | | * to compile a regex with embedded nul characters. |
3524 | | * |
3525 | | * For completeness, @length can be -1 for a nul-terminated string. |
3526 | | * In this case the output string will be of course equal to @string. |
3527 | | * |
3528 | | * Returns: a newly-allocated escaped string |
3529 | | * |
3530 | | * Since: 2.30 |
3531 | | */ |
3532 | | gchar * |
3533 | | g_regex_escape_nul (const gchar *string, |
3534 | | gint length) |
3535 | 0 | { |
3536 | 0 | GString *escaped; |
3537 | 0 | const gchar *p, *piece_start, *end; |
3538 | 0 | gint backslashes; |
3539 | |
|
3540 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3541 | | |
3542 | 0 | if (length < 0) |
3543 | 0 | return g_strdup (string); |
3544 | | |
3545 | 0 | end = string + length; |
3546 | 0 | p = piece_start = string; |
3547 | 0 | escaped = g_string_sized_new (length + 1); |
3548 | |
|
3549 | 0 | backslashes = 0; |
3550 | 0 | while (p < end) |
3551 | 0 | { |
3552 | 0 | switch (*p) |
3553 | 0 | { |
3554 | 0 | case '\0': |
3555 | 0 | if (p != piece_start) |
3556 | 0 | { |
3557 | | /* copy the previous piece. */ |
3558 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3559 | 0 | } |
3560 | 0 | if ((backslashes & 1) == 0) |
3561 | 0 | g_string_append_c (escaped, '\\'); |
3562 | 0 | g_string_append_c (escaped, 'x'); |
3563 | 0 | g_string_append_c (escaped, '0'); |
3564 | 0 | g_string_append_c (escaped, '0'); |
3565 | 0 | piece_start = ++p; |
3566 | 0 | backslashes = 0; |
3567 | 0 | break; |
3568 | 0 | case '\\': |
3569 | 0 | backslashes++; |
3570 | 0 | ++p; |
3571 | 0 | break; |
3572 | 0 | default: |
3573 | 0 | backslashes = 0; |
3574 | 0 | p = g_utf8_next_char (p); |
3575 | 0 | break; |
3576 | 0 | } |
3577 | 0 | } |
3578 | | |
3579 | 0 | if (piece_start < end) |
3580 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3581 | |
|
3582 | 0 | return g_string_free (escaped, FALSE); |
3583 | 0 | } |
3584 | | |
3585 | | /** |
3586 | | * g_regex_escape_string: |
3587 | | * @string: (array length=length): the string to escape |
3588 | | * @length: the length of @string, in bytes, or -1 if @string is nul-terminated |
3589 | | * |
3590 | | * Escapes the special characters used for regular expressions |
3591 | | * in @string, for instance "a.b*c" becomes "a\.b\*c". This |
3592 | | * function is useful to dynamically generate regular expressions. |
3593 | | * |
3594 | | * @string can contain nul characters that are replaced with "\0", |
3595 | | * in this case remember to specify the correct length of @string |
3596 | | * in @length. |
3597 | | * |
3598 | | * Returns: a newly-allocated escaped string |
3599 | | * |
3600 | | * Since: 2.14 |
3601 | | */ |
3602 | | gchar * |
3603 | | g_regex_escape_string (const gchar *string, |
3604 | | gint length) |
3605 | 0 | { |
3606 | 0 | GString *escaped; |
3607 | 0 | const char *p, *piece_start, *end; |
3608 | |
|
3609 | 0 | g_return_val_if_fail (string != NULL, NULL); |
3610 | | |
3611 | 0 | if (length < 0) |
3612 | 0 | length = strlen (string); |
3613 | |
|
3614 | 0 | end = string + length; |
3615 | 0 | p = piece_start = string; |
3616 | 0 | escaped = g_string_sized_new (length + 1); |
3617 | |
|
3618 | 0 | while (p < end) |
3619 | 0 | { |
3620 | 0 | switch (*p) |
3621 | 0 | { |
3622 | 0 | case '\0': |
3623 | 0 | case '\\': |
3624 | 0 | case '|': |
3625 | 0 | case '(': |
3626 | 0 | case ')': |
3627 | 0 | case '[': |
3628 | 0 | case ']': |
3629 | 0 | case '{': |
3630 | 0 | case '}': |
3631 | 0 | case '^': |
3632 | 0 | case '$': |
3633 | 0 | case '*': |
3634 | 0 | case '+': |
3635 | 0 | case '?': |
3636 | 0 | case '.': |
3637 | 0 | if (p != piece_start) |
3638 | | /* copy the previous piece. */ |
3639 | 0 | g_string_append_len (escaped, piece_start, p - piece_start); |
3640 | 0 | g_string_append_c (escaped, '\\'); |
3641 | 0 | if (*p == '\0') |
3642 | 0 | g_string_append_c (escaped, '0'); |
3643 | 0 | else |
3644 | 0 | g_string_append_c (escaped, *p); |
3645 | 0 | piece_start = ++p; |
3646 | 0 | break; |
3647 | 0 | default: |
3648 | 0 | p = g_utf8_next_char (p); |
3649 | 0 | break; |
3650 | 0 | } |
3651 | 0 | } |
3652 | | |
3653 | 0 | if (piece_start < end) |
3654 | 0 | g_string_append_len (escaped, piece_start, end - piece_start); |
3655 | |
|
3656 | 0 | return g_string_free (escaped, FALSE); |
3657 | 0 | } |