/src/wireshark/wsutil/regex.c
Line | Count | Source |
1 | | /* |
2 | | * Wireshark - Network traffic analyzer |
3 | | * By Gerald Combs <gerald@wireshark.org> |
4 | | * Copyright 1998 Gerald Combs |
5 | | * |
6 | | * SPDX-License-Identifier: GPL-2.0-or-later |
7 | | */ |
8 | | |
9 | | #include "config.h" |
10 | | |
11 | | #include "regex.h" |
12 | | |
13 | | #include <wsutil/str_util.h> |
14 | | #include <pcre2.h> |
15 | | |
16 | | |
17 | | struct _ws_regex { |
18 | | pcre2_code *code; |
19 | | char *pattern; |
20 | | }; |
21 | | |
22 | 0 | #define ERROR_MAXLEN_IN_CODE_UNITS 128 |
23 | | |
24 | | static char * |
25 | | get_error_msg(int errorcode) |
26 | 0 | { |
27 | 0 | uint8_t *buffer; |
28 | | |
29 | | /* |
30 | | * We have to provide a buffer and we don't know how long the |
31 | | * error message is or even the maximum size. From pcre2api(3): |
32 | | * "None of the messages are very long; a |
33 | | * buffer size of 120 code units is ample." |
34 | | */ |
35 | | /* Code unit = one byte */ |
36 | 0 | buffer = g_malloc(ERROR_MAXLEN_IN_CODE_UNITS); |
37 | | /* Message is returned with a trailing zero. */ |
38 | 0 | pcre2_get_error_message(errorcode, buffer, ERROR_MAXLEN_IN_CODE_UNITS); |
39 | | /* One more at the end for good luck. */ |
40 | 0 | buffer[ERROR_MAXLEN_IN_CODE_UNITS-1] = '\0'; |
41 | 0 | return (char*)buffer; |
42 | 0 | } |
43 | | |
44 | | |
45 | | static pcre2_code * |
46 | | compile_pcre2(const char *patt, ssize_t size, char **errmsg, unsigned flags) |
47 | 0 | { |
48 | 0 | pcre2_code *code; |
49 | 0 | int errorcode; |
50 | 0 | PCRE2_SIZE length; |
51 | 0 | PCRE2_SIZE erroroffset; |
52 | 0 | uint32_t options = 0; |
53 | |
|
54 | 0 | if (size < 0) |
55 | 0 | length = PCRE2_ZERO_TERMINATED; |
56 | 0 | else |
57 | 0 | length = (PCRE2_SIZE)size; |
58 | |
|
59 | 0 | if (flags & WS_REGEX_NEVER_UTF) |
60 | 0 | options |= PCRE2_NEVER_UTF; |
61 | 0 | if (flags & WS_REGEX_CASELESS) |
62 | 0 | options |= PCRE2_CASELESS; |
63 | 0 | if (flags & WS_REGEX_ANCHORED) |
64 | 0 | options |= PCRE2_ANCHORED; |
65 | | |
66 | | /* By default UTF-8 is off. */ |
67 | 0 | code = pcre2_compile_8((PCRE2_SPTR)patt, |
68 | 0 | length, |
69 | 0 | options, |
70 | 0 | &errorcode, |
71 | 0 | &erroroffset, |
72 | 0 | NULL); |
73 | |
|
74 | 0 | if (code == NULL) { |
75 | 0 | *errmsg = get_error_msg(errorcode); |
76 | 0 | return NULL; |
77 | 0 | } |
78 | | |
79 | 0 | return code; |
80 | 0 | } |
81 | | |
82 | | |
83 | | ws_regex_t * |
84 | | ws_regex_compile_ex(const char *patt, ssize_t size, char **errmsg, unsigned flags) |
85 | 0 | { |
86 | 0 | ws_return_val_if(!patt, NULL); |
87 | | |
88 | 0 | pcre2_code *code = compile_pcre2(patt, size, errmsg, flags); |
89 | 0 | if (code == NULL) |
90 | 0 | return NULL; |
91 | | |
92 | 0 | ws_regex_t *re = g_new(ws_regex_t, 1); |
93 | 0 | re->code = code; |
94 | 0 | re->pattern = ws_escape_string_len(NULL, patt, size, false); |
95 | 0 | return re; |
96 | 0 | } |
97 | | |
98 | | |
99 | | ws_regex_t * |
100 | | ws_regex_compile(const char *patt, char **errmsg) |
101 | 0 | { |
102 | 0 | return ws_regex_compile_ex(patt, -1, errmsg, 0); |
103 | 0 | } |
104 | | |
105 | | |
106 | | static bool |
107 | | match_pcre2(pcre2_code *code, const char *subject, ssize_t subj_length, |
108 | | size_t subj_offset, pcre2_match_data *match_data) |
109 | 0 | { |
110 | 0 | PCRE2_SIZE length; |
111 | 0 | int rc; |
112 | |
|
113 | 0 | if (subj_length < 0) |
114 | 0 | length = PCRE2_ZERO_TERMINATED; |
115 | 0 | else |
116 | 0 | length = (PCRE2_SIZE)subj_length; |
117 | |
|
118 | 0 | rc = pcre2_match(code, |
119 | 0 | (const uint8_t*)subject, |
120 | 0 | length, |
121 | 0 | (PCRE2_SIZE)subj_offset, |
122 | 0 | 0, /* default options */ |
123 | 0 | match_data, |
124 | 0 | NULL); |
125 | |
|
126 | 0 | if (rc < 0) { |
127 | | /* No match */ |
128 | 0 | if (rc != PCRE2_ERROR_NOMATCH) { |
129 | | /* Error. Should not happen with UTF-8 disabled. Some huge |
130 | | * subject strings could hit some internal limit. */ |
131 | 0 | char *msg = get_error_msg(rc); |
132 | 0 | ws_debug("Unexpected pcre2_match() error: %s.", msg); |
133 | 0 | g_free(msg); |
134 | 0 | } |
135 | 0 | return false; |
136 | 0 | } |
137 | | |
138 | | /* Matched */ |
139 | 0 | return true; |
140 | 0 | } |
141 | | |
142 | | |
143 | | bool |
144 | | ws_regex_matches(const ws_regex_t *re, const char *subj) |
145 | 0 | { |
146 | 0 | return ws_regex_matches_length(re, subj, -1); |
147 | 0 | } |
148 | | |
149 | | |
150 | | bool |
151 | | ws_regex_matches_length(const ws_regex_t *re, |
152 | | const char *subj, ssize_t subj_length) |
153 | 0 | { |
154 | 0 | bool matched; |
155 | 0 | pcre2_match_data *match_data; |
156 | |
|
157 | 0 | ws_return_val_if(!re, false); |
158 | 0 | ws_return_val_if(!subj, false); |
159 | | |
160 | | /* We don't use the matched substring but pcre2_match requires |
161 | | * at least one pair of offsets. */ |
162 | 0 | match_data = pcre2_match_data_create(1, NULL); |
163 | 0 | matched = match_pcre2(re->code, subj, subj_length, 0, match_data); |
164 | 0 | pcre2_match_data_free(match_data); |
165 | 0 | return matched; |
166 | 0 | } |
167 | | |
168 | | |
169 | | bool |
170 | | ws_regex_matches_pos(const ws_regex_t *re, |
171 | | const char *subj, ssize_t subj_length, |
172 | | size_t subj_offset, size_t pos_vect[2]) |
173 | 0 | { |
174 | 0 | bool matched; |
175 | 0 | pcre2_match_data *match_data; |
176 | |
|
177 | 0 | ws_return_val_if(!re, false); |
178 | 0 | ws_return_val_if(!subj, false); |
179 | | |
180 | 0 | match_data = pcre2_match_data_create(1, NULL); |
181 | 0 | matched = match_pcre2(re->code, subj, subj_length, subj_offset, match_data); |
182 | 0 | if (matched && pos_vect) { |
183 | 0 | PCRE2_SIZE *ovect = pcre2_get_ovector_pointer(match_data); |
184 | 0 | pos_vect[0] = ovect[0]; |
185 | 0 | pos_vect[1] = ovect[1]; |
186 | 0 | } |
187 | 0 | pcre2_match_data_free(match_data); |
188 | 0 | return matched; |
189 | 0 | } |
190 | | |
191 | | |
192 | | void |
193 | | ws_regex_free(ws_regex_t *re) |
194 | 0 | { |
195 | 0 | pcre2_code_free(re->code); |
196 | 0 | g_free(re->pattern); |
197 | 0 | g_free(re); |
198 | 0 | } |
199 | | |
200 | | |
201 | | const char * |
202 | | ws_regex_pattern(const ws_regex_t *re) |
203 | 0 | { |
204 | 0 | return re->pattern; |
205 | 0 | } |