/src/poco/Foundation/src/RegularExpression.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // RegularExpression.cpp |
3 | | // |
4 | | // Library: Foundation |
5 | | // Package: RegExp |
6 | | // Module: RegularExpression |
7 | | // |
8 | | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
9 | | // and Contributors. |
10 | | // |
11 | | // SPDX-License-Identifier: BSL-1.0 |
12 | | // |
13 | | |
14 | | |
15 | | #include "Poco/RegularExpression.h" |
16 | | #include "Poco/Exception.h" |
17 | | #include <sstream> |
18 | | #if defined(POCO_UNBUNDLED) |
19 | | #define PCRE2_CODE_UNIT_WIDTH 8 |
20 | | #include <pcre2.h> |
21 | | #else |
22 | | #include "pcre2_config.h" |
23 | | #include "pcre2.h" |
24 | | #endif |
25 | | |
26 | | |
27 | | namespace |
28 | | { |
29 | | class MatchData |
30 | | { |
31 | | public: |
32 | | MatchData(pcre2_code_8* code): |
33 | | _match(pcre2_match_data_create_from_pattern_8(reinterpret_cast<pcre2_code_8*>(code), nullptr)) |
34 | 0 | { |
35 | 0 | if (!_match) throw Poco::RegularExpressionException("cannot create match data"); |
36 | 0 | } |
37 | | |
38 | | ~MatchData() |
39 | 0 | { |
40 | 0 | if (_match) pcre2_match_data_free_8(_match); |
41 | 0 | } |
42 | | |
43 | | std::uint32_t count() const |
44 | 0 | { |
45 | 0 | return pcre2_get_ovector_count_8(_match); |
46 | 0 | } |
47 | | |
48 | | const PCRE2_SIZE* data() const |
49 | 0 | { |
50 | 0 | return pcre2_get_ovector_pointer_8(_match); |
51 | 0 | } |
52 | | |
53 | | operator pcre2_match_data_8*() |
54 | 0 | { |
55 | 0 | return _match; |
56 | 0 | } |
57 | | |
58 | | private: |
59 | | pcre2_match_data_8* _match; |
60 | | }; |
61 | | } |
62 | | |
63 | | |
64 | | namespace Poco { |
65 | | |
66 | | |
67 | | RegularExpression::RegularExpression(const std::string& pattern, int options, bool /*study*/): _pcre(nullptr) |
68 | 0 | { |
69 | 0 | int errorCode; |
70 | 0 | PCRE2_SIZE errorOffset; |
71 | 0 | unsigned nameCount; |
72 | 0 | unsigned nameEntrySize; |
73 | 0 | unsigned char* nameTable; |
74 | |
|
75 | 0 | pcre2_compile_context_8* context = pcre2_compile_context_create_8(nullptr); |
76 | 0 | if (!context) throw Poco::RegularExpressionException("cannot create compile context"); |
77 | | |
78 | 0 | if (options & RE_NEWLINE_LF) |
79 | 0 | pcre2_set_newline_8(context, PCRE2_NEWLINE_LF); |
80 | 0 | else if (options & RE_NEWLINE_CRLF) |
81 | 0 | pcre2_set_newline_8(context, PCRE2_NEWLINE_CRLF); |
82 | 0 | else if (options & RE_NEWLINE_ANY) |
83 | 0 | pcre2_set_newline_8(context, PCRE2_NEWLINE_ANY); |
84 | 0 | else if (options & RE_NEWLINE_ANYCRLF) |
85 | 0 | pcre2_set_newline_8(context, PCRE2_NEWLINE_ANYCRLF); |
86 | 0 | else // default RE_NEWLINE_CR |
87 | 0 | pcre2_set_newline_8(context, PCRE2_NEWLINE_CR); |
88 | |
|
89 | 0 | _pcre = pcre2_compile_8(reinterpret_cast<const PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context); |
90 | 0 | pcre2_compile_context_free_8(context); |
91 | |
|
92 | 0 | if (!_pcre) |
93 | 0 | { |
94 | 0 | PCRE2_UCHAR buffer[256]; |
95 | 0 | pcre2_get_error_message_8(errorCode, buffer, sizeof(buffer)); |
96 | 0 | std::ostringstream msg; |
97 | 0 | msg << reinterpret_cast<char*>(buffer) << " (at offset " << errorOffset << ")"; |
98 | 0 | throw RegularExpressionException(msg.str()); |
99 | 0 | } |
100 | | |
101 | 0 | pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMECOUNT, &nameCount); |
102 | 0 | pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize); |
103 | 0 | pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMETABLE, &nameTable); |
104 | |
|
105 | 0 | for (int i = 0; i < nameCount; i++) |
106 | 0 | { |
107 | 0 | unsigned char* group = nameTable + 2 + (nameEntrySize * i); |
108 | 0 | int n = pcre2_substring_number_from_name_8(reinterpret_cast<pcre2_code_8*>(_pcre), group); |
109 | 0 | _groups[n] = std::string(reinterpret_cast<char*>(group)); |
110 | 0 | } |
111 | 0 | } |
112 | | |
113 | | |
114 | | RegularExpression::~RegularExpression() |
115 | 0 | { |
116 | 0 | if (_pcre) pcre2_code_free_8(reinterpret_cast<pcre2_code_8*>(_pcre)); |
117 | 0 | } |
118 | | |
119 | | |
120 | | int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const |
121 | 0 | { |
122 | 0 | poco_assert (offset <= subject.length()); |
123 | |
|
124 | 0 | MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre)); |
125 | 0 | int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr); |
126 | 0 | if (rc == PCRE2_ERROR_NOMATCH) |
127 | 0 | { |
128 | 0 | mtch.offset = std::string::npos; |
129 | 0 | mtch.length = 0; |
130 | 0 | return 0; |
131 | 0 | } |
132 | 0 | else if (rc == PCRE2_ERROR_BADOPTION) |
133 | 0 | { |
134 | 0 | throw RegularExpressionException("bad option"); |
135 | 0 | } |
136 | 0 | else if (rc == 0) |
137 | 0 | { |
138 | 0 | throw RegularExpressionException("too many captured substrings"); |
139 | 0 | } |
140 | 0 | else if (rc < 0) |
141 | 0 | { |
142 | 0 | PCRE2_UCHAR buffer[256]; |
143 | 0 | pcre2_get_error_message_8(rc, buffer, sizeof(buffer)); |
144 | 0 | throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer))); |
145 | 0 | } |
146 | 0 | const PCRE2_SIZE* ovec = matchData.data(); |
147 | 0 | mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0]; |
148 | 0 | mtch.length = ovec[1] - mtch.offset; |
149 | 0 | return rc; |
150 | 0 | } |
151 | | |
152 | | |
153 | | int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const |
154 | 0 | { |
155 | 0 | poco_assert (offset <= subject.length()); |
156 | |
|
157 | 0 | matches.clear(); |
158 | |
|
159 | 0 | MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre)); |
160 | 0 | int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr); |
161 | 0 | if (rc == PCRE2_ERROR_NOMATCH) |
162 | 0 | { |
163 | 0 | return 0; |
164 | 0 | } |
165 | 0 | else if (rc == PCRE2_ERROR_BADOPTION) |
166 | 0 | { |
167 | 0 | throw RegularExpressionException("bad option"); |
168 | 0 | } |
169 | 0 | else if (rc == 0) |
170 | 0 | { |
171 | 0 | throw RegularExpressionException("too many captured substrings"); |
172 | 0 | } |
173 | 0 | else if (rc < 0) |
174 | 0 | { |
175 | 0 | PCRE2_UCHAR buffer[256]; |
176 | 0 | pcre2_get_error_message_8(rc, buffer, sizeof(buffer)); |
177 | 0 | throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer))); |
178 | 0 | } |
179 | 0 | matches.reserve(rc); |
180 | 0 | const PCRE2_SIZE* ovec = matchData.data(); |
181 | 0 | for (int i = 0; i < rc; ++i) |
182 | 0 | { |
183 | 0 | Match m; |
184 | 0 | GroupMap::const_iterator it; |
185 | |
|
186 | 0 | m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ; |
187 | 0 | m.length = ovec[i*2 + 1] - m.offset; |
188 | |
|
189 | 0 | it = _groups.find(i); |
190 | 0 | if (it != _groups.end()) |
191 | 0 | { |
192 | 0 | m.name = (*it).second; |
193 | 0 | } |
194 | |
|
195 | 0 | matches.push_back(m); |
196 | 0 | } |
197 | 0 | return rc; |
198 | 0 | } |
199 | | |
200 | | |
201 | | bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const |
202 | 0 | { |
203 | 0 | Match mtch; |
204 | 0 | match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY); |
205 | 0 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
206 | 0 | } |
207 | | |
208 | | |
209 | | bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const |
210 | 0 | { |
211 | 0 | Match mtch; |
212 | 0 | match(subject, offset, mtch, options); |
213 | 0 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
214 | 0 | } |
215 | | |
216 | | |
217 | | int RegularExpression::extract(const std::string& subject, std::string& str, int options) const |
218 | 0 | { |
219 | 0 | Match mtch; |
220 | 0 | int rc = match(subject, 0, mtch, options); |
221 | 0 | if (mtch.offset != std::string::npos) |
222 | 0 | str.assign(subject, mtch.offset, mtch.length); |
223 | 0 | else |
224 | 0 | str.clear(); |
225 | 0 | return rc; |
226 | 0 | } |
227 | | |
228 | | |
229 | | int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const |
230 | 0 | { |
231 | 0 | Match mtch; |
232 | 0 | int rc = match(subject, offset, mtch, options); |
233 | 0 | if (mtch.offset != std::string::npos) |
234 | 0 | str.assign(subject, mtch.offset, mtch.length); |
235 | 0 | else |
236 | 0 | str.clear(); |
237 | 0 | return rc; |
238 | 0 | } |
239 | | |
240 | | |
241 | | int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const |
242 | 0 | { |
243 | 0 | MatchVec matches; |
244 | 0 | strings.clear(); |
245 | 0 | int rc = match(subject, offset, matches, options); |
246 | 0 | strings.reserve(matches.size()); |
247 | 0 | for (const auto& m: matches) |
248 | 0 | { |
249 | 0 | if (m.offset != std::string::npos) |
250 | 0 | strings.push_back(subject.substr(m.offset, m.length)); |
251 | 0 | else |
252 | 0 | strings.push_back(std::string()); |
253 | 0 | } |
254 | 0 | return rc; |
255 | 0 | } |
256 | | |
257 | | |
258 | | int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
259 | 0 | { |
260 | 0 | if (options & RE_GLOBAL) |
261 | 0 | { |
262 | 0 | int rc = 0; |
263 | 0 | std::string::size_type pos = substOne(subject, offset, replacement, options); |
264 | 0 | while (pos != std::string::npos) |
265 | 0 | { |
266 | 0 | ++rc; |
267 | 0 | pos = substOne(subject, pos, replacement, options); |
268 | 0 | } |
269 | 0 | return rc; |
270 | 0 | } |
271 | 0 | else |
272 | 0 | { |
273 | 0 | return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0; |
274 | 0 | } |
275 | 0 | } |
276 | | |
277 | | |
278 | | std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
279 | 0 | { |
280 | 0 | if (offset >= subject.length()) return std::string::npos; |
281 | | |
282 | 0 | MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre)); |
283 | 0 | int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr); |
284 | 0 | if (rc == PCRE2_ERROR_NOMATCH) |
285 | 0 | { |
286 | 0 | return std::string::npos; |
287 | 0 | } |
288 | 0 | else if (rc == PCRE2_ERROR_BADOPTION) |
289 | 0 | { |
290 | 0 | throw RegularExpressionException("bad option"); |
291 | 0 | } |
292 | 0 | else if (rc == 0) |
293 | 0 | { |
294 | 0 | throw RegularExpressionException("too many captured substrings"); |
295 | 0 | } |
296 | 0 | else if (rc < 0) |
297 | 0 | { |
298 | 0 | PCRE2_UCHAR buffer[256]; |
299 | 0 | pcre2_get_error_message_8(rc, buffer, sizeof(buffer)); |
300 | 0 | throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer))); |
301 | 0 | } |
302 | 0 | const PCRE2_SIZE* ovec = matchData.data(); |
303 | 0 | std::string result; |
304 | 0 | std::string::size_type len = subject.length(); |
305 | 0 | std::string::size_type pos = 0; |
306 | 0 | std::string::size_type rp = std::string::npos; |
307 | 0 | while (pos < len) |
308 | 0 | { |
309 | 0 | if (ovec[0] == pos) |
310 | 0 | { |
311 | 0 | std::string::const_iterator it = replacement.begin(); |
312 | 0 | std::string::const_iterator end = replacement.end(); |
313 | 0 | while (it != end) |
314 | 0 | { |
315 | 0 | if (*it == '$' && !(options & RE_NO_VARS)) |
316 | 0 | { |
317 | 0 | ++it; |
318 | 0 | if (it != end) |
319 | 0 | { |
320 | 0 | char d = *it; |
321 | 0 | if (d >= '0' && d <= '9') |
322 | 0 | { |
323 | 0 | int c = d - '0'; |
324 | 0 | if (c < rc) |
325 | 0 | { |
326 | 0 | std::size_t o = ovec[c*2]; |
327 | 0 | std::size_t l = ovec[c*2 + 1] - o; |
328 | 0 | result.append(subject, o, l); |
329 | 0 | } |
330 | 0 | } |
331 | 0 | else |
332 | 0 | { |
333 | 0 | result += '$'; |
334 | 0 | result += d; |
335 | 0 | } |
336 | 0 | ++it; |
337 | 0 | } |
338 | 0 | else result += '$'; |
339 | 0 | } |
340 | 0 | else result += *it++; |
341 | 0 | } |
342 | 0 | pos = ovec[1]; |
343 | 0 | rp = result.length(); |
344 | 0 | } |
345 | 0 | else result += subject[pos++]; |
346 | 0 | } |
347 | 0 | subject = result; |
348 | 0 | return rp; |
349 | 0 | } |
350 | | |
351 | | |
352 | | bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options) |
353 | 0 | { |
354 | 0 | int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE); |
355 | 0 | int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK); |
356 | 0 | RegularExpression re(pattern, ctorOptions, false); |
357 | 0 | return re.match(subject, 0, mtchOptions); |
358 | 0 | } |
359 | | |
360 | | |
361 | | int RegularExpression::compileOptions(int options) |
362 | 0 | { |
363 | 0 | int pcreOptions = 0; |
364 | |
|
365 | 0 | if (options & RE_CASELESS) |
366 | 0 | pcreOptions |= PCRE2_CASELESS; |
367 | 0 | if (options & RE_MULTILINE) |
368 | 0 | pcreOptions |= PCRE2_MULTILINE; |
369 | 0 | if (options & RE_DOTALL) |
370 | 0 | pcreOptions |= PCRE2_DOTALL; |
371 | 0 | if (options & RE_EXTENDED) |
372 | 0 | pcreOptions |= PCRE2_EXTENDED; |
373 | 0 | if (options & RE_ANCHORED) |
374 | 0 | pcreOptions |= PCRE2_ANCHORED; |
375 | 0 | if (options & RE_DOLLAR_ENDONLY) |
376 | 0 | pcreOptions |= PCRE2_DOLLAR_ENDONLY; |
377 | 0 | if (options & RE_UNGREEDY) |
378 | 0 | pcreOptions |= PCRE2_UNGREEDY; |
379 | 0 | if (options & RE_UTF8) |
380 | 0 | pcreOptions |= PCRE2_UTF | PCRE2_UCP; |
381 | 0 | if (options & RE_NO_AUTO_CAPTURE) |
382 | 0 | pcreOptions |= PCRE2_NO_AUTO_CAPTURE; |
383 | 0 | if (options & RE_FIRSTLINE) |
384 | 0 | pcreOptions |= PCRE2_FIRSTLINE; |
385 | 0 | if (options & RE_DUPNAMES) |
386 | 0 | pcreOptions |= PCRE2_DUPNAMES; |
387 | |
|
388 | 0 | return pcreOptions; |
389 | 0 | } |
390 | | |
391 | | |
392 | | int RegularExpression::matchOptions(int options) |
393 | 0 | { |
394 | 0 | int pcreOptions = 0; |
395 | |
|
396 | 0 | if (options & RE_ANCHORED) |
397 | 0 | pcreOptions |= PCRE2_ANCHORED; |
398 | 0 | if (options & RE_NOTBOL) |
399 | 0 | pcreOptions |= PCRE2_NOTBOL; |
400 | 0 | if (options & RE_NOTEOL) |
401 | 0 | pcreOptions |= PCRE2_NOTEOL; |
402 | 0 | if (options & RE_NOTEMPTY) |
403 | 0 | pcreOptions |= PCRE2_NOTEMPTY; |
404 | 0 | if (options & RE_NO_AUTO_CAPTURE) |
405 | 0 | pcreOptions |= PCRE2_NO_AUTO_CAPTURE; |
406 | 0 | if (options & RE_NO_UTF8_CHECK) |
407 | 0 | pcreOptions |= PCRE2_NO_UTF_CHECK; |
408 | |
|
409 | 0 | return pcreOptions; |
410 | 0 | } |
411 | | |
412 | | |
413 | | } // namespace Poco |