/src/poco/Foundation/include/Poco/RegularExpression.h
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // RegularExpression.h |
3 | | // |
4 | | // Library: Foundation |
5 | | // Package: RegExp |
6 | | // Module: RegularExpression |
7 | | // |
8 | | // Definitions of class RegularExpression. |
9 | | // |
10 | | // A wrapper class for Philip Hazel's PCRE - Perl Compatible Regular Expressions |
11 | | // library (http://www.pcre.org). |
12 | | // |
13 | | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
14 | | // and Contributors. |
15 | | // |
16 | | // SPDX-License-Identifier: BSL-1.0 |
17 | | // |
18 | | |
19 | | |
20 | | #ifndef Foundation_RegularExpression_INCLUDED |
21 | | #define Foundation_RegularExpression_INCLUDED |
22 | | |
23 | | |
24 | | #include "Poco/Foundation.h" |
25 | | #include <vector> |
26 | | #include <map> |
27 | | |
28 | | |
29 | | namespace Poco { |
30 | | |
31 | | |
32 | | class Foundation_API RegularExpression |
33 | | /// A class for working with regular expressions. |
34 | | /// Implemented using PCRE2, the Perl Compatible |
35 | | /// Regular Expressions library by Philip Hazel |
36 | | /// (see http://www.pcre.org). |
37 | | { |
38 | | public: |
39 | | enum Options |
40 | | /// Some of the following options can only be passed to the constructor; |
41 | | /// some can be passed only to matching functions, and some can be used |
42 | | /// everywhere. |
43 | | /// |
44 | | /// * Options marked [ctor] can be passed to the constructor. |
45 | | /// * Options marked [match] can be passed to match, extract, split and subst. |
46 | | /// * Options marked [subst] can be passed to subst. |
47 | | /// |
48 | | /// See the PCRE documentation for more information. |
49 | | { |
50 | | RE_CASELESS = 0x00000001, /// case insensitive matching (/i) [ctor] |
51 | | RE_MULTILINE = 0x00000002, /// enable multi-line mode; affects ^ and $ (/m) [ctor] |
52 | | RE_DOTALL = 0x00000004, /// dot matches all characters, including newline (/s) [ctor] |
53 | | RE_EXTENDED = 0x00000008, /// totally ignore whitespace (/x) [ctor] |
54 | | RE_ANCHORED = 0x00000010, /// treat pattern as if it starts with a ^ [ctor, match] |
55 | | RE_DOLLAR_ENDONLY = 0x00000020, /// dollar matches end-of-string only, not last newline in string [ctor] |
56 | | RE_EXTRA = 0x00000040, /// enable optional PCRE functionality [ctor] |
57 | | RE_NOTBOL = 0x00000080, /// circumflex does not match beginning of string [match] |
58 | | RE_NOTEOL = 0x00000100, /// $ does not match end of string [match] |
59 | | RE_UNGREEDY = 0x00000200, /// make quantifiers ungreedy [ctor] |
60 | | RE_NOTEMPTY = 0x00000400, /// empty string never matches [match] |
61 | | RE_UTF8 = 0x00000800, /// assume pattern and subject is UTF-8 encoded [ctor] |
62 | | RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match] |
63 | | RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match] |
64 | | RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match |
65 | | /// before or at the first newline in the subject string, |
66 | | /// though the matched text may continue over the newline [ctor] |
67 | | RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor] |
68 | | RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor] |
69 | | RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor] |
70 | | RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor] |
71 | | RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor] |
72 | | RE_NEWLINE_ANYCRLF = 0x00500000, /// assume newline is any of CR, LF, CRLF [ctor] |
73 | | RE_GLOBAL = 0x10000000, /// replace all occurrences (/g) [subst] |
74 | | RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst] |
75 | | }; |
76 | | |
77 | | struct Match |
78 | | { |
79 | | std::string::size_type offset; /// zero-based offset (std::string::npos if subexpr does not match) |
80 | | std::string::size_type length; /// length of substring |
81 | | std::string name; /// name of group |
82 | | }; |
83 | | using MatchVec = std::vector<Match>; /* sequence of Match entries, as filled in by the MatchVec overload of match() */ |
84 | | using GroupMap = std::map<int, std::string>; /* NOTE(review): presumably maps a capture group number to its name; confirm in the implementation file */ |
85 | | |
86 | | RegularExpression(const std::string& pattern, int options = 0, bool study = true); |
87 | | /// Creates a regular expression and parses the given pattern. |
88 | | /// Note: the study argument is only provided for backwards compatibility |
89 | | /// and is ignored since POCO release 1.12, which uses PCRE2. |
90 | | /// For a description of the options, please see the PCRE documentation. |
91 | | /// Throws a RegularExpressionException if the pattern cannot be compiled. |
92 | | |
93 | | ~RegularExpression(); |
94 | | /// Destroys the regular expression. |
95 | | |
96 | | int match(const std::string& subject, Match& mtch, int options = 0) const; |
97 | | /// Matches the given subject string against the pattern. Returns the position |
98 | | /// of the first captured substring in mtch. |
99 | | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and |
100 | | /// mtch.length is 0. |
101 | | /// Throws a RegularExpressionException in case of an error. |
102 | | /// Returns the number of matches. |
103 | | |
104 | | int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const; |
105 | | /// Matches the given subject string, starting at offset, against the pattern. |
106 | | /// Returns the position of the captured substring in mtch. |
107 | | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and |
108 | | /// mtch.length is 0. |
109 | | /// Throws a RegularExpressionException in case of an error. |
110 | | /// Returns the number of matches. |
111 | | |
112 | | int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const; |
113 | | /// Matches the given subject string, starting at offset, against the pattern. |
114 | | /// The first entry in matches contains the position of the captured substring. |
115 | | /// The following entries identify matching subpatterns. See the PCRE documentation |
116 | | /// for a more detailed explanation. |
117 | | /// If no part of the subject matches the pattern, matches is empty. |
118 | | /// Throws a RegularExpressionException in case of an error. |
119 | | /// Returns the number of matches. |
120 | | |
121 | | bool match(const std::string& subject, std::string::size_type offset = 0) const; |
122 | | /// Returns true if and only if the subject matches the regular expression. |
123 | | /// |
124 | | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
125 | | /// matching, which means that the empty string will never match and |
126 | | /// the pattern is treated as if it starts with a ^. |
127 | | |
128 | | bool match(const std::string& subject, std::string::size_type offset, int options) const; |
129 | | /// Returns true if and only if the subject matches the regular expression. |
130 | | |
131 | | bool operator == (const std::string& subject) const; |
132 | | /// Returns true if and only if the subject matches the regular expression. |
133 | | /// |
134 | | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
135 | | /// matching, which means that the empty string will never match and |
136 | | /// the pattern is treated as if it starts with a ^. |
137 | | |
138 | | bool operator != (const std::string& subject) const; |
139 | | /// Returns true if and only if the subject does not match the regular expression. |
140 | | /// |
141 | | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
142 | | /// matching, which means that the empty string will never match and |
143 | | /// the pattern is treated as if it starts with a ^. |
144 | | |
145 | | int extract(const std::string& subject, std::string& str, int options = 0) const; |
146 | | /// Matches the given subject string against the pattern. |
147 | | /// Returns the captured string in str. |
148 | | /// Throws a RegularExpressionException in case of an error. |
149 | | /// Returns the number of matches. |
150 | | |
151 | | int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const; |
152 | | /// Matches the given subject string, starting at offset, against the pattern. |
153 | | /// Returns the captured string in str. |
154 | | /// Throws a RegularExpressionException in case of an error. |
155 | | /// Returns the number of matches. |
156 | | |
157 | | int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const; |
158 | | /// Matches the given subject string against the pattern. |
159 | | /// The first entry in strings is the captured substring. |
160 | | /// The following entries contain substrings matching subpatterns. See the PCRE documentation |
161 | | /// for a more detailed explanation. |
162 | | /// If no part of the subject matches the pattern, strings is empty. |
163 | | /// Throws a RegularExpressionException in case of an error. |
164 | | /// Returns the number of matches. |
165 | | |
166 | | int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const; |
167 | | /// Matches the given subject string, starting at offset, against the pattern. |
168 | | /// The first entry in strings is the captured substring. |
169 | | /// The following entries contain substrings matching subpatterns. See the PCRE documentation |
170 | | /// for a more detailed explanation. |
171 | | /// If no part of the subject matches the pattern, strings is empty. |
172 | | /// Throws a RegularExpressionException in case of an error. |
173 | | /// Returns the number of matches. |
174 | | |
175 | | int subst(std::string& subject, const std::string& replacement, int options = 0) const; |
176 | | /// Substitutes matches of the pattern in subject with replacement. |
177 | | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, |
178 | | /// only the first match is replaced. |
179 | | /// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $1, $2, ...) in replacement |
180 | | /// are replaced with the corresponding captured string. $0 is the captured substring. |
181 | | /// Returns the number of replaced occurrences. |
182 | | |
183 | | int subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options = 0) const; |
184 | | /// Substitutes matches of the pattern in subject with replacement, |
185 | | /// starting at offset. |
186 | | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, |
187 | | /// only the first match is replaced. |
188 | | /// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $0, $1, $2, ... $9) |
189 | | /// in replacement are replaced with the corresponding captured string. |
190 | | /// $0 is the captured substring. $1 ... $n are the substrings matching the subpatterns. |
191 | | /// Returns the number of replaced occurrences. |
192 | | |
193 | | static bool match(const std::string& subject, const std::string& pattern, int options = 0); |
194 | | /// Matches the given subject string against the regular expression given in pattern, |
195 | | /// using the given options. |
196 | | |
197 | | protected: |
198 | | std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const; /* NOTE(review): presumably performs a single replacement for subst() and returns the offset to continue from; confirm in the implementation file */ |
199 | | static int compileOptions(int options); /* NOTE(review): presumably translates RE_* [ctor] flags into PCRE2 compile options; confirm in the implementation file */ |
200 | | static int matchOptions(int options); /* NOTE(review): presumably translates RE_* [match] flags into PCRE2 match options; confirm in the implementation file */ |
201 | | |
202 | | private: |
203 | | // Note: to avoid a dependency on the pcre2.h header the following are |
204 | | // declared as void* and cast to the correct type in the implementation file. |
205 | | void* _pcre; // Actual type is pcre2_code_8* |
206 | | |
207 | | GroupMap _groups; |
208 | | |
209 | | RegularExpression(); /* private: default construction is not available to client code */ |
210 | | RegularExpression(const RegularExpression&); /* private: copy construction is not available to client code */ |
211 | | RegularExpression& operator = (const RegularExpression&); /* private: copy assignment is not available to client code */ |
212 | | }; |
213 | | |
214 | | |
215 | | // |
216 | | // inlines |
217 | | // |
218 | | inline int RegularExpression::match(const std::string& subject, Match& mtch, int options) const // convenience overload: search from the beginning of subject
219 | 0 | {
220 | 0 | return this->match(subject, std::string::size_type(0), mtch, options); // delegate to the offset-taking overload with offset 0
221 | 0 | }
222 | | |
223 | | |
224 | | inline int RegularExpression::split(const std::string& subject, std::vector<std::string>& strings, int options) const // convenience overload: split from the beginning of subject
225 | 0 | {
226 | 0 | return this->split(subject, std::string::size_type(0), strings, options); // delegate to the offset-taking overload with offset 0
227 | 0 | }
228 | | |
229 | | |
230 | | inline int RegularExpression::subst(std::string& subject, const std::string& replacement, int options) const // convenience overload: substitute from the beginning of subject
231 | 0 | {
232 | 0 | return this->subst(subject, std::string::size_type(0), replacement, options); // delegate to the offset-taking overload with offset 0
233 | 0 | }
234 | | |
235 | | |
236 | | inline bool RegularExpression::operator == (const std::string& subject) const // true when subject matches this expression
237 | 0 | {
238 | 0 | return this->match(subject); // the single-argument call selects the bool match(subject, offset) overload with its default offset
239 | 0 | }
240 | | |
241 | | |
242 | | inline bool RegularExpression::operator != (const std::string& subject) const // logical negation of operator ==
243 | 0 | {
244 | 0 | return !(*this == subject); // operator == forwards to match(subject), so this equals !match(subject)
245 | 0 | }
246 | | |
247 | | |
248 | | } // namespace Poco |
249 | | |
250 | | |
251 | | #endif // Foundation_RegularExpression_INCLUDED |