/src/trafficserver/include/tsutil/Regex.h
Line | Count | Source |
1 | | /** @file |
2 | | |
3 | | C++ wrappers around PCRE2 regular expressions (Regex, RegexMatches, RegexMatchContext) and a multi-pattern matcher (DFA). |
4 | | |
5 | | @section license License |
6 | | |
7 | | Licensed to the Apache Software Foundation (ASF) under one |
8 | | or more contributor license agreements. See the NOTICE file |
9 | | distributed with this work for additional information |
10 | | regarding copyright ownership. The ASF licenses this file |
11 | | to you under the Apache License, Version 2.0 (the |
12 | | "License"); you may not use this file except in compliance |
13 | | with the License. You may obtain a copy of the License at |
14 | | |
15 | | http://www.apache.org/licenses/LICENSE-2.0 |
16 | | |
17 | | Unless required by applicable law or agreed to in writing, software |
18 | | distributed under the License is distributed on an "AS IS" BASIS, |
19 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
20 | | See the License for the specific language governing permissions and |
21 | | limitations under the License. |
22 | | */ |
23 | | |
24 | | #pragma once |
25 | | |
26 | | #include <string_view> |
27 | | #include <string> |
28 | | #include <vector> |
29 | | #include <memory> |
30 | | |
31 | | /// @brief Flags for regular expression compilation and matching; combine values with bitwise or. |
32 | | /// |
33 | | /// @internal These values are copied from pcre2.h, to avoid having to include it. The values are checked (with |
34 | | /// static_assert) in Regex.cc against PCRE2 named constants, in case they change in future PCRE2 releases. |
35 | | enum REFlags { |
36 | | RE_CASE_INSENSITIVE = 0x00000008u, ///< Ignore case (by default, matches are case sensitive). |
37 | | RE_UNANCHORED = 0x00000400u, ///< Unanchored (@a DFA defaults to anchored). |
38 | | RE_ANCHORED = 0x80000000u, ///< Anchored (@a Regex defaults to unanchored). |
39 | | RE_NOTEMPTY = 0x00000004u ///< Reject empty matches (by default, a pattern may match the empty string). |
40 | | }; |
41 | | |
42 | | /// @brief Error codes returned by regular expression operations (see Regex::exec and Regex::get_error_string). |
43 | | /// |
44 | | /// @internal As with REFlags, these values are copied from pcre2.h, to avoid having to include it. |
45 | | enum REErrors { |
46 | | RE_ERROR_NOMATCH = -1, ///< No match found. |
47 | | RE_ERROR_NULL = -51 ///< NULL code or subject was passed. |
48 | | }; |
49 | | |
50 | | /// @brief Wrapper for PCRE2 match data. |
51 | | class RegexMatches |
52 | | { |
53 | | friend class Regex; |
54 | | |
55 | | public: |
56 | | /** Construct a new RegexMatches object. |
57 | | * |
58 | | * @param size The number of matches to allocate space for (defaults to DEFAULT_MATCHES). |
59 | | */ |
60 | | RegexMatches(uint32_t size = DEFAULT_MATCHES); |
61 | | ~RegexMatches(); |
62 | | |
63 | | /** Get the match at the given index. |
64 | | * @param index Index of the match to retrieve. |
65 | | * @return The match at the given index. |
66 | | */ |
67 | | std::string_view operator[](size_t index) const; |
68 | | /** Get the ovector pointer for the capture groups. Don't use this unless you know what you are doing. |
69 | | * |
70 | | * @return ovector pointer. |
71 | | */ |
72 | | size_t *get_ovector_pointer(); |
73 | | int32_t size() const; |
74 | | |
75 | | private: |
76 | | constexpr static uint32_t DEFAULT_MATCHES = 10; |
77 | | static void *malloc(size_t size, void *caller); |
78 | | static void free(void *p, void *caller); |
79 | | std::string_view _subject; |
80 | | char _buffer[24 + 96 + 28 * DEFAULT_MATCHES]; // 24 bytes for the general context, 96 bytes overhead, 28 bytes per match. |
81 | | size_t _buffer_bytes_used = 0; |
82 | | int32_t _size = 0; |
83 | | |
84 | | /// @internal This effectively wraps a void* so that we can avoid requiring the pcre2.h include for the user of the Regex |
85 | | /// API (see Regex.cc). |
86 | | struct _MatchData; |
87 | | class _MatchDataPtr |
88 | | { |
89 | | friend struct _MatchData; |
90 | | |
91 | | private: |
92 | | void *_ptr = nullptr; |
93 | | }; |
94 | | _MatchDataPtr _match_data; |
95 | | }; |
96 | | |
97 | | /// @brief Wrapper for PCRE2 match context. |
98 | | /// |
99 | | /// @internal This instance is not tied to any Regex and can be used with one of the Regex::exec overloads. |
100 | | class RegexMatchContext |
101 | | { |
102 | | friend class Regex; |
103 | | |
104 | | public: |
105 | | /** Construct a new RegexMatchContext object. |
106 | | */ |
107 | | RegexMatchContext(); |
108 | | ~RegexMatchContext(); |
109 | | |
110 | | /// Copying uses pcre2_match_context_copy for a deep copy. NOTE(review): the defaulted moves below copy the raw pointer without clearing the source - confirm the destructor cannot double free. |
111 | | RegexMatchContext(RegexMatchContext const &orig); |
112 | | RegexMatchContext &operator=(RegexMatchContext const &orig); |
113 | | |
114 | | RegexMatchContext(RegexMatchContext &&) = default; |
115 | | RegexMatchContext &operator=(RegexMatchContext &&) = default; |
116 | | |
117 | | /** Limits the amount of backtracking that can take place. |
118 | | * A regex exec call that exceeds this limit fails with PCRE2_ERROR_MATCHLIMIT (-47). |
119 | | */ |
120 | | void set_match_limit(uint32_t limit); |
121 | | |
122 | | private: |
123 | | /// @internal This wraps a void* so as to avoid requiring a pcre2 include. |
124 | | struct _MatchContext; |
125 | | struct _MatchContextPtr { |
126 | | void *_ptr = nullptr; |
127 | | }; |
128 | | |
129 | | _MatchContextPtr _match_context; |
130 | | }; |
131 | | |
132 | | /// @brief Wrapper for PCRE2 regular expression. |
133 | | class Regex |
134 | | { |
135 | | public: |
136 | 0 | Regex() = default; |
137 | | /** Deep copy constructor. |
138 | | * |
139 | | * Creates a new Regex object with a deep copy of the compiled pattern. |
140 | | * Uses pcre2_code_copy() to duplicate the compiled pattern without |
141 | | * requiring the original pattern string. |
142 | | * |
143 | | * @param other The Regex object to copy from. |
144 | | */ |
145 | | Regex(Regex const &other); |
146 | | /** Deep copy assignment operator. |
147 | | * |
148 | | * Replaces the current compiled pattern with a deep copy of the other's pattern. |
149 | | * |
150 | | * @param other The Regex object to copy from. |
151 | | * @return Reference to this object. |
152 | | */ |
153 | | Regex &operator=(Regex const &other); |
154 | | Regex(Regex &&that) noexcept; |
155 | | Regex &operator=(Regex &&other); |
156 | | ~Regex(); |
157 | | |
158 | | /** Compile the @a pattern into a regular expression. |
159 | | * |
160 | | * @param pattern Source pattern for regular expression (null terminated). |
161 | | * @param flags Compilation flags. |
162 | | * @return @c true if compiled successfully, @c false otherwise. |
163 | | * |
164 | | * @a flags should be the bitwise @c or of @c REFlags values. |
165 | | */ |
166 | | bool compile(std::string_view pattern, uint32_t flags = 0); |
167 | | |
168 | | /** Compile the @a pattern into a regular expression. |
169 | | * |
170 | | * @param pattern Source pattern for regular expression (null terminated). |
171 | | * @param error String to receive error message. |
172 | | * @param erroffset Reference to integer to receive error offset. |
173 | | * @param flags Compilation flags. |
174 | | * @return @c true if compiled successfully, @c false otherwise. |
175 | | * |
176 | | * @a flags should be the bitwise @c or of @c REFlags values. |
177 | | */ |
178 | | bool compile(std::string_view pattern, std::string &error, int &erroffset, unsigned flags = 0); |
179 | | |
180 | | /** Execute the regular expression. |
181 | | * |
182 | | * @param subject String to match against. |
183 | | * @return @c true if the pattern matched, @c false if not. |
184 | | * |
185 | | * It is safe to call this method concurrently on the same instance of @a this. |
186 | | */ |
187 | | bool exec(std::string_view subject) const; |
188 | | |
189 | | /** Execute the regular expression. |
190 | | * |
191 | | * @param subject String to match against. |
192 | | * @param flags Match flags (e.g., RE_NOTEMPTY). |
193 | | * @return @c true if the pattern matched, @c false if not. |
194 | | * |
195 | | * It is safe to call this method concurrently on the same instance of @a this. |
196 | | */ |
197 | | bool exec(std::string_view subject, uint32_t flags) const; |
198 | | |
199 | | /** Execute the regular expression. |
200 | | * |
201 | | * @param subject String to match against. |
202 | | * @param matches Place to store the capture groups. |
203 | | * @return The number of capture groups. < 0 if an error occurred. 0 if the number of Matches is too small. |
204 | | * |
205 | | * It is safe to call this method concurrently on the same instance of @a this. |
206 | | * |
207 | | * NOTE(review): an earlier note here described @a ovector / @a ovecsize sizing, but neither is a parameter |
208 | | * of this overload; capture storage is supplied by @a matches (see RegexMatches). |
209 | | */ |
210 | | int exec(std::string_view subject, RegexMatches &matches) const; |
211 | | |
212 | | /** Execute the regular expression. |
213 | | * |
214 | | * @param subject String to match against. |
215 | | * @param matches Place to store the capture groups. |
216 | | * @param flags Match flags (e.g., RE_NOTEMPTY). |
217 | | * @param matchContext Optional match context (set matching limits); may be nullptr. |
218 | | * @return The number of capture groups. < 0 if an error occurred. 0 if the number of Matches is too small. |
219 | | * |
220 | | * It is safe to call this method concurrently on the same instance of @a this. |
221 | | * |
222 | | * NOTE(review): an earlier note here described @a ovector / @a ovecsize sizing, but neither is a parameter |
223 | | * of this overload; capture storage is supplied by @a matches (see RegexMatches). |
224 | | */ |
225 | | int exec(std::string_view subject, RegexMatches &matches, uint32_t flags, |
226 | | RegexMatchContext const *const matchContext = nullptr) const; |
227 | | |
228 | | /** Error string for exec failure. |
229 | | * |
230 | | * @param rc Return code from an exec call. |
231 | | */ |
232 | | static std::string get_error_string(int rc); |
233 | | |
234 | | /// @return The number of capture groups in the compiled pattern, -1 for fail. |
235 | | int32_t get_capture_count() const; |
236 | | |
237 | | /// @return The highest back reference number in the compiled pattern, -1 for fail. |
238 | | int32_t get_backref_max() const; |
239 | | |
240 | | /// @return Is the compiled pattern empty? |
241 | | bool empty() const; |
242 | | |
243 | | private: |
244 | | /// @internal This effectively wraps a void* so that we can avoid requiring the pcre2.h include for the user of the Regex |
245 | | /// API (see Regex.cc). |
246 | | struct _Code; |
247 | | class _CodePtr |
248 | | { |
249 | | friend struct _Code; |
250 | | |
251 | | private: |
252 | | void *_ptr = nullptr; |
253 | | }; |
254 | | _CodePtr _code; |
255 | | }; |
256 | | |
257 | | /** Deterministic Finite state Automata container. |
258 | | * |
259 | | * This contains a set of patterns (which may be of size 1) and matches if any of the patterns |
260 | | * matches. |
261 | | */ |
262 | | class DFA |
263 | | { |
264 | | public: |
265 | 0 | DFA() = default; |
266 | | ~DFA(); |
267 | | |
268 | | /// @return The number of patterns successfully compiled. |
269 | | int32_t compile(const std::string_view pattern, unsigned flags = 0); |
270 | | /// @return The number of patterns successfully compiled. |
271 | | int32_t compile(const std::string_view *const patterns, int npatterns, unsigned flags = 0); |
272 | | /// @return The number of patterns successfully compiled. |
273 | | int32_t compile(const char *const *patterns, int npatterns, unsigned flags = 0); |
274 | | |
275 | | /** Match @a str against the internal patterns. |
276 | | * |
277 | | * @param str String to match. |
278 | | * @return Index of the matched pattern, -1 if no match. |
279 | | */ |
280 | | int32_t match(std::string_view str) const; |
281 | | |
282 | | private: |
283 | | struct Pattern { |
284 | 0 | Pattern(Regex &&rxp, std::string &&s) : _re(std::move(rxp)), _p(std::move(s)) {} |
285 | | Regex _re; ///< The compiled pattern. |
286 | | std::string _p; ///< The original pattern. |
287 | | }; |
288 | | |
289 | | /** Compile @a pattern and add it to the pattern set. |
290 | | * |
291 | | * @param pattern Regular expression to compile. |
292 | | * @param flags Regular expression compilation flags. |
293 | | * @return @c true if @a pattern was successfully compiled, @c false if not. |
294 | | */ |
295 | | bool build(std::string_view pattern, unsigned flags = 0); |
296 | | |
297 | | std::vector<Pattern> _patterns; |
298 | | }; |