/src/trafficserver/include/tscore/SimpleTokenizer.h
Line | Count | Source |
1 | | /** @file |
2 | | |
3 | | SimpleTokenizer: splits a C string into tokens separated by a single delimiter character |
4 | | |
5 | | @section license License |
6 | | |
7 | | Licensed to the Apache Software Foundation (ASF) under one |
8 | | or more contributor license agreements. See the NOTICE file |
9 | | distributed with this work for additional information |
10 | | regarding copyright ownership. The ASF licenses this file |
11 | | to you under the Apache License, Version 2.0 (the |
12 | | "License"); you may not use this file except in compliance |
13 | | with the License. You may obtain a copy of the License at |
14 | | |
15 | | http://www.apache.org/licenses/LICENSE-2.0 |
16 | | |
17 | | Unless required by applicable law or agreed to in writing, software |
18 | | distributed under the License is distributed on an "AS IS" BASIS, |
19 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
20 | | See the License for the specific language governing permissions and |
21 | | limitations under the License. |
22 | | */ |
23 | | |
24 | | #pragma once |
25 | | |
26 | | #include <cstdlib> |
27 | | #include <cstring> |
28 | | #include <cctype> |
29 | | #include "tscore/ink_memory.h" |
30 | | |
31 | | /*----------------------------------------------------------------------------- |
32 | | SimpleTokenizer |
33 | | |
34 | | This class provides easy token parsing from an input string. It supports: |
35 | | |
36 | | 1- ignoring (or not) of null fields |
37 | | 2- left whitespace trimming |
38 | | 3- right whitespace trimming |
39 | | 4- escaping the delimiter character with a user defined escape character |
40 | | |
41 | | The class has two constructors, one that defines the input string, |
42 | | and another one that does not. If the latter is used, then the |
43 | | setString method should be used to set the data string. |
44 | | |
45 | | Both constructors set the delimiter, the operation mode (which |
46 | | defines bullets 1-3 above), and the escape character. |
47 | | |
48 | | The available methods are: |
49 | | |
50 | | void setString(const char *s) |
51 | | sets the data string to s. The mode specified upon construction of the |
52 | | tokenizer determines whether s is copied or not. |
53 | | |
54 | | char *getNext() |
55 | | returns the next token, or NULL if there are no more tokens. This method |
56 | | uses the delimiter specified upon object construction. |
57 | | |
58 | | char *getNext(char delimiter) |
59 | | similar to getNext(), but allows the user to change the delimiter (just for |
60 | | this call). |
61 | | |
62 | | char *getNext(int count) |
63 | | get the next count tokens as a single token (ignoring the delimiters in |
64 | | between). |
65 | | |
66 | | char *getNext(char delimiter, int count) |
67 | | this is similar to getNext(int count) but allows user to specify the |
68 | | delimiter. |
69 | | |
70 | | IMPORTANT: the char pointers returned by the SimpleTokenizer are valid |
71 | | ONLY during the lifetime of the object. The copy of the input string |
72 | | is destroyed by the object's destructor. |
73 | | |
74 | | char *getRest() |
75 | | returns the rest of the tokens all together. Advances pointer so a |
76 | | subsequent call to getNext returns NULL; |
77 | | |
78 | | char *peekAtRestOfString() |
79 | | returns the rest of the input string, but DOES NOT advance pointer so a |
80 | | subsequent call to getNext does return the next token (if there is still |
81 | | one). |
82 | | |
83 | | size_t getNumTokensRemaining() |
84 | | returns the number of tokens remaining in the string (using the delimiter |
85 | | specified upon object construction). |
86 | | |
87 | | size_t getNumTokensRemaining(char delimiter) |
88 | | similar to the above, but allows the user to change the delimiter (just for |
89 | | this call). |
90 | | |
91 | | Note that multiple delimiters are not supported (more than one per call). |
92 | | |
93 | | examples: |
94 | | |
95 | | SimpleTokenizer tok("one two\\ and\\ three four: five : six"); |
96 | | tok.getNumTokensRemaining() --> 6 note calculation is done assuming |
97 | | space is the delimiter |
98 | | tok.getNext() -> "one" |
99 | | tok.getNext() -> "two and three" |
100 | | tok.getNext(':') -> "four" |
101 | | tok.peekAtRestOfString() -> " five : six" |
102 | | tok.getNext(':') -> "five" |
103 | | |
104 | | SimpleTokenizer tok(", with null fields ,,,", ',', |
105 | | CONSIDER_NULL_FIELDS | KEEP_WHITESPACE); |
106 | | tok.getNext() -> "" |
107 | | tok.getNext() -> " with null fields " |
108 | | tok.getNumTokensRemaining() -> 3 |
109 | | |
110 | | ---------------------------------------------------------------------------*/ |
111 | | |
112 | | class SimpleTokenizer |
113 | | { |
114 | | public: |
115 | | // by default, null fields are disregarded, whitespace is trimmed left |
116 | | // and right, and input string is copied (not overwritten) |
117 | | // |
118 | | enum { |
119 | | CONSIDER_NULL_FIELDS = 1, |
120 | | KEEP_WHITESPACE_LEFT = 2, |
121 | | KEEP_WHITESPACE_RIGHT = 4, |
122 | | KEEP_WHITESPACE = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT, |
123 | | OVERWRITE_INPUT_STRING = 8 |
124 | | }; |
125 | | |
126 | 0 | SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\') : _delimiter(delimiter), _mode(mode), _escape(escape) |
127 | 0 | { |
128 | 0 | } |
129 | | |
130 | | // NOTE: The input string 's' is overwritten for mode OVERWRITE_INPUT_STRING. |
131 | | SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\') |
132 | | : _delimiter(delimiter), _mode(mode), _escape(escape) |
133 | 0 | { |
134 | 0 | setString(s); |
135 | 0 | } |
136 | | |
137 | 0 | ~SimpleTokenizer() { _clearData(); } |
138 | | void |
139 | | setString(const char *s) |
140 | 0 | { |
141 | 0 | _clearData(); |
142 | |
|
143 | 0 | _start = 0; |
144 | 0 | _length = strlen(s); |
145 | 0 | _data = (_mode & OVERWRITE_INPUT_STRING ? const_cast<char *>(s) : ats_strdup(s)); |
146 | | |
147 | | // to handle the case where there is a null field at the end of the |
148 | | // input string, we replace the null character at the end of the |
149 | | // string with the delimiter (and consider the string to be one |
150 | | // character larger). |
151 | | // |
152 | 0 | _data[_length++] = _delimiter; |
153 | 0 | }; |
154 | | char * |
155 | | getNext(int count = 1) |
156 | 0 | { |
157 | 0 | return _getNext(_delimiter, false, count); |
158 | 0 | }; |
159 | | char * |
160 | | getNext(char delimiter, int count = 1) |
161 | 0 | { |
162 | 0 | return _getNext(delimiter, false, count); |
163 | 0 | } |
164 | | char * |
165 | | getRest() |
166 | 0 | { |
167 | 0 | // there can't be more than _length tokens, so we get the rest |
168 | 0 | // of the tokens by requesting _length of them |
169 | 0 | // |
170 | 0 | return _getNext(_delimiter, false, _length); |
171 | 0 | } |
172 | | size_t |
173 | | getNumTokensRemaining() |
174 | 0 | { |
175 | 0 | return _getNumTokensRemaining(_delimiter); |
176 | 0 | }; |
177 | | size_t |
178 | | getNumTokensRemaining(char delimiter) |
179 | 0 | { |
180 | 0 | return _getNumTokensRemaining(delimiter); |
181 | 0 | }; |
182 | | char * |
183 | | peekAtRestOfString() |
184 | 0 | { |
185 | 0 | _data[_length - 1] = 0; |
186 | 0 | return (_start < _length ? &_data[_start] : &_data[_length - 1]); |
187 | 0 | } |
188 | | |
189 | | private: |
190 | | char *_data = nullptr; // a pointer to the input data itself, |
191 | | // or to a copy of it |
192 | | char _delimiter; // the token delimiter |
193 | | unsigned _mode; // flags that determine the |
194 | | // mode of operation |
195 | | char _escape; // the escape character |
196 | | size_t _start = 0; // pointer to the start of the next |
197 | | // token |
198 | | size_t _length = 0; // the length of _data |
199 | | |
200 | | void |
201 | | _clearData() |
202 | 0 | { |
203 | 0 | if (_data && !(_mode & OVERWRITE_INPUT_STRING)) { |
204 | 0 | ats_free(_data); |
205 | 0 | } |
206 | 0 | } |
207 | | |
208 | | char * |
209 | | _getNext(char delimiter, bool countOnly = false, int numTokens = 1) |
210 | 0 | { |
211 | 0 | char *next = nullptr; |
212 | |
|
213 | 0 | if (_start < _length) { |
214 | | // set start |
215 | | // |
216 | 0 | bool hasEsc = false; // escape character seen |
217 | 0 | while (_start < _length && |
218 | 0 | ((!(_mode & CONSIDER_NULL_FIELDS) && |
219 | 0 | (_data[_start] == delimiter && !(_start && (_data[_start - 1] == _escape ? (hasEsc = true) : 0)))) || |
220 | 0 | (!(_mode & KEEP_WHITESPACE_LEFT) && isspace(_data[_start])))) { |
221 | 0 | ++_start; |
222 | 0 | } |
223 | |
|
224 | 0 | if (_start < _length) // data still available |
225 | 0 | { |
226 | | // update the extra delimiter just in case the function |
227 | | // is called with a different delimiter from the previous one |
228 | | // |
229 | 0 | _data[_length - 1] = delimiter; |
230 | |
|
231 | 0 | next = &_data[_start]; |
232 | | |
233 | | // set end |
234 | | // |
235 | 0 | size_t end = _start; |
236 | 0 | int delimCount = 0; |
237 | 0 | while (end < _length && (_data[end] != delimiter || (end && (_data[end - 1] == _escape ? (hasEsc = true) : 0)) || |
238 | 0 | ((++delimCount < numTokens) && (end < _length - 1)))) { |
239 | 0 | ++end; |
240 | 0 | } |
241 | |
|
242 | 0 | _start = end + 1; |
243 | | |
244 | | // there can be delimiters at the end if the number of tokens |
245 | | // requested is larger than 1, remove them if the |
246 | | // CONSIDER_NULL_FIELDS flag is not set |
247 | | // |
248 | 0 | if (!(_mode & CONSIDER_NULL_FIELDS)) { |
249 | 0 | while (_data[--end] == delimiter) { |
250 | | // do nothing |
251 | 0 | } |
252 | 0 | ++end; |
253 | 0 | } |
254 | |
|
255 | 0 | if (!(_mode & KEEP_WHITESPACE_RIGHT)) { |
256 | 0 | while (isspace(_data[--end])) { |
257 | | // do nothing |
258 | 0 | } |
259 | 0 | ++end; |
260 | 0 | } |
261 | |
|
262 | 0 | if (!countOnly) { |
263 | 0 | _data[end] = 0; |
264 | | |
265 | | // remove escape characters only if the number of |
266 | | // delimiters is one |
267 | | // |
268 | 0 | if (hasEsc && delimCount == 1) { |
269 | 0 | int numEscape = 0, i = 0; |
270 | 0 | while (next[i]) { |
271 | 0 | if (next[i] == _escape) { |
272 | 0 | ++numEscape; |
273 | 0 | } else { |
274 | 0 | next[i - numEscape] = next[i]; |
275 | 0 | } |
276 | 0 | ++i; |
277 | 0 | } |
278 | 0 | _data[end - numEscape] = 0; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | } |
282 | 0 | } |
283 | 0 | return next; |
284 | 0 | }; |
285 | | |
286 | | size_t |
287 | | _getNumTokensRemaining(char delimiter) |
288 | 0 | { |
289 | 0 | size_t startSave = _start; // save current position |
290 | 0 | size_t count = 0; |
291 | 0 | while (_getNext(delimiter, true)) { |
292 | 0 | ++count; |
293 | 0 | }; |
294 | 0 | _start = startSave; |
295 | 0 | return count; |
296 | 0 | }; |
297 | | }; |