Coverage Report

Created: 2026-03-28 06:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/trafficserver/include/tscore/SimpleTokenizer.h
Line
Count
Source
1
/** @file
2
3
  A brief file description
4
5
  @section license License
6
7
  Licensed to the Apache Software Foundation (ASF) under one
8
  or more contributor license agreements.  See the NOTICE file
9
  distributed with this work for additional information
10
  regarding copyright ownership.  The ASF licenses this file
11
  to you under the Apache License, Version 2.0 (the
12
  "License"); you may not use this file except in compliance
13
  with the License.  You may obtain a copy of the License at
14
15
      http://www.apache.org/licenses/LICENSE-2.0
16
17
  Unless required by applicable law or agreed to in writing, software
18
  distributed under the License is distributed on an "AS IS" BASIS,
19
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
  See the License for the specific language governing permissions and
21
  limitations under the License.
22
 */
23
24
#pragma once
25
26
#include <cstdlib>
27
#include <cstring>
28
#include <cctype>
29
#include "tscore/ink_memory.h"
30
31
/*-----------------------------------------------------------------------------
32
  SimpleTokenizer
33
34
  This class provides easy token parsing from an input string. It supports:
35
36
  1- ignoring (or not) of null fields
37
  2- left whitespace trimming
38
  3- right whitespace trimming
39
  4- escaping the delimiter character with a user defined escape character
40
41
  The class has two constructors, one that defines the input string,
42
  and another one that does not. If the latter is used, then the
43
  setString method should be used to set the data string.
44
45
  Both constructors set the delimiter, the operation mode (which
46
  defines bullets 1-3 above), and the escape character.
47
48
  The available methods are:
49
50
  void setString(const char *s)
51
  sets the data string to s. The mode specified upon construction of the
52
  tokenizer determines whether s is copied or not.
53
54
  char *getNext()
55
  returns the next token, or NULL if there are no more tokens. This method
56
  uses the delimiter specified upon object construction.
57
58
  char *getNext(char delimiter)
59
  similar to getNext(), but allows the user to change the delimiter (just for
60
  this call).
61
62
  char *getNext(int count)
63
  get the next count tokens as a single token (ignoring the delimiters in
64
  between).
65
66
  char *getNext(char delimiter, int count)
67
  this is similar to getNext(int count) but allows user to specify the
68
  delimiter.
69
70
  IMPORTANT: the char pointers returned by the SimpleTokenizer are valid
71
  ONLY during the lifetime of the object. The copy of the input string
72
  is destroyed by the object's destructor.
73
74
  char *getRest()
75
  returns the rest of the tokens all together. Advances pointer so a
76
  subsequent call to getNext returns NULL;
77
78
  char *peekAtRestOfString()
79
  returns the rest of the input string, but DOES NOT advance pointer so a
80
  subsequent call to getNext does return the next token (if there is still
81
  one).
82
83
  size_t getNumTokensRemaining()
84
  returns the number of tokens remaining in the string (using the delimiter
85
  specified upon object construction).
86
87
  size_t getNumTokensRemaining(char delimiter)
88
  similar to the above, but allows the user to change the delimiter (just for
89
  this call).
90
91
  Note that multiple delimiters are not supported (more than one per call).
92
93
  examples:
94
95
  SimpleTokenizer tok("one    two\\ and\\ three four:   five : six");
96
  tok.getNumTokensRemaining() --> 6     note calculation is done assuming
97
                                        space is the delimiter
98
  tok.getNext() -> "one"
99
  tok.getNext() -> "two and three"
100
  tok.getNext(':') -> "four"
101
  tok.peekAtRestOfString() -> "   five : six"
102
  tok.getNext(':') -> "five"
103
104
  SimpleTokenizer tok(",  with null fields ,,,", ',',
105
                      CONSIDER_NULL_FIELDS | KEEP_WHITESPACE);
106
  tok.getNext() -> ""
107
  tok.getNext() -> "  with null fields "
108
  tok.getNumTokensRemaining() -> 3
109
110
  ---------------------------------------------------------------------------*/
111
112
class SimpleTokenizer
113
{
114
public:
115
  // by default, null fields are disregarded, whitespace is trimmed left
116
  // and right, and input string is copied (not overwritten)
117
  //
118
  enum {
119
    CONSIDER_NULL_FIELDS   = 1,
120
    KEEP_WHITESPACE_LEFT   = 2,
121
    KEEP_WHITESPACE_RIGHT  = 4,
122
    KEEP_WHITESPACE        = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT,
123
    OVERWRITE_INPUT_STRING = 8
124
  };
125
126
0
  SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\') : _delimiter(delimiter), _mode(mode), _escape(escape)
127
0
  {
128
0
  }
129
130
  // NOTE: The input string 's' is overwritten for mode OVERWRITE_INPUT_STRING.
131
  SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\')
132
    : _delimiter(delimiter), _mode(mode), _escape(escape)
133
0
  {
134
0
    setString(s);
135
0
  }
136
137
0
  ~SimpleTokenizer() { _clearData(); }
138
  void
139
  setString(const char *s)
140
0
  {
141
0
    _clearData();
142
143
0
    _start  = 0;
144
0
    _length = strlen(s);
145
0
    _data   = (_mode & OVERWRITE_INPUT_STRING ? const_cast<char *>(s) : ats_strdup(s));
146
147
    // to handle the case where there is a null field at the end of the
148
    // input string, we replace the null character at the end of the
149
    // string with the delimiter (and consider the string to be one
150
    // character larger).
151
    //
152
0
    _data[_length++] = _delimiter;
153
0
  };
154
  char *
155
  getNext(int count = 1)
156
0
  {
157
0
    return _getNext(_delimiter, false, count);
158
0
  };
159
  char *
160
  getNext(char delimiter, int count = 1)
161
0
  {
162
0
    return _getNext(delimiter, false, count);
163
0
  }
164
  char *
165
  getRest()
166
0
  {
167
0
    // there can't be more than _length tokens, so we get the rest
168
0
    // of the tokens by requesting _length of them
169
0
    //
170
0
    return _getNext(_delimiter, false, _length);
171
0
  }
172
  size_t
173
  getNumTokensRemaining()
174
0
  {
175
0
    return _getNumTokensRemaining(_delimiter);
176
0
  };
177
  size_t
178
  getNumTokensRemaining(char delimiter)
179
0
  {
180
0
    return _getNumTokensRemaining(delimiter);
181
0
  };
182
  char *
183
  peekAtRestOfString()
184
0
  {
185
0
    _data[_length - 1] = 0;
186
0
    return (_start < _length ? &_data[_start] : &_data[_length - 1]);
187
0
  }
188
189
private:
190
  char *_data = nullptr; // a pointer to the input data itself,
191
  // or to a copy of it
192
  char     _delimiter; // the token delimiter
193
  unsigned _mode;      // flags that determine the
194
  // mode of operation
195
  char   _escape;    // the escape character
196
  size_t _start = 0; // pointer to the start of the next
197
  // token
198
  size_t _length = 0; // the length of _data
199
200
  void
201
  _clearData()
202
0
  {
203
0
    if (_data && !(_mode & OVERWRITE_INPUT_STRING)) {
204
0
      ats_free(_data);
205
0
    }
206
0
  }
207
208
  char *
209
  _getNext(char delimiter, bool countOnly = false, int numTokens = 1)
210
0
  {
211
0
    char *next = nullptr;
212
213
0
    if (_start < _length) {
214
      // set start
215
      //
216
0
      bool hasEsc = false; // escape character seen
217
0
      while (_start < _length &&
218
0
             ((!(_mode & CONSIDER_NULL_FIELDS) &&
219
0
               (_data[_start] == delimiter && !(_start && (_data[_start - 1] == _escape ? (hasEsc = true) : 0)))) ||
220
0
              (!(_mode & KEEP_WHITESPACE_LEFT) && isspace(_data[_start])))) {
221
0
        ++_start;
222
0
      }
223
224
0
      if (_start < _length) // data still available
225
0
      {
226
        // update the extra delimiter just in case the function
227
        // is called with a different delimiter from the previous one
228
        //
229
0
        _data[_length - 1] = delimiter;
230
231
0
        next = &_data[_start];
232
233
        // set end
234
        //
235
0
        size_t end        = _start;
236
0
        int    delimCount = 0;
237
0
        while (end < _length && (_data[end] != delimiter || (end && (_data[end - 1] == _escape ? (hasEsc = true) : 0)) ||
238
0
                                 ((++delimCount < numTokens) && (end < _length - 1)))) {
239
0
          ++end;
240
0
        }
241
242
0
        _start = end + 1;
243
244
        // there can be delimiters at the end if the number of tokens
245
        // requested is larger than 1, remove them if the
246
        // CONSIDER_NULL_FIELDS flag is not set
247
        //
248
0
        if (!(_mode & CONSIDER_NULL_FIELDS)) {
249
0
          while (_data[--end] == delimiter) {
250
            // do nothing
251
0
          }
252
0
          ++end;
253
0
        }
254
255
0
        if (!(_mode & KEEP_WHITESPACE_RIGHT)) {
256
0
          while (isspace(_data[--end])) {
257
            // do nothing
258
0
          }
259
0
          ++end;
260
0
        }
261
262
0
        if (!countOnly) {
263
0
          _data[end] = 0;
264
265
          // remove escape characters only if the number of
266
          // delimiters is one
267
          //
268
0
          if (hasEsc && delimCount == 1) {
269
0
            int numEscape = 0, i = 0;
270
0
            while (next[i]) {
271
0
              if (next[i] == _escape) {
272
0
                ++numEscape;
273
0
              } else {
274
0
                next[i - numEscape] = next[i];
275
0
              }
276
0
              ++i;
277
0
            }
278
0
            _data[end - numEscape] = 0;
279
0
          }
280
0
        }
281
0
      }
282
0
    }
283
0
    return next;
284
0
  };
285
286
  size_t
287
  _getNumTokensRemaining(char delimiter)
288
0
  {
289
0
    size_t startSave = _start; // save current position
290
0
    size_t count     = 0;
291
0
    while (_getNext(delimiter, true)) {
292
0
      ++count;
293
0
    };
294
0
    _start = startSave;
295
0
    return count;
296
0
  };
297
};