/src/trafficserver/include/tscore/SimpleTokenizer.h

Source
/** @file

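  SimpleTokenizer: a small single-delimiter string tokenizer with optional
  whitespace trimming, null-field handling and delimiter escaping.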

  @section license License

  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

#pragma once

#include <cctype>
#include <cstdlib>
#include <cstring>
#include <vector>
#include "tscore/ink_memory.h"

/*-----------------------------------------------------------------------------
  SimpleTokenizer

  This class provides easy token parsing from an input string. It supports:

  1- ignoring (or not) of null fields
  2- left whitespace trimming
  3- right whitespace trimming
  4- escaping the delimiter character with a user defined escape character

  The class has two constructors, one that defines the input string,
  and another one that does not. If the latter is used, then the
  setString method should be used to set the data string.

  Both constructors set the delimiter, the operation mode (which
  defines bullets 1-3 above), and the escape character.

  The available methods are:

  void setString(const char *s)
  sets the data string to s. The mode specified upon construction of the
  tokenizer determines whether s is copied or not.

  char *getNext()
  returns the next token, or NULL if there are no more tokens. This method
  uses the delimiter specified upon object construction.

  char *getNext(char delimiter)
  similar to getNext(), but allows the user to change the delimiter (just for
  this call).

  char *getNext(int count)
  get the next count tokens as a single token (ignoring the delimiters in
  between).

  char *getNext(char delimiter, int count)
  this is similar to getNext(int count) but allows user to specify the
  delimiter.

  IMPORTANT: the char pointers returned by the SimpleTokenizer are valid
  ONLY during the lifetime of the object. The copy of the input string
  is destroyed by the object's destructor.

  char *getRest()
  returns the rest of the tokens all together. Advances pointer so a
  subsequent call to getNext returns NULL;

  char *peekAtRestOfString()
  returns the rest of the input string, but DOES NOT advance pointer so a
  subsequent call to getNext does return the next token (if there is still
  one).

  size_t getNumTokensRemaining()
  returns the number of tokens remaining in the string (using the delimiter
  specified upon object construction).

  size_t getNumTokensRemaining(char delimiter)
  similar to the above, but allows the user to change the delimiter (just for
  this call).

  Note that multiple delimiters are not supported (more than one per call).

  examples:

  SimpleTokenizer tok("one    two\\ and\\ three four:   five : six");
  tok.getNumTokensRemaining() --> 6     note calculation is done assuming
                                        space is the delimiter
  tok.getNext() -> "one"
  tok.getNext() -> "two and three"
  tok.getNext(':') -> "four"
  tok.peekAtRestOfString() -> "   five : six"
  tok.getNext(':') -> "five"

  SimpleTokenizer tok(",  with null fields ,,,", ',',
                      CONSIDER_NULL_FIELDS | KEEP_WHITESPACE);
  tok.getNext() -> ""
  tok.getNext() -> "  with null fields "
  tok.getNumTokensRemaining() -> 3

  ---------------------------------------------------------------------------*/

/**
  Single-delimiter tokenizer that hands out NUL-terminated tokens carved
  in place out of its working buffer.

  The working buffer is either a private copy of the input (default) or the
  caller's own buffer (OVERWRITE_INPUT_STRING). In both cases the byte that
  held the input's NUL terminator is reused as a sentinel delimiter, so a
  trailing null field can be reported like any other field.

  Returned pointers refer to the working buffer: they stay valid until the
  next setString() call or, for the private copy, until the tokenizer is
  destroyed.
 */
class SimpleTokenizer
{
public:
  // by default, null fields are disregarded, whitespace is trimmed left
  // and right, and input string is copied (not overwritten)
  //
  enum {
    CONSIDER_NULL_FIELDS   = 1,
    KEEP_WHITESPACE_LEFT   = 2,
    KEEP_WHITESPACE_RIGHT  = 4,
    KEEP_WHITESPACE        = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT,
    OVERWRITE_INPUT_STRING = 8
  };

  /** Create a tokenizer without input; every query returns "no token"
      until setString() is called.

      @param delimiter default token delimiter.
      @param mode      bitwise OR of the mode flags above.
      @param escape    character that makes a following delimiter literal.
   */
  SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\') : _delimiter(delimiter), _mode(mode), _escape(escape)
  {
  }

  /** Create a tokenizer over @a s (see setString()).

      NOTE: The input string 's' is overwritten for mode OVERWRITE_INPUT_STRING.
   */
  SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\')
    : _delimiter(delimiter), _mode(mode), _escape(escape)
  {
    setString(s);
  }

  // No user-declared destructor or copy operations: the private copy lives
  // in a std::vector, so copying a tokenizer deep-copies the buffer instead
  // of sharing (and later double-freeing) one heap block.

  /** Set the string to tokenize and rewind to its beginning.

      @param s NUL-terminated input. With OVERWRITE_INPUT_STRING the buffer
               behind @a s is used directly and modified, including the byte
               holding its NUL terminator; otherwise @a s is copied. A null
               pointer leaves the tokenizer without input.
   */
  void
  setString(const char *s)
  {
    _owned.clear();
    _external = nullptr;
    _start    = 0;
    _length   = 0;

    if (s == nullptr) {
      return;
    }

    const size_t len = std::strlen(s);
    if (_mode & OVERWRITE_INPUT_STRING) {
      _external = const_cast<char *>(s);
    } else {
      _owned.assign(s, s + len + 1); // keep room for the terminator slot
    }

    // to handle the case where there is a null field at the end of the
    // input string, we replace the null character at the end of the
    // string with the delimiter (and consider the string to be one
    // character larger).
    //
    _buffer()[len] = _delimiter;
    _length        = len + 1;
  }

  /** Return the next @a count tokens as one token, or nullptr if none is
      left. Values of @a count below 1 behave like 1. */
  char *
  getNext(int count = 1)
  {
    return _getNext(_delimiter, false, _tokenCount(count));
  }

  /** Same as getNext(int) but using @a delimiter for this call only. */
  char *
  getNext(char delimiter, int count = 1)
  {
    return _getNext(delimiter, false, _tokenCount(count));
  }

  /** Return everything that is left as a single token and exhaust the
      tokenizer. */
  char *
  getRest()
  {
    // there can't be more than _length tokens, so we get the rest
    // of the tokens by requesting _length of them
    //
    return _getNext(_delimiter, false, _length);
  }

  /** Count the tokens left for the default delimiter without consuming them. */
  size_t
  getNumTokensRemaining()
  {
    return _getNumTokensRemaining(_delimiter);
  }

  /** Count the tokens left for @a delimiter without consuming them. */
  size_t
  getNumTokensRemaining(char delimiter)
  {
    return _getNumTokensRemaining(delimiter);
  }

  /** Return the unparsed remainder of the input without advancing.

      @return pointer to the remainder (an empty string once everything has
              been consumed), or nullptr if no input string has been set.
   */
  char *
  peekAtRestOfString()
  {
    char *const data = _buffer();
    if (data == nullptr || _length == 0) {
      return nullptr;
    }

    // Turn the sentinel back into a terminator so the remainder is a valid
    // C string; _getNext() restores the sentinel on its next call.
    data[_length - 1] = 0;
    return (_start < _length ? &data[_start] : &data[_length - 1]);
  }

private:
  std::vector<char> _owned;               // private copy of the input (copy mode only)
  char             *_external = nullptr;  // caller's buffer (OVERWRITE_INPUT_STRING only)
  char              _delimiter;           // the token delimiter
  unsigned          _mode;                // flags that determine the mode of operation
  char              _escape;              // the escape character
  size_t            _start  = 0;          // index of the start of the next token
  size_t            _length = 0;          // length of the working buffer, sentinel included

  // Working buffer, or nullptr when no input string is set.
  char *
  _buffer()
  {
    if (_external != nullptr) {
      return _external;
    }
    return _owned.empty() ? nullptr : _owned.data();
  }

  // Map the public int token count to the internal one; anything below 1
  // stops at the first delimiter exactly like a count of 1 does.
  static size_t
  _tokenCount(int count)
  {
    return static_cast<size_t>(count < 1 ? 1 : count);
  }

  // isspace() is only defined for unsigned char values and EOF.
  static bool
  _isSpace(char c)
  {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }

  // True when the character at pos is directly preceded by the escape character.
  bool
  _isEscapedAt(const char *data, size_t pos) const
  {
    return pos > 0 && data[pos - 1] == _escape;
  }

  /** Core scanner behind getNext(), getRest() and the token counter.

      @param delimiter delimiter to use for this call.
      @param countOnly when true the buffer content is left untouched
                       (apart from the sentinel) and only _start advances.
      @param numTokens number of consecutive tokens to return as one.
      @return start of the token, or nullptr if no token is left.
   */
  char *
  _getNext(char delimiter, bool countOnly = false, size_t numTokens = 1)
  {
    char *const data = _buffer();
    if (data == nullptr || _start >= _length) {
      return nullptr;
    }

    const bool   considerNull = (_mode & CONSIDER_NULL_FIELDS) != 0;
    const bool   trimLeft     = !(_mode & KEEP_WHITESPACE_LEFT);
    const bool   trimRight    = !(_mode & KEEP_WHITESPACE_RIGHT);
    const size_t last         = _length - 1; // index of the sentinel

    // Refresh the sentinel before looking at anything: the previous call may
    // have used another delimiter and peekAtRestOfString() replaces it with
    // NUL. A stale sentinel would otherwise be reported as an empty token.
    data[last] = delimiter;

    // set start: skip unescaped delimiters (unless null fields count) and
    // leading whitespace (unless it is kept)
    //
    bool hasEsc = false; // escaped delimiter seen
    while (_start < _length) {
      const char c    = data[_start];
      bool       skip = false;
      if (!considerNull && c == delimiter) {
        if (_isEscapedAt(data, _start)) {
          hasEsc = true;
        } else {
          skip = true;
        }
      }
      if (!skip && trimLeft && _isSpace(c)) {
        skip = true;
      }
      if (!skip) {
        break;
      }
      ++_start;
    }

    if (_start >= _length) {
      return nullptr; // nothing but delimiters / whitespace was left
    }

    char *const  token      = &data[_start];
    const size_t tokenStart = _start;

    // set end: advance to the numTokens-th unescaped delimiter. The sentinel
    // always ends the scan, even when the input's last character is the
    // escape character, so `end` can never run past the buffer.
    //
    size_t end        = tokenStart;
    size_t delimCount = 0;
    while (end < last) {
      if (data[end] == delimiter) {
        if (_isEscapedAt(data, end)) {
          hasEsc = true; // literal delimiter, part of the token
        } else if (++delimCount >= numTokens) {
          break;
        }
      }
      ++end;
    }
    if (end == last) {
      ++delimCount; // the sentinel counts as the closing delimiter
    }

    _start = end + 1;

    // there can be delimiters at the end if the number of tokens
    // requested is larger than 1, remove them if the
    // CONSIDER_NULL_FIELDS flag is not set. Never step back past the
    // beginning of the token.
    //
    if (!considerNull) {
      while (end > tokenStart && data[end - 1] == delimiter) {
        --end;
      }
    }

    if (trimRight) {
      while (end > tokenStart && _isSpace(data[end - 1])) {
        --end;
      }
    }

    if (!countOnly) {
      data[end] = 0;

      // remove escape characters only if the number of
      // delimiters is one
      //
      if (hasEsc && delimCount == 1) {
        char *out = token;
        for (const char *in = token; *in != 0; ++in) {
          if (*in != _escape) {
            *out++ = *in;
          }
        }
        *out = 0;
      }
    }

    return token;
  }

  // Count the remaining tokens for `delimiter`, then restore the read position.
  size_t
  _getNumTokensRemaining(char delimiter)
  {
    const size_t startSave = _start; // save current position
    size_t       count     = 0;
    while (_getNext(delimiter, true) != nullptr) {
      ++count;
    }
    _start = startSave;
    return count;
  }
};

Coverage Report

Created: 2026-03-28 06:49

Line	Count	Source
1		/** @file
2
3		A brief file description
4
5		@section license License
6
7		Licensed to the Apache Software Foundation (ASF) under one
8		or more contributor license agreements. See the NOTICE file
9		distributed with this work for additional information
10		regarding copyright ownership. The ASF licenses this file
11		to you under the Apache License, Version 2.0 (the
12		"License"); you may not use this file except in compliance
13		with the License. You may obtain a copy of the License at
14
15		http://www.apache.org/licenses/LICENSE-2.0
16
17		Unless required by applicable law or agreed to in writing, software
18		distributed under the License is distributed on an "AS IS" BASIS,
19		WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20		See the License for the specific language governing permissions and
21		limitations under the License.
22		*/
23
24		#pragma once
25
26		#include <cstdlib>
27		#include <cstring>
28		#include <cctype>
29		#include "tscore/ink_memory.h"
30
31		/*-----------------------------------------------------------------------------
32		SimpleTokenizer
33
34		This class provides easy token parsing from an input string. It supports:
35
36		1- ignoring (or not) of null fields
37		2- left whitespace trimming
38		3- right whitespace trimming
39		4- escaping the delimiter character with a user defined escape character
40
41		The class has two constructors, one that defines the input string,
42		and another one that does not. If the latter is used, then the
43		setString method should be used to set the data string.
44
45		Both constructors set the delimiter, the operation mode (which
46		defines bullets 1-3 above), and the escape character.
47
48		The available methods are:
49
50		void setString(char *s)
51		sets the data string to s. The mode specified upon construction of the
52		tokenizer determines whether s is copied or not.
53
54		char *getNext()
55		returns the next token, or NULL if there are no more tokens. This method
56		uses the delimiter specified upon object construction.
57
58		char *getNext(char delimiter)
59		similar to getNext(), but allows the user to change the delimiter (just for
60		this call).
61
62		char *getNext(int count)
63		get the next count tokens as a single token (ignoring the delimiters in
64		between).
65
66		char *getNext(char delimiter, int count)
67		this is similar to getNext(int count) but allows user to specify the
68		delimiter.
69
70		IMPORTANT: the char pointers returned by the SimpleTokenizer are valid
71		ONLY during the lifetime of the object. The copy of the input string
72		is destroyed by the object's destructor.
73
74		char *getRest()
75		returns the rest of the tokens all together. Advances pointer so a
76		subsequent call to getNext returns NULL;
77
78		char *peekAtRestOfString()
79		returns the rest of the input string, but DOES NOT advance pointer so a
80		subsequent call to getNext does return the next token (if there is still
81		one).
82
83		size_t getNumTokensRemaining()
84		returns the number of tokens remaining in the string (using the delimiter
85		specified upon object construction).
86
87		size_t getNumTokensRemaining(char delimiter)
88		similar to the above, but allows the user to change the delimiter (just for
89		this call).
90
91		Note that multiple delimiters are not supported (more than one per call).
92
93		examples:
94
95		SimpleTokenizer tok("one two\\ and\\ three four: five : six");
96		tok.getNumTokensRemaining() --> 5 note calculation is done assuming
97		space is the delimiter
98		tok.getNext() -> "one"
99		tok.getNext() -> "two and three"
100		tok.getNext(':') -> "four"
101		tok.peekAtRestOfString() -> " five : six"
102		tok.getNext(':') -> "five"
103
104		SimpleTokenizer tok(", with null fields ,,,", ',',
105		CONSIDER_NULL_FIELDS \| KEEP_WHITESPACE);
106		tok.getNext() -> ""
107		tok.getNext() -> " with null fields "
108		tok.getNumTokensRemaining() -> 3
109
110		---------------------------------------------------------------------------*/
111
112		class SimpleTokenizer
113		{
114		public:
115		// by default, null fields are disregarded, whitespace is trimmed left
116		// and right, and input string is copied (not overwritten)
117		//
118		enum {
119		CONSIDER_NULL_FIELDS = 1,
120		KEEP_WHITESPACE_LEFT = 2,
121		KEEP_WHITESPACE_RIGHT = 4,
122		KEEP_WHITESPACE = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT,
123		OVERWRITE_INPUT_STRING = 8
124		};
125
126	0	SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\') : _delimiter(delimiter), _mode(mode), _escape(escape)
127	0	{
128	0	}
129
130		// NOTE: The input string 's' is overwritten for mode OVERWRITE_INPUT_STRING.
131		SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\')
132		: _delimiter(delimiter), _mode(mode), _escape(escape)
133	0	{
134	0	setString(s);
135	0	}
136
137	0	~SimpleTokenizer() { _clearData(); }
138		void
139		setString(const char *s)
140	0	{
141	0	_clearData();
142
143	0	_start = 0;
144	0	_length = strlen(s);
145	0	_data = (_mode & OVERWRITE_INPUT_STRING ? const_cast<char *>(s) : ats_strdup(s));
146
147		// to handle the case where there is a null field at the end of the
148		// input string, we replace the null character at the end of the
149		// string with the delimiter (and consider the string to be one
150		// character larger).
151		//
152	0	_data[_length++] = _delimiter;
153	0	};
154		char *
155		getNext(int count = 1)
156	0	{
157	0	return _getNext(_delimiter, false, count);
158	0	};
159		char *
160		getNext(char delimiter, int count = 1)
161	0	{
162	0	return _getNext(delimiter, false, count);
163	0	}
164		char *
165		getRest()
166	0	{
167	0	// there can't be more than _length tokens, so we get the rest
168	0	// of the tokens by requesting _length of them
169	0	//
170	0	return _getNext(_delimiter, false, _length);
171	0	}
172		size_t
173		getNumTokensRemaining()
174	0	{
175	0	return _getNumTokensRemaining(_delimiter);
176	0	};
177		size_t
178		getNumTokensRemaining(char delimiter)
179	0	{
180	0	return _getNumTokensRemaining(delimiter);
181	0	};
182		char *
183		peekAtRestOfString()
184	0	{
185	0	_data[_length - 1] = 0;
186	0	return (_start < _length ? &_data[_start] : &_data[_length - 1]);
187	0	}
188
189		private:
190		char *_data = nullptr; // a pointer to the input data itself,
191		// or to a copy of it
192		char _delimiter; // the token delimiter
193		unsigned _mode; // flags that determine the
194		// mode of operation
195		char _escape; // the escape character
196		size_t _start = 0; // pointer to the start of the next
197		// token
198		size_t _length = 0; // the length of _data
199
200		void
201		_clearData()
202	0	{
203	0	if (_data && !(_mode & OVERWRITE_INPUT_STRING)) {
204	0	ats_free(_data);
205	0	}
206	0	}
207
208		char *
209		_getNext(char delimiter, bool countOnly = false, int numTokens = 1)
210	0	{
211	0	char *next = nullptr;
212
213	0	if (_start < _length) {
214		// set start
215		//
216	0	bool hasEsc = false; // escape character seen
217	0	while (_start < _length &&
218	0	((!(_mode & CONSIDER_NULL_FIELDS) &&
219	0	(_data[_start] == delimiter && !(_start && (_data[_start - 1] == _escape ? (hasEsc = true) : 0)))) \|\|
220	0	(!(_mode & KEEP_WHITESPACE_LEFT) && isspace(_data[_start])))) {
221	0	++_start;
222	0	}
223
224	0	if (_start < _length) // data still available
225	0	{
226		// update the extra delimiter just in case the function
227		// is called with a different delimiter from the previous one
228		//
229	0	_data[_length - 1] = delimiter;
230
231	0	next = &_data[_start];
232
233		// set end
234		//
235	0	size_t end = _start;
236	0	int delimCount = 0;
237	0	while (end < _length && (_data[end] != delimiter \|\| (end && (_data[end - 1] == _escape ? (hasEsc = true) : 0)) \|\|
238	0	((++delimCount < numTokens) && (end < _length - 1)))) {
239	0	++end;
240	0	}
241
242	0	_start = end + 1;
243
244		// there can be delimiters at the end if the number of tokens
245		// requested is larger than 1, remove them if the
246		// CONSIDER_NULL_FIELDS flag is not set
247		//
248	0	if (!(_mode & CONSIDER_NULL_FIELDS)) {
249	0	while (_data[--end] == delimiter) {
250		// do nothing
251	0	}
252	0	++end;
253	0	}
254
255	0	if (!(_mode & KEEP_WHITESPACE_RIGHT)) {
256	0	while (isspace(_data[--end])) {
257		// do nothing
258	0	}
259	0	++end;
260	0	}
261
262	0	if (!countOnly) {
263	0	_data[end] = 0;
264
265		// remove escape characters only if the number of
266		// delimiters is one
267		//
268	0	if (hasEsc && delimCount == 1) {
269	0	int numEscape = 0, i = 0;
270	0	while (next[i]) {
271	0	if (next[i] == _escape) {
272	0	++numEscape;
273	0	} else {
274	0	next[i - numEscape] = next[i];
275	0	}
276	0	++i;
277	0	}
278	0	_data[end - numEscape] = 0;
279	0	}
280	0	}
281	0	}
282	0	}
283	0	return next;
284	0	};
285
286		size_t
287		_getNumTokensRemaining(char delimiter)
288	0	{
289	0	size_t startSave = _start; // save current position
290	0	size_t count = 0;
291	0	while (_getNext(delimiter, true)) {
292	0	++count;
293	0	};
294	0	_start = startSave;
295	0	return count;
296	0	};
297		};