Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/xpcom/ds/Tokenizer.h
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
3
/* This Source Code Form is subject to the terms of the Mozilla Public
4
 * License, v. 2.0. If a copy of the MPL was not distributed with this
5
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
#ifndef Tokenizer_h__
8
#define Tokenizer_h__
9
10
#include "nsString.h"
11
#include "mozilla/CheckedInt.h"
12
#include "mozilla/ScopeExit.h"
13
#include "mozilla/TypeTraits.h"
14
#include "mozilla/UniquePtr.h"
15
#include "nsTArray.h"
16
17
namespace mozilla {
18
19
template <typename TChar>
20
class TokenizerBase
21
{
22
public:
23
  typedef nsTSubstring<TChar> TAString;
24
  typedef nsTString<TChar> TString;
25
  typedef nsTDependentString<TChar> TDependentString;
26
  typedef nsTDependentSubstring<TChar> TDependentSubstring;
27
28
  static TChar const sWhitespaces[];
29
30
  /**
31
   * The analyzer works with elements in the input cut to a sequence of tokens
32
   * where each token has an elementary type
33
   */
34
  enum TokenType : uint32_t
35
  {
36
    TOKEN_UNKNOWN,
37
    TOKEN_RAW,
38
    TOKEN_ERROR,
39
    TOKEN_INTEGER,
40
    TOKEN_WORD,
41
    TOKEN_CHAR,
42
    TOKEN_WS,
43
    TOKEN_EOL,
44
    TOKEN_EOF,
45
    TOKEN_CUSTOM0 = 1000
46
  };
47
48
  enum ECaseSensitivity
49
  {
50
    CASE_SENSITIVE,
51
    CASE_INSENSITIVE
52
  };
53
54
  /**
55
   * Class holding the type and the value of a token.  It can be manually created
56
   * to allow checks against it via methods of TTokenizer or be a result of some of
57
   * the TTokenizer's methods.
58
   */
59
  class Token
60
  {
61
    TokenType mType;
62
    TDependentSubstring mWord;
63
    TString mCustom;
64
    TChar mChar;
65
    uint64_t mInteger;
66
    ECaseSensitivity mCustomCaseInsensitivity;
67
    bool mCustomEnabled;
68
69
    // If this token is a result of the parsing process, this member is referencing
70
    // a sub-string in the input buffer.  If this is an externally created Token, this
71
    // member is left an empty string.
72
    TDependentSubstring mFragment;
73
74
    friend class TokenizerBase<TChar>;
75
    void AssignFragment(typename TAString::const_char_iterator begin,
76
                        typename TAString::const_char_iterator end);
77
78
    static Token Raw();
79
80
  public:
81
    Token();
82
    Token(const Token& aOther);
83
    Token& operator=(const Token& aOther);
84
85
    // Static constructors of tokens by type and value
86
    static Token Word(TAString const& aWord);
87
    static Token Char(TChar const aChar);
88
    static Token Number(uint64_t const aNumber);
89
    static Token Whitespace();
90
    static Token NewLine();
91
    static Token EndOfFile();
92
    static Token Error();
93
94
    // Compares the two tokens, type must be identical and value
95
    // of one of the tokens must be 'any' or equal.
96
    bool Equals(const Token& aOther) const;
97
98
0
    TokenType Type() const { return mType; }
99
    TChar AsChar() const;
100
    TDependentSubstring AsString() const;
101
    uint64_t AsInteger() const;
102
103
0
    TDependentSubstring Fragment() const { return mFragment; }
Unexecuted instantiation: mozilla::TokenizerBase<char>::Token::Fragment() const
Unexecuted instantiation: mozilla::TokenizerBase<char16_t>::Token::Fragment() const
104
  };
105
106
  /**
107
   * Consumers may register a custom string that, when found in the input, is considered
108
   * a token and returned by Next*() and accepted by Check*() methods.
109
   * AddCustomToken() returns a token that can then be compared using
110
   * Token::Equals() against the output from Next*() or be passed to Check*().
111
   */
112
  Token AddCustomToken(const TAString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
113
  template <uint32_t N>
114
  Token AddCustomToken(const TChar(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
115
  {
116
    return AddCustomToken(TDependentSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled);
117
  }
118
  void RemoveCustomToken(Token& aToken);
119
  /**
120
   * Only applies to a custom type of a Token (see AddCustomToken above.)
121
   * This turns on and off token recognition.  When a custom token is disabled,
122
   * it's ignored as if it had never been added as a custom token.
123
   */
124
  void EnableCustomToken(Token const& aToken, bool aEnable);
125
126
  /**
127
   * Mode of tokenization.
128
   * FULL tokenization, the default, recognizes built-in tokens and any custom tokens,
129
   * if added.
130
   * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'.
131
   * This mode can be understood as a 'binary' mode.
132
   */
133
  enum class Mode
134
  {
135
    FULL,
136
    CUSTOM_ONLY
137
  };
138
  void SetTokenizingMode(Mode aMode);
139
140
  /**
141
   * Return true iff the last Check*() call has returned false or when we've read past
142
   * the end of the input string.
143
   */
144
  MOZ_MUST_USE bool HasFailed() const;
145
146
protected:
147
  explicit TokenizerBase(const TChar* aWhitespaces = nullptr,
148
                         const TChar* aAdditionalWordChars = nullptr);
149
150
  // false if we have already read the EOF token.
151
  bool HasInput() const;
152
  // Main parsing function, it doesn't shift the read cursor, just returns the next
153
  // token position.
154
  typename TAString::const_char_iterator Parse(Token& aToken) const;
155
  // Is read cursor at the end?
156
  bool IsEnd(const typename TAString::const_char_iterator& caret) const;
157
  // True, when we are at the end of the input data, but it has not been marked
158
  // as complete yet.  In that case we cannot proceed with providing a multi-TChar token.
159
  bool IsPending(const typename TAString::const_char_iterator & caret) const;
160
  // Is read cursor on a character that is a word start?
161
  bool IsWordFirst(const TChar aInput) const;
162
  // Is read cursor on a character that is an in-word letter?
163
  bool IsWord(const TChar aInput) const;
164
  // Is read cursor on a character that is a valid number?
165
  // TODO - support multiple radix
166
  bool IsNumber(const TChar aInput) const;
167
  // Is equal to the given custom token?
168
  bool IsCustom(const typename TAString::const_char_iterator& caret,
169
                const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
170
171
  // Friendly helper to assign a fragment on a Token
172
  static void AssignFragment(Token& aToken,
173
                             typename TAString::const_char_iterator begin,
174
                             typename TAString::const_char_iterator end);
175
176
  // true iff we have already read the EOF token
177
  bool mPastEof;
178
  // true iff the last Check*() call has returned false, reverts to false on Rollback() call
179
  bool mHasFailed;
180
  // true if the input string is final (finished), false when we expect more data
181
  // yet to be fed to the tokenizer (see IncrementalTokenizer derived class).
182
  bool mInputFinished;
183
  // custom only vs full tokenizing mode, see the Parse() method
184
  Mode mMode;
185
  // minimal raw data chunked delivery during incremental feed
186
  uint32_t mMinRawDelivery;
187
188
  // Customizable list of whitespaces
189
  const TChar* mWhitespaces;
190
  // Additional custom word characters
191
  const TChar* mAdditionalWordChars;
192
193
  // All these point to the original buffer passed to the constructor or to the incremental
194
  // buffer after FeedInput.
195
  typename TAString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
196
  typename TAString::const_char_iterator mEnd; // End of the input position
197
198
  // This is the list of tokens user has registered with AddCustomToken()
199
  nsTArray<UniquePtr<Token>> mCustomTokens;
200
  uint32_t mNextCustomTokenID;
201
202
private:
203
  TokenizerBase() = delete;
204
  TokenizerBase(const TokenizerBase&) = delete;
205
  TokenizerBase(TokenizerBase&&) = delete;
206
  TokenizerBase(const TokenizerBase&&) = delete;
207
  TokenizerBase &operator=(const TokenizerBase&) = delete;
208
};
209
210
/**
211
 * This is a simple implementation of a lexical analyzer or maybe better
212
 * called a tokenizer.
213
 *
214
 * Please use Tokenizer or Tokenizer16 classes, that are specializations
215
 * of this template class.  Tokenizer is for ASCII input, Tokenizer16 may
216
 * handle char16_t input, but doesn't recognize whitespaces or numbers
217
 * other than those the standard `char` specialized Tokenizer class does.
218
 */
219
template <typename TChar>
220
class TTokenizer : public TokenizerBase<TChar>
221
{
222
public:
223
  typedef TokenizerBase<TChar> base;
224
225
  /**
226
   * @param aSource
227
   *    The string to parse.
228
   *    IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer
229
   * lifetime. It's up to the consumer to make sure the string's buffer outlives
230
   * the TTokenizer!
231
   * @param aWhitespaces
232
   *    If non-null TTokenizer will use this custom set of whitespaces for
233
   * CheckWhite() and SkipWhites() calls. By default the list consists of space
234
   * and tab.
235
   * @param aAdditionalWordChars
236
   *    If non-null it will be added to the list of characters that make up a
237
   * word. This is useful when you want to accept e.g. '-' in HTTP headers. By
238
   * default a word character is considered any character for which upper case
239
   *    is different from lower case.
240
   *
241
   * If there is an overlap between aWhitespaces and aAdditionalWordChars, the
242
   * check for word characters is made first.
243
   */
244
  explicit TTokenizer(const typename base::TAString& aSource,
245
                      const TChar* aWhitespaces = nullptr,
246
                      const TChar* aAdditionalWordChars = nullptr);
247
  explicit TTokenizer(const TChar* aSource,
248
                      const TChar* aWhitespaces = nullptr,
249
                      const TChar* aAdditionalWordChars = nullptr);
250
251
  /**
252
   * When there is still anything to read from the input, tokenize it, store the token type
253
   * and value to aToken result and shift the cursor past this just parsed token.  Each call
254
   * to Next() reads another token from the input and shifts the cursor.
255
   * Returns false if we have passed the end of the input.
256
   */
257
  MOZ_MUST_USE
258
  bool Next(typename base::Token& aToken);
259
260
  /**
261
   * Parse the token on the input read cursor position, check its type is equal to aTokenType
262
   * and if so, put it into aResult, shift the cursor and return true.  Otherwise, leave
263
   * the input read cursor position intact and return false.
264
   */
265
  MOZ_MUST_USE
266
  bool Check(const typename base::TokenType aTokenType, typename base::Token& aResult);
267
  /**
268
   * Same as above method, just compares both token type and token value passed in aToken.
269
   * When both the type and the value equals, shift the cursor and return true.  Otherwise
270
   * return false.
271
   */
272
  MOZ_MUST_USE
273
  bool Check(const typename base::Token& aToken);
274
275
  /**
276
   * SkipWhites method (below) may also skip new line characters automatically.
277
   */
278
  enum WhiteSkipping {
279
    /**
280
     * SkipWhites will only skip what is defined as a white space (default).
281
     */
282
    DONT_INCLUDE_NEW_LINE = 0,
283
    /**
284
     * SkipWhites will skip defined white spaces as well as new lines
285
     * automatically.
286
     */
287
    INCLUDE_NEW_LINE = 1
288
  };
289
290
  /**
291
   * Skips any occurrence of whitespaces specified in mWhitespaces member,
292
   * optionally skip also new lines.
293
   */
294
  void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
295
296
  /**
297
   * Skips all tokens until the given one is found or EOF is hit.  The token
298
   * or EOF are next to read.
299
   */
300
  void SkipUntil(typename base::Token const& aToken);
301
302
  // These are mostly shortcuts for the Check() methods above.
303
304
  /**
305
   * Check whitespace character is present.
306
   */
307
  MOZ_MUST_USE
308
4.66M
  bool CheckWhite() { return Check(base::Token::Whitespace()); }
mozilla::TTokenizer<char>::CheckWhite()
Line
Count
Source
308
4.66M
  bool CheckWhite() { return Check(base::Token::Whitespace()); }
Unexecuted instantiation: mozilla::TTokenizer<char16_t>::CheckWhite()
309
  /**
310
   * Check there is a single character on the read cursor position.  If so, shift the read
311
   * cursor position and return true.  Otherwise false.
312
   */
313
  MOZ_MUST_USE
314
0
  bool CheckChar(const TChar aChar) { return Check(base::Token::Char(aChar)); }
315
  /**
316
   * This is a customizable version of CheckChar.  aClassifier is a function called with
317
   * value of the character on the current input read position.  If this user function
318
   * returns true, read cursor is shifted and true returned.  Otherwise false.
319
   * The user classification function is not called when we are at or past the end and
320
   * false is immediately returned.
321
   */
322
  MOZ_MUST_USE
323
  bool CheckChar(bool (*aClassifier)(const TChar aChar));
324
  /**
325
   * Check for a whole expected word.
326
   */
327
  MOZ_MUST_USE
328
0
  bool CheckWord(const typename base::TAString& aWord) {
329
0
    return Check(base::Token::Word(aWord));
330
0
  }
331
  /**
332
   * Shortcut for literal const word check with compile time length calculation.
333
   */
334
  template <uint32_t N>
335
  MOZ_MUST_USE
336
  bool CheckWord(const TChar (&aWord)[N]) {
337
    return Check(base::Token::Word(typename base::TDependentString(aWord, N - 1)));
338
  }
339
  /**
340
   * Checks \r, \n or \r\n.
341
   */
342
  MOZ_MUST_USE
343
0
  bool CheckEOL() { return Check(base::Token::NewLine()); }
Unexecuted instantiation: mozilla::TTokenizer<char>::CheckEOL()
Unexecuted instantiation: mozilla::TTokenizer<char16_t>::CheckEOL()
344
  /**
345
   * Checks we are at the end of the input string reading.  If so, shift past the end
346
   * and returns true.  Otherwise does nothing and returns false.
347
   */
348
  MOZ_MUST_USE
349
0
  bool CheckEOF() { return Check(base::Token::EndOfFile()); }
350
351
  /**
352
   * These are shortcuts to obtain the value immediately when the token type matches.
353
   */
354
  MOZ_MUST_USE bool ReadChar(TChar* aValue);
355
  MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const TChar aChar),
356
                             TChar* aValue);
357
  MOZ_MUST_USE bool ReadWord(typename base::TAString& aValue);
358
  MOZ_MUST_USE bool ReadWord(typename base::TDependentSubstring& aValue);
359
360
  /**
361
   * This is an integer read helper.  It returns false and doesn't move the read
362
   * cursor when any of the following happens:
363
   *  - the token at the read cursor is not an integer
364
   *  - the final number doesn't fit the T type
365
   * Otherwise true is returned, aValue is filled with the integral number
366
   * and the cursor is moved forward.
367
   */
368
  template <typename T>
369
  MOZ_MUST_USE bool ReadInteger(T *aValue)
370
  {
371
    MOZ_RELEASE_ASSERT(aValue);
372
373
    typename base::TAString::const_char_iterator rollback = mRollback;
374
    typename base::TAString::const_char_iterator cursor = base::mCursor;
375
    typename base::Token t;
376
    if (!Check(base::TOKEN_INTEGER, t)) {
377
      return false;
378
    }
379
380
    mozilla::CheckedInt<T> checked(t.AsInteger());
381
    if (!checked.isValid()) {
382
      // Move to a state as if Check() call has failed
383
      mRollback = rollback;
384
      base::mCursor = cursor;
385
      base::mHasFailed = true;
386
      return false;
387
    }
388
389
    *aValue = checked.value();
390
    return true;
391
  }
392
393
  /**
394
   * Same as above, but accepts an integer with an optional minus sign.
395
   */
396
  template <typename T,
397
            typename V = typename EnableIf<IsSigned<typename RemovePointer<T>::Type>::value,
398
                                           typename RemovePointer<T>::Type>::Type>
399
  MOZ_MUST_USE bool ReadSignedInteger(T *aValue)
400
  {
401
    MOZ_RELEASE_ASSERT(aValue);
402
403
    typename base::TAString::const_char_iterator rollback = mRollback;
404
    typename base::TAString::const_char_iterator cursor = base::mCursor;
405
    auto revert = MakeScopeExit([&] {
406
      // Move to a state as if Check() call has failed
407
      mRollback = rollback;
408
      base::mCursor = cursor;
409
      base::mHasFailed = true;
410
    });
411
412
    // Using functional raw access because '-' could be part of the word set
413
    // making CheckChar('-') not work.
414
    bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; });
415
416
    typename base::Token t;
417
    if (!Check(base::TOKEN_INTEGER, t)) {
418
      return false;
419
    }
420
421
    mozilla::CheckedInt<T> checked(t.AsInteger());
422
    if (minus) {
423
      checked *= -1;
424
    }
425
426
    if (!checked.isValid()) {
427
      return false;
428
    }
429
430
    *aValue = checked.value();
431
    revert.release();
432
    return true;
433
  }
434
435
  /**
436
   * Returns the read cursor position back as it was before the last call of any parsing
437
   * method of TTokenizer (Next, Check*, Skip*, Read*) so that the last operation
438
   * can be repeated.
439
   * Rollback cannot be used multiple times, it only reverts the last successful parse
440
   * operation.  It also cannot be used before any parsing operation has been called
441
   * on the TTokenizer.
442
   */
443
  void Rollback();
444
445
  /**
446
   * Record() and Claim() are collecting the input as it is being parsed to obtain
447
   * a substring between particular syntax boundaries defined by any recursive
448
   * descent parser or simple parser the TTokenizer is used to read the input for.
449
   * Inclusion of a token that has just been parsed can be controlled using an argument.
450
   */
451
  enum ClaimInclusion {
452
    /**
453
     * Include resulting (or passed) token of the last lexical analyzer operation in the result.
454
     */
455
    INCLUDE_LAST,
456
    /**
457
     * Do not include it.
458
     */
459
    EXCLUDE_LAST
460
  };
461
462
  /**
463
   * Start the process of recording.  Based on aInclude value the beginning of the recorded
464
   * sub-string is at the current position (EXCLUDE_LAST) or at the position before the last
465
   * parsed token (INCLUDE_LAST).
466
   */
467
  void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
468
  /**
469
   * Claim result of the record started with Record() call before.  Depending on aInclude
470
   * the ending of the sub-string result includes or excludes the last parsed or checked
471
   * token.
472
   */
473
  void Claim(typename base::TAString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
474
  void Claim(typename base::TDependentSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
475
476
  /**
477
   * If aToken is found, aResult is set to the substring between the current
478
   * position and the position of aToken, potentially including aToken depending
479
   * on aInclude.
480
   * If aToken isn't found aResult is set to the substring between the current
481
   * position and the end of the string.
482
   * If aToken is found, the method returns true. Otherwise it returns false.
483
   *
484
   * Calling Rollback() after ReadUntil() will return the read cursor to the
485
   * position it had before ReadUntil was called.
486
   */
487
  MOZ_MUST_USE bool ReadUntil(typename base::Token const& aToken, typename base::TDependentSubstring& aResult,
488
                              ClaimInclusion aInclude = EXCLUDE_LAST);
489
  MOZ_MUST_USE bool ReadUntil(typename base::Token const& aToken, typename base::TAString& aResult,
490
                              ClaimInclusion aInclude = EXCLUDE_LAST);
491
492
protected:
493
  // All these point to the original buffer passed to the TTokenizer's constructor
494
  typename base::TAString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is
495
  typename base::TAString::const_char_iterator mRollback; // Position of the previous token start
496
497
private:
498
  TTokenizer() = delete;
499
  TTokenizer(const TTokenizer&) = delete;
500
  TTokenizer(TTokenizer&&) = delete;
501
  TTokenizer(const TTokenizer&&) = delete;
502
  TTokenizer &operator=(const TTokenizer&) = delete;
503
};
504
505
typedef TTokenizer<char> Tokenizer;
506
typedef TTokenizer<char16_t> Tokenizer16;
507
508
} // mozilla
509
510
#endif // Tokenizer_h__