/src/mozilla-central/xpcom/ds/Tokenizer.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #ifndef Tokenizer_h__ |
8 | | #define Tokenizer_h__ |
9 | | |
10 | | #include "nsString.h" |
11 | | #include "mozilla/CheckedInt.h" |
12 | | #include "mozilla/ScopeExit.h" |
13 | | #include "mozilla/TypeTraits.h" |
14 | | #include "mozilla/UniquePtr.h" |
15 | | #include "nsTArray.h" |
16 | | |
17 | | namespace mozilla { |
18 | | |
19 | | template <typename TChar> |
20 | | class TokenizerBase |
21 | | { |
22 | | public: |
23 | | typedef nsTSubstring<TChar> TAString; |
24 | | typedef nsTString<TChar> TString; |
25 | | typedef nsTDependentString<TChar> TDependentString; |
26 | | typedef nsTDependentSubstring<TChar> TDependentSubstring; |
27 | | |
28 | | static TChar const sWhitespaces[]; |
29 | | |
30 | | /** |
31 | | * The analyzer works with elements in the input cut to a sequence of tokens, |
32 | | * where each token has an elementary type. |
33 | | */ |
34 | | enum TokenType : uint32_t |
35 | | { |
36 | | TOKEN_UNKNOWN, |
37 | | TOKEN_RAW, |
38 | | TOKEN_ERROR, |
39 | | TOKEN_INTEGER, |
40 | | TOKEN_WORD, |
41 | | TOKEN_CHAR, |
42 | | TOKEN_WS, |
43 | | TOKEN_EOL, |
44 | | TOKEN_EOF, |
45 | | TOKEN_CUSTOM0 = 1000 |
46 | | }; |
47 | | |
48 | | enum ECaseSensitivity |
49 | | { |
50 | | CASE_SENSITIVE, |
51 | | CASE_INSENSITIVE |
52 | | }; |
53 | | |
54 | | /** |
55 | | * Class holding the type and the value of a token. A token can be manually created |
56 | | * to allow checks against it via methods of TTokenizer, or it can be the result of some of |
57 | | * the TTokenizer's methods. |
58 | | */ |
59 | | class Token |
60 | | { |
61 | | TokenType mType; |
62 | | TDependentSubstring mWord; |
63 | | TString mCustom; |
64 | | TChar mChar; |
65 | | uint64_t mInteger; |
66 | | ECaseSensitivity mCustomCaseInsensitivity; |
67 | | bool mCustomEnabled; |
68 | | |
69 | | // If this token is a result of the parsing process, this member is referencing |
70 | | // a sub-string in the input buffer. If this is an externally created Token, this |
71 | | // member is left as an empty string. |
72 | | TDependentSubstring mFragment; |
73 | | |
74 | | friend class TokenizerBase<TChar>; |
75 | | void AssignFragment(typename TAString::const_char_iterator begin, |
76 | | typename TAString::const_char_iterator end); |
77 | | |
78 | | static Token Raw(); |
79 | | |
80 | | public: |
81 | | Token(); |
82 | | Token(const Token& aOther); |
83 | | Token& operator=(const Token& aOther); |
84 | | |
85 | | // Static constructors of tokens by type and value |
86 | | static Token Word(TAString const& aWord); |
87 | | static Token Char(TChar const aChar); |
88 | | static Token Number(uint64_t const aNumber); |
89 | | static Token Whitespace(); |
90 | | static Token NewLine(); |
91 | | static Token EndOfFile(); |
92 | | static Token Error(); |
93 | | |
94 | | // Compares the two tokens, type must be identical and value |
95 | | // of one of the tokens must be 'any' or equal. |
96 | | bool Equals(const Token& aOther) const; |
97 | | |
98 | 0 | TokenType Type() const { return mType; } |
99 | | TChar AsChar() const; |
100 | | TDependentSubstring AsString() const; |
101 | | uint64_t AsInteger() const; |
102 | | |
103 | 0 | TDependentSubstring Fragment() const { return mFragment; } Unexecuted instantiation: mozilla::TokenizerBase<char>::Token::Fragment() const Unexecuted instantiation: mozilla::TokenizerBase<char16_t>::Token::Fragment() const |
104 | | }; |
105 | | |
106 | | /** |
107 | | * Consumers may register a custom string that, when found in the input, is considered |
108 | | * a token and returned by Next*() and accepted by Check*() methods. |
109 | | * AddCustomToken() returns a token that can then be compared using |
110 | | * Token::Equals() against the output from Next*() or be passed to Check*(). |
111 | | */ |
112 | | Token AddCustomToken(const TAString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true); |
113 | | template <uint32_t N> |
114 | | Token AddCustomToken(const TChar(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true) |
115 | | { |
116 | | return AddCustomToken(TDependentSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled); |
117 | | } |
118 | | void RemoveCustomToken(Token& aToken); |
119 | | /** |
120 | | * Only applies to a custom type of a Token (see AddCustomToken above.) |
121 | | * This turns on and off token recognition. When a custom token is disabled, |
122 | | * it's ignored as if it had never been added as a custom token. |
123 | | */ |
124 | | void EnableCustomToken(Token const& aToken, bool aEnable); |
125 | | |
126 | | /** |
127 | | * Mode of tokenization. |
128 | | * FULL tokenization, the default, recognizes built-in tokens and any custom tokens, |
129 | | * if added. |
130 | | * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'. |
131 | | * This mode can be understood as a 'binary' mode. |
132 | | */ |
133 | | enum class Mode |
134 | | { |
135 | | FULL, |
136 | | CUSTOM_ONLY |
137 | | }; |
138 | | void SetTokenizingMode(Mode aMode); |
139 | | |
140 | | /** |
141 | | * Return true iff the last Check*() call has returned false or when we've read past |
142 | | * the end of the input string. |
143 | | */ |
144 | | MOZ_MUST_USE bool HasFailed() const; |
145 | | |
146 | | protected: |
147 | | explicit TokenizerBase(const TChar* aWhitespaces = nullptr, |
148 | | const TChar* aAdditionalWordChars = nullptr); |
149 | | |
150 | | // false if we have already read the EOF token. |
151 | | bool HasInput() const; |
152 | | // Main parsing function, it doesn't shift the read cursor, just returns the next |
153 | | // token position. |
154 | | typename TAString::const_char_iterator Parse(Token& aToken) const; |
155 | | // Is read cursor at the end? |
156 | | bool IsEnd(const typename TAString::const_char_iterator& caret) const; |
157 | | // True, when we are at the end of the input data, but it has not been marked |
158 | | // as complete yet. In that case we cannot proceed with providing a multi-TChar token. |
159 | | bool IsPending(const typename TAString::const_char_iterator & caret) const; |
160 | | // Is read cursor on a character that is a word start? |
161 | | bool IsWordFirst(const TChar aInput) const; |
162 | | // Is read cursor on a character that is an in-word letter? |
163 | | bool IsWord(const TChar aInput) const; |
164 | | // Is read cursor on a character that is a valid number? |
165 | | // TODO - support multiple radix |
166 | | bool IsNumber(const TChar aInput) const; |
167 | | // Is equal to the given custom token? |
168 | | bool IsCustom(const typename TAString::const_char_iterator& caret, |
169 | | const Token& aCustomToken, uint32_t* aLongest = nullptr) const; |
170 | | |
171 | | // Friendly helper to assign a fragment on a Token |
172 | | static void AssignFragment(Token& aToken, |
173 | | typename TAString::const_char_iterator begin, |
174 | | typename TAString::const_char_iterator end); |
175 | | |
176 | | // true iff we have already read the EOF token |
177 | | bool mPastEof; |
178 | | // true iff the last Check*() call has returned false, reverts to false on Rollback() call |
179 | | bool mHasFailed; |
180 | | // true if the input string is final (finished), false when we expect more data |
181 | | // yet to be fed to the tokenizer (see IncrementalTokenizer derived class). |
182 | | bool mInputFinished; |
183 | | // custom only vs full tokenizing mode, see the Parse() method |
184 | | Mode mMode; |
185 | | // minimal raw data chunked delivery during incremental feed |
186 | | uint32_t mMinRawDelivery; |
187 | | |
188 | | // Customizable list of whitespaces |
189 | | const TChar* mWhitespaces; |
190 | | // Additional custom word characters |
191 | | const TChar* mAdditionalWordChars; |
192 | | |
193 | | // All these point to the original buffer passed to the constructor or to the incremental |
194 | | // buffer after FeedInput. |
195 | | typename TAString::const_char_iterator mCursor; // Position of the current (actually next to read) token start |
196 | | typename TAString::const_char_iterator mEnd; // End of the input position |
197 | | |
198 | | // This is the list of tokens user has registered with AddCustomToken() |
199 | | nsTArray<UniquePtr<Token>> mCustomTokens; |
200 | | uint32_t mNextCustomTokenID; |
201 | | |
202 | | private: |
203 | | TokenizerBase() = delete; |
204 | | TokenizerBase(const TokenizerBase&) = delete; |
205 | | TokenizerBase(TokenizerBase&&) = delete; |
206 | | TokenizerBase(const TokenizerBase&&) = delete; |
207 | | TokenizerBase &operator=(const TokenizerBase&) = delete; |
208 | | }; |
209 | | |
210 | | /** |
211 | | * This is a simple implementation of a lexical analyzer or maybe better |
212 | | * called a tokenizer. |
213 | | * |
214 | | * Please use Tokenizer or Tokenizer16 classes, that are specializations |
215 | | * of this template class. Tokenizer is for ASCII input; Tokenizer16 can |
216 | | * handle char16_t input, but doesn't recognize any whitespaces or numbers |
217 | | * beyond those recognized by the `char`-specialized Tokenizer class. |
218 | | */ |
219 | | template <typename TChar> |
220 | | class TTokenizer : public TokenizerBase<TChar> |
221 | | { |
222 | | public: |
223 | | typedef TokenizerBase<TChar> base; |
224 | | |
225 | | /** |
226 | | * @param aSource |
227 | | * The string to parse. |
228 | | * IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer |
229 | | * lifetime. It's up to the consumer to make sure the string's buffer outlives |
230 | | * the TTokenizer! |
231 | | * @param aWhitespaces |
232 | | * If non-null TTokenizer will use this custom set of whitespaces for |
233 | | * CheckWhite() and SkipWhites() calls. By default the list consists of space |
234 | | * and tab. |
235 | | * @param aAdditionalWordChars |
236 | | * If non-null it will be added to the list of characters that constitute a |
237 | | * word. This is useful when you want to accept e.g. '-' in HTTP headers. By |
238 | | * default a word character is considered to be any character for which upper case |
239 | | * is different from lower case. |
240 | | * |
241 | | * If there is an overlap between aWhitespaces and aAdditionalWordChars, the |
242 | | * check for word characters is made first. |
243 | | */ |
244 | | explicit TTokenizer(const typename base::TAString& aSource, |
245 | | const TChar* aWhitespaces = nullptr, |
246 | | const TChar* aAdditionalWordChars = nullptr); |
247 | | explicit TTokenizer(const TChar* aSource, |
248 | | const TChar* aWhitespaces = nullptr, |
249 | | const TChar* aAdditionalWordChars = nullptr); |
250 | | |
251 | | /** |
252 | | * When there is still anything to read from the input, tokenize it, store the token type |
253 | | * and value to aToken result and shift the cursor past this just parsed token. Each call |
254 | | * to Next() reads another token from the input and shifts the cursor. |
255 | | * Returns false if we have passed the end of the input. |
256 | | */ |
257 | | MOZ_MUST_USE |
258 | | bool Next(typename base::Token& aToken); |
259 | | |
260 | | /** |
261 | | * Parse the token on the input read cursor position, check its type is equal to aTokenType |
262 | | * and if so, put it into aResult, shift the cursor and return true. Otherwise, leave |
263 | | * the input read cursor position intact and return false. |
264 | | */ |
265 | | MOZ_MUST_USE |
266 | | bool Check(const typename base::TokenType aTokenType, typename base::Token& aResult); |
267 | | /** |
268 | | * Same as above method, just compares both token type and token value passed in aToken. |
269 | | * When both the type and the value equals, shift the cursor and return true. Otherwise |
270 | | * return false. |
271 | | */ |
272 | | MOZ_MUST_USE |
273 | | bool Check(const typename base::Token& aToken); |
274 | | |
275 | | /** |
276 | | * SkipWhites method (below) may also skip new line characters automatically. |
277 | | */ |
278 | | enum WhiteSkipping { |
279 | | /** |
280 | | * SkipWhites will only skip what is defined as a white space (default). |
281 | | */ |
282 | | DONT_INCLUDE_NEW_LINE = 0, |
283 | | /** |
284 | | * SkipWhites will skip defined white spaces as well as new lines |
285 | | * automatically. |
286 | | */ |
287 | | INCLUDE_NEW_LINE = 1 |
288 | | }; |
289 | | |
290 | | /** |
291 | | * Skips any occurrence of whitespaces specified in mWhitespaces member, |
292 | | * optionally also skips new lines. |
293 | | */ |
294 | | void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE); |
295 | | |
296 | | /** |
297 | | * Skips all tokens until the given one is found or EOF is hit. The token |
298 | | * or EOF are next to read. |
299 | | */ |
300 | | void SkipUntil(typename base::Token const& aToken); |
301 | | |
302 | | // These are mostly shortcuts for the Check() methods above. |
303 | | |
304 | | /** |
305 | | * Check whitespace character is present. |
306 | | */ |
307 | | MOZ_MUST_USE |
308 | 4.66M | bool CheckWhite() { return Check(base::Token::Whitespace()); } mozilla::TTokenizer<char>::CheckWhite() Line | Count | Source | 308 | 4.66M | bool CheckWhite() { return Check(base::Token::Whitespace()); } |
Unexecuted instantiation: mozilla::TTokenizer<char16_t>::CheckWhite() |
309 | | /** |
310 | | * Check there is a single character on the read cursor position. If so, shift the read |
311 | | * cursor position and return true. Otherwise false. |
312 | | */ |
313 | | MOZ_MUST_USE |
314 | 0 | bool CheckChar(const TChar aChar) { return Check(base::Token::Char(aChar)); } |
315 | | /** |
316 | | * This is a customizable version of CheckChar. aClassifier is a function called with |
317 | | * value of the character on the current input read position. If this user function |
318 | | * returns true, read cursor is shifted and true returned. Otherwise false. |
319 | | * The user classification function is not called when we are at or past the end and |
320 | | * false is immediately returned. |
321 | | */ |
322 | | MOZ_MUST_USE |
323 | | bool CheckChar(bool (*aClassifier)(const TChar aChar)); |
324 | | /** |
325 | | * Check for a whole expected word. |
326 | | */ |
327 | | MOZ_MUST_USE |
328 | 0 | bool CheckWord(const typename base::TAString& aWord) { |
329 | 0 | return Check(base::Token::Word(aWord)); |
330 | 0 | } |
331 | | /** |
332 | | * Shortcut for literal const word check with compile time length calculation. |
333 | | */ |
334 | | template <uint32_t N> |
335 | | MOZ_MUST_USE |
336 | | bool CheckWord(const TChar (&aWord)[N]) { |
337 | | return Check(base::Token::Word(typename base::TDependentString(aWord, N - 1))); |
338 | | } |
339 | | /** |
340 | | * Checks \r, \n or \r\n. |
341 | | */ |
342 | | MOZ_MUST_USE |
343 | 0 | bool CheckEOL() { return Check(base::Token::NewLine()); } Unexecuted instantiation: mozilla::TTokenizer<char>::CheckEOL() Unexecuted instantiation: mozilla::TTokenizer<char16_t>::CheckEOL() |
344 | | /** |
345 | | * Checks we are at the end of the input string reading. If so, shift past the end |
346 | | * and returns true. Otherwise does nothing and returns false. |
347 | | */ |
348 | | MOZ_MUST_USE |
349 | 0 | bool CheckEOF() { return Check(base::Token::EndOfFile()); } |
350 | | |
351 | | /** |
352 | | * These are shortcuts to obtain the value immediately when the token type matches. |
353 | | */ |
354 | | MOZ_MUST_USE bool ReadChar(TChar* aValue); |
355 | | MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const TChar aChar), |
356 | | TChar* aValue); |
357 | | MOZ_MUST_USE bool ReadWord(typename base::TAString& aValue); |
358 | | MOZ_MUST_USE bool ReadWord(typename base::TDependentSubstring& aValue); |
359 | | |
360 | | /** |
361 | | * This is an integer read helper. It returns false and doesn't move the read |
362 | | * cursor when any of the following happens: |
363 | | * - the token at the read cursor is not an integer |
364 | | * - the final number doesn't fit the T type |
365 | | * Otherwise true is returned, aValue is filled with the integral number |
366 | | * and the cursor is moved forward. |
367 | | */ |
368 | | template <typename T> |
369 | | MOZ_MUST_USE bool ReadInteger(T *aValue) |
370 | | { |
371 | | MOZ_RELEASE_ASSERT(aValue); |
372 | | |
373 | | typename base::TAString::const_char_iterator rollback = mRollback; |
374 | | typename base::TAString::const_char_iterator cursor = base::mCursor; |
375 | | typename base::Token t; |
376 | | if (!Check(base::TOKEN_INTEGER, t)) { |
377 | | return false; |
378 | | } |
379 | | |
380 | | mozilla::CheckedInt<T> checked(t.AsInteger()); |
381 | | if (!checked.isValid()) { |
382 | | // Move to a state as if Check() call has failed |
383 | | mRollback = rollback; |
384 | | base::mCursor = cursor; |
385 | | base::mHasFailed = true; |
386 | | return false; |
387 | | } |
388 | | |
389 | | *aValue = checked.value(); |
390 | | return true; |
391 | | } |
392 | | |
393 | | /** |
394 | | * Same as above, but accepts an integer with an optional minus sign. |
395 | | */ |
396 | | template <typename T, |
397 | | typename V = typename EnableIf<IsSigned<typename RemovePointer<T>::Type>::value, |
398 | | typename RemovePointer<T>::Type>::Type> |
399 | | MOZ_MUST_USE bool ReadSignedInteger(T *aValue) |
400 | | { |
401 | | MOZ_RELEASE_ASSERT(aValue); |
402 | | |
403 | | typename base::TAString::const_char_iterator rollback = mRollback; |
404 | | typename base::TAString::const_char_iterator cursor = base::mCursor; |
405 | | auto revert = MakeScopeExit([&] { |
406 | | // Move to a state as if Check() call has failed |
407 | | mRollback = rollback; |
408 | | base::mCursor = cursor; |
409 | | base::mHasFailed = true; |
410 | | }); |
411 | | |
412 | | // Using functional raw access because '-' could be part of the word set |
413 | | // making CheckChar('-') not work. |
414 | | bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; }); |
415 | | |
416 | | typename base::Token t; |
417 | | if (!Check(base::TOKEN_INTEGER, t)) { |
418 | | return false; |
419 | | } |
420 | | |
421 | | mozilla::CheckedInt<T> checked(t.AsInteger()); |
422 | | if (minus) { |
423 | | checked *= -1; |
424 | | } |
425 | | |
426 | | if (!checked.isValid()) { |
427 | | return false; |
428 | | } |
429 | | |
430 | | *aValue = checked.value(); |
431 | | revert.release(); |
432 | | return true; |
433 | | } |
434 | | |
435 | | /** |
436 | | * Returns the read cursor position back as it was before the last call of any parsing |
437 | | * method of TTokenizer (Next, Check*, Skip*, Read*) so that the last operation |
438 | | * can be repeated. |
439 | | * Rollback cannot be used multiple times, it only reverts the last successful parse |
440 | | * operation. It also cannot be used before any parsing operation has been called |
441 | | * on the TTokenizer. |
442 | | */ |
443 | | void Rollback(); |
444 | | |
445 | | /** |
446 | | * Record() and Claim() are collecting the input as it is being parsed to obtain |
447 | | * a substring between particular syntax boundaries defined by any recursive |
448 | | * descent parser or simple parser the TTokenizer is used to read the input for. |
449 | | * Inclusion of a token that has just been parsed can be controlled using an argument. |
450 | | */ |
451 | | enum ClaimInclusion { |
452 | | /** |
453 | | * Include resulting (or passed) token of the last lexical analyzer operation in the result. |
454 | | */ |
455 | | INCLUDE_LAST, |
456 | | /** |
457 | | * Do not include it. |
458 | | */ |
459 | | EXCLUDE_LAST |
460 | | }; |
461 | | |
462 | | /** |
463 | | * Start the process of recording. Based on aInclude value the beginning of the recorded |
464 | | * sub-string is at the current position (EXCLUDE_LAST) or at the position before the last |
465 | | * parsed token (INCLUDE_LAST). |
466 | | */ |
467 | | void Record(ClaimInclusion aInclude = EXCLUDE_LAST); |
468 | | /** |
469 | | * Claim result of the record started with Record() call before. Depending on aInclude |
470 | | * the ending of the sub-string result includes or excludes the last parsed or checked |
471 | | * token. |
472 | | */ |
473 | | void Claim(typename base::TAString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST); |
474 | | void Claim(typename base::TDependentSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST); |
475 | | |
476 | | /** |
477 | | * If aToken is found, aResult is set to the substring between the current |
478 | | * position and the position of aToken, potentially including aToken depending |
479 | | * on aInclude. |
480 | | * If aToken isn't found aResult is set to the substring between the current |
481 | | * position and the end of the string. |
482 | | * If aToken is found, the method returns true. Otherwise it returns false. |
483 | | * |
484 | | * Calling Rollback() after ReadUntil() will return the read cursor to the |
485 | | * position it had before ReadUntil was called. |
486 | | */ |
487 | | MOZ_MUST_USE bool ReadUntil(typename base::Token const& aToken, typename base::TDependentSubstring& aResult, |
488 | | ClaimInclusion aInclude = EXCLUDE_LAST); |
489 | | MOZ_MUST_USE bool ReadUntil(typename base::Token const& aToken, typename base::TAString& aResult, |
490 | | ClaimInclusion aInclude = EXCLUDE_LAST); |
491 | | |
492 | | protected: |
493 | | // All these point to the original buffer passed to the TTokenizer's constructor |
494 | | typename base::TAString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is |
495 | | typename base::TAString::const_char_iterator mRollback; // Position of the previous token start |
496 | | |
497 | | private: |
498 | | TTokenizer() = delete; |
499 | | TTokenizer(const TTokenizer&) = delete; |
500 | | TTokenizer(TTokenizer&&) = delete; |
501 | | TTokenizer(const TTokenizer&&) = delete; |
502 | | TTokenizer &operator=(const TTokenizer&) = delete; |
503 | | }; |
504 | | |
505 | | typedef TTokenizer<char> Tokenizer; |
506 | | typedef TTokenizer<char16_t> Tokenizer16; |
507 | | |
508 | | } // mozilla |
509 | | |
510 | | #endif // Tokenizer_h__ |