/src/aspell/common/tokenizer.hpp
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of The New Aspell |
2 | | // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license |
3 | | // version 2.0 or 2.1. You should have received a copy of the LGPL |
4 | | // license along with this library if you did not you can find |
5 | | // it at http://www.gnu.org/. |
6 | | |
7 | | #ifndef ACOMMON_TOKENIZER__HPP |
8 | | #define ACOMMON_TOKENIZER__HPP |
9 | | |
10 | | #include "char_vector.hpp" |
11 | | #include "filter_char.hpp" |
12 | | #include "filter_char_vector.hpp" |
13 | | |
14 | | namespace acommon { |
15 | | |
16 | | class Convert; |
17 | | class Speller; |
18 | | class Config; |
19 | | |
20 | | class Tokenizer { |
21 | | |
22 | | public: |
23 | | Tokenizer(); |
24 | | virtual ~Tokenizer(); |
25 | | |
26 | | FilterChar * word_begin; |
27 | | FilterChar * word_end; |
28 | | FilterChar * end; |
29 | | |
30 | | CharVector word; // the current word, held in its final encoded form |
31 | | unsigned int begin_pos; // begin_pos/end_pos are unsigned positions (not raw pointers) referring back to the original word |
32 | | unsigned int end_pos; |
33 | | |
34 | | // The input handed to reset() _must_ have a null character |
35 | | // at stop - 1 (i.e. stop must be one past the end). |
36 | | void reset (FilterChar * in, FilterChar * stop); |
37 | 0 | bool at_end() const {return word_begin == word_end;} |
38 | | |
39 | | virtual bool advance() = 0; // pure virtual; returns false if there is nothing left |
40 | | |
41 | | bool is_begin(unsigned char c) const |
42 | 2.58M | {return char_type_[c].begin;} |
43 | | bool is_middle(unsigned char c) const |
44 | 37.3k | {return char_type_[c].middle;} |
45 | | bool is_end(unsigned char c) const |
46 | 27.6k | {return char_type_[c].end;} |
47 | | bool is_word(unsigned char c) const |
48 | 19.0M | {return char_type_[c].word;} |
49 | | |
50 | | public: // public, but don't use directly |
51 | | // The speller class is expected to fill in the members of this section. |
52 | | struct CharType { |
53 | | bool begin; |
54 | | bool middle; |
55 | | bool end; |
56 | | bool word; |
57 | 178k | CharType() : begin(false), middle(false), end(false), word(false) {} |
58 | | }; |
59 | | |
60 | | CharType char_type_[256]; |
61 | | Convert * conv_; |
62 | | FilterCharVector buf_; |
63 | | }; |
64 | | |
65 | | // Returns a new tokenizer, set up with the given speller |
66 | | // class; the Tokenizer pointer is delivered wrapped in a PosibErr. |
67 | | |
68 | | PosibErr<Tokenizer *> new_tokenizer(Speller *); |
69 | | |
70 | | } |
71 | | |
72 | | #endif |