/src/aspell/modules/tokenizer/basic.cpp
Line | Count | Source |
1 | | |
2 | | // This file is part of The New Aspell |
3 | | // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license |
4 | | // version 2.0 or 2.1. You should have received a copy of the LGPL |
5 | | // license along with this library if you did not you can find |
6 | | // it at http://www.gnu.org/. |
7 | | |
8 | | #include "tokenizer.hpp" |
9 | | #include "convert.hpp" |
10 | | #include "speller.hpp" |
11 | | |
12 | | |
13 | | namespace acommon { |
14 | | |
15 | | class TokenizerBasic : public Tokenizer |
16 | | { |
17 | | public: |
18 | | bool advance(); |
19 | | }; |
20 | | |
21 | 37.6k | bool TokenizerBasic::advance() { |
22 | 37.6k | word_begin = word_end; |
23 | 37.6k | begin_pos = end_pos; |
24 | 37.6k | FilterChar * cur = word_begin; |
25 | 37.6k | unsigned int cur_pos = begin_pos; |
26 | 37.6k | word.clear(); |
27 | | |
28 | | // skip spaces (non-word characters) |
29 | 2.46M | while (*cur != 0 && |
30 | 2.46M | !(is_word(*cur) |
31 | 2.42M | || (is_begin(*cur) && is_word(cur[1])))) |
32 | 2.42M | { |
33 | 2.42M | cur_pos += cur->width; |
34 | 2.42M | ++cur; |
35 | 2.42M | } |
36 | | |
37 | 37.6k | if (*cur == 0) return false; |
38 | | |
39 | 36.5k | word_begin = cur; |
40 | 36.5k | begin_pos = cur_pos; |
41 | | |
42 | 36.5k | if (is_begin(*cur) && is_word(cur[1])) |
43 | 14 | { |
44 | 14 | cur_pos += cur->width; |
45 | 14 | ++cur; |
46 | 14 | } |
47 | | |
48 | 14.3M | while (is_word(*cur) || |
49 | 380k | (is_middle(*cur) && |
50 | 344k | cur > word_begin && is_word(cur[-1]) && |
51 | 344k | is_word(cur[1]) )) |
52 | 14.3M | { |
53 | 14.3M | word.append(*cur); |
54 | 14.3M | cur_pos += cur->width; |
55 | 14.3M | ++cur; |
56 | 14.3M | } |
57 | | |
58 | 36.5k | if (is_end(*cur)) |
59 | 1.02k | { |
60 | 1.02k | word.append(*cur); |
61 | 1.02k | cur_pos += cur->width; |
62 | 1.02k | ++cur; |
63 | 1.02k | } |
64 | | |
65 | 36.5k | word.append('\0'); |
66 | 36.5k | word_end = cur; |
67 | 36.5k | end_pos = cur_pos; |
68 | | |
69 | 36.5k | return true; |
70 | 37.6k | } |
71 | | #undef increment__ |
72 | | |
73 | | PosibErr<Tokenizer *> new_tokenizer(Speller * speller) |
74 | 1.09k | { |
75 | 1.09k | Tokenizer * tok = new TokenizerBasic(); |
76 | 1.09k | speller->setup_tokenizer(tok); |
77 | 1.09k | return tok; |
78 | 1.09k | } |
79 | | |
80 | | } |