/src/hunspell/src/parsers/textparser.cxx
Line | Count | Source |
1 | | /* ***** BEGIN LICENSE BLOCK ***** |
2 | | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
3 | | * |
4 | | * Copyright (C) 2002-2022 Németh László |
5 | | * |
6 | | * The contents of this file are subject to the Mozilla Public License Version |
7 | | * 1.1 (the "License"); you may not use this file except in compliance with |
8 | | * the License. You may obtain a copy of the License at |
9 | | * http://www.mozilla.org/MPL/ |
10 | | * |
11 | | * Software distributed under the License is distributed on an "AS IS" basis, |
12 | | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
13 | | * for the specific language governing rights and limitations under the |
14 | | * License. |
15 | | * |
16 | | * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks. |
17 | | * |
18 | | * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
19 | | * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
20 | | * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
21 | | * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
22 | | * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
23 | | * |
24 | | * Alternatively, the contents of this file may be used under the terms of |
25 | | * either the GNU General Public License Version 2 or later (the "GPL"), or |
26 | | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
27 | | * in which case the provisions of the GPL or the LGPL are applicable instead |
28 | | * of those above. If you wish to allow use of your version of this file only |
29 | | * under the terms of either the GPL or the LGPL, and not to allow others to |
30 | | * use your version of this file under the terms of the MPL, indicate your |
31 | | * decision by deleting the provisions above and replace them with the notice |
32 | | * and other provisions required by the GPL or the LGPL. If you do not delete |
33 | | * the provisions above, a recipient may use your version of this file under |
34 | | * the terms of any one of the MPL, the GPL or the LGPL. |
35 | | * |
36 | | * ***** END LICENSE BLOCK ***** */ |
37 | | |
38 | | #include <cstdlib> |
39 | | #include <cstring> |
40 | | #include <cstdio> |
41 | | #include <ctype.h> |
42 | | |
43 | | #include "../hunspell/csutil.hxx" |
44 | | #include "textparser.hxx" |
45 | | |
46 | | #include <algorithm> |
47 | | |
48 | | #ifndef W32 |
49 | | using namespace std; |
50 | | #endif |
51 | | |
// ISO-8859-1 HTML character entities recognised inside words.
// Every entry must start with '&': get_latin1() only searches this table
// when the input begins with an ampersand, and it compares the full
// spelling of the entity (including the closing ';') as a prefix.

static const char* LATIN1[] = {
    "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",  "&Egrave;", "&Ecirc;",
    "&Igrave;", "&Iuml;",   "&ETH;",    "&Ntilde;", "&Ograve;", "&Oslash;",
    "&Ugrave;", "&THORN;",  "&agrave;", "&atilde;", "&aring;",  "&aelig;",
    "&egrave;", "&ecirc;",  "&igrave;", "&iuml;",   "&eth;",    "&ntilde;",
    "&ograve;", "&oslash;", "&ugrave;", "&thorn;",  "&yuml;"};
60 | | |
// Number of entries in the LATIN1 entity table.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))

// Spellings of the apostrophe: as an HTML entity, as the UTF-8 encoding of
// U+2019 (right single quotation mark), and as the plain ASCII character.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
66 | | |
67 | 908 | TextParser::TextParser(const char* wordchars) { |
68 | 908 | init(wordchars); |
69 | 908 | } |
70 | | |
71 | 0 | TextParser::TextParser(const w_char* wordchars, int len) { |
72 | 0 | init(wordchars, len); |
73 | 0 | } |
74 | | |
75 | 908 | TextParser::~TextParser() = default; |
76 | | |
77 | 1.67G | int TextParser::is_wordchar(const char* w) { |
78 | 1.67G | if (*w == '\0') |
79 | 603k | return 0; |
80 | 1.67G | size_t cache_index = (*w + 256) % 256; |
81 | 1.67G | if (utf8) { |
82 | 0 | const bool use_cache = cache_index < 0x80; |
83 | 0 | if (use_cache) |
84 | 0 | return wordcharacters[cache_index]; |
85 | 0 | if (u8_u16(wc, w, true) < 1) |
86 | 0 | return 0; |
87 | 0 | unsigned short idx = (unsigned short)wc[0]; |
88 | 0 | return unicodeisalpha(idx) || |
89 | 0 | (wordchars_utf16 && |
90 | 0 | std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])); |
91 | 1.67G | } else { |
92 | 1.67G | return wordcharacters[cache_index]; |
93 | 1.67G | } |
94 | 1.67G | } |
95 | | |
96 | 14.5M | const char* TextParser::get_latin1(const char* s) { |
97 | 14.5M | if (s[0] == '&') { |
98 | 126k | unsigned int i = 0; |
99 | 3.47M | while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])) != 0) |
100 | 3.34M | i++; |
101 | 126k | if (i != LATIN1_LEN) |
102 | 26.8k | return LATIN1[i]; |
103 | 126k | } |
104 | 14.4M | return nullptr; |
105 | 14.5M | } |
106 | | |
107 | 908 | void TextParser::init(const char* wordchars) { |
108 | 908 | actual = 0; |
109 | 908 | head = 0; |
110 | 908 | token = 0; |
111 | 908 | state = 0; |
112 | 908 | utf8 = 0; |
113 | 908 | checkurl = 0; |
114 | 908 | wordchars_utf16 = nullptr; |
115 | 908 | wclen = 0; |
116 | 908 | wordcharacters.resize(256, 0); |
117 | 908 | if (!wordchars) |
118 | 442 | wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"; |
119 | 48.5k | for (unsigned int j = 0; j < strlen(wordchars); ++j) { |
120 | 47.6k | wordcharacters[(wordchars[j] + 256) % 256] = 1; |
121 | 47.6k | } |
122 | 908 | } |
123 | | |
124 | 0 | void TextParser::init(const w_char* wc_utf8, int len) { |
125 | 0 | actual = 0; |
126 | 0 | head = 0; |
127 | 0 | token = 0; |
128 | 0 | state = 0; |
129 | 0 | utf8 = 1; |
130 | 0 | checkurl = 0; |
131 | 0 | wordchars_utf16 = wc_utf8; |
132 | 0 | wclen = len; |
133 | | |
134 | | // build a cache for the simple cases |
135 | 0 | wordcharacters.resize(0x80); |
136 | 0 | w_char wc2; |
137 | 0 | wc2.h = 0; |
138 | 0 | for (unsigned char idx = 0; idx < 0x80; ++idx) { |
139 | 0 | wc2.l = idx; |
140 | 0 | int cache = unicodeisalpha(idx) || |
141 | 0 | (wordchars_utf16 && |
142 | 0 | std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc2)); |
143 | 0 | wordcharacters[idx] = cache; |
144 | 0 | } |
145 | 0 | } |
146 | | |
147 | 1.67G | int TextParser::next_char(const char* ln, size_t* pos) { |
148 | 1.67G | if (*(ln + *pos) == '\0') |
149 | 405k | return 1; |
150 | 1.67G | if (utf8) { |
151 | 0 | if (*(ln + *pos) >> 7) { |
152 | | // jump to next UTF-8 character |
153 | 0 | for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++) |
154 | 0 | ; |
155 | 0 | } else { |
156 | 0 | (*pos)++; |
157 | 0 | } |
158 | 0 | } else |
159 | 1.67G | (*pos)++; |
160 | 1.67G | return 0; |
161 | 1.67G | } |
162 | | |
163 | 908 | void TextParser::put_line(const char* word) { |
164 | 908 | actual = (actual + 1) % MAXPREVLINE; |
165 | 908 | line[actual].assign(word); |
166 | 908 | token = 0; |
167 | 908 | head = 0; |
168 | 908 | check_urls(); |
169 | 908 | } |
170 | | |
171 | 404k | std::string TextParser::get_prevline(int n) const { |
172 | 404k | return line[(actual + MAXPREVLINE - n) % MAXPREVLINE]; |
173 | 404k | } |
174 | | |
175 | 404k | std::string TextParser::get_line() const { |
176 | 404k | return get_prevline(0); |
177 | 404k | } |
178 | | |
179 | 1.25M | bool TextParser::next_token(std::string &t) { |
180 | 1.25M | const char* latin1; |
181 | | |
182 | 15.7M | for (;;) { |
183 | 15.7M | switch (state) { |
184 | 4.47M | case 0: // non word chars |
185 | 4.47M | if (is_wordchar(line[actual].c_str() + head)) { |
186 | 1.26M | state = 1; |
187 | 1.26M | token = head; |
188 | 3.20M | } else if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
189 | 4.81k | state = 1; |
190 | 4.81k | token = head; |
191 | 4.81k | head += strlen(latin1); |
192 | 4.81k | } |
193 | 4.47M | break; |
194 | 11.2M | case 1: // wordchar |
195 | 11.2M | if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
196 | 22.0k | head += strlen(latin1); |
197 | 11.2M | } else if ((is_wordchar(APOSTROPHE) || |
198 | 5.14M | (is_utf8() && is_wordchar(UTF8_APOS))) && |
199 | 6.13M | !line[actual].empty() && line[actual][head] == '\'' && |
200 | 44.2k | is_wordchar(line[actual].c_str() + head + 1)) { |
201 | 14.1k | head++; |
202 | 11.2M | } else if (is_utf8() && |
203 | 0 | is_wordchar(APOSTROPHE) && // add Unicode apostrophe |
204 | | // to the WORDCHARS, if |
205 | | // needed |
206 | 0 | strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) == |
207 | 0 | 0 && |
208 | 0 | is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) { |
209 | 0 | head += strlen(UTF8_APOS) - 1; |
210 | 11.2M | } else if (!is_wordchar(line[actual].c_str() + head)) { |
211 | 1.26M | state = 0; |
212 | 1.26M | if (alloc_token(token, &head, t)) |
213 | 1.25M | return true; |
214 | 1.26M | } |
215 | 10.0M | break; |
216 | 15.7M | } |
217 | 14.5M | if (next_char(line[actual].c_str(), &head)) |
218 | 632 | return false; |
219 | 14.5M | } |
220 | 1.25M | } |
221 | | |
222 | 0 | size_t TextParser::get_tokenpos() { |
223 | 0 | return token; |
224 | 0 | } |
225 | | |
226 | 403k | int TextParser::change_token(const char* word) { |
227 | 403k | if (word) { |
228 | 403k | std::string remainder(line[actual].substr(head)); |
229 | 403k | line[actual].resize(token); |
230 | 403k | line[actual].append(word); |
231 | 403k | line[actual].append(remainder); |
232 | 403k | head = token; |
233 | 403k | check_urls(); |
234 | 403k | return 1; |
235 | 403k | } |
236 | 0 | return 0; |
237 | 403k | } |
238 | | |
239 | 0 | std::string TextParser::get_word(const std::string &tok) { |
240 | 0 | return tok; |
241 | 0 | } |
242 | | |
243 | 404k | void TextParser::check_urls() { |
244 | 404k | urlline.resize(line[actual].size() + 1); |
245 | 404k | int url_state = 0; |
246 | 404k | size_t url_head = 0; |
247 | 404k | size_t url_token = 0; |
248 | 404k | int url = 0; |
249 | 1.66G | for (;;) { |
250 | 1.66G | switch (url_state) { |
251 | 503M | case 0: // non word chars |
252 | 503M | if (is_wordchar(line[actual].c_str() + url_head)) { |
253 | 296M | url_state = 1; |
254 | 296M | url_token = url_head; |
255 | | // Unix path |
256 | 296M | } else if (line[actual][url_head] == '/') { |
257 | 14.7M | url_state = 1; |
258 | 14.7M | url_token = url_head; |
259 | 14.7M | url = 1; |
260 | 14.7M | } |
261 | 503M | break; |
262 | 1.16G | case 1: // wordchar |
263 | 1.16G | char ch = line[actual][url_head]; |
264 | | // e-mail address |
265 | 1.16G | if ((ch == '@') || |
266 | | // MS-DOS, Windows path |
267 | 1.15G | (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) || |
268 | | // URL |
269 | 1.15G | (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) { |
270 | 10.5M | url = 1; |
271 | 1.14G | } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') || |
272 | 771M | (ch == '_') || (ch == '\\') || (ch == '.') || |
273 | 715M | (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') || |
274 | 567M | (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') || |
275 | 475M | (ch == '?') || (ch == '!') || |
276 | 437M | ((ch >= '0') && (ch <= '9')))) { |
277 | 311M | url_state = 0; |
278 | 311M | if (url == 1) { |
279 | 135M | for (size_t i = url_token; i < url_head; ++i) { |
280 | 110M | urlline[i] = true; |
281 | 110M | } |
282 | 24.3M | } |
283 | 311M | url = 0; |
284 | 311M | } |
285 | 1.16G | break; |
286 | 1.66G | } |
287 | 1.66G | urlline[url_head] = false; |
288 | 1.66G | if (next_char(line[actual].c_str(), &url_head)) |
289 | 404k | return; |
290 | 1.66G | } |
291 | 404k | } |
292 | | |
293 | 1.26M | int TextParser::get_url(size_t token_pos, size_t* hd) { |
294 | 1.91M | for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++) |
295 | 647k | ; |
296 | 1.26M | return checkurl ? 0 : urlline[token_pos]; |
297 | 1.26M | } |
298 | | |
299 | 908 | void TextParser::set_url_checking(int check) { |
300 | 908 | checkurl = check; |
301 | 908 | } |
302 | | |
303 | 1.26M | bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) { |
304 | 1.26M | if (get_url(tokn, hd)) |
305 | 15.9k | return false; |
306 | 1.25M | t = line[actual].substr(tokn, *hd - tokn); |
307 | | // remove colon for Finnish and Swedish language |
308 | 1.25M | if (!t.empty() && t[t.size() - 1] == ':') { |
309 | 22.1k | t.resize(t.size() - 1); |
310 | 22.1k | if (t.empty()) { |
311 | 0 | return false; |
312 | 0 | } |
313 | 22.1k | } |
314 | 1.25M | return true; |
315 | 1.25M | } |