Coverage Report

Created: 2026-06-15 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/hunspell/src/parsers/textparser.cxx
Line
Count
Source
1
/* ***** BEGIN LICENSE BLOCK *****
2
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3
 *
4
 * Copyright (C) 2002-2022 Németh László
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version
7
 * 1.1 (the "License"); you may not use this file except in compliance with
8
 * the License. You may obtain a copy of the License at
9
 * http://www.mozilla.org/MPL/
10
 *
11
 * Software distributed under the License is distributed on an "AS IS" basis,
12
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13
 * for the specific language governing rights and limitations under the
14
 * License.
15
 *
16
 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17
 *
18
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23
 *
24
 * Alternatively, the contents of this file may be used under the terms of
25
 * either the GNU General Public License Version 2 or later (the "GPL"), or
26
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
 * in which case the provisions of the GPL or the LGPL are applicable instead
28
 * of those above. If you wish to allow use of your version of this file only
29
 * under the terms of either the GPL or the LGPL, and not to allow others to
30
 * use your version of this file under the terms of the MPL, indicate your
31
 * decision by deleting the provisions above and replace them with the notice
32
 * and other provisions required by the GPL or the LGPL. If you do not delete
33
 * the provisions above, a recipient may use your version of this file under
34
 * the terms of any one of the MPL, the GPL or the LGPL.
35
 *
36
 * ***** END LICENSE BLOCK ***** */
37
38
#include <cstdlib>
39
#include <cstring>
40
#include <cstdio>
41
#include <ctype.h>
42
43
#include "../hunspell/csutil.hxx"
44
#include "textparser.hxx"
45
46
#include <algorithm>
47
48
#ifndef W32
49
using namespace std;
50
#endif
51
52
// ISO-8859-1 HTML character entities
53
54
// Table of named HTML entities that get_latin1() matches by prefix.
// Both the pointer slots and the text they point to are immutable.
static const char* const LATIN1[] = {
    "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",  "&Egrave;", "&Ecirc;",
    "&Igrave;", "&Iuml;",   "&ETH;",    "&Ntilde;", "&Ograve;", "&Oslash;",
    "&Ugrave;", "&THORN;",  "&agrave;", "&atilde;", "&aring;",  "&aelig;",
    "&egrave;", "&ecirc;",  "&igrave;", "&iuml;",   "&eth;",    "&ntilde;",
    "&ograve;", "&oslash;", "&ugrave;", "&thorn;",  "&yuml;"};

// Number of entries in LATIN1 (derived from the element size so it stays
// correct if the element type ever changes).
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(LATIN1[0]))

// Apostrophe spellings: HTML entity, UTF-8 encoded U+2019, and plain ASCII.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
66
67
908
// Construct a parser from an 8-bit word-character list; all state is set
// up by init(const char*).
TextParser::TextParser(const char* wordchars) {
  this->init(wordchars);
}
70
71
0
// Construct a parser from a w_char word-character array of `len` entries;
// all state is set up by init(const w_char*, int).
TextParser::TextParser(const w_char* wordchars, int len) {
  this->init(wordchars, len);
}
74
75
908
// Compiler-generated destructor: no explicit cleanup is performed here.
TextParser::~TextParser() = default;
76
77
1.67G
int TextParser::is_wordchar(const char* w) {
78
1.67G
  if (*w == '\0')
79
603k
    return 0;
80
1.67G
  size_t cache_index = (*w + 256) % 256;
81
1.67G
  if (utf8) {
82
0
    const bool use_cache = cache_index < 0x80;
83
0
    if (use_cache)
84
0
      return wordcharacters[cache_index];
85
0
    if (u8_u16(wc, w, true) < 1)
86
0
        return 0;
87
0
    unsigned short idx = (unsigned short)wc[0];
88
0
    return unicodeisalpha(idx) ||
89
0
           (wordchars_utf16 &&
90
0
            std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0]));
91
1.67G
  } else {
92
1.67G
    return wordcharacters[cache_index];
93
1.67G
  }
94
1.67G
}
95
96
14.5M
const char* TextParser::get_latin1(const char* s) {
97
14.5M
  if (s[0] == '&') {
98
126k
    unsigned int i = 0;
99
3.47M
    while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])) != 0)
100
3.34M
      i++;
101
126k
    if (i != LATIN1_LEN)
102
26.8k
      return LATIN1[i];
103
126k
  }
104
14.4M
  return nullptr;
105
14.5M
}
106
107
908
void TextParser::init(const char* wordchars) {
108
908
  actual = 0;
109
908
  head = 0;
110
908
  token = 0;
111
908
  state = 0;
112
908
  utf8 = 0;
113
908
  checkurl = 0;
114
908
  wordchars_utf16 = nullptr;
115
908
  wclen = 0;
116
908
  wordcharacters.resize(256, 0);
117
908
  if (!wordchars)
118
442
    wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
119
48.5k
  for (unsigned int j = 0; j < strlen(wordchars); ++j) {
120
47.6k
    wordcharacters[(wordchars[j] + 256) % 256] = 1;
121
47.6k
  }
122
908
}
123
124
0
void TextParser::init(const w_char* wc_utf8, int len) {
125
0
  actual = 0;
126
0
  head = 0;
127
0
  token = 0;
128
0
  state = 0;
129
0
  utf8 = 1;
130
0
  checkurl = 0;
131
0
  wordchars_utf16 = wc_utf8;
132
0
  wclen = len;
133
134
  // build a cache for the simple cases
135
0
  wordcharacters.resize(0x80);
136
0
  w_char wc2;
137
0
  wc2.h = 0;
138
0
  for (unsigned char idx = 0; idx < 0x80; ++idx) {
139
0
    wc2.l = idx;
140
0
    int cache = unicodeisalpha(idx) ||
141
0
                (wordchars_utf16 &&
142
0
                 std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc2));
143
0
    wordcharacters[idx] = cache;
144
0
  }
145
0
}
146
147
1.67G
int TextParser::next_char(const char* ln, size_t* pos) {
148
1.67G
  if (*(ln + *pos) == '\0')
149
405k
    return 1;
150
1.67G
  if (utf8) {
151
0
    if (*(ln + *pos) >> 7) {
152
      // jump to next UTF-8 character
153
0
      for ((*pos)++; (*(ln + *pos) & 0xc0) == 0x80; (*pos)++)
154
0
        ;
155
0
    } else {
156
0
      (*pos)++;
157
0
    }
158
0
  } else
159
1.67G
    (*pos)++;
160
1.67G
  return 0;
161
1.67G
}
162
163
908
void TextParser::put_line(const char* word) {
164
908
  actual = (actual + 1) % MAXPREVLINE;
165
908
  line[actual].assign(word);
166
908
  token = 0;
167
908
  head = 0;
168
908
  check_urls();
169
908
}
170
171
404k
std::string TextParser::get_prevline(int n) const {
172
404k
  return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];
173
404k
}
174
175
404k
std::string TextParser::get_line() const {
176
404k
  return get_prevline(0);
177
404k
}
178
179
1.25M
bool TextParser::next_token(std::string &t) {
180
1.25M
  const char* latin1;
181
182
15.7M
  for (;;) {
183
15.7M
    switch (state) {
184
4.47M
      case 0:  // non word chars
185
4.47M
        if (is_wordchar(line[actual].c_str() + head)) {
186
1.26M
          state = 1;
187
1.26M
          token = head;
188
3.20M
        } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
189
4.81k
          state = 1;
190
4.81k
          token = head;
191
4.81k
          head += strlen(latin1);
192
4.81k
        }
193
4.47M
        break;
194
11.2M
      case 1:  // wordchar
195
11.2M
        if ((latin1 = get_latin1(line[actual].c_str() + head))) {
196
22.0k
          head += strlen(latin1);
197
11.2M
        } else if ((is_wordchar(APOSTROPHE) ||
198
5.14M
                    (is_utf8() && is_wordchar(UTF8_APOS))) &&
199
6.13M
                   !line[actual].empty() && line[actual][head] == '\'' &&
200
44.2k
                   is_wordchar(line[actual].c_str() + head + 1)) {
201
14.1k
          head++;
202
11.2M
        } else if (is_utf8() &&
203
0
                   is_wordchar(APOSTROPHE) &&  // add Unicode apostrophe
204
                                                      // to the WORDCHARS, if
205
                                                      // needed
206
0
                   strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
207
0
                       0 &&
208
0
                   is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
209
0
          head += strlen(UTF8_APOS) - 1;
210
11.2M
        } else if (!is_wordchar(line[actual].c_str() + head)) {
211
1.26M
          state = 0;
212
1.26M
          if (alloc_token(token, &head, t))
213
1.25M
            return true;
214
1.26M
        }
215
10.0M
        break;
216
15.7M
    }
217
14.5M
    if (next_char(line[actual].c_str(), &head))
218
632
      return false;
219
14.5M
  }
220
1.25M
}
221
222
0
size_t TextParser::get_tokenpos() {
223
0
  return token;
224
0
}
225
226
403k
int TextParser::change_token(const char* word) {
227
403k
  if (word) {
228
403k
    std::string remainder(line[actual].substr(head));
229
403k
    line[actual].resize(token);
230
403k
    line[actual].append(word);
231
403k
    line[actual].append(remainder);
232
403k
    head = token;
233
403k
    check_urls();
234
403k
    return 1;
235
403k
  }
236
0
  return 0;
237
403k
}
238
239
0
// Return a copy of `tok` unchanged.
std::string TextParser::get_word(const std::string &tok) {
  std::string word(tok);
  return word;
}
242
243
404k
void TextParser::check_urls() {
244
404k
  urlline.resize(line[actual].size() + 1);
245
404k
  int url_state = 0;
246
404k
  size_t url_head = 0;
247
404k
  size_t url_token = 0;
248
404k
  int url = 0;
249
1.66G
  for (;;) {
250
1.66G
    switch (url_state) {
251
503M
      case 0:  // non word chars
252
503M
        if (is_wordchar(line[actual].c_str() + url_head)) {
253
296M
          url_state = 1;
254
296M
          url_token = url_head;
255
          // Unix path
256
296M
        } else if (line[actual][url_head] == '/') {
257
14.7M
          url_state = 1;
258
14.7M
          url_token = url_head;
259
14.7M
          url = 1;
260
14.7M
        }
261
503M
        break;
262
1.16G
      case 1:  // wordchar
263
1.16G
        char ch = line[actual][url_head];
264
        // e-mail address
265
1.16G
        if ((ch == '@') ||
266
            // MS-DOS, Windows path
267
1.15G
            (strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||
268
            // URL
269
1.15G
            (strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
270
10.5M
          url = 1;
271
1.14G
        } else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||
272
771M
                     (ch == '_') || (ch == '\\') || (ch == '.') ||
273
715M
                     (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||
274
567M
                     (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||
275
475M
                     (ch == '?') || (ch == '!') ||
276
437M
                     ((ch >= '0') && (ch <= '9')))) {
277
311M
          url_state = 0;
278
311M
          if (url == 1) {
279
135M
            for (size_t i = url_token; i < url_head; ++i) {
280
110M
              urlline[i] = true;
281
110M
            }
282
24.3M
          }
283
311M
          url = 0;
284
311M
        }
285
1.16G
        break;
286
1.66G
    }
287
1.66G
    urlline[url_head] = false;
288
1.66G
    if (next_char(line[actual].c_str(), &url_head))
289
404k
      return;
290
1.66G
  }
291
404k
}
292
293
1.26M
int TextParser::get_url(size_t token_pos, size_t* hd) {
294
1.91M
  for (size_t i = *hd; i < line[actual].size() && urlline[i]; i++, (*hd)++)
295
647k
    ;
296
1.26M
  return checkurl ? 0 : urlline[token_pos];
297
1.26M
}
298
299
908
void TextParser::set_url_checking(int check) {
300
908
  checkurl = check;
301
908
}
302
303
1.26M
bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
304
1.26M
  if (get_url(tokn, hd))
305
15.9k
    return false;
306
1.25M
  t = line[actual].substr(tokn, *hd - tokn);
307
  // remove colon for Finnish and Swedish language
308
1.25M
  if (!t.empty() && t[t.size() - 1] == ':') {
309
22.1k
    t.resize(t.size() - 1);
310
22.1k
    if (t.empty()) {
311
0
      return false;
312
0
    }
313
22.1k
  }
314
1.25M
  return true;
315
1.25M
}