/src/hunspell/src/parsers/textparser.cxx

Source
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * Copyright (C) 2002-2022 Németh László
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
 *
 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>

#include "../hunspell/csutil.hxx"
#include "textparser.hxx"

#include <algorithm>

#ifndef W32
using namespace std;
#endif

// ISO-8859-1 HTML character entities
// Named entities that the parser accepts as part of a word. get_latin1()
// prefix-matches the input against each entry in order.

static const char* LATIN1[] = {
    "&Agrave;", "&Atilde;", "&Aring;",  "&AElig;",  "&Egrave;", "&Ecirc;",
    "&Igrave;", "&Iuml;",   "&ETH;",    "&Ntilde;", "&Ograve;", "&Oslash;",
    "&Ugrave;", "&THORN;",  "&agrave;", "&atilde;", "&aring;",  "&aelig;",
    "&egrave;", "&ecirc;",  "&igrave;", "&iuml;",   "&eth;",    "&ntilde;",
    "&ograve;", "&oslash;", "&ugrave;", "&thorn;",  "&yuml;"};

// Number of entries in LATIN1 (element count, not bytes).
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))

// Apostrophe spellings: the HTML/XML entity, the UTF-8 byte sequence of
// U+2019 (right single quotation mark), and the plain ASCII apostrophe.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"

// Build a parser for 8-bit (non-UTF-8) text. All work is delegated to
// init(const char*), which falls back to a built-in ASCII letter set when
// wordchars is null.
TextParser::TextParser(const char* wordchars) {
  init(wordchars);
}

// Build a parser for UTF-8 text. wordchars/len are forwarded to
// init(const w_char*, int), which keeps the pointer (no copy is made) and
// runs std::binary_search over it, so the array has to be sorted and must
// outlive the parser.
TextParser::TextParser(const w_char* wordchars, int len) {
  init(wordchars, len);
}

// Compiler-generated destructor, defined out of line in this translation unit.
TextParser::~TextParser() = default;

// Tell whether the character starting at w belongs to a word.
//
// w: pointer into a NUL-terminated buffer; in UTF-8 mode it should point at
//    the first byte of a character.
// Returns non-zero for a word character, 0 otherwise (always 0 at the
// terminating NUL).
int TextParser::is_wordchar(const char* w) {
  if (*w == '\0')
    return 0;

  // Value of the first byte as 0..255, independent of the signedness of char.
  const size_t first_byte = static_cast<unsigned char>(*w);

  // 8-bit mode uses the full 256-entry table; UTF-8 mode uses the table only
  // for the ASCII range, where it serves as a precomputed cache.
  if (!utf8 || first_byte < 0x80)
    return wordcharacters[first_byte];

  // Multi-byte sequence: decode it into the scratch buffer wc.
  if (u8_u16(wc, w, true) < 1)
    return 0;

  const unsigned short code_unit = static_cast<unsigned short>(wc[0]);
  if (unicodeisalpha(code_unit))
    return 1;

  // Not alphabetic by itself: accept it only if it was listed explicitly.
  return wordchars_utf16 &&
         std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0]);
}

// Look for one of the LATIN1 entities at the very start of s.
//
// Returns the matching table entry (a static string whose length tells the
// caller how many bytes the entity occupies), or nullptr when s does not
// begin with a known entity.
const char* TextParser::get_latin1(const char* s) {
  // Every entity starts with '&', so anything else is rejected immediately.
  if (s[0] != '&')
    return nullptr;

  for (const char* entity : LATIN1) {
    if (strncmp(entity, s, strlen(entity)) == 0)
      return entity;
  }
  return nullptr;
}

// Reset the parser for 8-bit (non-UTF-8) input and build the 256-entry
// word-character lookup table.
//
// wordchars: NUL-terminated list of bytes that count as word characters;
//            when null, the unaccented ASCII letters a-z and A-Z are used.
void TextParser::init(const char* wordchars) {
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
  utf8 = 0;
  checkurl = 0;
  wordchars_utf16 = nullptr;
  wclen = 0;
  // assign() rather than resize(): every slot is cleared, even if the table
  // already held flags from an earlier initialisation.
  wordcharacters.assign(256, 0);
  if (!wordchars)
    wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
  // Walk to the terminator once instead of re-measuring the string with
  // strlen() on every iteration.
  for (const char* p = wordchars; *p != '\0'; ++p) {
    // Index by the byte value 0..255 regardless of the signedness of char.
    wordcharacters[static_cast<unsigned char>(*p)] = 1;
  }
}

void TextParser::init(const w_char* wc_utf8, int len) {
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
  utf8 = 1;
  checkurl = 0;
  wordchars_utf16 = wc_utf8;
  wclen = len;

  // build a cache for the simple cases
  wordcharacters.resize(0x80);
  w_char wc2;
  wc2.h = 0;
  for (unsigned char idx = 0; idx < 0x80; ++idx) {
    wc2.l = idx;
    int cache = unicodeisalpha(idx) ||
                (wordchars_utf16 &&
                 std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc2));
    wordcharacters[idx] = cache;
  }
}

// Advance *pos to the next character of ln.
//
// Returns 1 (leaving *pos untouched) when *pos already sits on the
// terminating NUL, otherwise 0. In UTF-8 mode a byte with the high bit set
// is followed by a skip over all continuation bytes (10xxxxxx).
int TextParser::next_char(const char* ln, size_t* pos) {
  const char current = ln[*pos];
  if (current == '\0')
    return 1;

  // Every character occupies at least one byte.
  ++*pos;

  if (utf8 && (current >> 7)) {
    // jump to next UTF-8 character
    while ((ln[*pos] & 0xc0) == 0x80)
      ++*pos;
  }
  return 0;
}

// Make `word` the current line to be tokenized.
//
// The line is stored in the next slot of the line[] ring (MAXPREVLINE slots,
// so the oldest stored line is overwritten), the scan positions are reset
// and the URL mask is recomputed.
//
// word: NUL-terminated text of the line. A null pointer is treated as an
//       empty line, because std::string::assign() must not be handed null.
void TextParser::put_line(const char* word) {
  actual = (actual + 1) % MAXPREVLINE;
  line[actual].assign(word ? word : "");
  token = 0;
  head = 0;
  check_urls();
}

// Return a copy of the line stored n put_line() calls ago (n == 0 is the
// current line).
//
// line[] is a ring of MAXPREVLINE slots. n is reduced modulo the ring size,
// so any value - including negative ones and values larger than the ring -
// selects a valid slot instead of producing a negative array index.
std::string TextParser::get_prevline(int n) const {
  const int slots = static_cast<int>(MAXPREVLINE);
  // Both operands are smaller than `slots` in magnitude, so the subtraction
  // cannot overflow; the result of % may still be negative and is lifted
  // back into [0, slots).
  int slot = (static_cast<int>(actual) - n % slots) % slots;
  if (slot < 0)
    slot += slots;
  return line[slot];
}

// Return a copy of the current line; shorthand for get_prevline(0).
std::string TextParser::get_line() const {
  return get_prevline(0);
}

// Extract the next word of the current line into t.
//
// A two-state scanner runs over line[actual], resuming at `head`:
//   state 0 - between words, looking for the start of a token;
//   state 1 - inside a word, looking for its end.
// `token` remembers where the current word began. Both the scan position
// and the state are members, so consecutive calls continue where the
// previous one stopped.
//
// Returns true as soon as alloc_token() accepts a token (stored in t), and
// false when next_char() reports the end of the line.
bool TextParser::next_token(std::string &t) {
  const char* latin1;

  for (;;) {
    switch (state) {
      case 0:  // non word chars
        if (is_wordchar(line[actual].c_str() + head)) {
          state = 1;
          token = head;
        } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          // A word may also start with one of the LATIN1 entities.
          state = 1;
          token = head;
          // NOTE(review): head now points just past the entity, and
          // next_char() at the bottom of the loop advances once more, so the
          // character directly after the entity is never examined - confirm
          // this is intended.
          head += strlen(latin1);
        }
        break;
      case 1:  // wordchar
        if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          // Entity inside a word: consume it as a whole (the NOTE(review)
          // in state 0 applies here as well).
          head += strlen(latin1);
        } else if ((is_wordchar(APOSTROPHE) ||
                    (is_utf8() && is_wordchar(UTF8_APOS))) &&
                   !line[actual].empty() && line[actual][head] == '\'' &&
                   is_wordchar(line[actual].c_str() + head + 1)) {
          // ASCII apostrophe followed by a word character: keep it inside
          // the word. Together with next_char() below this also steps over
          // the following character, which was just verified.
          head++;
        } else if (is_utf8() &&
                   is_wordchar(APOSTROPHE) &&  // add Unicode apostrophe
                                                      // to the WORDCHARS, if
                                                      // needed
                   strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
                       0 &&
                   is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
          // Stop on the last byte of the UTF-8 apostrophe; next_char() below
          // then moves to the character that follows it.
          head += strlen(UTF8_APOS) - 1;
        } else if (!is_wordchar(line[actual].c_str() + head)) {
          // End of the word. alloc_token() may still reject it (URL part or
          // nothing left after stripping a trailing colon); in that case
          // scanning simply continues.
          state = 0;
          if (alloc_token(token, &head, t))
            return true;
        }
        break;
    }
    if (next_char(line[actual].c_str(), &head))
      return false;
  }
}

// Return the byte offset in the current line at which the most recently
// started token begins.
size_t TextParser::get_tokenpos() {
  return token;
}

// Replace the text between `token` and `head` in the current line by `word`.
//
// Afterwards the scan position is moved back to the start of the inserted
// text and the URL mask is rebuilt for the modified line.
// Returns 1 when the replacement was made, 0 when word is null (the line is
// left untouched in that case).
int TextParser::change_token(const char* word) {
  if (!word)
    return 0;

  std::string& current = line[actual];
  // Save everything behind the token before the line is cut at its start.
  const std::string tail = current.substr(head);
  current.resize(token);
  current += word;
  current += tail;

  head = token;
  check_urls();
  return 1;
}

// Return the token as the word it stands for. This implementation returns
// tok unchanged.
std::string TextParser::get_word(const std::string &tok) {
  return tok;
}

// Rebuild urlline, the per-byte mask marking the parts of the current line
// that look like a URL, an e-mail address or a file path.
//
// The line is scanned with a two-state automaton:
//   state 0 - outside a candidate; a word character or '/' starts one
//             ('/' immediately flags it as a Unix path);
//   state 1 - inside a candidate; '@', ":\" or "://" flag it as URL-like,
//             and the first character that is neither a word character nor
//             one of the accepted punctuation marks or digits ends it.
// When a flagged candidate ends, all of its bytes are set to true in
// urlline.
void TextParser::check_urls() {
  const std::string& text = line[actual];
  const char* const base = text.c_str();
  // Start from an all-false mask. resize() would keep flags left over from
  // the previous run, and in UTF-8 mode next_char() jumps over continuation
  // bytes, so those slots would never be cleared by the scan itself.
  urlline.assign(text.size() + 1, false);
  int url_state = 0;
  size_t url_head = 0;
  size_t url_token = 0;
  int url = 0;
  for (;;) {
    switch (url_state) {
      case 0:  // non word chars
        if (is_wordchar(base + url_head)) {
          url_state = 1;
          url_token = url_head;
          // Unix path
        } else if (text[url_head] == '/') {
          url_state = 1;
          url_token = url_head;
          url = 1;
        }
        break;
      case 1: {  // wordchar
        const char ch = text[url_head];
        // e-mail address
        if ((ch == '@') ||
            // MS-DOS, Windows path
            (strncmp(base + url_head, ":\\", 2) == 0) ||
            // URL
            (strncmp(base + url_head, "://", 3) == 0)) {
          url = 1;
        } else if (!(is_wordchar(base + url_head) || (ch == '-') ||
                     (ch == '_') || (ch == '\\') || (ch == '.') ||
                     (ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||
                     (ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||
                     (ch == '?') || (ch == '!') ||
                     ((ch >= '0') && (ch <= '9')))) {
          // The candidate ends here (the terminating NUL ends it as well);
          // mark it only if something URL-like was seen inside it.
          url_state = 0;
          if (url == 1) {
            for (size_t i = url_token; i < url_head; ++i) {
              urlline[i] = true;
            }
          }
          url = 0;
        }
        break;
      }
    }
    if (next_char(base, &url_head))
      return;
  }
}

// Skip over URL-flagged bytes and report whether a token is part of a URL.
//
// token_pos: offset of the first byte of the token in the current line.
// hd:        in/out scan position; moved forward past every consecutive
//            byte that urlline marks as URL, but never beyond the end of
//            the line.
// Returns the URL flag of token_pos, or always 0 when checkurl is non-zero.
int TextParser::get_url(size_t token_pos, size_t* hd) {
  const size_t line_len = line[actual].size();
  while (*hd < line_len && urlline[*hd])
    ++*hd;

  if (checkurl)
    return 0;
  return urlline[token_pos];
}

// Store the URL-checking flag. With a non-zero value get_url() always
// returns 0, so alloc_token() no longer rejects tokens for being part of a
// URL.
void TextParser::set_url_checking(int check) {
  checkurl = check;
}

// Copy the token spanning [tokn, *hd) of the current line into t.
//
// get_url() is consulted first: it may move *hd past URL-flagged bytes, and
// if it reports the token as part of a URL the token is rejected (t is left
// unchanged). One trailing ':' is stripped from the copied token.
// Returns false for a rejected token or one that is empty after stripping,
// true otherwise.
bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
  if (get_url(tokn, hd))
    return false;

  t = line[actual].substr(tokn, *hd - tokn);

  // Nothing to strip: hand the token back as it is.
  if (t.empty() || t.back() != ':')
    return true;

  // remove colon for Finnish and Swedish language
  t.pop_back();
  return !t.empty();
}

Coverage Report

Created: 2026-06-15 06:21

Line	Count	Source
1		/* *** BEGIN LICENSE BLOCK ***
2		* Version: MPL 1.1/GPL 2.0/LGPL 2.1
3		*
4		* Copyright (C) 2002-2022 Németh László
5		*
6		* The contents of this file are subject to the Mozilla Public License Version
7		* 1.1 (the "License"); you may not use this file except in compliance with
8		* the License. You may obtain a copy of the License at
9		* http://www.mozilla.org/MPL/
10		*
11		* Software distributed under the License is distributed on an "AS IS" basis,
12		* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13		* for the specific language governing rights and limitations under the
14		* License.
15		*
16		* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17		*
18		* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19		* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20		* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21		* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22		* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23		*
24		* Alternatively, the contents of this file may be used under the terms of
25		* either the GNU General Public License Version 2 or later (the "GPL"), or
26		* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27		* in which case the provisions of the GPL or the LGPL are applicable instead
28		* of those above. If you wish to allow use of your version of this file only
29		* under the terms of either the GPL or the LGPL, and not to allow others to
30		* use your version of this file under the terms of the MPL, indicate your
31		* decision by deleting the provisions above and replace them with the notice
32		* and other provisions required by the GPL or the LGPL. If you do not delete
33		* the provisions above, a recipient may use your version of this file under
34		* the terms of any one of the MPL, the GPL or the LGPL.
35		*
36		* *** END LICENSE BLOCK *** */
37
38		#include <cstdlib>
39		#include <cstring>
40		#include <cstdio>
41		#include <ctype.h>
42
43		#include "../hunspell/csutil.hxx"
44		#include "textparser.hxx"
45
46		#include <algorithm>
47
48		#ifndef W32
49		using namespace std;
50		#endif
51
52		// ISO-8859-1 HTML character entities
53
54		static const char* LATIN1[] = {
55		"À", "Ã", "Å", "Æ", "È", "Ê",
56		"Ì", "Ï", "Ð", "Ñ", "Ò", "Ø",
57		"Ù", "Þ", "à", "ã", "å", "æ",
58		"è", "ê", "ì", "ï", "ð", "ñ",
59		"ò", "ø", "ù", "þ", "ÿ"};
60
61	3.59M	#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))
62
63		#define ENTITY_APOS "'"
64	0	#define UTF8_APOS "\xe2\x80\x99"
65	11.2M	#define APOSTROPHE "'"
66
67	908	TextParser::TextParser(const char* wordchars) {
68	908	init(wordchars);
69	908	}
70
71	0	TextParser::TextParser(const w_char* wordchars, int len) {
72	0	init(wordchars, len);
73	0	}
74
75	908	TextParser::~TextParser() = default;
76
77	1.67G	int TextParser::is_wordchar(const char* w) {
78	1.67G	if (*w == '\0')
79	603k	return 0;
80	1.67G	size_t cache_index = (*w + 256) % 256;
81	1.67G	if (utf8) {
82	0	const bool use_cache = cache_index < 0x80;
83	0	if (use_cache)
84	0	return wordcharacters[cache_index];
85	0	if (u8_u16(wc, w, true) < 1)
86	0	return 0;
87	0	unsigned short idx = (unsigned short)wc[0];
88	0	return unicodeisalpha(idx) \|\|
89	0	(wordchars_utf16 &&
90	0	std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0]));
91	1.67G	} else {
92	1.67G	return wordcharacters[cache_index];
93	1.67G	}
94	1.67G	}
95
96	14.5M	const char* TextParser::get_latin1(const char* s) {
97	14.5M	if (s[0] == '&') {
98	126k	unsigned int i = 0;
99	3.47M	while ((i < LATIN1_LEN) && strncmp(LATIN1[i], s, strlen(LATIN1[i])) != 0)
100	3.34M	i++;
101	126k	if (i != LATIN1_LEN)
102	26.8k	return LATIN1[i];
103	126k	}
104	14.4M	return nullptr;
105	14.5M	}
106
107	908	void TextParser::init(const char* wordchars) {
108	908	actual = 0;
109	908	head = 0;
110	908	token = 0;
111	908	state = 0;
112	908	utf8 = 0;
113	908	checkurl = 0;
114	908	wordchars_utf16 = nullptr;
115	908	wclen = 0;
116	908	wordcharacters.resize(256, 0);
117	908	if (!wordchars)
118	442	wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
119	48.5k	for (unsigned int j = 0; j < strlen(wordchars); ++j) {
120	47.6k	wordcharacters[(wordchars[j] + 256) % 256] = 1;
121	47.6k	}
122	908	}
123
124	0	void TextParser::init(const w_char* wc_utf8, int len) {
125	0	actual = 0;
126	0	head = 0;
127	0	token = 0;
128	0	state = 0;
129	0	utf8 = 1;
130	0	checkurl = 0;
131	0	wordchars_utf16 = wc_utf8;
132	0	wclen = len;
133
134		// build a cache for the simple cases
135	0	wordcharacters.resize(0x80);
136	0	w_char wc2;
137	0	wc2.h = 0;
138	0	for (unsigned char idx = 0; idx < 0x80; ++idx) {
139	0	wc2.l = idx;
140	0	int cache = unicodeisalpha(idx) \|\|
141	0	(wordchars_utf16 &&
142	0	std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc2));
143	0	wordcharacters[idx] = cache;
144	0	}
145	0	}
146
147	1.67G	int TextParser::next_char(const char* ln, size_t* pos) {
148	1.67G	if ((ln + pos) == '\0')
149	405k	return 1;
150	1.67G	if (utf8) {
151	0	if ((ln + pos) >> 7) {
152		// jump to next UTF-8 character
153	0	for ((pos)++; ((ln + pos) & 0xc0) == 0x80; (pos)++)
154	0	;
155	0	} else {
156	0	(*pos)++;
157	0	}
158	0	} else
159	1.67G	(*pos)++;
160	1.67G	return 0;
161	1.67G	}
162
163	908	void TextParser::put_line(const char* word) {
164	908	actual = (actual + 1) % MAXPREVLINE;
165	908	line[actual].assign(word);
166	908	token = 0;
167	908	head = 0;
168	908	check_urls();
169	908	}
170
171	404k	std::string TextParser::get_prevline(int n) const {
172	404k	return line[(actual + MAXPREVLINE - n) % MAXPREVLINE];
173	404k	}
174
175	404k	std::string TextParser::get_line() const {
176	404k	return get_prevline(0);
177	404k	}
178
179	1.25M	bool TextParser::next_token(std::string &t) {
180	1.25M	const char* latin1;
181
182	15.7M	for (;;) {
183	15.7M	switch (state) {
184	4.47M	case 0: // non word chars
185	4.47M	if (is_wordchar(line[actual].c_str() + head)) {
186	1.26M	state = 1;
187	1.26M	token = head;
188	3.20M	} else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
189	4.81k	state = 1;
190	4.81k	token = head;
191	4.81k	head += strlen(latin1);
192	4.81k	}
193	4.47M	break;
194	11.2M	case 1: // wordchar
195	11.2M	if ((latin1 = get_latin1(line[actual].c_str() + head))) {
196	22.0k	head += strlen(latin1);
197	11.2M	} else if ((is_wordchar(APOSTROPHE) \|\|
198	5.14M	(is_utf8() && is_wordchar(UTF8_APOS))) &&
199	6.13M	!line[actual].empty() && line[actual][head] == '\'' &&
200	44.2k	is_wordchar(line[actual].c_str() + head + 1)) {
201	14.1k	head++;
202	11.2M	} else if (is_utf8() &&
203	0	is_wordchar(APOSTROPHE) && // add Unicode apostrophe
204		// to the WORDCHARS, if
205		// needed
206	0	strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
207	0	0 &&
208	0	is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
209	0	head += strlen(UTF8_APOS) - 1;
210	11.2M	} else if (!is_wordchar(line[actual].c_str() + head)) {
211	1.26M	state = 0;
212	1.26M	if (alloc_token(token, &head, t))
213	1.25M	return true;
214	1.26M	}
215	10.0M	break;
216	15.7M	}
217	14.5M	if (next_char(line[actual].c_str(), &head))
218	632	return false;
219	14.5M	}
220	1.25M	}
221
222	0	size_t TextParser::get_tokenpos() {
223	0	return token;
224	0	}
225
226	403k	int TextParser::change_token(const char* word) {
227	403k	if (word) {
228	403k	std::string remainder(line[actual].substr(head));
229	403k	line[actual].resize(token);
230	403k	line[actual].append(word);
231	403k	line[actual].append(remainder);
232	403k	head = token;
233	403k	check_urls();
234	403k	return 1;
235	403k	}
236	0	return 0;
237	403k	}
238
239	0	std::string TextParser::get_word(const std::string &tok) {
240	0	return tok;
241	0	}
242
243	404k	void TextParser::check_urls() {
244	404k	urlline.resize(line[actual].size() + 1);
245	404k	int url_state = 0;
246	404k	size_t url_head = 0;
247	404k	size_t url_token = 0;
248	404k	int url = 0;
249	1.66G	for (;;) {
250	1.66G	switch (url_state) {
251	503M	case 0: // non word chars
252	503M	if (is_wordchar(line[actual].c_str() + url_head)) {
253	296M	url_state = 1;
254	296M	url_token = url_head;
255		// Unix path
256	296M	} else if (line[actual][url_head] == '/') {
257	14.7M	url_state = 1;
258	14.7M	url_token = url_head;
259	14.7M	url = 1;
260	14.7M	}
261	503M	break;
262	1.16G	case 1: // wordchar
263	1.16G	char ch = line[actual][url_head];
264		// e-mail address
265	1.16G	if ((ch == '@') \|\|
266		// MS-DOS, Windows path
267	1.15G	(strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) \|\|
268		// URL
269	1.15G	(strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
270	10.5M	url = 1;
271	1.14G	} else if (!(is_wordchar(line[actual].c_str() + url_head) \|\| (ch == '-') \|\|
272	771M	(ch == '_') \|\| (ch == '\\') \|\| (ch == '.') \|\|
273	715M	(ch == ':') \|\| (ch == '/') \|\| (ch == '~') \|\| (ch == '%') \|\|
274	567M	(ch == '*') \|\| (ch == '$') \|\| (ch == '[') \|\| (ch == ']') \|\|
275	475M	(ch == '?') \|\| (ch == '!') \|\|
276	437M	((ch >= '0') && (ch <= '9')))) {
277	311M	url_state = 0;
278	311M	if (url == 1) {
279	135M	for (size_t i = url_token; i < url_head; ++i) {
280	110M	urlline[i] = true;
281	110M	}
282	24.3M	}
283	311M	url = 0;
284	311M	}
285	1.16G	break;
286	1.66G	}
287	1.66G	urlline[url_head] = false;
288	1.66G	if (next_char(line[actual].c_str(), &url_head))
289	404k	return;
290	1.66G	}
291	404k	}
292
293	1.26M	int TextParser::get_url(size_t token_pos, size_t* hd) {
294	1.91M	for (size_t i = hd; i < line[actual].size() && urlline[i]; i++, (hd)++)
295	647k	;
296	1.26M	return checkurl ? 0 : urlline[token_pos];
297	1.26M	}
298
299	908	void TextParser::set_url_checking(int check) {
300	908	checkurl = check;
301	908	}
302
303	1.26M	bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
304	1.26M	if (get_url(tokn, hd))
305	15.9k	return false;
306	1.25M	t = line[actual].substr(tokn, *hd - tokn);
307		// remove colon for Finnish and Swedish language
308	1.25M	if (!t.empty() && t[t.size() - 1] == ':') {
309	22.1k	t.resize(t.size() - 1);
310	22.1k	if (t.empty()) {
311	0	return false;
312	0	}
313	22.1k	}
314	1.25M	return true;
315	1.25M	}