/src/aspell/modules/speller/default/phonet.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* phonetic.c - generic replacement aglogithms for phonetic transformation |
2 | | Copyright (C) 2000 Björn Jacke |
3 | | |
4 | | This library is free software; you can redistribute it and/or |
5 | | modify it under the terms of the GNU Lesser General Public |
6 | | License version 2.1 as published by the Free Software Foundation; |
7 | | |
8 | | This library is distributed in the hope that it will be useful, |
9 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | | Lesser General Public License for more details. |
12 | | |
13 | | You should have received a copy of the GNU Lesser General Public |
14 | | License along with this library; if not, write to the Free Software |
15 | | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
16 | | |
17 | | Björn Jacke may be reached by email at bjoern.jacke@gmx.de |
18 | | |
19 | | Changelog: |
20 | | |
21 | | 2000-01-05 Björn Jacke <bjoern.jacke@gmx.de> |
22 | | Initial Release insprired by the article about phonetic |
23 | | transformations out of c't 25/1999 |
24 | | |
25 | | */ |
26 | | |
27 | | #include <string.h> |
28 | | #include <assert.h> |
29 | | |
30 | | #include <vector> |
31 | | |
32 | | #include "asc_ctype.hpp" |
33 | | #include "string.hpp" |
34 | | #include "phonet.hpp" |
35 | | #include "errors.hpp" |
36 | | #include "fstream.hpp" |
37 | | #include "getdata.hpp" |
38 | | #include "language.hpp" |
39 | | #include "objstack.hpp" |
40 | | #include "vararray.hpp" |
41 | | |
42 | | using namespace acommon; |
43 | | |
44 | | namespace aspeller { |
45 | | |
46 | | const char * const PhonetParms::rules_end = ""; |
47 | | |
48 | 0 | static bool to_bool(const String & str) { |
49 | 0 | if (str == "1" || str == "true") return true; |
50 | 0 | else return false; |
51 | 0 | } |
52 | | #if 0 |
53 | | void dump_phonet_rules(ostream & out, const PhonetParms & parms) { |
54 | | out << "version " << parms.version << "\n"; |
55 | | out << "followup " << parms.followup << "\n"; |
56 | | out << "collapse_result " << parms.collapse_result << "\n"; |
57 | | out << "\n"; |
58 | | ios::fmtflags flags = out.setf(ios::left); |
59 | | for (int i = 0; parms.rules[i] != PhonetParms::rules_end; i += 2) { |
60 | | out << setw(20) << parms.rules[i] << " " |
61 | | << (parms.rules[i+1][0] == '\0' ? "_" : parms.rules[i+1]) |
62 | | << "\n"; |
63 | | } |
64 | | out.flags(flags); |
65 | | } |
66 | | #endif |
67 | | |
68 | | struct PhonetParmsImpl : public PhonetParms { |
69 | | void * data; |
70 | | ObjStack strings; |
71 | 716 | PhonetParmsImpl() : data(0) {} |
72 | 716 | ~PhonetParmsImpl() {if (data) free(data);} |
73 | | }; |
74 | | |
75 | | static void init_phonet_hash(PhonetParms & parms); |
76 | | |
77 | | // like strcpy but safe if the strings overlap |
78 | | // but only if dest < src |
79 | 218 | static inline void strmove(char * dest, char * src) { |
80 | 2.19M | while (*src) |
81 | 2.19M | *dest++ = *src++; |
82 | 218 | *dest = '\0'; |
83 | 218 | } |
84 | | |
85 | | PosibErr<PhonetParms *> new_phonet(const String & file, |
86 | | Conv & iconv, |
87 | | const Language * lang) |
88 | 716 | { |
89 | 716 | String buf; DataPair dp; |
90 | | |
91 | 716 | FStream in; |
92 | 716 | RET_ON_ERR(in.open(file, "r")); |
93 | | |
94 | 716 | PhonetParmsImpl * parms = new PhonetParmsImpl(); |
95 | | |
96 | 716 | parms->lang = lang; |
97 | | |
98 | 716 | parms->followup = true; |
99 | 716 | parms->collapse_result = false; |
100 | 716 | parms->remove_accents = true; |
101 | | |
102 | 716 | int num = 0; |
103 | 76.6k | while (getdata_pair(in, dp, buf)) { |
104 | 75.8k | if (dp.key != "followup" && dp.key != "collapse_result" && |
105 | 75.8k | dp.key != "version") |
106 | 75.1k | ++num; |
107 | 75.8k | } |
108 | | |
109 | 716 | in.restart(); |
110 | | |
111 | 716 | size_t vsize = sizeof(char *) * (2 * num + 2); |
112 | 716 | parms->data = malloc(vsize); |
113 | | |
114 | 716 | const char * * r = (const char * *)parms->data; |
115 | | |
116 | 716 | char * empty_str = parms->strings.dup(""); |
117 | | |
118 | 76.6k | while (true) { |
119 | 76.6k | if (!getdata_pair(in, dp, buf)) break; |
120 | 75.8k | if (dp.key == "followup") { |
121 | 0 | parms->followup = to_bool(dp.value); |
122 | 75.8k | } else if (dp.key == "collapse_result") { |
123 | 0 | parms->collapse_result = to_bool(dp.value); |
124 | 75.8k | } else if (dp.key == "version") { |
125 | 716 | parms->version = dp.value; |
126 | 75.1k | } else if (dp.key == "remove_accents") { |
127 | 0 | parms->remove_accents = to_bool(dp.value); |
128 | 75.1k | } else { |
129 | 75.1k | *r = parms->strings.dup(iconv(dp.key)); |
130 | 75.1k | ++r; |
131 | 75.1k | if (dp.value == "_") { |
132 | 13.6k | *r = empty_str; |
133 | 61.5k | } else { |
134 | 61.5k | *r = parms->strings.dup(iconv(dp.value)); |
135 | 61.5k | } |
136 | 75.1k | ++r; |
137 | 75.1k | } |
138 | 75.8k | } |
139 | 716 | if (parms->version.empty()) { |
140 | 0 | delete parms; |
141 | 0 | return make_err(bad_file_format, file, "You must specify a version string"); |
142 | 0 | } |
143 | 716 | *(r ) = PhonetParms::rules_end; |
144 | 716 | *(r+1) = PhonetParms::rules_end; |
145 | 716 | parms->rules = (const char * *)parms->data; |
146 | | |
147 | | |
148 | 184k | for (unsigned i = 0; i != 256; ++i) { |
149 | 183k | parms->to_clean[i] = (lang->char_type(i) > Language::NonLetter |
150 | 183k | ? (parms->remove_accents |
151 | 81.6k | ? lang->to_upper(lang->de_accent(i)) |
152 | 81.6k | : lang->to_upper(i)) |
153 | 183k | : 0); |
154 | 183k | } |
155 | | |
156 | 716 | init_phonet_hash(*parms); |
157 | | |
158 | 716 | return parms; |
159 | 716 | } |
160 | | |
161 | | static void init_phonet_hash(PhonetParms & parms) |
162 | 716 | { |
163 | 716 | int i, k; |
164 | | |
165 | 184k | for (i = 0; i < parms.hash_size; i++) { |
166 | 183k | parms.hash[i] = -1; |
167 | 183k | } |
168 | | |
169 | 75.8k | for (i = 0; parms.rules[i] != PhonetParms::rules_end; i += 2) { |
170 | | /** set hash value **/ |
171 | 75.1k | k = (unsigned char) parms.rules[i][0]; |
172 | | |
173 | 75.1k | if (parms.hash[k] < 0) { |
174 | 19.3k | parms.hash[k] = i; |
175 | 19.3k | } |
176 | 75.1k | } |
177 | 716 | } |
178 | | |
179 | | |
180 | | #ifdef PHONET_TRACE |
181 | | void trace_info(char * text, int n, char * error, |
182 | | const PhonetParms & parms) |
183 | | { |
184 | | /** dump tracing info **/ |
185 | | |
186 | | printf ("%s %d: \"%s\" > \"%s\" %s", text, ((n/2)+1), parms.rules[n], |
187 | | parms.rules[n+1], error); |
188 | | } |
189 | | #endif |
190 | | |
191 | | int phonet (const char * inword, char * target, |
192 | | int len, |
193 | | const PhonetParms & parms) |
194 | 185k | { |
195 | | /** Do phonetic transformation. **/ |
196 | | /** "len" = length of "inword" incl. '\0'. **/ |
197 | | |
198 | | /** result: >= 0: length of "target" **/ |
199 | | /** otherwise: error **/ |
200 | | |
201 | 185k | int i,j,k=0,n,p,z; |
202 | 185k | int k0,n0,p0=-333,z0; |
203 | 185k | if (len == -1) len = strlen(inword); |
204 | 185k | VARARRAY(char, word, len + 1); |
205 | 185k | char c, c0; |
206 | 185k | const char * s; |
207 | | |
208 | 185k | typedef unsigned char uchar; |
209 | | |
210 | | /** to convert string to uppercase and possible remove accents **/ |
211 | 185k | char * res = word; |
212 | 25.3M | for (const char * str = inword; *str; ++str) { |
213 | 25.2M | char c = parms.to_clean[(uchar)*str]; |
214 | 25.2M | if (c) *res++ = c; |
215 | 25.2M | } |
216 | 185k | *res = '\0'; |
217 | | |
218 | | /** check word **/ |
219 | 185k | i = j = z = 0; |
220 | 24.8M | while ((c = word[i]) != '\0') { |
221 | | #ifdef PHONET_TRACE |
222 | | cout << "\nChecking position " << j << ": word = \"" |
223 | | << word+i << "\","; |
224 | | printf (" target = \"%.*s\"", j, target); |
225 | | #endif |
226 | 24.6M | n = parms.hash[(uchar) c]; |
227 | 24.6M | z0 = 0; |
228 | | |
229 | 24.6M | if (n >= 0) { |
230 | | /** check all rules for the same letter **/ |
231 | 147M | while (parms.rules[n][0] == c) { |
232 | | #ifdef PHONET_TRACE |
233 | | trace_info ("\n> Checking rule No.",n,"",parms); |
234 | | #endif |
235 | | |
236 | | /** check whole string **/ |
237 | 136M | k = 1; /** number of found letters **/ |
238 | 136M | p = 5; /** default priority **/ |
239 | 136M | s = parms.rules[n]; |
240 | 136M | s++; /** important for (see below) "*(s-1)" **/ |
241 | | |
242 | 140M | while (*s != '\0' && word[i+k] == *s |
243 | 140M | && !asc_isdigit (*s) && strchr ("(-<^$", *s) == NULL) { |
244 | 3.68M | k++; |
245 | 3.68M | s++; |
246 | 3.68M | } |
247 | 136M | if (*s == '(') { |
248 | | /** check letters in "(..)" **/ |
249 | 22.7M | if (parms.lang->is_alpha(word[i+k]) // ...could be implied? |
250 | 22.7M | && strchr(s+1, word[i+k]) != NULL) { |
251 | 40.1k | k++; |
252 | 246k | while (*s != ')') |
253 | 206k | s++; |
254 | 40.1k | s++; |
255 | 40.1k | } |
256 | 22.7M | } |
257 | 136M | p0 = (int) *s; |
258 | 136M | k0 = k; |
259 | 140M | while (*s == '-' && k > 1) { |
260 | 3.66M | k--; |
261 | 3.66M | s++; |
262 | 3.66M | } |
263 | 136M | if (*s == '<') |
264 | 326 | s++; |
265 | 136M | if (asc_isdigit (*s)) { |
266 | | /** determine priority **/ |
267 | 501 | p = *s - '0'; |
268 | 501 | s++; |
269 | 501 | } |
270 | 136M | if (*s == '^' && *(s+1) == '^') |
271 | 0 | s++; |
272 | | |
273 | 136M | if (*s == '\0' |
274 | 136M | || (*s == '^' |
275 | 122M | && (i == 0 || ! parms.lang->is_alpha(word[i-1])) |
276 | 122M | && (*(s+1) != '$' |
277 | 58.7k | || (! parms.lang->is_alpha(word[i+k0]) ))) |
278 | 136M | || (*s == '$' && i > 0 |
279 | 122M | && parms.lang->is_alpha(word[i-1]) |
280 | 122M | && (! parms.lang->is_alpha(word[i+k0]) ))) |
281 | 14.0M | { |
282 | | /** search for followup rules, if: **/ |
283 | | /** parms.followup and k > 1 and NO '-' in searchstring **/ |
284 | 14.0M | c0 = word[i+k-1]; |
285 | 14.0M | n0 = parms.hash[(uchar) c0]; |
286 | | // |
287 | 14.0M | if (parms.followup && k > 1 && n0 >= 0 |
288 | 14.0M | && p0 != (int) '-' && word[i+k] != '\0') { |
289 | | /** test follow-up rule for "word[i+k]" **/ |
290 | 37.6k | while (parms.rules[n0][0] == c0) { |
291 | | #ifdef PHONET_TRACE |
292 | | trace_info ("\n> > follow-up rule No.",n0,"... ",parms); |
293 | | #endif |
294 | | |
295 | | /** check whole string **/ |
296 | 33.7k | k0 = k; |
297 | 33.7k | p0 = 5; |
298 | 33.7k | s = parms.rules[n0]; |
299 | 33.7k | s++; |
300 | 39.7k | while (*s != '\0' && word[i+k0] == *s |
301 | 39.7k | && ! asc_isdigit(*s) && strchr("(-<^$",*s) == NULL) { |
302 | 5.95k | k0++; |
303 | 5.95k | s++; |
304 | 5.95k | } |
305 | 33.7k | if (*s == '(') { |
306 | | /** check letters **/ |
307 | 8.05k | if (parms.lang->is_alpha(word[i+k0]) |
308 | 8.05k | && strchr (s+1, word[i+k0]) != NULL) { |
309 | 7.67k | k0++; |
310 | 50.7k | while (*s != ')' && *s != '\0') |
311 | 43.0k | s++; |
312 | 7.67k | if (*s == ')') |
313 | 7.67k | s++; |
314 | 7.67k | } |
315 | 8.05k | } |
316 | 39.3k | while (*s == '-') { |
317 | | /** "k0" gets NOT reduced **/ |
318 | | /** because "if (k0 == k)" **/ |
319 | 5.60k | s++; |
320 | 5.60k | } |
321 | 33.7k | if (*s == '<') |
322 | 25 | s++; |
323 | 33.7k | if (asc_isdigit (*s)) { |
324 | 0 | p0 = *s - '0'; |
325 | 0 | s++; |
326 | 0 | } |
327 | | |
328 | 33.7k | if (*s == '\0' |
329 | | /** *s == '^' cuts **/ |
330 | 33.7k | || (*s == '$' && ! parms.lang->is_alpha(word[i+k0]))) |
331 | 7.23k | { |
332 | 7.23k | if (k0 == k) { |
333 | | /** this is just a piece of the string **/ |
334 | | #ifdef PHONET_TRACE |
335 | | cout << "discarded (too short)"; |
336 | | #endif |
337 | 3.87k | n0 += 2; |
338 | 3.87k | continue; |
339 | 3.87k | } |
340 | | |
341 | 3.36k | if (p0 < p) { |
342 | | /** priority too low **/ |
343 | | #ifdef PHONET_TRACE |
344 | | cout << "discarded (priority)"; |
345 | | #endif |
346 | 0 | n0 += 2; |
347 | 0 | continue; |
348 | 0 | } |
349 | | /** rule fits; stop search **/ |
350 | 3.36k | break; |
351 | 3.36k | } |
352 | | #ifdef PHONET_TRACE |
353 | | cout << "discarded"; |
354 | | #endif |
355 | 26.5k | n0 += 2; |
356 | 26.5k | } /** End of "while (parms.rules[n0][0] == c0)" **/ |
357 | | |
358 | 7.24k | if (p0 >= p && parms.rules[n0][0] == c0) { |
359 | | #ifdef PHONET_TRACE |
360 | | trace_info ("\n> Rule No.", n,"",parms); |
361 | | trace_info ("\n> not used because of follow-up", |
362 | | n0,"",parms); |
363 | | #endif |
364 | 3.36k | n += 2; |
365 | 3.36k | continue; |
366 | 3.36k | } |
367 | 7.24k | } /** end of follow-up stuff **/ |
368 | | |
369 | | /** replace string **/ |
370 | | #ifdef PHONET_TRACE |
371 | | trace_info ("\nUsing rule No.", n,"\n",parms); |
372 | | #endif |
373 | 14.0M | s = parms.rules[n+1]; |
374 | 14.0M | p0 = (parms.rules[n][0] != '\0' |
375 | 14.0M | && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0; |
376 | 14.0M | if (p0 == 1 && z == 0) { |
377 | | /** rule with '<' is used **/ |
378 | 218 | if (j > 0 && *s != '\0' |
379 | 218 | && (target[j-1] == c || target[j-1] == *s)) { |
380 | 0 | j--; |
381 | 0 | } |
382 | 218 | z0 = 1; |
383 | 218 | z = 1; |
384 | 218 | k0 = 0; |
385 | 436 | while (*s != '\0' && word[i+k0] != '\0') { |
386 | 218 | word[i+k0] = *s; |
387 | 218 | k0++; |
388 | 218 | s++; |
389 | 218 | } |
390 | 218 | if (k > k0) |
391 | 218 | strmove (&word[0]+i+k0, &word[0]+i+k); |
392 | | |
393 | | /** new "actual letter" **/ |
394 | 218 | c = word[i]; |
395 | 218 | } |
396 | 14.0M | else { /** no '<' rule used **/ |
397 | 14.0M | i += k - 1; |
398 | 14.0M | z = 0; |
399 | 14.0M | while (*s != '\0' |
400 | 14.0M | && *(s+1) != '\0' && j < len) { |
401 | 12.7k | if (j == 0 || target[j-1] != *s) { |
402 | 12.5k | target[j] = *s; |
403 | 12.5k | j++; |
404 | 12.5k | } |
405 | 12.7k | s++; |
406 | 12.7k | } |
407 | | /** new "actual letter" **/ |
408 | 14.0M | c = *s; |
409 | 14.0M | if (parms.rules[n][0] != '\0' |
410 | 14.0M | && strstr (parms.rules[n]+1, "^^") != NULL) { |
411 | 0 | if (c != '\0') { |
412 | 0 | target[j] = c; |
413 | 0 | j++; |
414 | 0 | } |
415 | 0 | strmove (&word[0], &word[0]+i+1); |
416 | 0 | i = 0; |
417 | 0 | z0 = 1; |
418 | 0 | } |
419 | 14.0M | } |
420 | 14.0M | break; |
421 | 14.0M | } /** end of follow-up stuff **/ |
422 | 122M | n += 2; |
423 | 122M | } /** end of while (parms.rules[n][0] == c) **/ |
424 | 24.6M | } /** end of if (n >= 0) **/ |
425 | 24.6M | if (z0 == 0) { |
426 | 24.6M | if (k && (assert(p0!=-333),!p0) && j < len && c != '\0' |
427 | 24.6M | && (!parms.collapse_result || j == 0 || target[j-1] != c)){ |
428 | | /** condense only double letters **/ |
429 | 10.4M | target[j] = c; |
430 | | ///printf("\n setting \n"); |
431 | 10.4M | j++; |
432 | 10.4M | } |
433 | | #ifdef PHONET_TRACE |
434 | | else if (p0 || !k) |
435 | | cout << "\nNo rule found; character \"" << word[i] << "\" skipped\n"; |
436 | | #endif |
437 | | |
438 | 0 | i++; |
439 | 24.6M | z = 0; |
440 | 24.6M | k=0; |
441 | 24.6M | } |
442 | 24.6M | } /** end of while ((c = word[i]) != '\0') **/ |
443 | | |
444 | 185k | target[j] = '\0'; |
445 | 185k | return (j); |
446 | | |
447 | 185k | } /** end of function "phonet" **/ |
448 | | } |
449 | | |
450 | | #if 0 |
451 | | |
452 | | int main (int argc, char *argv[]) { |
453 | | using namespace autil; |
454 | | |
455 | | if (argc < 3) { |
456 | | printf ("Usage: phonet <data file> <word>\n"); |
457 | | return(1); |
458 | | } |
459 | | |
460 | | char phone_word[strlen(argv[2])+1]; /** max possible length of words **/ |
461 | | |
462 | | PhonetParms * parms; |
463 | | ifstream f(argv[1]); |
464 | | parms = load_phonet_rules(f); |
465 | | |
466 | | init_phonet_charinfo(*parms); |
467 | | init_phonet_hash(*parms); |
468 | | phonet (argv[2],phone_word,*parms); |
469 | | printf ("%s\n", phone_word); |
470 | | return(0); |
471 | | } |
472 | | #endif |