Coverage Report

Created: 2026-01-17 06:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/phonet.cpp
Line
Count
Source
1
/*  phonetic.c - generic replacement algorithms for phonetic transformation
2
    Copyright (C) 2000 Björn Jacke
3
4
    This library is free software; you can redistribute it and/or
5
    modify it under the terms of the GNU Lesser General Public
6
    License version 2.1 as published by the Free Software Foundation;
7
 
8
    This library is distributed in the hope that it will be useful,
9
    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
    Lesser General Public License for more details.
12
 
13
    You should have received a copy of the GNU Lesser General Public
14
    License along with this library; if not, write to the Free Software
15
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
    Björn Jacke may be reached by email at bjoern.jacke@gmx.de
18
19
    Changelog:
20
21
    2000-01-05  Björn Jacke <bjoern.jacke@gmx.de>
22
                Initial Release inspired by the article about phonetic
23
                transformations out of c't 25/1999
24
25
*/
26
27
#include <string.h>
28
#include <assert.h>
29
30
#include <vector>
31
32
#include "asc_ctype.hpp"
33
#include "string.hpp"
34
#include "phonet.hpp"
35
#include "errors.hpp"
36
#include "fstream.hpp"
37
#include "getdata.hpp"
38
#include "language.hpp"
39
#include "objstack.hpp"
40
#include "vararray.hpp"
41
42
using namespace acommon;
43
44
namespace aspeller {
45
46
  const char * const PhonetParms::rules_end = "";
47
  
48
0
  static bool to_bool(const String & str) {
49
0
    if (str == "1" || str == "true") return true;
50
0
    else return false;
51
0
  }
52
#if 0
53
  // Debug helper (currently compiled out by the surrounding "#if 0"):
  // writes the option values followed by one line per rule, printing an
  // empty replacement as "_".
  void dump_phonet_rules(ostream & out, const PhonetParms & parms) {
    out << "version         " << parms.version << "\n"
        << "followup        " << parms.followup << "\n"
        << "collapse_result " << parms.collapse_result << "\n"
        << "\n";

    // left-align the rule column; the previous flags are restored below
    const ios::fmtflags saved = out.setf(ios::left);

    // the table is a flat list of (pattern, replacement) pairs that ends
    // with the rules_end sentinel pointer
    const char * const * rule = parms.rules;
    while (*rule != PhonetParms::rules_end) {
      const char * replacement = rule[1];
      if (replacement[0] == '\0')
        replacement = "_";
      out << setw(20) << rule[0] << " " << replacement << "\n";
      rule += 2;
    }

    out.flags(saved);
  }
66
#endif
67
68
  struct PhonetParmsImpl : public PhonetParms {
69
    void * data;
70
    ObjStack strings;
71
1.00k
    PhonetParmsImpl() : data(0) {}
72
1.00k
    ~PhonetParmsImpl() {if (data) free(data);}
73
  };
74
75
  static void init_phonet_hash(PhonetParms & parms);
76
77
  // Copies the NUL-terminated string at "src" to "dest", terminator
  // included.  Characters are copied front to back, so overlapping
  // buffers are handled correctly only when dest < src.
  static inline void strmove(char * dest, char * src) {
    for (; *src != '\0'; ++src, ++dest)
      *dest = *src;
    *dest = '\0';
  }
84
  
85
  // Reads the phonet rule file "file" and returns a newly allocated
  // PhonetParms describing it.
  //
  //   file   path of the rule file; an error is returned if it cannot be
  //          opened
  //   iconv  conversion applied to every rule pattern and replacement
  //          before it is stored
  //   lang   language used to build the to_clean table and stored in the
  //          result for later is_alpha() queries
  //
  // The keys "followup", "collapse_result", "version" and
  // "remove_accents" are options; every other key/value pair is a rule.
  // A replacement value of "_" stands for the empty string.  A file
  // without a "version" entry is rejected with bad_file_format.
  PosibErr<PhonetParms *> new_phonet(const String & file, 
                                     Conv & iconv,
                                     const Language * lang) 
  {
    String buf; DataPair dp;

    FStream in;
    RET_ON_ERR(in.open(file, "r"));

    PhonetParmsImpl * parms = new PhonetParmsImpl();

    parms->lang = lang;

    // defaults used when the file does not set the option
    parms->followup        = true;
    parms->collapse_result = false;
    parms->remove_accents  = true;

    // Pass 1: count the rules so the pointer table can be allocated in
    // one piece.  The set of option keys skipped here must match the
    // keys handled as options in pass 2.
    int num = 0;
    while (getdata_pair(in, dp, buf)) {
      if (dp.key != "followup" && dp.key != "collapse_result" &&
          dp.key != "version" && dp.key != "remove_accents")
        ++num;
    }

    in.restart();

    // two pointers per rule plus two terminating rules_end entries
    size_t vsize = sizeof(char *) * (2 * num + 2);
    parms->data = malloc(vsize);

    const char * * r = (const char * *)parms->data;

    // shared replacement string for all "_" rules
    char * empty_str = parms->strings.dup("");

    // Pass 2: read the options and store the rules
    while (getdata_pair(in, dp, buf)) {
      if (dp.key == "followup") {
        parms->followup = to_bool(dp.value);
      } else if (dp.key == "collapse_result") {
        parms->collapse_result = to_bool(dp.value);
      } else if (dp.key == "version") {
        parms->version = dp.value;
      } else if (dp.key == "remove_accents") {
        parms->remove_accents = to_bool(dp.value);
      } else {
        *r = parms->strings.dup(iconv(dp.key));
        ++r;
        if (dp.value == "_") {
          *r = empty_str;
        } else {
          *r = parms->strings.dup(iconv(dp.value));
        }
        ++r;
      }
    }
    if (parms->version.empty()) {
      delete parms;
      return make_err(bad_file_format, file, "You must specify a version string");
    }
    // the table is terminated by a pair of rules_end sentinel pointers
    *(r  ) = PhonetParms::rules_end;
    *(r+1) = PhonetParms::rules_end;
    parms->rules = (const char * *)parms->data;

    // to_clean maps every byte to the character phonet() works on:
    // letters become upper case (with the accent removed first when
    // remove_accents is set); everything else maps to 0 and is dropped
    for (unsigned i = 0; i != 256; ++i) {
      parms->to_clean[i] = (lang->char_type(i) > Language::NonLetter 
                            ? (parms->remove_accents 
                               ? lang->to_upper(lang->de_accent(i)) 
                               : lang->to_upper(i))
                            : 0);
    }

    init_phonet_hash(*parms);

    return parms;
  }
160
161
  // Builds parms.hash: for each possible first byte of a rule pattern it
  // holds the index (into parms.rules) of the first rule starting with
  // that byte, or -1 when there is none.
  static void init_phonet_hash(PhonetParms & parms) 
  {
    // start with every slot marked as "no rule"
    for (int slot = 0; slot < parms.hash_size; ++slot)
      parms.hash[slot] = -1;

    // rules are stored as (pattern, replacement) pairs, hence the step
    // of 2; only the first rule seen for a given byte is recorded
    for (int rule = 0; parms.rules[rule] != PhonetParms::rules_end; rule += 2) {
      const int first = (unsigned char) parms.rules[rule][0];
      if (parms.hash[first] >= 0)
        continue;
      parms.hash[first] = rule;
    }
  }
178
179
180
#ifdef PHONET_TRACE
181
  void trace_info(char * text, int n, char * error,
182
      const PhonetParms & parms) 
183
  {
184
    /**  dump tracing info  **/
185
    
186
    printf ("%s %d:  \"%s\"  >  \"%s\" %s", text, ((n/2)+1), parms.rules[n],
187
      parms.rules[n+1], error);
188
  }
189
#endif
190
191
  int phonet (const char * inword, char * target,
192
              int len,
193
        const PhonetParms & parms)
194
305k
  {
195
    /**       Do phonetic transformation.       **/
196
    /**  "len" = length of "inword" incl. '\0'. **/
197
198
    /**  result:  >= 0:  length of "target"    **/
199
    /**            otherwise:  error            **/
200
201
305k
    int  i,j,k=0,n,p,z;
202
305k
    int  k0,n0,p0=-333,z0;
203
305k
    if (len == -1) len = strlen(inword);
204
305k
    VARARRAY(char, word, len + 1);
205
305k
    char c, c0;
206
305k
    const char * s;
207
208
305k
    typedef unsigned char uchar;
209
    
210
    /**  to convert string to uppercase and possible remove accents **/
211
305k
    char * res = word;
212
14.4M
    for (const char * str = inword; *str; ++str) {
213
14.1M
      char c = parms.to_clean[(uchar)*str];
214
14.1M
      if (c) *res++ = c;
215
14.1M
    }
216
305k
    *res = '\0';
217
    
218
    /**  check word  **/
219
305k
    i = j = z = 0;
220
14.1M
    while ((c = word[i]) != '\0') {
221
      #ifdef PHONET_TRACE
222
         cout << "\nChecking position " << j << ":  word = \""
223
              << word+i << "\",";
224
         printf ("  target = \"%.*s\"", j, target);
225
      #endif
226
13.8M
      n = parms.hash[(uchar) c];
227
13.8M
      z0 = 0;
228
229
13.8M
      if (n >= 0) {
230
        /**  check all rules for the same letter  **/
231
67.2M
        while (parms.rules[n][0] == c) {
232
          #ifdef PHONET_TRACE
233
             trace_info ("\n> Checking rule No.",n,"",parms);
234
          #endif
235
236
          /**  check whole string  **/
237
62.5M
          k = 1;   /** number of found letters  **/
238
62.5M
          p = 5;   /** default priority  **/
239
62.5M
          s = parms.rules[n];
240
62.5M
          s++;     /**  important for (see below)  "*(s-1)"  **/
241
          
242
66.9M
          while (*s != '\0'  &&  word[i+k] == *s
243
4.44M
                 &&  !asc_isdigit (*s)  &&  strchr ("(-<^$", *s) == NULL) {
244
4.44M
            k++;
245
4.44M
            s++;
246
4.44M
          }
247
62.5M
          if (*s == '(') {
248
            /**  check letters in "(..)"  **/
249
10.0M
            if (parms.lang->is_alpha(word[i+k])  // ...could be implied?
250
9.88M
                && strchr(s+1, word[i+k]) != NULL) {
251
413k
              k++;
252
2.30M
              while (*s != ')')
253
1.88M
                s++;
254
413k
              s++;
255
413k
            }
256
10.0M
          }
257
62.5M
          p0 = (int) *s;
258
62.5M
          k0 = k;
259
66.7M
          while (*s == '-'  &&  k > 1) {
260
4.26M
            k--;
261
4.26M
            s++;
262
4.26M
          }
263
62.5M
          if (*s == '<')
264
724
            s++;
265
62.5M
          if (asc_isdigit (*s)) {
266
            /**  determine priority  **/
267
1.39k
            p = *s - '0';
268
1.39k
            s++;
269
1.39k
          }
270
62.5M
          if (*s == '^'  &&  *(s+1) == '^')
271
0
            s++;
272
273
62.5M
          if (*s == '\0'
274
53.5M
              || (*s == '^'  
275
5.17M
                  && (i == 0  ||  ! parms.lang->is_alpha(word[i-1]))
276
83.7k
                  && (*(s+1) != '$'
277
176
                      || (! parms.lang->is_alpha(word[i+k0]) )))
278
53.4M
              || (*s == '$'  &&  i > 0  
279
64
                  &&  parms.lang->is_alpha(word[i-1])
280
64
                  && (! parms.lang->is_alpha(word[i+k0]) ))) 
281
9.07M
          {
282
            /**  search for followup rules, if:     **/
283
            /**  parms.followup and k > 1  and  NO '-' in searchstring **/
284
9.07M
            c0 = word[i+k-1];
285
9.07M
            n0 = parms.hash[(uchar) c0];
286
//
287
9.07M
            if (parms.followup  &&  k > 1  &&  n0 >= 0
288
176k
                &&  p0 != (int) '-'  &&  word[i+k] != '\0') {
289
              /**  test follow-up rule for "word[i+k]"  **/
290
256k
              while (parms.rules[n0][0] == c0) {
291
                #ifdef PHONET_TRACE
292
                    trace_info ("\n> > follow-up rule No.",n0,"... ",parms);
293
                #endif
294
295
                /**  check whole string  **/
296
251k
                k0 = k;
297
251k
                p0 = 5;
298
251k
                s = parms.rules[n0];
299
251k
                s++;
300
329k
                while (*s != '\0'  &&  word[i+k0] == *s
301
77.4k
                       && ! asc_isdigit(*s)  &&  strchr("(-<^$",*s) == NULL) {
302
77.4k
                  k0++;
303
77.4k
                  s++;
304
77.4k
                }
305
251k
                if (*s == '(') {
306
                  /**  check letters  **/
307
541
                  if (parms.lang->is_alpha(word[i+k0])
308
541
                      &&  strchr (s+1, word[i+k0]) != NULL) {
309
0
                    k0++;
310
0
                    while (*s != ')'  &&  *s != '\0')
311
0
                      s++;
312
0
                    if (*s == ')')
313
0
                      s++;
314
0
                  }
315
541
                }
316
326k
                while (*s == '-') {
317
                  /**  "k0" gets NOT reduced   **/
318
                  /**  because "if (k0 == k)"  **/
319
75.2k
                  s++;
320
75.2k
                }
321
251k
                if (*s == '<')
322
454
                  s++;
323
251k
                if (asc_isdigit (*s)) {
324
554
                  p0 = *s - '0';
325
554
                  s++;
326
554
                }
327
328
251k
                if (*s == '\0'
329
                    /**  *s == '^' cuts  **/
330
169k
                    || (*s == '$'  &&  ! parms.lang->is_alpha(word[i+k0]))) 
331
81.5k
                {
332
81.5k
                  if (k0 == k) {
333
                    /**  this is just a piece of the string  **/
334
                    #ifdef PHONET_TRACE
335
                        cout << "discarded (too short)";
336
                    #endif
337
5.27k
                    n0 += 2;
338
5.27k
                    continue;
339
5.27k
                  }
340
341
76.3k
                  if (p0 < p) {
342
                    /**  priority too low  **/
343
                    #ifdef PHONET_TRACE
344
                        cout << "discarded (priority)";
345
                    #endif
346
0
                    n0 += 2;
347
0
                    continue;
348
0
                  }
349
                  /**  rule fits; stop search  **/
350
76.3k
                  break;
351
76.3k
                }
352
                #ifdef PHONET_TRACE
353
                    cout << "discarded";
354
                #endif
355
169k
                n0 += 2;
356
169k
              } /**  End of "while (parms.rules[n0][0] == c0)"  **/
357
358
81.6k
              if (p0 >= p  && parms.rules[n0][0] == c0) {
359
                #ifdef PHONET_TRACE
360
                    trace_info ("\n> Rule No.", n,"",parms);
361
                    trace_info ("\n> not used because of follow-up",
362
                                      n0,"",parms);
363
                #endif
364
76.3k
                n += 2;
365
76.3k
                continue;
366
76.3k
              }
367
81.6k
            } /** end of follow-up stuff **/
368
369
            /**  replace string  **/
370
            #ifdef PHONET_TRACE
371
                trace_info ("\nUsing rule No.", n,"\n",parms);
372
            #endif
373
9.00M
            s = parms.rules[n+1];
374
9.00M
            p0 = (parms.rules[n][0] != '\0'
375
9.00M
                 &&  strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
376
9.00M
            if (p0 == 1 &&  z == 0) {
377
              /**  rule with '<' is used  **/
378
270
              if (j > 0  &&  *s != '\0'
379
190
                 && (target[j-1] == c  ||  target[j-1] == *s)) {
380
0
                j--;
381
0
              }
382
270
              z0 = 1;
383
270
              z = 1;
384
270
              k0 = 0;
385
540
              while (*s != '\0'  &&  word[i+k0] != '\0') {
386
270
                word[i+k0] = *s;
387
270
                k0++;
388
270
                s++;
389
270
              }
390
270
              if (k > k0)
391
270
                strmove (&word[0]+i+k0, &word[0]+i+k);
392
393
              /**  new "actual letter"  **/
394
270
              c = word[i];
395
270
            }
396
9.00M
            else { /** no '<' rule used **/
397
9.00M
              i += k - 1;
398
9.00M
              z = 0;
399
9.01M
              while (*s != '\0'
400
4.90M
                     &&  *(s+1) != '\0'  &&  j < len) {
401
13.8k
                if (j == 0  ||  target[j-1] != *s) {
402
13.8k
                  target[j] = *s;
403
13.8k
                  j++;
404
13.8k
                }
405
13.8k
                s++;
406
13.8k
              }
407
              /**  new "actual letter"  **/
408
9.00M
              c = *s;
409
9.00M
              if (parms.rules[n][0] != '\0'
410
9.00M
                 &&  strstr (parms.rules[n]+1, "^^") != NULL) {
411
0
                if (c != '\0') {
412
0
                  target[j] = c;
413
0
                  j++;
414
0
                }
415
0
                strmove (&word[0], &word[0]+i+1);
416
0
                i = 0;
417
0
                z0 = 1;
418
0
              }
419
9.00M
            }
420
9.00M
            break;
421
9.07M
          }  /** end of follow-up stuff **/
422
53.4M
          n += 2;
423
53.4M
        } /**  end of while (parms.rules[n][0] == c)  **/
424
13.7M
      } /**  end of if (n >= 0)  **/
425
13.8M
      if (z0 == 0) {
426
13.8M
        if (k && (assert(p0!=-333),!p0) &&  j < len &&  c != '\0'
427
4.89M
           && (!parms.collapse_result  ||  j == 0  ||  target[j-1] != c)){
428
           /**  condense only double letters  **/
429
4.89M
          target[j] = c;
430
    ///printf("\n setting \n");
431
4.89M
          j++;
432
4.89M
        }
433
        #ifdef PHONET_TRACE
434
        else if (p0 || !k)
435
          cout << "\nNo rule found; character \"" << word[i] << "\" skipped\n";
436
        #endif
437
438
0
        i++;
439
13.8M
        z = 0;
440
13.8M
  k=0;
441
13.8M
      }
442
13.8M
    }  /**  end of   while ((c = word[i]) != '\0')  **/
443
444
305k
    target[j] = '\0';
445
305k
    return (j);
446
447
305k
  }  /**  end of function "phonet"  **/
448
}
449
450
#if 0
451
452
int main (int argc, char *argv[]) {
453
  using namespace autil;
454
455
  if (argc < 3) {
456
    printf ("Usage:  phonet <data file> <word>\n");
457
    return(1);
458
  }
459
460
  char phone_word[strlen(argv[2])+1]; /**  max possible length of words  **/
461
462
  PhonetParms * parms;
463
  ifstream f(argv[1]);
464
  parms = load_phonet_rules(f);
465
466
  init_phonet_charinfo(*parms);
467
  init_phonet_hash(*parms);
468
  phonet (argv[2],phone_word,*parms);
469
  printf ("%s\n", phone_word);
470
  return(0);
471
}
472
#endif