Coverage Report

Created: 2024-09-08 06:23

/src/aspell/modules/speller/default/phonet.cpp
Line
Count
Source (jump to first uncovered line)
1
/*  phonetic.c - generic replacement aglogithms for phonetic transformation
2
    Copyright (C) 2000 Björn Jacke
3
4
    This library is free software; you can redistribute it and/or
5
    modify it under the terms of the GNU Lesser General Public
6
    License version 2.1 as published by the Free Software Foundation;
7
 
8
    This library is distributed in the hope that it will be useful,
9
    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
    Lesser General Public License for more details.
12
 
13
    You should have received a copy of the GNU Lesser General Public
14
    License along with this library; if not, write to the Free Software
15
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
17
    Björn Jacke may be reached by email at bjoern.jacke@gmx.de
18
19
    Changelog:
20
21
    2000-01-05  Björn Jacke <bjoern.jacke@gmx.de>
22
                Initial Release insprired by the article about phonetic
23
                transformations out of c't 25/1999
24
25
*/
26
27
#include <string.h>
28
#include <assert.h>
29
30
#include <vector>
31
32
#include "asc_ctype.hpp"
33
#include "string.hpp"
34
#include "phonet.hpp"
35
#include "errors.hpp"
36
#include "fstream.hpp"
37
#include "getdata.hpp"
38
#include "language.hpp"
39
#include "objstack.hpp"
40
#include "vararray.hpp"
41
42
using namespace acommon;
43
44
namespace aspeller {
45
46
  const char * const PhonetParms::rules_end = "";
47
  
48
0
  static bool to_bool(const String & str) {
49
0
    if (str == "1" || str == "true") return true;
50
0
    else return false;
51
0
  }
52
#if 0
53
  void dump_phonet_rules(ostream & out, const PhonetParms & parms) {
54
    out << "version         " << parms.version << "\n";
55
    out << "followup        " << parms.followup << "\n";
56
    out << "collapse_result " << parms.collapse_result << "\n";
57
    out << "\n";
58
    ios::fmtflags flags = out.setf(ios::left);
59
    for (int i = 0; parms.rules[i] != PhonetParms::rules_end; i += 2) {
60
      out << setw(20) << parms.rules[i] << " " 
61
    << (parms.rules[i+1][0] == '\0' ? "_" : parms.rules[i+1])
62
    << "\n";
63
    }
64
    out.flags(flags);
65
  }
66
#endif
67
68
  struct PhonetParmsImpl : public PhonetParms {
69
    void * data;
70
    ObjStack strings;
71
716
    PhonetParmsImpl() : data(0) {}
72
716
    ~PhonetParmsImpl() {if (data) free(data);}
73
  };
74
75
  static void init_phonet_hash(PhonetParms & parms);
76
77
  // like strcpy but safe if the strings overlap
78
  //   but only if dest < src
79
218
  static inline void strmove(char * dest, char * src) {
80
2.19M
    while (*src) 
81
2.19M
      *dest++ = *src++;
82
218
    *dest = '\0';
83
218
  }
84
  
85
  PosibErr<PhonetParms *> new_phonet(const String & file, 
86
                                     Conv & iconv,
87
                                     const Language * lang) 
88
716
  {
89
716
    String buf; DataPair dp;
90
91
716
    FStream in;
92
716
    RET_ON_ERR(in.open(file, "r"));
93
94
716
    PhonetParmsImpl * parms = new PhonetParmsImpl();
95
96
716
    parms->lang = lang;
97
98
716
    parms->followup        = true;
99
716
    parms->collapse_result = false;
100
716
    parms->remove_accents  = true;
101
102
716
    int num = 0;
103
76.6k
    while (getdata_pair(in, dp, buf)) {
104
75.8k
      if (dp.key != "followup" && dp.key != "collapse_result" &&
105
75.8k
    dp.key != "version")
106
75.1k
  ++num;
107
75.8k
    }
108
109
716
    in.restart();
110
111
716
    size_t vsize = sizeof(char *) * (2 * num + 2);
112
716
    parms->data = malloc(vsize);
113
114
716
    const char * * r = (const char * *)parms->data;
115
116
716
    char * empty_str = parms->strings.dup("");
117
118
76.6k
    while (true) {
119
76.6k
      if (!getdata_pair(in, dp, buf)) break;
120
75.8k
      if (dp.key == "followup") {
121
0
  parms->followup = to_bool(dp.value);
122
75.8k
      } else if (dp.key == "collapse_result") {
123
0
  parms->collapse_result = to_bool(dp.value);
124
75.8k
      } else if (dp.key == "version") {
125
716
  parms->version = dp.value;
126
75.1k
      } else if (dp.key == "remove_accents") {
127
0
        parms->remove_accents = to_bool(dp.value);
128
75.1k
      } else {
129
75.1k
  *r = parms->strings.dup(iconv(dp.key));
130
75.1k
  ++r;
131
75.1k
  if (dp.value == "_") {
132
13.6k
    *r = empty_str;
133
61.5k
  } else {
134
61.5k
    *r = parms->strings.dup(iconv(dp.value));
135
61.5k
  }
136
75.1k
  ++r;
137
75.1k
      }
138
75.8k
    }
139
716
    if (parms->version.empty()) {
140
0
      delete parms;
141
0
      return make_err(bad_file_format, file, "You must specify a version string");
142
0
    }
143
716
    *(r  ) = PhonetParms::rules_end;
144
716
    *(r+1) = PhonetParms::rules_end;
145
716
    parms->rules = (const char * *)parms->data;
146
147
148
184k
    for (unsigned i = 0; i != 256; ++i) {
149
183k
      parms->to_clean[i] = (lang->char_type(i) > Language::NonLetter 
150
183k
                            ? (parms->remove_accents 
151
81.6k
                               ? lang->to_upper(lang->de_accent(i)) 
152
81.6k
                               : lang->to_upper(i))
153
183k
                            : 0);
154
183k
    }
155
156
716
    init_phonet_hash(*parms);
157
158
716
    return parms;
159
716
  }
160
161
  static void init_phonet_hash(PhonetParms & parms) 
162
716
  {
163
716
    int i, k;
164
165
184k
    for (i = 0; i < parms.hash_size; i++) {
166
183k
      parms.hash[i] = -1;
167
183k
    }
168
169
75.8k
    for (i = 0; parms.rules[i] != PhonetParms::rules_end; i += 2) {
170
      /**  set hash value  **/
171
75.1k
      k = (unsigned char) parms.rules[i][0];
172
173
75.1k
      if (parms.hash[k] < 0) {
174
19.3k
  parms.hash[k] = i;
175
19.3k
      }
176
75.1k
    }
177
716
  }
178
179
180
#ifdef PHONET_TRACE
181
  void trace_info(char * text, int n, char * error,
182
      const PhonetParms & parms) 
183
  {
184
    /**  dump tracing info  **/
185
    
186
    printf ("%s %d:  \"%s\"  >  \"%s\" %s", text, ((n/2)+1), parms.rules[n],
187
      parms.rules[n+1], error);
188
  }
189
#endif
190
191
  int phonet (const char * inword, char * target,
192
              int len,
193
        const PhonetParms & parms)
194
185k
  {
195
    /**       Do phonetic transformation.       **/
196
    /**  "len" = length of "inword" incl. '\0'. **/
197
198
    /**  result:  >= 0:  length of "target"    **/
199
    /**            otherwise:  error            **/
200
201
185k
    int  i,j,k=0,n,p,z;
202
185k
    int  k0,n0,p0=-333,z0;
203
185k
    if (len == -1) len = strlen(inword);
204
185k
    VARARRAY(char, word, len + 1);
205
185k
    char c, c0;
206
185k
    const char * s;
207
208
185k
    typedef unsigned char uchar;
209
    
210
    /**  to convert string to uppercase and possible remove accents **/
211
185k
    char * res = word;
212
25.3M
    for (const char * str = inword; *str; ++str) {
213
25.2M
      char c = parms.to_clean[(uchar)*str];
214
25.2M
      if (c) *res++ = c;
215
25.2M
    }
216
185k
    *res = '\0';
217
    
218
    /**  check word  **/
219
185k
    i = j = z = 0;
220
24.8M
    while ((c = word[i]) != '\0') {
221
      #ifdef PHONET_TRACE
222
         cout << "\nChecking position " << j << ":  word = \""
223
              << word+i << "\",";
224
         printf ("  target = \"%.*s\"", j, target);
225
      #endif
226
24.6M
      n = parms.hash[(uchar) c];
227
24.6M
      z0 = 0;
228
229
24.6M
      if (n >= 0) {
230
        /**  check all rules for the same letter  **/
231
147M
        while (parms.rules[n][0] == c) {
232
          #ifdef PHONET_TRACE
233
             trace_info ("\n> Checking rule No.",n,"",parms);
234
          #endif
235
236
          /**  check whole string  **/
237
136M
          k = 1;   /** number of found letters  **/
238
136M
          p = 5;   /** default priority  **/
239
136M
          s = parms.rules[n];
240
136M
          s++;     /**  important for (see below)  "*(s-1)"  **/
241
          
242
140M
          while (*s != '\0'  &&  word[i+k] == *s
243
140M
                 &&  !asc_isdigit (*s)  &&  strchr ("(-<^$", *s) == NULL) {
244
3.68M
            k++;
245
3.68M
            s++;
246
3.68M
          }
247
136M
          if (*s == '(') {
248
            /**  check letters in "(..)"  **/
249
22.7M
            if (parms.lang->is_alpha(word[i+k])  // ...could be implied?
250
22.7M
                && strchr(s+1, word[i+k]) != NULL) {
251
40.1k
              k++;
252
246k
              while (*s != ')')
253
206k
                s++;
254
40.1k
              s++;
255
40.1k
            }
256
22.7M
          }
257
136M
          p0 = (int) *s;
258
136M
          k0 = k;
259
140M
          while (*s == '-'  &&  k > 1) {
260
3.66M
            k--;
261
3.66M
            s++;
262
3.66M
          }
263
136M
          if (*s == '<')
264
326
            s++;
265
136M
          if (asc_isdigit (*s)) {
266
            /**  determine priority  **/
267
501
            p = *s - '0';
268
501
            s++;
269
501
          }
270
136M
          if (*s == '^'  &&  *(s+1) == '^')
271
0
            s++;
272
273
136M
          if (*s == '\0'
274
136M
              || (*s == '^'  
275
122M
                  && (i == 0  ||  ! parms.lang->is_alpha(word[i-1]))
276
122M
                  && (*(s+1) != '$'
277
58.7k
                      || (! parms.lang->is_alpha(word[i+k0]) )))
278
136M
              || (*s == '$'  &&  i > 0  
279
122M
                  &&  parms.lang->is_alpha(word[i-1])
280
122M
                  && (! parms.lang->is_alpha(word[i+k0]) ))) 
281
14.0M
          {
282
            /**  search for followup rules, if:     **/
283
            /**  parms.followup and k > 1  and  NO '-' in searchstring **/
284
14.0M
            c0 = word[i+k-1];
285
14.0M
            n0 = parms.hash[(uchar) c0];
286
//
287
14.0M
            if (parms.followup  &&  k > 1  &&  n0 >= 0
288
14.0M
                &&  p0 != (int) '-'  &&  word[i+k] != '\0') {
289
              /**  test follow-up rule for "word[i+k]"  **/
290
37.6k
              while (parms.rules[n0][0] == c0) {
291
                #ifdef PHONET_TRACE
292
                    trace_info ("\n> > follow-up rule No.",n0,"... ",parms);
293
                #endif
294
295
                /**  check whole string  **/
296
33.7k
                k0 = k;
297
33.7k
                p0 = 5;
298
33.7k
                s = parms.rules[n0];
299
33.7k
                s++;
300
39.7k
                while (*s != '\0'  &&  word[i+k0] == *s
301
39.7k
                       && ! asc_isdigit(*s)  &&  strchr("(-<^$",*s) == NULL) {
302
5.95k
                  k0++;
303
5.95k
                  s++;
304
5.95k
                }
305
33.7k
                if (*s == '(') {
306
                  /**  check letters  **/
307
8.05k
                  if (parms.lang->is_alpha(word[i+k0])
308
8.05k
                      &&  strchr (s+1, word[i+k0]) != NULL) {
309
7.67k
                    k0++;
310
50.7k
                    while (*s != ')'  &&  *s != '\0')
311
43.0k
                      s++;
312
7.67k
                    if (*s == ')')
313
7.67k
                      s++;
314
7.67k
                  }
315
8.05k
                }
316
39.3k
                while (*s == '-') {
317
                  /**  "k0" gets NOT reduced   **/
318
                  /**  because "if (k0 == k)"  **/
319
5.60k
                  s++;
320
5.60k
                }
321
33.7k
                if (*s == '<')
322
25
                  s++;
323
33.7k
                if (asc_isdigit (*s)) {
324
0
                  p0 = *s - '0';
325
0
                  s++;
326
0
                }
327
328
33.7k
                if (*s == '\0'
329
                    /**  *s == '^' cuts  **/
330
33.7k
                    || (*s == '$'  &&  ! parms.lang->is_alpha(word[i+k0]))) 
331
7.23k
                {
332
7.23k
                  if (k0 == k) {
333
                    /**  this is just a piece of the string  **/
334
                    #ifdef PHONET_TRACE
335
                        cout << "discarded (too short)";
336
                    #endif
337
3.87k
                    n0 += 2;
338
3.87k
                    continue;
339
3.87k
                  }
340
341
3.36k
                  if (p0 < p) {
342
                    /**  priority too low  **/
343
                    #ifdef PHONET_TRACE
344
                        cout << "discarded (priority)";
345
                    #endif
346
0
                    n0 += 2;
347
0
                    continue;
348
0
                  }
349
                  /**  rule fits; stop search  **/
350
3.36k
                  break;
351
3.36k
                }
352
                #ifdef PHONET_TRACE
353
                    cout << "discarded";
354
                #endif
355
26.5k
                n0 += 2;
356
26.5k
              } /**  End of "while (parms.rules[n0][0] == c0)"  **/
357
358
7.24k
              if (p0 >= p  && parms.rules[n0][0] == c0) {
359
                #ifdef PHONET_TRACE
360
                    trace_info ("\n> Rule No.", n,"",parms);
361
                    trace_info ("\n> not used because of follow-up",
362
                                      n0,"",parms);
363
                #endif
364
3.36k
                n += 2;
365
3.36k
                continue;
366
3.36k
              }
367
7.24k
            } /** end of follow-up stuff **/
368
369
            /**  replace string  **/
370
            #ifdef PHONET_TRACE
371
                trace_info ("\nUsing rule No.", n,"\n",parms);
372
            #endif
373
14.0M
            s = parms.rules[n+1];
374
14.0M
            p0 = (parms.rules[n][0] != '\0'
375
14.0M
                 &&  strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
376
14.0M
            if (p0 == 1 &&  z == 0) {
377
              /**  rule with '<' is used  **/
378
218
              if (j > 0  &&  *s != '\0'
379
218
                 && (target[j-1] == c  ||  target[j-1] == *s)) {
380
0
                j--;
381
0
              }
382
218
              z0 = 1;
383
218
              z = 1;
384
218
              k0 = 0;
385
436
              while (*s != '\0'  &&  word[i+k0] != '\0') {
386
218
                word[i+k0] = *s;
387
218
                k0++;
388
218
                s++;
389
218
              }
390
218
              if (k > k0)
391
218
                strmove (&word[0]+i+k0, &word[0]+i+k);
392
393
              /**  new "actual letter"  **/
394
218
              c = word[i];
395
218
            }
396
14.0M
            else { /** no '<' rule used **/
397
14.0M
              i += k - 1;
398
14.0M
              z = 0;
399
14.0M
              while (*s != '\0'
400
14.0M
                     &&  *(s+1) != '\0'  &&  j < len) {
401
12.7k
                if (j == 0  ||  target[j-1] != *s) {
402
12.5k
                  target[j] = *s;
403
12.5k
                  j++;
404
12.5k
                }
405
12.7k
                s++;
406
12.7k
              }
407
              /**  new "actual letter"  **/
408
14.0M
              c = *s;
409
14.0M
              if (parms.rules[n][0] != '\0'
410
14.0M
                 &&  strstr (parms.rules[n]+1, "^^") != NULL) {
411
0
                if (c != '\0') {
412
0
                  target[j] = c;
413
0
                  j++;
414
0
                }
415
0
                strmove (&word[0], &word[0]+i+1);
416
0
                i = 0;
417
0
                z0 = 1;
418
0
              }
419
14.0M
            }
420
14.0M
            break;
421
14.0M
          }  /** end of follow-up stuff **/
422
122M
          n += 2;
423
122M
        } /**  end of while (parms.rules[n][0] == c)  **/
424
24.6M
      } /**  end of if (n >= 0)  **/
425
24.6M
      if (z0 == 0) {
426
24.6M
        if (k && (assert(p0!=-333),!p0) &&  j < len &&  c != '\0'
427
24.6M
           && (!parms.collapse_result  ||  j == 0  ||  target[j-1] != c)){
428
           /**  condense only double letters  **/
429
10.4M
          target[j] = c;
430
    ///printf("\n setting \n");
431
10.4M
          j++;
432
10.4M
        }
433
        #ifdef PHONET_TRACE
434
        else if (p0 || !k)
435
          cout << "\nNo rule found; character \"" << word[i] << "\" skipped\n";
436
        #endif
437
438
0
        i++;
439
24.6M
        z = 0;
440
24.6M
  k=0;
441
24.6M
      }
442
24.6M
    }  /**  end of   while ((c = word[i]) != '\0')  **/
443
444
185k
    target[j] = '\0';
445
185k
    return (j);
446
447
185k
  }  /**  end of function "phonet"  **/
448
}
449
450
#if 0
451
452
int main (int argc, char *argv[]) {
453
  using namespace autil;
454
455
  if (argc < 3) {
456
    printf ("Usage:  phonet <data file> <word>\n");
457
    return(1);
458
  }
459
460
  char phone_word[strlen(argv[2])+1]; /**  max possible length of words  **/
461
462
  PhonetParms * parms;
463
  ifstream f(argv[1]);
464
  parms = load_phonet_rules(f);
465
466
  init_phonet_charinfo(*parms);
467
  init_phonet_hash(*parms);
468
  phonet (argv[2],phone_word,*parms);
469
  printf ("%s\n", phone_word);
470
  return(0);
471
}
472
#endif