Coverage Report

Created: 2022-10-06 21:30

/src/php-src/ext/standard/metaphone.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   +----------------------------------------------------------------------+
3
   | Copyright (c) The PHP Group                                          |
4
   +----------------------------------------------------------------------+
5
   | This source file is subject to version 3.01 of the PHP license,      |
6
   | that is bundled with this package in the file LICENSE, and is        |
7
   | available through the world-wide-web at the following url:           |
8
   | http://www.php.net/license/3_01.txt                                  |
9
   | If you did not receive a copy of the PHP license and are unable to   |
10
   | obtain it through the world-wide-web, please send a note to          |
11
   | license@php.net so we can mail you a copy immediately.               |
12
   +----------------------------------------------------------------------+
13
   | Author: Thies C. Arntzen <thies@thieso.net>                          |
14
   +----------------------------------------------------------------------+
15
*/
16
17
/*
18
  Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
19
*/
20
21
#include "php.h"
22
23
/* Core metaphone encoder (defined below).
 *
 * word         - input bytes; scanning stops at the first NUL byte
 * word_len     - length of word, used to size the output buffer
 * max_phonemes - 0 for no limit, otherwise the phoneme count at which encoding
 *                stops; negative values are rejected
 * phoned_word  - receives a newly allocated zend_string holding the key
 * traditional  - when zero, the extra CHR/SCH -> K and SCHW -> SH rules are
 *                enabled
 *
 * Returns 0 on success, -1 if max_phonemes is negative or word is NULL. */
static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
24
25
/* {{{ Break english phrases down into their phonemes */
26
PHP_FUNCTION(metaphone)
27
0
{
28
0
  zend_string *str;
29
0
  zend_string *result = NULL;
30
0
  zend_long phones = 0;
31
32
0
  ZEND_PARSE_PARAMETERS_START(1, 2)
33
0
    Z_PARAM_STR(str)
34
0
    Z_PARAM_OPTIONAL
35
0
    Z_PARAM_LONG(phones)
36
0
  ZEND_PARSE_PARAMETERS_END();
37
38
0
  if (metaphone((unsigned char *)ZSTR_VAL(str), ZSTR_LEN(str), phones, &result, 1) == 0) {
39
0
    RETVAL_STR(result);
40
0
  } else {
41
0
    if (result) {
42
0
      zend_string_free(result);
43
0
    }
44
0
    RETURN_FALSE;
45
0
  }
46
0
}
47
/* }}} */
48
49
/*
50
   this is now the original code by Michael G Schwern:
51
   I've changed it just slightly (use emalloc,
52
   get rid of includes etc)
53
  - thies - 13.09.1999
54
*/
55
56
/*-----------------------------  */
57
/* this used to be "metaphone.h" */
58
/*-----------------------------  */
59
60
/* Special encodings */
61
#define  SH   'X'   /* code emitted for the 'sh' sound */
62
#define  TH   '0'   /* code emitted for the 'th' sound */
63
64
/*-----------------------------  */
65
/* end of "metaphone.h"          */
66
/*-----------------------------  */
67
68
/*----------------------------- */
69
/* this used to be "metachar.h" */
70
/*----------------------------- */
71
72
/* Metachar.h ... little bits about characters for metaphone */
73
/*-- Character encoding array & accessing macros --*/
74
/* Stolen directly out of the book... */
75
/* Per-letter property bits, indexed by (uppercase letter - 'A'):
 *    1 = vowel (AEIOU)
 *    2 = passed through unchanged (FJLMNR)
 *    4 = forms a diphthong when preceding H (CGPST)
 *    8 = makes a preceding C or G soft (EIY)
 *   16 = prevents GH from becoming F (BDH)
 */
static const char _codes[26] =
{
  /* A */ 1,  /* B */ 16, /* C */ 4,  /* D */ 16, /* E */ 9,  /* F */ 2,
  /* G */ 4,  /* H */ 16, /* I */ 9,  /* J */ 2,  /* K */ 0,  /* L */ 2,
  /* M */ 2,  /* N */ 2,  /* O */ 1,  /* P */ 4,  /* Q */ 0,  /* R */ 2,
  /* S */ 4,  /* T */ 4,  /* U */ 1,  /* V */ 0,  /* W */ 0,  /* X */ 0,
  /* Y */ 8,  /* Z */ 0
};

/* Property bits for c, or 0 when c is not an ASCII letter.
 *
 * The table lookup is guarded by an explicit 'A'..'Z' range check rather than
 * isalpha(): isalpha() is locale dependent, and for a character it accepts
 * outside A-Z the index (toupper(c) - 'A') would fall outside _codes[]. */
#define ENCODE(c) \
  ((toupper(c) >= 'A' && toupper(c) <= 'Z') ? _codes[toupper(c) - 'A'] : 0)

#define isvowel(c)  (ENCODE(c) & 1)    /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)   /* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)    /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)    /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)  /* BDH */
97
98
/*----------------------------- */
99
/* end of "metachar.h"          */
100
/*----------------------------- */
101
102
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expand in a scope that provides `word` (the input bytes)
 * and `w_idx` (the current position within it). */

/* Look at the next letter in the word */
#define Next_Letter (toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper(word[w_idx+2]) \
                           : '\0')
/* Look N letters ahead via Lookahead(). Its plain-char result is converted to
 * unsigned char first, because toupper() must not be given a negative value. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) word+w_idx, (n))))
117
118
119
/* Allows us to safely look ahead an arbitrary # of letters.
 *
 * Returns the character how_far positions past word, or '\0' if the string
 * terminator is reached first. word is only read, never modified. */
static char Lookahead(const char *word, int how_far)
{
  int idx = 0;

  /* Edge forward in the string, stopping early at the terminator. */
  while (word[idx] != '\0' && idx < how_far) {
    idx++;
  }

  /* idx is now either == how_far or at the end of the string. */
  return word[idx];
}
133
134
135
/* phonize one letter
 * We don't know the buffer's size in advance. One way to solve this is to just
 * re-allocate the buffer when it is full. We grow it by an extra 2 characters
 * (this could be one though; or more too).
 *
 * Expands in a scope providing `phoned_word`, `p_idx` and `max_buffer_len`.
 * Wrapped in do { } while (0) so that `Phonize(x);` is a single statement and
 * stays correct in an unbraced if/else. */
#define Phonize(c)  do { \
            if (p_idx >= max_buffer_len) { \
              *phoned_word = zend_string_extend(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
              max_buffer_len += 2; \
            } \
            ZSTR_VAL(*phoned_word)[p_idx++] = (c); \
            ZSTR_LEN(*phoned_word) = p_idx; \
          } while (0)
/* Slap a null character on the end of the phoned word.
 * Kept as a plain brace block so it can be invoked with or without a
 * trailing semicolon. */
#define End_Phoned_Word { \
              if (p_idx == max_buffer_len) { \
                *phoned_word = zend_string_extend(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
                max_buffer_len += 1; \
              } \
              ZSTR_VAL(*phoned_word)[p_idx] = '\0'; \
              ZSTR_LEN(*phoned_word) = p_idx; \
            }
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Note if a letter is a 'break' in the word */
#define Isbreak(c)  (!isalpha(c))
161
162
/* {{{ metaphone */
163
/* Encode `word` into its metaphone key.
 *
 * word         - input bytes; scanning stops at the first NUL byte, so the
 *                buffer must be NUL-terminated
 * word_len     - length of word; only used to size the output buffer
 * max_phonemes - 0 for no limit; otherwise encoding stops once the key holds
 *                at least this many codes. The limit is checked before each
 *                letter, so the key can end up one longer (X emits "KS").
 *                Negative values are rejected.
 * phoned_word  - on success receives a newly allocated, NUL-terminated
 *                zend_string; left untouched on failure
 * traditional  - when zero, enables the extra CHR/SCH -> K and SCHW -> SH rules
 *
 * Returns 0 on success, -1 if max_phonemes is negative or word is NULL.
 */
static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
{
  size_t w_idx = 0;             /* point in the phonization we're at
                                 * (size_t so it cannot overflow on any
                                 * string length) */
  size_t p_idx = 0;             /* end of the phoned phrase */
  size_t max_buffer_len = 0;    /* maximum length of the destination buffer */

/*-- Parameter checks --*/
  /* Negative phoneme length is meaningless */
  if (max_phonemes < 0)
    return -1;

  /* Empty/null string is meaningless */
  if (word == NULL)
    return -1;

/*-- Allocate memory for our phoned_phrase --*/
  /* Start with room for one code per input byte, or for max_phonemes codes if
   * that is smaller. The caller-supplied limit is deliberately not used as
   * the size when it exceeds the input length: Phonize grows the buffer on
   * demand, so a huge limit must not turn into a huge up-front allocation. */
  if (max_phonemes == 0 || (size_t)max_phonemes > word_len) {
    max_buffer_len = word_len;
  } else {
    max_buffer_len = (size_t)max_phonemes;
  }
  *phoned_word = zend_string_alloc(sizeof(char) * max_buffer_len + 1, 0);

/*-- The first phoneme has to be processed specially. --*/
  /* Find our first letter */
  for (; !isalpha(Curr_Letter); w_idx++) {
    /* On the off chance we were given nothing but non-letters... */
    if (Curr_Letter == '\0') {
      End_Phoned_Word;
      return SUCCESS; /* empty key */
    }
  }

  switch (Curr_Letter) {
    /* AE becomes E */
  case 'A':
    if (Next_Letter == 'E') {
      Phonize('E');
      w_idx += 2;
    }
    /* Remember, preserve vowels at the beginning */
    else {
      Phonize('A');
      w_idx++;
    }
    break;
    /* [GKP]N becomes N */
  case 'G':
  case 'K':
  case 'P':
    if (Next_Letter == 'N') {
      Phonize('N');
      w_idx += 2;
    }
    break;
    /* WH becomes W,
       WR becomes R
       W if followed by a vowel */
  case 'W':
    if (Next_Letter == 'R') {
      Phonize(Next_Letter);
      w_idx += 2;
    } else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
      Phonize('W');
      w_idx += 2;
    }
    /* else ignore */
    break;
    /* X becomes S */
  case 'X':
    Phonize('S');
    w_idx++;
    break;
    /* Vowels are kept (A was handled above) */
  case 'E':
  case 'I':
  case 'O':
  case 'U':
    Phonize(Curr_Letter);
    w_idx++;
    break;
  default:
    /* do nothing */
    break;
  }

  /* On to the metaphoning */
  for (; Curr_Letter != '\0' &&
     (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes);
     w_idx++) {
    /* How many letters to skip because an earlier encoding handled
     * multiple letters */
    unsigned short int skip_letter = 0;

    /* THOUGHT:  It would be nice if, rather than having things like...
     * well, SCI.  For SCI you encode the S, then have to remember
     * to skip the C.  So the phoneme SCI invades both S and C.  It would
     * be better, IMHO, to skip the C from the S part of the encoding.
     * Hell, I'm trying it.
     */

    /* Ignore non-alphas */
    if (!isalpha(Curr_Letter))
      continue;

    /* Drop duplicates, except CC */
    if (Curr_Letter == Prev_Letter &&
      Curr_Letter != 'C')
      continue;

    switch (Curr_Letter) {
      /* B -> B unless in MB */
    case 'B':
      if (Prev_Letter != 'M')
        Phonize('B');
      break;
      /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
       * (SCHW is handled in S)
       *  S if -CI-, -CE- or -CY-
       *  dropped if -SCI-, SCE-, -SCY- (handled in S)
       *  else K
       */
    case 'C':
      if (MAKESOFT(Next_Letter)) { /* C[IEY] */
        if (After_Next_Letter == 'A' &&
          Next_Letter == 'I') { /* CIA */
          Phonize(SH);
        }
        /* SC[IEY] */
        else if (Prev_Letter == 'S') {
          /* Dropped */
        } else {
          Phonize('S');
        }
      } else if (Next_Letter == 'H') {
        if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */
          Phonize('K');
        } else {
          Phonize(SH);
        }
        skip_letter++;
      } else {
        Phonize('K');
      }
      break;
      /* J if in -DGE-, -DGI- or -DGY-
       * else T
       */
    case 'D':
      if (Next_Letter == 'G' &&
        MAKESOFT(After_Next_Letter)) {
        Phonize('J');
        skip_letter++;
      } else
        Phonize('T');
      break;
      /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
       * else dropped if -GNED, -GN,
       * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
       * else J if in -GE-, -GI, -GY and not GG
       * else K
       */
    case 'G':
      if (Next_Letter == 'H') {
        if (!(NOGHTOF(Look_Back_Letter(3)) ||
            Look_Back_Letter(4) == 'H')) {
          Phonize('F');
          skip_letter++;
        } else {
          /* silent */
        }
      } else if (Next_Letter == 'N') {
        if (Isbreak(After_Next_Letter) ||
          (After_Next_Letter == 'E' &&
           Look_Ahead_Letter(3) == 'D')) {
          /* dropped */
        } else
          Phonize('K');
      } else if (MAKESOFT(Next_Letter) &&
             Prev_Letter != 'G') {
        Phonize('J');
      } else {
        Phonize('K');
      }
      break;
      /* H if before a vowel and not after C,G,P,S,T */
    case 'H':
      if (isvowel(Next_Letter) &&
        !AFFECTH(Prev_Letter))
        Phonize('H');
      break;
      /* dropped if after C
       * else K
       */
    case 'K':
      if (Prev_Letter != 'C')
        Phonize('K');
      break;
      /* F if before H
       * else P
       */
    case 'P':
      if (Next_Letter == 'H') {
        Phonize('F');
      } else {
        Phonize('P');
      }
      break;
      /* K
       */
    case 'Q':
      Phonize('K');
      break;
      /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
       * else S
       */
    case 'S':
      if (Next_Letter == 'I' &&
        (After_Next_Letter == 'O' ||
         After_Next_Letter == 'A')) {
        Phonize(SH);
      } else if (Next_Letter == 'H') {
        Phonize(SH);
        skip_letter++;
      } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
        Phonize(SH);
        skip_letter += 2;
      } else {
        Phonize('S');
      }
      break;
      /* 'sh' in -TIA- or -TIO-
       * else 'th' before H
       * else dropped in -TCH-
       * else T
       */
    case 'T':
      if (Next_Letter == 'I' &&
        (After_Next_Letter == 'O' ||
         After_Next_Letter == 'A')) {
        Phonize(SH);
      } else if (Next_Letter == 'H') {
        Phonize(TH);
        skip_letter++;
      } else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
        Phonize('T');
      }
      break;
      /* F */
    case 'V':
      Phonize('F');
      break;
      /* W before a vowel, else dropped */
    case 'W':
      if (isvowel(Next_Letter))
        Phonize('W');
      break;
      /* KS */
    case 'X':
      Phonize('K');
      Phonize('S');
      break;
      /* Y if followed by a vowel */
    case 'Y':
      if (isvowel(Next_Letter))
        Phonize('Y');
      break;
      /* S */
    case 'Z':
      Phonize('S');
      break;
      /* No transformation */
    case 'F':
    case 'J':
    case 'L':
    case 'M':
    case 'N':
    case 'R':
      Phonize(Curr_Letter);
      break;
    default:
      /* nothing */
      break;
    }           /* END SWITCH */

    w_idx += skip_letter;
  }              /* END FOR */

  End_Phoned_Word;

  return 0;
}                /* END metaphone */
467
/* }}} */