Coverage Report

Created: 2022-10-14 11:23

/src/php-src/ext/standard/metaphone.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
   +----------------------------------------------------------------------+
3
   | Copyright (c) The PHP Group                                          |
4
   +----------------------------------------------------------------------+
5
   | This source file is subject to version 3.01 of the PHP license,      |
6
   | that is bundled with this package in the file LICENSE, and is        |
7
   | available through the world-wide-web at the following url:           |
8
   | http://www.php.net/license/3_01.txt                                  |
9
   | If you did not receive a copy of the PHP license and are unable to   |
10
   | obtain it through the world-wide-web, please send a note to          |
11
   | license@php.net so we can mail you a copy immediately.               |
12
   +----------------------------------------------------------------------+
13
   | Author: Thies C. Arntzen <thies@thieso.net>                          |
14
   +----------------------------------------------------------------------+
15
*/
16
17
/*
18
  Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
19
*/
20
21
#include "php.h"
22
23
static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
24
25
/* {{{ proto string|false metaphone(string text[, int phones])
26
   Break english phrases down into their phonemes */
27
PHP_FUNCTION(metaphone)
28
0
{
29
0
  zend_string *str;
30
0
  zend_string *result = NULL;
31
0
  zend_long phones = 0;
32
33
0
  ZEND_PARSE_PARAMETERS_START(1, 2)
34
0
    Z_PARAM_STR(str)
35
0
    Z_PARAM_OPTIONAL
36
0
    Z_PARAM_LONG(phones)
37
0
  ZEND_PARSE_PARAMETERS_END();
38
39
0
  if (metaphone((unsigned char *)ZSTR_VAL(str), ZSTR_LEN(str), phones, &result, 1) == 0) {
40
0
    RETVAL_STR(result);
41
0
  } else {
42
0
    if (result) {
43
0
      zend_string_free(result);
44
0
    }
45
0
    RETURN_FALSE;
46
0
  }
47
0
}
48
/* }}} */
49
50
/*
51
   this is now the original code by Michael G Schwern:
52
   I've changed it only slightly (use emalloc,
53
   get rid of includes etc)
54
  - thies - 13.09.1999
55
*/
56
57
/*-----------------------------  */
58
/* this used to be "metaphone.h" */
59
/*-----------------------------  */
60
61
/* Special encodings */
62
#define  SH   'X'
63
#define  TH   '0'
64
65
/*-----------------------------  */
66
/* end of "metaphone.h"          */
67
/*-----------------------------  */
68
69
/*----------------------------- */
70
/* this used to be "metachar.h" */
71
/*----------------------------- */
72
73
/* Metachar.h ... little bits about characters for metaphone */
74
/*-- Character encoding array & accessing macros --*/
75
/* Stolen directly out of the book... */
76
/*
 * Per-letter property bitmask for 'A'..'Z', indexed by (letter - 'A'):
 *    1  vowel                                   (A E I O U)
 *    2  passed through unchanged                (F J L M N R)
 *    4  forms a diphthong when preceding H      (C G P S T)
 *    8  makes a preceding C or G soft           (E I Y)
 *   16  prevents GH from becoming F             (B D H)
 */
static const char _codes[26] =
{
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
/*	a  b   c  d   e  f  g  h   i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z */
};

/*
 * Return the property bitmask for character c, or 0 when c is not one of the
 * 26 letters covered by _codes[].
 *
 * The range is checked explicitly instead of trusting isalpha(): under a
 * non-"C" LC_CTYPE locale isalpha() may accept bytes >= 0x80, and indexing
 * _codes[] with such a value would read past the end of the table.
 */
static char metaphone_letter_code(int c)
{
	int upper = toupper((unsigned char) c);

	if (upper < 'A' || upper > 'Z') {
		return 0;
	}
	return _codes[upper - 'A'];
}

/* Property bitmask of a character (0 for anything that is not A-Z / a-z). */
#define ENCODE(c) (metaphone_letter_code(c))

/* Vowels */
#define isvowel(c)  (ENCODE(c) & 1)    /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)    /* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)    /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)    /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)   /* BDH */
98
99
/*----------------------------- */
100
/* end of "metachar.h"          */
101
/*----------------------------- */
102
103
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/*
 * All of these macros read the enclosing function's `word` buffer at the
 * current index `w_idx` and yield the upper-cased character. Bytes are cast
 * to unsigned char before reaching toupper(), because passing a negative
 * value other than EOF to a <ctype.h> function is undefined behaviour.
 */

/* Look at the next letter in the word */
#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter (toupper((unsigned char) word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) \
                           : '\0')
/* Look N letters ahead; Lookahead() stops at the terminating NUL. */
#define Look_Ahead_Letter(n) (toupper((unsigned char) Lookahead((char *) word+w_idx, (n))))
118
119
120
/*
 * Safely look ahead an arbitrary number of letters.
 *
 * Returns the character `how_far` positions past `word`. If the terminating
 * NUL is reached first, the scan stops there and '\0' is returned, so the
 * function never reads beyond the end of the string. A `how_far` of zero or
 * less returns the first character.
 */
static char Lookahead(char *word, int how_far)
{
	int idx = 0;

	/* Edge forward until the requested offset or the end of the string. */
	while (idx < how_far && word[idx] != '\0') {
		idx++;
	}

	/* idx is now either == how_far or at the end of the string. */
	return word[idx];
}
134
135
136
/* Phonize one letter: append character c to *phoned_word.
 * We don't know the buffer's size in advance. One way to solve this is to
 * just re-allocate the buffer when it is full. We grow it by 2 extra
 * characters each time (this could be one though; or more too).
 * Relies on p_idx, max_buffer_len and phoned_word from the enclosing scope.
 * Wrapped in do/while(0) so it behaves as one statement inside if/else. */
#define Phonize(c)  do { \
						if (p_idx >= max_buffer_len) { \
							*phoned_word = zend_string_extend(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
							max_buffer_len += 2; \
						} \
						ZSTR_VAL(*phoned_word)[p_idx++] = (c); \
						ZSTR_LEN(*phoned_word) = p_idx; \
					} while (0)
/* Slap a null character on the end of the phoned word and record its length.
 * Expands to a complete block statement, so a trailing semicolon at the call
 * site is optional. */
#define End_Phoned_Word { \
							if (p_idx == max_buffer_len) { \
								*phoned_word = zend_string_extend(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
								max_buffer_len += 1; \
							} \
							ZSTR_VAL(*phoned_word)[p_idx] = '\0'; \
							ZSTR_LEN(*phoned_word) = p_idx; \
						}
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Note whether a letter is a 'break' in the word (anything non-alphabetic) */
#define Isbreak(c)  (!isalpha(c))
162
163
/* {{{ metaphone
164
 */
165
/*
 * Compute the metaphone key of `word`.
 *
 * word          NUL-terminated input; scanning stops at the first NUL byte.
 * word_len      length of the input, used to size the output buffer.
 * max_phonemes  maximum number of phonemes to produce; 0 means unlimited.
 *               The limit is only checked before each input letter, so a
 *               letter that yields two phonemes (X -> "KS") may exceed it
 *               by one.
 * phoned_word   receives a newly allocated zend_string holding the result.
 * traditional   when non-zero, the extra rules "CHR"/"SCH" -> K and
 *               "SCHW" -> X are disabled.
 *
 * Returns 0 on success, -1 when max_phonemes is negative or word is NULL
 * (in which case *phoned_word is left untouched).
 */
static int metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
{
	size_t w_idx = 0;			/* point in the phonization we're at */
	size_t p_idx = 0;			/* end of the phoned phrase */
	size_t max_buffer_len = 0;	/* current capacity of the destination buffer */

	/*-- Parameter checks --*/
	/* Negative phoneme length is meaningless */
	if (max_phonemes < 0) {
		return -1;
	}

	/* Overly paranoid */
	if (word == NULL) {
		return -1;
	}

	/*-- Allocate memory for our phoned_phrase --*/
	/* Start with at most word_len characters: a huge max_phonemes must not
	 * translate into a huge up-front allocation for a short word. Phonize
	 * grows the buffer on demand if more room turns out to be needed. */
	if (max_phonemes == 0 || (size_t) max_phonemes > word_len) {
		max_buffer_len = word_len;
	} else {
		max_buffer_len = (size_t) max_phonemes;
	}
	*phoned_word = zend_string_alloc(sizeof(char) * max_buffer_len + 1, 0);

	/*-- The first phoneme has to be processed specially. --*/
	/* Find our first letter */
	for (; !isalpha(Curr_Letter); w_idx++) {
		/* On the off chance we were given nothing but non-letters... */
		if (Curr_Letter == '\0') {
			End_Phoned_Word;
			return 0;
		}
	}

	switch (Curr_Letter) {
		/* AE becomes E */
		case 'A':
			if (Next_Letter == 'E') {
				Phonize('E');
				w_idx += 2;
			} else {
				/* Remember, preserve vowels at the beginning */
				Phonize('A');
				w_idx++;
			}
			break;
		/* [GKP]N becomes N */
		case 'G':
		case 'K':
		case 'P':
			if (Next_Letter == 'N') {
				Phonize('N');
				w_idx += 2;
			}
			break;
		/* WH becomes W,
		   WR becomes R
		   W if followed by a vowel */
		case 'W':
			if (Next_Letter == 'R') {
				Phonize(Next_Letter);
				w_idx += 2;
			} else if (Next_Letter == 'H' || isvowel(Next_Letter)) {
				Phonize('W');
				w_idx += 2;
			}
			/* else ignore */
			break;
		/* X becomes S */
		case 'X':
			Phonize('S');
			w_idx++;
			break;
		/* Vowels are kept (A was handled above) */
		case 'E':
		case 'I':
		case 'O':
		case 'U':
			Phonize(Curr_Letter);
			w_idx++;
			break;
		default:
			/* do nothing */
			break;
	}

	/* On to the metaphoning */
	for (; Curr_Letter != '\0' &&
		 (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes);
		 w_idx++) {
		/* How many letters to skip because an earlier encoding handled
		 * multiple letters */
		unsigned short int skip_letter = 0;

		/* THOUGHT:  It would be nice if, rather than having things like...
		 * well, SCI.  For SCI you encode the S, then have to remember
		 * to skip the C.  So the phoneme SCI invades both S and C.  It would
		 * be better, IMHO, to skip the C from the S part of the encoding.
		 * Hell, I'm trying it.
		 */

		/* Ignore non-alphas */
		if (!isalpha(Curr_Letter)) {
			continue;
		}

		/* Drop duplicates, except CC */
		if (Curr_Letter == Prev_Letter && Curr_Letter != 'C') {
			continue;
		}

		switch (Curr_Letter) {
			/* B -> B unless in MB */
			case 'B':
				if (Prev_Letter != 'M') {
					Phonize('B');
				}
				break;
			/* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
			 * (SCHW is handled in S)
			 *  S if -CI-, -CE- or -CY-
			 *  dropped if -SCI-, SCE-, -SCY- (handled in S)
			 *  else K
			 */
			case 'C':
				if (MAKESOFT(Next_Letter)) {	/* C[IEY] */
					if (After_Next_Letter == 'A' && Next_Letter == 'I') {	/* CIA */
						Phonize(SH);
					} else if (Prev_Letter == 'S') {
						/* SC[IEY]: dropped */
					} else {
						Phonize('S');
					}
				} else if (Next_Letter == 'H') {
					if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) {	/* Christ, School */
						Phonize('K');
					} else {
						Phonize(SH);
					}
					skip_letter++;
				} else {
					Phonize('K');
				}
				break;
			/* J if in -DGE-, -DGI- or -DGY-
			 * else T
			 */
			case 'D':
				if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) {
					Phonize('J');
					skip_letter++;
				} else {
					Phonize('T');
				}
				break;
			/* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
			 * else dropped if -GNED, -GN,
			 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
			 * else J if in -GE-, -GI, -GY and not GG
			 * else K
			 */
			case 'G':
				if (Next_Letter == 'H') {
					if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
						Phonize('F');
						skip_letter++;
					} else {
						/* silent */
					}
				} else if (Next_Letter == 'N') {
					if (Isbreak(After_Next_Letter) ||
						(After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) {
						/* dropped */
					} else {
						Phonize('K');
					}
				} else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') {
					Phonize('J');
				} else {
					Phonize('K');
				}
				break;
			/* H if before a vowel and not after C,G,P,S,T */
			case 'H':
				if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter)) {
					Phonize('H');
				}
				break;
			/* dropped if after C
			 * else K
			 */
			case 'K':
				if (Prev_Letter != 'C') {
					Phonize('K');
				}
				break;
			/* F if before H
			 * else P
			 */
			case 'P':
				if (Next_Letter == 'H') {
					Phonize('F');
				} else {
					Phonize('P');
				}
				break;
			/* K */
			case 'Q':
				Phonize('K');
				break;
			/* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
			 * else S
			 */
			case 'S':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
					Phonize(SH);
				} else if (Next_Letter == 'H') {
					Phonize(SH);
					skip_letter++;
				} else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
					Phonize(SH);
					skip_letter += 2;
				} else {
					Phonize('S');
				}
				break;
			/* 'sh' in -TIA- or -TIO-
			 * else 'th' before H
			 * else dropped in -TCH-
			 * else T
			 */
			case 'T':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
					Phonize(SH);
				} else if (Next_Letter == 'H') {
					Phonize(TH);
					skip_letter++;
				} else if (!(Next_Letter == 'C' && After_Next_Letter == 'H')) {
					Phonize('T');
				}
				break;
			/* F */
			case 'V':
				Phonize('F');
				break;
			/* W before a vowel, else dropped */
			case 'W':
				if (isvowel(Next_Letter)) {
					Phonize('W');
				}
				break;
			/* KS */
			case 'X':
				Phonize('K');
				Phonize('S');
				break;
			/* Y if followed by a vowel */
			case 'Y':
				if (isvowel(Next_Letter)) {
					Phonize('Y');
				}
				break;
			/* S */
			case 'Z':
				Phonize('S');
				break;
			/* No transformation */
			case 'F':
			case 'J':
			case 'L':
			case 'M':
			case 'N':
			case 'R':
				Phonize(Curr_Letter);
				break;
			default:
				/* nothing */
				break;
		}						/* END SWITCH */

		w_idx += skip_letter;
	}							/* END FOR */

	End_Phoned_Word;

	return 0;
}								/* END metaphone */
469
/* }}} */