Coverage Report

Created: 2026-06-02 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/php-src/ext/standard/metaphone.c
Line
Count
Source
1
/*
2
   +----------------------------------------------------------------------+
3
   | Copyright © The PHP Group and Contributors.                          |
4
   +----------------------------------------------------------------------+
5
   | This source file is subject to the Modified BSD License that is      |
6
   | bundled with this package in the file LICENSE, and is available      |
7
   | through the World Wide Web at <https://www.php.net/license/>.        |
8
   |                                                                      |
9
   | SPDX-License-Identifier: BSD-3-Clause                                |
10
   +----------------------------------------------------------------------+
11
   | Author: Thies C. Arntzen <thies@thieso.net>                          |
12
   +----------------------------------------------------------------------+
13
*/
14
15
/*
16
  Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
17
*/
18
19
#include "php.h"
20
21
/* Forward declaration: the worker is defined further down in this file, after
 * PHP_FUNCTION(metaphone), which is its caller here. */
static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional);
22
23
/* {{{ Break english phrases down into their phonemes */
24
PHP_FUNCTION(metaphone)
25
0
{
26
0
  zend_string *str;
27
0
  zend_string *result = NULL;
28
0
  zend_long phones = 0;
29
30
0
  ZEND_PARSE_PARAMETERS_START(1, 2)
31
0
    Z_PARAM_STR(str)
32
0
    Z_PARAM_OPTIONAL
33
0
    Z_PARAM_LONG(phones)
34
0
  ZEND_PARSE_PARAMETERS_END();
35
36
0
  if (phones < 0) {
37
0
    zend_argument_value_error(2, "must be greater than or equal to 0");
38
0
    RETURN_THROWS();
39
0
  }
40
41
0
  metaphone((unsigned char *)ZSTR_VAL(str), ZSTR_LEN(str), phones, &result, 1);
42
0
  RETVAL_STR(result);
43
0
}
44
/* }}} */
45
46
/*
47
   this is now the original code by Michael G Schwern:
48
   i've changed it just a slight bit (use emalloc,
49
   get rid of includes etc)
50
  - thies - 13.09.1999
51
*/
52
53
/*-----------------------------  */
54
/* this used to be "metaphone.h" */
55
/*-----------------------------  */
56
57
/* Special encodings */
58
#define  SH   'X'   /* output code used for the 'sh' sound */
59
#define  TH   '0'   /* output code used for the 'th' sound (the digit zero) */
60
61
/*-----------------------------  */
62
/* end of "metaphone.h"          */
63
/*-----------------------------  */
64
65
/*----------------------------- */
66
/* this used to be "metachar.h" */
67
/*----------------------------- */
68
69
/* Metachar.h ... little bits about characters for metaphone */
70
/*-- Character encoding array & accessing macros --*/
71
/* Stolen directly out of the book... */
72
/* Per-letter flag table, indexed by (uppercase letter - 'A').
 * Bit values: 1 = vowel, 2 = passed through unchanged, 4 = forms a
 * diphthong before H, 8 = makes C and G soft, 16 = prevents GH -> F. */
static const char _codes[26] = {
  1,      /* A */
  16,     /* B */
  4,      /* C */
  16,     /* D */
  1 | 8,  /* E */
  2,      /* F */
  4,      /* G */
  16,     /* H */
  1 | 8,  /* I */
  2,      /* J */
  0,      /* K */
  2,      /* L */
  2,      /* M */
  2,      /* N */
  1,      /* O */
  4,      /* P */
  0,      /* Q */
  2,      /* R */
  4,      /* S */
  4,      /* T */
  1,      /* U */
  0,      /* V */
  0,      /* W */
  0,      /* X */
  8,      /* Y */
  0       /* Z */
};
77
78
79
/* Note: these functions require an uppercase letter input! */
80
0
static zend_always_inline char encode(char c) {
81
0
  if (isalpha((unsigned char)c)) {
82
0
    ZEND_ASSERT(c >= 'A' && c <= 'Z');
83
0
    return _codes[(c - 'A')];
84
0
  } else {
85
0
    return 0;
86
0
  }
87
0
}
88
89
0
#define isvowel(c)  (encode(c) & 1)    /* vowels: A, E, I, O, U */
90
91
/* These letters are passed through unchanged */
92
#define NOCHANGE(c) (encode(c) & 2)   /* FJLMNR (the table also sets this bit for L) */
93
94
/* These form diphthongs when preceding H */
95
0
#define AFFECTH(c)  (encode(c) & 4)    /* letters C, G, P, S, T */
96
97
/* These make C and G soft */
98
0
#define MAKESOFT(c) (encode(c) & 8)    /* letters E, I, Y */
99
100
/* These prevent GH from becoming F */
101
0
#define NOGHTOF(c)  (encode(c) & 16)  /* letters B, D, H */
102
103
/*----------------------------- */
104
/* end of "metachar.h"          */
105
/*----------------------------- */
106
107
/* I suppose I could have been using a character pointer instead of
108
 * accessing the array directly... */
109
110
0
/* Letter-access helpers. They expand in a scope that provides `word` (the
 * input bytes) and `w_idx` (the current index into it). Macro parameters are
 * parenthesized so that expression arguments expand as a single operand. */

/* Upper-case one raw byte; the cast keeps the toupper() argument in range. */
#define Convert_Raw(c) toupper((unsigned char)(c))
/* Look at the next letter in the word */
#define Read_Raw_Next_Letter (word[w_idx+1])
#define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter))
/* Look at the current letter in the word */
#define Read_Raw_Curr_Letter (word[w_idx])
#define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) (w_idx >= (n) ? Convert_Raw(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' when the current letter is the first one. */
#define Read_Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define Read_After_Next_Letter  (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \
                           : '\0')
/* Look N letters down via Lookahead(), which stops at the terminator. */
#define Look_Ahead_Letter(n) (toupper((unsigned char)Lookahead((char *) word+w_idx, (n))))
125
126
127
/* Allows us to safely look ahead an arbitrary # of letters */
128
/* I probably could have just used strlen... */
129
/* Return the character `how_far` positions past the start of `word`, or the
 * terminating '\0' if the string ends before that position is reached. */
static char Lookahead(char *word, size_t how_far)
{
  const char *cursor = word;
  size_t remaining = how_far;

  /* Step forward one character at a time, never past the terminator. */
  while (remaining > 0 && *cursor != '\0') {
    cursor++;
    remaining--;
  }

  return *cursor;
}
139
140
141
/* phonize one letter
142
 * We don't know the buffer's size in advance. One way to solve this is to just
143
 * re-allocate the buffer size. We're using an extra of 2 characters (this
144
 * could be one though; or more too). */
145
0
/* Append one phoneme character to *phoned_word, growing the buffer by two
 * characters first when it is full. Expands in a scope that provides
 * `phoned_word`, `p_idx` and `max_buffer_len`. Wrapped in do/while(0) so the
 * macro acts as a single statement, including in an unbraced if/else. */
#define Phonize(c)  do { \
            if (p_idx >= max_buffer_len) { \
              *phoned_word = zend_string_extend(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \
              max_buffer_len += 2; \
            } \
            ZSTR_VAL(*phoned_word)[p_idx++] = (c); \
            ZSTR_LEN(*phoned_word) = p_idx; \
          } while (0)
/* Slap a null character on the end of the phoned word */
#define End_Phoned_Word() do { \
              if (p_idx == max_buffer_len) { \
                *phoned_word = zend_string_extend(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \
                max_buffer_len += 1; \
              } \
              ZSTR_VAL(*phoned_word)[p_idx] = '\0'; \
              ZSTR_LEN(*phoned_word) = p_idx; \
            } while (0)
/* How long is the phoned word? */
#define Phone_Len (p_idx)

/* Note whether a letter is a 'break' in the word (anything isalpha() rejects) */
#define Isbreak(c)  (!isalpha((unsigned char)(c)))
167
168
/* {{{ metaphone */
169
/* Compute the metaphone key of `word` and store it in *phoned_word.
 *
 * word          input bytes; scanning stops at the first '\0'
 * word_len      length of the input, used only to size the output buffer
 * max_phonemes  0 for no limit; otherwise the main loop stops once this many
 *               phonemes have been produced (the two-character "KS" emitted
 *               for X may overshoot the limit by one)
 * phoned_word   receives a newly allocated zend_string holding the result
 * traditional   when zero, two extra rules apply: CH after S or before R
 *               becomes K, and SCHW becomes 'X'
 */
static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional)
{
  size_t w_idx = 0;       /* point in the phonization we're at. */
  size_t p_idx = 0;       /* end of the phoned phrase */
  size_t max_buffer_len = 0;    /* maximum length of the destination buffer */
  char curr_letter;
  ZEND_ASSERT(word != NULL);
  ZEND_ASSERT(max_phonemes >= 0);

/*-- Allocate memory for our phoned_phrase --*/
  /* Size the initial buffer from the input length, never from a larger
   * phoneme limit: a huge limit must not force a huge allocation for a short
   * word. Phonize() and End_Phoned_Word() grow the buffer on demand, so the
   * starting capacity does not affect the result. */
  if (max_phonemes == 0 || (size_t)max_phonemes > word_len) {
    max_buffer_len = word_len;
  } else {
    max_buffer_len = (size_t)max_phonemes;
  }
  *phoned_word = zend_string_alloc(sizeof(char) * max_buffer_len + 1, 0);


/*-- The first phoneme has to be processed specially. --*/
  /* Find our first letter */
  for (; !isalpha((unsigned char)(curr_letter = Read_Raw_Curr_Letter)); w_idx++) {
    /* On the off chance we were given nothing but crap... */
    if (curr_letter == '\0') {
      End_Phoned_Word();
      return;
    }
  }

  curr_letter = Convert_Raw(curr_letter);

  switch (curr_letter) {
    /* AE becomes E */
  case 'A':
    if (Read_Next_Letter == 'E') {
      Phonize('E');
      w_idx += 2;
    }
    /* Remember, preserve vowels at the beginning */
    else {
      Phonize('A');
      w_idx++;
    }
    break;
    /* [GKP]N becomes N */
  case 'G':
  case 'K':
  case 'P':
    if (Read_Next_Letter == 'N') {
      Phonize('N');
      w_idx += 2;
    }
    /* otherwise w_idx stays put and the main loop handles this letter */
    break;
    /* WH becomes W,
       WR becomes R
       W if followed by a vowel */
  case 'W': {
    char next_letter = Read_Next_Letter;
    if (next_letter == 'R') {
      Phonize('R');
      w_idx += 2;
    } else if (next_letter == 'H' || isvowel(next_letter)) {
      Phonize('W');
      w_idx += 2;
    }
    /* else ignore */
    break;
  }
    /* X becomes S */
  case 'X':
    Phonize('S');
    w_idx++;
    break;
    /* Vowels are kept */
    /* We did A already
       case 'A':
       case 'a':
     */
  case 'E':
  case 'I':
  case 'O':
  case 'U':
    Phonize(curr_letter);
    w_idx++;
    break;
  default:
    /* do nothing */
    break;
  }



  /* On to the metaphoning */
  for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' &&
     (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes);
     w_idx++) {
    /* How many letters to skip because an earlier encoding handled
     * multiple letters */
    unsigned short int skip_letter = 0;


    /* THOUGHT:  It would be nice if, rather than having things like...
     * well, SCI.  For SCI you encode the S, then have to remember
     * to skip the C.  So the phonome SCI invades both S and C.  It would
     * be better, IMHO, to skip the C from the S part of the encoding.
     * Hell, I'm trying it.
     */

    /* Ignore non-alphas */
    if (!isalpha((unsigned char)curr_letter)) {
      continue;
    }

    curr_letter = Convert_Raw(curr_letter);
    /* Note: we can't cache curr_letter from the previous loop
     * because of the skip_letter variable. */
    char prev_letter = Read_Prev_Letter;

    /* Drop duplicates, except CC */
    if (curr_letter == prev_letter &&
      curr_letter != 'C') {
      continue;
    }

    switch (curr_letter) {
      /* B -> B unless in MB */
    case 'B':
      if (prev_letter != 'M') {
        Phonize('B');
      }
      break;
      /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
       * (SCHW is handled in S)
       *  S if -CI-, -CE- or -CY-
       *  dropped if -SCI-, SCE-, -SCY- (handed in S)
       *  else K
       */
    case 'C': {
      char next_letter = Read_Next_Letter;
      if (MAKESOFT(next_letter)) { /* C[IEY] */
        if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */
          Phonize(SH);
        }
        /* SC[IEY] */
        else if (prev_letter == 'S') {
          /* Dropped */
        } else {
          Phonize('S');
        }
      } else if (next_letter == 'H') {
        if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */
          Phonize('K');
        } else {
          Phonize(SH);
        }
        skip_letter++;
      } else {
        Phonize('K');
      }
      break;
    }
      /* J if in -DGE-, -DGI- or -DGY-
       * else T
       */
    case 'D':
      if (Read_Next_Letter == 'G' &&
        MAKESOFT(Read_After_Next_Letter)) {
        Phonize('J');
        skip_letter++;
      } else {
        Phonize('T');
      }
      break;
      /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
       * else dropped if -GNED, -GN,
       * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
       * else J if in -GE-, -GI, -GY and not GG
       * else K
       */
    case 'G': {
      char next_letter = Read_Next_Letter;
      if (next_letter == 'H') {
        if (!(NOGHTOF(Look_Back_Letter(3)) ||
            Look_Back_Letter(4) == 'H')) {
          Phonize('F');
          skip_letter++;
        } else {
          /* silent */
        }
      } else if (next_letter == 'N') {
        char after_next_letter = Read_After_Next_Letter;
        if (Isbreak(after_next_letter) ||
          (after_next_letter == 'E' &&
           Look_Ahead_Letter(3) == 'D')) {
          /* dropped */
        } else {
          Phonize('K');
        }
      } else if (MAKESOFT(next_letter) &&
             prev_letter != 'G') {
        Phonize('J');
      } else {
        Phonize('K');
      }
      break;
    }
      /* H if before a vowel and not after C,G,P,S,T */
    case 'H':
      if (isvowel(Read_Next_Letter) &&
        !AFFECTH(prev_letter)) {
        Phonize('H');
      }
      break;
      /* dropped if after C
       * else K
       */
    case 'K':
      if (prev_letter != 'C') {
        Phonize('K');
      }
      break;
      /* F if before H
       * else P
       */
    case 'P':
      if (Read_Next_Letter == 'H') {
        Phonize('F');
      } else {
        Phonize('P');
      }
      break;
      /* K
       */
    case 'Q':
      Phonize('K');
      break;
      /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
       * else S
       */
    case 'S': {
      char next_letter = Read_Next_Letter;
      char after_next_letter;
      if (next_letter == 'I' &&
        ((after_next_letter = Read_After_Next_Letter) == 'O' ||
         after_next_letter == 'A')) {
        Phonize(SH);
      } else if (next_letter == 'H') {
        Phonize(SH);
        skip_letter++;
      } else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) {
        Phonize(SH);
        skip_letter += 2;
      } else {
        Phonize('S');
      }
      break;
    }
      /* 'sh' in -TIA- or -TIO-
       * else 'th' before H
       * else T
       */
    case 'T': {
      char next_letter = Read_Next_Letter;
      char after_next_letter;
      if (next_letter == 'I' &&
        ((after_next_letter = Read_After_Next_Letter) == 'O' ||
         after_next_letter == 'A')) {
        Phonize(SH);
      } else if (next_letter == 'H') {
        Phonize(TH);
        skip_letter++;
      } else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) {
        /* T is dropped in -TCH- */
        Phonize('T');
      }
      break;
    }
      /* F */
    case 'V':
      Phonize('F');
      break;
      /* W before a vowel, else dropped */
    case 'W':
      if (isvowel(Read_Next_Letter)) {
        Phonize('W');
      }
      break;
      /* KS */
    case 'X':
      Phonize('K');
      Phonize('S');
      break;
      /* Y if followed by a vowel */
    case 'Y':
      if (isvowel(Read_Next_Letter)) {
        Phonize('Y');
      }
      break;
      /* S */
    case 'Z':
      Phonize('S');
      break;
      /* No transformation */
    case 'F':
    case 'J':
    case 'L':
    case 'M':
    case 'N':
    case 'R':
      Phonize(curr_letter);
      break;
    default:
      /* nothing */
      break;
    }           /* END SWITCH */

    w_idx += skip_letter;
  }              /* END FOR */

  End_Phoned_Word();
}                /* END metaphone */
480
/* }}} */