/src/php-src/ext/standard/metaphone.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | +----------------------------------------------------------------------+ |
3 | | | Copyright (c) The PHP Group | |
4 | | +----------------------------------------------------------------------+ |
5 | | | This source file is subject to version 3.01 of the PHP license, | |
6 | | | that is bundled with this package in the file LICENSE, and is | |
7 | | | available through the world-wide-web at the following url: | |
8 | | | https://www.php.net/license/3_01.txt | |
9 | | | If you did not receive a copy of the PHP license and are unable to | |
10 | | | obtain it through the world-wide-web, please send a note to | |
11 | | | license@php.net so we can mail you a copy immediately. | |
12 | | +----------------------------------------------------------------------+ |
13 | | | Author: Thies C. Arntzen <thies@thieso.net> | |
14 | | +----------------------------------------------------------------------+ |
15 | | */ |
16 | | |
17 | | /* |
18 | | Based on CPANs "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> |
19 | | */ |
20 | | |
21 | | #include "php.h" |
22 | | |
23 | | static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional); |
24 | | |
25 | | /* {{{ Break english phrases down into their phonemes */ |
26 | | PHP_FUNCTION(metaphone) |
27 | 0 | { |
28 | 0 | zend_string *str; |
29 | 0 | zend_string *result = NULL; |
30 | 0 | zend_long phones = 0; |
31 | |
|
32 | 0 | ZEND_PARSE_PARAMETERS_START(1, 2) |
33 | 0 | Z_PARAM_STR(str) |
34 | 0 | Z_PARAM_OPTIONAL |
35 | 0 | Z_PARAM_LONG(phones) |
36 | 0 | ZEND_PARSE_PARAMETERS_END(); |
37 | | |
38 | 0 | if (phones < 0) { |
39 | 0 | zend_argument_value_error(2, "must be greater than or equal to 0"); |
40 | 0 | RETURN_THROWS(); |
41 | 0 | } |
42 | | |
43 | 0 | metaphone((unsigned char *)ZSTR_VAL(str), ZSTR_LEN(str), phones, &result, 1); |
44 | 0 | RETVAL_STR(result); |
45 | 0 | } |
46 | | /* }}} */ |
47 | | |
48 | | /* |
49 | | this is now the original code by Michael G Schwern: |
50 | | i've changed it just a slightly bit (use emalloc, |
51 | | get rid of includes etc) |
52 | | - thies - 13.09.1999 |
53 | | */ |
54 | | |
55 | | /*----------------------------- */ |
56 | | /* this used to be "metaphone.h" */ |
57 | | /*----------------------------- */ |
58 | | |
59 | | /* Special encodings */ |
60 | | #define SH 'X' |
61 | | #define TH '0' |
62 | | |
63 | | /*----------------------------- */ |
64 | | /* end of "metaphone.h" */ |
65 | | /*----------------------------- */ |
66 | | |
67 | | /*----------------------------- */ |
68 | | /* this used to be "metachar.h" */ |
69 | | /*----------------------------- */ |
70 | | |
71 | | /* Metachar.h ... little bits about characters for metaphone */ |
72 | | /*-- Character encoding array & accessing macros --*/ |
73 | | /* Stolen directly out of the book... */ |
74 | | static const char _codes[26] = |
75 | | { |
76 | | 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0 |
77 | | /* a b c d e f g h i j k l m n o p q r s t u v w x y z */ |
78 | | }; |
79 | | |
80 | | |
81 | | /* Note: these functions require an uppercase letter input! */ |
82 | 0 | static zend_always_inline char encode(char c) { |
83 | 0 | if (isalpha(c)) { |
84 | 0 | ZEND_ASSERT(c >= 'A' && c <= 'Z'); |
85 | 0 | return _codes[(c - 'A')]; |
86 | 0 | } else { |
87 | 0 | return 0; |
88 | 0 | } |
89 | 0 | } |
90 | | |
91 | 0 | #define isvowel(c) (encode(c) & 1) /* AEIOU */ |
92 | | |
93 | | /* These letters are passed through unchanged */ |
94 | | #define NOCHANGE(c) (encode(c) & 2) /* FJMNR */ |
95 | | |
96 | | /* These form diphthongs when preceding H */ |
97 | 0 | #define AFFECTH(c) (encode(c) & 4) /* CGPST */ |
98 | | |
99 | | /* These make C and G soft */ |
100 | 0 | #define MAKESOFT(c) (encode(c) & 8) /* EIY */ |
101 | | |
102 | | /* These prevent GH from becoming F */ |
103 | 0 | #define NOGHTOF(c) (encode(c) & 16) /* BDH */ |
104 | | |
105 | | /*----------------------------- */ |
106 | | /* end of "metachar.h" */ |
107 | | /*----------------------------- */ |
108 | | |
109 | | /* I suppose I could have been using a character pointer instead of |
110 | | * accesssing the array directly... */ |
111 | | |
112 | 0 | #define Convert_Raw(c) toupper(c) |
113 | | /* Look at the next letter in the word */ |
114 | 0 | #define Read_Raw_Next_Letter (word[w_idx+1]) |
115 | 0 | #define Read_Next_Letter (Convert_Raw(Read_Raw_Next_Letter)) |
116 | | /* Look at the current letter in the word */ |
117 | 0 | #define Read_Raw_Curr_Letter (word[w_idx]) |
118 | | #define Read_Curr_Letter (Convert_Raw(Read_Raw_Curr_Letter)) |
119 | | /* Go N letters back. */ |
120 | 0 | #define Look_Back_Letter(n) (w_idx >= n ? Convert_Raw(word[w_idx-n]) : '\0') |
121 | | /* Previous letter. I dunno, should this return null on failure? */ |
122 | 0 | #define Read_Prev_Letter (Look_Back_Letter(1)) |
123 | | /* Look two letters down. It makes sure you don't walk off the string. */ |
124 | 0 | #define Read_After_Next_Letter (Read_Raw_Next_Letter != '\0' ? Convert_Raw(word[w_idx+2]) \ |
125 | 0 | : '\0') |
126 | 0 | #define Look_Ahead_Letter(n) (toupper(Lookahead((char *) word+w_idx, n))) |
127 | | |
128 | | |
129 | | /* Allows us to safely look ahead an arbitrary # of letters */ |
130 | | /* I probably could have just used strlen... */ |
131 | | static char Lookahead(char *word, int how_far) |
132 | 0 | { |
133 | 0 | int idx; |
134 | 0 | for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); |
135 | | /* Edge forward in the string... */ |
136 | |
|
137 | 0 | return word[idx]; /* idx will be either == to how_far or |
138 | | * at the end of the string where it will be null |
139 | | */ |
140 | 0 | } |
141 | | |
142 | | |
143 | | /* phonize one letter |
144 | | * We don't know the buffers size in advance. On way to solve this is to just |
145 | | * re-allocate the buffer size. We're using an extra of 2 characters (this |
146 | | * could be one though; or more too). */ |
147 | 0 | #define Phonize(c) { \ |
148 | 0 | if (p_idx >= max_buffer_len) { \ |
149 | 0 | *phoned_word = zend_string_extend(*phoned_word, 2 * sizeof(char) + max_buffer_len, 0); \ |
150 | 0 | max_buffer_len += 2; \ |
151 | 0 | } \ |
152 | 0 | ZSTR_VAL(*phoned_word)[p_idx++] = c; \ |
153 | 0 | ZSTR_LEN(*phoned_word) = p_idx; \ |
154 | 0 | } |
155 | | /* Slap a null character on the end of the phoned word */ |
156 | 0 | #define End_Phoned_Word() { \ |
157 | 0 | if (p_idx == max_buffer_len) { \ |
158 | 0 | *phoned_word = zend_string_extend(*phoned_word, 1 * sizeof(char) + max_buffer_len, 0); \ |
159 | 0 | max_buffer_len += 1; \ |
160 | 0 | } \ |
161 | 0 | ZSTR_VAL(*phoned_word)[p_idx] = '\0'; \ |
162 | 0 | ZSTR_LEN(*phoned_word) = p_idx; \ |
163 | 0 | } |
164 | | /* How long is the phoned word? */ |
165 | 0 | #define Phone_Len (p_idx) |
166 | | |
167 | | /* Note is a letter is a 'break' in the word */ |
168 | 0 | #define Isbreak(c) (!isalpha(c)) |
169 | | |
170 | | /* {{{ metaphone */ |
171 | | static void metaphone(unsigned char *word, size_t word_len, zend_long max_phonemes, zend_string **phoned_word, int traditional) |
172 | 0 | { |
173 | 0 | int w_idx = 0; /* point in the phonization we're at. */ |
174 | 0 | size_t p_idx = 0; /* end of the phoned phrase */ |
175 | 0 | size_t max_buffer_len = 0; /* maximum length of the destination buffer */ |
176 | 0 | char curr_letter; |
177 | 0 | ZEND_ASSERT(word != NULL); |
178 | 0 | ZEND_ASSERT(max_phonemes >= 0); |
179 | | |
180 | | /*-- Allocate memory for our phoned_phrase --*/ |
181 | 0 | if (max_phonemes == 0) { /* Assume largest possible */ |
182 | 0 | max_buffer_len = word_len; |
183 | 0 | *phoned_word = zend_string_alloc(sizeof(char) * word_len + 1, 0); |
184 | 0 | } else { |
185 | 0 | max_buffer_len = max_phonemes; |
186 | 0 | *phoned_word = zend_string_alloc(sizeof(char) * max_phonemes + 1, 0); |
187 | 0 | } |
188 | | |
189 | | |
190 | | /*-- The first phoneme has to be processed specially. --*/ |
191 | | /* Find our first letter */ |
192 | 0 | for (; !isalpha(curr_letter = Read_Raw_Curr_Letter); w_idx++) { |
193 | | /* On the off chance we were given nothing but crap... */ |
194 | 0 | if (curr_letter == '\0') { |
195 | 0 | End_Phoned_Word(); |
196 | 0 | return; |
197 | 0 | } |
198 | 0 | } |
199 | | |
200 | 0 | curr_letter = Convert_Raw(curr_letter); |
201 | |
|
202 | 0 | switch (curr_letter) { |
203 | | /* AE becomes E */ |
204 | 0 | case 'A': |
205 | 0 | if (Read_Next_Letter == 'E') { |
206 | 0 | Phonize('E'); |
207 | 0 | w_idx += 2; |
208 | 0 | } |
209 | | /* Remember, preserve vowels at the beginning */ |
210 | 0 | else { |
211 | 0 | Phonize('A'); |
212 | 0 | w_idx++; |
213 | 0 | } |
214 | 0 | break; |
215 | | /* [GKP]N becomes N */ |
216 | 0 | case 'G': |
217 | 0 | case 'K': |
218 | 0 | case 'P': |
219 | 0 | if (Read_Next_Letter == 'N') { |
220 | 0 | Phonize('N'); |
221 | 0 | w_idx += 2; |
222 | 0 | } |
223 | 0 | break; |
224 | | /* WH becomes W, |
225 | | WR becomes R |
226 | | W if followed by a vowel */ |
227 | 0 | case 'W': { |
228 | 0 | char next_letter = Read_Next_Letter; |
229 | 0 | if (next_letter == 'R') { |
230 | 0 | Phonize('R'); |
231 | 0 | w_idx += 2; |
232 | 0 | } else if (next_letter == 'H' || isvowel(next_letter)) { |
233 | 0 | Phonize('W'); |
234 | 0 | w_idx += 2; |
235 | 0 | } |
236 | | /* else ignore */ |
237 | 0 | break; |
238 | 0 | } |
239 | | /* X becomes S */ |
240 | 0 | case 'X': |
241 | 0 | Phonize('S'); |
242 | 0 | w_idx++; |
243 | 0 | break; |
244 | | /* Vowels are kept */ |
245 | | /* We did A already |
246 | | case 'A': |
247 | | case 'a': |
248 | | */ |
249 | 0 | case 'E': |
250 | 0 | case 'I': |
251 | 0 | case 'O': |
252 | 0 | case 'U': |
253 | 0 | Phonize(curr_letter); |
254 | 0 | w_idx++; |
255 | 0 | break; |
256 | 0 | default: |
257 | | /* do nothing */ |
258 | 0 | break; |
259 | 0 | } |
260 | | |
261 | | |
262 | | |
263 | | /* On to the metaphoning */ |
264 | 0 | for (; (curr_letter = Read_Raw_Curr_Letter) != '\0' && |
265 | 0 | (max_phonemes == 0 || Phone_Len < (size_t)max_phonemes); |
266 | 0 | w_idx++) { |
267 | | /* How many letters to skip because an eariler encoding handled |
268 | | * multiple letters */ |
269 | 0 | unsigned short int skip_letter = 0; |
270 | | |
271 | | |
272 | | /* THOUGHT: It would be nice if, rather than having things like... |
273 | | * well, SCI. For SCI you encode the S, then have to remember |
274 | | * to skip the C. So the phonome SCI invades both S and C. It would |
275 | | * be better, IMHO, to skip the C from the S part of the encoding. |
276 | | * Hell, I'm trying it. |
277 | | */ |
278 | | |
279 | | /* Ignore non-alphas */ |
280 | 0 | if (!isalpha(curr_letter)) |
281 | 0 | continue; |
282 | | |
283 | 0 | curr_letter = Convert_Raw(curr_letter); |
284 | | /* Note: we can't cache curr_letter from the previous loop |
285 | | * because of the skip_letter variable. */ |
286 | 0 | char prev_letter = Read_Prev_Letter; |
287 | | |
288 | | /* Drop duplicates, except CC */ |
289 | 0 | if (curr_letter == prev_letter && |
290 | 0 | curr_letter != 'C') |
291 | 0 | continue; |
292 | | |
293 | 0 | switch (curr_letter) { |
294 | | /* B -> B unless in MB */ |
295 | 0 | case 'B': |
296 | 0 | if (prev_letter != 'M') |
297 | 0 | Phonize('B'); |
298 | 0 | break; |
299 | | /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. |
300 | | * (SCHW is handled in S) |
301 | | * S if -CI-, -CE- or -CY- |
302 | | * dropped if -SCI-, SCE-, -SCY- (handed in S) |
303 | | * else K |
304 | | */ |
305 | 0 | case 'C': { |
306 | 0 | char next_letter = Read_Next_Letter; |
307 | 0 | if (MAKESOFT(next_letter)) { /* C[IEY] */ |
308 | 0 | if (next_letter == 'I' && Read_After_Next_Letter == 'A') { /* CIA */ |
309 | 0 | Phonize(SH); |
310 | 0 | } |
311 | | /* SC[IEY] */ |
312 | 0 | else if (prev_letter == 'S') { |
313 | | /* Dropped */ |
314 | 0 | } else { |
315 | 0 | Phonize('S'); |
316 | 0 | } |
317 | 0 | } else if (next_letter == 'H') { |
318 | 0 | if ((!traditional) && (prev_letter == 'S' || Read_After_Next_Letter == 'R')) { /* Christ, School */ |
319 | 0 | Phonize('K'); |
320 | 0 | } else { |
321 | 0 | Phonize(SH); |
322 | 0 | } |
323 | 0 | skip_letter++; |
324 | 0 | } else { |
325 | 0 | Phonize('K'); |
326 | 0 | } |
327 | 0 | break; |
328 | 0 | } |
329 | | /* J if in -DGE-, -DGI- or -DGY- |
330 | | * else T |
331 | | */ |
332 | 0 | case 'D': |
333 | 0 | if (Read_Next_Letter == 'G' && |
334 | 0 | MAKESOFT(Read_After_Next_Letter)) { |
335 | 0 | Phonize('J'); |
336 | 0 | skip_letter++; |
337 | 0 | } else |
338 | 0 | Phonize('T'); |
339 | 0 | break; |
340 | | /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH |
341 | | * else dropped if -GNED, -GN, |
342 | | * else dropped if -DGE-, -DGI- or -DGY- (handled in D) |
343 | | * else J if in -GE-, -GI, -GY and not GG |
344 | | * else K |
345 | | */ |
346 | 0 | case 'G': { |
347 | 0 | char next_letter = Read_Next_Letter; |
348 | 0 | if (next_letter == 'H') { |
349 | 0 | if (!(NOGHTOF(Look_Back_Letter(3)) || |
350 | 0 | Look_Back_Letter(4) == 'H')) { |
351 | 0 | Phonize('F'); |
352 | 0 | skip_letter++; |
353 | 0 | } else { |
354 | | /* silent */ |
355 | 0 | } |
356 | 0 | } else if (next_letter == 'N') { |
357 | 0 | char after_next_letter = Read_After_Next_Letter; |
358 | 0 | if (Isbreak(after_next_letter) || |
359 | 0 | (after_next_letter == 'E' && |
360 | 0 | Look_Ahead_Letter(3) == 'D')) { |
361 | | /* dropped */ |
362 | 0 | } else |
363 | 0 | Phonize('K'); |
364 | 0 | } else if (MAKESOFT(next_letter) && |
365 | 0 | prev_letter != 'G') { |
366 | 0 | Phonize('J'); |
367 | 0 | } else { |
368 | 0 | Phonize('K'); |
369 | 0 | } |
370 | 0 | break; |
371 | 0 | } |
372 | | /* H if before a vowel and not after C,G,P,S,T */ |
373 | 0 | case 'H': |
374 | 0 | if (isvowel(Read_Next_Letter) && |
375 | 0 | !AFFECTH(prev_letter)) |
376 | 0 | Phonize('H'); |
377 | 0 | break; |
378 | | /* dropped if after C |
379 | | * else K |
380 | | */ |
381 | 0 | case 'K': |
382 | 0 | if (prev_letter != 'C') |
383 | 0 | Phonize('K'); |
384 | 0 | break; |
385 | | /* F if before H |
386 | | * else P |
387 | | */ |
388 | 0 | case 'P': |
389 | 0 | if (Read_Next_Letter == 'H') { |
390 | 0 | Phonize('F'); |
391 | 0 | } else { |
392 | 0 | Phonize('P'); |
393 | 0 | } |
394 | 0 | break; |
395 | | /* K |
396 | | */ |
397 | 0 | case 'Q': |
398 | 0 | Phonize('K'); |
399 | 0 | break; |
400 | | /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- |
401 | | * else S |
402 | | */ |
403 | 0 | case 'S': { |
404 | 0 | char next_letter = Read_Next_Letter; |
405 | 0 | char after_next_letter; |
406 | 0 | if (next_letter == 'I' && |
407 | 0 | ((after_next_letter = Read_After_Next_Letter) == 'O' || |
408 | 0 | after_next_letter == 'A')) { |
409 | 0 | Phonize(SH); |
410 | 0 | } else if (next_letter == 'H') { |
411 | 0 | Phonize(SH); |
412 | 0 | skip_letter++; |
413 | 0 | } else if ((!traditional) && (next_letter == 'C' && Look_Ahead_Letter(2) == 'H' && Look_Ahead_Letter(3) == 'W')) { |
414 | 0 | Phonize(SH); |
415 | 0 | skip_letter += 2; |
416 | 0 | } else { |
417 | 0 | Phonize('S'); |
418 | 0 | } |
419 | 0 | break; |
420 | 0 | } |
421 | | /* 'sh' in -TIA- or -TIO- |
422 | | * else 'th' before H |
423 | | * else T |
424 | | */ |
425 | 0 | case 'T': { |
426 | 0 | char next_letter = Read_Next_Letter; |
427 | 0 | char after_next_letter; |
428 | 0 | if (next_letter == 'I' && |
429 | 0 | ((after_next_letter = Read_After_Next_Letter) == 'O' || |
430 | 0 | after_next_letter == 'A')) { |
431 | 0 | Phonize(SH); |
432 | 0 | } else if (next_letter == 'H') { |
433 | 0 | Phonize(TH); |
434 | 0 | skip_letter++; |
435 | 0 | } else if (!(next_letter == 'C' && Read_After_Next_Letter == 'H')) { |
436 | 0 | Phonize('T'); |
437 | 0 | } |
438 | 0 | break; |
439 | 0 | } |
440 | | /* F */ |
441 | 0 | case 'V': |
442 | 0 | Phonize('F'); |
443 | 0 | break; |
444 | | /* W before a vowel, else dropped */ |
445 | 0 | case 'W': |
446 | 0 | if (isvowel(Read_Next_Letter)) |
447 | 0 | Phonize('W'); |
448 | 0 | break; |
449 | | /* KS */ |
450 | 0 | case 'X': |
451 | 0 | Phonize('K'); |
452 | 0 | Phonize('S'); |
453 | 0 | break; |
454 | | /* Y if followed by a vowel */ |
455 | 0 | case 'Y': |
456 | 0 | if (isvowel(Read_Next_Letter)) |
457 | 0 | Phonize('Y'); |
458 | 0 | break; |
459 | | /* S */ |
460 | 0 | case 'Z': |
461 | 0 | Phonize('S'); |
462 | 0 | break; |
463 | | /* No transformation */ |
464 | 0 | case 'F': |
465 | 0 | case 'J': |
466 | 0 | case 'L': |
467 | 0 | case 'M': |
468 | 0 | case 'N': |
469 | 0 | case 'R': |
470 | 0 | Phonize(curr_letter); |
471 | 0 | break; |
472 | 0 | default: |
473 | | /* nothing */ |
474 | 0 | break; |
475 | 0 | } /* END SWITCH */ |
476 | | |
477 | 0 | w_idx += skip_letter; |
478 | 0 | } /* END FOR */ |
479 | | |
480 | 0 | End_Phoned_Word(); |
481 | 0 | } /* END metaphone */ |
482 | | /* }}} */ |