• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.commons.codec.language;
18 
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21 
22 /**
23  * Encodes a string into a double metaphone value.
24  * This implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
25  * <ul>
26  * <li>Original Article: <a
27  * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
28  * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
29  * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
30  * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
31  * </ul>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
35  */
36 public class DoubleMetaphone implements StringEncoder {
37 
38     /**
39      * "Vowels" to test for
40      */
41     private static final String VOWELS = "AEIOUY";
42 
43     /**
44      * Prefixes when present which are not pronounced
45      */
46     private static final String[] SILENT_START =
47     { "GN", "KN", "PN", "WR", "PS" };
48     private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
49     { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
50     private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
51     { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
52     private static final String[] L_T_K_S_N_M_B_Z =
53     { "L", "T", "K", "S", "N", "M", "B", "Z" };
54 
55     /**
56      * Maximum length of an encoding, default is 4
57      */
58     protected int maxCodeLen = 4;
59 
60     /**
61      * Creates an instance of this DoubleMetaphone encoder
62      */
DoubleMetaphone()63     public DoubleMetaphone() {
64         super();
65     }
66 
67     /**
68      * Encode a value with Double Metaphone
69      *
70      * @param value String to encode
71      * @return an encoded string
72      */
doubleMetaphone(String value)73     public String doubleMetaphone(String value) {
74         return doubleMetaphone(value, false);
75     }
76 
77     /**
78      * Encode a value with Double Metaphone, optionally using the alternate
79      * encoding.
80      *
81      * @param value String to encode
82      * @param alternate use alternate encode
83      * @return an encoded string
84      */
doubleMetaphone(String value, boolean alternate)85     public String doubleMetaphone(String value, boolean alternate) {
86         value = cleanInput(value);
87         if (value == null) {
88             return null;
89         }
90 
91         boolean slavoGermanic = isSlavoGermanic(value);
92         int index = isSilentStart(value) ? 1 : 0;
93 
94         DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
95 
96         while (!result.isComplete() && index <= value.length() - 1) {
97             switch (value.charAt(index)) {
98             case 'A':
99             case 'E':
100             case 'I':
101             case 'O':
102             case 'U':
103             case 'Y':
104                 index = handleAEIOUY(value, result, index);
105                 break;
106             case 'B':
107                 result.append('P');
108                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109                 break;
110             case '\u00C7':
111                 // A C with a Cedilla
112                 result.append('S');
113                 index++;
114                 break;
115             case 'C':
116                 index = handleC(value, result, index);
117                 break;
118             case 'D':
119                 index = handleD(value, result, index);
120                 break;
121             case 'F':
122                 result.append('F');
123                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124                 break;
125             case 'G':
126                 index = handleG(value, result, index, slavoGermanic);
127                 break;
128             case 'H':
129                 index = handleH(value, result, index);
130                 break;
131             case 'J':
132                 index = handleJ(value, result, index, slavoGermanic);
133                 break;
134             case 'K':
135                 result.append('K');
136                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137                 break;
138             case 'L':
139                 index = handleL(value, result, index);
140                 break;
141             case 'M':
142                 result.append('M');
143                 index = conditionM0(value, index) ? index + 2 : index + 1;
144                 break;
145             case 'N':
146                 result.append('N');
147                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148                 break;
149             case '\u00D1':
150                 // N with a tilde (spanish ene)
151                 result.append('N');
152                 index++;
153                 break;
154             case 'P':
155                 index = handleP(value, result, index);
156                 break;
157             case 'Q':
158                 result.append('K');
159                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160                 break;
161             case 'R':
162                 index = handleR(value, result, index, slavoGermanic);
163                 break;
164             case 'S':
165                 index = handleS(value, result, index, slavoGermanic);
166                 break;
167             case 'T':
168                 index = handleT(value, result, index);
169                 break;
170             case 'V':
171                 result.append('F');
172                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173                 break;
174             case 'W':
175                 index = handleW(value, result, index);
176                 break;
177             case 'X':
178                 index = handleX(value, result, index);
179                 break;
180             case 'Z':
181                 index = handleZ(value, result, index, slavoGermanic);
182                 break;
183             default:
184                 index++;
185                 break;
186             }
187         }
188 
189         return alternate ? result.getAlternate() : result.getPrimary();
190     }
191 
192     /**
193      * Encode the value using DoubleMetaphone.  It will only work if
194      * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
195      *
196      * @param obj Object to encode (should be of type String)
197      * @return An encoded Object (will be of type String)
198      * @throws EncoderException encode parameter is not of type String
199      */
encode(Object obj)200     public Object encode(Object obj) throws EncoderException {
201         if (!(obj instanceof String)) {
202             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
203         }
204         return doubleMetaphone((String) obj);
205     }
206 
207     /**
208      * Encode the value using DoubleMetaphone.
209      *
210      * @param value String to encode
211      * @return An encoded String
212      */
encode(String value)213     public String encode(String value) {
214         return doubleMetaphone(value);
215     }
216 
217     /**
218      * Check if the Double Metaphone values of two <code>String</code> values
219      * are equal.
220      *
221      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
222      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
223      * @return <code>true</code> if the encoded <code>String</code>s are equal;
224      *          <code>false</code> otherwise.
225      * @see #isDoubleMetaphoneEqual(String,String,boolean)
226      */
isDoubleMetaphoneEqual(String value1, String value2)227     public boolean isDoubleMetaphoneEqual(String value1, String value2) {
228         return isDoubleMetaphoneEqual(value1, value2, false);
229     }
230 
231     /**
232      * Check if the Double Metaphone values of two <code>String</code> values
233      * are equal, optionally using the alternate value.
234      *
235      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
236      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
237      * @param alternate use the alternate value if <code>true</code>.
238      * @return <code>true</code> if the encoded <code>String</code>s are equal;
239      *          <code>false</code> otherwise.
240      */
isDoubleMetaphoneEqual(String value1, String value2, boolean alternate)241     public boolean isDoubleMetaphoneEqual(String value1,
242                                           String value2,
243                                           boolean alternate) {
244         return doubleMetaphone(value1, alternate).equals(doubleMetaphone
245                                                          (value2, alternate));
246     }
247 
248     /**
249      * Returns the maxCodeLen.
250      * @return int
251      */
getMaxCodeLen()252     public int getMaxCodeLen() {
253         return this.maxCodeLen;
254     }
255 
256     /**
257      * Sets the maxCodeLen.
258      * @param maxCodeLen The maxCodeLen to set
259      */
setMaxCodeLen(int maxCodeLen)260     public void setMaxCodeLen(int maxCodeLen) {
261         this.maxCodeLen = maxCodeLen;
262     }
263 
264     //-- BEGIN HANDLERS --//
265 
266     /**
267      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
268      */
handleAEIOUY(String value, DoubleMetaphoneResult result, int index)269     private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
270                              index) {
271         if (index == 0) {
272             result.append('A');
273         }
274         return index + 1;
275     }
276 
277     /**
278      * Handles 'C' cases
279      */
handleC(String value, DoubleMetaphoneResult result, int index)280     private int handleC(String value,
281                         DoubleMetaphoneResult result,
282                         int index) {
283         if (conditionC0(value, index)) {  // very confusing, moved out
284             result.append('K');
285             index += 2;
286         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
287             result.append('S');
288             index += 2;
289         } else if (contains(value, index, 2, "CH")) {
290             index = handleCH(value, result, index);
291         } else if (contains(value, index, 2, "CZ") &&
292                    !contains(value, index - 2, 4, "WICZ")) {
293             //-- "Czerny" --//
294             result.append('S', 'X');
295             index += 2;
296         } else if (contains(value, index + 1, 3, "CIA")) {
297             //-- "focaccia" --//
298             result.append('X');
299             index += 3;
300         } else if (contains(value, index, 2, "CC") &&
301                    !(index == 1 && charAt(value, 0) == 'M')) {
302             //-- double "cc" but not "McClelland" --//
303             return handleCC(value, result, index);
304         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
305             result.append('K');
306             index += 2;
307         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
308             //-- Italian vs. English --//
309             if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
310                 result.append('S', 'X');
311             } else {
312                 result.append('S');
313             }
314             index += 2;
315         } else {
316             result.append('K');
317             if (contains(value, index + 1, 2, " C", " Q", " G")) {
318                 //-- Mac Caffrey, Mac Gregor --//
319                 index += 3;
320             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
321                        !contains(value, index + 1, 2, "CE", "CI")) {
322                 index += 2;
323             } else {
324                 index++;
325             }
326         }
327 
328         return index;
329     }
330 
331     /**
332      * Handles 'CC' cases
333      */
handleCC(String value, DoubleMetaphoneResult result, int index)334     private int handleCC(String value,
335                          DoubleMetaphoneResult result,
336                          int index) {
337         if (contains(value, index + 2, 1, "I", "E", "H") &&
338             !contains(value, index + 2, 2, "HU")) {
339             //-- "bellocchio" but not "bacchus" --//
340             if ((index == 1 && charAt(value, index - 1) == 'A') ||
341                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
342                 //-- "accident", "accede", "succeed" --//
343                 result.append("KS");
344             } else {
345                 //-- "bacci", "bertucci", other Italian --//
346                 result.append('X');
347             }
348             index += 3;
349         } else {    // Pierce's rule
350             result.append('K');
351             index += 2;
352         }
353 
354         return index;
355     }
356 
357     /**
358      * Handles 'CH' cases
359      */
handleCH(String value, DoubleMetaphoneResult result, int index)360     private int handleCH(String value,
361                          DoubleMetaphoneResult result,
362                          int index) {
363         if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
364             result.append('K', 'X');
365             return index + 2;
366         } else if (conditionCH0(value, index)) {
367             //-- Greek roots ("chemistry", "chorus", etc.) --//
368             result.append('K');
369             return index + 2;
370         } else if (conditionCH1(value, index)) {
371             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
372             result.append('K');
373             return index + 2;
374         } else {
375             if (index > 0) {
376                 if (contains(value, 0, 2, "MC")) {
377                     result.append('K');
378                 } else {
379                     result.append('X', 'K');
380                 }
381             } else {
382                 result.append('X');
383             }
384             return index + 2;
385         }
386     }
387 
388     /**
389      * Handles 'D' cases
390      */
handleD(String value, DoubleMetaphoneResult result, int index)391     private int handleD(String value,
392                         DoubleMetaphoneResult result,
393                         int index) {
394         if (contains(value, index, 2, "DG")) {
395             //-- "Edge" --//
396             if (contains(value, index + 2, 1, "I", "E", "Y")) {
397                 result.append('J');
398                 index += 3;
399                 //-- "Edgar" --//
400             } else {
401                 result.append("TK");
402                 index += 2;
403             }
404         } else if (contains(value, index, 2, "DT", "DD")) {
405             result.append('T');
406             index += 2;
407         } else {
408             result.append('T');
409             index++;
410         }
411         return index;
412     }
413 
414     /**
415      * Handles 'G' cases
416      */
handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)417     private int handleG(String value,
418                         DoubleMetaphoneResult result,
419                         int index,
420                         boolean slavoGermanic) {
421         if (charAt(value, index + 1) == 'H') {
422             index = handleGH(value, result, index);
423         } else if (charAt(value, index + 1) == 'N') {
424             if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
425                 result.append("KN", "N");
426             } else if (!contains(value, index + 2, 2, "EY") &&
427                        charAt(value, index + 1) != 'Y' && !slavoGermanic) {
428                 result.append("N", "KN");
429             } else {
430                 result.append("KN");
431             }
432             index = index + 2;
433         } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
434             result.append("KL", "L");
435             index += 2;
436         } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
437             //-- -ges-, -gep-, -gel-, -gie- at beginning --//
438             result.append('K', 'J');
439             index += 2;
440         } else if ((contains(value, index + 1, 2, "ER") ||
441                     charAt(value, index + 1) == 'Y') &&
442                    !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
443                    !contains(value, index - 1, 1, "E", "I") &&
444                    !contains(value, index - 1, 3, "RGY", "OGY")) {
445             //-- -ger-, -gy- --//
446             result.append('K', 'J');
447             index += 2;
448         } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
449                    contains(value, index - 1, 4, "AGGI", "OGGI")) {
450             //-- Italian "biaggi" --//
451             if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
452                 //-- obvious germanic --//
453                 result.append('K');
454             } else if (contains(value, index + 1, 4, "IER")) {
455                 result.append('J');
456             } else {
457                 result.append('J', 'K');
458             }
459             index += 2;
460         } else if (charAt(value, index + 1) == 'G') {
461             index += 2;
462             result.append('K');
463         } else {
464             index++;
465             result.append('K');
466         }
467         return index;
468     }
469 
470     /**
471      * Handles 'GH' cases
472      */
handleGH(String value, DoubleMetaphoneResult result, int index)473     private int handleGH(String value,
474                          DoubleMetaphoneResult result,
475                          int index) {
476         if (index > 0 && !isVowel(charAt(value, index - 1))) {
477             result.append('K');
478             index += 2;
479         } else if (index == 0) {
480             if (charAt(value, index + 2) == 'I') {
481                 result.append('J');
482             } else {
483                 result.append('K');
484             }
485             index += 2;
486         } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
487                    (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
488                    (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
489             //-- Parker's rule (with some further refinements) - "hugh"
490             index += 2;
491         } else {
492             if (index > 2 && charAt(value, index - 1) == 'U' &&
493                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
494                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
495                 result.append('F');
496             } else if (index > 0 && charAt(value, index - 1) != 'I') {
497                 result.append('K');
498             }
499             index += 2;
500         }
501         return index;
502     }
503 
504     /**
505      * Handles 'H' cases
506      */
handleH(String value, DoubleMetaphoneResult result, int index)507     private int handleH(String value,
508                         DoubleMetaphoneResult result,
509                         int index) {
510         //-- only keep if first & before vowel or between 2 vowels --//
511         if ((index == 0 || isVowel(charAt(value, index - 1))) &&
512             isVowel(charAt(value, index + 1))) {
513             result.append('H');
514             index += 2;
515             //-- also takes car of "HH" --//
516         } else {
517             index++;
518         }
519         return index;
520     }
521 
522     /**
523      * Handles 'J' cases
524      */
handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)525     private int handleJ(String value, DoubleMetaphoneResult result, int index,
526                         boolean slavoGermanic) {
527         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
528                 //-- obvious Spanish, "Jose", "San Jacinto" --//
529                 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
530                      value.length() == 4) || contains(value, 0, 4, "SAN ")) {
531                     result.append('H');
532                 } else {
533                     result.append('J', 'H');
534                 }
535                 index++;
536             } else {
537                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
538                     result.append('J', 'A');
539                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
540                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
541                     result.append('J', 'H');
542                 } else if (index == value.length() - 1) {
543                     result.append('J', ' ');
544                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
545                     result.append('J');
546                 }
547 
548                 if (charAt(value, index + 1) == 'J') {
549                     index += 2;
550                 } else {
551                     index++;
552                 }
553             }
554         return index;
555     }
556 
557     /**
558      * Handles 'L' cases
559      */
handleL(String value, DoubleMetaphoneResult result, int index)560     private int handleL(String value,
561                         DoubleMetaphoneResult result,
562                         int index) {
563         result.append('L');
564         if (charAt(value, index + 1) == 'L') {
565             if (conditionL0(value, index)) {
566                 result.appendAlternate(' ');
567             }
568             index += 2;
569         } else {
570             index++;
571         }
572         return index;
573     }
574 
575     /**
576      * Handles 'P' cases
577      */
handleP(String value, DoubleMetaphoneResult result, int index)578     private int handleP(String value,
579                         DoubleMetaphoneResult result,
580                         int index) {
581         if (charAt(value, index + 1) == 'H') {
582             result.append('F');
583             index += 2;
584         } else {
585             result.append('P');
586             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
587         }
588         return index;
589     }
590 
591     /**
592      * Handles 'R' cases
593      */
handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)594     private int handleR(String value,
595                         DoubleMetaphoneResult result,
596                         int index,
597                         boolean slavoGermanic) {
598         if (index == value.length() - 1 && !slavoGermanic &&
599             contains(value, index - 2, 2, "IE") &&
600             !contains(value, index - 4, 2, "ME", "MA")) {
601             result.appendAlternate('R');
602         } else {
603             result.append('R');
604         }
605         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
606     }
607 
608     /**
609      * Handles 'S' cases
610      */
handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)611     private int handleS(String value,
612                         DoubleMetaphoneResult result,
613                         int index,
614                         boolean slavoGermanic) {
615         if (contains(value, index - 1, 3, "ISL", "YSL")) {
616             //-- special cases "island", "isle", "carlisle", "carlysle" --//
617             index++;
618         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
619             //-- special case "sugar-" --//
620             result.append('X', 'S');
621             index++;
622         } else if (contains(value, index, 2, "SH")) {
623             if (contains(value, index + 1, 4,
624                          "HEIM", "HOEK", "HOLM", "HOLZ")) {
625                 //-- germanic --//
626                 result.append('S');
627             } else {
628                 result.append('X');
629             }
630             index += 2;
631         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
632             //-- Italian and Armenian --//
633             if (slavoGermanic) {
634                 result.append('S');
635             } else {
636                 result.append('S', 'X');
637             }
638             index += 3;
639         } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
640             //-- german & anglicisations, e.g. "smith" match "schmidt" //
641             // "snider" match "schneider" --//
642             //-- also, -sz- in slavic language altho in hungarian it //
643             //   is pronounced "s" --//
644             result.append('S', 'X');
645             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
646         } else if (contains(value, index, 2, "SC")) {
647             index = handleSC(value, result, index);
648         } else {
649             if (index == value.length() - 1 && contains(value, index - 2,
650                                                         2, "AI", "OI")){
651                 //-- french e.g. "resnais", "artois" --//
652                 result.appendAlternate('S');
653             } else {
654                 result.append('S');
655             }
656             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
657         }
658         return index;
659     }
660 
661     /**
662      * Handles 'SC' cases
663      */
handleSC(String value, DoubleMetaphoneResult result, int index)664     private int handleSC(String value,
665                          DoubleMetaphoneResult result,
666                          int index) {
667         if (charAt(value, index + 2) == 'H') {
668             //-- Schlesinger's rule --//
669             if (contains(value, index + 3,
670                          2, "OO", "ER", "EN", "UY", "ED", "EM")) {
671                 //-- Dutch origin, e.g. "school", "schooner" --//
672                 if (contains(value, index + 3, 2, "ER", "EN")) {
673                     //-- "schermerhorn", "schenker" --//
674                     result.append("X", "SK");
675                 } else {
676                     result.append("SK");
677                 }
678             } else {
679                 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
680                     result.append('X', 'S');
681                 } else {
682                     result.append('X');
683                 }
684             }
685         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
686             result.append('S');
687         } else {
688             result.append("SK");
689         }
690         return index + 3;
691     }
692 
693     /**
694      * Handles 'T' cases
695      */
handleT(String value, DoubleMetaphoneResult result, int index)696     private int handleT(String value,
697                         DoubleMetaphoneResult result,
698                         int index) {
699         if (contains(value, index, 4, "TION")) {
700             result.append('X');
701             index += 3;
702         } else if (contains(value, index, 3, "TIA", "TCH")) {
703             result.append('X');
704             index += 3;
705         } else if (contains(value, index, 2, "TH") || contains(value, index,
706                                                                3, "TTH")) {
707             if (contains(value, index + 2, 2, "OM", "AM") ||
708                 //-- special case "thomas", "thames" or germanic --//
709                 contains(value, 0, 4, "VAN ", "VON ") ||
710                 contains(value, 0, 3, "SCH")) {
711                 result.append('T');
712             } else {
713                 result.append('0', 'T');
714             }
715             index += 2;
716         } else {
717             result.append('T');
718             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
719         }
720         return index;
721     }
722 
723     /**
724      * Handles 'W' cases
725      */
handleW(String value, DoubleMetaphoneResult result, int index)726     private int handleW(String value,
727                         DoubleMetaphoneResult result,
728                         int index) {
729         if (contains(value, index, 2, "WR")) {
730             //-- can also be in middle of word --//
731             result.append('R');
732             index += 2;
733         } else {
734             if (index == 0 && (isVowel(charAt(value, index + 1)) ||
735                                contains(value, index, 2, "WH"))) {
736                 if (isVowel(charAt(value, index + 1))) {
737                     //-- Wasserman should match Vasserman --//
738                     result.append('A', 'F');
739                 } else {
740                     //-- need Uomo to match Womo --//
741                     result.append('A');
742                 }
743                 index++;
744             } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
745                        contains(value, index - 1,
746                                 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
747                        contains(value, 0, 3, "SCH")) {
748                 //-- Arnow should match Arnoff --//
749                 result.appendAlternate('F');
750                 index++;
751             } else if (contains(value, index, 4, "WICZ", "WITZ")) {
752                 //-- Polish e.g. "filipowicz" --//
753                 result.append("TS", "FX");
754                 index += 4;
755             } else {
756                 index++;
757             }
758         }
759         return index;
760     }
761 
762     /**
763      * Handles 'X' cases
764      */
handleX(String value, DoubleMetaphoneResult result, int index)765     private int handleX(String value,
766                         DoubleMetaphoneResult result,
767                         int index) {
768         if (index == 0) {
769             result.append('S');
770             index++;
771         } else {
772             if (!((index == value.length() - 1) &&
773                   (contains(value, index - 3, 3, "IAU", "EAU") ||
774                    contains(value, index - 2, 2, "AU", "OU")))) {
775                 //-- French e.g. breaux --//
776                 result.append("KS");
777             }
778             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
779         }
780         return index;
781     }
782 
783     /**
784      * Handles 'Z' cases
785      */
handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)786     private int handleZ(String value, DoubleMetaphoneResult result, int index,
787                         boolean slavoGermanic) {
788         if (charAt(value, index + 1) == 'H') {
789             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
790             result.append('J');
791             index += 2;
792         } else {
793             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
794                 result.append("S", "TS");
795             } else {
796                 result.append('S');
797             }
798             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
799         }
800         return index;
801     }
802 
803     //-- BEGIN CONDITIONS --//
804 
805     /**
806      * Complex condition 0 for 'C'
807      */
conditionC0(String value, int index)808     private boolean conditionC0(String value, int index) {
809         if (contains(value, index, 4, "CHIA")) {
810             return true;
811         } else if (index <= 1) {
812             return false;
813         } else if (isVowel(charAt(value, index - 2))) {
814             return false;
815         } else if (!contains(value, index - 1, 3, "ACH")) {
816             return false;
817         } else {
818             char c = charAt(value, index + 2);
819             return (c != 'I' && c != 'E')
820                     || contains(value, index - 2, 6, "BACHER", "MACHER");
821         }
822     }
823 
824     /**
825      * Complex condition 0 for 'CH'
826      */
conditionCH0(String value, int index)827     private boolean conditionCH0(String value, int index) {
828         if (index != 0) {
829             return false;
830         } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
831                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
832             return false;
833         } else if (contains(value, 0, 5, "CHORE")) {
834             return false;
835         } else {
836             return true;
837         }
838     }
839 
840     /**
841      * Complex condition 1 for 'CH'
842      */
conditionCH1(String value, int index)843     private boolean conditionCH1(String value, int index) {
844         return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
845                                                                    3, "SCH")) ||
846                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
847                 contains(value, index + 2, 1, "T", "S") ||
848                 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
849                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
850     }
851 
852     /**
853      * Complex condition 0 for 'L'
854      */
conditionL0(String value, int index)855     private boolean conditionL0(String value, int index) {
856         if (index == value.length() - 3 &&
857             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
858             return true;
859         } else if ((contains(value, index - 1, 2, "AS", "OS") ||
860                     contains(value, value.length() - 1, 1, "A", "O")) &&
861                    contains(value, index - 1, 4, "ALLE")) {
862             return true;
863         } else {
864             return false;
865         }
866     }
867 
868     /**
869      * Complex condition 0 for 'M'
870      */
conditionM0(String value, int index)871     private boolean conditionM0(String value, int index) {
872         if (charAt(value, index + 1) == 'M') {
873             return true;
874         }
875         return contains(value, index - 1, 3, "UMB")
876                 && ((index + 1) == value.length() - 1 || contains(value,
877                         index + 2, 2, "ER"));
878     }
879 
880     //-- BEGIN HELPER FUNCTIONS --//
881 
882     /**
883      * Determines whether or not a value is of slavo-germanic orgin. A value is
884      * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
885      */
isSlavoGermanic(String value)886     private boolean isSlavoGermanic(String value) {
887         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
888             value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
889     }
890 
891     /**
892      * Determines whether or not a character is a vowel or not
893      */
isVowel(char ch)894     private boolean isVowel(char ch) {
895         return VOWELS.indexOf(ch) != -1;
896     }
897 
898     /**
899      * Determines whether or not the value starts with a silent letter.  It will
900      * return <code>true</code> if the value starts with any of 'GN', 'KN',
901      * 'PN', 'WR' or 'PS'.
902      */
isSilentStart(String value)903     private boolean isSilentStart(String value) {
904         boolean result = false;
905         for (int i = 0; i < SILENT_START.length; i++) {
906             if (value.startsWith(SILENT_START[i])) {
907                 result = true;
908                 break;
909             }
910         }
911         return result;
912     }
913 
914     /**
915      * Cleans the input
916      */
cleanInput(String input)917     private String cleanInput(String input) {
918         if (input == null) {
919             return null;
920         }
921         input = input.trim();
922         if (input.length() == 0) {
923             return null;
924         }
925         return input.toUpperCase();
926     }
927 
928     /**
929      * Gets the character at index <code>index</code> if available, otherwise
930      * it returns <code>Character.MIN_VALUE</code> so that there is some sort
931      * of a default
932      */
charAt(String value, int index)933     protected char charAt(String value, int index) {
934         if (index < 0 || index >= value.length()) {
935             return Character.MIN_VALUE;
936         }
937         return value.charAt(index);
938     }
939 
940     /**
941      * Shortcut method with 1 criteria
942      */
contains(String value, int start, int length, String criteria)943     private static boolean contains(String value, int start, int length,
944                                     String criteria) {
945         return contains(value, start, length,
946                         new String[] { criteria });
947     }
948 
949     /**
950      * Shortcut method with 2 criteria
951      */
contains(String value, int start, int length, String criteria1, String criteria2)952     private static boolean contains(String value, int start, int length,
953                                     String criteria1, String criteria2) {
954         return contains(value, start, length,
955                         new String[] { criteria1, criteria2 });
956     }
957 
958     /**
959      * Shortcut method with 3 criteria
960      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3)961     private static boolean contains(String value, int start, int length,
962                                     String criteria1, String criteria2,
963                                     String criteria3) {
964         return contains(value, start, length,
965                         new String[] { criteria1, criteria2, criteria3 });
966     }
967 
968     /**
969      * Shortcut method with 4 criteria
970      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4)971     private static boolean contains(String value, int start, int length,
972                                     String criteria1, String criteria2,
973                                     String criteria3, String criteria4) {
974         return contains(value, start, length,
975                         new String[] { criteria1, criteria2, criteria3,
976                                        criteria4 });
977     }
978 
979     /**
980      * Shortcut method with 5 criteria
981      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5)982     private static boolean contains(String value, int start, int length,
983                                     String criteria1, String criteria2,
984                                     String criteria3, String criteria4,
985                                     String criteria5) {
986         return contains(value, start, length,
987                         new String[] { criteria1, criteria2, criteria3,
988                                        criteria4, criteria5 });
989     }
990 
991     /**
992      * Shortcut method with 6 criteria
993      */
contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5, String criteria6)994     private static boolean contains(String value, int start, int length,
995                                     String criteria1, String criteria2,
996                                     String criteria3, String criteria4,
997                                     String criteria5, String criteria6) {
998         return contains(value, start, length,
999                         new String[] { criteria1, criteria2, criteria3,
1000                                        criteria4, criteria5, criteria6 });
1001     }
1002 
1003     /**
1004      * Determines whether <code>value</code> contains any of the criteria
1005      starting
1006      * at index <code>start</code> and matching up to length <code>length</code>
1007      */
contains(String value, int start, int length, String[] criteria)1008     protected static boolean contains(String value, int start, int length,
1009                                       String[] criteria) {
1010         boolean result = false;
1011         if (start >= 0 && start + length <= value.length()) {
1012             String target = value.substring(start, start + length);
1013 
1014             for (int i = 0; i < criteria.length; i++) {
1015                 if (target.equals(criteria[i])) {
1016                     result = true;
1017                     break;
1018                 }
1019             }
1020         }
1021         return result;
1022     }
1023 
1024     //-- BEGIN INNER CLASSES --//
1025 
1026     /**
1027      * Inner class for storing results, since there is the optional alternate
1028      * encoding.
1029      */
1030     public class DoubleMetaphoneResult {
1031 
1032         private StringBuffer primary = new StringBuffer(getMaxCodeLen());
1033         private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
1034         private int maxLength;
1035 
DoubleMetaphoneResult(int maxLength)1036         public DoubleMetaphoneResult(int maxLength) {
1037             this.maxLength = maxLength;
1038         }
1039 
append(char value)1040         public void append(char value) {
1041             appendPrimary(value);
1042             appendAlternate(value);
1043         }
1044 
append(char primary, char alternate)1045         public void append(char primary, char alternate) {
1046             appendPrimary(primary);
1047             appendAlternate(alternate);
1048         }
1049 
appendPrimary(char value)1050         public void appendPrimary(char value) {
1051             if (this.primary.length() < this.maxLength) {
1052                 this.primary.append(value);
1053             }
1054         }
1055 
appendAlternate(char value)1056         public void appendAlternate(char value) {
1057             if (this.alternate.length() < this.maxLength) {
1058                 this.alternate.append(value);
1059             }
1060         }
1061 
append(String value)1062         public void append(String value) {
1063             appendPrimary(value);
1064             appendAlternate(value);
1065         }
1066 
append(String primary, String alternate)1067         public void append(String primary, String alternate) {
1068             appendPrimary(primary);
1069             appendAlternate(alternate);
1070         }
1071 
appendPrimary(String value)1072         public void appendPrimary(String value) {
1073             int addChars = this.maxLength - this.primary.length();
1074             if (value.length() <= addChars) {
1075                 this.primary.append(value);
1076             } else {
1077                 this.primary.append(value.substring(0, addChars));
1078             }
1079         }
1080 
appendAlternate(String value)1081         public void appendAlternate(String value) {
1082             int addChars = this.maxLength - this.alternate.length();
1083             if (value.length() <= addChars) {
1084                 this.alternate.append(value);
1085             } else {
1086                 this.alternate.append(value.substring(0, addChars));
1087             }
1088         }
1089 
getPrimary()1090         public String getPrimary() {
1091             return this.primary.toString();
1092         }
1093 
getAlternate()1094         public String getAlternate() {
1095             return this.alternate.toString();
1096         }
1097 
isComplete()1098         public boolean isComplete() {
1099             return this.primary.length() >= this.maxLength &&
1100                 this.alternate.length() >= this.maxLength;
1101         }
1102     }
1103 }
1104