• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.commons.codec.language;
18 
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21 
22 /**
23  * Encodes a string into a metaphone value.
24  * <p>
25  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
26  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
27  * </p>
28  * <p>
29  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
30  * 39.</CITE>
31  * </p>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
35  */
36 public class Metaphone implements StringEncoder {
37 
38     /**
39      * Five values in the English language
40      */
41     private String vowels = "AEIOU" ;
42 
43     /**
44      * Variable used in Metaphone algorithm
45      */
46     private String frontv = "EIY"   ;
47 
48     /**
49      * Variable used in Metaphone algorithm
50      */
51     private String varson = "CSPTG" ;
52 
53     /**
54      * The max code length for metaphone is 4
55      */
56     private int maxCodeLen = 4 ;
57 
58     /**
59      * Creates an instance of the Metaphone encoder
60      */
Metaphone()61     public Metaphone() {
62         super();
63     }
64 
65     /**
66      * Find the metaphone value of a String. This is similar to the
67      * soundex algorithm, but better at finding similar sounding words.
68      * All input is converted to upper case.
69      * Limitations: Input format is expected to be a single ASCII word
70      * with only characters in the A - Z range, no punctuation or numbers.
71      *
72      * @param txt String to find the metaphone code for
73      * @return A metaphone code corresponding to the String supplied
74      */
metaphone(String txt)75     public String metaphone(String txt) {
76         boolean hard = false ;
77         if ((txt == null) || (txt.length() == 0)) {
78             return "" ;
79         }
80         // single character is itself
81         if (txt.length() == 1) {
82             return txt.toUpperCase() ;
83         }
84 
85         char[] inwd = txt.toUpperCase().toCharArray() ;
86 
87         StringBuffer local = new StringBuffer(40); // manipulate
88         StringBuffer code = new StringBuffer(10) ; //   output
89         // handle initial 2 characters exceptions
90         switch(inwd[0]) {
91         case 'K' :
92         case 'G' :
93         case 'P' : /* looking for KN, etc*/
94             if (inwd[1] == 'N') {
95                 local.append(inwd, 1, inwd.length - 1);
96             } else {
97                 local.append(inwd);
98             }
99             break;
100         case 'A': /* looking for AE */
101             if (inwd[1] == 'E') {
102                 local.append(inwd, 1, inwd.length - 1);
103             } else {
104                 local.append(inwd);
105             }
106             break;
107         case 'W' : /* looking for WR or WH */
108             if (inwd[1] == 'R') {   // WR -> R
109                 local.append(inwd, 1, inwd.length - 1);
110                 break ;
111             }
112             if (inwd[1] == 'H') {
113                 local.append(inwd, 1, inwd.length - 1);
114                 local.setCharAt(0, 'W'); // WH -> W
115             } else {
116                 local.append(inwd);
117             }
118             break;
119         case 'X' : /* initial X becomes S */
120             inwd[0] = 'S';
121             local.append(inwd);
122             break ;
123         default :
124             local.append(inwd);
125         } // now local has working string with initials fixed
126 
127         int wdsz = local.length();
128         int n = 0 ;
129 
130         while ((code.length() < this.getMaxCodeLen()) &&
131                (n < wdsz) ) { // max code size of 4 works well
132             char symb = local.charAt(n) ;
133             // remove duplicate letters except C
134             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
135                 n++ ;
136             } else { // not dup
137                 switch(symb) {
138                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
139                     if (n == 0) {
140                         code.append(symb);
141                     }
142                     break ; // only use vowel if leading char
143                 case 'B' :
144                     if ( isPreviousChar(local, n, 'M') &&
145                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
146                         break;
147                     }
148                     code.append(symb);
149                     break;
150                 case 'C' : // lots of C special cases
151                     /* discard if SCI, SCE or SCY */
152                     if ( isPreviousChar(local, n, 'S') &&
153                          !isLastChar(wdsz, n) &&
154                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
155                         break;
156                     }
157                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
158                         code.append('X');
159                         break;
160                     }
161                     if (!isLastChar(wdsz, n) &&
162                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
163                         code.append('S');
164                         break; // CI,CE,CY -> S
165                     }
166                     if (isPreviousChar(local, n, 'S') &&
167                         isNextChar(local, n, 'H') ) { // SCH->sk
168                         code.append('K') ;
169                         break ;
170                     }
171                     if (isNextChar(local, n, 'H')) { // detect CH
172                         if ((n == 0) &&
173                             (wdsz >= 3) &&
174                             isVowel(local,2) ) { // CH consonant -> K consonant
175                             code.append('K');
176                         } else {
177                             code.append('X'); // CHvowel -> X
178                         }
179                     } else {
180                         code.append('K');
181                     }
182                     break ;
183                 case 'D' :
184                     if (!isLastChar(wdsz, n + 1) &&
185                         isNextChar(local, n, 'G') &&
186                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
187                         code.append('J'); n += 2 ;
188                     } else {
189                         code.append('T');
190                     }
191                     break ;
192                 case 'G' : // GH silent at end or before consonant
193                     if (isLastChar(wdsz, n + 1) &&
194                         isNextChar(local, n, 'H')) {
195                         break;
196                     }
197                     if (!isLastChar(wdsz, n + 1) &&
198                         isNextChar(local,n,'H') &&
199                         !isVowel(local,n+2)) {
200                         break;
201                     }
202                     if ((n > 0) &&
203                         ( regionMatch(local, n, "GN") ||
204                           regionMatch(local, n, "GNED") ) ) {
205                         break; // silent G
206                     }
207                     if (isPreviousChar(local, n, 'G')) {
208                         hard = true ;
209                     } else {
210                         hard = false ;
211                     }
212                     if (!isLastChar(wdsz, n) &&
213                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
214                         (!hard)) {
215                         code.append('J');
216                     } else {
217                         code.append('K');
218                     }
219                     break ;
220                 case 'H':
221                     if (isLastChar(wdsz, n)) {
222                         break ; // terminal H
223                     }
224                     if ((n > 0) &&
225                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
226                         break;
227                     }
228                     if (isVowel(local,n+1)) {
229                         code.append('H'); // Hvowel
230                     }
231                     break;
232                 case 'F':
233                 case 'J' :
234                 case 'L' :
235                 case 'M':
236                 case 'N' :
237                 case 'R' :
238                     code.append(symb);
239                     break;
240                 case 'K' :
241                     if (n > 0) { // not initial
242                         if (!isPreviousChar(local, n, 'C')) {
243                             code.append(symb);
244                         }
245                     } else {
246                         code.append(symb); // initial K
247                     }
248                     break ;
249                 case 'P' :
250                     if (isNextChar(local,n,'H')) {
251                         // PH -> F
252                         code.append('F');
253                     } else {
254                         code.append(symb);
255                     }
256                     break ;
257                 case 'Q' :
258                     code.append('K');
259                     break;
260                 case 'S' :
261                     if (regionMatch(local,n,"SH") ||
262                         regionMatch(local,n,"SIO") ||
263                         regionMatch(local,n,"SIA")) {
264                         code.append('X');
265                     } else {
266                         code.append('S');
267                     }
268                     break;
269                 case 'T' :
270                     if (regionMatch(local,n,"TIA") ||
271                         regionMatch(local,n,"TIO")) {
272                         code.append('X');
273                         break;
274                     }
275                     if (regionMatch(local,n,"TCH")) {
276                         // Silent if in "TCH"
277                         break;
278                     }
279                     // substitute numeral 0 for TH (resembles theta after all)
280                     if (regionMatch(local,n,"TH")) {
281                         code.append('0');
282                     } else {
283                         code.append('T');
284                     }
285                     break ;
286                 case 'V' :
287                     code.append('F'); break ;
288                 case 'W' : case 'Y' : // silent if not followed by vowel
289                     if (!isLastChar(wdsz,n) &&
290                         isVowel(local,n+1)) {
291                         code.append(symb);
292                     }
293                     break ;
294                 case 'X' :
295                     code.append('K'); code.append('S');
296                     break ;
297                 case 'Z' :
298                     code.append('S'); break ;
299                 } // end switch
300                 n++ ;
301             } // end else from symb != 'C'
302             if (code.length() > this.getMaxCodeLen()) {
303                 code.setLength(this.getMaxCodeLen());
304             }
305         }
306         return code.toString();
307     }
308 
isVowel(StringBuffer string, int index)309     private boolean isVowel(StringBuffer string, int index) {
310         return (this.vowels.indexOf(string.charAt(index)) >= 0);
311     }
312 
isPreviousChar(StringBuffer string, int index, char c)313     private boolean isPreviousChar(StringBuffer string, int index, char c) {
314         boolean matches = false;
315         if( index > 0 &&
316             index < string.length() ) {
317             matches = string.charAt(index - 1) == c;
318         }
319         return matches;
320     }
321 
isNextChar(StringBuffer string, int index, char c)322     private boolean isNextChar(StringBuffer string, int index, char c) {
323         boolean matches = false;
324         if( index >= 0 &&
325             index < string.length() - 1 ) {
326             matches = string.charAt(index + 1) == c;
327         }
328         return matches;
329     }
330 
regionMatch(StringBuffer string, int index, String test)331     private boolean regionMatch(StringBuffer string, int index, String test) {
332         boolean matches = false;
333         if( index >= 0 &&
334             (index + test.length() - 1) < string.length() ) {
335             String substring = string.substring( index, index + test.length());
336             matches = substring.equals( test );
337         }
338         return matches;
339     }
340 
isLastChar(int wdsz, int n)341     private boolean isLastChar(int wdsz, int n) {
342         return n + 1 == wdsz;
343     }
344 
345 
346     /**
347      * Encodes an Object using the metaphone algorithm.  This method
348      * is provided in order to satisfy the requirements of the
349      * Encoder interface, and will throw an EncoderException if the
350      * supplied object is not of type java.lang.String.
351      *
352      * @param pObject Object to encode
353      * @return An object (or type java.lang.String) containing the
354      *         metaphone code which corresponds to the String supplied.
355      * @throws EncoderException if the parameter supplied is not
356      *                          of type java.lang.String
357      */
encode(Object pObject)358     public Object encode(Object pObject) throws EncoderException {
359         if (!(pObject instanceof java.lang.String)) {
360             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
361         }
362         return metaphone((String) pObject);
363     }
364 
365     /**
366      * Encodes a String using the Metaphone algorithm.
367      *
368      * @param pString String object to encode
369      * @return The metaphone code corresponding to the String supplied
370      */
encode(String pString)371     public String encode(String pString) {
372         return metaphone(pString);
373     }
374 
375     /**
376      * Tests is the metaphones of two strings are identical.
377      *
378      * @param str1 First of two strings to compare
379      * @param str2 Second of two strings to compare
380      * @return true if the metaphones of these strings are identical,
381      *         false otherwise.
382      */
isMetaphoneEqual(String str1, String str2)383     public boolean isMetaphoneEqual(String str1, String str2) {
384         return metaphone(str1).equals(metaphone(str2));
385     }
386 
387     /**
388      * Returns the maxCodeLen.
389      * @return int
390      */
getMaxCodeLen()391     public int getMaxCodeLen() { return this.maxCodeLen; }
392 
393     /**
394      * Sets the maxCodeLen.
395      * @param maxCodeLen The maxCodeLen to set
396      */
setMaxCodeLen(int maxCodeLen)397     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
398 
399 }
400