• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.commons.codec.language;
18 
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21 
22 /**
23  * Encodes a string into a metaphone value.
24  * <p>
25  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
26  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
27  * </p>
28  * <p>
29  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
30  * 39.</CITE>
31  * </p>
32  *
33  * @author Apache Software Foundation
34  * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
35  *
36  * @deprecated Please use {@link java.net.URL#openConnection} instead.
37  *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
38  *     for further details.
39  */
40 @Deprecated
41 public class Metaphone implements StringEncoder {
42 
43     /**
44      * Five values in the English language
45      */
46     private String vowels = "AEIOU" ;
47 
48     /**
49      * Variable used in Metaphone algorithm
50      */
51     private String frontv = "EIY"   ;
52 
53     /**
54      * Variable used in Metaphone algorithm
55      */
56     private String varson = "CSPTG" ;
57 
58     /**
59      * The max code length for metaphone is 4
60      */
61     private int maxCodeLen = 4 ;
62 
63     /**
64      * Creates an instance of the Metaphone encoder
65      */
Metaphone()66     public Metaphone() {
67         super();
68     }
69 
70     /**
71      * Find the metaphone value of a String. This is similar to the
72      * soundex algorithm, but better at finding similar sounding words.
73      * All input is converted to upper case.
74      * Limitations: Input format is expected to be a single ASCII word
75      * with only characters in the A - Z range, no punctuation or numbers.
76      *
77      * @param txt String to find the metaphone code for
78      * @return A metaphone code corresponding to the String supplied
79      */
metaphone(String txt)80     public String metaphone(String txt) {
81         boolean hard = false ;
82         if ((txt == null) || (txt.length() == 0)) {
83             return "" ;
84         }
85         // single character is itself
86         if (txt.length() == 1) {
87             return txt.toUpperCase() ;
88         }
89 
90         char[] inwd = txt.toUpperCase().toCharArray() ;
91 
92         StringBuffer local = new StringBuffer(40); // manipulate
93         StringBuffer code = new StringBuffer(10) ; //   output
94         // handle initial 2 characters exceptions
95         switch(inwd[0]) {
96         case 'K' :
97         case 'G' :
98         case 'P' : /* looking for KN, etc*/
99             if (inwd[1] == 'N') {
100                 local.append(inwd, 1, inwd.length - 1);
101             } else {
102                 local.append(inwd);
103             }
104             break;
105         case 'A': /* looking for AE */
106             if (inwd[1] == 'E') {
107                 local.append(inwd, 1, inwd.length - 1);
108             } else {
109                 local.append(inwd);
110             }
111             break;
112         case 'W' : /* looking for WR or WH */
113             if (inwd[1] == 'R') {   // WR -> R
114                 local.append(inwd, 1, inwd.length - 1);
115                 break ;
116             }
117             if (inwd[1] == 'H') {
118                 local.append(inwd, 1, inwd.length - 1);
119                 local.setCharAt(0, 'W'); // WH -> W
120             } else {
121                 local.append(inwd);
122             }
123             break;
124         case 'X' : /* initial X becomes S */
125             inwd[0] = 'S';
126             local.append(inwd);
127             break ;
128         default :
129             local.append(inwd);
130         } // now local has working string with initials fixed
131 
132         int wdsz = local.length();
133         int n = 0 ;
134 
135         while ((code.length() < this.getMaxCodeLen()) &&
136                (n < wdsz) ) { // max code size of 4 works well
137             char symb = local.charAt(n) ;
138             // remove duplicate letters except C
139             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
140                 n++ ;
141             } else { // not dup
142                 switch(symb) {
143                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
144                     if (n == 0) {
145                         code.append(symb);
146                     }
147                     break ; // only use vowel if leading char
148                 case 'B' :
149                     if ( isPreviousChar(local, n, 'M') &&
150                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
151                         break;
152                     }
153                     code.append(symb);
154                     break;
155                 case 'C' : // lots of C special cases
156                     /* discard if SCI, SCE or SCY */
157                     if ( isPreviousChar(local, n, 'S') &&
158                          !isLastChar(wdsz, n) &&
159                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
160                         break;
161                     }
162                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
163                         code.append('X');
164                         break;
165                     }
166                     if (!isLastChar(wdsz, n) &&
167                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
168                         code.append('S');
169                         break; // CI,CE,CY -> S
170                     }
171                     if (isPreviousChar(local, n, 'S') &&
172                         isNextChar(local, n, 'H') ) { // SCH->sk
173                         code.append('K') ;
174                         break ;
175                     }
176                     if (isNextChar(local, n, 'H')) { // detect CH
177                         if ((n == 0) &&
178                             (wdsz >= 3) &&
179                             isVowel(local,2) ) { // CH consonant -> K consonant
180                             code.append('K');
181                         } else {
182                             code.append('X'); // CHvowel -> X
183                         }
184                     } else {
185                         code.append('K');
186                     }
187                     break ;
188                 case 'D' :
189                     if (!isLastChar(wdsz, n + 1) &&
190                         isNextChar(local, n, 'G') &&
191                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
192                         code.append('J'); n += 2 ;
193                     } else {
194                         code.append('T');
195                     }
196                     break ;
197                 case 'G' : // GH silent at end or before consonant
198                     if (isLastChar(wdsz, n + 1) &&
199                         isNextChar(local, n, 'H')) {
200                         break;
201                     }
202                     if (!isLastChar(wdsz, n + 1) &&
203                         isNextChar(local,n,'H') &&
204                         !isVowel(local,n+2)) {
205                         break;
206                     }
207                     if ((n > 0) &&
208                         ( regionMatch(local, n, "GN") ||
209                           regionMatch(local, n, "GNED") ) ) {
210                         break; // silent G
211                     }
212                     if (isPreviousChar(local, n, 'G')) {
213                         hard = true ;
214                     } else {
215                         hard = false ;
216                     }
217                     if (!isLastChar(wdsz, n) &&
218                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
219                         (!hard)) {
220                         code.append('J');
221                     } else {
222                         code.append('K');
223                     }
224                     break ;
225                 case 'H':
226                     if (isLastChar(wdsz, n)) {
227                         break ; // terminal H
228                     }
229                     if ((n > 0) &&
230                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
231                         break;
232                     }
233                     if (isVowel(local,n+1)) {
234                         code.append('H'); // Hvowel
235                     }
236                     break;
237                 case 'F':
238                 case 'J' :
239                 case 'L' :
240                 case 'M':
241                 case 'N' :
242                 case 'R' :
243                     code.append(symb);
244                     break;
245                 case 'K' :
246                     if (n > 0) { // not initial
247                         if (!isPreviousChar(local, n, 'C')) {
248                             code.append(symb);
249                         }
250                     } else {
251                         code.append(symb); // initial K
252                     }
253                     break ;
254                 case 'P' :
255                     if (isNextChar(local,n,'H')) {
256                         // PH -> F
257                         code.append('F');
258                     } else {
259                         code.append(symb);
260                     }
261                     break ;
262                 case 'Q' :
263                     code.append('K');
264                     break;
265                 case 'S' :
266                     if (regionMatch(local,n,"SH") ||
267                         regionMatch(local,n,"SIO") ||
268                         regionMatch(local,n,"SIA")) {
269                         code.append('X');
270                     } else {
271                         code.append('S');
272                     }
273                     break;
274                 case 'T' :
275                     if (regionMatch(local,n,"TIA") ||
276                         regionMatch(local,n,"TIO")) {
277                         code.append('X');
278                         break;
279                     }
280                     if (regionMatch(local,n,"TCH")) {
281                         // Silent if in "TCH"
282                         break;
283                     }
284                     // substitute numeral 0 for TH (resembles theta after all)
285                     if (regionMatch(local,n,"TH")) {
286                         code.append('0');
287                     } else {
288                         code.append('T');
289                     }
290                     break ;
291                 case 'V' :
292                     code.append('F'); break ;
293                 case 'W' : case 'Y' : // silent if not followed by vowel
294                     if (!isLastChar(wdsz,n) &&
295                         isVowel(local,n+1)) {
296                         code.append(symb);
297                     }
298                     break ;
299                 case 'X' :
300                     code.append('K'); code.append('S');
301                     break ;
302                 case 'Z' :
303                     code.append('S'); break ;
304                 } // end switch
305                 n++ ;
306             } // end else from symb != 'C'
307             if (code.length() > this.getMaxCodeLen()) {
308                 code.setLength(this.getMaxCodeLen());
309             }
310         }
311         return code.toString();
312     }
313 
isVowel(StringBuffer string, int index)314     private boolean isVowel(StringBuffer string, int index) {
315         return (this.vowels.indexOf(string.charAt(index)) >= 0);
316     }
317 
isPreviousChar(StringBuffer string, int index, char c)318     private boolean isPreviousChar(StringBuffer string, int index, char c) {
319         boolean matches = false;
320         if( index > 0 &&
321             index < string.length() ) {
322             matches = string.charAt(index - 1) == c;
323         }
324         return matches;
325     }
326 
isNextChar(StringBuffer string, int index, char c)327     private boolean isNextChar(StringBuffer string, int index, char c) {
328         boolean matches = false;
329         if( index >= 0 &&
330             index < string.length() - 1 ) {
331             matches = string.charAt(index + 1) == c;
332         }
333         return matches;
334     }
335 
regionMatch(StringBuffer string, int index, String test)336     private boolean regionMatch(StringBuffer string, int index, String test) {
337         boolean matches = false;
338         if( index >= 0 &&
339             (index + test.length() - 1) < string.length() ) {
340             String substring = string.substring( index, index + test.length());
341             matches = substring.equals( test );
342         }
343         return matches;
344     }
345 
isLastChar(int wdsz, int n)346     private boolean isLastChar(int wdsz, int n) {
347         return n + 1 == wdsz;
348     }
349 
350 
351     /**
352      * Encodes an Object using the metaphone algorithm.  This method
353      * is provided in order to satisfy the requirements of the
354      * Encoder interface, and will throw an EncoderException if the
355      * supplied object is not of type java.lang.String.
356      *
357      * @param pObject Object to encode
358      * @return An object (or type java.lang.String) containing the
359      *         metaphone code which corresponds to the String supplied.
360      * @throws EncoderException if the parameter supplied is not
361      *                          of type java.lang.String
362      */
encode(Object pObject)363     public Object encode(Object pObject) throws EncoderException {
364         if (!(pObject instanceof java.lang.String)) {
365             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
366         }
367         return metaphone((String) pObject);
368     }
369 
370     /**
371      * Encodes a String using the Metaphone algorithm.
372      *
373      * @param pString String object to encode
374      * @return The metaphone code corresponding to the String supplied
375      */
encode(String pString)376     public String encode(String pString) {
377         return metaphone(pString);
378     }
379 
380     /**
381      * Tests is the metaphones of two strings are identical.
382      *
383      * @param str1 First of two strings to compare
384      * @param str2 Second of two strings to compare
385      * @return true if the metaphones of these strings are identical,
386      *         false otherwise.
387      */
isMetaphoneEqual(String str1, String str2)388     public boolean isMetaphoneEqual(String str1, String str2) {
389         return metaphone(str1).equals(metaphone(str2));
390     }
391 
392     /**
393      * Returns the maxCodeLen.
394      * @return int
395      */
getMaxCodeLen()396     public int getMaxCodeLen() { return this.maxCodeLen; }
397 
398     /**
399      * Sets the maxCodeLen.
400      * @param maxCodeLen The maxCodeLen to set
401      */
setMaxCodeLen(int maxCodeLen)402     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
403 
404 }
405