• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2001-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.apache.commons.codec.language;
18 
19 import org.apache.commons.codec.EncoderException;
20 import org.apache.commons.codec.StringEncoder;
21 
22 /**
23  * Encodes a string into a Refined Soundex value. A refined soundex code is
24  * optimized for spell checking words. Soundex method originally developed by
25  * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
26  *
27  * @author Apache Software Foundation
28  * @version $Id: RefinedSoundex.java,v 1.21 2004/06/05 18:32:04 ggregory Exp $
29  */
30 public class RefinedSoundex implements StringEncoder {
31 
32     /**
33      * This static variable contains an instance of the RefinedSoundex using
34      * the US_ENGLISH mapping.
35      */
36     public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
37 
38     /**
39      * RefinedSoundex is *refined* for a number of reasons one being that the
40      * mappings have been altered. This implementation contains default
41      * mappings for US English.
42      */
43     public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
44 
45     /**
46      * Every letter of the alphabet is "mapped" to a numerical value. This char
47      * array holds the values to which each letter is mapped. This
48      * implementation contains a default map for US_ENGLISH
49      */
50     private char[] soundexMapping;
51 
52     /**
53      * Creates an instance of the RefinedSoundex object using the default US
54      * English mapping.
55      */
RefinedSoundex()56     public RefinedSoundex() {
57         this(US_ENGLISH_MAPPING);
58     }
59 
60     /**
61      * Creates a refined soundex instance using a custom mapping. This
62      * constructor can be used to customize the mapping, and/or possibly
63      * provide an internationalized mapping for a non-Western character set.
64      *
65      * @param mapping
66      *                  Mapping array to use when finding the corresponding code for
67      *                  a given character
68      */
RefinedSoundex(char[] mapping)69     public RefinedSoundex(char[] mapping) {
70         this.soundexMapping = mapping;
71     }
72 
73     // BEGIN android-note
74     // Removed @see reference to SoundexUtils below, since the class isn't
75     // public.
76     // END android-note
77     /**
78      * Returns the number of characters in the two encoded Strings that are the
79      * same. This return value ranges from 0 to the length of the shortest
80      * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
81      * example) indicates strong similarity or identical values. For refined
82      * Soundex, the return value can be greater than 4.
83      *
84      * @param s1
85      *                  A String that will be encoded and compared.
86      * @param s2
87      *                  A String that will be encoded and compared.
88      * @return The number of characters in the two encoded Strings that are the
89      *             same from 0 to to the length of the shortest encoded String.
90      *
91      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
92      *          MS T-SQL DIFFERENCE</a>
93      *
94      * @throws EncoderException
95      *                  if an error occurs encoding one of the strings
96      * @since 1.3
97      */
difference(String s1, String s2)98     public int difference(String s1, String s2) throws EncoderException {
99         return SoundexUtils.difference(this, s1, s2);
100     }
101 
102     /**
103      * Encodes an Object using the refined soundex algorithm. This method is
104      * provided in order to satisfy the requirements of the Encoder interface,
105      * and will throw an EncoderException if the supplied object is not of type
106      * java.lang.String.
107      *
108      * @param pObject
109      *                  Object to encode
110      * @return An object (or type java.lang.String) containing the refined
111      *             soundex code which corresponds to the String supplied.
112      * @throws EncoderException
113      *                  if the parameter supplied is not of type java.lang.String
114      */
encode(Object pObject)115     public Object encode(Object pObject) throws EncoderException {
116         if (!(pObject instanceof java.lang.String)) {
117             throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
118         }
119         return soundex((String) pObject);
120     }
121 
122     /**
123      * Encodes a String using the refined soundex algorithm.
124      *
125      * @param pString
126      *                  A String object to encode
127      * @return A Soundex code corresponding to the String supplied
128      */
encode(String pString)129     public String encode(String pString) {
130         return soundex(pString);
131     }
132 
133     /**
134      * Returns the mapping code for a given character. The mapping codes are
135      * maintained in an internal char array named soundexMapping, and the
136      * default values of these mappings are US English.
137      *
138      * @param c
139      *                  char to get mapping for
140      * @return A character (really a numeral) to return for the given char
141      */
getMappingCode(char c)142     char getMappingCode(char c) {
143         if (!Character.isLetter(c)) {
144             return 0;
145         }
146         return this.soundexMapping[Character.toUpperCase(c) - 'A'];
147     }
148 
149     /**
150      * Retreives the Refined Soundex code for a given String object.
151      *
152      * @param str
153      *                  String to encode using the Refined Soundex algorithm
154      * @return A soundex code for the String supplied
155      */
soundex(String str)156     public String soundex(String str) {
157         if (str == null) {
158             return null;
159         }
160         str = SoundexUtils.clean(str);
161         if (str.length() == 0) {
162             return str;
163         }
164 
165         StringBuffer sBuf = new StringBuffer();
166         sBuf.append(str.charAt(0));
167 
168         char last, current;
169         last = '*';
170 
171         for (int i = 0; i < str.length(); i++) {
172 
173             current = getMappingCode(str.charAt(i));
174             if (current == last) {
175                 continue;
176             } else if (current != 0) {
177                 sBuf.append(current);
178             }
179 
180             last = current;
181 
182         }
183 
184         return sBuf.toString();
185     }
186 }
187