1 /* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.codec.language; 18 19 import org.apache.commons.codec.EncoderException; 20 import org.apache.commons.codec.StringEncoder; 21 22 /** 23 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a 24 * general purpose scheme to find word with similar phonemes. 25 * 26 * @author Apache Software Foundation 27 * @version $Id: Soundex.java,v 1.26 2004/07/07 23:15:24 ggregory Exp $ 28 * 29 * @deprecated Please use {@link java.net.URL#openConnection} instead. 30 * Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a> 31 * for further details. 32 */ 33 @Deprecated 34 public class Soundex implements StringEncoder { 35 36 /** 37 * An instance of Soundex using the US_ENGLISH_MAPPING mapping. 38 * 39 * @see #US_ENGLISH_MAPPING 40 */ 41 public static final Soundex US_ENGLISH = new Soundex(); 42 43 /** 44 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 45 * means do not encode. 46 * <p> 47 * (This constant is provided as both an implementation convenience and to allow Javadoc to pick 48 * up the value for the constant values page.) 49 * </p> 50 * 51 * @see #US_ENGLISH_MAPPING 52 */ 53 public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; 54 55 /** 56 * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position 57 * means do not encode. 58 * 59 * @see Soundex#Soundex(char[]) 60 */ 61 public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); 62 63 // BEGIN android-note 64 // Removed @see reference to SoundexUtils below, since the class isn't 65 // public. 66 // END android-note 67 /** 68 * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This 69 * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or 70 * identical values. 71 * 72 * @param s1 73 * A String that will be encoded and compared. 74 * @param s2 75 * A String that will be encoded and compared. 76 * @return The number of characters in the two encoded Strings that are the same from 0 to 4. 77 * 78 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS 79 * T-SQL DIFFERENCE </a> 80 * 81 * @throws EncoderException 82 * if an error occurs encoding one of the strings 83 * @since 1.3 84 */ difference(String s1, String s2)85 public int difference(String s1, String s2) throws EncoderException { 86 return SoundexUtils.difference(this, s1, s2); 87 } 88 89 /** 90 * The maximum length of a Soundex code - Soundex codes are only four characters by definition. 91 * 92 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 93 */ 94 private int maxLength = 4; 95 96 /** 97 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 98 * letter is mapped. This implementation contains a default map for US_ENGLISH 99 */ 100 private char[] soundexMapping; 101 102 /** 103 * Creates an instance using US_ENGLISH_MAPPING 104 * 105 * @see Soundex#Soundex(char[]) 106 * @see Soundex#US_ENGLISH_MAPPING 107 */ Soundex()108 public Soundex() { 109 this(US_ENGLISH_MAPPING); 110 } 111 112 /** 113 * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized 114 * mapping for a non-Western character set. 115 * 116 * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each 117 * letter is mapped. This implementation contains a default map for US_ENGLISH 118 * 119 * @param mapping 120 * Mapping array to use when finding the corresponding code for a given character 121 */ Soundex(char[] mapping)122 public Soundex(char[] mapping) { 123 this.setSoundexMapping(mapping); 124 } 125 126 /** 127 * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of 128 * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. 129 * 130 * @param pObject 131 * Object to encode 132 * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String 133 * supplied. 134 * @throws EncoderException 135 * if the parameter supplied is not of type java.lang.String 136 * @throws IllegalArgumentException 137 * if a character is not mapped 138 */ encode(Object pObject)139 public Object encode(Object pObject) throws EncoderException { 140 if (!(pObject instanceof String)) { 141 throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); 142 } 143 return soundex((String) pObject); 144 } 145 146 /** 147 * Encodes a String using the soundex algorithm. 148 * 149 * @param pString 150 * A String object to encode 151 * @return A Soundex code corresponding to the String supplied 152 * @throws IllegalArgumentException 153 * if a character is not mapped 154 */ encode(String pString)155 public String encode(String pString) { 156 return soundex(pString); 157 } 158 159 /** 160 * Used internally by the SoundEx algorithm. 161 * 162 * Consonants from the same code group separated by W or H are treated as one. 163 * 164 * @param str 165 * the cleaned working string to encode (in upper case). 166 * @param index 167 * the character position to encode 168 * @return Mapping code for a particular character 169 * @throws IllegalArgumentException 170 * if the character is not mapped 171 */ getMappingCode(String str, int index)172 private char getMappingCode(String str, int index) { 173 char mappedChar = this.map(str.charAt(index)); 174 // HW rule check 175 if (index > 1 && mappedChar != '0') { 176 char hwChar = str.charAt(index - 1); 177 if ('H' == hwChar || 'W' == hwChar) { 178 char preHWChar = str.charAt(index - 2); 179 char firstCode = this.map(preHWChar); 180 if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { 181 return 0; 182 } 183 } 184 } 185 return mappedChar; 186 } 187 188 /** 189 * Returns the maxLength. Standard Soundex 190 * 191 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 192 * @return int 193 */ getMaxLength()194 public int getMaxLength() { 195 return this.maxLength; 196 } 197 198 /** 199 * Returns the soundex mapping. 200 * 201 * @return soundexMapping. 202 */ getSoundexMapping()203 private char[] getSoundexMapping() { 204 return this.soundexMapping; 205 } 206 207 /** 208 * Maps the given upper-case character to it's Soudex code. 209 * 210 * @param ch 211 * An upper-case character. 212 * @return A Soundex code. 213 * @throws IllegalArgumentException 214 * Thrown if <code>ch</code> is not mapped. 215 */ map(char ch)216 private char map(char ch) { 217 int index = ch - 'A'; 218 if (index < 0 || index >= this.getSoundexMapping().length) { 219 throw new IllegalArgumentException("The character is not mapped: " + ch); 220 } 221 return this.getSoundexMapping()[index]; 222 } 223 224 /** 225 * Sets the maxLength. 226 * 227 * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. 228 * @param maxLength 229 * The maxLength to set 230 */ setMaxLength(int maxLength)231 public void setMaxLength(int maxLength) { 232 this.maxLength = maxLength; 233 } 234 235 /** 236 * Sets the soundexMapping. 237 * 238 * @param soundexMapping 239 * The soundexMapping to set. 240 */ setSoundexMapping(char[] soundexMapping)241 private void setSoundexMapping(char[] soundexMapping) { 242 this.soundexMapping = soundexMapping; 243 } 244 245 /** 246 * Retreives the Soundex code for a given String object. 247 * 248 * @param str 249 * String to encode using the Soundex algorithm 250 * @return A soundex code for the String supplied 251 * @throws IllegalArgumentException 252 * if a character is not mapped 253 */ soundex(String str)254 public String soundex(String str) { 255 if (str == null) { 256 return null; 257 } 258 str = SoundexUtils.clean(str); 259 if (str.length() == 0) { 260 return str; 261 } 262 char out[] = {'0', '0', '0', '0'}; 263 char last, mapped; 264 int incount = 1, count = 1; 265 out[0] = str.charAt(0); 266 last = getMappingCode(str, 0); 267 while ((incount < str.length()) && (count < out.length)) { 268 mapped = getMappingCode(str, incount++); 269 if (mapped != 0) { 270 if ((mapped != '0') && (mapped != last)) { 271 out[count++] = mapped; 272 } 273 last = mapped; 274 } 275 } 276 return new String(out); 277 } 278 279 } 280