1 /* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.codec.language; 18 19 import org.apache.commons.codec.EncoderException; 20 import org.apache.commons.codec.StringEncoder; 21 22 /** 23 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes. 24 * 25 * @author Apache Software Foundation 26 * @version $Id: SoundexUtils.java,v 1.5 2004/03/17 18:31:35 ggregory Exp $ 27 * @since 1.3 28 */ 29 final class SoundexUtils { 30 31 /** 32 * Cleans up the input string before Soundex processing by only returning 33 * upper case letters. 34 * 35 * @param str 36 * The String to clean. 37 * @return A clean String. 38 */ clean(String str)39 static String clean(String str) { 40 if (str == null || str.length() == 0) { 41 return str; 42 } 43 int len = str.length(); 44 char[] chars = new char[len]; 45 int count = 0; 46 for (int i = 0; i < len; i++) { 47 if (Character.isLetter(str.charAt(i))) { 48 chars[count++] = str.charAt(i); 49 } 50 } 51 if (count == len) { 52 return str.toUpperCase(); 53 } 54 return new String(chars, 0, count).toUpperCase(); 55 } 56 57 /** 58 * Encodes the Strings and returns the number of characters in the two 59 * encoded Strings that are the same. 60 * <ul> 61 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 62 * little or no similarity, and 4 indicates strong similarity or identical 63 * values.</li> 64 * <li>For refined Soundex, the return value can be greater than 4.</li> 65 * </ul> 66 * 67 * @param encoder 68 * The encoder to use to encode the Strings. 69 * @param s1 70 * A String that will be encoded and compared. 71 * @param s2 72 * A String that will be encoded and compared. 73 * @return The number of characters in the two Soundex encoded Strings that 74 * are the same. 75 * 76 * @see #differenceEncoded(String,String) 77 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 78 * MS T-SQL DIFFERENCE</a> 79 * 80 * @throws EncoderException 81 * if an error occurs encoding one of the strings 82 */ difference(StringEncoder encoder, String s1, String s2)83 static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException { 84 return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); 85 } 86 87 /** 88 * Returns the number of characters in the two Soundex encoded Strings that 89 * are the same. 90 * <ul> 91 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 92 * little or no similarity, and 4 indicates strong similarity or identical 93 * values.</li> 94 * <li>For refined Soundex, the return value can be greater than 4.</li> 95 * </ul> 96 * 97 * @param es1 98 * An encoded String. 99 * @param es2 100 * An encoded String. 101 * @return The number of characters in the two Soundex encoded Strings that 102 * are the same. 103 * 104 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 105 * MS T-SQL DIFFERENCE</a> 106 */ differenceEncoded(String es1, String es2)107 static int differenceEncoded(String es1, String es2) { 108 109 if (es1 == null || es2 == null) { 110 return 0; 111 } 112 int lengthToMatch = Math.min(es1.length(), es2.length()); 113 int diff = 0; 114 for (int i = 0; i < lengthToMatch; i++) { 115 if (es1.charAt(i) == es2.charAt(i)) { 116 diff++; 117 } 118 } 119 return diff; 120 } 121 122 } 123