1 /* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.codec.language; 18 19 import org.apache.commons.codec.EncoderException; 20 import org.apache.commons.codec.StringEncoder; 21 22 /** 23 * Encodes a string into a double metaphone value. 24 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>. 25 * <ul> 26 * <li>Original Article: <a 27 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/"> 28 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li> 29 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip"> 30 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li> 31 * </ul> 32 * 33 * @author Apache Software Foundation 34 * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $ 35 * 36 * @deprecated Please use {@link java.net.URL#openConnection} instead. 37 * Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a> 38 * for further details. 39 */ 40 @Deprecated 41 public class DoubleMetaphone implements StringEncoder { 42 43 /** 44 * "Vowels" to test for 45 */ 46 private static final String VOWELS = "AEIOUY"; 47 48 /** 49 * Prefixes when present which are not pronounced 50 */ 51 private static final String[] SILENT_START = 52 { "GN", "KN", "PN", "WR", "PS" }; 53 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 54 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 55 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 56 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 57 private static final String[] L_T_K_S_N_M_B_Z = 58 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 59 60 /** 61 * Maximum length of an encoding, default is 4 62 */ 63 protected int maxCodeLen = 4; 64 65 /** 66 * Creates an instance of this DoubleMetaphone encoder 67 */ DoubleMetaphone()68 public DoubleMetaphone() { 69 super(); 70 } 71 72 /** 73 * Encode a value with Double Metaphone 74 * 75 * @param value String to encode 76 * @return an encoded string 77 */ doubleMetaphone(String value)78 public String doubleMetaphone(String value) { 79 return doubleMetaphone(value, false); 80 } 81 82 /** 83 * Encode a value with Double Metaphone, optionally using the alternate 84 * encoding. 85 * 86 * @param value String to encode 87 * @param alternate use alternate encode 88 * @return an encoded string 89 */ doubleMetaphone(String value, boolean alternate)90 public String doubleMetaphone(String value, boolean alternate) { 91 value = cleanInput(value); 92 if (value == null) { 93 return null; 94 } 95 96 boolean slavoGermanic = isSlavoGermanic(value); 97 int index = isSilentStart(value) ? 1 : 0; 98 99 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 100 101 while (!result.isComplete() && index <= value.length() - 1) { 102 switch (value.charAt(index)) { 103 case 'A': 104 case 'E': 105 case 'I': 106 case 'O': 107 case 'U': 108 case 'Y': 109 index = handleAEIOUY(value, result, index); 110 break; 111 case 'B': 112 result.append('P'); 113 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 114 break; 115 case '\u00C7': 116 // A C with a Cedilla 117 result.append('S'); 118 index++; 119 break; 120 case 'C': 121 index = handleC(value, result, index); 122 break; 123 case 'D': 124 index = handleD(value, result, index); 125 break; 126 case 'F': 127 result.append('F'); 128 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 129 break; 130 case 'G': 131 index = handleG(value, result, index, slavoGermanic); 132 break; 133 case 'H': 134 index = handleH(value, result, index); 135 break; 136 case 'J': 137 index = handleJ(value, result, index, slavoGermanic); 138 break; 139 case 'K': 140 result.append('K'); 141 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 142 break; 143 case 'L': 144 index = handleL(value, result, index); 145 break; 146 case 'M': 147 result.append('M'); 148 index = conditionM0(value, index) ? index + 2 : index + 1; 149 break; 150 case 'N': 151 result.append('N'); 152 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 153 break; 154 case '\u00D1': 155 // N with a tilde (spanish ene) 156 result.append('N'); 157 index++; 158 break; 159 case 'P': 160 index = handleP(value, result, index); 161 break; 162 case 'Q': 163 result.append('K'); 164 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 165 break; 166 case 'R': 167 index = handleR(value, result, index, slavoGermanic); 168 break; 169 case 'S': 170 index = handleS(value, result, index, slavoGermanic); 171 break; 172 case 'T': 173 index = handleT(value, result, index); 174 break; 175 case 'V': 176 result.append('F'); 177 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 178 break; 179 case 'W': 180 index = handleW(value, result, index); 181 break; 182 case 'X': 183 index = handleX(value, result, index); 184 break; 185 case 'Z': 186 index = handleZ(value, result, index, slavoGermanic); 187 break; 188 default: 189 index++; 190 break; 191 } 192 } 193 194 return alternate ? result.getAlternate() : result.getPrimary(); 195 } 196 197 /** 198 * Encode the value using DoubleMetaphone. It will only work if 199 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 200 * 201 * @param obj Object to encode (should be of type String) 202 * @return An encoded Object (will be of type String) 203 * @throws EncoderException encode parameter is not of type String 204 */ encode(Object obj)205 public Object encode(Object obj) throws EncoderException { 206 if (!(obj instanceof String)) { 207 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 208 } 209 return doubleMetaphone((String) obj); 210 } 211 212 /** 213 * Encode the value using DoubleMetaphone. 214 * 215 * @param value String to encode 216 * @return An encoded String 217 */ encode(String value)218 public String encode(String value) { 219 return doubleMetaphone(value); 220 } 221 222 /** 223 * Check if the Double Metaphone values of two <code>String</code> values 224 * are equal. 225 * 226 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 227 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 228 * @return <code>true</code> if the encoded <code>String</code>s are equal; 229 * <code>false</code> otherwise. 230 * @see #isDoubleMetaphoneEqual(String,String,boolean) 231 */ isDoubleMetaphoneEqual(String value1, String value2)232 public boolean isDoubleMetaphoneEqual(String value1, String value2) { 233 return isDoubleMetaphoneEqual(value1, value2, false); 234 } 235 236 /** 237 * Check if the Double Metaphone values of two <code>String</code> values 238 * are equal, optionally using the alternate value. 239 * 240 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 241 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 242 * @param alternate use the alternate value if <code>true</code>. 243 * @return <code>true</code> if the encoded <code>String</code>s are equal; 244 * <code>false</code> otherwise. 245 */ isDoubleMetaphoneEqual(String value1, String value2, boolean alternate)246 public boolean isDoubleMetaphoneEqual(String value1, 247 String value2, 248 boolean alternate) { 249 return doubleMetaphone(value1, alternate).equals(doubleMetaphone 250 (value2, alternate)); 251 } 252 253 /** 254 * Returns the maxCodeLen. 255 * @return int 256 */ getMaxCodeLen()257 public int getMaxCodeLen() { 258 return this.maxCodeLen; 259 } 260 261 /** 262 * Sets the maxCodeLen. 263 * @param maxCodeLen The maxCodeLen to set 264 */ setMaxCodeLen(int maxCodeLen)265 public void setMaxCodeLen(int maxCodeLen) { 266 this.maxCodeLen = maxCodeLen; 267 } 268 269 //-- BEGIN HANDLERS --// 270 271 /** 272 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases 273 */ handleAEIOUY(String value, DoubleMetaphoneResult result, int index)274 private int handleAEIOUY(String value, DoubleMetaphoneResult result, int 275 index) { 276 if (index == 0) { 277 result.append('A'); 278 } 279 return index + 1; 280 } 281 282 /** 283 * Handles 'C' cases 284 */ handleC(String value, DoubleMetaphoneResult result, int index)285 private int handleC(String value, 286 DoubleMetaphoneResult result, 287 int index) { 288 if (conditionC0(value, index)) { // very confusing, moved out 289 result.append('K'); 290 index += 2; 291 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 292 result.append('S'); 293 index += 2; 294 } else if (contains(value, index, 2, "CH")) { 295 index = handleCH(value, result, index); 296 } else if (contains(value, index, 2, "CZ") && 297 !contains(value, index - 2, 4, "WICZ")) { 298 //-- "Czerny" --// 299 result.append('S', 'X'); 300 index += 2; 301 } else if (contains(value, index + 1, 3, "CIA")) { 302 //-- "focaccia" --// 303 result.append('X'); 304 index += 3; 305 } else if (contains(value, index, 2, "CC") && 306 !(index == 1 && charAt(value, 0) == 'M')) { 307 //-- double "cc" but not "McClelland" --// 308 return handleCC(value, result, index); 309 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 310 result.append('K'); 311 index += 2; 312 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 313 //-- Italian vs. English --// 314 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 315 result.append('S', 'X'); 316 } else { 317 result.append('S'); 318 } 319 index += 2; 320 } else { 321 result.append('K'); 322 if (contains(value, index + 1, 2, " C", " Q", " G")) { 323 //-- Mac Caffrey, Mac Gregor --// 324 index += 3; 325 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 326 !contains(value, index + 1, 2, "CE", "CI")) { 327 index += 2; 328 } else { 329 index++; 330 } 331 } 332 333 return index; 334 } 335 336 /** 337 * Handles 'CC' cases 338 */ handleCC(String value, DoubleMetaphoneResult result, int index)339 private int handleCC(String value, 340 DoubleMetaphoneResult result, 341 int index) { 342 if (contains(value, index + 2, 1, "I", "E", "H") && 343 !contains(value, index + 2, 2, "HU")) { 344 //-- "bellocchio" but not "bacchus" --// 345 if ((index == 1 && charAt(value, index - 1) == 'A') || 346 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 347 //-- "accident", "accede", "succeed" --// 348 result.append("KS"); 349 } else { 350 //-- "bacci", "bertucci", other Italian --// 351 result.append('X'); 352 } 353 index += 3; 354 } else { // Pierce's rule 355 result.append('K'); 356 index += 2; 357 } 358 359 return index; 360 } 361 362 /** 363 * Handles 'CH' cases 364 */ handleCH(String value, DoubleMetaphoneResult result, int index)365 private int handleCH(String value, 366 DoubleMetaphoneResult result, 367 int index) { 368 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 369 result.append('K', 'X'); 370 return index + 2; 371 } else if (conditionCH0(value, index)) { 372 //-- Greek roots ("chemistry", "chorus", etc.) --// 373 result.append('K'); 374 return index + 2; 375 } else if (conditionCH1(value, index)) { 376 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 377 result.append('K'); 378 return index + 2; 379 } else { 380 if (index > 0) { 381 if (contains(value, 0, 2, "MC")) { 382 result.append('K'); 383 } else { 384 result.append('X', 'K'); 385 } 386 } else { 387 result.append('X'); 388 } 389 return index + 2; 390 } 391 } 392 393 /** 394 * Handles 'D' cases 395 */ handleD(String value, DoubleMetaphoneResult result, int index)396 private int handleD(String value, 397 DoubleMetaphoneResult result, 398 int index) { 399 if (contains(value, index, 2, "DG")) { 400 //-- "Edge" --// 401 if (contains(value, index + 2, 1, "I", "E", "Y")) { 402 result.append('J'); 403 index += 3; 404 //-- "Edgar" --// 405 } else { 406 result.append("TK"); 407 index += 2; 408 } 409 } else if (contains(value, index, 2, "DT", "DD")) { 410 result.append('T'); 411 index += 2; 412 } else { 413 result.append('T'); 414 index++; 415 } 416 return index; 417 } 418 419 /** 420 * Handles 'G' cases 421 */ handleG(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)422 private int handleG(String value, 423 DoubleMetaphoneResult result, 424 int index, 425 boolean slavoGermanic) { 426 if (charAt(value, index + 1) == 'H') { 427 index = handleGH(value, result, index); 428 } else if (charAt(value, index + 1) == 'N') { 429 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 430 result.append("KN", "N"); 431 } else if (!contains(value, index + 2, 2, "EY") && 432 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 433 result.append("N", "KN"); 434 } else { 435 result.append("KN"); 436 } 437 index = index + 2; 438 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 439 result.append("KL", "L"); 440 index += 2; 441 } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 442 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 443 result.append('K', 'J'); 444 index += 2; 445 } else if ((contains(value, index + 1, 2, "ER") || 446 charAt(value, index + 1) == 'Y') && 447 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 448 !contains(value, index - 1, 1, "E", "I") && 449 !contains(value, index - 1, 3, "RGY", "OGY")) { 450 //-- -ger-, -gy- --// 451 result.append('K', 'J'); 452 index += 2; 453 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 454 contains(value, index - 1, 4, "AGGI", "OGGI")) { 455 //-- Italian "biaggi" --// 456 if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { 457 //-- obvious germanic --// 458 result.append('K'); 459 } else if (contains(value, index + 1, 4, "IER")) { 460 result.append('J'); 461 } else { 462 result.append('J', 'K'); 463 } 464 index += 2; 465 } else if (charAt(value, index + 1) == 'G') { 466 index += 2; 467 result.append('K'); 468 } else { 469 index++; 470 result.append('K'); 471 } 472 return index; 473 } 474 475 /** 476 * Handles 'GH' cases 477 */ handleGH(String value, DoubleMetaphoneResult result, int index)478 private int handleGH(String value, 479 DoubleMetaphoneResult result, 480 int index) { 481 if (index > 0 && !isVowel(charAt(value, index - 1))) { 482 result.append('K'); 483 index += 2; 484 } else if (index == 0) { 485 if (charAt(value, index + 2) == 'I') { 486 result.append('J'); 487 } else { 488 result.append('K'); 489 } 490 index += 2; 491 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 492 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 493 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 494 //-- Parker's rule (with some further refinements) - "hugh" 495 index += 2; 496 } else { 497 if (index > 2 && charAt(value, index - 1) == 'U' && 498 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 499 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 500 result.append('F'); 501 } else if (index > 0 && charAt(value, index - 1) != 'I') { 502 result.append('K'); 503 } 504 index += 2; 505 } 506 return index; 507 } 508 509 /** 510 * Handles 'H' cases 511 */ handleH(String value, DoubleMetaphoneResult result, int index)512 private int handleH(String value, 513 DoubleMetaphoneResult result, 514 int index) { 515 //-- only keep if first & before vowel or between 2 vowels --// 516 if ((index == 0 || isVowel(charAt(value, index - 1))) && 517 isVowel(charAt(value, index + 1))) { 518 result.append('H'); 519 index += 2; 520 //-- also takes car of "HH" --// 521 } else { 522 index++; 523 } 524 return index; 525 } 526 527 /** 528 * Handles 'J' cases 529 */ handleJ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)530 private int handleJ(String value, DoubleMetaphoneResult result, int index, 531 boolean slavoGermanic) { 532 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 533 //-- obvious Spanish, "Jose", "San Jacinto" --// 534 if ((index == 0 && (charAt(value, index + 4) == ' ') || 535 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 536 result.append('H'); 537 } else { 538 result.append('J', 'H'); 539 } 540 index++; 541 } else { 542 if (index == 0 && !contains(value, index, 4, "JOSE")) { 543 result.append('J', 'A'); 544 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 545 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 546 result.append('J', 'H'); 547 } else if (index == value.length() - 1) { 548 result.append('J', ' '); 549 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { 550 result.append('J'); 551 } 552 553 if (charAt(value, index + 1) == 'J') { 554 index += 2; 555 } else { 556 index++; 557 } 558 } 559 return index; 560 } 561 562 /** 563 * Handles 'L' cases 564 */ handleL(String value, DoubleMetaphoneResult result, int index)565 private int handleL(String value, 566 DoubleMetaphoneResult result, 567 int index) { 568 result.append('L'); 569 if (charAt(value, index + 1) == 'L') { 570 if (conditionL0(value, index)) { 571 result.appendAlternate(' '); 572 } 573 index += 2; 574 } else { 575 index++; 576 } 577 return index; 578 } 579 580 /** 581 * Handles 'P' cases 582 */ handleP(String value, DoubleMetaphoneResult result, int index)583 private int handleP(String value, 584 DoubleMetaphoneResult result, 585 int index) { 586 if (charAt(value, index + 1) == 'H') { 587 result.append('F'); 588 index += 2; 589 } else { 590 result.append('P'); 591 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 592 } 593 return index; 594 } 595 596 /** 597 * Handles 'R' cases 598 */ handleR(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)599 private int handleR(String value, 600 DoubleMetaphoneResult result, 601 int index, 602 boolean slavoGermanic) { 603 if (index == value.length() - 1 && !slavoGermanic && 604 contains(value, index - 2, 2, "IE") && 605 !contains(value, index - 4, 2, "ME", "MA")) { 606 result.appendAlternate('R'); 607 } else { 608 result.append('R'); 609 } 610 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 611 } 612 613 /** 614 * Handles 'S' cases 615 */ handleS(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)616 private int handleS(String value, 617 DoubleMetaphoneResult result, 618 int index, 619 boolean slavoGermanic) { 620 if (contains(value, index - 1, 3, "ISL", "YSL")) { 621 //-- special cases "island", "isle", "carlisle", "carlysle" --// 622 index++; 623 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 624 //-- special case "sugar-" --// 625 result.append('X', 'S'); 626 index++; 627 } else if (contains(value, index, 2, "SH")) { 628 if (contains(value, index + 1, 4, 629 "HEIM", "HOEK", "HOLM", "HOLZ")) { 630 //-- germanic --// 631 result.append('S'); 632 } else { 633 result.append('X'); 634 } 635 index += 2; 636 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 637 //-- Italian and Armenian --// 638 if (slavoGermanic) { 639 result.append('S'); 640 } else { 641 result.append('S', 'X'); 642 } 643 index += 3; 644 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) { 645 //-- german & anglicisations, e.g. "smith" match "schmidt" // 646 // "snider" match "schneider" --// 647 //-- also, -sz- in slavic language altho in hungarian it // 648 // is pronounced "s" --// 649 result.append('S', 'X'); 650 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 651 } else if (contains(value, index, 2, "SC")) { 652 index = handleSC(value, result, index); 653 } else { 654 if (index == value.length() - 1 && contains(value, index - 2, 655 2, "AI", "OI")){ 656 //-- french e.g. "resnais", "artois" --// 657 result.appendAlternate('S'); 658 } else { 659 result.append('S'); 660 } 661 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 662 } 663 return index; 664 } 665 666 /** 667 * Handles 'SC' cases 668 */ handleSC(String value, DoubleMetaphoneResult result, int index)669 private int handleSC(String value, 670 DoubleMetaphoneResult result, 671 int index) { 672 if (charAt(value, index + 2) == 'H') { 673 //-- Schlesinger's rule --// 674 if (contains(value, index + 3, 675 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 676 //-- Dutch origin, e.g. "school", "schooner" --// 677 if (contains(value, index + 3, 2, "ER", "EN")) { 678 //-- "schermerhorn", "schenker" --// 679 result.append("X", "SK"); 680 } else { 681 result.append("SK"); 682 } 683 } else { 684 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 685 result.append('X', 'S'); 686 } else { 687 result.append('X'); 688 } 689 } 690 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 691 result.append('S'); 692 } else { 693 result.append("SK"); 694 } 695 return index + 3; 696 } 697 698 /** 699 * Handles 'T' cases 700 */ handleT(String value, DoubleMetaphoneResult result, int index)701 private int handleT(String value, 702 DoubleMetaphoneResult result, 703 int index) { 704 if (contains(value, index, 4, "TION")) { 705 result.append('X'); 706 index += 3; 707 } else if (contains(value, index, 3, "TIA", "TCH")) { 708 result.append('X'); 709 index += 3; 710 } else if (contains(value, index, 2, "TH") || contains(value, index, 711 3, "TTH")) { 712 if (contains(value, index + 2, 2, "OM", "AM") || 713 //-- special case "thomas", "thames" or germanic --// 714 contains(value, 0, 4, "VAN ", "VON ") || 715 contains(value, 0, 3, "SCH")) { 716 result.append('T'); 717 } else { 718 result.append('0', 'T'); 719 } 720 index += 2; 721 } else { 722 result.append('T'); 723 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 724 } 725 return index; 726 } 727 728 /** 729 * Handles 'W' cases 730 */ handleW(String value, DoubleMetaphoneResult result, int index)731 private int handleW(String value, 732 DoubleMetaphoneResult result, 733 int index) { 734 if (contains(value, index, 2, "WR")) { 735 //-- can also be in middle of word --// 736 result.append('R'); 737 index += 2; 738 } else { 739 if (index == 0 && (isVowel(charAt(value, index + 1)) || 740 contains(value, index, 2, "WH"))) { 741 if (isVowel(charAt(value, index + 1))) { 742 //-- Wasserman should match Vasserman --// 743 result.append('A', 'F'); 744 } else { 745 //-- need Uomo to match Womo --// 746 result.append('A'); 747 } 748 index++; 749 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 750 contains(value, index - 1, 751 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 752 contains(value, 0, 3, "SCH")) { 753 //-- Arnow should match Arnoff --// 754 result.appendAlternate('F'); 755 index++; 756 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 757 //-- Polish e.g. "filipowicz" --// 758 result.append("TS", "FX"); 759 index += 4; 760 } else { 761 index++; 762 } 763 } 764 return index; 765 } 766 767 /** 768 * Handles 'X' cases 769 */ handleX(String value, DoubleMetaphoneResult result, int index)770 private int handleX(String value, 771 DoubleMetaphoneResult result, 772 int index) { 773 if (index == 0) { 774 result.append('S'); 775 index++; 776 } else { 777 if (!((index == value.length() - 1) && 778 (contains(value, index - 3, 3, "IAU", "EAU") || 779 contains(value, index - 2, 2, "AU", "OU")))) { 780 //-- French e.g. breaux --// 781 result.append("KS"); 782 } 783 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 784 } 785 return index; 786 } 787 788 /** 789 * Handles 'Z' cases 790 */ handleZ(String value, DoubleMetaphoneResult result, int index, boolean slavoGermanic)791 private int handleZ(String value, DoubleMetaphoneResult result, int index, 792 boolean slavoGermanic) { 793 if (charAt(value, index + 1) == 'H') { 794 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 795 result.append('J'); 796 index += 2; 797 } else { 798 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 799 result.append("S", "TS"); 800 } else { 801 result.append('S'); 802 } 803 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 804 } 805 return index; 806 } 807 808 //-- BEGIN CONDITIONS --// 809 810 /** 811 * Complex condition 0 for 'C' 812 */ conditionC0(String value, int index)813 private boolean conditionC0(String value, int index) { 814 if (contains(value, index, 4, "CHIA")) { 815 return true; 816 } else if (index <= 1) { 817 return false; 818 } else if (isVowel(charAt(value, index - 2))) { 819 return false; 820 } else if (!contains(value, index - 1, 3, "ACH")) { 821 return false; 822 } else { 823 char c = charAt(value, index + 2); 824 return (c != 'I' && c != 'E') 825 || contains(value, index - 2, 6, "BACHER", "MACHER"); 826 } 827 } 828 829 /** 830 * Complex condition 0 for 'CH' 831 */ conditionCH0(String value, int index)832 private boolean conditionCH0(String value, int index) { 833 if (index != 0) { 834 return false; 835 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 836 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 837 return false; 838 } else if (contains(value, 0, 5, "CHORE")) { 839 return false; 840 } else { 841 return true; 842 } 843 } 844 845 /** 846 * Complex condition 1 for 'CH' 847 */ conditionCH1(String value, int index)848 private boolean conditionCH1(String value, int index) { 849 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 850 3, "SCH")) || 851 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 852 contains(value, index + 2, 1, "T", "S") || 853 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 854 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 855 } 856 857 /** 858 * Complex condition 0 for 'L' 859 */ conditionL0(String value, int index)860 private boolean conditionL0(String value, int index) { 861 if (index == value.length() - 3 && 862 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 863 return true; 864 } else if ((contains(value, index - 1, 2, "AS", "OS") || 865 contains(value, value.length() - 1, 1, "A", "O")) && 866 contains(value, index - 1, 4, "ALLE")) { 867 return true; 868 } else { 869 return false; 870 } 871 } 872 873 /** 874 * Complex condition 0 for 'M' 875 */ conditionM0(String value, int index)876 private boolean conditionM0(String value, int index) { 877 if (charAt(value, index + 1) == 'M') { 878 return true; 879 } 880 return contains(value, index - 1, 3, "UMB") 881 && ((index + 1) == value.length() - 1 || contains(value, 882 index + 2, 2, "ER")); 883 } 884 885 //-- BEGIN HELPER FUNCTIONS --// 886 887 /** 888 * Determines whether or not a value is of slavo-germanic orgin. A value is 889 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 890 */ isSlavoGermanic(String value)891 private boolean isSlavoGermanic(String value) { 892 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 893 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 894 } 895 896 /** 897 * Determines whether or not a character is a vowel or not 898 */ isVowel(char ch)899 private boolean isVowel(char ch) { 900 return VOWELS.indexOf(ch) != -1; 901 } 902 903 /** 904 * Determines whether or not the value starts with a silent letter. It will 905 * return <code>true</code> if the value starts with any of 'GN', 'KN', 906 * 'PN', 'WR' or 'PS'. 907 */ isSilentStart(String value)908 private boolean isSilentStart(String value) { 909 boolean result = false; 910 for (int i = 0; i < SILENT_START.length; i++) { 911 if (value.startsWith(SILENT_START[i])) { 912 result = true; 913 break; 914 } 915 } 916 return result; 917 } 918 919 /** 920 * Cleans the input 921 */ cleanInput(String input)922 private String cleanInput(String input) { 923 if (input == null) { 924 return null; 925 } 926 input = input.trim(); 927 if (input.length() == 0) { 928 return null; 929 } 930 return input.toUpperCase(); 931 } 932 933 /** 934 * Gets the character at index <code>index</code> if available, otherwise 935 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 936 * of a default 937 */ charAt(String value, int index)938 protected char charAt(String value, int index) { 939 if (index < 0 || index >= value.length()) { 940 return Character.MIN_VALUE; 941 } 942 return value.charAt(index); 943 } 944 945 /** 946 * Shortcut method with 1 criteria 947 */ contains(String value, int start, int length, String criteria)948 private static boolean contains(String value, int start, int length, 949 String criteria) { 950 return contains(value, start, length, 951 new String[] { criteria }); 952 } 953 954 /** 955 * Shortcut method with 2 criteria 956 */ contains(String value, int start, int length, String criteria1, String criteria2)957 private static boolean contains(String value, int start, int length, 958 String criteria1, String criteria2) { 959 return contains(value, start, length, 960 new String[] { criteria1, criteria2 }); 961 } 962 963 /** 964 * Shortcut method with 3 criteria 965 */ contains(String value, int start, int length, String criteria1, String criteria2, String criteria3)966 private static boolean contains(String value, int start, int length, 967 String criteria1, String criteria2, 968 String criteria3) { 969 return contains(value, start, length, 970 new String[] { criteria1, criteria2, criteria3 }); 971 } 972 973 /** 974 * Shortcut method with 4 criteria 975 */ contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4)976 private static boolean contains(String value, int start, int length, 977 String criteria1, String criteria2, 978 String criteria3, String criteria4) { 979 return contains(value, start, length, 980 new String[] { criteria1, criteria2, criteria3, 981 criteria4 }); 982 } 983 984 /** 985 * Shortcut method with 5 criteria 986 */ contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5)987 private static boolean contains(String value, int start, int length, 988 String criteria1, String criteria2, 989 String criteria3, String criteria4, 990 String criteria5) { 991 return contains(value, start, length, 992 new String[] { criteria1, criteria2, criteria3, 993 criteria4, criteria5 }); 994 } 995 996 /** 997 * Shortcut method with 6 criteria 998 */ contains(String value, int start, int length, String criteria1, String criteria2, String criteria3, String criteria4, String criteria5, String criteria6)999 private static boolean contains(String value, int start, int length, 1000 String criteria1, String criteria2, 1001 String criteria3, String criteria4, 1002 String criteria5, String criteria6) { 1003 return contains(value, start, length, 1004 new String[] { criteria1, criteria2, criteria3, 1005 criteria4, criteria5, criteria6 }); 1006 } 1007 1008 /** 1009 * Determines whether <code>value</code> contains any of the criteria 1010 starting 1011 * at index <code>start</code> and matching up to length <code>length</code> 1012 */ contains(String value, int start, int length, String[] criteria)1013 protected static boolean contains(String value, int start, int length, 1014 String[] criteria) { 1015 boolean result = false; 1016 if (start >= 0 && start + length <= value.length()) { 1017 String target = value.substring(start, start + length); 1018 1019 for (int i = 0; i < criteria.length; i++) { 1020 if (target.equals(criteria[i])) { 1021 result = true; 1022 break; 1023 } 1024 } 1025 } 1026 return result; 1027 } 1028 1029 //-- BEGIN INNER CLASSES --// 1030 1031 /** 1032 * Inner class for storing results, since there is the optional alternate 1033 * encoding. 1034 */ 1035 public class DoubleMetaphoneResult { 1036 1037 private StringBuffer primary = new StringBuffer(getMaxCodeLen()); 1038 private StringBuffer alternate = new StringBuffer(getMaxCodeLen()); 1039 private int maxLength; 1040 DoubleMetaphoneResult(int maxLength)1041 public DoubleMetaphoneResult(int maxLength) { 1042 this.maxLength = maxLength; 1043 } 1044 append(char value)1045 public void append(char value) { 1046 appendPrimary(value); 1047 appendAlternate(value); 1048 } 1049 append(char primary, char alternate)1050 public void append(char primary, char alternate) { 1051 appendPrimary(primary); 1052 appendAlternate(alternate); 1053 } 1054 appendPrimary(char value)1055 public void appendPrimary(char value) { 1056 if (this.primary.length() < this.maxLength) { 1057 this.primary.append(value); 1058 } 1059 } 1060 appendAlternate(char value)1061 public void appendAlternate(char value) { 1062 if (this.alternate.length() < this.maxLength) { 1063 this.alternate.append(value); 1064 } 1065 } 1066 append(String value)1067 public void append(String value) { 1068 appendPrimary(value); 1069 appendAlternate(value); 1070 } 1071 append(String primary, String alternate)1072 public void append(String primary, String alternate) { 1073 appendPrimary(primary); 1074 appendAlternate(alternate); 1075 } 1076 appendPrimary(String value)1077 public void appendPrimary(String value) { 1078 int addChars = this.maxLength - this.primary.length(); 1079 if (value.length() <= addChars) { 1080 this.primary.append(value); 1081 } else { 1082 this.primary.append(value.substring(0, addChars)); 1083 } 1084 } 1085 appendAlternate(String value)1086 public void appendAlternate(String value) { 1087 int addChars = this.maxLength - this.alternate.length(); 1088 if (value.length() <= addChars) { 1089 this.alternate.append(value); 1090 } else { 1091 this.alternate.append(value.substring(0, addChars)); 1092 } 1093 } 1094 getPrimary()1095 public String getPrimary() { 1096 return this.primary.toString(); 1097 } 1098 getAlternate()1099 public String getAlternate() { 1100 return this.alternate.toString(); 1101 } 1102 isComplete()1103 public boolean isComplete() { 1104 return this.primary.length() >= this.maxLength && 1105 this.alternate.length() >= this.maxLength; 1106 } 1107 } 1108 } 1109