/*
 * Copyright 2001-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.Locale;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a double metaphone value.
 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
 * <ul>
 * <li>Original Article: <a
 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
 * </ul>
 *
 * <p>Each input produces a primary and an alternate code, both truncated to
 * {@link #getMaxCodeLen()} characters. Input is trimmed and upper-cased before
 * encoding; <code>null</code> or blank input encodes to <code>null</code>.</p>
 *
 * @author Apache Software Foundation
 * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
 */
public class DoubleMetaphone implements StringEncoder {

    /**
     * "Vowels" to test for
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes when present which are not pronounced
     */
    private static final String[] SILENT_START =
        { "GN", "KN", "PN", "WR", "PS" };
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
        { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
        { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    private static final String[] L_T_K_S_N_M_B_Z =
        { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Maximum length of an encoding, default is 4
     */
    protected int maxCodeLen = 4;

    /**
     * Creates an instance of this DoubleMetaphone encoder
     */
    public DoubleMetaphone() {
        super();
    }

    /**
     * Encode a value with Double Metaphone
     *
     * @param value String to encode
     * @return an encoded string, or <code>null</code> if <code>value</code>
     *         is <code>null</code> or blank
     */
    public String doubleMetaphone(String value) {
        return doubleMetaphone(value, false);
    }

    /**
     * Encode a value with Double Metaphone, optionally using the alternate
     * encoding.
     *
     * @param value String to encode
     * @param alternate use alternate encode
     * @return an encoded string, or <code>null</code> if <code>value</code>
     *         is <code>null</code> or blank
     */
    public String doubleMetaphone(String value, boolean alternate) {
        value = cleanInput(value);
        if (value == null) {
            return null;
        }

        boolean slavoGermanic = isSlavoGermanic(value);
        // A silent first letter ("KN", "GN", ...) is skipped entirely.
        int index = isSilentStart(value) ? 1 : 0;

        DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());

        // Each handler consumes one or more characters and returns the next index.
        while (!result.isComplete() && index <= value.length() - 1) {
            switch (value.charAt(index)) {
            case 'A':
            case 'E':
            case 'I':
            case 'O':
            case 'U':
            case 'Y':
                index = handleAEIOUY(value, result, index);
                break;
            case 'B':
                result.append('P');
                index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
                break;
            case '\u00C7':
                // A C with a Cedilla
                result.append('S');
                index++;
                break;
            case 'C':
                index = handleC(value, result, index);
                break;
            case 'D':
                index = handleD(value, result, index);
                break;
            case 'F':
                result.append('F');
                index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
                break;
            case 'G':
                index = handleG(value, result, index, slavoGermanic);
                break;
            case 'H':
                index = handleH(value, result, index);
                break;
            case 'J':
                index = handleJ(value, result, index, slavoGermanic);
                break;
            case 'K':
                result.append('K');
                index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
                break;
            case 'L':
                index = handleL(value, result, index);
                break;
            case 'M':
                result.append('M');
                index = conditionM0(value, index) ? index + 2 : index + 1;
                break;
            case 'N':
                result.append('N');
                index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
                break;
            case '\u00D1':
                // N with a tilde (spanish ene)
                result.append('N');
                index++;
                break;
            case 'P':
                index = handleP(value, result, index);
                break;
            case 'Q':
                result.append('K');
                index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
                break;
            case 'R':
                index = handleR(value, result, index, slavoGermanic);
                break;
            case 'S':
                index = handleS(value, result, index, slavoGermanic);
                break;
            case 'T':
                index = handleT(value, result, index);
                break;
            case 'V':
                result.append('F');
                index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
                break;
            case 'W':
                index = handleW(value, result, index);
                break;
            case 'X':
                index = handleX(value, result, index);
                break;
            case 'Z':
                index = handleZ(value, result, index, slavoGermanic);
                break;
            default:
                // Anything else (digits, punctuation, spaces, ...) is not encoded.
                index++;
                break;
            }
        }

        return alternate ? result.getAlternate() : result.getPrimary();
    }

    /**
     * Encode the value using DoubleMetaphone.  It will only work if
     * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
     *
     * @param obj Object to encode (should be of type String)
     * @return An encoded Object (will be of type String)
     * @throws EncoderException encode parameter is not of type String
     */
    public Object encode(Object obj) throws EncoderException {
        if (!(obj instanceof String)) {
            throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
        }
        return doubleMetaphone((String) obj);
    }

    /**
     * Encode the value using DoubleMetaphone.
     *
     * @param value String to encode
     * @return An encoded String
     */
    public String encode(String value) {
        return doubleMetaphone(value);
    }

    /**
     * Check if the Double Metaphone values of two <code>String</code> values
     * are equal.
     *
     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
     * @return <code>true</code> if the encoded <code>String</code>s are equal;
     *          <code>false</code> otherwise.
     * @see #isDoubleMetaphoneEqual(String,String,boolean)
     */
    public boolean isDoubleMetaphoneEqual(String value1, String value2) {
        return isDoubleMetaphoneEqual(value1, value2, false);
    }

    /**
     * Check if the Double Metaphone values of two <code>String</code> values
     * are equal, optionally using the alternate value.
     *
     * <p>The comparison is null-safe: <code>null</code> or blank inputs encode
     * to <code>null</code>, and two <code>null</code> encodings are considered
     * equal.</p>
     *
     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
     * @param alternate use the alternate value if <code>true</code>.
     * @return <code>true</code> if the encoded <code>String</code>s are equal;
     *          <code>false</code> otherwise.
     */
    public boolean isDoubleMetaphoneEqual(String value1,
                                          String value2,
                                          boolean alternate) {
        String encoded1 = doubleMetaphone(value1, alternate);
        String encoded2 = doubleMetaphone(value2, alternate);
        // doubleMetaphone() returns null for null/blank input, so never
        // dereference the left-hand encoding unconditionally.
        if (encoded1 == null) {
            return encoded2 == null;
        }
        return encoded1.equals(encoded2);
    }

    /**
     * Returns the maxCodeLen.
     * @return int
     */
    public int getMaxCodeLen() {
        return this.maxCodeLen;
    }

    /**
     * Sets the maxCodeLen.
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }

    //-- BEGIN HANDLERS --//

    /**
     * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases.
     * A vowel is only encoded (as 'A') when it is the first character.
     */
    private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
                             index) {
        if (index == 0) {
            result.append('A');
        }
        return index + 1;
    }

    /**
     * Handles 'C' cases
     */
    private int handleC(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (conditionC0(value, index)) {  // very confusing, moved out
            result.append('K');
            index += 2;
        } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
            result.append('S');
            index += 2;
        } else if (contains(value, index, 2, "CH")) {
            index = handleCH(value, result, index);
        } else if (contains(value, index, 2, "CZ") &&
                   !contains(value, index - 2, 4, "WICZ")) {
            //-- "Czerny" --//
            result.append('S', 'X');
            index += 2;
        } else if (contains(value, index + 1, 3, "CIA")) {
            //-- "focaccia" --//
            result.append('X');
            index += 3;
        } else if (contains(value, index, 2, "CC") &&
                   !(index == 1 && charAt(value, 0) == 'M')) {
            //-- double "cc" but not "McClelland" --//
            return handleCC(value, result, index);
        } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
            result.append('K');
            index += 2;
        } else if (contains(value, index, 2, "CI", "CE", "CY")) {
            //-- Italian vs. English --//
            if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
                result.append('S', 'X');
            } else {
                result.append('S');
            }
            index += 2;
        } else {
            result.append('K');
            if (contains(value, index + 1, 2, " C", " Q", " G")) {
                //-- Mac Caffrey, Mac Gregor --//
                index += 3;
            } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
                       !contains(value, index + 1, 2, "CE", "CI")) {
                index += 2;
            } else {
                index++;
            }
        }

        return index;
    }

    /**
     * Handles 'CC' cases
     */
    private int handleCC(String value,
                         DoubleMetaphoneResult result,
                         int index) {
        if (contains(value, index + 2, 1, "I", "E", "H") &&
            !contains(value, index + 2, 2, "HU")) {
            //-- "bellocchio" but not "bacchus" --//
            if ((index == 1 && charAt(value, index - 1) == 'A') ||
                contains(value, index - 1, 5, "UCCEE", "UCCES")) {
                //-- "accident", "accede", "succeed" --//
                result.append("KS");
            } else {
                //-- "bacci", "bertucci", other Italian --//
                result.append('X');
            }
            index += 3;
        } else {    // Pierce's rule
            result.append('K');
            index += 2;
        }

        return index;
    }

    /**
     * Handles 'CH' cases
     */
    private int handleCH(String value,
                         DoubleMetaphoneResult result,
                         int index) {
        if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
            result.append('K', 'X');
            return index + 2;
        } else if (conditionCH0(value, index)) {
            //-- Greek roots ("chemistry", "chorus", etc.) --//
            result.append('K');
            return index + 2;
        } else if (conditionCH1(value, index)) {
            //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
            result.append('K');
            return index + 2;
        } else {
            if (index > 0) {
                if (contains(value, 0, 2, "MC")) {
                    result.append('K');
                } else {
                    result.append('X', 'K');
                }
            } else {
                result.append('X');
            }
            return index + 2;
        }
    }

    /**
     * Handles 'D' cases
     */
    private int handleD(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (contains(value, index, 2, "DG")) {
            //-- "Edge" --//
            if (contains(value, index + 2, 1, "I", "E", "Y")) {
                result.append('J');
                index += 3;
                //-- "Edgar" --//
            } else {
                result.append("TK");
                index += 2;
            }
        } else if (contains(value, index, 2, "DT", "DD")) {
            result.append('T');
            index += 2;
        } else {
            result.append('T');
            index++;
        }
        return index;
    }

    /**
     * Handles 'G' cases
     */
    private int handleG(String value,
                        DoubleMetaphoneResult result,
                        int index,
                        boolean slavoGermanic) {
        if (charAt(value, index + 1) == 'H') {
            index = handleGH(value, result, index);
        } else if (charAt(value, index + 1) == 'N') {
            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
                result.append("KN", "N");
            } else if (!contains(value, index + 2, 2, "EY") &&
                       charAt(value, index + 1) != 'Y' && !slavoGermanic) {
                result.append("N", "KN");
            } else {
                result.append("KN");
            }
            index = index + 2;
        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
            result.append("KL", "L");
            index += 2;
        } else if (index == 0 &&
                   (charAt(value, index + 1) == 'Y' ||
                    contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
            result.append('K', 'J');
            index += 2;
        } else if ((contains(value, index + 1, 2, "ER") ||
                    charAt(value, index + 1) == 'Y') &&
                   !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
                   !contains(value, index - 1, 1, "E", "I") &&
                   !contains(value, index - 1, 3, "RGY", "OGY")) {
            //-- -ger-, -gy- --//
            result.append('K', 'J');
            index += 2;
        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
                   contains(value, index - 1, 4, "AGGI", "OGGI")) {
            //-- Italian "biaggi" --//
            if ((contains(value, 0, 4, "VAN ", "VON ") ||
                 contains(value, 0, 3, "SCH")) ||
                contains(value, index + 1, 2, "ET")) {
                //-- obvious germanic --//
                result.append('K');
            } else if (contains(value, index + 1, 3, "IER")) {
                // The window must be exactly as long as the criterion; a
                // 4-character window could never equal "IER".
                result.append('J');
            } else {
                result.append('J', 'K');
            }
            index += 2;
        } else if (charAt(value, index + 1) == 'G') {
            index += 2;
            result.append('K');
        } else {
            index++;
            result.append('K');
        }
        return index;
    }

    /**
     * Handles 'GH' cases
     */
    private int handleGH(String value,
                         DoubleMetaphoneResult result,
                         int index) {
        if (index > 0 && !isVowel(charAt(value, index - 1))) {
            result.append('K');
            index += 2;
        } else if (index == 0) {
            if (charAt(value, index + 2) == 'I') {
                result.append('J');
            } else {
                result.append('K');
            }
            index += 2;
        } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
                   (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
                   (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
            //-- Parker's rule (with some further refinements) - "hugh"
            index += 2;
        } else {
            if (index > 2 && charAt(value, index - 1) == 'U' &&
                contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
                //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
                result.append('F');
            } else if (index > 0 && charAt(value, index - 1) != 'I') {
                result.append('K');
            }
            index += 2;
        }
        return index;
    }

    /**
     * Handles 'H' cases
     */
    private int handleH(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        //-- only keep if first & before vowel or between 2 vowels --//
        if ((index == 0 || isVowel(charAt(value, index - 1))) &&
            isVowel(charAt(value, index + 1))) {
            result.append('H');
            index += 2;
            //-- also takes care of "HH" --//
        } else {
            index++;
        }
        return index;
    }

    /**
     * Handles 'J' cases
     */
    private int handleJ(String value, DoubleMetaphoneResult result, int index,
                        boolean slavoGermanic) {
        if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
            //-- obvious Spanish, "Jose", "San Jacinto" --//
            if ((index == 0 && (charAt(value, index + 4) == ' ') ||
                 value.length() == 4) || contains(value, 0, 4, "SAN ")) {
                result.append('H');
            } else {
                result.append('J', 'H');
            }
            index++;
        } else {
            if (index == 0 && !contains(value, index, 4, "JOSE")) {
                result.append('J', 'A');
            } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
                       (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
                result.append('J', 'H');
            } else if (index == value.length() - 1) {
                // NOTE(review): this puts a literal space into the alternate
                // code for a trailing 'J'. Left unchanged to preserve the
                // existing encodings -- confirm whether that is intended.
                result.append('J', ' ');
            } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
                       !contains(value, index - 1, 1, "S", "K", "L")) {
                result.append('J');
            }

            if (charAt(value, index + 1) == 'J') {
                index += 2;
            } else {
                index++;
            }
        }
        return index;
    }

    /**
     * Handles 'L' cases.
     * For a double 'L' in a Spanish-looking word (see
     * {@link #conditionL0(String, int)}) the 'L' goes to the primary code only.
     */
    private int handleL(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (charAt(value, index + 1) == 'L') {
            if (conditionL0(value, index)) {
                //-- Spanish e.g. "cabrillo", "gallegos": alternate gets nothing --//
                result.appendPrimary('L');
            } else {
                result.append('L');
            }
            index += 2;
        } else {
            result.append('L');
            index++;
        }
        return index;
    }

    /**
     * Handles 'P' cases
     */
    private int handleP(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (charAt(value, index + 1) == 'H') {
            result.append('F');
            index += 2;
        } else {
            result.append('P');
            index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'R' cases
     */
    private int handleR(String value,
                        DoubleMetaphoneResult result,
                        int index,
                        boolean slavoGermanic) {
        if (index == value.length() - 1 && !slavoGermanic &&
            contains(value, index - 2, 2, "IE") &&
            !contains(value, index - 4, 2, "ME", "MA")) {
            // Final "-IER" not preceded by "ME"/"MA": 'R' in the alternate only.
            result.appendAlternate('R');
        } else {
            result.append('R');
        }
        return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
    }

    /**
     * Handles 'S' cases
     */
    private int handleS(String value,
                        DoubleMetaphoneResult result,
                        int index,
                        boolean slavoGermanic) {
        if (contains(value, index - 1, 3, "ISL", "YSL")) {
            //-- special cases "island", "isle", "carlisle", "carlysle" --//
            index++;
        } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
            //-- special case "sugar-" --//
            result.append('X', 'S');
            index++;
        } else if (contains(value, index, 2, "SH")) {
            if (contains(value, index + 1, 4,
                         "HEIM", "HOEK", "HOLM", "HOLZ")) {
                //-- germanic --//
                result.append('S');
            } else {
                result.append('X');
            }
            index += 2;
        } else if (contains(value, index, 3, "SIO", "SIA") ||
                   contains(value, index, 4, "SIAN")) {
            //-- Italian and Armenian --//
            if (slavoGermanic) {
                result.append('S');
            } else {
                result.append('S', 'X');
            }
            index += 3;
        } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) ||
                   contains(value, index + 1, 1, "Z")) {
            //-- german & anglicisations, e.g. "smith" match "schmidt" //
            // "snider" match "schneider" --//
            //-- also, -sz- in slavic language altho in hungarian it //
            //   is pronounced "s" --//
            result.append('S', 'X');
            index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
        } else if (contains(value, index, 2, "SC")) {
            index = handleSC(value, result, index);
        } else {
            if (index == value.length() - 1 && contains(value, index - 2,
                                                        2, "AI", "OI")) {
                //-- french e.g. "resnais", "artois" --//
                result.appendAlternate('S');
            } else {
                result.append('S');
            }
            index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'SC' cases
     */
    private int handleSC(String value,
                         DoubleMetaphoneResult result,
                         int index) {
        if (charAt(value, index + 2) == 'H') {
            //-- Schlesinger's rule --//
            if (contains(value, index + 3,
                         2, "OO", "ER", "EN", "UY", "ED", "EM")) {
                //-- Dutch origin, e.g. "school", "schooner" --//
                if (contains(value, index + 3, 2, "ER", "EN")) {
                    //-- "schermerhorn", "schenker" --//
                    result.append("X", "SK");
                } else {
                    result.append("SK");
                }
            } else {
                if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
                    result.append('X', 'S');
                } else {
                    result.append('X');
                }
            }
        } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
            result.append('S');
        } else {
            result.append("SK");
        }
        return index + 3;
    }

    /**
     * Handles 'T' cases
     */
    private int handleT(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (contains(value, index, 4, "TION")) {
            result.append('X');
            index += 3;
        } else if (contains(value, index, 3, "TIA", "TCH")) {
            result.append('X');
            index += 3;
        } else if (contains(value, index, 2, "TH") || contains(value, index,
                                                               3, "TTH")) {
            if (contains(value, index + 2, 2, "OM", "AM") ||
                //-- special case "thomas", "thames" or germanic --//
                contains(value, 0, 4, "VAN ", "VON ") ||
                contains(value, 0, 3, "SCH")) {
                result.append('T');
            } else {
                // '0' (zero) is the code used here for the "th" sound.
                result.append('0', 'T');
            }
            index += 2;
        } else {
            result.append('T');
            index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'W' cases
     */
    private int handleW(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (contains(value, index, 2, "WR")) {
            //-- can also be in middle of word --//
            result.append('R');
            index += 2;
        } else {
            if (index == 0 && (isVowel(charAt(value, index + 1)) ||
                               contains(value, index, 2, "WH"))) {
                if (isVowel(charAt(value, index + 1))) {
                    //-- Wasserman should match Vasserman --//
                    result.append('A', 'F');
                } else {
                    //-- need Uomo to match Womo --//
                    result.append('A');
                }
                index++;
            } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
                       contains(value, index - 1,
                                5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
                       contains(value, 0, 3, "SCH")) {
                //-- Arnow should match Arnoff --//
                result.appendAlternate('F');
                index++;
            } else if (contains(value, index, 4, "WICZ", "WITZ")) {
                //-- Polish e.g. "filipowicz" --//
                result.append("TS", "FX");
                index += 4;
            } else {
                index++;
            }
        }
        return index;
    }

    /**
     * Handles 'X' cases
     */
    private int handleX(String value,
                        DoubleMetaphoneResult result,
                        int index) {
        if (index == 0) {
            result.append('S');
            index++;
        } else {
            if (!((index == value.length() - 1) &&
                  (contains(value, index - 3, 3, "IAU", "EAU") ||
                   contains(value, index - 2, 2, "AU", "OU")))) {
                //-- French e.g. breaux --//
                result.append("KS");
            }
            index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
        }
        return index;
    }

    /**
     * Handles 'Z' cases
     */
    private int handleZ(String value, DoubleMetaphoneResult result, int index,
                        boolean slavoGermanic) {
        if (charAt(value, index + 1) == 'H') {
            //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
            result.append('J');
            index += 2;
        } else {
            if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
                (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
                result.append("S", "TS");
            } else {
                result.append('S');
            }
            index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
        }
        return index;
    }

    //-- BEGIN CONDITIONS --//

    /**
     * Complex condition 0 for 'C'
     */
    private boolean conditionC0(String value, int index) {
        if (contains(value, index, 4, "CHIA")) {
            return true;
        } else if (index <= 1) {
            return false;
        } else if (isVowel(charAt(value, index - 2))) {
            return false;
        } else if (!contains(value, index - 1, 3, "ACH")) {
            return false;
        } else {
            char c = charAt(value, index + 2);
            return (c != 'I' && c != 'E')
                    || contains(value, index - 2, 6, "BACHER", "MACHER");
        }
    }

    /**
     * Complex condition 0 for 'CH'
     */
    private boolean conditionCH0(String value, int index) {
        if (index != 0) {
            return false;
        } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
                   !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
            return false;
        } else if (contains(value, 0, 5, "CHORE")) {
            return false;
        } else {
            return true;
        }
    }

    /**
     * Complex condition 1 for 'CH'
     */
    private boolean conditionCH1(String value, int index) {
        return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
                                                                   3, "SCH")) ||
                contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
                contains(value, index + 2, 1, "T", "S") ||
                ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
                 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) ||
                  index + 1 == value.length() - 1)));
    }

    /**
     * Complex condition 0 for 'L'.
     * True when the double 'L' at <code>index</code> sits in a word that ends
     * in "ILLO", "ILLA" or "ALLE", or that contains "ALLE" at this position
     * and ends in "AS", "OS", "A" or "O".
     */
    private boolean conditionL0(String value, int index) {
        if (index == value.length() - 3 &&
            contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
            return true;
        } else if ((contains(value, value.length() - 2, 2, "AS", "OS") ||
                    contains(value, value.length() - 1, 1, "A", "O")) &&
                   contains(value, index - 1, 4, "ALLE")) {
            // The "AS"/"OS" test looks at the END of the word; anchoring it at
            // index - 1 could never be true together with the "ALLE" match.
            return true;
        } else {
            return false;
        }
    }

    /**
     * Complex condition 0 for 'M'
     */
    private boolean conditionM0(String value, int index) {
        if (charAt(value, index + 1) == 'M') {
            return true;
        }
        return contains(value, index - 1, 3, "UMB")
                && ((index + 1) == value.length() - 1 || contains(value,
                                                                  index + 2, 2, "ER"));
    }

    //-- BEGIN HELPER FUNCTIONS --//

    /**
     * Determines whether or not a value is of slavo-germanic origin. A value is
     * of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
     */
    private boolean isSlavoGermanic(String value) {
        return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
            value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
    }

    /**
     * Determines whether or not a character is a vowel or not
     */
    private boolean isVowel(char ch) {
        return VOWELS.indexOf(ch) != -1;
    }

    /**
     * Determines whether or not the value starts with a silent letter.  It will
     * return <code>true</code> if the value starts with any of 'GN', 'KN',
     * 'PN', 'WR' or 'PS'.
     */
    private boolean isSilentStart(String value) {
        boolean result = false;
        for (int i = 0; i < SILENT_START.length; i++) {
            if (value.startsWith(SILENT_START[i])) {
                result = true;
                break;
            }
        }
        return result;
    }

    /**
     * Cleans the input: trims it and converts it to upper case.
     *
     * @param input the raw value, may be <code>null</code>
     * @return the trimmed, upper-cased value, or <code>null</code> if the
     *         input is <code>null</code> or empty after trimming
     */
    private String cleanInput(String input) {
        if (input == null) {
            return null;
        }
        input = input.trim();
        if (input.length() == 0) {
            return null;
        }
        // Use a fixed locale: with the default locale, e.g. Turkish, 'i' would
        // map to U+0130, which the encoder does not recognise.
        return input.toUpperCase(Locale.ENGLISH);
    }

    /**
     * Gets the character at index <code>index</code> if available, otherwise
     * it returns <code>Character.MIN_VALUE</code> so that there is some sort
     * of a default
     */
    protected char charAt(String value, int index) {
        if (index < 0 || index >= value.length()) {
            return Character.MIN_VALUE;
        }
        return value.charAt(index);
    }

    /**
     * Shortcut method with 1 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria) {
        return contains(value, start, length,
                        new String[] { criteria });
    }

    /**
     * Shortcut method with 2 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria1, String criteria2) {
        return contains(value, start, length,
                        new String[] { criteria1, criteria2 });
    }

    /**
     * Shortcut method with 3 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria1, String criteria2,
                                    String criteria3) {
        return contains(value, start, length,
                        new String[] { criteria1, criteria2, criteria3 });
    }

    /**
     * Shortcut method with 4 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria1, String criteria2,
                                    String criteria3, String criteria4) {
        return contains(value, start, length,
                        new String[] { criteria1, criteria2, criteria3,
                                       criteria4 });
    }

    /**
     * Shortcut method with 5 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria1, String criteria2,
                                    String criteria3, String criteria4,
                                    String criteria5) {
        return contains(value, start, length,
                        new String[] { criteria1, criteria2, criteria3,
                                       criteria4, criteria5 });
    }

    /**
     * Shortcut method with 6 criteria
     */
    private static boolean contains(String value, int start, int length,
                                    String criteria1, String criteria2,
                                    String criteria3, String criteria4,
                                    String criteria5, String criteria6) {
        return contains(value, start, length,
                        new String[] { criteria1, criteria2, criteria3,
                                       criteria4, criteria5, criteria6 });
    }

    /**
     * Determines whether <code>value</code> contains any of the criteria
     * starting at index <code>start</code> and matching up to length
     * <code>length</code>.
     *
     * <p>The window <code>[start, start + length)</code> must lie completely
     * inside <code>value</code> and must equal one of the criteria exactly;
     * otherwise the result is <code>false</code>.</p>
     */
    protected static boolean contains(String value, int start, int length,
                                      String[] criteria) {
        boolean result = false;
        if (start >= 0 && start + length <= value.length()) {
            String target = value.substring(start, start + length);

            for (int i = 0; i < criteria.length; i++) {
                if (target.equals(criteria[i])) {
                    result = true;
                    break;
                }
            }
        }
        return result;
    }

    //-- BEGIN INNER CLASSES --//

    /**
     * Inner class for storing results, since there is the optional alternate
     * encoding.
     */
    public class DoubleMetaphoneResult {

        private StringBuffer primary = new StringBuffer(getMaxCodeLen());
        private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
        private int maxLength;

        /**
         * Creates a result whose two codes are each limited to
         * <code>maxLength</code> characters.
         */
        public DoubleMetaphoneResult(int maxLength) {
            this.maxLength = maxLength;
        }

        /** Appends <code>value</code> to both the primary and the alternate code. */
        public void append(char value) {
            appendPrimary(value);
            appendAlternate(value);
        }

        /** Appends one character to the primary code and another to the alternate. */
        public void append(char primary, char alternate) {
            appendPrimary(primary);
            appendAlternate(alternate);
        }

        /** Appends to the primary code unless it is already at maximum length. */
        public void appendPrimary(char value) {
            if (this.primary.length() < this.maxLength) {
                this.primary.append(value);
            }
        }

        /** Appends to the alternate code unless it is already at maximum length. */
        public void appendAlternate(char value) {
            if (this.alternate.length() < this.maxLength) {
                this.alternate.append(value);
            }
        }

        /** Appends <code>value</code> to both the primary and the alternate code. */
        public void append(String value) {
            appendPrimary(value);
            appendAlternate(value);
        }

        /** Appends one string to the primary code and another to the alternate. */
        public void append(String primary, String alternate) {
            appendPrimary(primary);
            appendAlternate(alternate);
        }

        /** Appends as much of <code>value</code> to the primary code as still fits. */
        public void appendPrimary(String value) {
            int addChars = this.maxLength - this.primary.length();
            if (value.length() <= addChars) {
                this.primary.append(value);
            } else {
                this.primary.append(value.substring(0, addChars));
            }
        }

        /** Appends as much of <code>value</code> to the alternate code as still fits. */
        public void appendAlternate(String value) {
            int addChars = this.maxLength - this.alternate.length();
            if (value.length() <= addChars) {
                this.alternate.append(value);
            } else {
                this.alternate.append(value.substring(0, addChars));
            }
        }

        /** Returns the primary code built so far. */
        public String getPrimary() {
            return this.primary.toString();
        }

        /** Returns the alternate code built so far. */
        public String getAlternate() {
            return this.alternate.toString();
        }

        /** Returns <code>true</code> once both codes have reached the maximum length. */
        public boolean isComplete() {
            return this.primary.length() >= this.maxLength &&
                this.alternate.length() >= this.maxLength;
        }
    }
}