1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 12 import com.ibm.icu.text.BreakIterator; 13 import com.ibm.icu.text.CollationElementIterator; 14 import com.ibm.icu.text.Collator; 15 import com.ibm.icu.text.RuleBasedCollator; 16 import com.ibm.icu.util.ULocale; 17 18 /** 19 * This is intended to be a reference implementation for StringSearch. The 20 * implementation is thus not high performance; it just transforms both the key 21 * (what is searched for) and the target into collationElement space, and 22 * searches there. It is however, architecturally cleaner in that it first has a 23 * search that finds and extended range, then picks a boundary within that. 24 * <p> 25 * The key is that there is a match for a key string in a target text at positions <a,b> iff collator.compare(key, 26 * target.substring(a,b)) == 0. 27 * 28 * @author markdavis 29 */ 30 public class ReferenceStringSearch { 31 private static final int PADDING = 3; 32 33 private RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 34 35 private BreakIterator breaker; 36 37 private String key; 38 39 private String target; 40 41 // all of the following are in processed CollationElementSpace 42 43 private int[] keyBuffer; 44 45 private CollationElementIterator2 targetIterator; 46 47 private int[] targetBuffer; 48 49 private int targetBufferStart; 50 private int targetBufferLength; 51 52 // Map from target items back to native indices 53 private int[] targetBackMapBefore; 54 55 private int[] targetBackMapAfter; 56 private int lastAfter; 57 58 // input flags to indicate which way we want to go within the range of 59 // possible boundaries 60 boolean widestStart, widestLimit; 61 62 /** 63 * A struct for showing match information with the range of possible 64 * boundaries. 65 */ 66 public static class ExtendedRange { 67 int minStart, maxStart, minLimit, maxLimit; 68 69 @Override toString()70 public String toString() { 71 return minStart + ", " + maxStart + ", " + minLimit + ", " + maxLimit; 72 } 73 toString(String key, String target)74 public String toString(String key, String target) { 75 return "'" + target.substring(0, minStart) + "[" 76 + target.substring(minStart, maxStart) + "{" 77 + target.substring(maxStart, minLimit) + "}" 78 + target.substring(minLimit, maxLimit) + "]" 79 + target.substring(maxLimit) + "'"; 80 } 81 } 82 83 /** 84 * A struct that just shows the start/end, based on the settings given. 85 */ 86 public static class Range { 87 public int start; 88 public int limit; 89 90 @Override toString()91 public String toString() { 92 return start + ", " + limit; 93 } 94 toString(String key, String target)95 public String toString(String key, String target) { 96 return "'" + target.substring(0, start) + "[" 97 + target.substring(start, limit) + "]" 98 + target.substring(limit) + "'"; 99 } 100 } 101 getCollator()102 public RuleBasedCollator getCollator() { 103 return collator; 104 } 105 106 /** 107 * If the collator settings are changed externally, be sure to call 108 * setCollator(); 109 */ setCollator(RuleBasedCollator collator)110 public ReferenceStringSearch setCollator(RuleBasedCollator collator) { 111 this.collator = collator; 112 targetIterator = new CollationElementIterator2(collator); 113 // reset the key and target, since their collationElements may change 114 if (key != null) { 115 setKey(key); 116 } 117 if (target != null) { 118 setTarget(target); 119 } 120 return this; 121 } 122 getBreaker()123 public BreakIterator getBreaker() { 124 return breaker; 125 } 126 setBreaker(BreakIterator breaker)127 public ReferenceStringSearch setBreaker(BreakIterator breaker) { 128 this.breaker = breaker; 129 if (target != null) { 130 breaker.setText(target); 131 } 132 return this; 133 } 134 135 /** 136 * If true, will pick the largest possible limit for a match. 137 */ isWidestLimit()138 public boolean isWidestLimit() { 139 return widestLimit; 140 } 141 142 /** 143 * If true, will pick the largest possible limit for a match. 144 */ setWidestLimit(boolean widestLimit)145 public void setWidestLimit(boolean widestLimit) { 146 this.widestLimit = widestLimit; 147 } 148 149 /** 150 * If true, will pick the least possible start for a match. 151 */ isWidestStart()152 public boolean isWidestStart() { 153 return widestStart; 154 } 155 156 /** 157 * If true, will pick the least possible start for a match. 158 */ setWidestStart(boolean widestStart)159 public void setWidestStart(boolean widestStart) { 160 this.widestStart = widestStart; 161 } 162 getKey()163 public String getKey() { 164 return key; 165 } 166 setKey(String key)167 public ReferenceStringSearch setKey(String key) { 168 this.key = key; 169 ArrayList<Integer> keyBufferList = new ArrayList<>(); 170 CollationElementIterator2 keyIterator = new CollationElementIterator2(collator).setText(key); 171 while (true) { 172 int collationElement = keyIterator.nextProcessed(); 173 if (collationElement == CollationElementIterator.NULLORDER) { 174 break; 175 } 176 // store the primary, plus the index before and after. 177 keyBufferList.add(collationElement); 178 } 179 keyBuffer = getIntBuffer(keyBufferList); 180 return this; 181 } 182 getNativeOffset()183 public int getNativeOffset() { 184 return targetBackMapBefore[targetBufferStart]; 185 } 186 setNativeOffset(int nativeOffset)187 public ReferenceStringSearch setNativeOffset(int nativeOffset) { 188 if (nativeOffset < 0 || nativeOffset > target.length()) { 189 throw new ArrayIndexOutOfBoundsException(); 190 } 191 // use dumb implementation. Better implementation would leverage 192 // contents of current buffer 193 targetIterator.setOffset(nativeOffset); 194 targetBufferStart = 0; 195 targetBufferLength = 0; 196 lastAfter = 0; 197 if (nativeOffset != 0) { 198 // we have to reset lastAfter! 199 targetIterator.previousProcessed(); 200 lastAfter = targetIterator.offsetAfter; 201 } 202 fillBuffer(); 203 return this; 204 } 205 getTarget()206 public String getTarget() { 207 return target; 208 } 209 210 /** 211 * Set the text to search within. 212 */ setTarget(String target)213 public ReferenceStringSearch setTarget(String target) { 214 this.target = target; 215 if (breaker != null) { 216 breaker.setText(target); 217 } 218 targetIterator.setText(target); // TODO optimize creation 219 // make the buffer just a bit bigger than we need 220 // note: we could optimize to reuse buffers, but probably not worth it 221 targetBuffer = new int[keyBuffer.length + PADDING]; 222 targetBackMapBefore = new int[keyBuffer.length + PADDING]; 223 targetBackMapAfter = new int[keyBuffer.length + PADDING]; 224 targetBufferStart = 0; 225 lastAfter = 0; 226 targetBufferLength = 0; 227 fillBuffer(); 228 return this; 229 } 230 231 /** 232 * We keep the circular buffer as start + length instead of start + limit so 233 * we can distinguish the 0 case. 234 * 235 * @return 236 */ shiftBuffer()237 private boolean shiftBuffer() { 238 lastAfter = targetBackMapAfter[targetBufferStart]; // remember last after 239 ++targetBufferStart; 240 if (targetBufferStart >= targetBuffer.length) { 241 targetBufferStart = 0; 242 } 243 --targetBufferLength; 244 return fillBuffer(); 245 } 246 fillBuffer()247 private boolean fillBuffer() { 248 // TODO mark as done so we don't call extra at the end 249 while (targetBufferLength < keyBuffer.length + 1) { 250 int ce = targetIterator.nextProcessed(); 251 if (ce == CollationElementIterator.NULLORDER) { 252 return false; 253 } 254 int targetBufferLimit = targetBufferStart + targetBufferLength; 255 if (targetBufferLimit >= targetBuffer.length) { 256 targetBufferLimit -= targetBuffer.length; 257 } 258 targetBuffer[targetBufferLimit] = ce; 259 targetBackMapBefore[targetBufferLimit] = targetIterator.offsetBefore; 260 targetBackMapAfter[targetBufferLimit] = targetIterator.offsetAfter; 261 ++targetBufferLength; 262 } 263 return true; 264 } 265 266 /** 267 * Find the next match, and set the min/maxStart/Limit. Return false if not 268 * found. Here is an example, where [...] is the maximal match, and {..} the 269 * minimal. 270 * 271 * <pre> 272 * Raw Positions of 'eß' in ' ess eß ESS̀ ' 273 * '[ {ess} ]eß ESS̀ ' 274 * ' ess[ {eß} ]ESS̀ ' 275 * ' ess eß[ {ESS}̀ ]' 276 * </pre> 277 * 278 * Matches may overlap. Thus 279 * 280 * <pre> 281 * Raw Positions of 'a!a' in 'b.a.a.a.b' 282 * 'b[.{a.a}.]a.b' 283 * 'b.a[.{a.a}.]b' 284 * </pre> 285 */ searchForwards(ExtendedRange position)286 public boolean searchForwards(ExtendedRange position) { 287 while (targetBufferLength >= keyBuffer.length) { 288 if (matchesAt()) { 289 // minStart is from the After of previous item. 290 position.minStart = lastAfter; 291 292 position.maxStart = targetBackMapBefore[targetBufferStart]; 293 int last = targetBufferStart + keyBuffer.length - 1; 294 if (last >= targetBuffer.length) { 295 last -= targetBuffer.length; 296 } 297 position.minLimit = targetBackMapAfter[last]; 298 299 // maxLimit is from the Before of the next item 300 // or, if we are at the very end of the text, the text length. 301 if (targetBufferLength == keyBuffer.length) { 302 position.maxLimit = target.length(); 303 } else { 304 ++last; // we reuse the position since we are already there. 305 if (last >= targetBuffer.length) { 306 last -= targetBuffer.length; 307 } 308 position.maxLimit = targetBackMapBefore[last]; 309 } 310 shiftBuffer(); // move to next offset 311 return true; 312 } 313 shiftBuffer(); // move to next offset 314 } 315 return false; 316 } 317 318 /** 319 * Simple match at position. 320 */ matchesAt()321 private boolean matchesAt() { 322 int j = targetBufferStart; 323 for (int i = 0; i < keyBuffer.length; ++i) { 324 if (keyBuffer[i] != targetBuffer[j]) { 325 return false; 326 } 327 ++j; 328 if (j >= targetBuffer.length) { 329 j = 0; 330 } 331 } 332 return true; 333 } 334 335 private ExtendedRange internalPosition = new ExtendedRange(); 336 337 /** 338 * This is the main public interface for searching. It filters out anything 339 * that doesn't match the breaker, and adjusts the boundaries to the max/min 340 * permitted. Examples: 341 * 342 * <pre> 343 * Positions of 'eß' in ' ess eß ESS̀ ' 344 * ' [ess] eß ESS̀ ' 345 * ' ess [eß] ESS̀ ' 346 * ' ess eß [ESS̀] ' 347 * Count: 3 348 * 349 * Positions of 'a!a' in 'b.a.a.a.b' 350 * 'b.[a.a].a.b' 351 * 'b.a.[a.a].b' 352 * Count: 2 353 * </pre> 354 * 355 * @param position 356 * @return 357 */ searchForwards(Range position)358 public boolean searchForwards(Range position) { 359 while (true) { 360 boolean succeeds = searchForwards(internalPosition); 361 if (!succeeds) return false; 362 position.start = getBoundary(breaker, internalPosition.minStart, internalPosition.maxStart, !widestStart); 363 if (position.start == -1) continue; // failed to find the right boundary 364 position.limit = getBoundary(breaker, internalPosition.minLimit, internalPosition.maxLimit, widestLimit); 365 if (position.limit == -1) continue; // failed to find the right boundary 366 return true; 367 } 368 } 369 370 /****************** PRIVATES ***************/ 371 372 /** 373 * This really ought to be just methods on CollationElementIterator. 374 */ 375 public static class CollationElementIterator2 { 376 private CollationElementIterator keyIterator; 377 private int strengthMask; 378 private int variableTop; 379 private int offsetBefore; 380 private int offsetAfter; 381 getOffsetBefore()382 public int getOffsetBefore() { 383 return offsetBefore; 384 } 385 getOffsetAfter()386 public int getOffsetAfter() { 387 return offsetAfter; 388 } 389 reset()390 public CollationElementIterator2 reset() { 391 keyIterator.reset(); 392 return this; 393 } 394 setOffset(int offset)395 public CollationElementIterator2 setOffset(int offset) { 396 keyIterator.setOffset(offset); 397 return this; 398 } 399 setText(String source)400 public CollationElementIterator2 setText(String source) { 401 keyIterator.setText(source); 402 return this; 403 } 404 CollationElementIterator2(RuleBasedCollator collator)405 public CollationElementIterator2(RuleBasedCollator collator) { 406 // gather some information that we will need later 407 strengthMask = 0xFFFF0000; 408 variableTop = !collator.isAlternateHandlingShifted() ? -1 : collator.getVariableTop() | 0xFFFF; 409 // this needs to be fixed a bit for case-level, etc. 410 switch (collator.getStrength()) { 411 case Collator.PRIMARY: 412 strengthMask = 0xFFFF0000; 413 break; 414 case Collator.SECONDARY: 415 strengthMask = 0xFFFFFF00; 416 break; 417 default: 418 strengthMask = 0xFFFFFFFF; 419 break; 420 } 421 keyIterator = collator.getCollationElementIterator(""); 422 } 423 424 /** 425 * This should be a method on CollationElementIterator. Returns next 426 * non-zero collation element, setting indexBefore, indexAfter. Should also 427 * process shifted and strength, masking as needed. If a collation element 428 * has a continuation, then the indexAfter = indexBefore, for example, if 429 * [CE1,CE2] form a single collation element for the characters between 430 * native indexes 5 and 8, (C2 being a continuation, then the result of two 431 * calls to nextProcessed would be [CE1, 5, 5] then [CE1, 5,8]. 432 * <p> 433 * previousProcessed would do similar things, backwards. 434 * 435 */ nextProcessed()436 int nextProcessed() { 437 while (true) { 438 offsetBefore = keyIterator.getOffset(); 439 int collationElement = keyIterator.next(); 440 if (collationElement != CollationElementIterator.NULLORDER) { 441 442 // note: the collation element iterator ought to give us processed values, but it doesn't 443 // so we have to simulate that. 444 collationElement &= strengthMask; // mask to only the strengths we have 445 // check for shifted. 446 // TODO This is not exactly right, and we also need to eject any following combining marks, 447 // so fix later. 448 if (collationElement < variableTop && collationElement > 0xFFFF) { 449 continue; 450 } 451 if (collationElement == 0) { 452 continue; 453 } 454 455 } 456 offsetAfter = keyIterator.getOffset(); 457 return collationElement; 458 } 459 } 460 previousProcessed()461 int previousProcessed() { 462 while (true) { 463 offsetAfter = keyIterator.getOffset(); 464 int collationElement = keyIterator.previous(); 465 if (collationElement != CollationElementIterator.NULLORDER) { 466 467 // note: the collation element iterator ought to give us processed values, but it doesn't 468 // so we have to simulate that. 469 collationElement &= strengthMask; // mask to only the strengths we have 470 // check for shifted. 471 // TODO This is not exactly right, and we also need to eject any following combining marks, 472 // so fix later. 473 if (collationElement < variableTop && collationElement > 0xFFFF) { 474 continue; 475 } 476 if (collationElement == 0) { 477 continue; 478 } 479 480 } 481 offsetBefore = keyIterator.getOffset(); 482 return collationElement; 483 } 484 } 485 } 486 487 /** 488 * Utility 489 * 490 * @param keyBufferList 491 * @return 492 */ getIntBuffer(ArrayList<Integer> keyBufferList)493 private int[] getIntBuffer(ArrayList<Integer> keyBufferList) { 494 int[] buffer = new int[keyBufferList.size()]; 495 for (int i = 0; i < keyBufferList.size(); ++i) { 496 buffer[i] = keyBufferList.get(i).intValue(); 497 } 498 return buffer; 499 } 500 501 /** 502 * This really should be on breakIterator, so it can be done more efficiently. 503 * <p> 504 * Get the item that is on a boundary according to the breaker, and is between the input boundaries. The boolean 505 * greatest controls whether we pick the least or the greatest possible offset that works according to the breaker. 506 * <p> 507 * Returns -1 if there is no valid offset according to the breaker. 508 * 509 * @param minBoundary 510 * @param maxBoundary 511 * @param greatest 512 * @return 513 */ getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest)514 public static int getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest) { 515 if (breaker == null) return greatest ? maxBoundary : minBoundary; 516 int result; 517 // this may or may not be the most efficient way to test; ask Andy 518 if (greatest) { 519 result = breaker.preceding(maxBoundary + 1); 520 if (result < minBoundary) { 521 result = -1; 522 } 523 } else { 524 result = breaker.following(minBoundary - 1); 525 if (result < minBoundary) { 526 result = -1; 527 } 528 } 529 return result; 530 } 531 532 }