• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 
12 import com.ibm.icu.text.BreakIterator;
13 import com.ibm.icu.text.CollationElementIterator;
14 import com.ibm.icu.text.Collator;
15 import com.ibm.icu.text.RuleBasedCollator;
16 import com.ibm.icu.util.ULocale;
17 
18 /**
19  * This is intended to be a reference implementation for StringSearch. The
20  * implementation is thus not high performance; it just transforms both the key
21  * (what is searched for) and the target into collationElement space, and
22  * searches there. It is however, architecturally cleaner in that it first has a
23  * search that finds and extended range, then picks a boundary within that.
24  * <p>
25  * The key is that there is a match for a key string in a target text at positions <a,b> iff collator.compare(key,
26  * target.substring(a,b)) == 0.
27  *
28  * @author markdavis
29  */
30 public class ReferenceStringSearch {
31     private static final int PADDING = 3;
32 
33     private RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
34 
35     private BreakIterator breaker;
36 
37     private String key;
38 
39     private String target;
40 
41     // all of the following are in processed CollationElementSpace
42 
43     private int[] keyBuffer;
44 
45     private CollationElementIterator2 targetIterator;
46 
47     private int[] targetBuffer;
48 
49     private int targetBufferStart;
50     private int targetBufferLength;
51 
52     // Map from target items back to native indices
53     private int[] targetBackMapBefore;
54 
55     private int[] targetBackMapAfter;
56     private int lastAfter;
57 
58     // input flags to indicate which way we want to go within the range of
59     // possible boundaries
60     boolean widestStart, widestLimit;
61 
62     /**
63      * A struct for showing match information with the range of possible
64      * boundaries.
65      */
66     public static class ExtendedRange {
67         int minStart, maxStart, minLimit, maxLimit;
68 
69         @Override
toString()70         public String toString() {
71             return minStart + ", " + maxStart + ", " + minLimit + ", " + maxLimit;
72         }
73 
toString(String key, String target)74         public String toString(String key, String target) {
75             return "'" + target.substring(0, minStart) + "["
76                 + target.substring(minStart, maxStart) + "{"
77                 + target.substring(maxStart, minLimit) + "}"
78                 + target.substring(minLimit, maxLimit) + "]"
79                 + target.substring(maxLimit) + "'";
80         }
81     }
82 
83     /**
84      * A struct that just shows the start/end, based on the settings given.
85      */
86     public static class Range {
87         public int start;
88         public int limit;
89 
90         @Override
toString()91         public String toString() {
92             return start + ", " + limit;
93         }
94 
toString(String key, String target)95         public String toString(String key, String target) {
96             return "'" + target.substring(0, start) + "["
97                 + target.substring(start, limit) + "]"
98                 + target.substring(limit) + "'";
99         }
100     }
101 
getCollator()102     public RuleBasedCollator getCollator() {
103         return collator;
104     }
105 
106     /**
107      * If the collator settings are changed externally, be sure to call
108      * setCollator();
109      */
setCollator(RuleBasedCollator collator)110     public ReferenceStringSearch setCollator(RuleBasedCollator collator) {
111         this.collator = collator;
112         targetIterator = new CollationElementIterator2(collator);
113         // reset the key and target, since their collationElements may change
114         if (key != null) {
115             setKey(key);
116         }
117         if (target != null) {
118             setTarget(target);
119         }
120         return this;
121     }
122 
getBreaker()123     public BreakIterator getBreaker() {
124         return breaker;
125     }
126 
setBreaker(BreakIterator breaker)127     public ReferenceStringSearch setBreaker(BreakIterator breaker) {
128         this.breaker = breaker;
129         if (target != null) {
130             breaker.setText(target);
131         }
132         return this;
133     }
134 
135     /**
136      * If true, will pick the largest possible limit for a match.
137      */
isWidestLimit()138     public boolean isWidestLimit() {
139         return widestLimit;
140     }
141 
142     /**
143      * If true, will pick the largest possible limit for a match.
144      */
setWidestLimit(boolean widestLimit)145     public void setWidestLimit(boolean widestLimit) {
146         this.widestLimit = widestLimit;
147     }
148 
149     /**
150      * If true, will pick the least possible start for a match.
151      */
isWidestStart()152     public boolean isWidestStart() {
153         return widestStart;
154     }
155 
156     /**
157      * If true, will pick the least possible start for a match.
158      */
setWidestStart(boolean widestStart)159     public void setWidestStart(boolean widestStart) {
160         this.widestStart = widestStart;
161     }
162 
getKey()163     public String getKey() {
164         return key;
165     }
166 
setKey(String key)167     public ReferenceStringSearch setKey(String key) {
168         this.key = key;
169         ArrayList<Integer> keyBufferList = new ArrayList<>();
170         CollationElementIterator2 keyIterator = new CollationElementIterator2(collator).setText(key);
171         while (true) {
172             int collationElement = keyIterator.nextProcessed();
173             if (collationElement == CollationElementIterator.NULLORDER) {
174                 break;
175             }
176             // store the primary, plus the index before and after.
177             keyBufferList.add(collationElement);
178         }
179         keyBuffer = getIntBuffer(keyBufferList);
180         return this;
181     }
182 
getNativeOffset()183     public int getNativeOffset() {
184         return targetBackMapBefore[targetBufferStart];
185     }
186 
setNativeOffset(int nativeOffset)187     public ReferenceStringSearch setNativeOffset(int nativeOffset) {
188         if (nativeOffset < 0 || nativeOffset > target.length()) {
189             throw new ArrayIndexOutOfBoundsException();
190         }
191         // use dumb implementation. Better implementation would leverage
192         // contents of current buffer
193         targetIterator.setOffset(nativeOffset);
194         targetBufferStart = 0;
195         targetBufferLength = 0;
196         lastAfter = 0;
197         if (nativeOffset != 0) {
198             // we have to reset lastAfter!
199             targetIterator.previousProcessed();
200             lastAfter = targetIterator.offsetAfter;
201         }
202         fillBuffer();
203         return this;
204     }
205 
getTarget()206     public String getTarget() {
207         return target;
208     }
209 
210     /**
211      * Set the text to search within.
212      */
setTarget(String target)213     public ReferenceStringSearch setTarget(String target) {
214         this.target = target;
215         if (breaker != null) {
216             breaker.setText(target);
217         }
218         targetIterator.setText(target); // TODO optimize creation
219         // make the buffer just a bit bigger than we need
220         // note: we could optimize to reuse buffers, but probably not worth it
221         targetBuffer = new int[keyBuffer.length + PADDING];
222         targetBackMapBefore = new int[keyBuffer.length + PADDING];
223         targetBackMapAfter = new int[keyBuffer.length + PADDING];
224         targetBufferStart = 0;
225         lastAfter = 0;
226         targetBufferLength = 0;
227         fillBuffer();
228         return this;
229     }
230 
231     /**
232      * We keep the circular buffer as start + length instead of start + limit so
233      * we can distinguish the 0 case.
234      *
235      * @return
236      */
shiftBuffer()237     private boolean shiftBuffer() {
238         lastAfter = targetBackMapAfter[targetBufferStart]; // remember last after
239         ++targetBufferStart;
240         if (targetBufferStart >= targetBuffer.length) {
241             targetBufferStart = 0;
242         }
243         --targetBufferLength;
244         return fillBuffer();
245     }
246 
fillBuffer()247     private boolean fillBuffer() {
248         // TODO mark as done so we don't call extra at the end
249         while (targetBufferLength < keyBuffer.length + 1) {
250             int ce = targetIterator.nextProcessed();
251             if (ce == CollationElementIterator.NULLORDER) {
252                 return false;
253             }
254             int targetBufferLimit = targetBufferStart + targetBufferLength;
255             if (targetBufferLimit >= targetBuffer.length) {
256                 targetBufferLimit -= targetBuffer.length;
257             }
258             targetBuffer[targetBufferLimit] = ce;
259             targetBackMapBefore[targetBufferLimit] = targetIterator.offsetBefore;
260             targetBackMapAfter[targetBufferLimit] = targetIterator.offsetAfter;
261             ++targetBufferLength;
262         }
263         return true;
264     }
265 
266     /**
267      * Find the next match, and set the min/maxStart/Limit. Return false if not
268      * found. Here is an example, where [...] is the maximal match, and {..} the
269      * minimal.
270      *
271      * <pre>
272      *  Raw Positions of 'eß' in ' ess eß ESS̀ '
273      *    '[ {ess} ]eß ESS̀ '
274      *    ' ess[ {eß} ]ESS̀ '
275      *    ' ess eß[ {ESS}̀ ]'
276      * </pre>
277      *
278      * Matches may overlap. Thus
279      *
280      * <pre>
281      *  Raw Positions of 'a!a' in 'b.a.a.a.b'
282      *    'b[.{a.a}.]a.b'
283      *    'b.a[.{a.a}.]b'
284      * </pre>
285      */
searchForwards(ExtendedRange position)286     public boolean searchForwards(ExtendedRange position) {
287         while (targetBufferLength >= keyBuffer.length) {
288             if (matchesAt()) {
289                 // minStart is from the After of previous item.
290                 position.minStart = lastAfter;
291 
292                 position.maxStart = targetBackMapBefore[targetBufferStart];
293                 int last = targetBufferStart + keyBuffer.length - 1;
294                 if (last >= targetBuffer.length) {
295                     last -= targetBuffer.length;
296                 }
297                 position.minLimit = targetBackMapAfter[last];
298 
299                 // maxLimit is from the Before of the next item
300                 // or, if we are at the very end of the text, the text length.
301                 if (targetBufferLength == keyBuffer.length) {
302                     position.maxLimit = target.length();
303                 } else {
304                     ++last; // we reuse the position since we are already there.
305                     if (last >= targetBuffer.length) {
306                         last -= targetBuffer.length;
307                     }
308                     position.maxLimit = targetBackMapBefore[last];
309                 }
310                 shiftBuffer(); // move to next offset
311                 return true;
312             }
313             shiftBuffer(); // move to next offset
314         }
315         return false;
316     }
317 
318     /**
319      * Simple match at position.
320      */
matchesAt()321     private boolean matchesAt() {
322         int j = targetBufferStart;
323         for (int i = 0; i < keyBuffer.length; ++i) {
324             if (keyBuffer[i] != targetBuffer[j]) {
325                 return false;
326             }
327             ++j;
328             if (j >= targetBuffer.length) {
329                 j = 0;
330             }
331         }
332         return true;
333     }
334 
335     private ExtendedRange internalPosition = new ExtendedRange();
336 
337     /**
338      * This is the main public interface for searching. It filters out anything
339      * that doesn't match the breaker, and adjusts the boundaries to the max/min
340      * permitted. Examples:
341      *
342      * <pre>
343      *  Positions of 'eß' in ' ess eß ESS̀ '
344      *    ' [ess] eß ESS̀ '
345      *    ' ess [eß] ESS̀ '
346      *    ' ess eß [ESS̀] '
347      *    Count: 3
348      *
349      * Positions of 'a!a' in 'b.a.a.a.b'
350      *    'b.[a.a].a.b'
351      *    'b.a.[a.a].b'
352      *    Count: 2
353      * </pre>
354      *
355      * @param position
356      * @return
357      */
searchForwards(Range position)358     public boolean searchForwards(Range position) {
359         while (true) {
360             boolean succeeds = searchForwards(internalPosition);
361             if (!succeeds) return false;
362             position.start = getBoundary(breaker, internalPosition.minStart, internalPosition.maxStart, !widestStart);
363             if (position.start == -1) continue; // failed to find the right boundary
364             position.limit = getBoundary(breaker, internalPosition.minLimit, internalPosition.maxLimit, widestLimit);
365             if (position.limit == -1) continue; // failed to find the right boundary
366             return true;
367         }
368     }
369 
370     /****************** PRIVATES ***************/
371 
372     /**
373      * This really ought to be just methods on CollationElementIterator.
374      */
375     public static class CollationElementIterator2 {
376         private CollationElementIterator keyIterator;
377         private int strengthMask;
378         private int variableTop;
379         private int offsetBefore;
380         private int offsetAfter;
381 
getOffsetBefore()382         public int getOffsetBefore() {
383             return offsetBefore;
384         }
385 
getOffsetAfter()386         public int getOffsetAfter() {
387             return offsetAfter;
388         }
389 
reset()390         public CollationElementIterator2 reset() {
391             keyIterator.reset();
392             return this;
393         }
394 
setOffset(int offset)395         public CollationElementIterator2 setOffset(int offset) {
396             keyIterator.setOffset(offset);
397             return this;
398         }
399 
setText(String source)400         public CollationElementIterator2 setText(String source) {
401             keyIterator.setText(source);
402             return this;
403         }
404 
CollationElementIterator2(RuleBasedCollator collator)405         public CollationElementIterator2(RuleBasedCollator collator) {
406             // gather some information that we will need later
407             strengthMask = 0xFFFF0000;
408             variableTop = !collator.isAlternateHandlingShifted() ? -1 : collator.getVariableTop() | 0xFFFF;
409             // this needs to be fixed a bit for case-level, etc.
410             switch (collator.getStrength()) {
411             case Collator.PRIMARY:
412                 strengthMask = 0xFFFF0000;
413                 break;
414             case Collator.SECONDARY:
415                 strengthMask = 0xFFFFFF00;
416                 break;
417             default:
418                 strengthMask = 0xFFFFFFFF;
419                 break;
420             }
421             keyIterator = collator.getCollationElementIterator("");
422         }
423 
424         /**
425          * This should be a method on CollationElementIterator. Returns next
426          * non-zero collation element, setting indexBefore, indexAfter. Should also
427          * process shifted and strength, masking as needed. If a collation element
428          * has a continuation, then the indexAfter = indexBefore, for example, if
429          * [CE1,CE2] form a single collation element for the characters between
430          * native indexes 5 and 8, (C2 being a continuation, then the result of two
431          * calls to nextProcessed would be [CE1, 5, 5] then [CE1, 5,8].
432          * <p>
433          * previousProcessed would do similar things, backwards.
434          *
435          */
nextProcessed()436         int nextProcessed() {
437             while (true) {
438                 offsetBefore = keyIterator.getOffset();
439                 int collationElement = keyIterator.next();
440                 if (collationElement != CollationElementIterator.NULLORDER) {
441 
442                     // note: the collation element iterator ought to give us processed values, but it doesn't
443                     // so we have to simulate that.
444                     collationElement &= strengthMask; // mask to only the strengths we have
445                     // check for shifted.
446                     // TODO This is not exactly right, and we also need to eject any following combining marks,
447                     // so fix later.
448                     if (collationElement < variableTop && collationElement > 0xFFFF) {
449                         continue;
450                     }
451                     if (collationElement == 0) {
452                         continue;
453                     }
454 
455                 }
456                 offsetAfter = keyIterator.getOffset();
457                 return collationElement;
458             }
459         }
460 
previousProcessed()461         int previousProcessed() {
462             while (true) {
463                 offsetAfter = keyIterator.getOffset();
464                 int collationElement = keyIterator.previous();
465                 if (collationElement != CollationElementIterator.NULLORDER) {
466 
467                     // note: the collation element iterator ought to give us processed values, but it doesn't
468                     // so we have to simulate that.
469                     collationElement &= strengthMask; // mask to only the strengths we have
470                     // check for shifted.
471                     // TODO This is not exactly right, and we also need to eject any following combining marks,
472                     // so fix later.
473                     if (collationElement < variableTop && collationElement > 0xFFFF) {
474                         continue;
475                     }
476                     if (collationElement == 0) {
477                         continue;
478                     }
479 
480                 }
481                 offsetBefore = keyIterator.getOffset();
482                 return collationElement;
483             }
484         }
485     }
486 
487     /**
488      * Utility
489      *
490      * @param keyBufferList
491      * @return
492      */
getIntBuffer(ArrayList<Integer> keyBufferList)493     private int[] getIntBuffer(ArrayList<Integer> keyBufferList) {
494         int[] buffer = new int[keyBufferList.size()];
495         for (int i = 0; i < keyBufferList.size(); ++i) {
496             buffer[i] = keyBufferList.get(i).intValue();
497         }
498         return buffer;
499     }
500 
501     /**
502      * This really should be on breakIterator, so it can be done more efficiently.
503      * <p>
504      * Get the item that is on a boundary according to the breaker, and is between the input boundaries. The boolean
505      * greatest controls whether we pick the least or the greatest possible offset that works according to the breaker.
506      * <p>
507      * Returns -1 if there is no valid offset according to the breaker.
508      *
509      * @param minBoundary
510      * @param maxBoundary
511      * @param greatest
512      * @return
513      */
getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest)514     public static int getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest) {
515         if (breaker == null) return greatest ? maxBoundary : minBoundary;
516         int result;
517         // this may or may not be the most efficient way to test; ask Andy
518         if (greatest) {
519             result = breaker.preceding(maxBoundary + 1);
520             if (result < minBoundary) {
521                 result = -1;
522             }
523         } else {
524             result = breaker.following(minBoundary - 1);
525             if (result < minBoundary) {
526                 result = -1;
527             }
528         }
529         return result;
530     }
531 
532 }