android-13.0.0_r83/s

/*
 **********************************************************************
 * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
 **********************************************************************
 * Author: Mark Davis
 **********************************************************************
 */
package org.unicode.cldr.util;

import java.util.ArrayList;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;

/**
 * This is intended to be a reference implementation for StringSearch. The
 * implementation is thus not high performance; it just transforms both the key
 * (what is searched for) and the target into collationElement space, and
 * searches there. It is however, architecturally cleaner in that it first has a
 * search that finds and extended range, then picks a boundary within that.
 * <p>
 * The key is that there is a match for a key string in a target text at positions <a,b> iff collator.compare(key,
 * target.substring(a,b)) == 0.
 *
 * @author markdavis
 */
public class ReferenceStringSearch {
    private static final int PADDING = 3;

    private RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);

    private BreakIterator breaker;

    private String key;

    private String target;

    // all of the following are in processed CollationElementSpace

    private int[] keyBuffer;

    private CollationElementIterator2 targetIterator;

    private int[] targetBuffer;

    private int targetBufferStart;
    private int targetBufferLength;

    // Map from target items back to native indices
    private int[] targetBackMapBefore;

    private int[] targetBackMapAfter;
    private int lastAfter;

    // input flags to indicate which way we want to go within the range of
    // possible boundaries
    boolean widestStart, widestLimit;

    /**
     * A struct for showing match information with the range of possible
     * boundaries.
     */
    public static class ExtendedRange {
        int minStart, maxStart, minLimit, maxLimit;

        @Override
        public String toString() {
            return minStart + ", " + maxStart + ", " + minLimit + ", " + maxLimit;
        }

        public String toString(String key, String target) {
            return "'" + target.substring(0, minStart) + "["
                + target.substring(minStart, maxStart) + "{"
                + target.substring(maxStart, minLimit) + "}"
                + target.substring(minLimit, maxLimit) + "]"
                + target.substring(maxLimit) + "'";
        }
    }

    /**
     * A struct that just shows the start/end, based on the settings given.
     */
    public static class Range {
        public int start;
        public int limit;

        @Override
        public String toString() {
            return start + ", " + limit;
        }

        public String toString(String key, String target) {
            return "'" + target.substring(0, start) + "["
                + target.substring(start, limit) + "]"
                + target.substring(limit) + "'";
        }
    }

    public RuleBasedCollator getCollator() {
        return collator;
    }

    /**
     * If the collator settings are changed externally, be sure to call
     * setCollator();
     */
    public ReferenceStringSearch setCollator(RuleBasedCollator collator) {
        this.collator = collator;
        targetIterator = new CollationElementIterator2(collator);
        // reset the key and target, since their collationElements may change
        if (key != null) {
            setKey(key);
        }
        if (target != null) {
            setTarget(target);
        }
        return this;
    }

    public BreakIterator getBreaker() {
        return breaker;
    }

    public ReferenceStringSearch setBreaker(BreakIterator breaker) {
        this.breaker = breaker;
        if (target != null) {
            breaker.setText(target);
        }
        return this;
    }

    /**
     * If true, will pick the largest possible limit for a match.
     */
    public boolean isWidestLimit() {
        return widestLimit;
    }

    /**
     * If true, will pick the largest possible limit for a match.
     */
    public void setWidestLimit(boolean widestLimit) {
        this.widestLimit = widestLimit;
    }

    /**
     * If true, will pick the least possible start for a match.
     */
    public boolean isWidestStart() {
        return widestStart;
    }

    /**
     * If true, will pick the least possible start for a match.
     */
    public void setWidestStart(boolean widestStart) {
        this.widestStart = widestStart;
    }

    public String getKey() {
        return key;
    }

    public ReferenceStringSearch setKey(String key) {
        this.key = key;
        ArrayList<Integer> keyBufferList = new ArrayList<>();
        CollationElementIterator2 keyIterator = new CollationElementIterator2(collator).setText(key);
        while (true) {
            int collationElement = keyIterator.nextProcessed();
            if (collationElement == CollationElementIterator.NULLORDER) {
                break;
            }
            // store the primary, plus the index before and after.
            keyBufferList.add(collationElement);
        }
        keyBuffer = getIntBuffer(keyBufferList);
        return this;
    }

    public int getNativeOffset() {
        return targetBackMapBefore[targetBufferStart];
    }

    public ReferenceStringSearch setNativeOffset(int nativeOffset) {
        if (nativeOffset < 0 || nativeOffset > target.length()) {
            throw new ArrayIndexOutOfBoundsException();
        }
        // use dumb implementation. Better implementation would leverage
        // contents of current buffer
        targetIterator.setOffset(nativeOffset);
        targetBufferStart = 0;
        targetBufferLength = 0;
        lastAfter = 0;
        if (nativeOffset != 0) {
            // we have to reset lastAfter!
            targetIterator.previousProcessed();
            lastAfter = targetIterator.offsetAfter;
        }
        fillBuffer();
        return this;
    }

    public String getTarget() {
        return target;
    }

    /**
     * Set the text to search within.
     */
    public ReferenceStringSearch setTarget(String target) {
        this.target = target;
        if (breaker != null) {
            breaker.setText(target);
        }
        targetIterator.setText(target); // TODO optimize creation
        // make the buffer just a bit bigger than we need
        // note: we could optimize to reuse buffers, but probably not worth it
        targetBuffer = new int[keyBuffer.length + PADDING];
        targetBackMapBefore = new int[keyBuffer.length + PADDING];
        targetBackMapAfter = new int[keyBuffer.length + PADDING];
        targetBufferStart = 0;
        lastAfter = 0;
        targetBufferLength = 0;
        fillBuffer();
        return this;
    }

    /**
     * We keep the circular buffer as start + length instead of start + limit so
     * we can distinguish the 0 case.
     *
     * @return
     */
    private boolean shiftBuffer() {
        lastAfter = targetBackMapAfter[targetBufferStart]; // remember last after
        ++targetBufferStart;
        if (targetBufferStart >= targetBuffer.length) {
            targetBufferStart = 0;
        }
        --targetBufferLength;
        return fillBuffer();
    }

    private boolean fillBuffer() {
        // TODO mark as done so we don't call extra at the end
        while (targetBufferLength < keyBuffer.length + 1) {
            int ce = targetIterator.nextProcessed();
            if (ce == CollationElementIterator.NULLORDER) {
                return false;
            }
            int targetBufferLimit = targetBufferStart + targetBufferLength;
            if (targetBufferLimit >= targetBuffer.length) {
                targetBufferLimit -= targetBuffer.length;
            }
            targetBuffer[targetBufferLimit] = ce;
            targetBackMapBefore[targetBufferLimit] = targetIterator.offsetBefore;
            targetBackMapAfter[targetBufferLimit] = targetIterator.offsetAfter;
            ++targetBufferLength;
        }
        return true;
    }

    /**
     * Find the next match, and set the min/maxStart/Limit. Return false if not
     * found. Here is an example, where [...] is the maximal match, and {..} the
     * minimal.
     *
     * <pre>
     *  Raw Positions of 'eß' in ' ess eß ESS̀ '
     *    '[ {ess} ]eß ESS̀ '
     *    ' ess[ {eß} ]ESS̀ '
     *    ' ess eß[ {ESS}̀ ]'
     * </pre>
     *
     * Matches may overlap. Thus
     *
     * <pre>
     *  Raw Positions of 'a!a' in 'b.a.a.a.b'
     *    'b[.{a.a}.]a.b'
     *    'b.a[.{a.a}.]b'
     * </pre>
     */
    public boolean searchForwards(ExtendedRange position) {
        while (targetBufferLength >= keyBuffer.length) {
            if (matchesAt()) {
                // minStart is from the After of previous item.
                position.minStart = lastAfter;

                position.maxStart = targetBackMapBefore[targetBufferStart];
                int last = targetBufferStart + keyBuffer.length - 1;
                if (last >= targetBuffer.length) {
                    last -= targetBuffer.length;
                }
                position.minLimit = targetBackMapAfter[last];

                // maxLimit is from the Before of the next item
                // or, if we are at the very end of the text, the text length.
                if (targetBufferLength == keyBuffer.length) {
                    position.maxLimit = target.length();
                } else {
                    ++last; // we reuse the position since we are already there.
                    if (last >= targetBuffer.length) {
                        last -= targetBuffer.length;
                    }
                    position.maxLimit = targetBackMapBefore[last];
                }
                shiftBuffer(); // move to next offset
                return true;
            }
            shiftBuffer(); // move to next offset
        }
        return false;
    }

    /**
     * Simple match at position.
     */
    private boolean matchesAt() {
        int j = targetBufferStart;
        for (int i = 0; i < keyBuffer.length; ++i) {
            if (keyBuffer[i] != targetBuffer[j]) {
                return false;
            }
            ++j;
            if (j >= targetBuffer.length) {
                j = 0;
            }
        }
        return true;
    }

    private ExtendedRange internalPosition = new ExtendedRange();

    /**
     * This is the main public interface for searching. It filters out anything
     * that doesn't match the breaker, and adjusts the boundaries to the max/min
     * permitted. Examples:
     *
     * <pre>
     *  Positions of 'eß' in ' ess eß ESS̀ '
     *    ' [ess] eß ESS̀ '
     *    ' ess [eß] ESS̀ '
     *    ' ess eß [ESS̀] '
     *    Count: 3
     *
     * Positions of 'a!a' in 'b.a.a.a.b'
     *    'b.[a.a].a.b'
     *    'b.a.[a.a].b'
     *    Count: 2
     * </pre>
     *
     * @param position
     * @return
     */
    public boolean searchForwards(Range position) {
        while (true) {
            boolean succeeds = searchForwards(internalPosition);
            if (!succeeds) return false;
            position.start = getBoundary(breaker, internalPosition.minStart, internalPosition.maxStart, !widestStart);
            if (position.start == -1) continue; // failed to find the right boundary
            position.limit = getBoundary(breaker, internalPosition.minLimit, internalPosition.maxLimit, widestLimit);
            if (position.limit == -1) continue; // failed to find the right boundary
            return true;
        }
    }

    /****************** PRIVATES ***************/

    /**
     * This really ought to be just methods on CollationElementIterator.
     */
    public static class CollationElementIterator2 {
        private CollationElementIterator keyIterator;
        private int strengthMask;
        private int variableTop;
        private int offsetBefore;
        private int offsetAfter;

        public int getOffsetBefore() {
            return offsetBefore;
        }

        public int getOffsetAfter() {
            return offsetAfter;
        }

        public CollationElementIterator2 reset() {
            keyIterator.reset();
            return this;
        }

        public CollationElementIterator2 setOffset(int offset) {
            keyIterator.setOffset(offset);
            return this;
        }

        public CollationElementIterator2 setText(String source) {
            keyIterator.setText(source);
            return this;
        }

        public CollationElementIterator2(RuleBasedCollator collator) {
            // gather some information that we will need later
            strengthMask = 0xFFFF0000;
            variableTop = !collator.isAlternateHandlingShifted() ? -1 : collator.getVariableTop() | 0xFFFF;
            // this needs to be fixed a bit for case-level, etc.
            switch (collator.getStrength()) {
            case Collator.PRIMARY:
                strengthMask = 0xFFFF0000;
                break;
            case Collator.SECONDARY:
                strengthMask = 0xFFFFFF00;
                break;
            default:
                strengthMask = 0xFFFFFFFF;
                break;
            }
            keyIterator = collator.getCollationElementIterator("");
        }

        /**
         * This should be a method on CollationElementIterator. Returns next
         * non-zero collation element, setting indexBefore, indexAfter. Should also
         * process shifted and strength, masking as needed. If a collation element
         * has a continuation, then the indexAfter = indexBefore, for example, if
         * [CE1,CE2] form a single collation element for the characters between
         * native indexes 5 and 8, (C2 being a continuation, then the result of two
         * calls to nextProcessed would be [CE1, 5, 5] then [CE1, 5,8].
         * <p>
         * previousProcessed would do similar things, backwards.
         *
         */
        int nextProcessed() {
            while (true) {
                offsetBefore = keyIterator.getOffset();
                int collationElement = keyIterator.next();
                if (collationElement != CollationElementIterator.NULLORDER) {

                    // note: the collation element iterator ought to give us processed values, but it doesn't
                    // so we have to simulate that.
                    collationElement &= strengthMask; // mask to only the strengths we have
                    // check for shifted.
                    // TODO This is not exactly right, and we also need to eject any following combining marks,
                    // so fix later.
                    if (collationElement < variableTop && collationElement > 0xFFFF) {
                        continue;
                    }
                    if (collationElement == 0) {
                        continue;
                    }

                }
                offsetAfter = keyIterator.getOffset();
                return collationElement;
            }
        }

        int previousProcessed() {
            while (true) {
                offsetAfter = keyIterator.getOffset();
                int collationElement = keyIterator.previous();
                if (collationElement != CollationElementIterator.NULLORDER) {

                    // note: the collation element iterator ought to give us processed values, but it doesn't
                    // so we have to simulate that.
                    collationElement &= strengthMask; // mask to only the strengths we have
                    // check for shifted.
                    // TODO This is not exactly right, and we also need to eject any following combining marks,
                    // so fix later.
                    if (collationElement < variableTop && collationElement > 0xFFFF) {
                        continue;
                    }
                    if (collationElement == 0) {
                        continue;
                    }

                }
                offsetBefore = keyIterator.getOffset();
                return collationElement;
            }
        }
    }

    /**
     * Utility
     *
     * @param keyBufferList
     * @return
     */
    private int[] getIntBuffer(ArrayList<Integer> keyBufferList) {
        int[] buffer = new int[keyBufferList.size()];
        for (int i = 0; i < keyBufferList.size(); ++i) {
            buffer[i] = keyBufferList.get(i).intValue();
        }
        return buffer;
    }

    /**
     * This really should be on breakIterator, so it can be done more efficiently.
     * <p>
     * Get the item that is on a boundary according to the breaker, and is between the input boundaries. The boolean
     * greatest controls whether we pick the least or the greatest possible offset that works according to the breaker.
     * <p>
     * Returns -1 if there is no valid offset according to the breaker.
     *
     * @param minBoundary
     * @param maxBoundary
     * @param greatest
     * @return
     */
    public static int getBoundary(BreakIterator breaker, int minBoundary, int maxBoundary, boolean greatest) {
        if (breaker == null) return greatest ? maxBoundary : minBoundary;
        int result;
        // this may or may not be the most efficient way to test; ask Andy
        if (greatest) {
            result = breaker.preceding(maxBoundary + 1);
            if (result < minBoundary) {
                result = -1;
            }
        } else {
            result = breaker.following(minBoundary - 1);
            if (result < minBoundary) {
                result = -1;
            }
        }
        return result;
    }

}