1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, Google, Inc., International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.util.Collection; 12 import java.util.Comparator; 13 import java.util.Iterator; 14 import java.util.LinkedList; 15 import java.util.Map.Entry; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 20 import com.ibm.icu.lang.CharSequences; 21 import com.ibm.icu.util.ICUException; 22 23 @SuppressWarnings("deprecation") 24 public class StringRange { 25 private static final boolean DEBUG = false; 26 27 public interface Adder { 28 /** 29 * @param start 30 * @param end may be null, for adding single string 31 */ add(String start, String end)32 void add(String start, String end); 33 } 34 35 public static final Comparator<int[]> COMPARE_INT_ARRAYS = new Comparator<int[]>() { 36 @Override 37 public int compare(int[] o1, int[] o2) { 38 int minIndex = Math.min(o1.length, o2.length); 39 for (int i = 0; i < minIndex; ++i) { 40 int diff = o1[i] - o2[i]; 41 if (diff != 0) { 42 return diff; 43 } 44 } 45 return o1.length - o2.length; 46 } 47 }; 48 49 /** 50 * Compact the set of strings. 51 * @param source set of strings 52 * @param adder adds each pair to the output. See the {@link Adder} interface. 53 * @param shorterPairs use abc-d instead of abc-abd 54 * @param moreCompact use a more compact form, at the expense of more processing. If false, source must be sorted. 55 */ compact(Set<String> source, Adder adder, boolean shorterPairs, boolean moreCompact)56 public static void compact(Set<String> source, Adder adder, boolean shorterPairs, boolean moreCompact) { 57 if (!moreCompact) { 58 String start = null; 59 String end = null; 60 int lastCp = 0; 61 int prefixLen = 0; 62 for (String s : source) { 63 if (start != null) { // We have something queued up 64 if (s.regionMatches(0, start, 0, prefixLen)) { 65 int currentCp = s.codePointAt(prefixLen); 66 if (currentCp == 1+lastCp && s.length() == prefixLen + Character.charCount(currentCp)) { 67 end = s; 68 lastCp = currentCp; 69 continue; 70 } 71 } 72 // We failed to find continuation. Add what we have and restart 73 adder.add(start, end == null ? null 74 : !shorterPairs ? end 75 : end.substring(prefixLen, end.length())); 76 } 77 // new possible range 78 start = s; 79 end = null; 80 lastCp = s.codePointBefore(s.length()); 81 prefixLen = s.length() - Character.charCount(lastCp); 82 } 83 adder.add(start, end == null ? null 84 : !shorterPairs ? end 85 : end.substring(prefixLen, end.length())); 86 } else { 87 // not a fast algorithm, but ok for now 88 // TODO rewire to use the first (slower) algorithm to generate the ranges, then compact them from there. 89 // first sort by lengths 90 Relation<Integer,Ranges> lengthToArrays = Relation.of(new TreeMap<Integer,Set<Ranges>>(), TreeSet.class); 91 for (String s : source) { 92 Ranges item = new Ranges(s); 93 lengthToArrays.put(item.size(), item); 94 } 95 // then compact items of each length and emit compacted sets 96 for (Entry<Integer, Set<Ranges>> entry : lengthToArrays.keyValuesSet()) { 97 LinkedList<Ranges> compacted = compact(entry.getKey(), entry.getValue()); 98 for (Ranges ranges : compacted) { 99 adder.add(ranges.start(), ranges.end(shorterPairs)); 100 } 101 } 102 } 103 } 104 105 /** 106 * Faster but not as good compaction. Only looks at final codepoint. 107 * @param source set of strings 108 * @param adder adds each pair to the output. See the {@link Adder} interface. 109 * @param shorterPairs use abc-d instead of abc-abd 110 */ compact(Set<String> source, Adder adder, boolean shorterPairs)111 public static void compact(Set<String> source, Adder adder, boolean shorterPairs) { 112 compact(source,adder,shorterPairs,false); 113 } 114 compact(int size, Set<Ranges> inputRanges)115 private static LinkedList<Ranges> compact(int size, Set<Ranges> inputRanges) { 116 LinkedList<Ranges> ranges = new LinkedList<Ranges>(inputRanges); 117 for (int i = size-1; i >= 0; --i) { 118 Ranges last = null; 119 for (Iterator<Ranges> it = ranges.iterator(); it.hasNext();) { 120 Ranges item = it.next(); 121 if (last == null) { 122 last = item; 123 } else if (last.merge(i, item)) { 124 it.remove(); 125 } else { 126 last = item; // go to next 127 } 128 } 129 }; 130 return ranges; 131 } 132 133 static final class Range implements Comparable<Range>{ 134 int min; 135 int max; Range(int min, int max)136 public Range(int min, int max) { 137 this.min = min; 138 this.max = max; 139 } 140 @Override equals(Object obj)141 public boolean equals(Object obj) { 142 return this == obj || (obj != null && obj instanceof Range && compareTo((Range)obj) == 0); 143 } 144 @Override compareTo(Range that)145 public int compareTo(Range that) { 146 int diff = min - that.min; 147 if (diff != 0) { 148 return diff; 149 } 150 return max - that.max; 151 } 152 @Override hashCode()153 public int hashCode() { 154 return min * 37 + max; 155 } 156 @Override toString()157 public String toString() { 158 StringBuilder result = new StringBuilder().appendCodePoint(min); 159 return min == max ? result.toString() : result.append('~').appendCodePoint(max).toString(); 160 } 161 } 162 163 static final class Ranges implements Comparable<Ranges> { 164 private final Range[] ranges; Ranges(String s)165 public Ranges(String s) { 166 int[] array = CharSequences.codePoints(s); 167 ranges = new Range[array.length]; 168 for (int i = 0; i < array.length; ++i) { 169 ranges[i] = new Range(array[i], array[i]); 170 } 171 } merge(int pivot, Ranges other)172 public boolean merge(int pivot, Ranges other) { 173 // We merge items if the pivot is adjacent, and all later ranges are equal. 174 for (int i = ranges.length-1; i >= 0; --i) { 175 if (i == pivot) { 176 if (ranges[i].max != other.ranges[i].min-1) { // not adjacent 177 return false; 178 } 179 } else { 180 if (!ranges[i].equals(other.ranges[i])) { 181 return false; 182 } 183 } 184 } 185 if (DEBUG) System.out.print("Merging: " + this + ", " + other); 186 ranges[pivot].max = other.ranges[pivot].max; 187 if (DEBUG) System.out.println(" => " + this); 188 return true; 189 } 190 start()191 public String start() { 192 StringBuilder result = new StringBuilder(); 193 for (int i = 0; i < ranges.length; ++i) { 194 result.appendCodePoint(ranges[i].min); 195 } 196 return result.toString(); 197 } end(boolean mostCompact)198 public String end(boolean mostCompact) { 199 int firstDiff = firstDifference(); 200 if (firstDiff == ranges.length) { 201 return null; 202 } 203 StringBuilder result = new StringBuilder(); 204 for (int i = mostCompact ? firstDiff : 0; i < ranges.length; ++i) { 205 result.appendCodePoint(ranges[i].max); 206 } 207 return result.toString(); 208 } firstDifference()209 public int firstDifference() { 210 for (int i = 0; i < ranges.length; ++i) { 211 if (ranges[i].min != ranges[i].max){ 212 return i; 213 } 214 } 215 return ranges.length; 216 } size()217 public Integer size() { 218 return ranges.length; 219 } 220 @Override compareTo(Ranges other)221 public int compareTo(Ranges other) { 222 int diff = ranges.length - other.ranges.length; 223 if (diff != 0) { 224 return diff; 225 } 226 for (int i = 0; i < ranges.length; ++i) { 227 diff = ranges[i].compareTo(other.ranges[i]); 228 if (diff != 0) { 229 return diff; 230 } 231 } 232 return 0; 233 } 234 @Override toString()235 public String toString() { 236 String start = start(); 237 String end = end(false); 238 return end == null ? start : start + "~" + end; 239 } 240 } 241 expand(String start, String end, boolean requireSameLength, Collection<String> output)242 public static Collection<String> expand(String start, String end, boolean requireSameLength, Collection<String> output) { 243 if (start == null || end == null) { 244 throw new ICUException("Range must have 2 valid strings"); 245 } 246 int[] startCps = CharSequences.codePoints(start); 247 int[] endCps = CharSequences.codePoints(end); 248 int startOffset = startCps.length - endCps.length; 249 250 if (requireSameLength && startOffset != 0) { 251 throw new ICUException("Range must have equal-length strings"); 252 } else if (startOffset < 0) { 253 throw new ICUException("Range must have start-length ≥ end-length"); 254 } else if (endCps.length == 0) { 255 throw new ICUException("Range must have end-length > 0"); 256 } 257 258 StringBuilder builder = new StringBuilder(); 259 for (int i = 0; i < startOffset; ++i) { 260 builder.appendCodePoint(startCps[i]); 261 } 262 add(0, startOffset, startCps, endCps, builder, output); 263 return output; 264 } 265 add(int endIndex, int startOffset, int[] starts, int[] ends, StringBuilder builder, Collection<String> output)266 private static void add(int endIndex, int startOffset, int[] starts, int[] ends, StringBuilder builder, Collection<String> output) { 267 int start = starts[endIndex+startOffset]; 268 int end = ends[endIndex]; 269 if (start > end) { 270 throw new ICUException("Range must have xᵢ ≤ yᵢ for each index i"); 271 } 272 boolean last = endIndex == ends.length - 1; 273 int startLen = builder.length(); 274 for (int i = start; i <= end; ++i) { 275 builder.appendCodePoint(i); 276 if (last) { 277 output.add(builder.toString()); 278 } else { 279 add(endIndex+1, startOffset, starts, ends, builder, output); 280 } 281 builder.setLength(startLen); 282 } 283 } 284 } 285