1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import com.ibm.icu.text.UnicodeSet.SpanCondition; 12 import com.ibm.icu.util.OutputInt; 13 14 /** 15 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. 16 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen. 17 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with 18 * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on. 19 * For the inverse, the following mapping is used: 20 * <ul> 21 * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 22 * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 23 * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li> 24 * </ul> 25 * These are actually not complete inverses. However, the alternating works because there are no gaps. 26 * For example, with [a{ab}{bc}], you get the following behavior when scanning forward: 27 * 28 * <table border="1"> 29 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr> 30 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr> 31 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr> 32 * </table> 33 * <p>So here is what happens when you alternate: 34 * 35 * <table border="1"> 36 * <tr><th>start</th><td>|xxxabcyyy</td></tr> 37 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr> 38 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr> 39 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr> 40 * </table> 41 * <p>The entire string is traversed. 42 * 43 * @stable ICU 54 44 */ 45 public class UnicodeSetSpanner { 46 47 private final UnicodeSet unicodeSet; 48 49 /** 50 * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class 51 * can be used with a non-frozen version to avoid the cost of freezing. 52 * 53 * @param source 54 * the original UnicodeSet 55 * 56 * @stable ICU 54 57 */ UnicodeSetSpanner(UnicodeSet source)58 public UnicodeSetSpanner(UnicodeSet source) { 59 unicodeSet = source; 60 } 61 62 /** 63 * Returns the UnicodeSet used for processing. It is frozen iff the original was. 64 * 65 * @return the construction set. 66 * 67 * @stable ICU 54 68 */ getUnicodeSet()69 public UnicodeSet getUnicodeSet() { 70 return unicodeSet; 71 } 72 73 74 /** 75 * {@inheritDoc} 76 * 77 * @stable ICU 54 78 */ 79 @Override equals(Object other)80 public boolean equals(Object other) { 81 return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); 82 } 83 84 /** 85 * {@inheritDoc} 86 * 87 * @stable ICU 54 88 */ 89 @Override hashCode()90 public int hashCode() { 91 return unicodeSet.hashCode(); 92 } 93 94 /** 95 * Options for replaceFrom and countIn to control how to treat each matched span. 96 * It is similar to whether one is replacing [abc] by x, or [abc]* by x. 97 * 98 * @stable ICU 54 99 */ 100 public enum CountMethod { 101 /** 102 * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate 103 * set elements. 104 * 105 * @stable ICU 54 106 */ 107 WHOLE_SPAN, 108 /** 109 * Use the smallest number of elements in the spanned range for counting and modification, 110 * based on the {@link UnicodeSet.SpanCondition}. 111 * If the set has no strings, this will be the same as the number of spanned code points. 112 * <p>For example, in the string "abab" with SpanCondition.SIMPLE: 113 * <ul> 114 * <li>spanning with [ab] will count four MIN_ELEMENTS.</li> 115 * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li> 116 * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li> 117 * </ul> 118 * 119 * @stable ICU 54 120 */ 121 MIN_ELEMENTS, 122 // Note: could in the future have an additional option MAX_ELEMENTS 123 } 124 125 /** 126 * Returns the number of matching characters found in a character sequence, 127 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 128 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 129 * @param sequence 130 * the sequence to count characters in 131 * @return the count. Zero if there are none. 132 * 133 * @stable ICU 54 134 */ countIn(CharSequence sequence)135 public int countIn(CharSequence sequence) { 136 return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 137 } 138 139 /** 140 * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE. 141 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 142 * @param sequence 143 * the sequence to count characters in 144 * @param countMethod 145 * whether to treat an entire span as a match, or individual elements as matches 146 * @return the count. Zero if there are none. 147 * 148 * @stable ICU 54 149 */ countIn(CharSequence sequence, CountMethod countMethod)150 public int countIn(CharSequence sequence, CountMethod countMethod) { 151 return countIn(sequence, countMethod, SpanCondition.SIMPLE); 152 } 153 154 /** 155 * Returns the number of matching characters found in a character sequence. 156 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 157 * @param sequence 158 * the sequence to count characters in 159 * @param countMethod 160 * whether to treat an entire span as a match, or individual elements as matches 161 * @param spanCondition 162 * the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span; 163 * NOT_CONTAINED is the reverse. 164 * <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases. 165 * @return the count. Zero if there are none. 166 * 167 * @stable ICU 54 168 */ countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition)169 public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) { 170 int count = 0; 171 int start = 0; 172 SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 173 : SpanCondition.NOT_CONTAINED; 174 final int length = sequence.length(); 175 OutputInt spanCount = null; 176 while (start != length) { 177 int endOfSpan = unicodeSet.span(sequence, start, skipSpan); 178 if (endOfSpan == length) { 179 break; 180 } 181 if (countMethod == CountMethod.WHOLE_SPAN) { 182 start = unicodeSet.span(sequence, endOfSpan, spanCondition); 183 count += 1; 184 } else { 185 if (spanCount == null) { 186 spanCount = new OutputInt(); 187 } 188 start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount); 189 count += spanCount.value; 190 } 191 } 192 return count; 193 } 194 195 /** 196 * Delete all the matching spans in sequence, using SpanCondition.SIMPLE 197 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 198 * @param sequence 199 * charsequence to replace matching spans in. 200 * @return modified string. 201 * 202 * @stable ICU 54 203 */ deleteFrom(CharSequence sequence)204 public String deleteFrom(CharSequence sequence) { 205 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE); 206 } 207 208 /** 209 * Delete all matching spans in sequence, according to the spanCondition. 210 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 211 * @param sequence 212 * charsequence to replace matching spans in. 213 * @param spanCondition 214 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED) 215 * @return modified string. 216 * 217 * @stable ICU 54 218 */ deleteFrom(CharSequence sequence, SpanCondition spanCondition)219 public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) { 220 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition); 221 } 222 223 /** 224 * Replace all matching spans in sequence by the replacement, 225 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 226 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 227 * @param sequence 228 * charsequence to replace matching spans in. 229 * @param replacement 230 * replacement sequence. To delete, use "" 231 * @return modified string. 232 * 233 * @stable ICU 54 234 */ replaceFrom(CharSequence sequence, CharSequence replacement)235 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 236 return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 237 } 238 239 /** 240 * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE. 241 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 242 * 243 * @param sequence 244 * charsequence to replace matching spans in. 245 * @param replacement 246 * replacement sequence. To delete, use "" 247 * @param countMethod 248 * whether to treat an entire span as a match, or individual elements as matches 249 * @return modified string. 250 * 251 * @stable ICU 54 252 */ replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod)253 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) { 254 return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE); 255 } 256 257 /** 258 * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition. 259 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 260 * @param sequence 261 * charsequence to replace matching spans in. 262 * @param replacement 263 * replacement sequence. To delete, use "" 264 * @param countMethod 265 * whether to treat an entire span as a match, or individual elements as matches 266 * @param spanCondition 267 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching 268 * (NOT_CONTAINED) 269 * @return modified string. 270 * 271 * @stable ICU 54 272 */ replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, SpanCondition spanCondition)273 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, 274 SpanCondition spanCondition) { 275 SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 276 : SpanCondition.NOT_CONTAINED; 277 final boolean remove = replacement.length() == 0; 278 StringBuilder result = new StringBuilder(); 279 // TODO, we can optimize this to 280 // avoid this allocation unless needed 281 282 final int length = sequence.length(); 283 OutputInt spanCount = null; 284 for (int endCopy = 0; endCopy != length;) { 285 int endModify; 286 if (countMethod == CountMethod.WHOLE_SPAN) { 287 endModify = unicodeSet.span(sequence, endCopy, spanCondition); 288 } else { 289 if (spanCount == null) { 290 spanCount = new OutputInt(); 291 } 292 endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount); 293 } 294 if (remove || endModify == 0) { 295 // do nothing 296 } else if (countMethod == CountMethod.WHOLE_SPAN) { 297 result.append(replacement); 298 } else { 299 for (int i = spanCount.value; i > 0; --i) { 300 result.append(replacement); 301 } 302 } 303 if (endModify == length) { 304 break; 305 } 306 endCopy = unicodeSet.span(sequence, endModify, copySpan); 307 result.append(sequence.subSequence(endModify, endCopy)); 308 } 309 return result.toString(); 310 } 311 312 /** 313 * Options for the trim() method 314 * 315 * @stable ICU 54 316 */ 317 public enum TrimOption { 318 /** 319 * Trim leading spans. 320 * 321 * @stable ICU 54 322 */ 323 LEADING, 324 /** 325 * Trim leading and trailing spans. 326 * 327 * @stable ICU 54 328 */ 329 BOTH, 330 /** 331 * Trim trailing spans. 332 * 333 * @stable ICU 54 334 */ 335 TRAILING; 336 } 337 338 /** 339 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and 340 * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example: 341 * 342 * <pre> 343 * {@code 344 * 345 * new UnicodeSet("[ab]").trim("abacatbab")} 346 * </pre> 347 * 348 * ... returns {@code "cat"}. 349 * @param sequence 350 * the sequence to trim 351 * @return a subsequence 352 * 353 * @stable ICU 54 354 */ trim(CharSequence sequence)355 public CharSequence trim(CharSequence sequence) { 356 return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE); 357 } 358 359 /** 360 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 361 * end of the string, using the trimOption and SpanCondition.SIMPLE. For example: 362 * 363 * <pre> 364 * {@code 365 * 366 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)} 367 * </pre> 368 * 369 * ... returns {@code "catbab"}. 370 * 371 * @param sequence 372 * the sequence to trim 373 * @param trimOption 374 * LEADING, TRAILING, or BOTH 375 * @return a subsequence 376 * 377 * @stable ICU 54 378 */ trim(CharSequence sequence, TrimOption trimOption)379 public CharSequence trim(CharSequence sequence, TrimOption trimOption) { 380 return trim(sequence, trimOption, SpanCondition.SIMPLE); 381 } 382 383 /** 384 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 385 * end of the string, depending on the trimOption and spanCondition. For example: 386 * 387 * <pre> 388 * {@code 389 * 390 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)} 391 * </pre> 392 * 393 * ... returns {@code "catbab"}. 394 * 395 * @param sequence 396 * the sequence to trim 397 * @param trimOption 398 * LEADING, TRAILING, or BOTH 399 * @param spanCondition 400 * SIMPLE, CONTAINED or NOT_CONTAINED 401 * @return a subsequence 402 * 403 * @stable ICU 54 404 */ trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition)405 public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) { 406 int endLeadContained, startTrailContained; 407 final int length = sequence.length(); 408 if (trimOption != TrimOption.TRAILING) { 409 endLeadContained = unicodeSet.span(sequence, spanCondition); 410 if (endLeadContained == length) { 411 return ""; 412 } 413 } else { 414 endLeadContained = 0; 415 } 416 if (trimOption != TrimOption.LEADING) { 417 startTrailContained = unicodeSet.spanBack(sequence, spanCondition); 418 } else { 419 startTrailContained = length; 420 } 421 return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( 422 endLeadContained, startTrailContained); 423 } 424 425 } 426