1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2014-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 import ohos.global.icu.text.UnicodeSet.SpanCondition; 13 import ohos.global.icu.util.OutputInt; 14 15 /** 16 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. 17 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen. 18 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with 19 * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on. 20 * For the inverse, the following mapping is used: 21 * <ul> 22 * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 23 * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li> 24 * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li> 25 * </ul> 26 * These are actually not complete inverses. However, the alternating works because there are no gaps. 27 * For example, with [a{ab}{bc}], you get the following behavior when scanning forward: 28 * 29 * <table border="1"> 30 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr> 31 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr> 32 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr> 33 * </table> 34 * <p>So here is what happens when you alternate: 35 * 36 * <table border="1"> 37 * <tr><th>start</th><td>|xxxabcyyy</td></tr> 38 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr> 39 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr> 40 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr> 41 * </table> 42 * <p>The entire string is traversed. 43 */ 44 public class UnicodeSetSpanner { 45 46 private final UnicodeSet unicodeSet; 47 48 /** 49 * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class 50 * can be used with a non-frozen version to avoid the cost of freezing. 51 * 52 * @param source 53 * the original UnicodeSet 54 */ UnicodeSetSpanner(UnicodeSet source)55 public UnicodeSetSpanner(UnicodeSet source) { 56 unicodeSet = source; 57 } 58 59 /** 60 * Returns the UnicodeSet used for processing. It is frozen iff the original was. 61 * 62 * @return the construction set. 63 */ getUnicodeSet()64 public UnicodeSet getUnicodeSet() { 65 return unicodeSet; 66 } 67 68 69 /** 70 * {@inheritDoc} 71 */ 72 @Override equals(Object other)73 public boolean equals(Object other) { 74 return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet); 75 } 76 77 /** 78 * {@inheritDoc} 79 */ 80 @Override hashCode()81 public int hashCode() { 82 return unicodeSet.hashCode(); 83 } 84 85 /** 86 * Options for replaceFrom and countIn to control how to treat each matched span. 87 * It is similar to whether one is replacing [abc] by x, or [abc]* by x. 88 */ 89 public enum CountMethod { 90 /** 91 * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate 92 * set elements. 93 */ 94 WHOLE_SPAN, 95 /** 96 * Use the smallest number of elements in the spanned range for counting and modification, 97 * based on the {@link UnicodeSet.SpanCondition}. 98 * If the set has no strings, this will be the same as the number of spanned code points. 99 * <p>For example, in the string "abab" with SpanCondition.SIMPLE: 100 * <ul> 101 * <li>spanning with [ab] will count four MIN_ELEMENTS.</li> 102 * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li> 103 * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li> 104 * </ul> 105 */ 106 MIN_ELEMENTS, 107 // Note: could in the future have an additional option MAX_ELEMENTS 108 } 109 110 /** 111 * Returns the number of matching characters found in a character sequence, 112 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 113 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 114 * @param sequence 115 * the sequence to count characters in 116 * @return the count. Zero if there are none. 117 */ countIn(CharSequence sequence)118 public int countIn(CharSequence sequence) { 119 return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 120 } 121 122 /** 123 * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE. 124 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 125 * @param sequence 126 * the sequence to count characters in 127 * @param countMethod 128 * whether to treat an entire span as a match, or individual elements as matches 129 * @return the count. Zero if there are none. 130 */ countIn(CharSequence sequence, CountMethod countMethod)131 public int countIn(CharSequence sequence, CountMethod countMethod) { 132 return countIn(sequence, countMethod, SpanCondition.SIMPLE); 133 } 134 135 /** 136 * Returns the number of matching characters found in a character sequence. 137 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 138 * @param sequence 139 * the sequence to count characters in 140 * @param countMethod 141 * whether to treat an entire span as a match, or individual elements as matches 142 * @param spanCondition 143 * the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span; 144 * NOT_CONTAINED is the reverse. 145 * <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases. 146 * @return the count. Zero if there are none. 147 */ countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition)148 public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) { 149 int count = 0; 150 int start = 0; 151 SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 152 : SpanCondition.NOT_CONTAINED; 153 final int length = sequence.length(); 154 OutputInt spanCount = null; 155 while (start != length) { 156 int endOfSpan = unicodeSet.span(sequence, start, skipSpan); 157 if (endOfSpan == length) { 158 break; 159 } 160 if (countMethod == CountMethod.WHOLE_SPAN) { 161 start = unicodeSet.span(sequence, endOfSpan, spanCondition); 162 count += 1; 163 } else { 164 if (spanCount == null) { 165 spanCount = new OutputInt(); 166 } 167 start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount); 168 count += spanCount.value; 169 } 170 } 171 return count; 172 } 173 174 /** 175 * Delete all the matching spans in sequence, using SpanCondition.SIMPLE 176 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 177 * @param sequence 178 * charsequence to replace matching spans in. 179 * @return modified string. 180 */ deleteFrom(CharSequence sequence)181 public String deleteFrom(CharSequence sequence) { 182 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE); 183 } 184 185 /** 186 * Delete all matching spans in sequence, according to the spanCondition. 187 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 188 * @param sequence 189 * charsequence to replace matching spans in. 190 * @param spanCondition 191 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED) 192 * @return modified string. 193 */ deleteFrom(CharSequence sequence, SpanCondition spanCondition)194 public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) { 195 return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition); 196 } 197 198 /** 199 * Replace all matching spans in sequence by the replacement, 200 * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. 201 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 202 * @param sequence 203 * charsequence to replace matching spans in. 204 * @param replacement 205 * replacement sequence. To delete, use "" 206 * @return modified string. 207 */ replaceFrom(CharSequence sequence, CharSequence replacement)208 public String replaceFrom(CharSequence sequence, CharSequence replacement) { 209 return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE); 210 } 211 212 /** 213 * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE. 214 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 215 * 216 * @param sequence 217 * charsequence to replace matching spans in. 218 * @param replacement 219 * replacement sequence. To delete, use "" 220 * @param countMethod 221 * whether to treat an entire span as a match, or individual elements as matches 222 * @return modified string. 223 */ replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod)224 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) { 225 return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE); 226 } 227 228 /** 229 * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition. 230 * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions. 231 * @param sequence 232 * charsequence to replace matching spans in. 233 * @param replacement 234 * replacement sequence. To delete, use "" 235 * @param countMethod 236 * whether to treat an entire span as a match, or individual elements as matches 237 * @param spanCondition 238 * specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching 239 * (NOT_CONTAINED) 240 * @return modified string. 241 */ replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, SpanCondition spanCondition)242 public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod, 243 SpanCondition spanCondition) { 244 SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE 245 : SpanCondition.NOT_CONTAINED; 246 final boolean remove = replacement.length() == 0; 247 StringBuilder result = new StringBuilder(); 248 // TODO, we can optimize this to 249 // avoid this allocation unless needed 250 251 final int length = sequence.length(); 252 OutputInt spanCount = null; 253 for (int endCopy = 0; endCopy != length;) { 254 int endModify; 255 if (countMethod == CountMethod.WHOLE_SPAN) { 256 endModify = unicodeSet.span(sequence, endCopy, spanCondition); 257 } else { 258 if (spanCount == null) { 259 spanCount = new OutputInt(); 260 } 261 endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount); 262 } 263 if (remove || endModify == 0) { 264 // do nothing 265 } else if (countMethod == CountMethod.WHOLE_SPAN) { 266 result.append(replacement); 267 } else { 268 for (int i = spanCount.value; i > 0; --i) { 269 result.append(replacement); 270 } 271 } 272 if (endModify == length) { 273 break; 274 } 275 endCopy = unicodeSet.span(sequence, endModify, copySpan); 276 result.append(sequence.subSequence(endModify, endCopy)); 277 } 278 return result.toString(); 279 } 280 281 /** 282 * Options for the trim() method 283 */ 284 public enum TrimOption { 285 /** 286 * Trim leading spans. 287 */ 288 LEADING, 289 /** 290 * Trim leading and trailing spans. 291 */ 292 BOTH, 293 /** 294 * Trim trailing spans. 295 */ 296 TRAILING; 297 } 298 299 /** 300 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and 301 * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example: 302 * 303 * <pre> 304 * {@code 305 * 306 * new UnicodeSet("[ab]").trim("abacatbab")} 307 * </pre> 308 * 309 * ... returns {@code "cat"}. 310 * @param sequence 311 * the sequence to trim 312 * @return a subsequence 313 */ trim(CharSequence sequence)314 public CharSequence trim(CharSequence sequence) { 315 return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE); 316 } 317 318 /** 319 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 320 * end of the string, using the trimOption and SpanCondition.SIMPLE. For example: 321 * 322 * <pre> 323 * {@code 324 * 325 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)} 326 * </pre> 327 * 328 * ... returns {@code "catbab"}. 329 * 330 * @param sequence 331 * the sequence to trim 332 * @param trimOption 333 * LEADING, TRAILING, or BOTH 334 * @return a subsequence 335 */ trim(CharSequence sequence, TrimOption trimOption)336 public CharSequence trim(CharSequence sequence, TrimOption trimOption) { 337 return trim(sequence, trimOption, SpanCondition.SIMPLE); 338 } 339 340 /** 341 * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or 342 * end of the string, depending on the trimOption and spanCondition. For example: 343 * 344 * <pre> 345 * {@code 346 * 347 * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)} 348 * </pre> 349 * 350 * ... returns {@code "catbab"}. 351 * 352 * @param sequence 353 * the sequence to trim 354 * @param trimOption 355 * LEADING, TRAILING, or BOTH 356 * @param spanCondition 357 * SIMPLE, CONTAINED or NOT_CONTAINED 358 * @return a subsequence 359 */ trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition)360 public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) { 361 int endLeadContained, startTrailContained; 362 final int length = sequence.length(); 363 if (trimOption != TrimOption.TRAILING) { 364 endLeadContained = unicodeSet.span(sequence, spanCondition); 365 if (endLeadContained == length) { 366 return ""; 367 } 368 } else { 369 endLeadContained = 0; 370 } 371 if (trimOption != TrimOption.LEADING) { 372 startTrailContained = unicodeSet.spanBack(sequence, spanCondition); 373 } else { 374 startTrailContained = length; 375 } 376 return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence( 377 endLeadContained, startTrailContained); 378 } 379 380 } 381