1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // edits.h 5 // created: 2016dec30 Markus W. Scherer 6 7 #ifndef __EDITS_H__ 8 #define __EDITS_H__ 9 10 #include "unicode/utypes.h" 11 #include "unicode/uobject.h" 12 13 /** 14 * \file 15 * \brief C++ API: C++ class Edits for low-level string transformations on styled text. 16 */ 17 18 U_NAMESPACE_BEGIN 19 20 #ifndef U_HIDE_DRAFT_API 21 22 /** 23 * Records lengths of string edits but not replacement text. 24 * Supports replacements, insertions, deletions in linear progression. 25 * Does not support moving/reordering of text. 26 * 27 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions 28 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. 29 * 30 * @draft ICU 59 31 */ 32 class U_COMMON_API Edits U_FINAL : public UMemory { 33 public: 34 /** 35 * Constructs an empty object. 36 * @draft ICU 59 37 */ Edits()38 Edits() : 39 array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), 40 errorCode_(U_ZERO_ERROR) {} 41 /** 42 * Copy constructor. 43 * @param other source edits 44 * @draft ICU 60 45 */ Edits(const Edits & other)46 Edits(const Edits &other) : 47 array(stackArray), capacity(STACK_CAPACITY), length(other.length), 48 delta(other.delta), numChanges(other.numChanges), 49 errorCode_(other.errorCode_) { 50 copyArray(other); 51 } 52 /** 53 * Move constructor, might leave src empty. 54 * This object will have the same contents that the source object had. 55 * @param src source edits 56 * @draft ICU 60 57 */ Edits(Edits && src)58 Edits(Edits &&src) U_NOEXCEPT : 59 array(stackArray), capacity(STACK_CAPACITY), length(src.length), 60 delta(src.delta), numChanges(src.numChanges), 61 errorCode_(src.errorCode_) { 62 moveArray(src); 63 } 64 65 /** 66 * Destructor. 67 * @draft ICU 59 68 */ 69 ~Edits(); 70 71 /** 72 * Assignment operator. 73 * @param other source edits 74 * @return *this 75 * @draft ICU 60 76 */ 77 Edits &operator=(const Edits &other); 78 79 /** 80 * Move assignment operator, might leave src empty. 81 * This object will have the same contents that the source object had. 82 * The behavior is undefined if *this and src are the same object. 83 * @param src source edits 84 * @return *this 85 * @draft ICU 60 86 */ 87 Edits &operator=(Edits &&src) U_NOEXCEPT; 88 89 /** 90 * Resets the data but may not release memory. 91 * @draft ICU 59 92 */ 93 void reset() U_NOEXCEPT; 94 95 /** 96 * Adds a record for an unchanged segment of text. 97 * Normally called from inside ICU string transformation functions, not user code. 98 * @draft ICU 59 99 */ 100 void addUnchanged(int32_t unchangedLength); 101 /** 102 * Adds a record for a text replacement/insertion/deletion. 103 * Normally called from inside ICU string transformation functions, not user code. 104 * @draft ICU 59 105 */ 106 void addReplace(int32_t oldLength, int32_t newLength); 107 /** 108 * Sets the UErrorCode if an error occurred while recording edits. 109 * Preserves older error codes in the outErrorCode. 110 * Normally called from inside ICU string transformation functions, not user code. 111 * @param outErrorCode Set to an error code if it does not contain one already 112 * and an error occurred while recording edits. 113 * Otherwise unchanged. 114 * @return TRUE if U_FAILURE(outErrorCode) 115 * @draft ICU 59 116 */ 117 UBool copyErrorTo(UErrorCode &outErrorCode); 118 119 /** 120 * How much longer is the new text compared with the old text? 121 * @return new length minus old length 122 * @draft ICU 59 123 */ lengthDelta()124 int32_t lengthDelta() const { return delta; } 125 /** 126 * @return TRUE if there are any change edits 127 * @draft ICU 59 128 */ hasChanges()129 UBool hasChanges() const { return numChanges != 0; } 130 131 /** 132 * @return the number of change edits 133 * @draft ICU 60 134 */ numberOfChanges()135 int32_t numberOfChanges() const { return numChanges; } 136 137 /** 138 * Access to the list of edits. 139 * @see getCoarseIterator 140 * @see getFineIterator 141 * @draft ICU 59 142 */ 143 struct U_COMMON_API Iterator U_FINAL : public UMemory { 144 /** 145 * Default constructor, empty iterator. 146 * @draft ICU 60 147 */ IteratorU_FINAL148 Iterator() : 149 array(nullptr), index(0), length(0), 150 remaining(0), onlyChanges_(FALSE), coarse(FALSE), 151 dir(0), changed(FALSE), oldLength_(0), newLength_(0), 152 srcIndex(0), replIndex(0), destIndex(0) {} 153 /** 154 * Copy constructor. 155 * @draft ICU 59 156 */ 157 Iterator(const Iterator &other) = default; 158 /** 159 * Assignment operator. 160 * @draft ICU 59 161 */ 162 Iterator &operator=(const Iterator &other) = default; 163 164 /** 165 * Advances to the next edit. 166 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 167 * or else the function returns immediately. Check for U_FAILURE() 168 * on output or use with function chaining. (See User Guide for details.) 169 * @return TRUE if there is another edit 170 * @draft ICU 59 171 */ nextU_FINAL172 UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } 173 174 /** 175 * Finds the edit that contains the source index. 176 * The source index may be found in a non-change 177 * even if normal iteration would skip non-changes. 178 * Normal iteration can continue from a found edit. 179 * 180 * The iterator state before this search logically does not matter. 181 * (It may affect the performance of the search.) 182 * 183 * The iterator state after this search is undefined 184 * if the source index is out of bounds for the source string. 185 * 186 * @param i source index 187 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 188 * or else the function returns immediately. Check for U_FAILURE() 189 * on output or use with function chaining. (See User Guide for details.) 190 * @return TRUE if the edit for the source index was found 191 * @draft ICU 59 192 */ findSourceIndexU_FINAL193 UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { 194 return findIndex(i, TRUE, errorCode) == 0; 195 } 196 197 /** 198 * Finds the edit that contains the destination index. 199 * The destination index may be found in a non-change 200 * even if normal iteration would skip non-changes. 201 * Normal iteration can continue from a found edit. 202 * 203 * The iterator state before this search logically does not matter. 204 * (It may affect the performance of the search.) 205 * 206 * The iterator state after this search is undefined 207 * if the source index is out of bounds for the source string. 208 * 209 * @param i destination index 210 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 211 * or else the function returns immediately. Check for U_FAILURE() 212 * on output or use with function chaining. (See User Guide for details.) 213 * @return TRUE if the edit for the destination index was found 214 * @draft ICU 60 215 */ findDestinationIndexU_FINAL216 UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { 217 return findIndex(i, FALSE, errorCode) == 0; 218 } 219 220 /** 221 * Returns the destination index corresponding to the given source index. 222 * If the source index is inside a change edit (not at its start), 223 * then the destination index at the end of that edit is returned, 224 * since there is no information about index mapping inside a change edit. 225 * 226 * (This means that indexes to the start and middle of an edit, 227 * for example around a grapheme cluster, are mapped to indexes 228 * encompassing the entire edit. 229 * The alternative, mapping an interior index to the start, 230 * would map such an interval to an empty one.) 231 * 232 * This operation will usually but not always modify this object. 233 * The iterator state after this search is undefined. 234 * 235 * @param i source index 236 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 237 * or else the function returns immediately. Check for U_FAILURE() 238 * on output or use with function chaining. (See User Guide for details.) 239 * @return destination index; undefined if i is not 0..string length 240 * @draft ICU 60 241 */ 242 int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); 243 244 /** 245 * Returns the source index corresponding to the given destination index. 246 * If the destination index is inside a change edit (not at its start), 247 * then the source index at the end of that edit is returned, 248 * since there is no information about index mapping inside a change edit. 249 * 250 * (This means that indexes to the start and middle of an edit, 251 * for example around a grapheme cluster, are mapped to indexes 252 * encompassing the entire edit. 253 * The alternative, mapping an interior index to the start, 254 * would map such an interval to an empty one.) 255 * 256 * This operation will usually but not always modify this object. 257 * The iterator state after this search is undefined. 258 * 259 * @param i destination index 260 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 261 * or else the function returns immediately. Check for U_FAILURE() 262 * on output or use with function chaining. (See User Guide for details.) 263 * @return source index; undefined if i is not 0..string length 264 * @draft ICU 60 265 */ 266 int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); 267 268 /** 269 * @return TRUE if this edit replaces oldLength() units with newLength() different ones. 270 * FALSE if oldLength units remain unchanged. 271 * @draft ICU 59 272 */ hasChangeU_FINAL273 UBool hasChange() const { return changed; } 274 /** 275 * @return the number of units in the original string which are replaced or remain unchanged. 276 * @draft ICU 59 277 */ oldLengthU_FINAL278 int32_t oldLength() const { return oldLength_; } 279 /** 280 * @return the number of units in the modified string, if hasChange() is TRUE. 281 * Same as oldLength if hasChange() is FALSE. 282 * @draft ICU 59 283 */ newLengthU_FINAL284 int32_t newLength() const { return newLength_; } 285 286 /** 287 * @return the current index into the source string 288 * @draft ICU 59 289 */ sourceIndexU_FINAL290 int32_t sourceIndex() const { return srcIndex; } 291 /** 292 * @return the current index into the replacement-characters-only string, 293 * not counting unchanged spans 294 * @draft ICU 59 295 */ replacementIndexU_FINAL296 int32_t replacementIndex() const { return replIndex; } 297 /** 298 * @return the current index into the full destination string 299 * @draft ICU 59 300 */ destinationIndexU_FINAL301 int32_t destinationIndex() const { return destIndex; } 302 303 private: 304 friend class Edits; 305 306 Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); 307 308 int32_t readLength(int32_t head); 309 void updateNextIndexes(); 310 void updatePreviousIndexes(); 311 UBool noNext(); 312 UBool next(UBool onlyChanges, UErrorCode &errorCode); 313 UBool previous(UErrorCode &errorCode); 314 /** @return -1: error or i<0; 0: found; 1: i>=string length */ 315 int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); 316 317 const uint16_t *array; 318 int32_t index, length; 319 // 0 if we are not within compressed equal-length changes. 320 // Otherwise the number of remaining changes, including the current one. 321 int32_t remaining; 322 UBool onlyChanges_, coarse; 323 324 int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) 325 UBool changed; 326 int32_t oldLength_, newLength_; 327 int32_t srcIndex, replIndex, destIndex; 328 }; 329 330 /** 331 * Returns an Iterator for coarse-grained changes for simple string updates. 332 * Skips non-changes. 333 * @return an Iterator that merges adjacent changes. 334 * @draft ICU 59 335 */ getCoarseChangesIterator()336 Iterator getCoarseChangesIterator() const { 337 return Iterator(array, length, TRUE, TRUE); 338 } 339 340 /** 341 * Returns an Iterator for coarse-grained changes and non-changes for simple string updates. 342 * @return an Iterator that merges adjacent changes. 343 * @draft ICU 59 344 */ getCoarseIterator()345 Iterator getCoarseIterator() const { 346 return Iterator(array, length, FALSE, TRUE); 347 } 348 349 /** 350 * Returns an Iterator for fine-grained changes for modifying styled text. 351 * Skips non-changes. 352 * @return an Iterator that separates adjacent changes. 353 * @draft ICU 59 354 */ getFineChangesIterator()355 Iterator getFineChangesIterator() const { 356 return Iterator(array, length, TRUE, FALSE); 357 } 358 359 /** 360 * Returns an Iterator for fine-grained changes and non-changes for modifying styled text. 361 * @return an Iterator that separates adjacent changes. 362 * @draft ICU 59 363 */ getFineIterator()364 Iterator getFineIterator() const { 365 return Iterator(array, length, FALSE, FALSE); 366 } 367 368 /** 369 * Merges the two input Edits and appends the result to this object. 370 * 371 * Consider two string transformations (for example, normalization and case mapping) 372 * where each records Edits in addition to writing an output string.<br> 373 * Edits ab reflect how substrings of input string a 374 * map to substrings of intermediate string b.<br> 375 * Edits bc reflect how substrings of intermediate string b 376 * map to substrings of output string c.<br> 377 * This function merges ab and bc such that the additional edits 378 * recorded in this object reflect how substrings of input string a 379 * map to substrings of output string c. 380 * 381 * If unrelated Edits are passed in where the output string of the first 382 * has a different length than the input string of the second, 383 * then a U_ILLEGAL_ARGUMENT_ERROR is reported. 384 * 385 * @param ab reflects how substrings of input string a 386 * map to substrings of intermediate string b. 387 * @param bc reflects how substrings of intermediate string b 388 * map to substrings of output string c. 389 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 390 * or else the function returns immediately. Check for U_FAILURE() 391 * on output or use with function chaining. (See User Guide for details.) 392 * @return *this, with the merged edits appended 393 * @draft ICU 60 394 */ 395 Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); 396 397 private: 398 void releaseArray() U_NOEXCEPT; 399 Edits ©Array(const Edits &other); 400 Edits &moveArray(Edits &src) U_NOEXCEPT; 401 setLastUnit(int32_t last)402 void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } lastUnit()403 int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } 404 405 void append(int32_t r); 406 UBool growArray(); 407 408 static const int32_t STACK_CAPACITY = 100; 409 uint16_t *array; 410 int32_t capacity; 411 int32_t length; 412 int32_t delta; 413 int32_t numChanges; 414 UErrorCode errorCode_; 415 uint16_t stackArray[STACK_CAPACITY]; 416 }; 417 418 #endif // U_HIDE_DRAFT_API 419 420 U_NAMESPACE_END 421 422 #endif // __EDITS_H__ 423