1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // file: rbbi_cache.h 5 // 6 #ifndef RBBI_CACHE_H 7 #define RBBI_CACHE_H 8 9 #include "unicode/utypes.h" 10 11 #if !UCONFIG_NO_BREAK_ITERATION 12 13 #include "unicode/rbbi.h" 14 #include "unicode/uobject.h" 15 16 #include "uvectr32.h" 17 18 U_NAMESPACE_BEGIN 19 20 /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. 21 * Dictionary boundaries are moved first to this cache, then from here 22 * to the main BreakCache, where they may inter-leave with non-dictionary 23 * boundaries. The public BreakIterator API always fetches directly 24 * from the main BreakCache, not from here. 25 * 26 * In common situations, the number of boundaries in a single dictionary run 27 * should be quite small, it will be terminated by punctuation, spaces, 28 * or any other non-dictionary characters. The main BreakCache may end 29 * up with boundaries from multiple dictionary based runs. 30 * 31 * The boundaries are stored in a simple ArrayList (vector), with the 32 * assumption that they will be accessed sequentially. 33 */ 34 class RuleBasedBreakIterator::DictionaryCache: public UMemory { 35 public: 36 DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); 37 ~DictionaryCache(); 38 39 void reset(); 40 41 UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); 42 UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); 43 44 /** 45 * Populate the cache with the dictionary based boundaries within a region of text. 46 * @param startPos The start position of a range of text 47 * @param endPos The end position of a range of text 48 * @param firstRuleStatus The rule status index that applies to the break at startPos 49 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 50 * @internal 51 */ 52 void populateDictionary(int32_t startPos, int32_t endPos, 53 int32_t firstRuleStatus, int32_t otherRuleStatus); 54 55 56 57 RuleBasedBreakIterator *fBI; 58 59 UVector32 *fBreaks; // A vector containing the boundaries. 60 int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following() 61 // or preceding(). Optimizes sequential access. 62 int32_t fStart; // Text position of first boundary in cache. 63 int32_t fLimit; // Last boundary in cache. Which is the limit of the 64 // text segment being handled by the dictionary. 65 int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. 66 int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 67 }; 68 69 70 /* 71 * class BreakCache 72 * 73 * Cache of break boundary positions and rule status values. 74 * Break iterator API functions, next(), previous(), etc., will use cached results 75 * when possible, and otherwise cache new results as they are obtained. 76 * 77 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 78 * 79 * The cache is implemented as a single circular buffer. 80 */ 81 82 /* 83 * size of the circular cache buffer. 84 */ 85 86 class RuleBasedBreakIterator::BreakCache: public UMemory { 87 public: 88 BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); 89 virtual ~BreakCache(); 90 void reset(int32_t pos = 0, int32_t ruleStatus = 0); next()91 void next() { if (fBufIdx == fEndBufIdx) { 92 nextOL(); 93 } else { 94 fBufIdx = modChunkSize(fBufIdx + 1); 95 fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; 96 fBI->fRuleStatusIndex = fStatuses[fBufIdx]; 97 } 98 }; 99 100 101 void nextOL(); 102 void previous(UErrorCode &status); 103 104 // Move the iteration state to the position following the startPosition. 105 // Input position must be pinned to the input length. 106 void following(int32_t startPosition, UErrorCode &status); 107 108 void preceding(int32_t startPosition, UErrorCode &status); 109 110 /* 111 * Update the state of the public BreakIterator (fBI) to reflect the 112 * current state of the break iterator cache (this). 113 */ 114 int32_t current(); 115 116 /** 117 * Add boundaries to the cache near the specified position. 118 * The given position need not be a boundary itself. 119 * The input position must be within the range of the text, and 120 * on a code point boundary. 121 * If the requested position is a break boundary, leave the iteration 122 * position on it. 123 * If the requested position is not a boundary, leave the iteration 124 * position on the preceding boundary and include both the 125 * preceding and following boundaries in the cache. 126 * Additional boundaries, either preceding or following, may be added 127 * to the cache as a side effect. 128 * 129 * Return FALSE if the operation failed. 130 */ 131 UBool populateNear(int32_t position, UErrorCode &status); 132 133 /** 134 * Add boundary(s) to the cache following the current last boundary. 135 * Return FALSE if at the end of the text, and no more boundaries can be added. 136 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 137 */ 138 UBool populateFollowing(); 139 140 /** 141 * Add one or more boundaries to the cache preceding the first currently cached boundary. 142 * Leave the iteration position on the first added boundary. 143 * Return false if no boundaries could be added (if at the start of the text.) 144 */ 145 UBool populatePreceding(UErrorCode &status); 146 147 enum UpdatePositionValues { 148 RetainCachePosition = 0, 149 UpdateCachePosition = 1 150 }; 151 152 /* 153 * Add the boundary following the current position. 154 * The current position can be left as it was, or changed to the newly added boundary, 155 * as specified by the update parameter. 156 */ 157 void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); 158 159 160 /* 161 * Add the boundary preceding the current position. 162 * The current position can be left as it was, or changed to the newly added boundary, 163 * as specified by the update parameter. 164 */ 165 bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); 166 167 /** 168 * Set the cache position to the specified position, or, if the position 169 * falls between to cached boundaries, to the preceding boundary. 170 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 171 * The startPosition must be on a code point boundary. 172 * 173 * Return TRUE if successful, FALSE if the specified position is after 174 * the last cached boundary or before the first. 175 */ 176 UBool seek(int32_t startPosition); 177 178 void dumpCache(); 179 180 private: modChunkSize(int index)181 static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 182 183 static constexpr int32_t CACHE_SIZE = 128; 184 static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 185 186 RuleBasedBreakIterator *fBI; 187 int32_t fStartBufIdx; 188 int32_t fEndBufIdx; // inclusive 189 190 int32_t fTextIdx; 191 int32_t fBufIdx; 192 193 int32_t fBoundaries[CACHE_SIZE]; 194 uint16_t fStatuses[CACHE_SIZE]; 195 196 UVector32 fSideBuffer; 197 }; 198 199 U_NAMESPACE_END 200 201 #endif // #if !UCONFIG_NO_BREAK_ITERATION 202 203 #endif // RBBI_CACHE_H 204