// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // file: rbbi_cache.h // #ifndef RBBI_CACHE_H #define RBBI_CACHE_H #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/rbbi.h" #include "unicode/uobject.h" #include "uvectr32.h" U_NAMESPACE_BEGIN /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. * Dictionary boundaries are moved first to this cache, then from here * to the main BreakCache, where they may inter-leave with non-dictionary * boundaries. The public BreakIterator API always fetches directly * from the main BreakCache, not from here. * * In common situations, the number of boundaries in a single dictionary run * should be quite small, it will be terminated by punctuation, spaces, * or any other non-dictionary characters. The main BreakCache may end * up with boundaries from multiple dictionary based runs. * * The boundaries are stored in a simple ArrayList (vector), with the * assumption that they will be accessed sequentially. */ class RuleBasedBreakIterator::DictionaryCache: public UMemory { public: DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); ~DictionaryCache(); void reset(); UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); /** * Populate the cache with the dictionary based boundaries within a region of text. * @param startPos The start position of a range of text * @param endPos The end position of a range of text * @param firstRuleStatus The rule status index that applies to the break at startPos * @param otherRuleStatus The rule status index that applies to boundaries other than startPos * @internal */ void populateDictionary(int32_t startPos, int32_t endPos, int32_t firstRuleStatus, int32_t otherRuleStatus); RuleBasedBreakIterator *fBI; UVector32 fBreaks; // A vector containing the boundaries. int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following() // or preceding(). Optimizes sequential access. int32_t fStart; // Text position of first boundary in cache. int32_t fLimit; // Last boundary in cache. Which is the limit of the // text segment being handled by the dictionary. int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. }; /* * class BreakCache * * Cache of break boundary positions and rule status values. * Break iterator API functions, next(), previous(), etc., will use cached results * when possible, and otherwise cache new results as they are obtained. * * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. * * The cache is implemented as a single circular buffer. */ /* * size of the circular cache buffer. */ class RuleBasedBreakIterator::BreakCache: public UMemory { public: BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); virtual ~BreakCache(); void reset(int32_t pos = 0, int32_t ruleStatus = 0); void next() { if (fBufIdx == fEndBufIdx) { nextOL(); } else { fBufIdx = modChunkSize(fBufIdx + 1); fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; fBI->fRuleStatusIndex = fStatuses[fBufIdx]; } }; void nextOL(); void previous(UErrorCode &status); // Move the iteration state to the position following the startPosition. // Input position must be pinned to the input length. void following(int32_t startPosition, UErrorCode &status); void preceding(int32_t startPosition, UErrorCode &status); /* * Update the state of the public BreakIterator (fBI) to reflect the * current state of the break iterator cache (this). */ int32_t current(); /** * Add boundaries to the cache near the specified position. * The given position need not be a boundary itself. * The input position must be within the range of the text, and * on a code point boundary. * If the requested position is a break boundary, leave the iteration * position on it. * If the requested position is not a boundary, leave the iteration * position on the preceding boundary and include both the * preceding and following boundaries in the cache. * Additional boundaries, either preceding or following, may be added * to the cache as a side effect. * * Return FALSE if the operation failed. */ UBool populateNear(int32_t position, UErrorCode &status); /** * Add boundary(s) to the cache following the current last boundary. * Return FALSE if at the end of the text, and no more boundaries can be added. * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. */ UBool populateFollowing(); /** * Add one or more boundaries to the cache preceding the first currently cached boundary. * Leave the iteration position on the first added boundary. * Return false if no boundaries could be added (if at the start of the text.) */ UBool populatePreceding(UErrorCode &status); enum UpdatePositionValues { RetainCachePosition = 0, UpdateCachePosition = 1 }; /* * Add the boundary following the current position. * The current position can be left as it was, or changed to the newly added boundary, * as specified by the update parameter. */ void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); /* * Add the boundary preceding the current position. * The current position can be left as it was, or changed to the newly added boundary, * as specified by the update parameter. */ bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); /** * Set the cache position to the specified position, or, if the position * falls between to cached boundaries, to the preceding boundary. * Fails if the requested position is outside of the range of boundaries currently held by the cache. * The startPosition must be on a code point boundary. * * Return TRUE if successful, FALSE if the specified position is after * the last cached boundary or before the first. */ UBool seek(int32_t startPosition); void dumpCache(); private: static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; static constexpr int32_t CACHE_SIZE = 128; static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); RuleBasedBreakIterator *fBI; int32_t fStartBufIdx; int32_t fEndBufIdx; // inclusive int32_t fTextIdx; int32_t fBufIdx; int32_t fBoundaries[CACHE_SIZE]; uint16_t fStatuses[CACHE_SIZE]; UVector32 fSideBuffer; }; U_NAMESPACE_END #endif // #if !UCONFIG_NO_BREAK_ITERATION #endif // RBBI_CACHE_H