• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ***************************************************************************
3 *   Copyright (C) 1999-2007 International Business Machines Corporation   *
4 *   and others. All rights reserved.                                      *
5 ***************************************************************************
6 */
7 //
8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
9 //                   runtime engine and the API implementation for
10 //                   class RuleBasedBreakIterator
11 //
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/udata.h"
21 #include "unicode/uclean.h"
22 #include "rbbidata.h"
23 #include "rbbirb.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "umutex.h"
27 #include "ucln_cmn.h"
28 #include "brkeng.h"
29 
30 #include "uassert.h"
31 #include "uvector.h"
32 
33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34 #if U_LOCAL_SERVICE_HOOK
35 #include "localsvc.h"
36 #endif
37 
38 #ifdef RBBI_DEBUG
39 static UBool fTrace = FALSE;
40 #endif
41 
42 U_NAMESPACE_BEGIN
43 
44 
45 static const int16_t START_STATE = 1;     // The state number of the starting state
46 static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
47 
48 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)49 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
50 
51 
52 //=======================================================================
53 // constructors
54 //=======================================================================
55 
56 /**
57  * Constructs a RuleBasedBreakIterator that uses the already-created
58  * tables object that is passed in as a parameter.
59  */
60 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
61 {
62     init();
63     fData = new RBBIDataWrapper(data, status); // status checked in constructor
64     if (U_FAILURE(status)) {return;}
65     if(fData == 0) {
66         status = U_MEMORY_ALLOCATION_ERROR;
67         return;
68     }
69 }
70 
71 //-------------------------------------------------------------------------------
72 //
73 //   Constructor   from a UDataMemory handle to precompiled break rules
74 //                 stored in an ICU data file.
75 //
76 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(UDataMemory * udm,UErrorCode & status)77 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
78 {
79     init();
80     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
81     if (U_FAILURE(status)) {return;}
82     if(fData == 0) {
83         status = U_MEMORY_ALLOCATION_ERROR;
84         return;
85     }
86 }
87 
88 
89 
90 //-------------------------------------------------------------------------------
91 //
92 //   Constructor       from a set of rules supplied as a string.
93 //
94 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const UnicodeString & rules,UParseError & parseError,UErrorCode & status)95 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
96                                                 UParseError          &parseError,
97                                                 UErrorCode           &status)
98 {
99     init();
100     if (U_FAILURE(status)) {return;}
101     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
102         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
103     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
104     //        creates and returns a complete RBBI.  From here, in a constructor, we
105     //        can't just return the object created by the builder factory, hence
106     //        the assignment of the factory created object to "this".
107     if (U_SUCCESS(status)) {
108         *this = *bi;
109         delete bi;
110     }
111 }
112 
113 
114 //-------------------------------------------------------------------------------
115 //
116 // Default Constructor.      Create an empty shell that can be set up later.
117 //                           Used when creating a RuleBasedBreakIterator from a set
118 //                           of rules.
119 //-------------------------------------------------------------------------------
RuleBasedBreakIterator()120 RuleBasedBreakIterator::RuleBasedBreakIterator() {
121     init();
122 }
123 
124 
125 //-------------------------------------------------------------------------------
126 //
127 //   Copy constructor.  Will produce a break iterator with the same behavior,
128 //                      and which iterates over the same text, as the one passed in.
129 //
130 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const RuleBasedBreakIterator & other)131 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
132 : BreakIterator(other)
133 {
134     this->init();
135     *this = other;
136 }
137 
138 
139 /**
140  * Destructor
141  */
~RuleBasedBreakIterator()142 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
143     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
144         // fCharIter was adopted from the outside.
145         delete fCharIter;
146     }
147     fCharIter = NULL;
148     delete fSCharIter;
149     fCharIter = NULL;
150     delete fDCharIter;
151     fDCharIter = NULL;
152 
153     utext_close(fText);
154 
155     if (fData != NULL) {
156         fData->removeReference();
157         fData = NULL;
158     }
159     if (fCachedBreakPositions) {
160         uprv_free(fCachedBreakPositions);
161         fCachedBreakPositions = NULL;
162     }
163     if (fLanguageBreakEngines) {
164         delete fLanguageBreakEngines;
165         fLanguageBreakEngines = NULL;
166     }
167     if (fUnhandledBreakEngine) {
168         delete fUnhandledBreakEngine;
169         fUnhandledBreakEngine = NULL;
170     }
171 }
172 
173 /**
174  * Assignment operator.  Sets this iterator to have the same behavior,
175  * and iterate over the same text, as the one passed in.
176  */
177 RuleBasedBreakIterator&
operator =(const RuleBasedBreakIterator & that)178 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
179     if (this == &that) {
180         return *this;
181     }
182     reset();    // Delete break cache information
183     fBreakType = that.fBreakType;
184     if (fLanguageBreakEngines != NULL) {
185         delete fLanguageBreakEngines;
186         fLanguageBreakEngines = NULL;   // Just rebuild for now
187     }
188     // TODO: clone fLanguageBreakEngines from "that"
189     UErrorCode status = U_ZERO_ERROR;
190     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
191 
192     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
193         delete fCharIter;
194     }
195     fCharIter = NULL;
196 
197     if (that.fCharIter != NULL ) {
198         // This is a little bit tricky - it will intially appear that
199         //  this->fCharIter is adopted, even if that->fCharIter was
200         //  not adopted.  That's ok.
201         fCharIter = that.fCharIter->clone();
202     }
203 
204     if (fData != NULL) {
205         fData->removeReference();
206         fData = NULL;
207     }
208     if (that.fData != NULL) {
209         fData = that.fData->addReference();
210     }
211 
212     return *this;
213 }
214 
215 
216 
217 //-----------------------------------------------------------------------------
218 //
219 //    init()      Shared initialization routine.   Used by all the constructors.
220 //                Initializes all fields, leaving the object in a consistent state.
221 //
222 //-----------------------------------------------------------------------------
init()223 void RuleBasedBreakIterator::init() {
224     UErrorCode  status    = U_ZERO_ERROR;
225     fBufferClone          = FALSE;
226     fText                 = utext_openUChars(NULL, NULL, 0, &status);
227     fCharIter             = NULL;
228     fSCharIter            = NULL;
229     fDCharIter            = NULL;
230     fData                 = NULL;
231     fLastRuleStatusIndex  = 0;
232     fLastStatusIndexValid = TRUE;
233     fDictionaryCharCount  = 0;
234     fBreakType            = -1;
235 
236     fCachedBreakPositions    = NULL;
237     fLanguageBreakEngines    = NULL;
238     fUnhandledBreakEngine    = NULL;
239     fNumCachedBreakPositions = 0;
240     fPositionInCache         = 0;
241 
242 #ifdef RBBI_DEBUG
243     static UBool debugInitDone = FALSE;
244     if (debugInitDone == FALSE) {
245         char *debugEnv = getenv("U_RBBIDEBUG");
246         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
247             fTrace = TRUE;
248         }
249         debugInitDone = TRUE;
250     }
251 #endif
252 }
253 
254 
255 
256 //-----------------------------------------------------------------------------
257 //
258 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
259 //            behavior, and iterating over the same text, as this one.
260 //            Virtual function: does the right thing with subclasses.
261 //
262 //-----------------------------------------------------------------------------
263 BreakIterator*
clone(void) const264 RuleBasedBreakIterator::clone(void) const {
265     return new RuleBasedBreakIterator(*this);
266 }
267 
268 /**
269  * Equality operator.  Returns TRUE if both BreakIterators are of the
270  * same class, have the same behavior, and iterate over the same text.
271  */
272 UBool
operator ==(const BreakIterator & that) const273 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
274     if (that.getDynamicClassID() != getDynamicClassID()) {
275         return FALSE;
276     }
277 
278     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
279 
280     if (!utext_equals(fText, that2.fText)) {
281         // The two break iterators are operating on different text,
282         //   or have a different interation position.
283         return FALSE;
284     };
285 
286     // TODO:  need a check for when in a dictionary region at different offsets.
287 
288     if (that2.fData == fData ||
289         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
290             // The two break iterators are using the same rules.
291             return TRUE;
292         }
293     return FALSE;
294 }
295 
296 /**
297  * Compute a hash code for this BreakIterator
298  * @return A hash code
299  */
300 int32_t
hashCode(void) const301 RuleBasedBreakIterator::hashCode(void) const {
302     int32_t   hash = 0;
303     if (fData != NULL) {
304         hash = fData->hashCode();
305     }
306     return hash;
307 }
308 
309 
setText(UText * ut,UErrorCode & status)310 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
311     if (U_FAILURE(status)) {
312         return;
313     }
314     reset();
315     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
316 
317     // Set up a dummy CharacterIterator to be returned if anyone
318     //   calls getText().  With input from UText, there is no reasonable
319     //   way to return a characterIterator over the actual input text.
320     //   Return one over an empty string instead - this is the closest
321     //   we can come to signaling a failure.
322     //   (GetText() is obsolete, this failure is sort of OK)
323     if (fDCharIter == NULL) {
324         static UChar c = 0;
325         fDCharIter = new UCharCharacterIterator(&c, 0);
326     }
327 
328     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
329         // existing fCharIter was adopted from the outside.  Delete it now.
330         delete fCharIter;
331     }
332     fCharIter = fDCharIter;
333 
334     this->first();
335 }
336 
337 
getUText(UText * fillIn,UErrorCode & status) const338 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
339     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
340     return result;
341 }
342 
343 
344 
345 /**
346  * Returns the description used to create this iterator
347  */
348 const UnicodeString&
getRules() const349 RuleBasedBreakIterator::getRules() const {
350     if (fData != NULL) {
351         return fData->getRuleSourceString();
352     } else {
353         static const UnicodeString *s;
354         if (s == NULL) {
355             // TODO:  something more elegant here.
356             //        perhaps API should return the string by value.
357             //        Note:  thread unsafe init & leak are semi-ok, better than
358             //               what was before.  Sould be cleaned up, though.
359             s = new UnicodeString;
360         }
361         return *s;
362     }
363 }
364 
365 //=======================================================================
366 // BreakIterator overrides
367 //=======================================================================
368 
369 /**
370  * Return a CharacterIterator over the text being analyzed.
371  */
372 CharacterIterator&
getText() const373 RuleBasedBreakIterator::getText() const {
374     return *fCharIter;
375 }
376 
377 /**
378  * Set the iterator to analyze a new piece of text.  This function resets
379  * the current iteration position to the beginning of the text.
380  * @param newText An iterator over the text to analyze.
381  */
382 void
adoptText(CharacterIterator * newText)383 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
384     // If we are holding a CharacterIterator adopted from a
385     //   previous call to this function, delete it now.
386     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
387         delete fCharIter;
388     }
389 
390     fCharIter = newText;
391     UErrorCode status = U_ZERO_ERROR;
392     reset();
393     if (newText==NULL || newText->startIndex() != 0) {
394         // startIndex !=0 wants to be an error, but there's no way to report it.
395         // Make the iterator text be an empty string.
396         fText = utext_openUChars(fText, NULL, 0, &status);
397     } else {
398         fText = utext_openCharacterIterator(fText, newText, &status);
399     }
400     this->first();
401 }
402 
403 /**
404  * Set the iterator to analyze a new piece of text.  This function resets
405  * the current iteration position to the beginning of the text.
406  * @param newText An iterator over the text to analyze.
407  */
408 void
setText(const UnicodeString & newText)409 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
410     UErrorCode status = U_ZERO_ERROR;
411     reset();
412     fText = utext_openConstUnicodeString(fText, &newText, &status);
413 
414     // Set up a character iterator on the string.
415     //   Needed in case someone calls getText().
416     //  Can not, unfortunately, do this lazily on the (probably never)
417     //  call to getText(), because getText is const.
418     if (fSCharIter == NULL) {
419         fSCharIter = new StringCharacterIterator(newText);
420     } else {
421         fSCharIter->setText(newText);
422     }
423 
424     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
425         // old fCharIter was adopted from the outside.  Delete it.
426         delete fCharIter;
427     }
428     fCharIter = fSCharIter;
429 
430     this->first();
431 }
432 
433 
434 
435 /**
436  * Sets the current iteration position to the beginning of the text.
437  * @return The offset of the beginning of the text.
438  */
first(void)439 int32_t RuleBasedBreakIterator::first(void) {
440     reset();
441     fLastRuleStatusIndex  = 0;
442     fLastStatusIndexValid = TRUE;
443     //if (fText == NULL)
444     //    return BreakIterator::DONE;
445 
446     utext_setNativeIndex(fText, 0);
447     return 0;
448 }
449 
450 /**
451  * Sets the current iteration position to the end of the text.
452  * @return The text's past-the-end offset.
453  */
last(void)454 int32_t RuleBasedBreakIterator::last(void) {
455     reset();
456     if (fText == NULL) {
457         fLastRuleStatusIndex  = 0;
458         fLastStatusIndexValid = TRUE;
459         return BreakIterator::DONE;
460     }
461 
462     fLastStatusIndexValid = FALSE;
463     int32_t pos = (int32_t)utext_nativeLength(fText);
464     utext_setNativeIndex(fText, pos);
465     return pos;
466 }
467 
468 /**
469  * Advances the iterator either forward or backward the specified number of steps.
470  * Negative values move backward, and positive values move forward.  This is
471  * equivalent to repeatedly calling next() or previous().
472  * @param n The number of steps to move.  The sign indicates the direction
473  * (negative is backwards, and positive is forwards).
474  * @return The character offset of the boundary position n boundaries away from
475  * the current one.
476  */
next(int32_t n)477 int32_t RuleBasedBreakIterator::next(int32_t n) {
478     int32_t result = current();
479     while (n > 0) {
480         result = next();
481         --n;
482     }
483     while (n < 0) {
484         result = previous();
485         ++n;
486     }
487     return result;
488 }
489 
490 /**
491  * Advances the iterator to the next boundary position.
492  * @return The position of the first boundary after this one.
493  */
next(void)494 int32_t RuleBasedBreakIterator::next(void) {
495     // if we have cached break positions and we're still in the range
496     // covered by them, just move one step forward in the cache
497     if (fCachedBreakPositions != NULL) {
498         if (fPositionInCache < fNumCachedBreakPositions - 1) {
499             ++fPositionInCache;
500             int32_t pos = fCachedBreakPositions[fPositionInCache];
501             utext_setNativeIndex(fText, pos);
502             return pos;
503         }
504         else {
505             reset();
506         }
507     }
508 
509     int32_t startPos = current();
510     int32_t result = handleNext(fData->fForwardTable);
511     if (fDictionaryCharCount > 0) {
512         result = checkDictionary(startPos, result, FALSE);
513     }
514     return result;
515 }
516 
517 /**
518  * Advances the iterator backwards, to the last boundary preceding this one.
519  * @return The position of the last boundary position preceding this one.
520  */
previous(void)521 int32_t RuleBasedBreakIterator::previous(void) {
522     int32_t result;
523     int32_t startPos;
524 
525     // if we have cached break positions and we're still in the range
526     // covered by them, just move one step backward in the cache
527     if (fCachedBreakPositions != NULL) {
528         if (fPositionInCache > 0) {
529             --fPositionInCache;
530             // If we're at the beginning of the cache, need to reevaluate the
531             // rule status
532             if (fPositionInCache <= 0) {
533                 fLastStatusIndexValid = FALSE;
534             }
535             int32_t pos = fCachedBreakPositions[fPositionInCache];
536             utext_setNativeIndex(fText, pos);
537             return pos;
538         }
539         else {
540             reset();
541         }
542     }
543 
544     // if we're already sitting at the beginning of the text, return DONE
545     if (fText == NULL || (startPos = current()) == 0) {
546         fLastRuleStatusIndex  = 0;
547         fLastStatusIndexValid = TRUE;
548         return BreakIterator::DONE;
549     }
550 
551     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
552         result = handlePrevious(fData->fReverseTable);
553         if (fDictionaryCharCount > 0) {
554             result = checkDictionary(result, startPos, TRUE);
555         }
556         return result;
557     }
558 
559     // old rule syntax
560     // set things up.  handlePrevious() will back us up to some valid
561     // break position before the current position (we back our internal
562     // iterator up one step to prevent handlePrevious() from returning
563     // the current position), but not necessarily the last one before
564 
565     // where we started
566 
567     int32_t start = current();
568 
569     UTEXT_PREVIOUS32(fText);
570     int32_t lastResult    = handlePrevious(fData->fReverseTable);
571     if (lastResult == UBRK_DONE) {
572         lastResult = 0;
573         utext_setNativeIndex(fText, 0);
574     }
575     result = lastResult;
576     int32_t lastTag       = 0;
577     UBool   breakTagValid = FALSE;
578 
579     // iterate forward from the known break position until we pass our
580     // starting point.  The last break position before the starting
581     // point is our return value
582 
583     for (;;) {
584         result         = next();
585         if (result == BreakIterator::DONE || result >= start) {
586             break;
587         }
588         lastResult     = result;
589         lastTag        = fLastRuleStatusIndex;
590         breakTagValid  = TRUE;
591     }
592 
593     // fLastBreakTag wants to have the value for section of text preceding
594     // the result position that we are to return (in lastResult.)  If
595     // the backwards rules overshot and the above loop had to do two or more
596     // next()s to move up to the desired return position, we will have a valid
597     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
598     // we wont have a tag value for that position, which is only set by handleNext().
599 
600     // set the current iteration position to be the last break position
601     // before where we started, and then return that value
602     utext_setNativeIndex(fText, lastResult);
603     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
604     fLastStatusIndexValid = breakTagValid;
605 
606     // No need to check the dictionary; it will have been handled by
607     // next()
608 
609     return lastResult;
610 }
611 
612 /**
613  * Sets the iterator to refer to the first boundary position following
614  * the specified position.
615  * @offset The position from which to begin searching for a break position.
616  * @return The position of the first break after the current position.
617  */
following(int32_t offset)618 int32_t RuleBasedBreakIterator::following(int32_t offset) {
619     // if we have cached break positions and offset is in the range
620     // covered by them, use them
621     // TODO: could use binary search
622     // TODO: what if offset is outside range, but break is not?
623     if (fCachedBreakPositions != NULL) {
624         if (offset >= fCachedBreakPositions[0]
625                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
626             fPositionInCache = 0;
627             // We are guaranteed not to leave the array due to range test above
628             while (offset >= fCachedBreakPositions[fPositionInCache]) {
629                 ++fPositionInCache;
630             }
631             int32_t pos = fCachedBreakPositions[fPositionInCache];
632             utext_setNativeIndex(fText, pos);
633             return pos;
634         }
635         else {
636             reset();
637         }
638     }
639 
640     // if the offset passed in is already past the end of the text,
641     // just return DONE; if it's before the beginning, return the
642     // text's starting offset
643     fLastRuleStatusIndex  = 0;
644     fLastStatusIndexValid = TRUE;
645     if (fText == NULL || offset >= utext_nativeLength(fText)) {
646         last();
647         return next();
648     }
649     else if (offset < 0) {
650         return first();
651     }
652 
653     // otherwise, set our internal iteration position (temporarily)
654     // to the position passed in.  If this is the _beginning_ position,
655     // then we can just use next() to get our return value
656 
657     int32_t result = 0;
658 
659     if (fData->fSafeRevTable != NULL) {
660         // new rule syntax
661         utext_setNativeIndex(fText, offset);
662         // move forward one codepoint to prepare for moving back to a
663         // safe point.
664         // this handles offset being between a supplementary character
665         UTEXT_NEXT32(fText);
666         // handlePrevious will move most of the time to < 1 boundary away
667         handlePrevious(fData->fSafeRevTable);
668         int32_t result = next();
669         while (result <= offset) {
670             result = next();
671         }
672         return result;
673     }
674     if (fData->fSafeFwdTable != NULL) {
675         // backup plan if forward safe table is not available
676         utext_setNativeIndex(fText, offset);
677         UTEXT_PREVIOUS32(fText);
678         // handle next will give result >= offset
679         handleNext(fData->fSafeFwdTable);
680         // previous will give result 0 or 1 boundary away from offset,
681         // most of the time
682         // we have to
683         int32_t oldresult = previous();
684         while (oldresult > offset) {
685             int32_t result = previous();
686             if (result <= offset) {
687                 return oldresult;
688             }
689             oldresult = result;
690         }
691         int32_t result = next();
692         if (result <= offset) {
693             return next();
694         }
695         return result;
696     }
697     // otherwise, we have to sync up first.  Use handlePrevious() to back
698     // up to a known break position before the specified position (if
699     // we can determine that the specified position is a break position,
700     // we don't back up at all).  This may or may not be the last break
701     // position at or before our starting position.  Advance forward
702     // from here until we've passed the starting position.  The position
703     // we stop on will be the first break position after the specified one.
704     // old rule syntax
705 
706     utext_setNativeIndex(fText, offset);
707     if (offset==0 ||
708         offset==1  && utext_getNativeIndex(fText)==0) {
709         return next();
710     }
711     result = previous();
712 
713     while (result != BreakIterator::DONE && result <= offset) {
714         result = next();
715     }
716 
717     return result;
718 }
719 
720 /**
721  * Sets the iterator to refer to the last boundary position before the
722  * specified position.
723  * @offset The position to begin searching for a break from.
724  * @return The position of the last boundary before the starting position.
725  */
preceding(int32_t offset)726 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
727     // if we have cached break positions and offset is in the range
728     // covered by them, use them
729     if (fCachedBreakPositions != NULL) {
730         // TODO: binary search?
731         // TODO: What if offset is outside range, but break is not?
732         if (offset > fCachedBreakPositions[0]
733                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
734             fPositionInCache = 0;
735             while (fPositionInCache < fNumCachedBreakPositions
736                    && offset > fCachedBreakPositions[fPositionInCache])
737                 ++fPositionInCache;
738             --fPositionInCache;
739             // If we're at the beginning of the cache, need to reevaluate the
740             // rule status
741             if (fPositionInCache <= 0) {
742                 fLastStatusIndexValid = FALSE;
743             }
744             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
745             return fCachedBreakPositions[fPositionInCache];
746         }
747         else {
748             reset();
749         }
750     }
751 
752     // if the offset passed in is already past the end of the text,
753     // just return DONE; if it's before the beginning, return the
754     // text's starting offset
755     if (fText == NULL || offset > utext_nativeLength(fText)) {
756         // return BreakIterator::DONE;
757         return last();
758     }
759     else if (offset < 0) {
760         return first();
761     }
762 
763     // if we start by updating the current iteration position to the
764     // position specified by the caller, we can just use previous()
765     // to carry out this operation
766 
767     if (fData->fSafeFwdTable != NULL) {
768         // new rule syntax
769         utext_setNativeIndex(fText, offset);
770         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
771         if (newOffset != offset) {
772             // Will come here if specified offset was not a code point boundary AND
773             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
774             //   indices to the containing code point.
775             // For breakitereator::preceding only, these non-code-point indices need to be moved
776             //   up to refer to the following codepoint.
777             UTEXT_NEXT32(fText);
778             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
779         }
780 
781         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
782         //        rather than adjusting the position unconditionally?
783         //        (Change would interact with safe rules.)
784         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
785         //        affects only preceding(), seems cleaner, but is slightly different.
786         UTEXT_PREVIOUS32(fText);
787         handleNext(fData->fSafeFwdTable);
788         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
789         while (result >= offset) {
790             result = previous();
791         }
792         return result;
793     }
794     if (fData->fSafeRevTable != NULL) {
795         // backup plan if forward safe table is not available
796         //  TODO:  check whether this path can be discarded
797         //         It's probably OK to say that rules must supply both safe tables
798         //            if they use safe tables at all.  We have certainly never described
799         //            to anyone how to work with just one safe table.
800         utext_setNativeIndex(fText, offset);
801         UTEXT_NEXT32(fText);
802 
803         // handle previous will give result <= offset
804         handlePrevious(fData->fSafeRevTable);
805 
806         // next will give result 0 or 1 boundary away from offset,
807         // most of the time
808         // we have to
809         int32_t oldresult = next();
810         while (oldresult < offset) {
811             int32_t result = next();
812             if (result >= offset) {
813                 return oldresult;
814             }
815             oldresult = result;
816         }
817         int32_t result = previous();
818         if (result >= offset) {
819             return previous();
820         }
821         return result;
822     }
823 
824     // old rule syntax
825     utext_setNativeIndex(fText, offset);
826     return previous();
827 }
828 
829 /**
830  * Returns true if the specfied position is a boundary position.  As a side
831  * effect, leaves the iterator pointing to the first boundary position at
832  * or after "offset".
833  * @param offset the offset to check.
834  * @return True if "offset" is a boundary position.
835  */
isBoundary(int32_t offset)836 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
837     // the beginning index of the iterator is always a boundary position by definition
838     if (offset == 0) {
839         first();       // For side effects on current position, tag values.
840         return TRUE;
841     }
842 
843     if (offset == (int32_t)utext_nativeLength(fText)) {
844         last();       // For side effects on current position, tag values.
845         return TRUE;
846     }
847 
848     // out-of-range indexes are never boundary positions
849     if (offset < 0) {
850         first();       // For side effects on current position, tag values.
851         return FALSE;
852     }
853 
854     if (offset > utext_nativeLength(fText)) {
855         last();        // For side effects on current position, tag values.
856         return FALSE;
857     }
858 
859     // otherwise, we can use following() on the position before the specified
860     // one and return true if the position we get back is the one the user
861     // specified
862     utext_previous32From(fText, offset);
863     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
864     UBool    result  = following(backOne) == offset;
865     return result;
866 }
867 
868 /**
869  * Returns the current iteration position.
870  * @return The current iteration position.
871  */
current(void) const872 int32_t RuleBasedBreakIterator::current(void) const {
873     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
874     return pos;
875 }
876 
877 //=======================================================================
878 // implementation
879 //=======================================================================
880 
881 //
882 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
883 //                 of user text.  A variable with this enum type keeps track of where we
884 //                 are.  The state machine only fetches user input while in the RUN mode.
885 //
886 enum RBBIRunMode {
887     RBBI_START,     // state machine processing is before first char of input
888     RBBI_RUN,       // state machine processing is in the user text
889     RBBI_END        // state machine processing is after end of user text.
890 };
891 
892 
893 //-----------------------------------------------------------------------------------
894 //
895 //  handleNext(stateTable)
896 //     This method is the actual implementation of the rbbi next() method.
897 //     This method initializes the state machine to state 1
898 //     and advances through the text character by character until we reach the end
899 //     of the text or the state machine transitions to state 0.  We update our return
900 //     value every time the state machine passes through an accepting state.
901 //
902 //-----------------------------------------------------------------------------------
handleNext(const RBBIStateTable * statetable)903 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
904     int32_t             state;
905     int16_t             category        = 0;
906     RBBIRunMode         mode;
907 
908     RBBIStateTableRow  *row;
909     UChar32             c;
910     int32_t             lookaheadStatus = 0;
911     int32_t             lookaheadTagIdx = 0;
912     int32_t             result          = 0;
913     int32_t             initialPosition = 0;
914     int32_t             lookaheadResult = 0;
915     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
916     const char         *tableData       = statetable->fTableData;
917     uint32_t            tableRowLen     = statetable->fRowLen;
918 
919     #ifdef RBBI_DEBUG
920         if (fTrace) {
921             RBBIDebugPuts("Handle Next   pos   char  state category");
922         }
923     #endif
924 
925     // No matter what, handleNext alway correctly sets the break tag value.
926     fLastStatusIndexValid = TRUE;
927     fLastRuleStatusIndex = 0;
928 
929     // if we're already at the end of the text, return DONE.
930     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
931     result          = initialPosition;
932     c               = UTEXT_NEXT32(fText);
933     if (fData == NULL || c==U_SENTINEL) {
934         return BreakIterator::DONE;
935     }
936 
937     //  Set the initial state for the state machine
938     state = START_STATE;
939     row = (RBBIStateTableRow *)
940             //(statetable->fTableData + (statetable->fRowLen * state));
941             (tableData + tableRowLen * state);
942 
943 
944     mode     = RBBI_RUN;
945     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
946         category = 2;
947         mode     = RBBI_START;
948     }
949 
950 
951     // loop until we reach the end of the text or transition to state 0
952     //
953     for (;;) {
954         if (c == U_SENTINEL) {
955             // Reached end of input string.
956             if (mode == RBBI_END) {
957                 // We have already run the loop one last time with the
958                 //   character set to the psueudo {eof} value.  Now it is time
959                 //   to unconditionally bail out.
960                 if (lookaheadResult > result) {
961                     // We ran off the end of the string with a pending look-ahead match.
962                     // Treat this as if the look-ahead condition had been met, and return
963                     //  the match at the / position from the look-ahead rule.
964                     result               = lookaheadResult;
965                     fLastRuleStatusIndex = lookaheadTagIdx;
966                     lookaheadStatus = 0;
967                 }
968                 break;
969             }
970             // Run the loop one last time with the fake end-of-input character category.
971             mode = RBBI_END;
972             category = 1;
973         }
974 
975         //
976         // Get the char category.  An incoming category of 1 or 2 means that
977         //      we are preset for doing the beginning or end of input, and
978         //      that we shouldn't get a category from an actual text input character.
979         //
980         if (mode == RBBI_RUN) {
981             // look up the current character's character category, which tells us
982             // which column in the state table to look at.
983             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
984             //        not the size of the character going in, which is a UChar32.
985             //
986             UTRIE_GET16(&fData->fTrie, c, category);
987 
988             // Check the dictionary bit in the character's category.
989             //    Counter is only used by dictionary based iterators (subclasses).
990             //    Chars that need to be handled by a dictionary have a flag bit set
991             //    in their category values.
992             //
993             if ((category & 0x4000) != 0)  {
994                 fDictionaryCharCount++;
995                 //  And off the dictionary flag bit.
996                 category &= ~0x4000;
997             }
998         }
999 
1000         #ifdef RBBI_DEBUG
1001             if (fTrace) {
1002                 RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
1003                 if (0x20<=c && c<0x7f) {
1004                     RBBIDebugPrintf("\"%c\"  ", c);
1005                 } else {
1006                     RBBIDebugPrintf("%5x  ", c);
1007                 }
1008                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1009             }
1010         #endif
1011 
1012         // State Transition - move machine to its next state
1013         //
1014         state = row->fNextState[category];
1015         row = (RBBIStateTableRow *)
1016             // (statetable->fTableData + (statetable->fRowLen * state));
1017             (tableData + tableRowLen * state);
1018 
1019 
1020         if (row->fAccepting == -1) {
1021             // Match found, common case.
1022             if (mode != RBBI_START) {
1023                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1024             }
1025             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1026         }
1027 
1028         if (row->fLookAhead != 0) {
1029             if (lookaheadStatus != 0
1030                 && row->fAccepting == lookaheadStatus) {
1031                 // Lookahead match is completed.
1032                 result               = lookaheadResult;
1033                 fLastRuleStatusIndex = lookaheadTagIdx;
1034                 lookaheadStatus      = 0;
1035                 // TODO:  make a standalone hard break in a rule work.
1036                 if (lookAheadHardBreak) {
1037                     UTEXT_SETNATIVEINDEX(fText, result);
1038                     return result;
1039                 }
1040                 // Look-ahead completed, but other rules may match further.  Continue on
1041                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1042                 goto continueOn;
1043             }
1044 
1045             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1046             lookaheadResult = r;
1047             lookaheadStatus = row->fLookAhead;
1048             lookaheadTagIdx = row->fTagIdx;
1049             goto continueOn;
1050         }
1051 
1052 
1053         if (row->fAccepting != 0) {
1054             // Because this is an accepting state, any in-progress look-ahead match
1055             //   is no longer relavant.  Clear out the pending lookahead status.
1056             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1057         }
1058 
1059 continueOn:
1060         if (state == STOP_STATE) {
1061             // This is the normal exit from the lookup state machine.
1062             // We have advanced through the string until it is certain that no
1063             //   longer match is possible, no matter what characters follow.
1064             break;
1065         }
1066 
1067         // Advance to the next character.
1068         // If this is a beginning-of-input loop iteration, don't advance
1069         //    the input position.  The next iteration will be processing the
1070         //    first real input character.
1071         if (mode == RBBI_RUN) {
1072             c = UTEXT_NEXT32(fText);
1073         } else {
1074             if (mode == RBBI_START) {
1075                 mode = RBBI_RUN;
1076             }
1077         }
1078 
1079 
1080     }
1081 
1082     // The state machine is done.  Check whether it found a match...
1083 
1084     // If the iterator failed to advance in the match engine, force it ahead by one.
1085     //   (This really indicates a defect in the break rules.  They should always match
1086     //    at least one character.)
1087     if (result == initialPosition) {
1088         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1089         UTEXT_NEXT32(fText);
1090         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1091     }
1092 
1093     // Leave the iterator at our result position.
1094     UTEXT_SETNATIVEINDEX(fText, result);
1095     #ifdef RBBI_DEBUG
1096         if (fTrace) {
1097             RBBIDebugPrintf("result = %d\n\n", result);
1098         }
1099     #endif
1100     return result;
1101 }
1102 
1103 
1104 
1105 //-----------------------------------------------------------------------------------
1106 //
1107 //  handlePrevious()
1108 //
1109 //      Iterate backwards, according to the logic of the reverse rules.
1110 //      This version handles the exact style backwards rules.
1111 //
1112 //      The logic of this function is very similar to handleNext(), above.
1113 //
1114 //-----------------------------------------------------------------------------------
handlePrevious(const RBBIStateTable * statetable)1115 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1116     int32_t             state;
1117     int16_t             category        = 0;
1118     RBBIRunMode         mode;
1119     RBBIStateTableRow  *row;
1120     UChar32             c;
1121     int32_t             lookaheadStatus = 0;
1122     int32_t             result          = 0;
1123     int32_t             initialPosition = 0;
1124     int32_t             lookaheadResult = 0;
1125     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1126 
1127     #ifdef RBBI_DEBUG
1128         if (fTrace) {
1129             RBBIDebugPuts("Handle Previous   pos   char  state category");
1130         }
1131     #endif
1132 
1133     // handlePrevious() never gets the rule status.
1134     // Flag the status as invalid; if the user ever asks for status, we will need
1135     // to back up, then re-find the break position using handleNext(), which does
1136     // get the status value.
1137     fLastStatusIndexValid = FALSE;
1138     fLastRuleStatusIndex = 0;
1139 
1140     // if we're already at the start of the text, return DONE.
1141     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1142         return BreakIterator::DONE;
1143     }
1144 
1145     //  Set up the starting char.
1146     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1147     result          = initialPosition;
1148     c               = UTEXT_PREVIOUS32(fText);
1149 
1150     //  Set the initial state for the state machine
1151     state = START_STATE;
1152     row = (RBBIStateTableRow *)
1153             (statetable->fTableData + (statetable->fRowLen * state));
1154     category = 3;
1155     mode     = RBBI_RUN;
1156     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1157         category = 2;
1158         mode     = RBBI_START;
1159     }
1160 
1161 
1162     // loop until we reach the start of the text or transition to state 0
1163     //
1164     for (;;) {
1165         if (c == U_SENTINEL) {
1166             // Reached end of input string.
1167             if (mode == RBBI_END ||
1168                 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1169                 // We have already run the loop one last time with the
1170                 //   character set to the psueudo {eof} value.  Now it is time
1171                 //   to unconditionally bail out.
1172                 //  (Or we have an old format binary rule file that does not support {eof}.)
1173                 if (lookaheadResult < result) {
1174                     // We ran off the end of the string with a pending look-ahead match.
1175                     // Treat this as if the look-ahead condition had been met, and return
1176                     //  the match at the / position from the look-ahead rule.
1177                     result               = lookaheadResult;
1178                     lookaheadStatus = 0;
1179                 } else if (result == initialPosition) {
1180                     // Ran off start, no match found.
1181                     // move one index one (towards the start, since we are doing a previous())
1182                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1183                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1184                 }
1185                 break;
1186             }
1187             // Run the loop one last time with the fake end-of-input character category.
1188             mode = RBBI_END;
1189             category = 1;
1190         }
1191 
1192         //
1193         // Get the char category.  An incoming category of 1 or 2 means that
1194         //      we are preset for doing the beginning or end of input, and
1195         //      that we shouldn't get a category from an actual text input character.
1196         //
1197         if (mode == RBBI_RUN) {
1198             // look up the current character's character category, which tells us
1199             // which column in the state table to look at.
1200             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1201             //        not the size of the character going in, which is a UChar32.
1202             //
1203             UTRIE_GET16(&fData->fTrie, c, category);
1204 
1205             // Check the dictionary bit in the character's category.
1206             //    Counter is only used by dictionary based iterators (subclasses).
1207             //    Chars that need to be handled by a dictionary have a flag bit set
1208             //    in their category values.
1209             //
1210             if ((category & 0x4000) != 0)  {
1211                 fDictionaryCharCount++;
1212                 //  And off the dictionary flag bit.
1213                 category &= ~0x4000;
1214             }
1215         }
1216 
1217         #ifdef RBBI_DEBUG
1218             if (fTrace) {
1219                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1220                 if (0x20<=c && c<0x7f) {
1221                     RBBIDebugPrintf("\"%c\"  ", c);
1222                 } else {
1223                     RBBIDebugPrintf("%5x  ", c);
1224                 }
1225                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1226             }
1227         #endif
1228 
1229         // State Transition - move machine to its next state
1230         //
1231         state = row->fNextState[category];
1232         row = (RBBIStateTableRow *)
1233             (statetable->fTableData + (statetable->fRowLen * state));
1234 
1235         if (row->fAccepting == -1) {
1236             // Match found, common case.
1237             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1238         }
1239 
1240         if (row->fLookAhead != 0) {
1241             if (lookaheadStatus != 0
1242                 && row->fAccepting == lookaheadStatus) {
1243                 // Lookahead match is completed.
1244                 result               = lookaheadResult;
1245                 lookaheadStatus      = 0;
1246                 // TODO:  make a standalone hard break in a rule work.
1247                 if (lookAheadHardBreak) {
1248                     UTEXT_SETNATIVEINDEX(fText, result);
1249                     return result;
1250                 }
1251                 // Look-ahead completed, but other rules may match further.  Continue on
1252                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1253                 goto continueOn;
1254             }
1255 
1256             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1257             lookaheadResult = r;
1258             lookaheadStatus = row->fLookAhead;
1259             goto continueOn;
1260         }
1261 
1262 
1263         if (row->fAccepting != 0) {
1264             // Because this is an accepting state, any in-progress look-ahead match
1265             //   is no longer relavant.  Clear out the pending lookahead status.
1266             lookaheadStatus = 0;
1267         }
1268 
1269 continueOn:
1270         if (state == STOP_STATE) {
1271             // This is the normal exit from the lookup state machine.
1272             // We have advanced through the string until it is certain that no
1273             //   longer match is possible, no matter what characters follow.
1274             break;
1275         }
1276 
1277         // Move (backwards) to the next character to process.
1278         // If this is a beginning-of-input loop iteration, don't advance
1279         //    the input position.  The next iteration will be processing the
1280         //    first real input character.
1281         if (mode == RBBI_RUN) {
1282             c = UTEXT_PREVIOUS32(fText);
1283         } else {
1284             if (mode == RBBI_START) {
1285                 mode = RBBI_RUN;
1286             }
1287         }
1288     }
1289 
1290     // The state machine is done.  Check whether it found a match...
1291 
1292     // If the iterator failed to advance in the match engine, force it ahead by one.
1293     //   (This really indicates a defect in the break rules.  They should always match
1294     //    at least one character.)
1295     if (result == initialPosition) {
1296         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1297         UTEXT_PREVIOUS32(fText);
1298         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1299     }
1300 
1301     // Leave the iterator at our result position.
1302     UTEXT_SETNATIVEINDEX(fText, result);
1303     #ifdef RBBI_DEBUG
1304         if (fTrace) {
1305             RBBIDebugPrintf("result = %d\n\n", result);
1306         }
1307     #endif
1308     return result;
1309 }
1310 
1311 
1312 void
reset()1313 RuleBasedBreakIterator::reset()
1314 {
1315     if (fCachedBreakPositions) {
1316         uprv_free(fCachedBreakPositions);
1317     }
1318     fCachedBreakPositions = NULL;
1319     fNumCachedBreakPositions = 0;
1320     fDictionaryCharCount = 0;
1321     fPositionInCache = 0;
1322 }
1323 
1324 
1325 
1326 //-------------------------------------------------------------------------------
1327 //
1328 //   getRuleStatus()   Return the break rule tag associated with the current
1329 //                     iterator position.  If the iterator arrived at its current
1330 //                     position by iterating forwards, the value will have been
1331 //                     cached by the handleNext() function.
1332 //
1333 //                     If no cached status value is available, the status is
1334 //                     found by doing a previous() followed by a next(), which
1335 //                     leaves the iterator where it started, and computes the
1336 //                     status while doing the next().
1337 //
1338 //-------------------------------------------------------------------------------
makeRuleStatusValid()1339 void RuleBasedBreakIterator::makeRuleStatusValid() {
1340     if (fLastStatusIndexValid == FALSE) {
1341         //  No cached status is available.
1342         if (fText == NULL || current() == 0) {
1343             //  At start of text, or there is no text.  Status is always zero.
1344             fLastRuleStatusIndex = 0;
1345             fLastStatusIndexValid = TRUE;
1346         } else {
1347             //  Not at start of text.  Find status the tedious way.
1348             int32_t pa = current();
1349             previous();
1350             if (fNumCachedBreakPositions > 0) {
1351                 reset();                // Blow off the dictionary cache
1352             }
1353             int32_t pb = next();
1354             if (pa != pb) {
1355                 // note: the if (pa != pb) test is here only to eliminate warnings for
1356                 //       unused local variables on gcc.  Logically, it isn't needed.
1357                 U_ASSERT(pa == pb);
1358             }
1359         }
1360     }
1361     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1362 }
1363 
1364 
getRuleStatus() const1365 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1366     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1367     nonConstThis->makeRuleStatusValid();
1368 
1369     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1370     //                                                 (the number of status values.)
1371     //   This function returns the last (largest) of the array of status values.
1372     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1373     int32_t  tagVal = fData->fRuleStatusTable[idx];
1374 
1375     return tagVal;
1376 }
1377 
1378 
1379 
1380 
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)1381 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1382              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1383 {
1384     if (U_FAILURE(status)) {
1385         return 0;
1386     }
1387 
1388     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1389     nonConstThis->makeRuleStatusValid();
1390     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1391     int32_t  numValsToCopy = numVals;
1392     if (numVals > capacity) {
1393         status = U_BUFFER_OVERFLOW_ERROR;
1394         numValsToCopy = capacity;
1395     }
1396     int i;
1397     for (i=0; i<numValsToCopy; i++) {
1398         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1399     }
1400     return numVals;
1401 }
1402 
1403 
1404 
1405 //-------------------------------------------------------------------------------
1406 //
1407 //   getBinaryRules        Access to the compiled form of the rules,
1408 //                         for use by build system tools that save the data
1409 //                         for standard iterator types.
1410 //
1411 //-------------------------------------------------------------------------------
getBinaryRules(uint32_t & length)1412 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1413     const uint8_t  *retPtr = NULL;
1414     length = 0;
1415 
1416     if (fData != NULL) {
1417         retPtr = (const uint8_t *)fData->fHeader;
1418         length = fData->fHeader->fLength;
1419     }
1420     return retPtr;
1421 }
1422 
1423 
1424 
1425 
1426 //-------------------------------------------------------------------------------
1427 //
1428 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1429 //                    Saving one heap allocation isn't worth the trouble.
1430 //                    Cloning shouldn't be done in tight loops, and
1431 //                    making the clone copy involves other heap operations anyway.
1432 //                    And the application code for correctly dealing with buffer
1433 //                    size problems and the eventual object destruction is ugly.
1434 //
1435 //-------------------------------------------------------------------------------
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)1436 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1437                                    int32_t &bufferSize,
1438                                    UErrorCode &status)
1439 {
1440     if (U_FAILURE(status)){
1441         return NULL;
1442     }
1443 
1444     //
1445     //  If user buffer size is zero this is a preflight operation to
1446     //    obtain the needed buffer size, allowing for worst case misalignment.
1447     //
1448     if (bufferSize == 0) {
1449         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1450         return NULL;
1451     }
1452 
1453 
1454     //
1455     //  Check the alignment and size of the user supplied buffer.
1456     //  Allocate heap memory if the user supplied memory is insufficient.
1457     //
1458     char    *buf   = (char *)stackBuffer;
1459     uint32_t s      = bufferSize;
1460 
1461     if (stackBuffer == NULL) {
1462         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1463     }
1464     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1465         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1466         s   -= offsetUp;
1467         buf += offsetUp;
1468     }
1469     if (s < sizeof(RuleBasedBreakIterator)) {
1470         // Not enough room in the caller-supplied buffer.
1471         // Do a plain-vanilla heap based clone and return that, along with
1472         //   a warning that the clone was allocated.
1473         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1474         if (clonedBI == 0) {
1475             status = U_MEMORY_ALLOCATION_ERROR;
1476         } else {
1477             status = U_SAFECLONE_ALLOCATED_WARNING;
1478         }
1479         return clonedBI;
1480     }
1481 
1482     //
1483     //  Clone the source BI into the caller-supplied buffer.
1484     //    TODO:  using an overloaded operator new to directly initialize the
1485     //           copy in the user's buffer would be better, but it doesn't seem
1486     //           to get along with namespaces.  Investigate why.
1487     //
1488     //           The memcpy is only safe with an empty (default constructed)
1489     //           break iterator.  Use on others can screw up reference counts
1490     //           to data.  memcpy-ing objects is not really a good idea...
1491     //
1492     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1493     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1494     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1495     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
1496     *clone = *this;               // clone = the real BI we want.
1497     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1498 
1499     return clone;
1500 }
1501 
1502 
1503 //-------------------------------------------------------------------------------
1504 //
1505 //  isDictionaryChar      Return true if the category lookup for this char
1506 //                        indicates that it is in the set of dictionary lookup
1507 //                        chars.
1508 //
1509 //                        This function is intended for use by dictionary based
1510 //                        break iterators.
1511 //
1512 //-------------------------------------------------------------------------------
1513 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1514     if (fData == NULL) {
1515         return FALSE;
1516     }
1517     uint16_t category;
1518     UTRIE_GET16(&fData->fTrie, c, category);
1519     return (category & 0x4000) != 0;
1520 }*/
1521 
1522 
1523 //-------------------------------------------------------------------------------
1524 //
1525 //  checkDictionary       This function handles all processing of characters in
1526 //                        the "dictionary" set. It will determine the appropriate
1527 //                        course of action, and possibly set up a cache in the
1528 //                        process.
1529 //
1530 //-------------------------------------------------------------------------------
checkDictionary(int32_t startPos,int32_t endPos,UBool reverse)1531 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1532                             int32_t endPos,
1533                             UBool reverse) {
1534     // Reset the old break cache first.
1535     uint32_t dictionaryCount = fDictionaryCharCount;
1536     reset();
1537 
1538     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1539         return (reverse ? startPos : endPos);
1540     }
1541 
1542     // Starting from the starting point, scan towards the proposed result,
1543     // looking for the first dictionary character (which may be the one
1544     // we're on, if we're starting in the middle of a range).
1545     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1546     if (reverse) {
1547         UTEXT_PREVIOUS32(fText);
1548     }
1549 
1550     int32_t rangeStart = startPos;
1551     int32_t rangeEnd = endPos;
1552 
1553     uint16_t    category;
1554     int32_t     current;
1555     UErrorCode  status = U_ZERO_ERROR;
1556     UStack      breaks(status);
1557     int32_t     foundBreakCount = 0;
1558     UChar32     c = utext_current32(fText);
1559 
1560     UTRIE_GET16(&fData->fTrie, c, category);
1561 
1562     // Is the character we're starting on a dictionary character? If so, we
1563     // need to back up to include the entire run; otherwise the results of
1564     // the break algorithm will differ depending on where we start. Since
1565     // the result is cached and there is typically a non-dictionary break
1566     // within a small number of words, there should be little performance impact.
1567     if (category & 0x4000) {
1568         if (reverse) {
1569             do {
1570                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1571                 c = utext_current32(fText);
1572                 UTRIE_GET16(&fData->fTrie, c, category);
1573             } while (c != U_SENTINEL && (category & 0x4000));
1574             // Back up to the last dictionary character
1575             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1576             if (c == U_SENTINEL) {
1577                 // c = fText->last32();
1578                 //   TODO:  why was this if needed?
1579                 c = UTEXT_PREVIOUS32(fText);
1580             }
1581             else {
1582                 c = UTEXT_PREVIOUS32(fText);
1583             }
1584         }
1585         else {
1586             do {
1587                 c = UTEXT_PREVIOUS32(fText);
1588                 UTRIE_GET16(&fData->fTrie, c, category);
1589             }
1590             while (c != U_SENTINEL && (category & 0x4000));
1591             // Back up to the last dictionary character
1592             if (c == U_SENTINEL) {
1593                 // c = fText->first32();
1594                 c = utext_current32(fText);
1595             }
1596             else {
1597                 utext_next32(fText);
1598                 c = utext_current32(fText);
1599             }
1600             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1601         }
1602         UTRIE_GET16(&fData->fTrie, c, category);
1603     }
1604 
1605     // Loop through the text, looking for ranges of dictionary characters.
1606     // For each span, find the appropriate break engine, and ask it to find
1607     // any breaks within the span.
1608     // Note: we always do this in the forward direction, so that the break
1609     // cache is built in the right order.
1610     if (reverse) {
1611         utext_setNativeIndex(fText, rangeStart);
1612         c = utext_current32(fText);
1613         UTRIE_GET16(&fData->fTrie, c, category);
1614     }
1615     while(U_SUCCESS(status)) {
1616         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1617             utext_next32(fText);           // TODO:  tweak for post-increment operation
1618             c = utext_current32(fText);
1619             UTRIE_GET16(&fData->fTrie, c, category);
1620         }
1621         if (current >= rangeEnd) {
1622             break;
1623         }
1624 
1625         // We now have a dictionary character. Get the appropriate language object
1626         // to deal with it.
1627         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1628 
1629         // Ask the language object if there are any breaks. It will leave the text
1630         // pointer on the other side of its range, ready to search for the next one.
1631         if (lbe != NULL) {
1632             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1633         }
1634 
1635         // Reload the loop variables for the next go-round
1636         c = utext_current32(fText);
1637         UTRIE_GET16(&fData->fTrie, c, category);
1638     }
1639 
1640     // If we found breaks, build a new break cache. The first and last entries must
1641     // be the original starting and ending position.
1642     if (foundBreakCount > 0) {
1643         int32_t totalBreaks = foundBreakCount;
1644         if (startPos < breaks.elementAti(0)) {
1645             totalBreaks += 1;
1646         }
1647         if (endPos > breaks.peeki()) {
1648             totalBreaks += 1;
1649         }
1650         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1651         if (fCachedBreakPositions != NULL) {
1652             int32_t out = 0;
1653             fNumCachedBreakPositions = totalBreaks;
1654             if (startPos < breaks.elementAti(0)) {
1655                 fCachedBreakPositions[out++] = startPos;
1656             }
1657             for (int32_t i = 0; i < foundBreakCount; ++i) {
1658                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1659             }
1660             if (endPos > fCachedBreakPositions[out-1]) {
1661                 fCachedBreakPositions[out] = endPos;
1662             }
1663             // If there are breaks, then by definition, we are replacing the original
1664             // proposed break by one of the breaks we found. Use following() and
1665             // preceding() to do the work. They should never recurse in this case.
1666             if (reverse) {
1667                 return preceding(endPos - 1);
1668             }
1669             else {
1670                 return following(startPos);
1671             }
1672         }
1673         // If the allocation failed, just fall through to the "no breaks found" case.
1674     }
1675 
1676     // If we get here, there were no language-based breaks. Set the text pointer
1677     // to the original proposed break.
1678     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1679     return (reverse ? startPos : endPos);
1680 }
1681 
1682 U_NAMESPACE_END
1683 
1684 // defined in ucln_cmn.h
1685 
1686 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1687 
1688 /**
1689  * Release all static memory held by breakiterator.
1690  */
1691 U_CDECL_BEGIN
breakiterator_cleanup_dict(void)1692 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1693     if (gLanguageBreakFactories) {
1694         delete gLanguageBreakFactories;
1695         gLanguageBreakFactories = NULL;
1696     }
1697     return TRUE;
1698 }
1699 U_CDECL_END
1700 
1701 U_CDECL_BEGIN
_deleteFactory(void * obj)1702 static void U_CALLCONV _deleteFactory(void *obj) {
1703     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1704 }
1705 U_CDECL_END
1706 U_NAMESPACE_BEGIN
1707 
1708 static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c,int32_t breakType)1709 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1710 {
1711     UBool       needsInit;
1712     UErrorCode  status = U_ZERO_ERROR;
1713     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1714 
1715     if (needsInit) {
1716         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1717         if (U_SUCCESS(status)) {
1718             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1719             factories->push(builtIn, status);
1720 #ifdef U_LOCAL_SERVICE_HOOK
1721             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1722             if (extra != NULL) {
1723                 factories->push(extra, status);
1724             }
1725 #endif
1726         }
1727         umtx_lock(NULL);
1728         if (gLanguageBreakFactories == NULL) {
1729             gLanguageBreakFactories = factories;
1730             factories = NULL;
1731             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1732         }
1733         umtx_unlock(NULL);
1734         delete factories;
1735     }
1736 
1737     if (gLanguageBreakFactories == NULL) {
1738         return NULL;
1739     }
1740 
1741     int32_t i = gLanguageBreakFactories->size();
1742     const LanguageBreakEngine *lbe = NULL;
1743     while (--i >= 0) {
1744         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1745         lbe = factory->getEngineFor(c, breakType);
1746         if (lbe != NULL) {
1747             break;
1748         }
1749     }
1750     return lbe;
1751 }
1752 
1753 
1754 //-------------------------------------------------------------------------------
1755 //
1756 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1757 //                          the characer c.
1758 //
1759 //-------------------------------------------------------------------------------
1760 const LanguageBreakEngine *
getLanguageBreakEngine(UChar32 c)1761 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1762     const LanguageBreakEngine *lbe = NULL;
1763     UErrorCode status = U_ZERO_ERROR;
1764 
1765     if (fLanguageBreakEngines == NULL) {
1766         fLanguageBreakEngines = new UStack(status);
1767         if (U_FAILURE(status)) {
1768             delete fLanguageBreakEngines;
1769             fLanguageBreakEngines = 0;
1770             return NULL;
1771         }
1772     }
1773 
1774     int32_t i = fLanguageBreakEngines->size();
1775     while (--i >= 0) {
1776         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1777         if (lbe->handles(c, fBreakType)) {
1778             return lbe;
1779         }
1780     }
1781 
1782     // No existing dictionary took the character. See if a factory wants to
1783     // give us a new LanguageBreakEngine for this character.
1784     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1785 
1786     // If we got one, use it and push it on our stack.
1787     if (lbe != NULL) {
1788         fLanguageBreakEngines->push((void *)lbe, status);
1789         // Even if we can't remember it, we can keep looking it up, so
1790         // return it even if the push fails.
1791         return lbe;
1792     }
1793 
1794     // No engine is forthcoming for this character. Add it to the
1795     // reject set. Create the reject break engine if needed.
1796     if (fUnhandledBreakEngine == NULL) {
1797         fUnhandledBreakEngine = new UnhandledEngine(status);
1798         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1799             status = U_MEMORY_ALLOCATION_ERROR;
1800         }
1801         // Put it last so that scripts for which we have an engine get tried
1802         // first.
1803         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1804         // If we can't insert it, or creation failed, get rid of it
1805         if (U_FAILURE(status)) {
1806             delete fUnhandledBreakEngine;
1807             fUnhandledBreakEngine = 0;
1808             return NULL;
1809         }
1810     }
1811 
1812     // Tell the reject engine about the character; at its discretion, it may
1813     // add more than just the one character.
1814     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1815 
1816     return fUnhandledBreakEngine;
1817 }
1818 
1819 
1820 
1821 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1822     return fBreakType;
1823 }*/
1824 
setBreakType(int32_t type)1825 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1826     fBreakType = type;
1827     reset();
1828 }
1829 
1830 U_NAMESPACE_END
1831 
1832 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1833