1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2007 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 */
7 //
8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator
11 //
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/udata.h"
21 #include "unicode/uclean.h"
22 #include "rbbidata.h"
23 #include "rbbirb.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "umutex.h"
27 #include "ucln_cmn.h"
28 #include "brkeng.h"
29
30 #include "uassert.h"
31 #include "uvector.h"
32
33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34 #if U_LOCAL_SERVICE_HOOK
35 #include "localsvc.h"
36 #endif
37
38 #ifdef RBBI_DEBUG
39 static UBool fTrace = FALSE;
40 #endif
41
42 U_NAMESPACE_BEGIN
43
44
45 static const int16_t START_STATE = 1; // The state number of the starting state
46 static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
47
48
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)49 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
50
51
52 //=======================================================================
53 // constructors
54 //=======================================================================
55
56 /**
57 * Constructs a RuleBasedBreakIterator that uses the already-created
58 * tables object that is passed in as a parameter.
59 */
60 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
61 {
62 init();
63 fData = new RBBIDataWrapper(data, status); // status checked in constructor
64 if (U_FAILURE(status)) {return;}
65 if(fData == 0) {
66 status = U_MEMORY_ALLOCATION_ERROR;
67 return;
68 }
69 }
70
71 //-------------------------------------------------------------------------------
72 //
73 // Constructor from a UDataMemory handle to precompiled break rules
74 // stored in an ICU data file.
75 //
76 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(UDataMemory * udm,UErrorCode & status)77 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
78 {
79 init();
80 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
81 if (U_FAILURE(status)) {return;}
82 if(fData == 0) {
83 status = U_MEMORY_ALLOCATION_ERROR;
84 return;
85 }
86 }
87
88
89
90 //-------------------------------------------------------------------------------
91 //
92 // Constructor from a set of rules supplied as a string.
93 //
94 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const UnicodeString & rules,UParseError & parseError,UErrorCode & status)95 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
96 UParseError &parseError,
97 UErrorCode &status)
98 {
99 init();
100 if (U_FAILURE(status)) {return;}
101 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
102 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
103 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
104 // creates and returns a complete RBBI. From here, in a constructor, we
105 // can't just return the object created by the builder factory, hence
106 // the assignment of the factory created object to "this".
107 if (U_SUCCESS(status)) {
108 *this = *bi;
109 delete bi;
110 }
111 }
112
113
114 //-------------------------------------------------------------------------------
115 //
116 // Default Constructor. Create an empty shell that can be set up later.
117 // Used when creating a RuleBasedBreakIterator from a set
118 // of rules.
119 //-------------------------------------------------------------------------------
RuleBasedBreakIterator()120 RuleBasedBreakIterator::RuleBasedBreakIterator() {
121 init();
122 }
123
124
125 //-------------------------------------------------------------------------------
126 //
127 // Copy constructor. Will produce a break iterator with the same behavior,
128 // and which iterates over the same text, as the one passed in.
129 //
130 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const RuleBasedBreakIterator & other)131 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
132 : BreakIterator(other)
133 {
134 this->init();
135 *this = other;
136 }
137
138
139 /**
140 * Destructor
141 */
~RuleBasedBreakIterator()142 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
143 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
144 // fCharIter was adopted from the outside.
145 delete fCharIter;
146 }
147 fCharIter = NULL;
148 delete fSCharIter;
149 fCharIter = NULL;
150 delete fDCharIter;
151 fDCharIter = NULL;
152
153 utext_close(fText);
154
155 if (fData != NULL) {
156 fData->removeReference();
157 fData = NULL;
158 }
159 if (fCachedBreakPositions) {
160 uprv_free(fCachedBreakPositions);
161 fCachedBreakPositions = NULL;
162 }
163 if (fLanguageBreakEngines) {
164 delete fLanguageBreakEngines;
165 fLanguageBreakEngines = NULL;
166 }
167 if (fUnhandledBreakEngine) {
168 delete fUnhandledBreakEngine;
169 fUnhandledBreakEngine = NULL;
170 }
171 }
172
173 /**
174 * Assignment operator. Sets this iterator to have the same behavior,
175 * and iterate over the same text, as the one passed in.
176 */
177 RuleBasedBreakIterator&
operator =(const RuleBasedBreakIterator & that)178 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
179 if (this == &that) {
180 return *this;
181 }
182 reset(); // Delete break cache information
183 fBreakType = that.fBreakType;
184 if (fLanguageBreakEngines != NULL) {
185 delete fLanguageBreakEngines;
186 fLanguageBreakEngines = NULL; // Just rebuild for now
187 }
188 // TODO: clone fLanguageBreakEngines from "that"
189 UErrorCode status = U_ZERO_ERROR;
190 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
191
192 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
193 delete fCharIter;
194 }
195 fCharIter = NULL;
196
197 if (that.fCharIter != NULL ) {
198 // This is a little bit tricky - it will intially appear that
199 // this->fCharIter is adopted, even if that->fCharIter was
200 // not adopted. That's ok.
201 fCharIter = that.fCharIter->clone();
202 }
203
204 if (fData != NULL) {
205 fData->removeReference();
206 fData = NULL;
207 }
208 if (that.fData != NULL) {
209 fData = that.fData->addReference();
210 }
211
212 return *this;
213 }
214
215
216
217 //-----------------------------------------------------------------------------
218 //
219 // init() Shared initialization routine. Used by all the constructors.
220 // Initializes all fields, leaving the object in a consistent state.
221 //
222 //-----------------------------------------------------------------------------
init()223 void RuleBasedBreakIterator::init() {
224 UErrorCode status = U_ZERO_ERROR;
225 fBufferClone = FALSE;
226 fText = utext_openUChars(NULL, NULL, 0, &status);
227 fCharIter = NULL;
228 fSCharIter = NULL;
229 fDCharIter = NULL;
230 fData = NULL;
231 fLastRuleStatusIndex = 0;
232 fLastStatusIndexValid = TRUE;
233 fDictionaryCharCount = 0;
234 fBreakType = -1;
235
236 fCachedBreakPositions = NULL;
237 fLanguageBreakEngines = NULL;
238 fUnhandledBreakEngine = NULL;
239 fNumCachedBreakPositions = 0;
240 fPositionInCache = 0;
241
242 #ifdef RBBI_DEBUG
243 static UBool debugInitDone = FALSE;
244 if (debugInitDone == FALSE) {
245 char *debugEnv = getenv("U_RBBIDEBUG");
246 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
247 fTrace = TRUE;
248 }
249 debugInitDone = TRUE;
250 }
251 #endif
252 }
253
254
255
256 //-----------------------------------------------------------------------------
257 //
258 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
259 // behavior, and iterating over the same text, as this one.
260 // Virtual function: does the right thing with subclasses.
261 //
262 //-----------------------------------------------------------------------------
263 BreakIterator*
clone(void) const264 RuleBasedBreakIterator::clone(void) const {
265 return new RuleBasedBreakIterator(*this);
266 }
267
268 /**
269 * Equality operator. Returns TRUE if both BreakIterators are of the
270 * same class, have the same behavior, and iterate over the same text.
271 */
272 UBool
operator ==(const BreakIterator & that) const273 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
274 if (that.getDynamicClassID() != getDynamicClassID()) {
275 return FALSE;
276 }
277
278 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
279
280 if (!utext_equals(fText, that2.fText)) {
281 // The two break iterators are operating on different text,
282 // or have a different interation position.
283 return FALSE;
284 };
285
286 // TODO: need a check for when in a dictionary region at different offsets.
287
288 if (that2.fData == fData ||
289 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
290 // The two break iterators are using the same rules.
291 return TRUE;
292 }
293 return FALSE;
294 }
295
296 /**
297 * Compute a hash code for this BreakIterator
298 * @return A hash code
299 */
300 int32_t
hashCode(void) const301 RuleBasedBreakIterator::hashCode(void) const {
302 int32_t hash = 0;
303 if (fData != NULL) {
304 hash = fData->hashCode();
305 }
306 return hash;
307 }
308
309
setText(UText * ut,UErrorCode & status)310 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
311 if (U_FAILURE(status)) {
312 return;
313 }
314 reset();
315 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
316
317 // Set up a dummy CharacterIterator to be returned if anyone
318 // calls getText(). With input from UText, there is no reasonable
319 // way to return a characterIterator over the actual input text.
320 // Return one over an empty string instead - this is the closest
321 // we can come to signaling a failure.
322 // (GetText() is obsolete, this failure is sort of OK)
323 if (fDCharIter == NULL) {
324 static UChar c = 0;
325 fDCharIter = new UCharCharacterIterator(&c, 0);
326 }
327
328 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
329 // existing fCharIter was adopted from the outside. Delete it now.
330 delete fCharIter;
331 }
332 fCharIter = fDCharIter;
333
334 this->first();
335 }
336
337
getUText(UText * fillIn,UErrorCode & status) const338 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
339 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
340 return result;
341 }
342
343
344
345 /**
346 * Returns the description used to create this iterator
347 */
348 const UnicodeString&
getRules() const349 RuleBasedBreakIterator::getRules() const {
350 if (fData != NULL) {
351 return fData->getRuleSourceString();
352 } else {
353 static const UnicodeString *s;
354 if (s == NULL) {
355 // TODO: something more elegant here.
356 // perhaps API should return the string by value.
357 // Note: thread unsafe init & leak are semi-ok, better than
358 // what was before. Sould be cleaned up, though.
359 s = new UnicodeString;
360 }
361 return *s;
362 }
363 }
364
365 //=======================================================================
366 // BreakIterator overrides
367 //=======================================================================
368
369 /**
370 * Return a CharacterIterator over the text being analyzed.
371 */
372 CharacterIterator&
getText() const373 RuleBasedBreakIterator::getText() const {
374 return *fCharIter;
375 }
376
377 /**
378 * Set the iterator to analyze a new piece of text. This function resets
379 * the current iteration position to the beginning of the text.
380 * @param newText An iterator over the text to analyze.
381 */
382 void
adoptText(CharacterIterator * newText)383 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
384 // If we are holding a CharacterIterator adopted from a
385 // previous call to this function, delete it now.
386 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
387 delete fCharIter;
388 }
389
390 fCharIter = newText;
391 UErrorCode status = U_ZERO_ERROR;
392 reset();
393 if (newText==NULL || newText->startIndex() != 0) {
394 // startIndex !=0 wants to be an error, but there's no way to report it.
395 // Make the iterator text be an empty string.
396 fText = utext_openUChars(fText, NULL, 0, &status);
397 } else {
398 fText = utext_openCharacterIterator(fText, newText, &status);
399 }
400 this->first();
401 }
402
403 /**
404 * Set the iterator to analyze a new piece of text. This function resets
405 * the current iteration position to the beginning of the text.
406 * @param newText An iterator over the text to analyze.
407 */
408 void
setText(const UnicodeString & newText)409 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
410 UErrorCode status = U_ZERO_ERROR;
411 reset();
412 fText = utext_openConstUnicodeString(fText, &newText, &status);
413
414 // Set up a character iterator on the string.
415 // Needed in case someone calls getText().
416 // Can not, unfortunately, do this lazily on the (probably never)
417 // call to getText(), because getText is const.
418 if (fSCharIter == NULL) {
419 fSCharIter = new StringCharacterIterator(newText);
420 } else {
421 fSCharIter->setText(newText);
422 }
423
424 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
425 // old fCharIter was adopted from the outside. Delete it.
426 delete fCharIter;
427 }
428 fCharIter = fSCharIter;
429
430 this->first();
431 }
432
433
434
435 /**
436 * Sets the current iteration position to the beginning of the text.
437 * @return The offset of the beginning of the text.
438 */
first(void)439 int32_t RuleBasedBreakIterator::first(void) {
440 reset();
441 fLastRuleStatusIndex = 0;
442 fLastStatusIndexValid = TRUE;
443 //if (fText == NULL)
444 // return BreakIterator::DONE;
445
446 utext_setNativeIndex(fText, 0);
447 return 0;
448 }
449
450 /**
451 * Sets the current iteration position to the end of the text.
452 * @return The text's past-the-end offset.
453 */
last(void)454 int32_t RuleBasedBreakIterator::last(void) {
455 reset();
456 if (fText == NULL) {
457 fLastRuleStatusIndex = 0;
458 fLastStatusIndexValid = TRUE;
459 return BreakIterator::DONE;
460 }
461
462 fLastStatusIndexValid = FALSE;
463 int32_t pos = (int32_t)utext_nativeLength(fText);
464 utext_setNativeIndex(fText, pos);
465 return pos;
466 }
467
468 /**
469 * Advances the iterator either forward or backward the specified number of steps.
470 * Negative values move backward, and positive values move forward. This is
471 * equivalent to repeatedly calling next() or previous().
472 * @param n The number of steps to move. The sign indicates the direction
473 * (negative is backwards, and positive is forwards).
474 * @return The character offset of the boundary position n boundaries away from
475 * the current one.
476 */
next(int32_t n)477 int32_t RuleBasedBreakIterator::next(int32_t n) {
478 int32_t result = current();
479 while (n > 0) {
480 result = next();
481 --n;
482 }
483 while (n < 0) {
484 result = previous();
485 ++n;
486 }
487 return result;
488 }
489
490 /**
491 * Advances the iterator to the next boundary position.
492 * @return The position of the first boundary after this one.
493 */
next(void)494 int32_t RuleBasedBreakIterator::next(void) {
495 // if we have cached break positions and we're still in the range
496 // covered by them, just move one step forward in the cache
497 if (fCachedBreakPositions != NULL) {
498 if (fPositionInCache < fNumCachedBreakPositions - 1) {
499 ++fPositionInCache;
500 int32_t pos = fCachedBreakPositions[fPositionInCache];
501 utext_setNativeIndex(fText, pos);
502 return pos;
503 }
504 else {
505 reset();
506 }
507 }
508
509 int32_t startPos = current();
510 int32_t result = handleNext(fData->fForwardTable);
511 if (fDictionaryCharCount > 0) {
512 result = checkDictionary(startPos, result, FALSE);
513 }
514 return result;
515 }
516
517 /**
518 * Advances the iterator backwards, to the last boundary preceding this one.
519 * @return The position of the last boundary position preceding this one.
520 */
previous(void)521 int32_t RuleBasedBreakIterator::previous(void) {
522 int32_t result;
523 int32_t startPos;
524
525 // if we have cached break positions and we're still in the range
526 // covered by them, just move one step backward in the cache
527 if (fCachedBreakPositions != NULL) {
528 if (fPositionInCache > 0) {
529 --fPositionInCache;
530 // If we're at the beginning of the cache, need to reevaluate the
531 // rule status
532 if (fPositionInCache <= 0) {
533 fLastStatusIndexValid = FALSE;
534 }
535 int32_t pos = fCachedBreakPositions[fPositionInCache];
536 utext_setNativeIndex(fText, pos);
537 return pos;
538 }
539 else {
540 reset();
541 }
542 }
543
544 // if we're already sitting at the beginning of the text, return DONE
545 if (fText == NULL || (startPos = current()) == 0) {
546 fLastRuleStatusIndex = 0;
547 fLastStatusIndexValid = TRUE;
548 return BreakIterator::DONE;
549 }
550
551 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
552 result = handlePrevious(fData->fReverseTable);
553 if (fDictionaryCharCount > 0) {
554 result = checkDictionary(result, startPos, TRUE);
555 }
556 return result;
557 }
558
559 // old rule syntax
560 // set things up. handlePrevious() will back us up to some valid
561 // break position before the current position (we back our internal
562 // iterator up one step to prevent handlePrevious() from returning
563 // the current position), but not necessarily the last one before
564
565 // where we started
566
567 int32_t start = current();
568
569 UTEXT_PREVIOUS32(fText);
570 int32_t lastResult = handlePrevious(fData->fReverseTable);
571 if (lastResult == UBRK_DONE) {
572 lastResult = 0;
573 utext_setNativeIndex(fText, 0);
574 }
575 result = lastResult;
576 int32_t lastTag = 0;
577 UBool breakTagValid = FALSE;
578
579 // iterate forward from the known break position until we pass our
580 // starting point. The last break position before the starting
581 // point is our return value
582
583 for (;;) {
584 result = next();
585 if (result == BreakIterator::DONE || result >= start) {
586 break;
587 }
588 lastResult = result;
589 lastTag = fLastRuleStatusIndex;
590 breakTagValid = TRUE;
591 }
592
593 // fLastBreakTag wants to have the value for section of text preceding
594 // the result position that we are to return (in lastResult.) If
595 // the backwards rules overshot and the above loop had to do two or more
596 // next()s to move up to the desired return position, we will have a valid
597 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
598 // we wont have a tag value for that position, which is only set by handleNext().
599
600 // set the current iteration position to be the last break position
601 // before where we started, and then return that value
602 utext_setNativeIndex(fText, lastResult);
603 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
604 fLastStatusIndexValid = breakTagValid;
605
606 // No need to check the dictionary; it will have been handled by
607 // next()
608
609 return lastResult;
610 }
611
612 /**
613 * Sets the iterator to refer to the first boundary position following
614 * the specified position.
615 * @offset The position from which to begin searching for a break position.
616 * @return The position of the first break after the current position.
617 */
following(int32_t offset)618 int32_t RuleBasedBreakIterator::following(int32_t offset) {
619 // if we have cached break positions and offset is in the range
620 // covered by them, use them
621 // TODO: could use binary search
622 // TODO: what if offset is outside range, but break is not?
623 if (fCachedBreakPositions != NULL) {
624 if (offset >= fCachedBreakPositions[0]
625 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
626 fPositionInCache = 0;
627 // We are guaranteed not to leave the array due to range test above
628 while (offset >= fCachedBreakPositions[fPositionInCache]) {
629 ++fPositionInCache;
630 }
631 int32_t pos = fCachedBreakPositions[fPositionInCache];
632 utext_setNativeIndex(fText, pos);
633 return pos;
634 }
635 else {
636 reset();
637 }
638 }
639
640 // if the offset passed in is already past the end of the text,
641 // just return DONE; if it's before the beginning, return the
642 // text's starting offset
643 fLastRuleStatusIndex = 0;
644 fLastStatusIndexValid = TRUE;
645 if (fText == NULL || offset >= utext_nativeLength(fText)) {
646 last();
647 return next();
648 }
649 else if (offset < 0) {
650 return first();
651 }
652
653 // otherwise, set our internal iteration position (temporarily)
654 // to the position passed in. If this is the _beginning_ position,
655 // then we can just use next() to get our return value
656
657 int32_t result = 0;
658
659 if (fData->fSafeRevTable != NULL) {
660 // new rule syntax
661 utext_setNativeIndex(fText, offset);
662 // move forward one codepoint to prepare for moving back to a
663 // safe point.
664 // this handles offset being between a supplementary character
665 UTEXT_NEXT32(fText);
666 // handlePrevious will move most of the time to < 1 boundary away
667 handlePrevious(fData->fSafeRevTable);
668 int32_t result = next();
669 while (result <= offset) {
670 result = next();
671 }
672 return result;
673 }
674 if (fData->fSafeFwdTable != NULL) {
675 // backup plan if forward safe table is not available
676 utext_setNativeIndex(fText, offset);
677 UTEXT_PREVIOUS32(fText);
678 // handle next will give result >= offset
679 handleNext(fData->fSafeFwdTable);
680 // previous will give result 0 or 1 boundary away from offset,
681 // most of the time
682 // we have to
683 int32_t oldresult = previous();
684 while (oldresult > offset) {
685 int32_t result = previous();
686 if (result <= offset) {
687 return oldresult;
688 }
689 oldresult = result;
690 }
691 int32_t result = next();
692 if (result <= offset) {
693 return next();
694 }
695 return result;
696 }
697 // otherwise, we have to sync up first. Use handlePrevious() to back
698 // up to a known break position before the specified position (if
699 // we can determine that the specified position is a break position,
700 // we don't back up at all). This may or may not be the last break
701 // position at or before our starting position. Advance forward
702 // from here until we've passed the starting position. The position
703 // we stop on will be the first break position after the specified one.
704 // old rule syntax
705
706 utext_setNativeIndex(fText, offset);
707 if (offset==0 ||
708 offset==1 && utext_getNativeIndex(fText)==0) {
709 return next();
710 }
711 result = previous();
712
713 while (result != BreakIterator::DONE && result <= offset) {
714 result = next();
715 }
716
717 return result;
718 }
719
720 /**
721 * Sets the iterator to refer to the last boundary position before the
722 * specified position.
723 * @offset The position to begin searching for a break from.
724 * @return The position of the last boundary before the starting position.
725 */
preceding(int32_t offset)726 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
727 // if we have cached break positions and offset is in the range
728 // covered by them, use them
729 if (fCachedBreakPositions != NULL) {
730 // TODO: binary search?
731 // TODO: What if offset is outside range, but break is not?
732 if (offset > fCachedBreakPositions[0]
733 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
734 fPositionInCache = 0;
735 while (fPositionInCache < fNumCachedBreakPositions
736 && offset > fCachedBreakPositions[fPositionInCache])
737 ++fPositionInCache;
738 --fPositionInCache;
739 // If we're at the beginning of the cache, need to reevaluate the
740 // rule status
741 if (fPositionInCache <= 0) {
742 fLastStatusIndexValid = FALSE;
743 }
744 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
745 return fCachedBreakPositions[fPositionInCache];
746 }
747 else {
748 reset();
749 }
750 }
751
752 // if the offset passed in is already past the end of the text,
753 // just return DONE; if it's before the beginning, return the
754 // text's starting offset
755 if (fText == NULL || offset > utext_nativeLength(fText)) {
756 // return BreakIterator::DONE;
757 return last();
758 }
759 else if (offset < 0) {
760 return first();
761 }
762
763 // if we start by updating the current iteration position to the
764 // position specified by the caller, we can just use previous()
765 // to carry out this operation
766
767 if (fData->fSafeFwdTable != NULL) {
768 // new rule syntax
769 utext_setNativeIndex(fText, offset);
770 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
771 if (newOffset != offset) {
772 // Will come here if specified offset was not a code point boundary AND
773 // the underlying implmentation is using UText, which snaps any non-code-point-boundary
774 // indices to the containing code point.
775 // For breakitereator::preceding only, these non-code-point indices need to be moved
776 // up to refer to the following codepoint.
777 UTEXT_NEXT32(fText);
778 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
779 }
780
781 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
782 // rather than adjusting the position unconditionally?
783 // (Change would interact with safe rules.)
784 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
785 // affects only preceding(), seems cleaner, but is slightly different.
786 UTEXT_PREVIOUS32(fText);
787 handleNext(fData->fSafeFwdTable);
788 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
789 while (result >= offset) {
790 result = previous();
791 }
792 return result;
793 }
794 if (fData->fSafeRevTable != NULL) {
795 // backup plan if forward safe table is not available
796 // TODO: check whether this path can be discarded
797 // It's probably OK to say that rules must supply both safe tables
798 // if they use safe tables at all. We have certainly never described
799 // to anyone how to work with just one safe table.
800 utext_setNativeIndex(fText, offset);
801 UTEXT_NEXT32(fText);
802
803 // handle previous will give result <= offset
804 handlePrevious(fData->fSafeRevTable);
805
806 // next will give result 0 or 1 boundary away from offset,
807 // most of the time
808 // we have to
809 int32_t oldresult = next();
810 while (oldresult < offset) {
811 int32_t result = next();
812 if (result >= offset) {
813 return oldresult;
814 }
815 oldresult = result;
816 }
817 int32_t result = previous();
818 if (result >= offset) {
819 return previous();
820 }
821 return result;
822 }
823
824 // old rule syntax
825 utext_setNativeIndex(fText, offset);
826 return previous();
827 }
828
829 /**
830 * Returns true if the specfied position is a boundary position. As a side
831 * effect, leaves the iterator pointing to the first boundary position at
832 * or after "offset".
833 * @param offset the offset to check.
834 * @return True if "offset" is a boundary position.
835 */
isBoundary(int32_t offset)836 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
837 // the beginning index of the iterator is always a boundary position by definition
838 if (offset == 0) {
839 first(); // For side effects on current position, tag values.
840 return TRUE;
841 }
842
843 if (offset == (int32_t)utext_nativeLength(fText)) {
844 last(); // For side effects on current position, tag values.
845 return TRUE;
846 }
847
848 // out-of-range indexes are never boundary positions
849 if (offset < 0) {
850 first(); // For side effects on current position, tag values.
851 return FALSE;
852 }
853
854 if (offset > utext_nativeLength(fText)) {
855 last(); // For side effects on current position, tag values.
856 return FALSE;
857 }
858
859 // otherwise, we can use following() on the position before the specified
860 // one and return true if the position we get back is the one the user
861 // specified
862 utext_previous32From(fText, offset);
863 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
864 UBool result = following(backOne) == offset;
865 return result;
866 }
867
868 /**
869 * Returns the current iteration position.
870 * @return The current iteration position.
871 */
current(void) const872 int32_t RuleBasedBreakIterator::current(void) const {
873 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
874 return pos;
875 }
876
877 //=======================================================================
878 // implementation
879 //=======================================================================
880
881 //
882 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
883 // of user text. A variable with this enum type keeps track of where we
884 // are. The state machine only fetches user input while in the RUN mode.
885 //
886 enum RBBIRunMode {
887 RBBI_START, // state machine processing is before first char of input
888 RBBI_RUN, // state machine processing is in the user text
889 RBBI_END // state machine processing is after end of user text.
890 };
891
892
893 //-----------------------------------------------------------------------------------
894 //
895 // handleNext(stateTable)
896 // This method is the actual implementation of the rbbi next() method.
897 // This method initializes the state machine to state 1
898 // and advances through the text character by character until we reach the end
899 // of the text or the state machine transitions to state 0. We update our return
900 // value every time the state machine passes through an accepting state.
901 //
902 //-----------------------------------------------------------------------------------
handleNext(const RBBIStateTable * statetable)903 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
904 int32_t state;
905 int16_t category = 0;
906 RBBIRunMode mode;
907
908 RBBIStateTableRow *row;
909 UChar32 c;
910 int32_t lookaheadStatus = 0;
911 int32_t lookaheadTagIdx = 0;
912 int32_t result = 0;
913 int32_t initialPosition = 0;
914 int32_t lookaheadResult = 0;
915 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
916 const char *tableData = statetable->fTableData;
917 uint32_t tableRowLen = statetable->fRowLen;
918
919 #ifdef RBBI_DEBUG
920 if (fTrace) {
921 RBBIDebugPuts("Handle Next pos char state category");
922 }
923 #endif
924
925 // No matter what, handleNext alway correctly sets the break tag value.
926 fLastStatusIndexValid = TRUE;
927 fLastRuleStatusIndex = 0;
928
929 // if we're already at the end of the text, return DONE.
930 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
931 result = initialPosition;
932 c = UTEXT_NEXT32(fText);
933 if (fData == NULL || c==U_SENTINEL) {
934 return BreakIterator::DONE;
935 }
936
937 // Set the initial state for the state machine
938 state = START_STATE;
939 row = (RBBIStateTableRow *)
940 //(statetable->fTableData + (statetable->fRowLen * state));
941 (tableData + tableRowLen * state);
942
943
944 mode = RBBI_RUN;
945 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
946 category = 2;
947 mode = RBBI_START;
948 }
949
950
951 // loop until we reach the end of the text or transition to state 0
952 //
953 for (;;) {
954 if (c == U_SENTINEL) {
955 // Reached end of input string.
956 if (mode == RBBI_END) {
957 // We have already run the loop one last time with the
958 // character set to the psueudo {eof} value. Now it is time
959 // to unconditionally bail out.
960 if (lookaheadResult > result) {
961 // We ran off the end of the string with a pending look-ahead match.
962 // Treat this as if the look-ahead condition had been met, and return
963 // the match at the / position from the look-ahead rule.
964 result = lookaheadResult;
965 fLastRuleStatusIndex = lookaheadTagIdx;
966 lookaheadStatus = 0;
967 }
968 break;
969 }
970 // Run the loop one last time with the fake end-of-input character category.
971 mode = RBBI_END;
972 category = 1;
973 }
974
975 //
976 // Get the char category. An incoming category of 1 or 2 means that
977 // we are preset for doing the beginning or end of input, and
978 // that we shouldn't get a category from an actual text input character.
979 //
980 if (mode == RBBI_RUN) {
981 // look up the current character's character category, which tells us
982 // which column in the state table to look at.
983 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
984 // not the size of the character going in, which is a UChar32.
985 //
986 UTRIE_GET16(&fData->fTrie, c, category);
987
988 // Check the dictionary bit in the character's category.
989 // Counter is only used by dictionary based iterators (subclasses).
990 // Chars that need to be handled by a dictionary have a flag bit set
991 // in their category values.
992 //
993 if ((category & 0x4000) != 0) {
994 fDictionaryCharCount++;
995 // And off the dictionary flag bit.
996 category &= ~0x4000;
997 }
998 }
999
1000 #ifdef RBBI_DEBUG
1001 if (fTrace) {
1002 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
1003 if (0x20<=c && c<0x7f) {
1004 RBBIDebugPrintf("\"%c\" ", c);
1005 } else {
1006 RBBIDebugPrintf("%5x ", c);
1007 }
1008 RBBIDebugPrintf("%3d %3d\n", state, category);
1009 }
1010 #endif
1011
1012 // State Transition - move machine to its next state
1013 //
1014 state = row->fNextState[category];
1015 row = (RBBIStateTableRow *)
1016 // (statetable->fTableData + (statetable->fRowLen * state));
1017 (tableData + tableRowLen * state);
1018
1019
1020 if (row->fAccepting == -1) {
1021 // Match found, common case.
1022 if (mode != RBBI_START) {
1023 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1024 }
1025 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
1026 }
1027
1028 if (row->fLookAhead != 0) {
1029 if (lookaheadStatus != 0
1030 && row->fAccepting == lookaheadStatus) {
1031 // Lookahead match is completed.
1032 result = lookaheadResult;
1033 fLastRuleStatusIndex = lookaheadTagIdx;
1034 lookaheadStatus = 0;
1035 // TODO: make a standalone hard break in a rule work.
1036 if (lookAheadHardBreak) {
1037 UTEXT_SETNATIVEINDEX(fText, result);
1038 return result;
1039 }
1040 // Look-ahead completed, but other rules may match further. Continue on
1041 // TODO: junk this feature? I don't think it's used anywhwere.
1042 goto continueOn;
1043 }
1044
1045 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1046 lookaheadResult = r;
1047 lookaheadStatus = row->fLookAhead;
1048 lookaheadTagIdx = row->fTagIdx;
1049 goto continueOn;
1050 }
1051
1052
1053 if (row->fAccepting != 0) {
1054 // Because this is an accepting state, any in-progress look-ahead match
1055 // is no longer relavant. Clear out the pending lookahead status.
1056 lookaheadStatus = 0; // clear out any pending look-ahead match.
1057 }
1058
1059 continueOn:
1060 if (state == STOP_STATE) {
1061 // This is the normal exit from the lookup state machine.
1062 // We have advanced through the string until it is certain that no
1063 // longer match is possible, no matter what characters follow.
1064 break;
1065 }
1066
1067 // Advance to the next character.
1068 // If this is a beginning-of-input loop iteration, don't advance
1069 // the input position. The next iteration will be processing the
1070 // first real input character.
1071 if (mode == RBBI_RUN) {
1072 c = UTEXT_NEXT32(fText);
1073 } else {
1074 if (mode == RBBI_START) {
1075 mode = RBBI_RUN;
1076 }
1077 }
1078
1079
1080 }
1081
1082 // The state machine is done. Check whether it found a match...
1083
1084 // If the iterator failed to advance in the match engine, force it ahead by one.
1085 // (This really indicates a defect in the break rules. They should always match
1086 // at least one character.)
1087 if (result == initialPosition) {
1088 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1089 UTEXT_NEXT32(fText);
1090 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1091 }
1092
1093 // Leave the iterator at our result position.
1094 UTEXT_SETNATIVEINDEX(fText, result);
1095 #ifdef RBBI_DEBUG
1096 if (fTrace) {
1097 RBBIDebugPrintf("result = %d\n\n", result);
1098 }
1099 #endif
1100 return result;
1101 }
1102
1103
1104
1105 //-----------------------------------------------------------------------------------
1106 //
1107 // handlePrevious()
1108 //
1109 // Iterate backwards, according to the logic of the reverse rules.
1110 // This version handles the exact style backwards rules.
1111 //
1112 // The logic of this function is very similar to handleNext(), above.
1113 //
1114 //-----------------------------------------------------------------------------------
handlePrevious(const RBBIStateTable * statetable)1115 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1116 int32_t state;
1117 int16_t category = 0;
1118 RBBIRunMode mode;
1119 RBBIStateTableRow *row;
1120 UChar32 c;
1121 int32_t lookaheadStatus = 0;
1122 int32_t result = 0;
1123 int32_t initialPosition = 0;
1124 int32_t lookaheadResult = 0;
1125 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1126
1127 #ifdef RBBI_DEBUG
1128 if (fTrace) {
1129 RBBIDebugPuts("Handle Previous pos char state category");
1130 }
1131 #endif
1132
1133 // handlePrevious() never gets the rule status.
1134 // Flag the status as invalid; if the user ever asks for status, we will need
1135 // to back up, then re-find the break position using handleNext(), which does
1136 // get the status value.
1137 fLastStatusIndexValid = FALSE;
1138 fLastRuleStatusIndex = 0;
1139
1140 // if we're already at the start of the text, return DONE.
1141 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1142 return BreakIterator::DONE;
1143 }
1144
1145 // Set up the starting char.
1146 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1147 result = initialPosition;
1148 c = UTEXT_PREVIOUS32(fText);
1149
1150 // Set the initial state for the state machine
1151 state = START_STATE;
1152 row = (RBBIStateTableRow *)
1153 (statetable->fTableData + (statetable->fRowLen * state));
1154 category = 3;
1155 mode = RBBI_RUN;
1156 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1157 category = 2;
1158 mode = RBBI_START;
1159 }
1160
1161
1162 // loop until we reach the start of the text or transition to state 0
1163 //
1164 for (;;) {
1165 if (c == U_SENTINEL) {
1166 // Reached end of input string.
1167 if (mode == RBBI_END ||
1168 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1169 // We have already run the loop one last time with the
1170 // character set to the psueudo {eof} value. Now it is time
1171 // to unconditionally bail out.
1172 // (Or we have an old format binary rule file that does not support {eof}.)
1173 if (lookaheadResult < result) {
1174 // We ran off the end of the string with a pending look-ahead match.
1175 // Treat this as if the look-ahead condition had been met, and return
1176 // the match at the / position from the look-ahead rule.
1177 result = lookaheadResult;
1178 lookaheadStatus = 0;
1179 } else if (result == initialPosition) {
1180 // Ran off start, no match found.
1181 // move one index one (towards the start, since we are doing a previous())
1182 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1183 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
1184 }
1185 break;
1186 }
1187 // Run the loop one last time with the fake end-of-input character category.
1188 mode = RBBI_END;
1189 category = 1;
1190 }
1191
1192 //
1193 // Get the char category. An incoming category of 1 or 2 means that
1194 // we are preset for doing the beginning or end of input, and
1195 // that we shouldn't get a category from an actual text input character.
1196 //
1197 if (mode == RBBI_RUN) {
1198 // look up the current character's character category, which tells us
1199 // which column in the state table to look at.
1200 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1201 // not the size of the character going in, which is a UChar32.
1202 //
1203 UTRIE_GET16(&fData->fTrie, c, category);
1204
1205 // Check the dictionary bit in the character's category.
1206 // Counter is only used by dictionary based iterators (subclasses).
1207 // Chars that need to be handled by a dictionary have a flag bit set
1208 // in their category values.
1209 //
1210 if ((category & 0x4000) != 0) {
1211 fDictionaryCharCount++;
1212 // And off the dictionary flag bit.
1213 category &= ~0x4000;
1214 }
1215 }
1216
1217 #ifdef RBBI_DEBUG
1218 if (fTrace) {
1219 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
1220 if (0x20<=c && c<0x7f) {
1221 RBBIDebugPrintf("\"%c\" ", c);
1222 } else {
1223 RBBIDebugPrintf("%5x ", c);
1224 }
1225 RBBIDebugPrintf("%3d %3d\n", state, category);
1226 }
1227 #endif
1228
1229 // State Transition - move machine to its next state
1230 //
1231 state = row->fNextState[category];
1232 row = (RBBIStateTableRow *)
1233 (statetable->fTableData + (statetable->fRowLen * state));
1234
1235 if (row->fAccepting == -1) {
1236 // Match found, common case.
1237 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1238 }
1239
1240 if (row->fLookAhead != 0) {
1241 if (lookaheadStatus != 0
1242 && row->fAccepting == lookaheadStatus) {
1243 // Lookahead match is completed.
1244 result = lookaheadResult;
1245 lookaheadStatus = 0;
1246 // TODO: make a standalone hard break in a rule work.
1247 if (lookAheadHardBreak) {
1248 UTEXT_SETNATIVEINDEX(fText, result);
1249 return result;
1250 }
1251 // Look-ahead completed, but other rules may match further. Continue on
1252 // TODO: junk this feature? I don't think it's used anywhwere.
1253 goto continueOn;
1254 }
1255
1256 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1257 lookaheadResult = r;
1258 lookaheadStatus = row->fLookAhead;
1259 goto continueOn;
1260 }
1261
1262
1263 if (row->fAccepting != 0) {
1264 // Because this is an accepting state, any in-progress look-ahead match
1265 // is no longer relavant. Clear out the pending lookahead status.
1266 lookaheadStatus = 0;
1267 }
1268
1269 continueOn:
1270 if (state == STOP_STATE) {
1271 // This is the normal exit from the lookup state machine.
1272 // We have advanced through the string until it is certain that no
1273 // longer match is possible, no matter what characters follow.
1274 break;
1275 }
1276
1277 // Move (backwards) to the next character to process.
1278 // If this is a beginning-of-input loop iteration, don't advance
1279 // the input position. The next iteration will be processing the
1280 // first real input character.
1281 if (mode == RBBI_RUN) {
1282 c = UTEXT_PREVIOUS32(fText);
1283 } else {
1284 if (mode == RBBI_START) {
1285 mode = RBBI_RUN;
1286 }
1287 }
1288 }
1289
1290 // The state machine is done. Check whether it found a match...
1291
1292 // If the iterator failed to advance in the match engine, force it ahead by one.
1293 // (This really indicates a defect in the break rules. They should always match
1294 // at least one character.)
1295 if (result == initialPosition) {
1296 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1297 UTEXT_PREVIOUS32(fText);
1298 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1299 }
1300
1301 // Leave the iterator at our result position.
1302 UTEXT_SETNATIVEINDEX(fText, result);
1303 #ifdef RBBI_DEBUG
1304 if (fTrace) {
1305 RBBIDebugPrintf("result = %d\n\n", result);
1306 }
1307 #endif
1308 return result;
1309 }
1310
1311
1312 void
reset()1313 RuleBasedBreakIterator::reset()
1314 {
1315 if (fCachedBreakPositions) {
1316 uprv_free(fCachedBreakPositions);
1317 }
1318 fCachedBreakPositions = NULL;
1319 fNumCachedBreakPositions = 0;
1320 fDictionaryCharCount = 0;
1321 fPositionInCache = 0;
1322 }
1323
1324
1325
1326 //-------------------------------------------------------------------------------
1327 //
1328 // getRuleStatus() Return the break rule tag associated with the current
1329 // iterator position. If the iterator arrived at its current
1330 // position by iterating forwards, the value will have been
1331 // cached by the handleNext() function.
1332 //
1333 // If no cached status value is available, the status is
1334 // found by doing a previous() followed by a next(), which
1335 // leaves the iterator where it started, and computes the
1336 // status while doing the next().
1337 //
1338 //-------------------------------------------------------------------------------
makeRuleStatusValid()1339 void RuleBasedBreakIterator::makeRuleStatusValid() {
1340 if (fLastStatusIndexValid == FALSE) {
1341 // No cached status is available.
1342 if (fText == NULL || current() == 0) {
1343 // At start of text, or there is no text. Status is always zero.
1344 fLastRuleStatusIndex = 0;
1345 fLastStatusIndexValid = TRUE;
1346 } else {
1347 // Not at start of text. Find status the tedious way.
1348 int32_t pa = current();
1349 previous();
1350 if (fNumCachedBreakPositions > 0) {
1351 reset(); // Blow off the dictionary cache
1352 }
1353 int32_t pb = next();
1354 if (pa != pb) {
1355 // note: the if (pa != pb) test is here only to eliminate warnings for
1356 // unused local variables on gcc. Logically, it isn't needed.
1357 U_ASSERT(pa == pb);
1358 }
1359 }
1360 }
1361 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
1362 }
1363
1364
getRuleStatus() const1365 int32_t RuleBasedBreakIterator::getRuleStatus() const {
1366 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1367 nonConstThis->makeRuleStatusValid();
1368
1369 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1370 // (the number of status values.)
1371 // This function returns the last (largest) of the array of status values.
1372 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1373 int32_t tagVal = fData->fRuleStatusTable[idx];
1374
1375 return tagVal;
1376 }
1377
1378
1379
1380
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)1381 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1382 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1383 {
1384 if (U_FAILURE(status)) {
1385 return 0;
1386 }
1387
1388 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1389 nonConstThis->makeRuleStatusValid();
1390 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1391 int32_t numValsToCopy = numVals;
1392 if (numVals > capacity) {
1393 status = U_BUFFER_OVERFLOW_ERROR;
1394 numValsToCopy = capacity;
1395 }
1396 int i;
1397 for (i=0; i<numValsToCopy; i++) {
1398 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1399 }
1400 return numVals;
1401 }
1402
1403
1404
1405 //-------------------------------------------------------------------------------
1406 //
1407 // getBinaryRules Access to the compiled form of the rules,
1408 // for use by build system tools that save the data
1409 // for standard iterator types.
1410 //
1411 //-------------------------------------------------------------------------------
getBinaryRules(uint32_t & length)1412 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1413 const uint8_t *retPtr = NULL;
1414 length = 0;
1415
1416 if (fData != NULL) {
1417 retPtr = (const uint8_t *)fData->fHeader;
1418 length = fData->fHeader->fLength;
1419 }
1420 return retPtr;
1421 }
1422
1423
1424
1425
1426 //-------------------------------------------------------------------------------
1427 //
1428 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1429 // Saving one heap allocation isn't worth the trouble.
1430 // Cloning shouldn't be done in tight loops, and
1431 // making the clone copy involves other heap operations anyway.
1432 // And the application code for correctly dealing with buffer
1433 // size problems and the eventual object destruction is ugly.
1434 //
1435 //-------------------------------------------------------------------------------
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)1436 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1437 int32_t &bufferSize,
1438 UErrorCode &status)
1439 {
1440 if (U_FAILURE(status)){
1441 return NULL;
1442 }
1443
1444 //
1445 // If user buffer size is zero this is a preflight operation to
1446 // obtain the needed buffer size, allowing for worst case misalignment.
1447 //
1448 if (bufferSize == 0) {
1449 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1450 return NULL;
1451 }
1452
1453
1454 //
1455 // Check the alignment and size of the user supplied buffer.
1456 // Allocate heap memory if the user supplied memory is insufficient.
1457 //
1458 char *buf = (char *)stackBuffer;
1459 uint32_t s = bufferSize;
1460
1461 if (stackBuffer == NULL) {
1462 s = 0; // Ignore size, force allocation if user didn't give us a buffer.
1463 }
1464 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1465 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1466 s -= offsetUp;
1467 buf += offsetUp;
1468 }
1469 if (s < sizeof(RuleBasedBreakIterator)) {
1470 // Not enough room in the caller-supplied buffer.
1471 // Do a plain-vanilla heap based clone and return that, along with
1472 // a warning that the clone was allocated.
1473 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1474 if (clonedBI == 0) {
1475 status = U_MEMORY_ALLOCATION_ERROR;
1476 } else {
1477 status = U_SAFECLONE_ALLOCATED_WARNING;
1478 }
1479 return clonedBI;
1480 }
1481
1482 //
1483 // Clone the source BI into the caller-supplied buffer.
1484 // TODO: using an overloaded operator new to directly initialize the
1485 // copy in the user's buffer would be better, but it doesn't seem
1486 // to get along with namespaces. Investigate why.
1487 //
1488 // The memcpy is only safe with an empty (default constructed)
1489 // break iterator. Use on others can screw up reference counts
1490 // to data. memcpy-ing objects is not really a good idea...
1491 //
1492 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy
1493 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1494 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1495 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor)
1496 *clone = *this; // clone = the real BI we want.
1497 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code)
1498
1499 return clone;
1500 }
1501
1502
1503 //-------------------------------------------------------------------------------
1504 //
1505 // isDictionaryChar Return true if the category lookup for this char
1506 // indicates that it is in the set of dictionary lookup
1507 // chars.
1508 //
1509 // This function is intended for use by dictionary based
1510 // break iterators.
1511 //
1512 //-------------------------------------------------------------------------------
1513 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
1514 if (fData == NULL) {
1515 return FALSE;
1516 }
1517 uint16_t category;
1518 UTRIE_GET16(&fData->fTrie, c, category);
1519 return (category & 0x4000) != 0;
1520 }*/
1521
1522
1523 //-------------------------------------------------------------------------------
1524 //
1525 // checkDictionary This function handles all processing of characters in
1526 // the "dictionary" set. It will determine the appropriate
1527 // course of action, and possibly set up a cache in the
1528 // process.
1529 //
1530 //-------------------------------------------------------------------------------
checkDictionary(int32_t startPos,int32_t endPos,UBool reverse)1531 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1532 int32_t endPos,
1533 UBool reverse) {
1534 // Reset the old break cache first.
1535 uint32_t dictionaryCount = fDictionaryCharCount;
1536 reset();
1537
1538 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1539 return (reverse ? startPos : endPos);
1540 }
1541
1542 // Starting from the starting point, scan towards the proposed result,
1543 // looking for the first dictionary character (which may be the one
1544 // we're on, if we're starting in the middle of a range).
1545 utext_setNativeIndex(fText, reverse ? endPos : startPos);
1546 if (reverse) {
1547 UTEXT_PREVIOUS32(fText);
1548 }
1549
1550 int32_t rangeStart = startPos;
1551 int32_t rangeEnd = endPos;
1552
1553 uint16_t category;
1554 int32_t current;
1555 UErrorCode status = U_ZERO_ERROR;
1556 UStack breaks(status);
1557 int32_t foundBreakCount = 0;
1558 UChar32 c = utext_current32(fText);
1559
1560 UTRIE_GET16(&fData->fTrie, c, category);
1561
1562 // Is the character we're starting on a dictionary character? If so, we
1563 // need to back up to include the entire run; otherwise the results of
1564 // the break algorithm will differ depending on where we start. Since
1565 // the result is cached and there is typically a non-dictionary break
1566 // within a small number of words, there should be little performance impact.
1567 if (category & 0x4000) {
1568 if (reverse) {
1569 do {
1570 utext_next32(fText); // TODO: recast to work directly with postincrement.
1571 c = utext_current32(fText);
1572 UTRIE_GET16(&fData->fTrie, c, category);
1573 } while (c != U_SENTINEL && (category & 0x4000));
1574 // Back up to the last dictionary character
1575 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1576 if (c == U_SENTINEL) {
1577 // c = fText->last32();
1578 // TODO: why was this if needed?
1579 c = UTEXT_PREVIOUS32(fText);
1580 }
1581 else {
1582 c = UTEXT_PREVIOUS32(fText);
1583 }
1584 }
1585 else {
1586 do {
1587 c = UTEXT_PREVIOUS32(fText);
1588 UTRIE_GET16(&fData->fTrie, c, category);
1589 }
1590 while (c != U_SENTINEL && (category & 0x4000));
1591 // Back up to the last dictionary character
1592 if (c == U_SENTINEL) {
1593 // c = fText->first32();
1594 c = utext_current32(fText);
1595 }
1596 else {
1597 utext_next32(fText);
1598 c = utext_current32(fText);
1599 }
1600 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1601 }
1602 UTRIE_GET16(&fData->fTrie, c, category);
1603 }
1604
1605 // Loop through the text, looking for ranges of dictionary characters.
1606 // For each span, find the appropriate break engine, and ask it to find
1607 // any breaks within the span.
1608 // Note: we always do this in the forward direction, so that the break
1609 // cache is built in the right order.
1610 if (reverse) {
1611 utext_setNativeIndex(fText, rangeStart);
1612 c = utext_current32(fText);
1613 UTRIE_GET16(&fData->fTrie, c, category);
1614 }
1615 while(U_SUCCESS(status)) {
1616 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1617 utext_next32(fText); // TODO: tweak for post-increment operation
1618 c = utext_current32(fText);
1619 UTRIE_GET16(&fData->fTrie, c, category);
1620 }
1621 if (current >= rangeEnd) {
1622 break;
1623 }
1624
1625 // We now have a dictionary character. Get the appropriate language object
1626 // to deal with it.
1627 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1628
1629 // Ask the language object if there are any breaks. It will leave the text
1630 // pointer on the other side of its range, ready to search for the next one.
1631 if (lbe != NULL) {
1632 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1633 }
1634
1635 // Reload the loop variables for the next go-round
1636 c = utext_current32(fText);
1637 UTRIE_GET16(&fData->fTrie, c, category);
1638 }
1639
1640 // If we found breaks, build a new break cache. The first and last entries must
1641 // be the original starting and ending position.
1642 if (foundBreakCount > 0) {
1643 int32_t totalBreaks = foundBreakCount;
1644 if (startPos < breaks.elementAti(0)) {
1645 totalBreaks += 1;
1646 }
1647 if (endPos > breaks.peeki()) {
1648 totalBreaks += 1;
1649 }
1650 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1651 if (fCachedBreakPositions != NULL) {
1652 int32_t out = 0;
1653 fNumCachedBreakPositions = totalBreaks;
1654 if (startPos < breaks.elementAti(0)) {
1655 fCachedBreakPositions[out++] = startPos;
1656 }
1657 for (int32_t i = 0; i < foundBreakCount; ++i) {
1658 fCachedBreakPositions[out++] = breaks.elementAti(i);
1659 }
1660 if (endPos > fCachedBreakPositions[out-1]) {
1661 fCachedBreakPositions[out] = endPos;
1662 }
1663 // If there are breaks, then by definition, we are replacing the original
1664 // proposed break by one of the breaks we found. Use following() and
1665 // preceding() to do the work. They should never recurse in this case.
1666 if (reverse) {
1667 return preceding(endPos - 1);
1668 }
1669 else {
1670 return following(startPos);
1671 }
1672 }
1673 // If the allocation failed, just fall through to the "no breaks found" case.
1674 }
1675
1676 // If we get here, there were no language-based breaks. Set the text pointer
1677 // to the original proposed break.
1678 utext_setNativeIndex(fText, reverse ? startPos : endPos);
1679 return (reverse ? startPos : endPos);
1680 }
1681
1682 U_NAMESPACE_END
1683
1684 // defined in ucln_cmn.h
1685
1686 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1687
1688 /**
1689 * Release all static memory held by breakiterator.
1690 */
1691 U_CDECL_BEGIN
breakiterator_cleanup_dict(void)1692 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1693 if (gLanguageBreakFactories) {
1694 delete gLanguageBreakFactories;
1695 gLanguageBreakFactories = NULL;
1696 }
1697 return TRUE;
1698 }
1699 U_CDECL_END
1700
1701 U_CDECL_BEGIN
_deleteFactory(void * obj)1702 static void U_CALLCONV _deleteFactory(void *obj) {
1703 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1704 }
1705 U_CDECL_END
1706 U_NAMESPACE_BEGIN
1707
1708 static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c,int32_t breakType)1709 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1710 {
1711 UBool needsInit;
1712 UErrorCode status = U_ZERO_ERROR;
1713 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1714
1715 if (needsInit) {
1716 UStack *factories = new UStack(_deleteFactory, NULL, status);
1717 if (U_SUCCESS(status)) {
1718 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1719 factories->push(builtIn, status);
1720 #ifdef U_LOCAL_SERVICE_HOOK
1721 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1722 if (extra != NULL) {
1723 factories->push(extra, status);
1724 }
1725 #endif
1726 }
1727 umtx_lock(NULL);
1728 if (gLanguageBreakFactories == NULL) {
1729 gLanguageBreakFactories = factories;
1730 factories = NULL;
1731 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1732 }
1733 umtx_unlock(NULL);
1734 delete factories;
1735 }
1736
1737 if (gLanguageBreakFactories == NULL) {
1738 return NULL;
1739 }
1740
1741 int32_t i = gLanguageBreakFactories->size();
1742 const LanguageBreakEngine *lbe = NULL;
1743 while (--i >= 0) {
1744 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1745 lbe = factory->getEngineFor(c, breakType);
1746 if (lbe != NULL) {
1747 break;
1748 }
1749 }
1750 return lbe;
1751 }
1752
1753
1754 //-------------------------------------------------------------------------------
1755 //
1756 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1757 // the characer c.
1758 //
1759 //-------------------------------------------------------------------------------
1760 const LanguageBreakEngine *
getLanguageBreakEngine(UChar32 c)1761 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1762 const LanguageBreakEngine *lbe = NULL;
1763 UErrorCode status = U_ZERO_ERROR;
1764
1765 if (fLanguageBreakEngines == NULL) {
1766 fLanguageBreakEngines = new UStack(status);
1767 if (U_FAILURE(status)) {
1768 delete fLanguageBreakEngines;
1769 fLanguageBreakEngines = 0;
1770 return NULL;
1771 }
1772 }
1773
1774 int32_t i = fLanguageBreakEngines->size();
1775 while (--i >= 0) {
1776 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1777 if (lbe->handles(c, fBreakType)) {
1778 return lbe;
1779 }
1780 }
1781
1782 // No existing dictionary took the character. See if a factory wants to
1783 // give us a new LanguageBreakEngine for this character.
1784 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1785
1786 // If we got one, use it and push it on our stack.
1787 if (lbe != NULL) {
1788 fLanguageBreakEngines->push((void *)lbe, status);
1789 // Even if we can't remember it, we can keep looking it up, so
1790 // return it even if the push fails.
1791 return lbe;
1792 }
1793
1794 // No engine is forthcoming for this character. Add it to the
1795 // reject set. Create the reject break engine if needed.
1796 if (fUnhandledBreakEngine == NULL) {
1797 fUnhandledBreakEngine = new UnhandledEngine(status);
1798 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1799 status = U_MEMORY_ALLOCATION_ERROR;
1800 }
1801 // Put it last so that scripts for which we have an engine get tried
1802 // first.
1803 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1804 // If we can't insert it, or creation failed, get rid of it
1805 if (U_FAILURE(status)) {
1806 delete fUnhandledBreakEngine;
1807 fUnhandledBreakEngine = 0;
1808 return NULL;
1809 }
1810 }
1811
1812 // Tell the reject engine about the character; at its discretion, it may
1813 // add more than just the one character.
1814 fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1815
1816 return fUnhandledBreakEngine;
1817 }
1818
1819
1820
1821 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1822 return fBreakType;
1823 }*/
1824
setBreakType(int32_t type)1825 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1826 fBreakType = type;
1827 reset();
1828 }
1829
1830 U_NAMESPACE_END
1831
1832 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1833