1 //
2 // rbbisetb.cpp
3 //
4 /*
5 ***************************************************************************
6 * Copyright (C) 2002-2005 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
9 */
10 //
11 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
13 //
14 // Starting with the rules parse tree from the scanner,
15 //
16 // - Enumerate the set of UnicodeSets that are referenced
17 // by the RBBI rules.
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input Unicode sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
28 // the previous step.
29 //
30
31 #include "unicode/utypes.h"
32
33 #if !UCONFIG_NO_BREAK_ITERATION
34
35 #include "unicode/uniset.h"
36 #include "utrie.h"
37 #include "uvector.h"
38 #include "uassert.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41
42 #include "rbbisetb.h"
43 #include "rbbinode.h"
44
45
46 //------------------------------------------------------------------------
47 //
48 // getFoldedRBBIValue Call-back function used during building of Trie table.
49 // Folding value: just store the offset (16 bits)
50 // if there is any non-0 entry.
51 // (It'd really be nice if the Trie builder would provide a
52 // simple default, so this function could go away from here.)
53 //
54 //------------------------------------------------------------------------
55 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
56 U_CDECL_BEGIN
57 static uint32_t U_CALLCONV
getFoldedRBBIValue(UNewTrie * trie,UChar32 start,int32_t offset)58 getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
59 uint32_t value;
60 UChar32 limit;
61 UBool inBlockZero;
62
63 limit=start+0x400;
64 while(start<limit) {
65 value=utrie_get32(trie, start, &inBlockZero);
66 if(inBlockZero) {
67 start+=UTRIE_DATA_BLOCK_LENGTH;
68 } else if(value!=0) {
69 return (uint32_t)(offset|0x8000);
70 } else {
71 ++start;
72 }
73 }
74 return 0;
75 }
76
77
78 U_CDECL_END
79
80
81
82 U_NAMESPACE_BEGIN
83
84 //------------------------------------------------------------------------
85 //
86 // Constructor
87 //
88 //------------------------------------------------------------------------
RBBISetBuilder(RBBIRuleBuilder * rb)89 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
90 {
91 fRB = rb;
92 fStatus = rb->fStatus;
93 fRangeList = 0;
94 fTrie = 0;
95 fTrieSize = 0;
96 fGroupCount = 0;
97 fSawBOF = FALSE;
98 }
99
100
101 //------------------------------------------------------------------------
102 //
103 // Destructor
104 //
105 //------------------------------------------------------------------------
~RBBISetBuilder()106 RBBISetBuilder::~RBBISetBuilder()
107 {
108 RangeDescriptor *nextRangeDesc;
109
110 // Walk through & delete the linked list of RangeDescriptors
111 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
112 RangeDescriptor *r = nextRangeDesc;
113 nextRangeDesc = r->fNext;
114 delete r;
115 }
116
117 utrie_close(fTrie);
118 }
119
120
121
122
123 //------------------------------------------------------------------------
124 //
125 // build Build the list of non-overlapping character ranges
126 // from the Unicode Sets.
127 //
128 //------------------------------------------------------------------------
build()129 void RBBISetBuilder::build() {
130 RBBINode *usetNode;
131 RangeDescriptor *rlRange;
132
133 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
134
135 //
136 // Initialize the process by creating a single range encompassing all characters
137 // that is in no sets.
138 //
139 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
140 fRangeList->fStartChar = 0;
141 fRangeList->fEndChar = 0x10ffff;
142
143 if (U_FAILURE(*fStatus)) {
144 return;
145 }
146
147 //
148 // Find the set of non-overlapping ranges of characters
149 //
150 int ni;
151 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
152 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
153 if (usetNode==NULL) {
154 break;
155 }
156
157 UnicodeSet *inputSet = usetNode->fInputSet;
158 int32_t inputSetRangeCount = inputSet->getRangeCount();
159 int inputSetRangeIndex = 0;
160 rlRange = fRangeList;
161
162 for (;;) {
163 if (inputSetRangeIndex >= inputSetRangeCount) {
164 break;
165 }
166 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
167 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
168
169 // skip over ranges from the range list that are completely
170 // below the current range from the input unicode set.
171 while (rlRange->fEndChar < inputSetRangeBegin) {
172 rlRange = rlRange->fNext;
173 }
174
175 // If the start of the range from the range list is before with
176 // the start of the range from the unicode set, split the range list range
177 // in two, with one part being before (wholly outside of) the unicode set
178 // and the other containing the rest.
179 // Then continue the loop; the post-split current range will then be skipped
180 // over
181 if (rlRange->fStartChar < inputSetRangeBegin) {
182 rlRange->split(inputSetRangeBegin, *fStatus);
183 if (U_FAILURE(*fStatus)) {
184 return;
185 }
186 continue;
187 }
188
189 // Same thing at the end of the ranges...
190 // If the end of the range from the range list doesn't coincide with
191 // the end of the range from the unicode set, split the range list
192 // range in two. The first part of the split range will be
193 // wholly inside the Unicode set.
194 if (rlRange->fEndChar > inputSetRangeEnd) {
195 rlRange->split(inputSetRangeEnd+1, *fStatus);
196 if (U_FAILURE(*fStatus)) {
197 return;
198 }
199 }
200
201 // The current rlRange is now entirely within the UnicodeSet range.
202 // Add this unicode set to the list of sets for this rlRange
203 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
204 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
205 if (U_FAILURE(*fStatus)) {
206 return;
207 }
208 }
209
210 // Advance over ranges that we are finished with.
211 if (inputSetRangeEnd == rlRange->fEndChar) {
212 inputSetRangeIndex++;
213 }
214 rlRange = rlRange->fNext;
215 }
216 }
217
218 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
219
220 //
221 // Group the above ranges, with each group consisting of one or more
222 // ranges that are in exactly the same set of original UnicodeSets.
223 // The groups are numbered, and these group numbers are the set of
224 // input symbols recognized by the run-time state machine.
225 //
226 // Numbering: # 0 (state table column 0) is unused.
227 // # 1 is reserved - table column 1 is for end-of-input
228 // # 2 is reserved - table column 2 is for beginning-in-input
229 // # 3 is the first range list.
230 //
231 RangeDescriptor *rlSearchRange;
232 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
233 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
234 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
235 rlRange->fNum = rlSearchRange->fNum;
236 break;
237 }
238 }
239 if (rlRange->fNum == 0) {
240 fGroupCount ++;
241 rlRange->fNum = fGroupCount+2;
242 rlRange->setDictionaryFlag();
243 addValToSets(rlRange->fIncludesSets, fGroupCount+2);
244 }
245 }
246
247 // Handle input sets that contain the special string {eof}.
248 // Column 1 of the state table is reserved for EOF on input.
249 // Column 2 is reserved for before-the-start-input.
250 // (This column can be optimized away later if there are no rule
251 // references to {bof}.)
252 // Add this column value (1 or 2) to the equivalent expression
253 // subtree for each UnicodeSet that contains the string {eof}
254 // Because {bof} and {eof} are not a characters in the normal sense,
255 // they doesn't affect the computation of ranges or TRIE.
256 static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
257 static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
258
259 UnicodeString eofString(eofUString);
260 UnicodeString bofString(bofUString);
261 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
262 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
263 if (usetNode==NULL) {
264 break;
265 }
266 UnicodeSet *inputSet = usetNode->fInputSet;
267 if (inputSet->contains(eofString)) {
268 addValToSet(usetNode, 1);
269 }
270 if (inputSet->contains(bofString)) {
271 addValToSet(usetNode, 2);
272 fSawBOF = TRUE;
273 }
274 }
275
276
277 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
278 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
279
280 //
281 // Build the Trie table for mapping UChar32 values to the corresponding
282 // range group number
283 //
284 fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
285 NULL, // Data array (utrie will allocate one)
286 100000, // Max Data Length
287 0, // Initial value for all code points
288 0, // Lead surrogate unit value
289 TRUE); // Keep Latin 1 in separately
290
291
292 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
293 utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
294 }
295 }
296
297
298
299 //-----------------------------------------------------------------------------------
300 //
301 // getTrieSize() Return the size that will be required to serialize the Trie.
302 //
303 //-----------------------------------------------------------------------------------
getTrieSize()304 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
305 fTrieSize = utrie_serialize(fTrie,
306 NULL, // Buffer
307 0, // Capacity
308 getFoldedRBBIValue,
309 TRUE, // Reduce to 16 bits
310 fStatus);
311 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
312 return fTrieSize;
313 }
314
315
316 //-----------------------------------------------------------------------------------
317 //
318 // serializeTrie() Put the serialized trie at the specified address.
319 // Trust the caller to have given us enough memory.
320 // getTrieSize() MUST be called first.
321 //
322 //-----------------------------------------------------------------------------------
serializeTrie(uint8_t * where)323 void RBBISetBuilder::serializeTrie(uint8_t *where) {
324 utrie_serialize(fTrie,
325 where, // Buffer
326 fTrieSize, // Capacity
327 getFoldedRBBIValue,
328 TRUE, // Reduce to 16 bits
329 fStatus);
330 }
331
332 //------------------------------------------------------------------------
333 //
334 // addValToSets Add a runtime-mapped input value to each uset from a
335 // list of uset nodes. (val corresponds to a state table column.)
336 // For each of the original Unicode sets - which correspond
337 // directly to uset nodes - a logically equivalent expression
338 // is constructed in terms of the remapped runtime input
339 // symbol set. This function adds one runtime input symbol to
340 // a list of sets.
341 //
342 // The "logically equivalent expression" is the tree for an
343 // or-ing together of all of the symbols that go into the set.
344 //
345 //------------------------------------------------------------------------
addValToSets(UVector * sets,uint32_t val)346 void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
347 int32_t ix;
348
349 for (ix=0; ix<sets->size(); ix++) {
350 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
351 addValToSet(usetNode, val);
352 }
353 }
354
addValToSet(RBBINode * usetNode,uint32_t val)355 void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
356 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
357 leafNode->fVal = (unsigned short)val;
358 if (usetNode->fLeftChild == NULL) {
359 usetNode->fLeftChild = leafNode;
360 leafNode->fParent = usetNode;
361 } else {
362 // There are already input symbols present for this set.
363 // Set up an OR node, with the previous stuff as the left child
364 // and the new value as the right child.
365 RBBINode *orNode = new RBBINode(RBBINode::opOr);
366 orNode->fLeftChild = usetNode->fLeftChild;
367 orNode->fRightChild = leafNode;
368 orNode->fLeftChild->fParent = orNode;
369 orNode->fRightChild->fParent = orNode;
370 usetNode->fLeftChild = orNode;
371 orNode->fParent = usetNode;
372 }
373 }
374
375
376 //------------------------------------------------------------------------
377 //
378 // getNumCharCategories
379 //
380 //------------------------------------------------------------------------
getNumCharCategories() const381 int32_t RBBISetBuilder::getNumCharCategories() const {
382 return fGroupCount + 3;
383 }
384
385
386 //------------------------------------------------------------------------
387 //
388 // sawBOF
389 //
390 //------------------------------------------------------------------------
sawBOF() const391 UBool RBBISetBuilder::sawBOF() const {
392 return fSawBOF;
393 }
394
395
396 //------------------------------------------------------------------------
397 //
398 // getFirstChar Given a runtime RBBI character category, find
399 // the first UChar32 that is in the set of chars
400 // in the category.
401 //------------------------------------------------------------------------
getFirstChar(int32_t category) const402 UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
403 RangeDescriptor *rlRange;
404 UChar32 retVal = (UChar32)-1;
405 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
406 if (rlRange->fNum == category) {
407 retVal = rlRange->fStartChar;
408 break;
409 }
410 }
411 return retVal;
412 }
413
414
415
416 //------------------------------------------------------------------------
417 //
418 // printRanges A debugging function.
419 // dump out all of the range definitions.
420 //
421 //------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug dump: one line per range descriptor giving its number, its first and
// last character, and the names of the sets that include it.  A set's name is
// taken from the varRef node two levels above it when there is one; otherwise
// it is printed as "anon".
void RBBISetBuilder::printRanges() {
    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");

    RangeDescriptor *range = fRangeList;
    while (range != NULL) {
        RBBIDebugPrintf("%2i %4x-%4x ", range->fNum, range->fStartChar, range->fEndChar);

        for (int setIx = 0; setIx < range->fIncludesSets->size(); ++setIx) {
            RBBINode *usetNode = (RBBINode *)range->fIncludesSets->elementAt(setIx);
            RBBINode *setRef   = usetNode->fParent;
            RBBINode *varRef   = (setRef != NULL) ? setRef->fParent : NULL;

            UnicodeString setName = UNICODE_STRING("anon", 4);
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                setName = varRef->fText;
            }
            RBBI_DEBUG_printUnicodeString(setName);
            RBBIDebugPrintf(" ");
        }
        RBBIDebugPrintf("\n");

        range = range->fNext;
    }
}
#endif
447
448
449 //------------------------------------------------------------------------
450 //
451 // printRangeGroups A debugging function.
452 // dump out all of the range groups.
453 //
454 //------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug dump: for each group number, in order of first appearance, print the
// number, a <DICT> marker when bit 0x4000 is set in fNum, the names of the
// member sets, and then every range carrying that fNum, five to a line.
void RBBISetBuilder::printRangeGroups() {
    int highestGroupPrinted = 0;

    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
    for (RangeDescriptor *range = fRangeList; range != NULL; range = range->fNext) {
        // Mask off bit 0x4000 so only the plain group number is compared.
        const int groupNum = range->fNum & 0xbfff;
        if (groupNum <= highestGroupPrinted) {
            continue;       // this group has already been printed
        }
        highestGroupPrinted = groupNum;
        RBBIDebugPrintf("%2i ", groupNum);

        if (range->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}

        for (int setIx = 0; setIx < range->fIncludesSets->size(); ++setIx) {
            RBBINode *usetNode = (RBBINode *)range->fIncludesSets->elementAt(setIx);
            RBBINode *setRef   = usetNode->fParent;
            RBBINode *varRef   = (setRef != NULL) ? setRef->fParent : NULL;

            UnicodeString setName = UNICODE_STRING("anon", 4);
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                setName = varRef->fText;
            }
            RBBI_DEBUG_printUnicodeString(setName);
            RBBIDebugPrintf(" ");
        }

        // List this range and every later one with the same fNum.
        int rangesPrinted = 0;
        for (RangeDescriptor *peer = range; peer != NULL; peer = peer->fNext) {
            if (peer->fNum != range->fNum) {
                continue;
            }
            if (rangesPrinted % 5 == 0) {
                RBBIDebugPrintf("\n ");
            }
            ++rangesPrinted;
            RBBIDebugPrintf(" %05x-%05x", peer->fStartChar, peer->fEndChar);
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif
499
500
501 //------------------------------------------------------------------------
502 //
503 // printSets A debugging function.
504 // dump out all of the set definitions.
505 //
506 //------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug dump: for every uset node known to the rule builder print its index,
// its name ("anonymous" unless a varRef node sits two levels above it), its
// text, and - when present - the expression tree under its left child.
void RBBISetBuilder::printSets() {
    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");

    int index = 0;
    for (;;) {
        RBBINode *usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(index);
        if (usetNode == NULL) {
            break;          // elementAt() yielded NULL: end of the list
        }

        RBBIDebugPrintf("%3d ", index);

        UnicodeString setName = UNICODE_STRING("anonymous", 9);
        RBBINode *setRef = usetNode->fParent;
        RBBINode *varRef = (setRef != NULL) ? setRef->fParent : NULL;
        if (varRef != NULL && varRef->fType == RBBINode::varRef) {
            setName = varRef->fText;
        }

        RBBI_DEBUG_printUnicodeString(setName);
        RBBIDebugPrintf(" ");
        RBBI_DEBUG_printUnicodeString(usetNode->fText);
        RBBIDebugPrintf("\n");
        if (usetNode->fLeftChild != NULL) {
            usetNode->fLeftChild->printTree(TRUE);
        }

        ++index;
    }
    RBBIDebugPrintf("\n");
}
#endif
543
544
545
546 //-------------------------------------------------------------------------------------
547 //
548 // RangeDescriptor copy constructor
549 //
550 //-------------------------------------------------------------------------------------
551
// Copy every scalar field from 'other' (fNext is reset to NULL) and build a
// fresh fIncludesSets vector holding the same element pointers.
// A failure code already present in 'status' on entry is preserved.
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
    const UErrorCode incomingStatus = status;

    fStartChar    = other.fStartChar;
    fEndChar      = other.fEndChar;
    fNum          = other.fNum;
    fNext         = NULL;
    fIncludesSets = new UVector(status);

    if (U_FAILURE(incomingStatus)) {
        // Put back the caller's original error in case the UVector
        // constructor replaced it.
        status = incomingStatus;
        return;
    }
    if (U_FAILURE(status)) {
        return;
    }
    if (fIncludesSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for (int ix = 0; ix < other.fIncludesSets->size(); ++ix) {
        fIncludesSets->addElement(other.fIncludesSets->elementAt(ix), status);
    }
}
577
578
579 //-------------------------------------------------------------------------------------
580 //
581 // RangeDescriptor default constructor
582 //
583 //-------------------------------------------------------------------------------------
RangeDescriptor(UErrorCode & status)584 RangeDescriptor::RangeDescriptor(UErrorCode &status) {
585 this->fStartChar = 0;
586 this->fEndChar = 0;
587 this->fNum = 0;
588 this->fNext = NULL;
589 UErrorCode oldstatus = status;
590 this->fIncludesSets = new UVector(status);
591 if (U_FAILURE(oldstatus)) {
592 status = oldstatus;
593 }
594 if (U_FAILURE(status)) {
595 return;
596 }
597 /* test for NULL */
598 if(this->fIncludesSets == 0) {
599 status = U_MEMORY_ALLOCATION_ERROR;
600 return;
601 }
602
603 }
604
605
606 //-------------------------------------------------------------------------------------
607 //
608 // RangeDescriptor Destructor
609 //
610 //-------------------------------------------------------------------------------------
~RangeDescriptor()611 RangeDescriptor::~RangeDescriptor() {
612 delete fIncludesSets;
613 fIncludesSets = NULL;
614 }
615
616 //-------------------------------------------------------------------------------------
617 //
618 // RangeDescriptor::split()
619 //
620 //-------------------------------------------------------------------------------------
split(UChar32 where,UErrorCode & status)621 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
622 U_ASSERT(where>fStartChar && where<=fEndChar);
623 RangeDescriptor *nr = new RangeDescriptor(*this, status);
624 if (U_FAILURE(status)) {
625 return;
626 }
627 /* test for NULL */
628 if(nr == 0) {
629 status = U_MEMORY_ALLOCATION_ERROR;
630 return;
631 }
632
633 // RangeDescriptor copy constructor copies all fields.
634 // Only need to update those that are different after the split.
635 nr->fStartChar = where;
636 this->fEndChar = where-1;
637 nr->fNext = this->fNext;
638 this->fNext = nr;
639 }
640
641
642 //-------------------------------------------------------------------------------------
643 //
644 // RangeDescriptor::setDictionaryFlag
645 //
646 // Character Category Numbers that include characters from
647 // the original Unicode Set named "dictionary" have bit 14
648 // set to 1. The RBBI runtime engine uses this to trigger
649 // use of the word dictionary.
650 //
651 // This function looks through the Unicode Sets that it
652 // (the range) includes, and sets the bit in fNum when
653 // "dictionary" is among them.
654 //
655 // TODO: a faster way would be to find the set node for
656 // "dictionary" just once, rather than looking it
657 // up by name every time.
658 //
659 //-------------------------------------------------------------------------------------
setDictionaryFlag()660 void RangeDescriptor::setDictionaryFlag() {
661 int i;
662
663 for (i=0; i<this->fIncludesSets->size(); i++) {
664 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
665 UnicodeString setName;
666 RBBINode *setRef = usetNode->fParent;
667 if (setRef != NULL) {
668 RBBINode *varRef = setRef->fParent;
669 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
670 setName = varRef->fText;
671 }
672 }
673 if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
674 this->fNum |= 0x4000;
675 break;
676 }
677 }
678 }
679
680
681
682 U_NAMESPACE_END
683
684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
685