1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // rbbisetb.cpp
5 //
6 /*
7 ***************************************************************************
8 * Copyright (C) 2002-2008 International Business Machines Corporation *
9 * and others. All rights reserved. *
10 ***************************************************************************
11 */
12 //
13 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
14 // (part of the rule building process.)
15 //
16 // Starting with the rules parse tree from the scanner,
17 //
18 // - Enumerate the set of UnicodeSets that are referenced
19 // by the RBBI rules.
20 // - compute a set of non-overlapping character ranges
21 // with all characters within a range belonging to the same
22 // set of input uniocde sets.
23 // - Derive a set of non-overlapping UnicodeSet (like things)
24 // that will correspond to columns in the state table for
25 // the RBBI execution engine. All characters within one
26 // of these sets belong to the same set of the original
27 // UnicodeSets from the user's rules.
28 // - construct the trie table that maps input characters
29 // to the index of the matching non-overlapping set of set from
30 // the previous step.
31 //
32
33 #include "unicode/utypes.h"
34
35 #if !UCONFIG_NO_BREAK_ITERATION
36
37 #include "unicode/uniset.h"
38 #include "utrie2.h"
39 #include "uvector.h"
40 #include "uassert.h"
41 #include "cmemory.h"
42 #include "cstring.h"
43
44 #include "rbbisetb.h"
45 #include "rbbinode.h"
46
47 U_NAMESPACE_BEGIN
48
49 //------------------------------------------------------------------------
50 //
51 // Constructor
52 //
53 //------------------------------------------------------------------------
RBBISetBuilder(RBBIRuleBuilder * rb)54 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
55 {
56 fRB = rb;
57 fStatus = rb->fStatus;
58 fRangeList = 0;
59 fTrie = 0;
60 fTrieSize = 0;
61 fGroupCount = 0;
62 fSawBOF = FALSE;
63 }
64
65
66 //------------------------------------------------------------------------
67 //
68 // Destructor
69 //
70 //------------------------------------------------------------------------
~RBBISetBuilder()71 RBBISetBuilder::~RBBISetBuilder()
72 {
73 RangeDescriptor *nextRangeDesc;
74
75 // Walk through & delete the linked list of RangeDescriptors
76 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
77 RangeDescriptor *r = nextRangeDesc;
78 nextRangeDesc = r->fNext;
79 delete r;
80 }
81
82 utrie2_close(fTrie);
83 }
84
85
86
87
88 //------------------------------------------------------------------------
89 //
90 // build Build the list of non-overlapping character ranges
91 // from the Unicode Sets.
92 //
93 //------------------------------------------------------------------------
buildRanges()94 void RBBISetBuilder::buildRanges() {
95 RBBINode *usetNode;
96 RangeDescriptor *rlRange;
97
98 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
99
100 //
101 // Initialize the process by creating a single range encompassing all characters
102 // that is in no sets.
103 //
104 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
105 if (fRangeList == NULL) {
106 *fStatus = U_MEMORY_ALLOCATION_ERROR;
107 return;
108 }
109 fRangeList->fStartChar = 0;
110 fRangeList->fEndChar = 0x10ffff;
111
112 if (U_FAILURE(*fStatus)) {
113 return;
114 }
115
116 //
117 // Find the set of non-overlapping ranges of characters
118 //
119 int ni;
120 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
121 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
122 if (usetNode==NULL) {
123 break;
124 }
125
126 UnicodeSet *inputSet = usetNode->fInputSet;
127 int32_t inputSetRangeCount = inputSet->getRangeCount();
128 int inputSetRangeIndex = 0;
129 rlRange = fRangeList;
130
131 for (;;) {
132 if (inputSetRangeIndex >= inputSetRangeCount) {
133 break;
134 }
135 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
136 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
137
138 // skip over ranges from the range list that are completely
139 // below the current range from the input unicode set.
140 while (rlRange->fEndChar < inputSetRangeBegin) {
141 rlRange = rlRange->fNext;
142 }
143
144 // If the start of the range from the range list is before with
145 // the start of the range from the unicode set, split the range list range
146 // in two, with one part being before (wholly outside of) the unicode set
147 // and the other containing the rest.
148 // Then continue the loop; the post-split current range will then be skipped
149 // over
150 if (rlRange->fStartChar < inputSetRangeBegin) {
151 rlRange->split(inputSetRangeBegin, *fStatus);
152 if (U_FAILURE(*fStatus)) {
153 return;
154 }
155 continue;
156 }
157
158 // Same thing at the end of the ranges...
159 // If the end of the range from the range list doesn't coincide with
160 // the end of the range from the unicode set, split the range list
161 // range in two. The first part of the split range will be
162 // wholly inside the Unicode set.
163 if (rlRange->fEndChar > inputSetRangeEnd) {
164 rlRange->split(inputSetRangeEnd+1, *fStatus);
165 if (U_FAILURE(*fStatus)) {
166 return;
167 }
168 }
169
170 // The current rlRange is now entirely within the UnicodeSet range.
171 // Add this unicode set to the list of sets for this rlRange
172 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
173 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
174 if (U_FAILURE(*fStatus)) {
175 return;
176 }
177 }
178
179 // Advance over ranges that we are finished with.
180 if (inputSetRangeEnd == rlRange->fEndChar) {
181 inputSetRangeIndex++;
182 }
183 rlRange = rlRange->fNext;
184 }
185 }
186
187 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
188
189 //
190 // Group the above ranges, with each group consisting of one or more
191 // ranges that are in exactly the same set of original UnicodeSets.
192 // The groups are numbered, and these group numbers are the set of
193 // input symbols recognized by the run-time state machine.
194 //
195 // Numbering: # 0 (state table column 0) is unused.
196 // # 1 is reserved - table column 1 is for end-of-input
197 // # 2 is reserved - table column 2 is for beginning-in-input
198 // # 3 is the first range list.
199 //
200 RangeDescriptor *rlSearchRange;
201 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
202 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
203 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
204 rlRange->fNum = rlSearchRange->fNum;
205 break;
206 }
207 }
208 if (rlRange->fNum == 0) {
209 fGroupCount ++;
210 rlRange->fNum = fGroupCount+2;
211 rlRange->setDictionaryFlag();
212 addValToSets(rlRange->fIncludesSets, fGroupCount+2);
213 }
214 }
215
216 // Handle input sets that contain the special string {eof}.
217 // Column 1 of the state table is reserved for EOF on input.
218 // Column 2 is reserved for before-the-start-input.
219 // (This column can be optimized away later if there are no rule
220 // references to {bof}.)
221 // Add this column value (1 or 2) to the equivalent expression
222 // subtree for each UnicodeSet that contains the string {eof}
223 // Because {bof} and {eof} are not a characters in the normal sense,
224 // they doesn't affect the computation of ranges or TRIE.
225 static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
226 static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
227
228 UnicodeString eofString(eofUString);
229 UnicodeString bofString(bofUString);
230 for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
231 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
232 if (usetNode==NULL) {
233 break;
234 }
235 UnicodeSet *inputSet = usetNode->fInputSet;
236 if (inputSet->contains(eofString)) {
237 addValToSet(usetNode, 1);
238 }
239 if (inputSet->contains(bofString)) {
240 addValToSet(usetNode, 2);
241 fSawBOF = TRUE;
242 }
243 }
244
245
246 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
247 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
248 }
249
250
251 //
252 // Build the Trie table for mapping UChar32 values to the corresponding
253 // range group number.
254 //
buildTrie()255 void RBBISetBuilder::buildTrie() {
256 RangeDescriptor *rlRange;
257
258 fTrie = utrie2_open(0, // Initial value for all code points.
259 0, // Error value for out-of-range input.
260 fStatus);
261
262 for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
263 utrie2_setRange32(fTrie,
264 rlRange->fStartChar, // Range start
265 rlRange->fEndChar, // Range end (inclusive)
266 rlRange->fNum, // value for range
267 TRUE, // Overwrite previously written values
268 fStatus);
269 }
270 }
271
272
mergeCategories(IntPair categories)273 void RBBISetBuilder::mergeCategories(IntPair categories) {
274 U_ASSERT(categories.first >= 1);
275 U_ASSERT(categories.second > categories.first);
276 for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
277 int32_t rangeNum = rd->fNum & ~DICT_BIT;
278 int32_t rangeDict = rd->fNum & DICT_BIT;
279 if (rangeNum == categories.second) {
280 rd->fNum = categories.first | rangeDict;
281 } else if (rangeNum > categories.second) {
282 rd->fNum--;
283 }
284 }
285 --fGroupCount;
286 }
287
288
289 //-----------------------------------------------------------------------------------
290 //
291 // getTrieSize() Return the size that will be required to serialize the Trie.
292 //
293 //-----------------------------------------------------------------------------------
getTrieSize()294 int32_t RBBISetBuilder::getTrieSize() {
295 if (U_FAILURE(*fStatus)) {
296 return 0;
297 }
298 utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
299 fTrieSize = utrie2_serialize(fTrie,
300 NULL, // Buffer
301 0, // Capacity
302 fStatus);
303 if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
304 *fStatus = U_ZERO_ERROR;
305 }
306 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
307 return fTrieSize;
308 }
309
310
311 //-----------------------------------------------------------------------------------
312 //
313 // serializeTrie() Put the serialized trie at the specified address.
314 // Trust the caller to have given us enough memory.
315 // getTrieSize() MUST be called first.
316 //
317 //-----------------------------------------------------------------------------------
serializeTrie(uint8_t * where)318 void RBBISetBuilder::serializeTrie(uint8_t *where) {
319 utrie2_serialize(fTrie,
320 where, // Buffer
321 fTrieSize, // Capacity
322 fStatus);
323 }
324
325 //------------------------------------------------------------------------
326 //
327 // addValToSets Add a runtime-mapped input value to each uset from a
328 // list of uset nodes. (val corresponds to a state table column.)
329 // For each of the original Unicode sets - which correspond
330 // directly to uset nodes - a logically equivalent expression
331 // is constructed in terms of the remapped runtime input
332 // symbol set. This function adds one runtime input symbol to
333 // a list of sets.
334 //
335 // The "logically equivalent expression" is the tree for an
336 // or-ing together of all of the symbols that go into the set.
337 //
338 //------------------------------------------------------------------------
addValToSets(UVector * sets,uint32_t val)339 void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
340 int32_t ix;
341
342 for (ix=0; ix<sets->size(); ix++) {
343 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
344 addValToSet(usetNode, val);
345 }
346 }
347
addValToSet(RBBINode * usetNode,uint32_t val)348 void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
349 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
350 if (leafNode == NULL) {
351 *fStatus = U_MEMORY_ALLOCATION_ERROR;
352 return;
353 }
354 leafNode->fVal = (unsigned short)val;
355 if (usetNode->fLeftChild == NULL) {
356 usetNode->fLeftChild = leafNode;
357 leafNode->fParent = usetNode;
358 } else {
359 // There are already input symbols present for this set.
360 // Set up an OR node, with the previous stuff as the left child
361 // and the new value as the right child.
362 RBBINode *orNode = new RBBINode(RBBINode::opOr);
363 if (orNode == NULL) {
364 *fStatus = U_MEMORY_ALLOCATION_ERROR;
365 return;
366 }
367 orNode->fLeftChild = usetNode->fLeftChild;
368 orNode->fRightChild = leafNode;
369 orNode->fLeftChild->fParent = orNode;
370 orNode->fRightChild->fParent = orNode;
371 usetNode->fLeftChild = orNode;
372 orNode->fParent = usetNode;
373 }
374 }
375
376
377 //------------------------------------------------------------------------
378 //
379 // getNumCharCategories
380 //
381 //------------------------------------------------------------------------
getNumCharCategories() const382 int32_t RBBISetBuilder::getNumCharCategories() const {
383 return fGroupCount + 3;
384 }
385
386
387 //------------------------------------------------------------------------
388 //
389 // sawBOF
390 //
391 //------------------------------------------------------------------------
sawBOF() const392 UBool RBBISetBuilder::sawBOF() const {
393 return fSawBOF;
394 }
395
396
397 //------------------------------------------------------------------------
398 //
399 // getFirstChar Given a runtime RBBI character category, find
400 // the first UChar32 that is in the set of chars
401 // in the category.
402 //------------------------------------------------------------------------
getFirstChar(int32_t category) const403 UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
404 RangeDescriptor *rlRange;
405 UChar32 retVal = (UChar32)-1;
406 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
407 if (rlRange->fNum == category) {
408 retVal = rlRange->fStartChar;
409 break;
410 }
411 }
412 return retVal;
413 }
414
415
416
417 //------------------------------------------------------------------------
418 //
419 // printRanges A debugging function.
420 // dump out all of the range definitions.
421 //
422 //------------------------------------------------------------------------
423 #ifdef RBBI_DEBUG
printRanges()424 void RBBISetBuilder::printRanges() {
425 RangeDescriptor *rlRange;
426 int i;
427
428 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
429 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
430 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
431
432 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
433 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
434 UnicodeString setName = UNICODE_STRING("anon", 4);
435 RBBINode *setRef = usetNode->fParent;
436 if (setRef != NULL) {
437 RBBINode *varRef = setRef->fParent;
438 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
439 setName = varRef->fText;
440 }
441 }
442 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
443 }
444 RBBIDebugPrintf("\n");
445 }
446 }
447 #endif
448
449
450 //------------------------------------------------------------------------
451 //
452 // printRangeGroups A debugging function.
453 // dump out all of the range groups.
454 //
455 //------------------------------------------------------------------------
456 #ifdef RBBI_DEBUG
printRangeGroups()457 void RBBISetBuilder::printRangeGroups() {
458 RangeDescriptor *rlRange;
459 RangeDescriptor *tRange;
460 int i;
461 int lastPrintedGroupNum = 0;
462
463 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
464 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
465 int groupNum = rlRange->fNum & 0xbfff;
466 if (groupNum > lastPrintedGroupNum) {
467 lastPrintedGroupNum = groupNum;
468 RBBIDebugPrintf("%2i ", groupNum);
469
470 if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
471
472 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
473 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
474 UnicodeString setName = UNICODE_STRING("anon", 4);
475 RBBINode *setRef = usetNode->fParent;
476 if (setRef != NULL) {
477 RBBINode *varRef = setRef->fParent;
478 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
479 setName = varRef->fText;
480 }
481 }
482 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
483 }
484
485 i = 0;
486 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
487 if (tRange->fNum == rlRange->fNum) {
488 if (i++ % 5 == 0) {
489 RBBIDebugPrintf("\n ");
490 }
491 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
492 }
493 }
494 RBBIDebugPrintf("\n");
495 }
496 }
497 RBBIDebugPrintf("\n");
498 }
499 #endif
500
501
502 //------------------------------------------------------------------------
503 //
504 // printSets A debugging function.
505 // dump out all of the set definitions.
506 //
507 //------------------------------------------------------------------------
508 #ifdef RBBI_DEBUG
printSets()509 void RBBISetBuilder::printSets() {
510 int i;
511
512 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
513 for (i=0; ; i++) {
514 RBBINode *usetNode;
515 RBBINode *setRef;
516 RBBINode *varRef;
517 UnicodeString setName;
518
519 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
520 if (usetNode == NULL) {
521 break;
522 }
523
524 RBBIDebugPrintf("%3d ", i);
525 setName = UNICODE_STRING("anonymous", 9);
526 setRef = usetNode->fParent;
527 if (setRef != NULL) {
528 varRef = setRef->fParent;
529 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
530 setName = varRef->fText;
531 }
532 }
533 RBBI_DEBUG_printUnicodeString(setName);
534 RBBIDebugPrintf(" ");
535 RBBI_DEBUG_printUnicodeString(usetNode->fText);
536 RBBIDebugPrintf("\n");
537 if (usetNode->fLeftChild != NULL) {
538 RBBINode::printTree(usetNode->fLeftChild, TRUE);
539 }
540 }
541 RBBIDebugPrintf("\n");
542 }
543 #endif
544
545
546
547 //-------------------------------------------------------------------------------------
548 //
549 // RangeDescriptor copy constructor
550 //
551 //-------------------------------------------------------------------------------------
552
RangeDescriptor(const RangeDescriptor & other,UErrorCode & status)553 RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
554 int i;
555
556 this->fStartChar = other.fStartChar;
557 this->fEndChar = other.fEndChar;
558 this->fNum = other.fNum;
559 this->fNext = NULL;
560 UErrorCode oldstatus = status;
561 this->fIncludesSets = new UVector(status);
562 if (U_FAILURE(oldstatus)) {
563 status = oldstatus;
564 }
565 if (U_FAILURE(status)) {
566 return;
567 }
568 /* test for NULL */
569 if (this->fIncludesSets == 0) {
570 status = U_MEMORY_ALLOCATION_ERROR;
571 return;
572 }
573
574 for (i=0; i<other.fIncludesSets->size(); i++) {
575 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
576 }
577 }
578
579
580 //-------------------------------------------------------------------------------------
581 //
582 // RangeDesriptor default constructor
583 //
584 //-------------------------------------------------------------------------------------
RangeDescriptor(UErrorCode & status)585 RangeDescriptor::RangeDescriptor(UErrorCode &status) {
586 this->fStartChar = 0;
587 this->fEndChar = 0;
588 this->fNum = 0;
589 this->fNext = NULL;
590 UErrorCode oldstatus = status;
591 this->fIncludesSets = new UVector(status);
592 if (U_FAILURE(oldstatus)) {
593 status = oldstatus;
594 }
595 if (U_FAILURE(status)) {
596 return;
597 }
598 /* test for NULL */
599 if(this->fIncludesSets == 0) {
600 status = U_MEMORY_ALLOCATION_ERROR;
601 return;
602 }
603
604 }
605
606
607 //-------------------------------------------------------------------------------------
608 //
609 // RangeDesriptor Destructor
610 //
611 //-------------------------------------------------------------------------------------
~RangeDescriptor()612 RangeDescriptor::~RangeDescriptor() {
613 delete fIncludesSets;
614 fIncludesSets = NULL;
615 }
616
617 //-------------------------------------------------------------------------------------
618 //
619 // RangeDesriptor::split()
620 //
621 //-------------------------------------------------------------------------------------
split(UChar32 where,UErrorCode & status)622 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
623 U_ASSERT(where>fStartChar && where<=fEndChar);
624 RangeDescriptor *nr = new RangeDescriptor(*this, status);
625 if(nr == 0) {
626 status = U_MEMORY_ALLOCATION_ERROR;
627 return;
628 }
629 if (U_FAILURE(status)) {
630 delete nr;
631 return;
632 }
633 // RangeDescriptor copy constructor copies all fields.
634 // Only need to update those that are different after the split.
635 nr->fStartChar = where;
636 this->fEndChar = where-1;
637 nr->fNext = this->fNext;
638 this->fNext = nr;
639 }
640
641
642 //-------------------------------------------------------------------------------------
643 //
644 // RangeDescriptor::setDictionaryFlag
645 //
646 // Character Category Numbers that include characters from
647 // the original Unicode Set named "dictionary" have bit 14
648 // set to 1. The RBBI runtime engine uses this to trigger
649 // use of the word dictionary.
650 //
651 // This function looks through the Unicode Sets that it
652 // (the range) includes, and sets the bit in fNum when
653 // "dictionary" is among them.
654 //
655 // TODO: a faster way would be to find the set node for
656 // "dictionary" just once, rather than looking it
657 // up by name every time.
658 //
659 //-------------------------------------------------------------------------------------
setDictionaryFlag()660 void RangeDescriptor::setDictionaryFlag() {
661 int i;
662
663 static const char16_t *dictionary = u"dictionary";
664 for (i=0; i<fIncludesSets->size(); i++) {
665 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
666 RBBINode *setRef = usetNode->fParent;
667 if (setRef != nullptr) {
668 RBBINode *varRef = setRef->fParent;
669 if (varRef && varRef->fType == RBBINode::varRef) {
670 const UnicodeString *setName = &varRef->fText;
671 if (setName->compare(dictionary, -1) == 0) {
672 fNum |= RBBISetBuilder::DICT_BIT;
673 break;
674 }
675 }
676 }
677 }
678 }
679
680
681
682 U_NAMESPACE_END
683
684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
685