1 //
2 // file: rbbiscan.cpp
3 //
4 // Copyright (C) 2002-2015, International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the Rule Based Break Iterator Rule Builder functions for
8 // scanning the rules and assembling a parse tree. This is the first phase
9 // of compiling the rules.
10 //
11 // The overall of the rules is managed by class RBBIRuleBuilder, which will
12 // create and use an instance of this class as part of the process.
13 //
14
15 #include "unicode/utypes.h"
16
17 #if !UCONFIG_NO_BREAK_ITERATION
18
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27
28 #include "rbbirpt.h" // Contains state table for the rbbi rules parser.
29 // generated by a Perl script.
30 #include "rbbirb.h"
31 #include "rbbinode.h"
32 #include "rbbiscan.h"
33 #include "rbbitblb.h"
34
35 #include "uassert.h"
36
37 //------------------------------------------------------------------------------
38 //
39 // Unicode Set init strings for each of the character classes needed for parsing a rule file.
40 // (Initialized with hex values for portability to EBCDIC based machines.
41 // Really ugly, but there's no good way to avoid it.)
42 //
43 // The sets are referred to by name in the rbbirpt.txt, which is the
44 // source form of the state transition table for the RBBI rule parser.
45 //
46 //------------------------------------------------------------------------------
47 static const UChar gRuleSet_rule_char_pattern[] = {
48 // [ ^ [ \ p { Z } \ u 0 0 2 0
49 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
50 // - \ u 0 0 7 f ] - [ \ p
51 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
52 // { L } ] - [ \ p { N } ] ]
53 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
54
55 static const UChar gRuleSet_name_char_pattern[] = {
56 // [ _ \ p { L } \ p { N } ]
57 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
58
59 static const UChar gRuleSet_digit_char_pattern[] = {
60 // [ 0 - 9 ]
61 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
62
63 static const UChar gRuleSet_name_start_char_pattern[] = {
64 // [ _ \ p { L } ]
65 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
66
67 static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
68
69
70 U_CDECL_BEGIN
RBBISetTable_deleter(void * p)71 static void U_CALLCONV RBBISetTable_deleter(void *p) {
72 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
73 delete px->key;
74 // Note: px->val is owned by the linked list "fSetsListHead" in scanner.
75 // Don't delete the value nodes here.
76 uprv_free(px);
77 }
78 U_CDECL_END
79
80 U_NAMESPACE_BEGIN
81
82 //------------------------------------------------------------------------------
83 //
84 // Constructor.
85 //
86 //------------------------------------------------------------------------------
RBBIRuleScanner(RBBIRuleBuilder * rb)87 RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
88 {
89 fRB = rb;
90 fStackPtr = 0;
91 fStack[fStackPtr] = 0;
92 fNodeStackPtr = 0;
93 fRuleNum = 0;
94 fNodeStack[0] = NULL;
95
96 fSymbolTable = NULL;
97 fSetTable = NULL;
98
99 fScanIndex = 0;
100 fNextIndex = 0;
101
102 fReverseRule = FALSE;
103 fLookAheadRule = FALSE;
104
105 fLineNum = 1;
106 fCharNum = 0;
107 fQuoteMode = FALSE;
108
109 // Do not check status until after all critical fields are sufficiently initialized
110 // that the destructor can run cleanly.
111 if (U_FAILURE(*rb->fStatus)) {
112 return;
113 }
114
115 //
116 // Set up the constant Unicode Sets.
117 // Note: These could be made static, lazily initialized, and shared among
118 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler,
119 // and the time to build these few sets should be small compared to a
120 // full break iterator build.
121 fRuleSets[kRuleSet_rule_char-128]
122 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus);
123 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
124 fRuleSets[kRuleSet_white_space-128].
125 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
126 fRuleSets[kRuleSet_name_char-128]
127 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus);
128 fRuleSets[kRuleSet_name_start_char-128]
129 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
130 fRuleSets[kRuleSet_digit_char-128]
131 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus);
132 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
133 // This case happens if ICU's data is missing. UnicodeSet tries to look up property
134 // names from the init string, can't find them, and claims an illegal argument.
135 // Change the error so that the actual problem will be clearer to users.
136 *rb->fStatus = U_BRK_INIT_ERROR;
137 }
138 if (U_FAILURE(*rb->fStatus)) {
139 return;
140 }
141
142 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
143 if (fSymbolTable == NULL) {
144 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
145 return;
146 }
147 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
148 if (U_FAILURE(*rb->fStatus)) {
149 return;
150 }
151 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
152 }
153
154
155
156 //------------------------------------------------------------------------------
157 //
158 // Destructor
159 //
160 //------------------------------------------------------------------------------
~RBBIRuleScanner()161 RBBIRuleScanner::~RBBIRuleScanner() {
162 delete fSymbolTable;
163 if (fSetTable != NULL) {
164 uhash_close(fSetTable);
165 fSetTable = NULL;
166
167 }
168
169
170 // Node Stack.
171 // Normally has one entry, which is the entire parse tree for the rules.
172 // If errors occured, there may be additional subtrees left on the stack.
173 while (fNodeStackPtr > 0) {
174 delete fNodeStack[fNodeStackPtr];
175 fNodeStackPtr--;
176 }
177
178 }
179
180 //------------------------------------------------------------------------------
181 //
182 // doParseAction Do some action during rule parsing.
183 // Called by the parse state machine.
184 // Actions build the parse tree and Unicode Sets,
185 // and maintain the parse stack for nested expressions.
186 //
187 // TODO: unify EParseAction and RBBI_RuleParseAction enum types.
188 // They represent exactly the same thing. They're separate
189 // only to work around enum forward declaration restrictions
190 // in some compilers, while at the same time avoiding multiple
191 // definitions problems. I'm sure that there's a better way.
192 //
193 //------------------------------------------------------------------------------
doParseActions(int32_t action)194 UBool RBBIRuleScanner::doParseActions(int32_t action)
195 {
196 RBBINode *n = NULL;
197
198 UBool returnVal = TRUE;
199
200 switch (action) {
201
202 case doExprStart:
203 pushNewNode(RBBINode::opStart);
204 fRuleNum++;
205 break;
206
207
208 case doExprOrOperator:
209 {
210 fixOpStack(RBBINode::precOpCat);
211 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
212 RBBINode *orNode = pushNewNode(RBBINode::opOr);
213 if (U_FAILURE(*fRB->fStatus)) {
214 break;
215 }
216 orNode->fLeftChild = operandNode;
217 operandNode->fParent = orNode;
218 }
219 break;
220
221 case doExprCatOperator:
222 // concatenation operator.
223 // For the implicit concatenation of adjacent terms in an expression that are
224 // not separated by any other operator. Action is invoked between the
225 // actions for the two terms.
226 {
227 fixOpStack(RBBINode::precOpCat);
228 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
229 RBBINode *catNode = pushNewNode(RBBINode::opCat);
230 if (U_FAILURE(*fRB->fStatus)) {
231 break;
232 }
233 catNode->fLeftChild = operandNode;
234 operandNode->fParent = catNode;
235 }
236 break;
237
238 case doLParen:
239 // Open Paren.
240 // The openParen node is a dummy operation type with a low precedence,
241 // which has the affect of ensuring that any real binary op that
242 // follows within the parens binds more tightly to the operands than
243 // stuff outside of the parens.
244 pushNewNode(RBBINode::opLParen);
245 break;
246
247 case doExprRParen:
248 fixOpStack(RBBINode::precLParen);
249 break;
250
251 case doNOP:
252 break;
253
254 case doStartAssign:
255 // We've just scanned "$variable = "
256 // The top of the node stack has the $variable ref node.
257
258 // Save the start position of the RHS text in the StartExpression node
259 // that precedes the $variableReference node on the stack.
260 // This will eventually be used when saving the full $variable replacement
261 // text as a string.
262 n = fNodeStack[fNodeStackPtr-1];
263 n->fFirstPos = fNextIndex; // move past the '='
264
265 // Push a new start-of-expression node; needed to keep parse of the
266 // RHS expression happy.
267 pushNewNode(RBBINode::opStart);
268 break;
269
270
271
272
273 case doEndAssign:
274 {
275 // We have reached the end of an assignement statement.
276 // Current scan char is the ';' that terminates the assignment.
277
278 // Terminate expression, leaves expression parse tree rooted in TOS node.
279 fixOpStack(RBBINode::precStart);
280
281 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2];
282 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1];
283 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr];
284
285 // Save original text of right side of assignment, excluding the terminating ';'
286 // in the root of the node for the right-hand-side expression.
287 RHSExprNode->fFirstPos = startExprNode->fFirstPos;
288 RHSExprNode->fLastPos = fScanIndex;
289 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
290
291 // Expression parse tree becomes l. child of the $variable reference node.
292 varRefNode->fLeftChild = RHSExprNode;
293 RHSExprNode->fParent = varRefNode;
294
295 // Make a symbol table entry for the $variableRef node.
296 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
297 if (U_FAILURE(*fRB->fStatus)) {
298 // This is a round-about way to get the parse position set
299 // so that duplicate symbols error messages include a line number.
300 UErrorCode t = *fRB->fStatus;
301 *fRB->fStatus = U_ZERO_ERROR;
302 error(t);
303 }
304
305 // Clean up the stack.
306 delete startExprNode;
307 fNodeStackPtr-=3;
308 break;
309 }
310
311 case doEndOfRule:
312 {
313 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression
314 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node.
315 break;
316 }
317 #ifdef RBBI_DEBUG
318 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
319 #endif
320 U_ASSERT(fNodeStackPtr == 1);
321
322 // If this rule includes a look-ahead '/', add a endMark node to the
323 // expression tree.
324 if (fLookAheadRule) {
325 RBBINode *thisRule = fNodeStack[fNodeStackPtr];
326 RBBINode *endNode = pushNewNode(RBBINode::endMark);
327 RBBINode *catNode = pushNewNode(RBBINode::opCat);
328 if (U_FAILURE(*fRB->fStatus)) {
329 break;
330 }
331 fNodeStackPtr -= 2;
332 catNode->fLeftChild = thisRule;
333 catNode->fRightChild = endNode;
334 fNodeStack[fNodeStackPtr] = catNode;
335 endNode->fVal = fRuleNum;
336 endNode->fLookAheadEnd = TRUE;
337 }
338
339 // All rule expressions are ORed together.
340 // The ';' that terminates an expression really just functions as a '|' with
341 // a low operator prededence.
342 //
343 // Each of the four sets of rules are collected separately.
344 // (forward, reverse, safe_forward, safe_reverse)
345 // OR this rule into the appropriate group of them.
346 //
347 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
348
349 if (*destRules != NULL) {
350 // This is not the first rule encounted.
351 // OR previous stuff (from *destRules)
352 // with the current rule expression (on the Node Stack)
353 // with the resulting OR expression going to *destRules
354 //
355 RBBINode *thisRule = fNodeStack[fNodeStackPtr];
356 RBBINode *prevRules = *destRules;
357 RBBINode *orNode = pushNewNode(RBBINode::opOr);
358 if (U_FAILURE(*fRB->fStatus)) {
359 break;
360 }
361 orNode->fLeftChild = prevRules;
362 prevRules->fParent = orNode;
363 orNode->fRightChild = thisRule;
364 thisRule->fParent = orNode;
365 *destRules = orNode;
366 }
367 else
368 {
369 // This is the first rule encountered (for this direction).
370 // Just move its parse tree from the stack to *destRules.
371 *destRules = fNodeStack[fNodeStackPtr];
372 }
373 fReverseRule = FALSE; // in preparation for the next rule.
374 fLookAheadRule = FALSE;
375 fNodeStackPtr = 0;
376 }
377 break;
378
379
380 case doRuleError:
381 error(U_BRK_RULE_SYNTAX);
382 returnVal = FALSE;
383 break;
384
385
386 case doVariableNameExpectedErr:
387 error(U_BRK_RULE_SYNTAX);
388 break;
389
390
391 //
392 // Unary operands + ? *
393 // These all appear after the operand to which they apply.
394 // When we hit one, the operand (may be a whole sub expression)
395 // will be on the top of the stack.
396 // Unary Operator becomes TOS, with the old TOS as its one child.
397 case doUnaryOpPlus:
398 {
399 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
400 RBBINode *plusNode = pushNewNode(RBBINode::opPlus);
401 if (U_FAILURE(*fRB->fStatus)) {
402 break;
403 }
404 plusNode->fLeftChild = operandNode;
405 operandNode->fParent = plusNode;
406 }
407 break;
408
409 case doUnaryOpQuestion:
410 {
411 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
412 RBBINode *qNode = pushNewNode(RBBINode::opQuestion);
413 if (U_FAILURE(*fRB->fStatus)) {
414 break;
415 }
416 qNode->fLeftChild = operandNode;
417 operandNode->fParent = qNode;
418 }
419 break;
420
421 case doUnaryOpStar:
422 {
423 RBBINode *operandNode = fNodeStack[fNodeStackPtr--];
424 RBBINode *starNode = pushNewNode(RBBINode::opStar);
425 if (U_FAILURE(*fRB->fStatus)) {
426 break;
427 }
428 starNode->fLeftChild = operandNode;
429 operandNode->fParent = starNode;
430 }
431 break;
432
433 case doRuleChar:
434 // A "Rule Character" is any single character that is a literal part
435 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
436 // These are pretty uncommon in break rules; the terms are more commonly
437 // sets. To keep things uniform, treat these characters like as
438 // sets that just happen to contain only one character.
439 {
440 n = pushNewNode(RBBINode::setRef);
441 if (U_FAILURE(*fRB->fStatus)) {
442 break;
443 }
444 findSetFor(UnicodeString(fC.fChar), n);
445 n->fFirstPos = fScanIndex;
446 n->fLastPos = fNextIndex;
447 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
448 break;
449 }
450
451 case doDotAny:
452 // scanned a ".", meaning match any single character.
453 {
454 n = pushNewNode(RBBINode::setRef);
455 if (U_FAILURE(*fRB->fStatus)) {
456 break;
457 }
458 findSetFor(UnicodeString(TRUE, kAny, 3), n);
459 n->fFirstPos = fScanIndex;
460 n->fLastPos = fNextIndex;
461 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
462 break;
463 }
464
465 case doSlash:
466 // Scanned a '/', which identifies a look-ahead break position in a rule.
467 n = pushNewNode(RBBINode::lookAhead);
468 if (U_FAILURE(*fRB->fStatus)) {
469 break;
470 }
471 n->fVal = fRuleNum;
472 n->fFirstPos = fScanIndex;
473 n->fLastPos = fNextIndex;
474 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
475 fLookAheadRule = TRUE;
476 break;
477
478
479 case doStartTagValue:
480 // Scanned a '{', the opening delimiter for a tag value within a rule.
481 n = pushNewNode(RBBINode::tag);
482 if (U_FAILURE(*fRB->fStatus)) {
483 break;
484 }
485 n->fVal = 0;
486 n->fFirstPos = fScanIndex;
487 n->fLastPos = fNextIndex;
488 break;
489
490 case doTagDigit:
491 // Just scanned a decimal digit that's part of a tag value
492 {
493 n = fNodeStack[fNodeStackPtr];
494 uint32_t v = u_charDigitValue(fC.fChar);
495 U_ASSERT(v < 10);
496 n->fVal = n->fVal*10 + v;
497 break;
498 }
499
500 case doTagValue:
501 n = fNodeStack[fNodeStackPtr];
502 n->fLastPos = fNextIndex;
503 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
504 break;
505
506 case doTagExpectedError:
507 error(U_BRK_MALFORMED_RULE_TAG);
508 returnVal = FALSE;
509 break;
510
511 case doOptionStart:
512 // Scanning a !!option. At the start of string.
513 fOptionStart = fScanIndex;
514 break;
515
516 case doOptionEnd:
517 {
518 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
519 if (opt == UNICODE_STRING("chain", 5)) {
520 fRB->fChainRules = TRUE;
521 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
522 fRB->fLBCMNoChain = TRUE;
523 } else if (opt == UNICODE_STRING("forward", 7)) {
524 fRB->fDefaultTree = &fRB->fForwardTree;
525 } else if (opt == UNICODE_STRING("reverse", 7)) {
526 fRB->fDefaultTree = &fRB->fReverseTree;
527 } else if (opt == UNICODE_STRING("safe_forward", 12)) {
528 fRB->fDefaultTree = &fRB->fSafeFwdTree;
529 } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
530 fRB->fDefaultTree = &fRB->fSafeRevTree;
531 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
532 fRB->fLookAheadHardBreak = TRUE;
533 } else {
534 error(U_BRK_UNRECOGNIZED_OPTION);
535 }
536 }
537 break;
538
539 case doReverseDir:
540 fReverseRule = TRUE;
541 break;
542
543 case doStartVariableName:
544 n = pushNewNode(RBBINode::varRef);
545 if (U_FAILURE(*fRB->fStatus)) {
546 break;
547 }
548 n->fFirstPos = fScanIndex;
549 break;
550
551 case doEndVariableName:
552 n = fNodeStack[fNodeStackPtr];
553 if (n==NULL || n->fType != RBBINode::varRef) {
554 error(U_BRK_INTERNAL_ERROR);
555 break;
556 }
557 n->fLastPos = fScanIndex;
558 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
559 // Look the newly scanned name up in the symbol table
560 // If there's an entry, set the l. child of the var ref to the replacement expression.
561 // (We also pass through here when scanning assignments, but no harm is done, other
562 // than a slight wasted effort that seems hard to avoid. Lookup will be null)
563 n->fLeftChild = fSymbolTable->lookupNode(n->fText);
564 break;
565
566 case doCheckVarDef:
567 n = fNodeStack[fNodeStackPtr];
568 if (n->fLeftChild == NULL) {
569 error(U_BRK_UNDEFINED_VARIABLE);
570 returnVal = FALSE;
571 }
572 break;
573
574 case doExprFinished:
575 break;
576
577 case doRuleErrorAssignExpr:
578 error(U_BRK_ASSIGN_ERROR);
579 returnVal = FALSE;
580 break;
581
582 case doExit:
583 returnVal = FALSE;
584 break;
585
586 case doScanUnicodeSet:
587 scanSet();
588 break;
589
590 default:
591 error(U_BRK_INTERNAL_ERROR);
592 returnVal = FALSE;
593 break;
594 }
595 return returnVal && U_SUCCESS(*fRB->fStatus);
596 }
597
598
599
600
601 //------------------------------------------------------------------------------
602 //
603 // Error Report a rule parse error.
604 // Only report it if no previous error has been recorded.
605 //
606 //------------------------------------------------------------------------------
error(UErrorCode e)607 void RBBIRuleScanner::error(UErrorCode e) {
608 if (U_SUCCESS(*fRB->fStatus)) {
609 *fRB->fStatus = e;
610 if (fRB->fParseError) {
611 fRB->fParseError->line = fLineNum;
612 fRB->fParseError->offset = fCharNum;
613 fRB->fParseError->preContext[0] = 0;
614 fRB->fParseError->postContext[0] = 0;
615 }
616 }
617 }
618
619
620
621
622 //------------------------------------------------------------------------------
623 //
624 // fixOpStack The parse stack holds partially assembled chunks of the parse tree.
625 // An entry on the stack may be as small as a single setRef node,
626 // or as large as the parse tree
627 // for an entire expression (this will be the one item left on the stack
628 // when the parsing of an RBBI rule completes.
629 //
630 // This function is called when a binary operator is encountered.
631 // It looks back up the stack for operators that are not yet associated
632 // with a right operand, and if the precedence of the stacked operator >=
633 // the precedence of the current operator, binds the operand left,
634 // to the previously encountered operator.
635 //
636 //------------------------------------------------------------------------------
fixOpStack(RBBINode::OpPrecedence p)637 void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
638 RBBINode *n;
639 // printNodeStack("entering fixOpStack()");
640 for (;;) {
641 n = fNodeStack[fNodeStackPtr-1]; // an operator node
642 if (n->fPrecedence == 0) {
643 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
644 error(U_BRK_INTERNAL_ERROR);
645 return;
646 }
647
648 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
649 // The most recent operand goes with the current operator,
650 // not with the previously stacked one.
651 break;
652 }
653 // Stack operator is a binary op ( '|' or concatenation)
654 // TOS operand becomes right child of this operator.
655 // Resulting subexpression becomes the TOS operand.
656 n->fRightChild = fNodeStack[fNodeStackPtr];
657 fNodeStack[fNodeStackPtr]->fParent = n;
658 fNodeStackPtr--;
659 // printNodeStack("looping in fixOpStack() ");
660 }
661
662 if (p <= RBBINode::precLParen) {
663 // Scan is at a right paren or end of expression.
664 // The scanned item must match the stack, or else there was an error.
665 // Discard the left paren (or start expr) node from the stack,
666 // leaving the completed (sub)expression as TOS.
667 if (n->fPrecedence != p) {
668 // Right paren encountered matched start of expression node, or
669 // end of expression matched with a left paren node.
670 error(U_BRK_MISMATCHED_PAREN);
671 }
672 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
673 fNodeStackPtr--;
674 // Delete the now-discarded LParen or Start node.
675 delete n;
676 }
677 // printNodeStack("leaving fixOpStack()");
678 }
679
680
681
682
683 //------------------------------------------------------------------------------
684 //
685 // findSetFor given a UnicodeString,
686 // - find the corresponding Unicode Set (uset node)
687 // (create one if necessary)
688 // - Set fLeftChild of the caller's node (should be a setRef node)
689 // to the uset node
690 // Maintain a hash table of uset nodes, so the same one is always used
691 // for the same string.
692 // If a "to adopt" set is provided and we haven't seen this key before,
693 // add the provided set to the hash table.
694 // If the string is one (32 bit) char in length, the set contains
695 // just one element which is the char in question.
696 // If the string is "any", return a set containing all chars.
697 //
698 //------------------------------------------------------------------------------
findSetFor(const UnicodeString & s,RBBINode * node,UnicodeSet * setToAdopt)699 void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
700
701 RBBISetTableEl *el;
702
703 // First check whether we've already cached a set for this string.
704 // If so, just use the cached set in the new node.
705 // delete any set provided by the caller, since we own it.
706 el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
707 if (el != NULL) {
708 delete setToAdopt;
709 node->fLeftChild = el->val;
710 U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
711 return;
712 }
713
714 // Haven't seen this set before.
715 // If the caller didn't provide us with a prebuilt set,
716 // create a new UnicodeSet now.
717 if (setToAdopt == NULL) {
718 if (s.compare(kAny, -1) == 0) {
719 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
720 } else {
721 UChar32 c;
722 c = s.char32At(0);
723 setToAdopt = new UnicodeSet(c, c);
724 }
725 }
726
727 //
728 // Make a new uset node to refer to this UnicodeSet
729 // This new uset node becomes the child of the caller's setReference node.
730 //
731 RBBINode *usetNode = new RBBINode(RBBINode::uset);
732 if (usetNode == NULL) {
733 error(U_MEMORY_ALLOCATION_ERROR);
734 return;
735 }
736 usetNode->fInputSet = setToAdopt;
737 usetNode->fParent = node;
738 node->fLeftChild = usetNode;
739 usetNode->fText = s;
740
741
742 //
743 // Add the new uset node to the list of all uset nodes.
744 //
745 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
746
747
748 //
749 // Add the new set to the set hash table.
750 //
751 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
752 UnicodeString *tkey = new UnicodeString(s);
753 if (tkey == NULL || el == NULL || setToAdopt == NULL) {
754 // Delete to avoid memory leak
755 delete tkey;
756 tkey = NULL;
757 uprv_free(el);
758 el = NULL;
759 delete setToAdopt;
760 setToAdopt = NULL;
761
762 error(U_MEMORY_ALLOCATION_ERROR);
763 return;
764 }
765 el->key = tkey;
766 el->val = usetNode;
767 uhash_put(fSetTable, el->key, el, fRB->fStatus);
768
769 return;
770 }
771
772
773
774 //
775 // Assorted Unicode character constants.
776 // Numeric because there is no portable way to enter them as literals.
777 // (Think EBCDIC).
778 //
779 static const UChar chCR = 0x0d; // New lines, for terminating comments.
780 static const UChar chLF = 0x0a;
781 static const UChar chNEL = 0x85; // NEL newline variant
782 static const UChar chLS = 0x2028; // Unicode Line Separator
783 static const UChar chApos = 0x27; // single quote, for quoted chars.
784 static const UChar chPound = 0x23; // '#', introduces a comment.
785 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
786 static const UChar chLParen = 0x28;
787 static const UChar chRParen = 0x29;
788
789
790 //------------------------------------------------------------------------------
791 //
792 // stripRules Return a rules string without unnecessary
793 // characters.
794 //
795 //------------------------------------------------------------------------------
stripRules(const UnicodeString & rules)796 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
797 UnicodeString strippedRules;
798 int rulesLength = rules.length();
799 for (int idx = 0; idx < rulesLength; ) {
800 UChar ch = rules[idx++];
801 if (ch == chPound) {
802 while (idx < rulesLength
803 && ch != chCR && ch != chLF && ch != chNEL)
804 {
805 ch = rules[idx++];
806 }
807 }
808 if (!u_isISOControl(ch)) {
809 strippedRules.append(ch);
810 }
811 }
812 // strippedRules = strippedRules.unescape();
813 return strippedRules;
814 }
815
816
817 //------------------------------------------------------------------------------
818 //
819 // nextCharLL Low Level Next Char from rule input source.
820 // Get a char from the input character iterator,
821 // keep track of input position for error reporting.
822 //
823 //------------------------------------------------------------------------------
nextCharLL()824 UChar32 RBBIRuleScanner::nextCharLL() {
825 UChar32 ch;
826
827 if (fNextIndex >= fRB->fRules.length()) {
828 return (UChar32)-1;
829 }
830 ch = fRB->fRules.char32At(fNextIndex);
831 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
832
833 if (ch == chCR ||
834 ch == chNEL ||
835 ch == chLS ||
836 (ch == chLF && fLastChar != chCR)) {
837 // Character is starting a new line. Bump up the line number, and
838 // reset the column to 0.
839 fLineNum++;
840 fCharNum=0;
841 if (fQuoteMode) {
842 error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
843 fQuoteMode = FALSE;
844 }
845 }
846 else {
847 // Character is not starting a new line. Except in the case of a
848 // LF following a CR, increment the column position.
849 if (ch != chLF) {
850 fCharNum++;
851 }
852 }
853 fLastChar = ch;
854 return ch;
855 }
856
857
858 //------------------------------------------------------------------------------
859 //
860 // nextChar for rules scanning. At this level, we handle stripping
861 // out comments and processing backslash character escapes.
862 // The rest of the rules grammar is handled at the next level up.
863 //
864 //------------------------------------------------------------------------------
nextChar(RBBIRuleChar & c)865 void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
866
867 // Unicode Character constants needed for the processing done by nextChar(),
868 // in hex because literals wont work on EBCDIC machines.
869
870 fScanIndex = fNextIndex;
871 c.fChar = nextCharLL();
872 c.fEscaped = FALSE;
873
874 //
875 // check for '' sequence.
876 // These are recognized in all contexts, whether in quoted text or not.
877 //
878 if (c.fChar == chApos) {
879 if (fRB->fRules.char32At(fNextIndex) == chApos) {
880 c.fChar = nextCharLL(); // get nextChar officially so character counts
881 c.fEscaped = TRUE; // stay correct.
882 }
883 else
884 {
885 // Single quote, by itself.
886 // Toggle quoting mode.
887 // Return either '(' or ')', because quotes cause a grouping of the quoted text.
888 fQuoteMode = !fQuoteMode;
889 if (fQuoteMode == TRUE) {
890 c.fChar = chLParen;
891 } else {
892 c.fChar = chRParen;
893 }
894 c.fEscaped = FALSE; // The paren that we return is not escaped.
895 return;
896 }
897 }
898
899 if (fQuoteMode) {
900 c.fEscaped = TRUE;
901 }
902 else
903 {
904 // We are not in a 'quoted region' of the source.
905 //
906 if (c.fChar == chPound) {
907 // Start of a comment. Consume the rest of it.
908 // The new-line char that terminates the comment is always returned.
909 // It will be treated as white-space, and serves to break up anything
910 // that might otherwise incorrectly clump together with a comment in
911 // the middle (a variable name, for example.)
912 for (;;) {
913 c.fChar = nextCharLL();
914 if (c.fChar == (UChar32)-1 || // EOF
915 c.fChar == chCR ||
916 c.fChar == chLF ||
917 c.fChar == chNEL ||
918 c.fChar == chLS) {break;}
919 }
920 }
921 if (c.fChar == (UChar32)-1) {
922 return;
923 }
924
925 //
926 // check for backslash escaped characters.
927 // Use UnicodeString::unescapeAt() to handle them.
928 //
929 if (c.fChar == chBackSlash) {
930 c.fEscaped = TRUE;
931 int32_t startX = fNextIndex;
932 c.fChar = fRB->fRules.unescapeAt(fNextIndex);
933 if (fNextIndex == startX) {
934 error(U_BRK_HEX_DIGITS_EXPECTED);
935 }
936 fCharNum += fNextIndex-startX;
937 }
938 }
939 // putc(c.fChar, stdout);
940 }
941
942 //------------------------------------------------------------------------------
943 //
944 // Parse RBBI rules. The state machine for rules parsing is here.
945 // The state tables are hand-written in the file rbbirpt.txt,
946 // and converted to the form used here by a perl
947 // script rbbicst.pl
948 //
949 //------------------------------------------------------------------------------
parse()950 void RBBIRuleScanner::parse() {
951 uint16_t state;
952 const RBBIRuleTableEl *tableEl;
953
954 if (U_FAILURE(*fRB->fStatus)) {
955 return;
956 }
957
958 state = 1;
959 nextChar(fC);
960 //
961 // Main loop for the rule parsing state machine.
962 // Runs once per state transition.
963 // Each time through optionally performs, depending on the state table,
964 // - an advance to the the next input char
965 // - an action to be performed.
966 // - pushing or popping a state to/from the local state return stack.
967 //
968 for (;;) {
969 // Bail out if anything has gone wrong.
970 // RBBI rule file parsing stops on the first error encountered.
971 if (U_FAILURE(*fRB->fStatus)) {
972 break;
973 }
974
975 // Quit if state == 0. This is the normal way to exit the state machine.
976 //
977 if (state == 0) {
978 break;
979 }
980
981 // Find the state table element that matches the input char from the rule, or the
982 // class of the input character. Start with the first table row for this
983 // state, then linearly scan forward until we find a row that matches the
984 // character. The last row for each state always matches all characters, so
985 // the search will stop there, if not before.
986 //
987 tableEl = &gRuleParseStateTable[state];
988 #ifdef RBBI_DEBUG
989 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
990 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ",
991 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
992 }
993 #endif
994
995 for (;;) {
996 #ifdef RBBI_DEBUG
997 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
998 #endif
999 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) {
1000 // Table row specified an individual character, not a set, and
1001 // the input character is not escaped, and
1002 // the input character matched it.
1003 break;
1004 }
1005 if (tableEl->fCharClass == 255) {
1006 // Table row specified default, match anything character class.
1007 break;
1008 }
1009 if (tableEl->fCharClass == 254 && fC.fEscaped) {
1010 // Table row specified "escaped" and the char was escaped.
1011 break;
1012 }
1013 if (tableEl->fCharClass == 253 && fC.fEscaped &&
1014 (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
1015 // Table row specified "escaped P" and the char is either 'p' or 'P'.
1016 break;
1017 }
1018 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
1019 // Table row specified eof and we hit eof on the input.
1020 break;
1021 }
1022
1023 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
1024 fC.fEscaped == FALSE && // char is not escaped &&
1025 fC.fChar != (UChar32)-1) { // char is not EOF
1026 U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
1027 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
1028 // Table row specified a character class, or set of characters,
1029 // and the current char matches it.
1030 break;
1031 }
1032 }
1033
1034 // No match on this row, advance to the next row for this state,
1035 tableEl++;
1036 }
1037 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1038
1039 //
1040 // We've found the row of the state table that matches the current input
1041 // character from the rules string.
1042 // Perform any action specified by this row in the state table.
1043 if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
1044 // Break out of the state machine loop if the
1045 // the action signalled some kind of error, or
1046 // the action was to exit, occurs on normal end-of-rules-input.
1047 break;
1048 }
1049
1050 if (tableEl->fPushState != 0) {
1051 fStackPtr++;
1052 if (fStackPtr >= kStackSize) {
1053 error(U_BRK_INTERNAL_ERROR);
1054 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1055 fStackPtr--;
1056 }
1057 fStack[fStackPtr] = tableEl->fPushState;
1058 }
1059
1060 if (tableEl->fNextChar) {
1061 nextChar(fC);
1062 }
1063
1064 // Get the next state from the table entry, or from the
1065 // state stack if the next state was specified as "pop".
1066 if (tableEl->fNextState != 255) {
1067 state = tableEl->fNextState;
1068 } else {
1069 state = fStack[fStackPtr];
1070 fStackPtr--;
1071 if (fStackPtr < 0) {
1072 error(U_BRK_INTERNAL_ERROR);
1073 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1074 fStackPtr++;
1075 }
1076 }
1077
1078 }
1079
1080 //
1081 // If there were NO user specified reverse rules, set up the equivalent of ".*;"
1082 //
1083 if (fRB->fReverseTree == NULL) {
1084 fRB->fReverseTree = pushNewNode(RBBINode::opStar);
1085 RBBINode *operand = pushNewNode(RBBINode::setRef);
1086 if (U_FAILURE(*fRB->fStatus)) {
1087 return;
1088 }
1089 findSetFor(UnicodeString(TRUE, kAny, 3), operand);
1090 fRB->fReverseTree->fLeftChild = operand;
1091 operand->fParent = fRB->fReverseTree;
1092 fNodeStackPtr -= 2;
1093 }
1094
1095
1096 //
1097 // Parsing of the input RBBI rules is complete.
1098 // We now have a parse tree for the rule expressions
1099 // and a list of all UnicodeSets that are referenced.
1100 //
1101 #ifdef RBBI_DEBUG
1102 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1103 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
1104 {
1105 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1106 fRB->fForwardTree->printTree(TRUE);
1107 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1108 fRB->fReverseTree->printTree(TRUE);
1109 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1110 fRB->fSafeFwdTree->printTree(TRUE);
1111 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1112 fRB->fSafeRevTree->printTree(TRUE);
1113 }
1114 #endif
1115 }
1116
1117
1118 //------------------------------------------------------------------------------
1119 //
1120 // printNodeStack for debugging...
1121 //
1122 //------------------------------------------------------------------------------
1123 #ifdef RBBI_DEBUG
printNodeStack(const char * title)1124 void RBBIRuleScanner::printNodeStack(const char *title) {
1125 int i;
1126 RBBIDebugPrintf("%s. Dumping node stack...\n", title);
1127 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
1128 }
1129 #endif
1130
1131
1132
1133
1134 //------------------------------------------------------------------------------
1135 //
1136 // pushNewNode create a new RBBINode of the specified type and push it
1137 // onto the stack of nodes.
1138 //
1139 //------------------------------------------------------------------------------
pushNewNode(RBBINode::NodeType t)1140 RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) {
1141 if (U_FAILURE(*fRB->fStatus)) {
1142 return NULL;
1143 }
1144 fNodeStackPtr++;
1145 if (fNodeStackPtr >= kStackSize) {
1146 error(U_BRK_INTERNAL_ERROR);
1147 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1148 *fRB->fStatus = U_BRK_INTERNAL_ERROR;
1149 return NULL;
1150 }
1151 fNodeStack[fNodeStackPtr] = new RBBINode(t);
1152 if (fNodeStack[fNodeStackPtr] == NULL) {
1153 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1154 }
1155 return fNodeStack[fNodeStackPtr];
1156 }
1157
1158
1159
1160 //------------------------------------------------------------------------------
1161 //
1162 // scanSet Construct a UnicodeSet from the text at the current scan
1163 // position. Advance the scan position to the first character
1164 // after the set.
1165 //
1166 // A new RBBI setref node referring to the set is pushed onto the node
1167 // stack.
1168 //
1169 // The scan position is normally under the control of the state machine
1170 // that controls rule parsing. UnicodeSets, however, are parsed by
1171 // the UnicodeSet constructor, not by the RBBI rule parser.
1172 //
1173 //------------------------------------------------------------------------------
scanSet()1174 void RBBIRuleScanner::scanSet() {
1175 UnicodeSet *uset;
1176 ParsePosition pos;
1177 int startPos;
1178 int i;
1179
1180 if (U_FAILURE(*fRB->fStatus)) {
1181 return;
1182 }
1183
1184 pos.setIndex(fScanIndex);
1185 startPos = fScanIndex;
1186 UErrorCode localStatus = U_ZERO_ERROR;
1187 uset = new UnicodeSet();
1188 if (uset == NULL) {
1189 localStatus = U_MEMORY_ALLOCATION_ERROR;
1190 } else {
1191 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1192 }
1193 if (U_FAILURE(localStatus)) {
1194 // TODO: Get more accurate position of the error from UnicodeSet's return info.
1195 // UnicodeSet appears to not be reporting correctly at this time.
1196 #ifdef RBBI_DEBUG
1197 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
1198 #endif
1199 error(localStatus);
1200 delete uset;
1201 return;
1202 }
1203
1204 // Verify that the set contains at least one code point.
1205 //
1206 U_ASSERT(uset!=NULL);
1207 if (uset->isEmpty()) {
1208 // This set is empty.
1209 // Make it an error, because it almost certainly is not what the user wanted.
1210 // Also, avoids having to think about corner cases in the tree manipulation code
1211 // that occurs later on.
1212 error(U_BRK_RULE_EMPTY_SET);
1213 delete uset;
1214 return;
1215 }
1216
1217
1218 // Advance the RBBI parse postion over the UnicodeSet pattern.
1219 // Don't just set fScanIndex because the line/char positions maintained
1220 // for error reporting would be thrown off.
1221 i = pos.getIndex();
1222 for (;;) {
1223 if (fNextIndex >= i) {
1224 break;
1225 }
1226 nextCharLL();
1227 }
1228
1229 if (U_SUCCESS(*fRB->fStatus)) {
1230 RBBINode *n;
1231
1232 n = pushNewNode(RBBINode::setRef);
1233 if (U_FAILURE(*fRB->fStatus)) {
1234 return;
1235 }
1236 n->fFirstPos = startPos;
1237 n->fLastPos = fNextIndex;
1238 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1239 // findSetFor() serves several purposes here:
1240 // - Adopts storage for the UnicodeSet, will be responsible for deleting.
1241 // - Mantains collection of all sets in use, needed later for establishing
1242 // character categories for run time engine.
1243 // - Eliminates mulitiple instances of the same set.
1244 // - Creates a new uset node if necessary (if this isn't a duplicate.)
1245 findSetFor(n->fText, n, uset);
1246 }
1247
1248 }
1249
1250 U_NAMESPACE_END
1251
1252 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1253