• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 **********************************************************************
5 *   Copyright (c) 2001-2011, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 package com.ibm.icu.text;
10 
11 import java.text.ParsePosition;
12 import java.util.ArrayList;
13 import java.util.HashMap;
14 import java.util.List;
15 import java.util.Map;
16 
17 import com.ibm.icu.impl.IllegalIcuArgumentException;
18 import com.ibm.icu.impl.PatternProps;
19 import com.ibm.icu.impl.Utility;
20 import com.ibm.icu.lang.UCharacter;
21 import com.ibm.icu.text.RuleBasedTransliterator.Data;
22 
23 class TransliteratorParser {
24 
25     //----------------------------------------------------------------------
26     // Data members
27     //----------------------------------------------------------------------
28 
    /**
     * PUBLIC data member.
     * A list of RuleBasedTransliterator.Data objects, one for each discrete group
     * of rules in the rule set.
     */
34     public List<Data> dataVector;
35 
    /**
     * PUBLIC data member.
     * A list of Strings containing all of the ID blocks in the rule set.
     */
40     public List<String> idBlockVector;
41 
42     /**
43      * The current data object for which we are parsing rules
44      */
45     private Data curData;
46 
47     /**
48      * PUBLIC data member containing the parsed compound filter, if any.
49      */
50     public UnicodeSet compoundFilter;
51 
52 
53     private int direction;
54 
55     /**
56      * Temporary symbol table used during parsing.
57      */
58     private ParseData parseData;
59 
60     /**
61      * Temporary vector of set variables.  When parsing is complete, this
62      * is copied into the array data.variables.  As with data.variables,
63      * element 0 corresponds to character data.variablesBase.
64      */
65     private List<Object> variablesVector;
66 
67     /**
68      * Temporary table of variable names.  When parsing is complete, this is
69      * copied into data.variableNames.
70      */
71     private Map<String, char[]> variableNames;
72 
73     /**
74      * String of standins for segments.  Used during the parsing of a single
75      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
76      * to StringMatcher object segmentObjects.elementAt(0), etc.
77      */
78     private StringBuffer segmentStandins;
79 
    /**
     * List of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.get(0), etc.
     */
86     private List<StringMatcher> segmentObjects;
87 
88     /**
89      * The next available stand-in for variables.  This starts at some point in
90      * the private use area (discovered dynamically) and increments up toward
91      * <code>variableLimit</code>.  At any point during parsing, available
92      * variables are <code>variableNext..variableLimit-1</code>.
93      */
94     private char variableNext;
95 
96     /**
97      * The last available stand-in for variables.  This is discovered
98      * dynamically.  At any point during parsing, available variables are
99      * <code>variableNext..variableLimit-1</code>.  During variable definition
100      * we use the special value variableLimit-1 as a placeholder.
101      */
102     private char variableLimit;
103 
104     /**
105      * When we encounter an undefined variable, we do not immediately signal
106      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
107      * Instead, we save the name of the undefined variable, and substitute
108      * in the placeholder char variableLimit - 1, and decrement
109      * variableLimit.
110      */
111     private String undefinedVariableName;
112 
113     /**
114      * The stand-in character for the 'dot' set, represented by '.' in
115      * patterns.  This is allocated the first time it is needed, and
116      * reused thereafter.
117      */
118     private int dotStandIn = -1;
119 
120     //----------------------------------------------------------------------
121     // Constants
122     //----------------------------------------------------------------------
123 
124     // Indicator for ID blocks
125     private static final String ID_TOKEN = "::";
126     private static final int ID_TOKEN_LEN = 2;
127 
128 /*
129 (reserved for future expansion)
130     // markers for beginning and end of rule groups
131     private static final String BEGIN_TOKEN = "BEGIN";
132     private static final String END_TOKEN = "END";
133 */
134 
135     // Operators
136     private static final char VARIABLE_DEF_OP   = '=';
137     private static final char FORWARD_RULE_OP   = '>';
138     private static final char REVERSE_RULE_OP   = '<';
139     private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op
140 
141     private static final String OPERATORS = "=><\u2190\u2192\u2194";
142     private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
143 
144     // Other special characters
145     private static final char QUOTE               = '\'';
146     private static final char ESCAPE              = '\\';
147     private static final char END_OF_RULE         = ';';
148     private static final char RULE_COMMENT_CHAR   = '#';
149 
150     private static final char CONTEXT_ANTE        = '{'; // ante{key
151     private static final char CONTEXT_POST        = '}'; // key}post
152     private static final char CURSOR_POS          = '|';
153     private static final char CURSOR_OFFSET       = '@';
154     private static final char ANCHOR_START        = '^';
155 
156     private static final char KLEENE_STAR         = '*';
157     private static final char ONE_OR_MORE         = '+';
158     private static final char ZERO_OR_ONE         = '?';
159 
160     private static final char DOT                 = '.';
161     private static final String DOT_SET           = "[^[:Zp:][:Zl:]\\r\\n$]";
162 
163     // By definition, the ANCHOR_END special character is a
164     // trailing SymbolTable.SYMBOL_REF character.
165     // private static final char ANCHOR_END       = '$';
166 
167     // Segments of the input string are delimited by "(" and ")".  In the
168     // output string these segments are referenced as "$1", "$2", etc.
169     private static final char SEGMENT_OPEN        = '(';
170     private static final char SEGMENT_CLOSE       = ')';
171 
172     // A function is denoted &Source-Target/Variant(text)
173     private static final char FUNCTION            = '&';
174 
175     // Aliases for some of the syntax characters. These are provided so
176     // transliteration rules can be expressed in XML without clashing with
177     // XML syntax characters '<', '>', and '&'.
178     private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
179     private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
180     private static final char ALT_FWDREV_RULE_OP  = '\u2194'; // Left Right Arrow
181     private static final char ALT_FUNCTION        = '\u2206'; // Increment (~Greek Capital Delta)
182 
183     // Special characters disallowed at the top level
184     private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
185 
186     // Special characters disallowed within a segment
187     private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
188 
189     // Special characters disallowed within a function argument
190     private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
191 
192     //----------------------------------------------------------------------
193     // class ParseData
194     //----------------------------------------------------------------------
195 
196     /**
197      * This class implements the SymbolTable interface.  It is used
198      * during parsing to give UnicodeSet access to variables that
199      * have been defined so far.  Note that it uses variablesVector,
200      * _not_ data.variables.
201      */
202     private class ParseData implements SymbolTable {
203 
204         /**
205          * Implement SymbolTable API.
206          */
207         @Override
lookup(String name)208         public char[] lookup(String name) {
209             return variableNames.get(name);
210         }
211 
212         /**
213          * Implement SymbolTable API.
214          */
215         @Override
lookupMatcher(int ch)216         public UnicodeMatcher lookupMatcher(int ch) {
217             // Note that we cannot use data.lookup() because the
218             // set array has not been constructed yet.
219             int i = ch - curData.variablesBase;
220             if (i >= 0 && i < variablesVector.size()) {
221                 return (UnicodeMatcher) variablesVector.get(i);
222             }
223             return null;
224         }
225 
226         /**
227          * Implement SymbolTable API.  Parse out a symbol reference
228          * name.
229          */
230         @Override
parseReference(String text, ParsePosition pos, int limit)231         public String parseReference(String text, ParsePosition pos, int limit) {
232             int start = pos.getIndex();
233             int i = start;
234             while (i < limit) {
235                 char c = text.charAt(i);
236                 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
237                     !UCharacter.isUnicodeIdentifierPart(c)) {
238                     break;
239                 }
240                 ++i;
241             }
242             if (i == start) { // No valid name chars
243                 return null;
244             }
245             pos.setIndex(i);
246             return text.substring(start, i);
247         }
248 
249         /**
250          * Return true if the given character is a matcher standin or a plain
251          * character (non standin).
252          */
isMatcher(int ch)253         public boolean isMatcher(int ch) {
254             // Note that we cannot use data.lookup() because the
255             // set array has not been constructed yet.
256             int i = ch - curData.variablesBase;
257             if (i >= 0 && i < variablesVector.size()) {
258                 return variablesVector.get(i) instanceof UnicodeMatcher;
259             }
260             return true;
261         }
262 
263         /**
264          * Return true if the given character is a replacer standin or a plain
265          * character (non standin).
266          */
isReplacer(int ch)267         public boolean isReplacer(int ch) {
268             // Note that we cannot use data.lookup() because the
269             // set array has not been constructed yet.
270             int i = ch - curData.variablesBase;
271             if (i >= 0 && i < variablesVector.size()) {
272                 return variablesVector.get(i) instanceof UnicodeReplacer;
273             }
274             return true;
275         }
276     }
277 
278     //----------------------------------------------------------------------
279     // classes RuleBody, RuleArray, and RuleReader
280     //----------------------------------------------------------------------
281 
282     /**
283      * A private abstract class representing the interface to rule
284      * source code that is broken up into lines.  Handles the
285      * folding of lines terminated by a backslash.  This folding
286      * is limited; it does not account for comments, quotes, or
287      * escapes, so its use to be limited.
288      */
289     private static abstract class RuleBody {
290 
291         /**
292          * Retrieve the next line of the source, or return null if
293          * none.  Folds lines terminated by a backslash into the
294          * next line, without regard for comments, quotes, or
295          * escapes.
296          */
nextLine()297         String nextLine() {
298             String s = handleNextLine();
299             if (s != null &&
300                 s.length() > 0 &&
301                 s.charAt(s.length() - 1) == '\\') {
302                 StringBuilder b = new StringBuilder(s);
303                 do {
304                     b.deleteCharAt(b.length()-1);
305                     s = handleNextLine();
306                     if (s == null) {
307                         break;
308                     }
309                     b.append(s);
310                 } while (s.length() > 0 &&
311                          s.charAt(s.length() - 1) == '\\');
312                 s = b.toString();
313             }
314             return s;
315         }
316 
317         /**
318          * Reset to the first line of the source.
319          */
reset()320         abstract void reset();
321 
322         /**
323          * Subclass method to return the next line of the source.
324          */
handleNextLine()325         abstract String handleNextLine();
326     }
327 
328     /**
329      * RuleBody subclass for a String[] array.
330      */
331     private static class RuleArray extends RuleBody {
332         String[] array;
333         int i;
RuleArray(String[] array)334         public RuleArray(String[] array) { this.array = array; i = 0; }
335         @Override
handleNextLine()336         public String handleNextLine() {
337             return (i < array.length) ? array[i++] : null;
338         }
339         @Override
reset()340         public void reset() {
341             i = 0;
342         }
343     }
344 
345     /*
346      * RuleBody subclass for a ResourceReader.
347      */
348 /*    private static class RuleReader extends RuleBody {
349         ResourceReader reader;
350         public RuleReader(ResourceReader reader) { this.reader = reader; }
351         public String handleNextLine() {
352             try {
353                 return reader.readLine();
354             } catch (java.io.IOException e) {}
355             return null;
356         }
357         public void reset() {
358             reader.reset();
359         }
360     }*/
361 
362     //----------------------------------------------------------------------
363     // class RuleHalf
364     //----------------------------------------------------------------------
365 
366     /**
367      * A class representing one side of a rule.  This class knows how to
368      * parse half of a rule.  It is tightly coupled to the method
369      * TransliteratorParser.parseRule().
370      */
371     private static class RuleHalf {
372 
373         public String text;
374 
375         public int cursor = -1; // position of cursor in text
376         public int ante = -1;   // position of ante context marker '{' in text
377         public int post = -1;   // position of post context marker '}' in text
378 
379         // Record the offset to the cursor either to the left or to the
380         // right of the key.  This is indicated by characters on the output
381         // side that allow the cursor to be positioned arbitrarily within
382         // the matching text.  For example, abc{def} > | @@@ xyz; changes
383         // def to xyz and moves the cursor to before abc.  Offset characters
384         // must be at the start or end, and they cannot move the cursor past
385         // the ante- or postcontext text.  Placeholders are only valid in
386         // output text.  The length of the ante and post context is
387         // determined at runtime, because of supplementals and quantifiers.
388         public int cursorOffset = 0; // only nonzero on output side
389 
390         // Position of first CURSOR_OFFSET on _right_.  This will be -1
391         // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
392         private int cursorOffsetPos = 0;
393 
394         public boolean anchorStart = false;
395         public boolean anchorEnd   = false;
396 
397         /**
398          * The segment number from 1..n of the next '(' we see
399          * during parsing; 1-based.
400          */
401         private int nextSegmentNumber = 1;
402 
403         /**
404          * Parse one side of a rule, stopping at either the limit,
405          * the END_OF_RULE character, or an operator.
406          * @return the index after the terminating character, or
407          * if limit was reached, limit
408          */
parse(String rule, int pos, int limit, TransliteratorParser parser)409         public int parse(String rule, int pos, int limit,
410                          TransliteratorParser parser) {
411             int start = pos;
412             StringBuffer buf = new StringBuffer();
413             pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
414             text = buf.toString();
415 
416             if (cursorOffset > 0 && cursor != cursorOffsetPos) {
417                 syntaxError("Misplaced " + CURSOR_POS, rule, start);
418             }
419 
420             return pos;
421         }
422 
423         /**
424          * Parse a section of one side of a rule, stopping at either
425          * the limit, the END_OF_RULE character, an operator, or a
426          * segment close character.  This method parses both a
427          * top-level rule half and a segment within such a rule half.
428          * It calls itself recursively to parse segments and nested
429          * segments.
430          * @param buf buffer into which to accumulate the rule pattern
431          * characters, either literal characters from the rule or
432          * standins for UnicodeMatcher objects including segments.
433          * @param illegal the set of special characters that is illegal during
434          * this parse.
435          * @param isSegment if true, then we've already seen a '(' and
436          * pos on entry points right after it.  Accumulate everything
437          * up to the closing ')', put it in a segment matcher object,
438          * generate a standin for it, and add the standin to buf.  As
439          * a side effect, update the segments vector with a reference
440          * to the segment matcher.  This works recursively for nested
441          * segments.  If isSegment is false, just accumulate
442          * characters into buf.
443          * @return the index after the terminating character, or
444          * if limit was reached, limit
445          */
parseSection(String rule, int pos, int limit, TransliteratorParser parser, StringBuffer buf, UnicodeSet illegal, boolean isSegment)446         private int parseSection(String rule, int pos, int limit,
447                                  TransliteratorParser parser,
448                                  StringBuffer buf,
449                                  UnicodeSet illegal,
450                                  boolean isSegment) {
451             int start = pos;
452             ParsePosition pp = null;
453             int quoteStart = -1; // Most recent 'single quoted string'
454             int quoteLimit = -1;
455             int varStart = -1; // Most recent $variableReference
456             int varLimit = -1;
457             int[] iref = new int[1];
458             int bufStart = buf.length();
459 
460         main:
461             while (pos < limit) {
462                 // Since all syntax characters are in the BMP, fetching
463                 // 16-bit code units suffices here.
464                 char c = rule.charAt(pos++);
465                 if (PatternProps.isWhiteSpace(c)) {
466                     continue;
467                 }
468                 // HALF_ENDERS is all chars that end a rule half: "<>=;"
469                 if (HALF_ENDERS.indexOf(c) >= 0) {
470                     ///CLOVER:OFF
471                     // isSegment is always false
472                     if (isSegment) {
473                         syntaxError("Unclosed segment", rule, start);
474                     }
475                     ///CLOVER:ON
476                     break main;
477                 }
478                 if (anchorEnd) {
479                     // Text after a presumed end anchor is a syntax err
480                     syntaxError("Malformed variable reference", rule, start);
481                 }
482                 if (UnicodeSet.resemblesPattern(rule, pos-1)) {
483                     if (pp == null) {
484                         pp = new ParsePosition(0);
485                     }
486                     pp.setIndex(pos-1); // Backup to opening '['
487                     buf.append(parser.parseSet(rule, pp));
488                     pos = pp.getIndex();
489                     continue;
490                 }
491                 // Handle escapes
492                 if (c == ESCAPE) {
493                     if (pos == limit) {
494                         syntaxError("Trailing backslash", rule, start);
495                     }
496                     iref[0] = pos;
497                     int escaped = Utility.unescapeAt(rule, iref);
498                     pos = iref[0];
499                     if (escaped == -1) {
500                         syntaxError("Malformed escape", rule, start);
501                     }
502                     parser.checkVariableRange(escaped, rule, start);
503                     UTF16.append(buf, escaped);
504                     continue;
505                 }
506                 // Handle quoted matter
507                 if (c == QUOTE) {
508                     int iq = rule.indexOf(QUOTE, pos);
509                     if (iq == pos) {
510                         buf.append(c); // Parse [''] outside quotes as [']
511                         ++pos;
512                     } else {
513                         /* This loop picks up a run of quoted text of the
514                          * form 'aaaa' each time through.  If this run
515                          * hasn't really ended ('aaaa''bbbb') then it keeps
516                          * looping, each time adding on a new run.  When it
517                          * reaches the final quote it breaks.
518                          */
519                         quoteStart = buf.length();
520                         for (;;) {
521                             if (iq < 0) {
522                                 syntaxError("Unterminated quote", rule, start);
523                             }
524                             buf.append(rule.substring(pos, iq));
525                             pos = iq+1;
526                             if (pos < limit && rule.charAt(pos) == QUOTE) {
527                             // Parse [''] inside quotes as [']
528                                 iq = rule.indexOf(QUOTE, pos+1);
529                             // Continue looping
530                             } else {
531                                 break;
532                             }
533                         }
534                         quoteLimit = buf.length();
535 
536                         for (iq=quoteStart; iq<quoteLimit; ++iq) {
537                             parser.checkVariableRange(buf.charAt(iq), rule, start);
538                         }
539                     }
540                     continue;
541                 }
542 
543                 parser.checkVariableRange(c, rule, start);
544 
545                 if (illegal.contains(c)) {
546                     syntaxError("Illegal character '" + c + '\'', rule, start);
547                 }
548 
549                 switch (c) {
550 
551                 //------------------------------------------------------
552                 // Elements allowed within and out of segments
553                 //------------------------------------------------------
554                 case ANCHOR_START:
555                     if (buf.length() == 0 && !anchorStart) {
556                         anchorStart = true;
557                     } else {
558                         syntaxError("Misplaced anchor start",
559                                     rule, start);
560                     }
561                     break;
562                 case SEGMENT_OPEN:
563                     {
564                         // bufSegStart is the offset in buf to the first
565                         // character of the segment we are parsing.
566                         int bufSegStart = buf.length();
567 
568                         // Record segment number now, since nextSegmentNumber
569                         // will be incremented during the call to parseSection
570                         // if there are nested segments.
571                         int segmentNumber = nextSegmentNumber++; // 1-based
572 
573                         // Parse the segment
574                         pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
575 
576                         // After parsing a segment, the relevant characters are
577                         // in buf, starting at offset bufSegStart.  Extract them
578                         // into a string matcher, and replace them with a
579                         // standin for that matcher.
580                         StringMatcher m =
581                             new StringMatcher(buf.substring(bufSegStart),
582                                               segmentNumber, parser.curData);
583 
584                         // Record and associate object and segment number
585                         parser.setSegmentObject(segmentNumber, m);
586                         buf.setLength(bufSegStart);
587                         buf.append(parser.getSegmentStandin(segmentNumber));
588                     }
589                     break;
590                 case FUNCTION:
591                 case ALT_FUNCTION:
592                     {
593                         iref[0] = pos;
594                         TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
595                         // The next character MUST be a segment open
596                         if (single == null ||
597                             !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
598                             syntaxError("Invalid function", rule, start);
599                         }
600 
601                         Transliterator t = single.getInstance();
602                         if (t == null) {
603                             syntaxError("Invalid function ID", rule, start);
604                         }
605 
606                         // bufSegStart is the offset in buf to the first
607                         // character of the segment we are parsing.
608                         int bufSegStart = buf.length();
609 
610                         // Parse the segment
611                         pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
612 
613                         // After parsing a segment, the relevant characters are
614                         // in buf, starting at offset bufSegStart.
615                         FunctionReplacer r =
616                             new FunctionReplacer(t,
617                                 new StringReplacer(buf.substring(bufSegStart), parser.curData));
618 
619                         // Replace the buffer contents with a stand-in
620                         buf.setLength(bufSegStart);
621                         buf.append(parser.generateStandInFor(r));
622                     }
623                     break;
624                 case SymbolTable.SYMBOL_REF:
625                     // Handle variable references and segment references "$1" .. "$9"
626                     {
627                         // A variable reference must be followed immediately
628                         // by a Unicode identifier start and zero or more
629                         // Unicode identifier part characters, or by a digit
630                         // 1..9 if it is a segment reference.
631                         if (pos == limit) {
632                             // A variable ref character at the end acts as
633                             // an anchor to the context limit, as in perl.
634                             anchorEnd = true;
635                             break;
636                         }
637                         // Parse "$1" "$2" .. "$9" .. (no upper limit)
638                         c = rule.charAt(pos);
639                         int r = UCharacter.digit(c, 10);
640                         if (r >= 1 && r <= 9) {
641                             iref[0] = pos;
642                             r = Utility.parseNumber(rule, iref, 10);
643                             if (r < 0) {
644                                 syntaxError("Undefined segment reference",
645                                             rule, start);
646                             }
647                             pos = iref[0];
648                             buf.append(parser.getSegmentStandin(r));
649                         } else {
650                             if (pp == null) { // Lazy create
651                                 pp = new ParsePosition(0);
652                             }
653                             pp.setIndex(pos);
654                             String name = parser.parseData.
655                                 parseReference(rule, pp, limit);
656                             if (name == null) {
657                                 // This means the '$' was not followed by a
658                                 // valid name.  Try to interpret it as an
659                                 // end anchor then.  If this also doesn't work
660                                 // (if we see a following character) then signal
661                                 // an error.
662                                 anchorEnd = true;
663                                 break;
664                             }
665                             pos = pp.getIndex();
666                             // If this is a variable definition statement,
667                             // then the LHS variable will be undefined.  In
668                             // that case appendVariableDef() will append the
669                             // special placeholder char variableLimit-1.
670                             varStart = buf.length();
671                             parser.appendVariableDef(name, buf);
672                             varLimit = buf.length();
673                         }
674                     }
675                     break;
676                 case DOT:
677                     buf.append(parser.getDotStandIn());
678                     break;
679                 case KLEENE_STAR:
680                 case ONE_OR_MORE:
681                 case ZERO_OR_ONE:
682                     // Quantifiers.  We handle single characters, quoted strings,
683                     // variable references, and segments.
684                     //  a+      matches  aaa
685                     //  'foo'+  matches  foofoofoo
686                     //  $v+     matches  xyxyxy if $v == xy
687                     //  (seg)+  matches  segsegseg
688                     {
689                         ///CLOVER:OFF
690                         // isSegment is always false
691                         if (isSegment && buf.length() == bufStart) {
692                             // The */+ immediately follows '('
693                             syntaxError("Misplaced quantifier", rule, start);
694                             break;
695                         }
696                         ///CLOVER:ON
697 
698                         int qstart, qlimit;
699                         // The */+ follows an isolated character or quote
700                         // or variable reference
701                         if (buf.length() == quoteLimit) {
702                             // The */+ follows a 'quoted string'
703                             qstart = quoteStart;
704                             qlimit = quoteLimit;
705                         } else if (buf.length() == varLimit) {
706                             // The */+ follows a $variableReference
707                             qstart = varStart;
708                             qlimit = varLimit;
709                         } else {
710                             // The */+ follows a single character, possibly
711                             // a segment standin
712                             qstart = buf.length() - 1;
713                             qlimit = qstart + 1;
714                         }
715 
716                         UnicodeMatcher m;
717                         try {
718                             m = new StringMatcher(buf.toString(), qstart, qlimit,
719                                               0, parser.curData);
720                         } catch (RuntimeException e) {
721                             final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
722                             final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
723                             throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
724                                     + postContext).initCause(e);
725                         }
726                         int min = 0;
727                         int max = Quantifier.MAX;
728                         switch (c) {
729                         case ONE_OR_MORE:
730                             min = 1;
731                             break;
732                         case ZERO_OR_ONE:
733                             min = 0;
734                             max = 1;
735                             break;
736                             // case KLEENE_STAR:
737                             //    do nothing -- min, max already set
738                         }
739                         m = new Quantifier(m, min, max);
740                         buf.setLength(qstart);
741                         buf.append(parser.generateStandInFor(m));
742                     }
743                     break;
744 
745                 //------------------------------------------------------
746                 // Elements allowed ONLY WITHIN segments
747                 //------------------------------------------------------
748                 case SEGMENT_CLOSE:
749                     // assert(isSegment);
750                     // We're done parsing a segment.
751                     break main;
752 
753                 //------------------------------------------------------
754                 // Elements allowed ONLY OUTSIDE segments
755                 //------------------------------------------------------
756                 case CONTEXT_ANTE:
757                     if (ante >= 0) {
758                         syntaxError("Multiple ante contexts", rule, start);
759                     }
760                     ante = buf.length();
761                     break;
762                 case CONTEXT_POST:
763                     if (post >= 0) {
764                         syntaxError("Multiple post contexts", rule, start);
765                     }
766                     post = buf.length();
767                     break;
768                 case CURSOR_POS:
769                     if (cursor >= 0) {
770                         syntaxError("Multiple cursors", rule, start);
771                     }
772                     cursor = buf.length();
773                     break;
774                 case CURSOR_OFFSET:
775                     if (cursorOffset < 0) {
776                         if (buf.length() > 0) {
777                             syntaxError("Misplaced " + c, rule, start);
778                         }
779                         --cursorOffset;
780                     } else if (cursorOffset > 0) {
781                         if (buf.length() != cursorOffsetPos || cursor >= 0) {
782                             syntaxError("Misplaced " + c, rule, start);
783                         }
784                         ++cursorOffset;
785                     } else {
786                         if (cursor == 0 && buf.length() == 0) {
787                             cursorOffset = -1;
788                         } else if (cursor < 0) {
789                             cursorOffsetPos = buf.length();
790                             cursorOffset = 1;
791                         } else {
792                             syntaxError("Misplaced " + c, rule, start);
793                         }
794                     }
795                     break;
796 
797                 //------------------------------------------------------
798                 // Non-special characters
799                 //------------------------------------------------------
800                 default:
801                     // Disallow unquoted characters other than [0-9A-Za-z]
802                     // in the printable ASCII range.  These characters are
803                     // reserved for possible future use.
804                     if (c >= 0x0021 && c <= 0x007E &&
805                         !((c >= '0' && c <= '9') ||
806                           (c >= 'A' && c <= 'Z') ||
807                           (c >= 'a' && c <= 'z'))) {
808                         syntaxError("Unquoted " + c, rule, start);
809                     }
810                     buf.append(c);
811                     break;
812                 }
813             }
814             return pos;
815         }
816 
817         /**
818          * Remove context.
819          */
removeContext()820         void removeContext() {
821             text = text.substring(ante < 0 ? 0 : ante,
822                                   post < 0 ? text.length() : post);
823             ante = post = -1;
824             anchorStart = anchorEnd = false;
825         }
826 
827         /**
828          * Return true if this half looks like valid output, that is, does not
829          * contain quantifiers or other special input-only elements.
830          */
isValidOutput(TransliteratorParser parser)831         public boolean isValidOutput(TransliteratorParser parser) {
832             for (int i=0; i<text.length(); ) {
833                 int c = UTF16.charAt(text, i);
834                 i += UTF16.getCharCount(c);
835                 if (!parser.parseData.isReplacer(c)) {
836                     return false;
837                 }
838             }
839             return true;
840         }
841 
842         /**
843          * Return true if this half looks like valid input, that is, does not
844          * contain functions or other special output-only elements.
845          */
isValidInput(TransliteratorParser parser)846         public boolean isValidInput(TransliteratorParser parser) {
847             for (int i=0; i<text.length(); ) {
848                 int c = UTF16.charAt(text, i);
849                 i += UTF16.getCharCount(c);
850                 if (!parser.parseData.isMatcher(c)) {
851                     return false;
852                 }
853             }
854             return true;
855         }
856     }
857 
858     //----------------------------------------------------------------------
859     // PUBLIC methods
860     //----------------------------------------------------------------------
861 
862     /**
863      * Constructor.
864      */
TransliteratorParser()865     public TransliteratorParser() {
866     }
867 
868     /**
869      * Parse a set of rules.  After the parse completes, examine the public
870      * data members for results.
871      */
parse(String rules, int dir)872     public void parse(String rules, int dir) {
873         parseRules(new RuleArray(new String[] { rules }), dir);
874     }
875 
876     /*
877      * Parse a set of rules.  After the parse completes, examine the public
878      * data members for results.
879      */
880 /*    public void parse(ResourceReader rules, int direction) {
881         parseRules(new RuleReader(rules), direction);
882     }*/
883 
884     //----------------------------------------------------------------------
885     // PRIVATE methods
886     //----------------------------------------------------------------------
887 
888     /**
889      * Parse an array of zero or more rules.  The strings in the array are
890      * treated as if they were concatenated together, with rule terminators
891      * inserted between array elements if not present already.
892      *
893      * Any previous rules are discarded.  Typically this method is called exactly
894      * once, during construction.
895      *
896      * The member this.data will be set to null if there are no rules.
897      *
898      * @exception IllegalIcuArgumentException if there is a syntax error in the
899      * rules
900      */
parseRules(RuleBody ruleArray, int dir)901     void parseRules(RuleBody ruleArray, int dir) {
902         boolean parsingIDs = true;
903         int ruleCount = 0;
904 
905         dataVector = new ArrayList<Data>();
906         idBlockVector = new ArrayList<String>();
907         curData = null;
908         direction = dir;
909         compoundFilter = null;
910         variablesVector = new ArrayList<Object>();
911         variableNames = new HashMap<String, char[]>();
912         parseData = new ParseData();
913 
914         List<RuntimeException> errors = new ArrayList<RuntimeException>();
915         int errorCount = 0;
916 
917         ruleArray.reset();
918 
919         StringBuilder idBlockResult = new StringBuilder();
920 
921         // The compound filter offset is an index into idBlockResult.
922         // If it is 0, then the compound filter occurred at the start,
923         // and it is the offset to the _start_ of the compound filter
924         // pattern.  Otherwise it is the offset to the _limit_ of the
925         // compound filter pattern within idBlockResult.
926         this.compoundFilter = null;
927         int compoundFilterOffset = -1;
928 
929     main:
930         for (;;) {
931             String rule = ruleArray.nextLine();
932             if (rule == null) {
933                 break;
934             }
935             int pos = 0;
936             int limit = rule.length();
937             while (pos < limit) {
938                 char c = rule.charAt(pos++);
939                 if (PatternProps.isWhiteSpace(c)) {
940                     continue;
941                 }
942                 // Skip lines starting with the comment character
943                 if (c == RULE_COMMENT_CHAR) {
944                     pos = rule.indexOf("\n", pos) + 1;
945                     if (pos == 0) {
946                         break; // No "\n" found; rest of rule is a commnet
947                     }
948                     continue; // Either fall out or restart with next line
949                 }
950 
951                 // skip empty rules
952                 if (c == END_OF_RULE)
953                     continue;
954 
955                 // Often a rule file contains multiple errors.  It's
956                 // convenient to the rule author if these are all reported
957                 // at once.  We keep parsing rules even after a failure, up
958                 // to a specified limit, and report all errors at once.
959                 try {
960                     ++ruleCount;
961 
962                     // We've found the start of a rule or ID.  c is its first
963                     // character, and pos points past c.
964                     --pos;
965                     // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
966                     // chars left.
967                     if ((pos + ID_TOKEN_LEN + 1) <= limit &&
968                             rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
969                         pos += ID_TOKEN_LEN;
970                         c = rule.charAt(pos);
971                         while (PatternProps.isWhiteSpace(c) && pos < limit) {
972                             ++pos;
973                             c = rule.charAt(pos);
974                         }
975                         int[] p = new int[] { pos };
976 
977                         if (!parsingIDs) {
978                             if (curData != null) {
979                                 if (direction == Transliterator.FORWARD)
980                                     dataVector.add(curData);
981                                 else
982                                     dataVector.add(0, curData);
983                                 curData = null;
984                             }
985                             parsingIDs = true;
986                         }
987 
988                         TransliteratorIDParser.SingleID id =
989                             TransliteratorIDParser.parseSingleID(
990                                           rule, p, direction);
991                         if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
992                             // Successful ::ID parse.
993 
994                             if (direction == Transliterator.FORWARD) {
995                                 idBlockResult.append(id.canonID).append(END_OF_RULE);
996                             } else {
997                                 idBlockResult.insert(0, id.canonID + END_OF_RULE);
998                             }
999 
1000                         } else {
1001                             // Couldn't parse an ID.  Try to parse a global filter
1002                             int[] withParens = new int[] { -1 };
1003                             UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
1004                             if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
1005                                 if ((direction == Transliterator.FORWARD) ==
1006                                     (withParens[0] == 0)) {
1007                                     if (compoundFilter != null) {
1008                                         // Multiple compound filters
1009                                         syntaxError("Multiple global filters", rule, pos);
1010                                     }
1011                                     compoundFilter = f;
1012                                     compoundFilterOffset = ruleCount;
1013                                }
1014                             } else {
1015                                 // Invalid ::id
1016                                 // Can be parsed as neither an ID nor a global filter
1017                                 syntaxError("Invalid ::ID", rule, pos);
1018                             }
1019                         }
1020 
1021                         pos = p[0];
1022                     } else {
1023                         if (parsingIDs) {
1024                             if (direction == Transliterator.FORWARD)
1025                                 idBlockVector.add(idBlockResult.toString());
1026                             else
1027                                 idBlockVector.add(0, idBlockResult.toString());
1028                             idBlockResult.delete(0, idBlockResult.length());
1029                             parsingIDs = false;
1030                             curData = new RuleBasedTransliterator.Data();
1031 
1032                             // By default, rules use part of the private use area
1033                             // E000..F8FF for variables and other stand-ins.  Currently
1034                             // the range F000..F8FF is typically sufficient.  The 'use
1035                             // variable range' pragma allows rule sets to modify this.
1036                             setVariableRange(0xF000, 0xF8FF);
1037                         }
1038 
1039                         if (resemblesPragma(rule, pos, limit)) {
1040                             int ppp = parsePragma(rule, pos, limit);
1041                             if (ppp < 0) {
1042                                 syntaxError("Unrecognized pragma", rule, pos);
1043                             }
1044                             pos = ppp;
1045                         // Parse a rule
1046                         } else {
1047                             pos = parseRule(rule, pos, limit);
1048                         }
1049                     }
1050                 } catch (IllegalArgumentException e) {
1051                     if (errorCount == 30) {
1052                         IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched");
1053                         icuEx.initCause(e);
1054                         errors.add(icuEx);
1055                         break main;
1056                     }
1057                     e.fillInStackTrace();
1058                     errors.add(e);
1059                     ++errorCount;
1060                     pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1061                 }
1062             }
1063         }
1064         if (parsingIDs && idBlockResult.length() > 0) {
1065             if (direction == Transliterator.FORWARD)
1066                 idBlockVector.add(idBlockResult.toString());
1067             else
1068                 idBlockVector.add(0, idBlockResult.toString());
1069         }
1070         else if (!parsingIDs && curData != null) {
1071             if (direction == Transliterator.FORWARD)
1072                 dataVector.add(curData);
1073             else
1074                 dataVector.add(0, curData);
1075         }
1076 
1077         // Convert the set vector to an array
1078         for (int i = 0; i < dataVector.size(); i++) {
1079             Data data = dataVector.get(i);
1080             data.variables = new Object[variablesVector.size()];
1081             variablesVector.toArray(data.variables);
1082             data.variableNames = new HashMap<String, char[]>();
1083             data.variableNames.putAll(variableNames);
1084         }
1085         variablesVector = null;
1086 
1087         // Do more syntax checking and index the rules
1088         try {
1089             if (compoundFilter != null) {
1090                 if ((direction == Transliterator.FORWARD &&
1091                      compoundFilterOffset != 1) ||
1092                     (direction == Transliterator.REVERSE &&
1093                      compoundFilterOffset != ruleCount)) {
1094                     throw new IllegalIcuArgumentException("Compound filters misplaced");
1095                 }
1096             }
1097 
1098             for (int i = 0; i < dataVector.size(); i++) {
1099                 Data data = dataVector.get(i);
1100                 data.ruleSet.freeze();
1101             }
1102 
1103             if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
1104                 idBlockVector.remove(0);
1105 
1106         } catch (IllegalArgumentException e) {
1107             e.fillInStackTrace();
1108             errors.add(e);
1109         }
1110 
1111         if (errors.size() != 0) {
1112             for (int i = errors.size()-1; i > 0; --i) {
1113                 RuntimeException previous = errors.get(i-1);
1114                 while (previous.getCause() != null) {
1115                     previous = (RuntimeException) previous.getCause(); // chain specially
1116                 }
1117                 previous.initCause(errors.get(i));
1118             }
1119             throw errors.get(0);
1120             // if initCause not supported: throw new IllegalArgumentException(errors.toString());
1121         }
1122     }
1123 
1124     /**
1125      * MAIN PARSER.  Parse the next rule in the given rule string, starting
1126      * at pos.  Return the index after the last character parsed.  Do not
1127      * parse characters at or after limit.
1128      *
1129      * Important:  The character at pos must be a non-whitespace character
1130      * that is not the comment character.
1131      *
1132      * This method handles quoting, escaping, and whitespace removal.  It
1133      * parses the end-of-rule character.  It recognizes context and cursor
1134      * indicators.  Once it does a lexical breakdown of the rule at pos, it
1135      * creates a rule object and adds it to our rule list.
1136      *
1137      * This method is tightly coupled to the inner class RuleHalf.
1138      */
parseRule(String rule, int pos, int limit)1139     private int parseRule(String rule, int pos, int limit) {
1140         // Locate the left side, operator, and right side
1141         int start = pos;
1142         char operator = 0;
1143 
1144         // Set up segments data
1145         segmentStandins = new StringBuffer();
1146         segmentObjects = new ArrayList<StringMatcher>();
1147 
1148         RuleHalf left  = new RuleHalf();
1149         RuleHalf right = new RuleHalf();
1150 
1151         undefinedVariableName = null;
1152         pos = left.parse(rule, pos, limit, this);
1153 
1154         if (pos == limit ||
1155             OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1156             syntaxError("No operator pos=" + pos, rule, start);
1157         }
1158         ++pos;
1159 
1160         // Found an operator char.  Check for forward-reverse operator.
1161         if (operator == REVERSE_RULE_OP &&
1162             (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1163             ++pos;
1164             operator = FWDREV_RULE_OP;
1165         }
1166 
1167         // Translate alternate op characters.
1168         switch (operator) {
1169         case ALT_FORWARD_RULE_OP:
1170             operator = FORWARD_RULE_OP;
1171             break;
1172         case ALT_REVERSE_RULE_OP:
1173             operator = REVERSE_RULE_OP;
1174             break;
1175         case ALT_FWDREV_RULE_OP:
1176             operator = FWDREV_RULE_OP;
1177             break;
1178         }
1179 
1180         pos = right.parse(rule, pos, limit, this);
1181 
1182         if (pos < limit) {
1183             if (rule.charAt(--pos) == END_OF_RULE) {
1184                 ++pos;
1185             } else {
1186                 // RuleHalf parser must have terminated at an operator
1187                 syntaxError("Unquoted operator", rule, start);
1188             }
1189         }
1190 
1191         if (operator == VARIABLE_DEF_OP) {
1192             // LHS is the name.  RHS is a single character, either a literal
1193             // or a set (already parsed).  If RHS is longer than one
1194             // character, it is either a multi-character string, or multiple
1195             // sets, or a mixture of chars and sets -- syntax error.
1196 
1197             // We expect to see a single undefined variable (the one being
1198             // defined).
1199             if (undefinedVariableName == null) {
1200                 syntaxError("Missing '$' or duplicate definition", rule, start);
1201             }
1202             if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
1203                 syntaxError("Malformed LHS", rule, start);
1204             }
1205             if (left.anchorStart || left.anchorEnd ||
1206                 right.anchorStart || right.anchorEnd) {
1207                 syntaxError("Malformed variable def", rule, start);
1208             }
1209             // We allow anything on the right, including an empty string.
1210             int n = right.text.length();
1211             char[] value = new char[n];
1212             right.text.getChars(0, n, value, 0);
1213             variableNames.put(undefinedVariableName, value);
1214 
1215             ++variableLimit;
1216             return pos;
1217         }
1218 
1219         // If this is not a variable definition rule, we shouldn't have
1220         // any undefined variable names.
1221         if (undefinedVariableName != null) {
1222             syntaxError("Undefined variable $" + undefinedVariableName,
1223                         rule, start);
1224         }
1225 
1226         // Verify segments
1227         if (segmentStandins.length() > segmentObjects.size()) {
1228             syntaxError("Undefined segment reference", rule, start);
1229         }
1230         for (int i=0; i<segmentStandins.length(); ++i) {
1231             if (segmentStandins.charAt(i) == 0) {
1232                 syntaxError("Internal error", rule, start); // will never happen
1233             }
1234         }
1235         for (int i=0; i<segmentObjects.size(); ++i) {
1236             if (segmentObjects.get(i) == null) {
1237                 syntaxError("Internal error", rule, start); // will never happen
1238             }
1239         }
1240 
1241         // If the direction we want doesn't match the rule
1242         // direction, do nothing.
1243         if (operator != FWDREV_RULE_OP &&
1244             ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1245             return pos;
1246         }
1247 
1248         // Transform the rule into a forward rule by swapping the
1249         // sides if necessary.
1250         if (direction == Transliterator.REVERSE) {
1251             RuleHalf temp = left;
1252             left = right;
1253             right = temp;
1254         }
1255 
1256         // Remove non-applicable elements in forward-reverse
1257         // rules.  Bidirectional rules ignore elements that do not
1258         // apply.
1259         if (operator == FWDREV_RULE_OP) {
1260             right.removeContext();
1261             left.cursor = -1;
1262             left.cursorOffset = 0;
1263         }
1264 
1265         // Normalize context
1266         if (left.ante < 0) {
1267             left.ante = 0;
1268         }
1269         if (left.post < 0) {
1270             left.post = left.text.length();
1271         }
1272 
1273         // Context is only allowed on the input side.  Cursors are only
1274         // allowed on the output side.  Segment delimiters can only appear
1275         // on the left, and references on the right.  Cursor offset
1276         // cannot appear without an explicit cursor.  Cursor offset
1277         // cannot place the cursor outside the limits of the context.
1278         // Anchors are only allowed on the input side.
1279         if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
1280             (right.cursorOffset != 0 && right.cursor < 0) ||
1281             // - The following two checks were used to ensure that the
1282             // - the cursor offset stayed within the ante- or postcontext.
1283             // - However, with the addition of quantifiers, we have to
1284             // - allow arbitrary cursor offsets and do runtime checking.
1285             //(right.cursorOffset > (left.text.length() - left.post)) ||
1286             //(-right.cursorOffset > left.ante) ||
1287             right.anchorStart || right.anchorEnd ||
1288             !left.isValidInput(this) || !right.isValidOutput(this) ||
1289             left.ante > left.post) {
1290             syntaxError("Malformed rule", rule, start);
1291         }
1292 
1293         // Flatten segment objects vector to an array
1294         UnicodeMatcher[] segmentsArray = null;
1295         if (segmentObjects.size() > 0) {
1296             segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1297             segmentObjects.toArray(segmentsArray);
1298         }
1299 
1300         curData.ruleSet.addRule(new TransliterationRule(
1301                                      left.text, left.ante, left.post,
1302                                      right.text, right.cursor, right.cursorOffset,
1303                                      segmentsArray,
1304                                      left.anchorStart, left.anchorEnd,
1305                                      curData));
1306 
1307         return pos;
1308     }
1309 
1310     /**
1311      * Set the variable range to [start, end] (inclusive).
1312      */
setVariableRange(int start, int end)1313     private void setVariableRange(int start, int end) {
1314         if (start > end || start < 0 || end > 0xFFFF) {
1315             throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
1316         }
1317 
1318         curData.variablesBase = (char) start; // first private use
1319 
1320         if (dataVector.size() == 0) {
1321             variableNext = (char) start;
1322             variableLimit = (char) (end + 1);
1323         }
1324     }
1325 
1326     /**
1327      * Assert that the given character is NOT within the variable range.
1328      * If it is, signal an error.  This is neccesary to ensure that the
1329      * variable range does not overlap characters used in a rule.
1330      */
checkVariableRange(int ch, String rule, int start)1331     private void checkVariableRange(int ch, String rule, int start) {
1332         if (ch >= curData.variablesBase && ch < variableLimit) {
1333             syntaxError("Variable range character in rule", rule, start);
1334         }
1335     }
1336 
1337     // (The following method is part of an unimplemented feature.
1338     // Remove this clover pragma after the feature is implemented.
1339     // 2003-06-11 ICU 2.6 Alan)
1340     ///CLOVER:OFF
1341     /**
1342      * Set the maximum backup to 'backup', in response to a pragma
1343      * statement.
1344      */
pragmaMaximumBackup(int backup)1345     private void pragmaMaximumBackup(int backup) {
1346         //TODO Finish
1347         throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
1348     }
1349     ///CLOVER:ON
1350 
1351     // (The following method is part of an unimplemented feature.
1352     // Remove this clover pragma after the feature is implemented.
1353     // 2003-06-11 ICU 2.6 Alan)
1354     ///CLOVER:OFF
1355     /**
1356      * Begin normalizing all rules using the given mode, in response
1357      * to a pragma statement.
1358      */
pragmaNormalizeRules(Normalizer.Mode mode)1359     private void pragmaNormalizeRules(Normalizer.Mode mode) {
1360         //TODO Finish
1361         throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
1362     }
1363     ///CLOVER:ON
1364 
1365     /**
1366      * Return true if the given rule looks like a pragma.
1367      * @param pos offset to the first non-whitespace character
1368      * of the rule.
1369      * @param limit pointer past the last character of the rule.
1370      */
resemblesPragma(String rule, int pos, int limit)1371     static boolean resemblesPragma(String rule, int pos, int limit) {
1372         // Must start with /use\s/i
1373         return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1374     }
1375 
1376     /**
1377      * Parse a pragma.  This method assumes resemblesPragma() has
1378      * already returned true.
1379      * @param pos offset to the first non-whitespace character
1380      * of the rule.
1381      * @param limit pointer past the last character of the rule.
1382      * @return the position index after the final ';' of the pragma,
1383      * or -1 on failure.
1384      */
parsePragma(String rule, int pos, int limit)1385     private int parsePragma(String rule, int pos, int limit) {
1386         int[] array = new int[2];
1387 
1388         // resemblesPragma() has already returned true, so we
1389         // know that pos points to /use\s/i; we can skip 4 characters
1390         // immediately
1391         pos += 4;
1392 
1393         // Here are the pragmas we recognize:
1394         // use variable range 0xE000 0xEFFF;
1395         // use maximum backup 16;
1396         // use nfd rules;
1397         int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
1398         if (p >= 0) {
1399             setVariableRange(array[0], array[1]);
1400             return p;
1401         }
1402 
1403         p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
1404         if (p >= 0) {
1405             pragmaMaximumBackup(array[0]);
1406             return p;
1407         }
1408 
1409         p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
1410         if (p >= 0) {
1411             pragmaNormalizeRules(Normalizer.NFD);
1412             return p;
1413         }
1414 
1415         p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
1416         if (p >= 0) {
1417             pragmaNormalizeRules(Normalizer.NFC);
1418             return p;
1419         }
1420 
1421         // Syntax error: unable to parse pragma
1422         return -1;
1423     }
1424 
1425     /**
1426      * Throw an exception indicating a syntax error.  Search the rule string
1427      * for the probable end of the rule.  Of course, if the error is that
1428      * the end of rule marker is missing, then the rule end will not be found.
1429      * In any case the rule start will be correctly reported.
1430      * @param msg error description
1431      * @param rule pattern string
1432      * @param start position of first character of current rule
1433      */
syntaxError(String msg, String rule, int start)1434     static final void syntaxError(String msg, String rule, int start) {
1435         int end = ruleEnd(rule, start, rule.length());
1436         throw new IllegalIcuArgumentException(msg + " in \"" +
1437                                            Utility.escape(rule.substring(start, end)) + '"');
1438     }
1439 
ruleEnd(String rule, int start, int limit)1440     static final int ruleEnd(String rule, int start, int limit) {
1441         int end = Utility.quotedIndexOf(rule, start, limit, ";");
1442         if (end < 0) {
1443             end = limit;
1444         }
1445         return end;
1446     }
1447 
1448     /**
1449      * Parse a UnicodeSet out, store it, and return the stand-in character
1450      * used to represent it.
1451      */
parseSet(String rule, ParsePosition pos)1452     private final char parseSet(String rule, ParsePosition pos) {
1453         UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1454         if (variableNext >= variableLimit) {
1455             throw new RuntimeException("Private use variables exhausted");
1456         }
1457         set.compact();
1458         return generateStandInFor(set);
1459     }
1460 
1461     /**
1462      * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1463      * Store the object.
1464      */
generateStandInFor(Object obj)1465     char generateStandInFor(Object obj) {
1466         // assert(obj != null);
1467 
1468         // Look up previous stand-in, if any.  This is a short list
1469         // (typical n is 0, 1, or 2); linear search is optimal.
1470         for (int i=0; i<variablesVector.size(); ++i) {
1471             if (variablesVector.get(i) == obj) { // [sic] pointer comparison
1472                 return (char) (curData.variablesBase + i);
1473             }
1474         }
1475 
1476         if (variableNext >= variableLimit) {
1477             throw new RuntimeException("Variable range exhausted");
1478         }
1479         variablesVector.add(obj);
1480         return variableNext++;
1481     }
1482 
1483     /**
1484      * Return the standin for segment seg (1-based).
1485      */
getSegmentStandin(int seg)1486     public char getSegmentStandin(int seg) {
1487         if (segmentStandins.length() < seg) {
1488             segmentStandins.setLength(seg);
1489         }
1490         char c = segmentStandins.charAt(seg-1);
1491         if (c == 0) {
1492             if (variableNext >= variableLimit) {
1493                 throw new RuntimeException("Variable range exhausted");
1494             }
1495             c = variableNext++;
1496             // Set a placeholder in the master variables vector that will be
1497             // filled in later by setSegmentObject().  We know that we will get
1498             // called first because setSegmentObject() will call us.
1499             variablesVector.add(null);
1500             segmentStandins.setCharAt(seg-1, c);
1501         }
1502         return c;
1503     }
1504 
1505     /**
1506      * Set the object for segment seg (1-based).
1507      */
setSegmentObject(int seg, StringMatcher obj)1508     public void setSegmentObject(int seg, StringMatcher obj) {
1509         // Since we call parseSection() recursively, nested
1510         // segments will result in segment i+1 getting parsed
1511         // and stored before segment i; be careful with the
1512         // vector handling here.
1513         while (segmentObjects.size() < seg) {
1514             segmentObjects.add(null);
1515         }
1516         int index = getSegmentStandin(seg) - curData.variablesBase;
1517         if (segmentObjects.get(seg-1) != null ||
1518             variablesVector.get(index) != null) {
1519             throw new RuntimeException(); // should never happen
1520         }
1521         segmentObjects.set(seg-1, obj);
1522         variablesVector.set(index, obj);
1523     }
1524 
1525     /**
1526      * Return the stand-in for the dot set.  It is allocated the first
1527      * time and reused thereafter.
1528      */
getDotStandIn()1529     char getDotStandIn() {
1530         if (dotStandIn == -1) {
1531             dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1532         }
1533         return (char) dotStandIn;
1534     }
1535 
1536     /**
1537      * Append the value of the given variable name to the given
1538      * StringBuffer.
1539      * @exception IllegalIcuArgumentException if the name is unknown.
1540      */
appendVariableDef(String name, StringBuffer buf)1541     private void appendVariableDef(String name, StringBuffer buf) {
1542         char[] ch = variableNames.get(name);
1543         if (ch == null) {
1544             // We allow one undefined variable so that variable definition
1545             // statements work.  For the first undefined variable we return
1546             // the special placeholder variableLimit-1, and save the variable
1547             // name.
1548             if (undefinedVariableName == null) {
1549                 undefinedVariableName = name;
1550                 if (variableNext >= variableLimit) {
1551                     throw new RuntimeException("Private use variables exhausted");
1552                 }
1553                 buf.append(--variableLimit);
1554             } else {
1555                 throw new IllegalIcuArgumentException("Undefined variable $"
1556                                                    + name);
1557             }
1558         } else {
1559             buf.append(ch);
1560         }
1561     }
1562 }
1563 
1564 //eof
1565