• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 **********************************************************************
6 *   Copyright (c) 2001-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 **********************************************************************
9 */
10 package ohos.global.icu.text;
11 
12 import java.text.ParsePosition;
13 import java.util.ArrayList;
14 import java.util.HashMap;
15 import java.util.List;
16 import java.util.Map;
17 
18 import ohos.global.icu.impl.IllegalIcuArgumentException;
19 import ohos.global.icu.impl.PatternProps;
20 import ohos.global.icu.impl.Utility;
21 import ohos.global.icu.lang.UCharacter;
22 import ohos.global.icu.text.RuleBasedTransliterator.Data;
23 
24 class TransliteratorParser {
25 
26     //----------------------------------------------------------------------
27     // Data members
28     //----------------------------------------------------------------------
29 
30     /**
31      * PUBLIC data member.
32      * A List of RuleBasedTransliterator.Data objects, one for each discrete group
33      * of rules in the rule set
34      */
35     public List<Data> dataVector;
36 
37     /**
38      * PUBLIC data member.
39      * A List of Strings containing all of the ID blocks in the rule set
40      */
41     public List<String> idBlockVector;
42 
43     /**
44      * The current data object for which we are parsing rules
45      */
46     private Data curData;
47 
48     /**
49      * PUBLIC data member containing the parsed compound filter, if any.
50      */
51     public UnicodeSet compoundFilter;
52 
53 
54     private int direction;
55 
56     /**
57      * Temporary symbol table used during parsing.
58      */
59     private ParseData parseData;
60 
61     /**
62      * Temporary vector of set variables.  When parsing is complete, this
63      * is copied into the array data.variables.  As with data.variables,
64      * element 0 corresponds to character data.variablesBase.
65      */
66     private List<Object> variablesVector;
67 
68     /**
69      * Temporary table of variable names.  When parsing is complete, this is
70      * copied into data.variableNames.
71      */
72     private Map<String, char[]> variableNames;
73 
74     /**
75      * String of standins for segments.  Used during the parsing of a single
76      * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
77      * to StringMatcher object segmentObjects.elementAt(0), etc.
78      */
79     private StringBuffer segmentStandins;
80 
81     /**
82      * Vector of StringMatcher objects for segments.  Used during the
83      * parsing of a single rule.
84      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
85      * to StringMatcher object segmentObjects.elementAt(0), etc.
86      */
87     private List<StringMatcher> segmentObjects;
88 
89     /**
90      * The next available stand-in for variables.  This starts at some point in
91      * the private use area (discovered dynamically) and increments up toward
92      * <code>variableLimit</code>.  At any point during parsing, available
93      * variables are <code>variableNext..variableLimit-1</code>.
94      */
95     private char variableNext;
96 
97     /**
98      * The last available stand-in for variables.  This is discovered
99      * dynamically.  At any point during parsing, available variables are
100      * <code>variableNext..variableLimit-1</code>.  During variable definition
101      * we use the special value variableLimit-1 as a placeholder.
102      */
103     private char variableLimit;
104 
105     /**
106      * When we encounter an undefined variable, we do not immediately signal
107      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
108      * Instead, we save the name of the undefined variable, and substitute
109      * in the placeholder char variableLimit - 1, and decrement
110      * variableLimit.
111      */
112     private String undefinedVariableName;
113 
114     /**
115      * The stand-in character for the 'dot' set, represented by '.' in
116      * patterns.  This is allocated the first time it is needed, and
117      * reused thereafter.
118      */
119     private int dotStandIn = -1;
120 
121     //----------------------------------------------------------------------
122     // Constants
123     //----------------------------------------------------------------------
124 
125     // Indicator for ID blocks
126     private static final String ID_TOKEN = "::";
127     private static final int ID_TOKEN_LEN = 2;
128 
129 /*
130 (reserved for future expansion)
131     // markers for beginning and end of rule groups
132     private static final String BEGIN_TOKEN = "BEGIN";
133     private static final String END_TOKEN = "END";
134 */
135 
136     // Operators
137     private static final char VARIABLE_DEF_OP   = '=';
138     private static final char FORWARD_RULE_OP   = '>';
139     private static final char REVERSE_RULE_OP   = '<';
140     private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op
141 
142     private static final String OPERATORS = "=><\u2190\u2192\u2194";
143     private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
144 
145     // Other special characters
146     private static final char QUOTE               = '\'';
147     private static final char ESCAPE              = '\\';
148     private static final char END_OF_RULE         = ';';
149     private static final char RULE_COMMENT_CHAR   = '#';
150 
151     private static final char CONTEXT_ANTE        = '{'; // ante{key
152     private static final char CONTEXT_POST        = '}'; // key}post
153     private static final char CURSOR_POS          = '|';
154     private static final char CURSOR_OFFSET       = '@';
155     private static final char ANCHOR_START        = '^';
156 
157     private static final char KLEENE_STAR         = '*';
158     private static final char ONE_OR_MORE         = '+';
159     private static final char ZERO_OR_ONE         = '?';
160 
161     private static final char DOT                 = '.';
162     private static final String DOT_SET           = "[^[:Zp:][:Zl:]\\r\\n$]";
163 
164     // By definition, the ANCHOR_END special character is a
165     // trailing SymbolTable.SYMBOL_REF character.
166     // private static final char ANCHOR_END       = '$';
167 
168     // Segments of the input string are delimited by "(" and ")".  In the
169     // output string these segments are referenced as "$1", "$2", etc.
170     private static final char SEGMENT_OPEN        = '(';
171     private static final char SEGMENT_CLOSE       = ')';
172 
173     // A function is denoted &Source-Target/Variant(text)
174     private static final char FUNCTION            = '&';
175 
176     // Aliases for some of the syntax characters. These are provided so
177     // transliteration rules can be expressed in XML without clashing with
178     // XML syntax characters '<', '>', and '&'.
179     private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
180     private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
181     private static final char ALT_FWDREV_RULE_OP  = '\u2194'; // Left Right Arrow
182     private static final char ALT_FUNCTION        = '\u2206'; // Increment (~Greek Capital Delta)
183 
184     // Special characters disallowed at the top level
185     private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
186 
187     // Special characters disallowed within a segment
188     private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
189 
190     // Special characters disallowed within a function argument
191     private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
192 
193     //----------------------------------------------------------------------
194     // class ParseData
195     //----------------------------------------------------------------------
196 
197     /**
198      * This class implements the SymbolTable interface.  It is used
199      * during parsing to give UnicodeSet access to variables that
200      * have been defined so far.  Note that it uses variablesVector,
201      * _not_ data.variables.
202      */
203     private class ParseData implements SymbolTable {
204 
205         /**
206          * Implement SymbolTable API.
207          */
208         @Override
lookup(String name)209         public char[] lookup(String name) {
210             return variableNames.get(name);
211         }
212 
213         /**
214          * Implement SymbolTable API.
215          */
216         @Override
lookupMatcher(int ch)217         public UnicodeMatcher lookupMatcher(int ch) {
218             // Note that we cannot use data.lookup() because the
219             // set array has not been constructed yet.
220             int i = ch - curData.variablesBase;
221             if (i >= 0 && i < variablesVector.size()) {
222                 return (UnicodeMatcher) variablesVector.get(i);
223             }
224             return null;
225         }
226 
227         /**
228          * Implement SymbolTable API.  Parse out a symbol reference
229          * name.
230          */
231         @Override
parseReference(String text, ParsePosition pos, int limit)232         public String parseReference(String text, ParsePosition pos, int limit) {
233             int start = pos.getIndex();
234             int i = start;
235             while (i < limit) {
236                 char c = text.charAt(i);
237                 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
238                     !UCharacter.isUnicodeIdentifierPart(c)) {
239                     break;
240                 }
241                 ++i;
242             }
243             if (i == start) { // No valid name chars
244                 return null;
245             }
246             pos.setIndex(i);
247             return text.substring(start, i);
248         }
249 
250         /**
251          * Return true if the given character is a matcher standin or a plain
252          * character (non standin).
253          */
isMatcher(int ch)254         public boolean isMatcher(int ch) {
255             // Note that we cannot use data.lookup() because the
256             // set array has not been constructed yet.
257             int i = ch - curData.variablesBase;
258             if (i >= 0 && i < variablesVector.size()) {
259                 return variablesVector.get(i) instanceof UnicodeMatcher;
260             }
261             return true;
262         }
263 
264         /**
265          * Return true if the given character is a replacer standin or a plain
266          * character (non standin).
267          */
isReplacer(int ch)268         public boolean isReplacer(int ch) {
269             // Note that we cannot use data.lookup() because the
270             // set array has not been constructed yet.
271             int i = ch - curData.variablesBase;
272             if (i >= 0 && i < variablesVector.size()) {
273                 return variablesVector.get(i) instanceof UnicodeReplacer;
274             }
275             return true;
276         }
277     }
278 
279     //----------------------------------------------------------------------
280     // classes RuleBody, RuleArray, and RuleReader
281     //----------------------------------------------------------------------
282 
283     /**
284      * A private abstract class representing the interface to rule
285      * source code that is broken up into lines.  Handles the
286      * folding of lines terminated by a backslash.  This folding
287      * is limited; it does not account for comments, quotes, or
288      * escapes, so its use to be limited.
289      */
290     private static abstract class RuleBody {
291 
292         /**
293          * Retrieve the next line of the source, or return null if
294          * none.  Folds lines terminated by a backslash into the
295          * next line, without regard for comments, quotes, or
296          * escapes.
297          */
nextLine()298         String nextLine() {
299             String s = handleNextLine();
300             if (s != null &&
301                 s.length() > 0 &&
302                 s.charAt(s.length() - 1) == '\\') {
303                 StringBuilder b = new StringBuilder(s);
304                 do {
305                     b.deleteCharAt(b.length()-1);
306                     s = handleNextLine();
307                     if (s == null) {
308                         break;
309                     }
310                     b.append(s);
311                 } while (s.length() > 0 &&
312                          s.charAt(s.length() - 1) == '\\');
313                 s = b.toString();
314             }
315             return s;
316         }
317 
318         /**
319          * Reset to the first line of the source.
320          */
reset()321         abstract void reset();
322 
323         /**
324          * Subclass method to return the next line of the source.
325          */
handleNextLine()326         abstract String handleNextLine();
327     }
328 
329     /**
330      * RuleBody subclass for a String[] array.
331      */
332     private static class RuleArray extends RuleBody {
333         String[] array;
334         int i;
RuleArray(String[] array)335         public RuleArray(String[] array) { this.array = array; i = 0; }
336         @Override
handleNextLine()337         public String handleNextLine() {
338             return (i < array.length) ? array[i++] : null;
339         }
340         @Override
reset()341         public void reset() {
342             i = 0;
343         }
344     }
345 
346     /*
347      * RuleBody subclass for a ResourceReader.
348      */
349 /*    private static class RuleReader extends RuleBody {
350         ResourceReader reader;
351         public RuleReader(ResourceReader reader) { this.reader = reader; }
352         public String handleNextLine() {
353             try {
354                 return reader.readLine();
355             } catch (java.io.IOException e) {}
356             return null;
357         }
358         public void reset() {
359             reader.reset();
360         }
361     }*/
362 
363     //----------------------------------------------------------------------
364     // class RuleHalf
365     //----------------------------------------------------------------------
366 
367     /**
368      * A class representing one side of a rule.  This class knows how to
369      * parse half of a rule.  It is tightly coupled to the method
370      * TransliteratorParser.parseRule().
371      */
372     private static class RuleHalf {
373 
374         public String text;
375 
376         public int cursor = -1; // position of cursor in text
377         public int ante = -1;   // position of ante context marker '{' in text
378         public int post = -1;   // position of post context marker '}' in text
379 
380         // Record the offset to the cursor either to the left or to the
381         // right of the key.  This is indicated by characters on the output
382         // side that allow the cursor to be positioned arbitrarily within
383         // the matching text.  For example, abc{def} > | @@@ xyz; changes
384         // def to xyz and moves the cursor to before abc.  Offset characters
385         // must be at the start or end, and they cannot move the cursor past
386         // the ante- or postcontext text.  Placeholders are only valid in
387         // output text.  The length of the ante and post context is
388         // determined at runtime, because of supplementals and quantifiers.
389         public int cursorOffset = 0; // only nonzero on output side
390 
391         // Position of first CURSOR_OFFSET on _right_.  This will be -1
392         // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
393         private int cursorOffsetPos = 0;
394 
395         public boolean anchorStart = false;
396         public boolean anchorEnd   = false;
397 
398         /**
399          * The segment number from 1..n of the next '(' we see
400          * during parsing; 1-based.
401          */
402         private int nextSegmentNumber = 1;
403 
404         /**
405          * Parse one side of a rule, stopping at either the limit,
406          * the END_OF_RULE character, or an operator.
407          * @return the index after the terminating character, or
408          * if limit was reached, limit
409          */
parse(String rule, int pos, int limit, TransliteratorParser parser)410         public int parse(String rule, int pos, int limit,
411                          TransliteratorParser parser) {
412             int start = pos;
413             StringBuffer buf = new StringBuffer();
414             pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
415             text = buf.toString();
416 
417             if (cursorOffset > 0 && cursor != cursorOffsetPos) {
418                 syntaxError("Misplaced " + CURSOR_POS, rule, start);
419             }
420 
421             return pos;
422         }
423 
424         /**
425          * Parse a section of one side of a rule, stopping at either
426          * the limit, the END_OF_RULE character, an operator, or a
427          * segment close character.  This method parses both a
428          * top-level rule half and a segment within such a rule half.
429          * It calls itself recursively to parse segments and nested
430          * segments.
431          * @param buf buffer into which to accumulate the rule pattern
432          * characters, either literal characters from the rule or
433          * standins for UnicodeMatcher objects including segments.
434          * @param illegal the set of special characters that is illegal during
435          * this parse.
436          * @param isSegment if true, then we've already seen a '(' and
437          * pos on entry points right after it.  Accumulate everything
438          * up to the closing ')', put it in a segment matcher object,
439          * generate a standin for it, and add the standin to buf.  As
440          * a side effect, update the segments vector with a reference
441          * to the segment matcher.  This works recursively for nested
442          * segments.  If isSegment is false, just accumulate
443          * characters into buf.
444          * @return the index after the terminating character, or
445          * if limit was reached, limit
446          */
parseSection(String rule, int pos, int limit, TransliteratorParser parser, StringBuffer buf, UnicodeSet illegal, boolean isSegment)447         private int parseSection(String rule, int pos, int limit,
448                                  TransliteratorParser parser,
449                                  StringBuffer buf,
450                                  UnicodeSet illegal,
451                                  boolean isSegment) {
452             int start = pos;
453             ParsePosition pp = null;
454             int quoteStart = -1; // Most recent 'single quoted string'
455             int quoteLimit = -1;
456             int varStart = -1; // Most recent $variableReference
457             int varLimit = -1;
458             int[] iref = new int[1];
459             int bufStart = buf.length();
460 
461         main:
462             while (pos < limit) {
463                 // Since all syntax characters are in the BMP, fetching
464                 // 16-bit code units suffices here.
465                 char c = rule.charAt(pos++);
466                 if (PatternProps.isWhiteSpace(c)) {
467                     continue;
468                 }
469                 // HALF_ENDERS is all chars that end a rule half: "<>=;"
470                 if (HALF_ENDERS.indexOf(c) >= 0) {
471                     ///CLOVER:OFF
472                     // isSegment is always false
473                     if (isSegment) {
474                         syntaxError("Unclosed segment", rule, start);
475                     }
476                     ///CLOVER:ON
477                     break main;
478                 }
479                 if (anchorEnd) {
480                     // Text after a presumed end anchor is a syntax err
481                     syntaxError("Malformed variable reference", rule, start);
482                 }
483                 if (UnicodeSet.resemblesPattern(rule, pos-1)) {
484                     if (pp == null) {
485                         pp = new ParsePosition(0);
486                     }
487                     pp.setIndex(pos-1); // Backup to opening '['
488                     buf.append(parser.parseSet(rule, pp));
489                     pos = pp.getIndex();
490                     continue;
491                 }
492                 // Handle escapes
493                 if (c == ESCAPE) {
494                     if (pos == limit) {
495                         syntaxError("Trailing backslash", rule, start);
496                     }
497                     iref[0] = pos;
498                     int escaped = Utility.unescapeAt(rule, iref);
499                     pos = iref[0];
500                     if (escaped == -1) {
501                         syntaxError("Malformed escape", rule, start);
502                     }
503                     parser.checkVariableRange(escaped, rule, start);
504                     UTF16.append(buf, escaped);
505                     continue;
506                 }
507                 // Handle quoted matter
508                 if (c == QUOTE) {
509                     int iq = rule.indexOf(QUOTE, pos);
510                     if (iq == pos) {
511                         buf.append(c); // Parse [''] outside quotes as [']
512                         ++pos;
513                     } else {
514                         /* This loop picks up a run of quoted text of the
515                          * form 'aaaa' each time through.  If this run
516                          * hasn't really ended ('aaaa''bbbb') then it keeps
517                          * looping, each time adding on a new run.  When it
518                          * reaches the final quote it breaks.
519                          */
520                         quoteStart = buf.length();
521                         for (;;) {
522                             if (iq < 0) {
523                                 syntaxError("Unterminated quote", rule, start);
524                             }
525                             buf.append(rule.substring(pos, iq));
526                             pos = iq+1;
527                             if (pos < limit && rule.charAt(pos) == QUOTE) {
528                             // Parse [''] inside quotes as [']
529                                 iq = rule.indexOf(QUOTE, pos+1);
530                             // Continue looping
531                             } else {
532                                 break;
533                             }
534                         }
535                         quoteLimit = buf.length();
536 
537                         for (iq=quoteStart; iq<quoteLimit; ++iq) {
538                             parser.checkVariableRange(buf.charAt(iq), rule, start);
539                         }
540                     }
541                     continue;
542                 }
543 
544                 parser.checkVariableRange(c, rule, start);
545 
546                 if (illegal.contains(c)) {
547                     syntaxError("Illegal character '" + c + '\'', rule, start);
548                 }
549 
550                 switch (c) {
551 
552                 //------------------------------------------------------
553                 // Elements allowed within and out of segments
554                 //------------------------------------------------------
555                 case ANCHOR_START:
556                     if (buf.length() == 0 && !anchorStart) {
557                         anchorStart = true;
558                     } else {
559                         syntaxError("Misplaced anchor start",
560                                     rule, start);
561                     }
562                     break;
563                 case SEGMENT_OPEN:
564                     {
565                         // bufSegStart is the offset in buf to the first
566                         // character of the segment we are parsing.
567                         int bufSegStart = buf.length();
568 
569                         // Record segment number now, since nextSegmentNumber
570                         // will be incremented during the call to parseSection
571                         // if there are nested segments.
572                         int segmentNumber = nextSegmentNumber++; // 1-based
573 
574                         // Parse the segment
575                         pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
576 
577                         // After parsing a segment, the relevant characters are
578                         // in buf, starting at offset bufSegStart.  Extract them
579                         // into a string matcher, and replace them with a
580                         // standin for that matcher.
581                         StringMatcher m =
582                             new StringMatcher(buf.substring(bufSegStart),
583                                               segmentNumber, parser.curData);
584 
585                         // Record and associate object and segment number
586                         parser.setSegmentObject(segmentNumber, m);
587                         buf.setLength(bufSegStart);
588                         buf.append(parser.getSegmentStandin(segmentNumber));
589                     }
590                     break;
591                 case FUNCTION:
592                 case ALT_FUNCTION:
593                     {
594                         iref[0] = pos;
595                         TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
596                         // The next character MUST be a segment open
597                         if (single == null ||
598                             !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
599                             syntaxError("Invalid function", rule, start);
600                         }
601 
602                         Transliterator t = single.getInstance();
603                         if (t == null) {
604                             syntaxError("Invalid function ID", rule, start);
605                         }
606 
607                         // bufSegStart is the offset in buf to the first
608                         // character of the segment we are parsing.
609                         int bufSegStart = buf.length();
610 
611                         // Parse the segment
612                         pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
613 
614                         // After parsing a segment, the relevant characters are
615                         // in buf, starting at offset bufSegStart.
616                         FunctionReplacer r =
617                             new FunctionReplacer(t,
618                                 new StringReplacer(buf.substring(bufSegStart), parser.curData));
619 
620                         // Replace the buffer contents with a stand-in
621                         buf.setLength(bufSegStart);
622                         buf.append(parser.generateStandInFor(r));
623                     }
624                     break;
625                 case SymbolTable.SYMBOL_REF:
626                     // Handle variable references and segment references "$1" .. "$9"
627                     {
628                         // A variable reference must be followed immediately
629                         // by a Unicode identifier start and zero or more
630                         // Unicode identifier part characters, or by a digit
631                         // 1..9 if it is a segment reference.
632                         if (pos == limit) {
633                             // A variable ref character at the end acts as
634                             // an anchor to the context limit, as in perl.
635                             anchorEnd = true;
636                             break;
637                         }
638                         // Parse "$1" "$2" .. "$9" .. (no upper limit)
639                         c = rule.charAt(pos);
640                         int r = UCharacter.digit(c, 10);
641                         if (r >= 1 && r <= 9) {
642                             iref[0] = pos;
643                             r = Utility.parseNumber(rule, iref, 10);
644                             if (r < 0) {
645                                 syntaxError("Undefined segment reference",
646                                             rule, start);
647                             }
648                             pos = iref[0];
649                             buf.append(parser.getSegmentStandin(r));
650                         } else {
651                             if (pp == null) { // Lazy create
652                                 pp = new ParsePosition(0);
653                             }
654                             pp.setIndex(pos);
655                             String name = parser.parseData.
656                                 parseReference(rule, pp, limit);
657                             if (name == null) {
658                                 // This means the '$' was not followed by a
659                                 // valid name.  Try to interpret it as an
660                                 // end anchor then.  If this also doesn't work
661                                 // (if we see a following character) then signal
662                                 // an error.
663                                 anchorEnd = true;
664                                 break;
665                             }
666                             pos = pp.getIndex();
667                             // If this is a variable definition statement,
668                             // then the LHS variable will be undefined.  In
669                             // that case appendVariableDef() will append the
670                             // special placeholder char variableLimit-1.
671                             varStart = buf.length();
672                             parser.appendVariableDef(name, buf);
673                             varLimit = buf.length();
674                         }
675                     }
676                     break;
677                 case DOT:
678                     buf.append(parser.getDotStandIn());
679                     break;
680                 case KLEENE_STAR:
681                 case ONE_OR_MORE:
682                 case ZERO_OR_ONE:
683                     // Quantifiers.  We handle single characters, quoted strings,
684                     // variable references, and segments.
685                     //  a+      matches  aaa
686                     //  'foo'+  matches  foofoofoo
687                     //  $v+     matches  xyxyxy if $v == xy
688                     //  (seg)+  matches  segsegseg
689                     {
690                         ///CLOVER:OFF
691                         // isSegment is always false
692                         if (isSegment && buf.length() == bufStart) {
693                             // The */+ immediately follows '('
694                             syntaxError("Misplaced quantifier", rule, start);
695                             break;
696                         }
697                         ///CLOVER:ON
698 
699                         int qstart, qlimit;
700                         // The */+ follows an isolated character or quote
701                         // or variable reference
702                         if (buf.length() == quoteLimit) {
703                             // The */+ follows a 'quoted string'
704                             qstart = quoteStart;
705                             qlimit = quoteLimit;
706                         } else if (buf.length() == varLimit) {
707                             // The */+ follows a $variableReference
708                             qstart = varStart;
709                             qlimit = varLimit;
710                         } else {
711                             // The */+ follows a single character, possibly
712                             // a segment standin
713                             qstart = buf.length() - 1;
714                             qlimit = qstart + 1;
715                         }
716 
717                         UnicodeMatcher m;
718                         try {
719                             m = new StringMatcher(buf.toString(), qstart, qlimit,
720                                               0, parser.curData);
721                         } catch (RuntimeException e) {
722                             final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
723                             final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
724                             throw new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
725                                     + postContext).initCause(e);
726                         }
727                         int min = 0;
728                         int max = Quantifier.MAX;
729                         switch (c) {
730                         case ONE_OR_MORE:
731                             min = 1;
732                             break;
733                         case ZERO_OR_ONE:
734                             min = 0;
735                             max = 1;
736                             break;
737                             // case KLEENE_STAR:
738                             //    do nothing -- min, max already set
739                         }
740                         m = new Quantifier(m, min, max);
741                         buf.setLength(qstart);
742                         buf.append(parser.generateStandInFor(m));
743                     }
744                     break;
745 
746                 //------------------------------------------------------
747                 // Elements allowed ONLY WITHIN segments
748                 //------------------------------------------------------
749                 case SEGMENT_CLOSE:
750                     // assert(isSegment);
751                     // We're done parsing a segment.
752                     break main;
753 
754                 //------------------------------------------------------
755                 // Elements allowed ONLY OUTSIDE segments
756                 //------------------------------------------------------
757                 case CONTEXT_ANTE:
758                     if (ante >= 0) {
759                         syntaxError("Multiple ante contexts", rule, start);
760                     }
761                     ante = buf.length();
762                     break;
763                 case CONTEXT_POST:
764                     if (post >= 0) {
765                         syntaxError("Multiple post contexts", rule, start);
766                     }
767                     post = buf.length();
768                     break;
769                 case CURSOR_POS:
770                     if (cursor >= 0) {
771                         syntaxError("Multiple cursors", rule, start);
772                     }
773                     cursor = buf.length();
774                     break;
775                 case CURSOR_OFFSET:
776                     if (cursorOffset < 0) {
777                         if (buf.length() > 0) {
778                             syntaxError("Misplaced " + c, rule, start);
779                         }
780                         --cursorOffset;
781                     } else if (cursorOffset > 0) {
782                         if (buf.length() != cursorOffsetPos || cursor >= 0) {
783                             syntaxError("Misplaced " + c, rule, start);
784                         }
785                         ++cursorOffset;
786                     } else {
787                         if (cursor == 0 && buf.length() == 0) {
788                             cursorOffset = -1;
789                         } else if (cursor < 0) {
790                             cursorOffsetPos = buf.length();
791                             cursorOffset = 1;
792                         } else {
793                             syntaxError("Misplaced " + c, rule, start);
794                         }
795                     }
796                     break;
797 
798                 //------------------------------------------------------
799                 // Non-special characters
800                 //------------------------------------------------------
801                 default:
802                     // Disallow unquoted characters other than [0-9A-Za-z]
803                     // in the printable ASCII range.  These characters are
804                     // reserved for possible future use.
805                     if (c >= 0x0021 && c <= 0x007E &&
806                         !((c >= '0' && c <= '9') ||
807                           (c >= 'A' && c <= 'Z') ||
808                           (c >= 'a' && c <= 'z'))) {
809                         syntaxError("Unquoted " + c, rule, start);
810                     }
811                     buf.append(c);
812                     break;
813                 }
814             }
815             return pos;
816         }
817 
818         /**
819          * Remove context.
820          */
removeContext()821         void removeContext() {
822             text = text.substring(ante < 0 ? 0 : ante,
823                                   post < 0 ? text.length() : post);
824             ante = post = -1;
825             anchorStart = anchorEnd = false;
826         }
827 
828         /**
829          * Return true if this half looks like valid output, that is, does not
830          * contain quantifiers or other special input-only elements.
831          */
isValidOutput(TransliteratorParser parser)832         public boolean isValidOutput(TransliteratorParser parser) {
833             for (int i=0; i<text.length(); ) {
834                 int c = UTF16.charAt(text, i);
835                 i += UTF16.getCharCount(c);
836                 if (!parser.parseData.isReplacer(c)) {
837                     return false;
838                 }
839             }
840             return true;
841         }
842 
843         /**
844          * Return true if this half looks like valid input, that is, does not
845          * contain functions or other special output-only elements.
846          */
isValidInput(TransliteratorParser parser)847         public boolean isValidInput(TransliteratorParser parser) {
848             for (int i=0; i<text.length(); ) {
849                 int c = UTF16.charAt(text, i);
850                 i += UTF16.getCharCount(c);
851                 if (!parser.parseData.isMatcher(c)) {
852                     return false;
853                 }
854             }
855             return true;
856         }
857     }
858 
859     //----------------------------------------------------------------------
860     // PUBLIC methods
861     //----------------------------------------------------------------------
862 
863     /**
864      * Constructor.
865      */
TransliteratorParser()866     public TransliteratorParser() {
867     }
868 
869     /**
870      * Parse a set of rules.  After the parse completes, examine the public
871      * data members for results.
872      */
parse(String rules, int dir)873     public void parse(String rules, int dir) {
874         parseRules(new RuleArray(new String[] { rules }), dir);
875     }
876 
877     /*
878      * Parse a set of rules.  After the parse completes, examine the public
879      * data members for results.
880      */
881 /*    public void parse(ResourceReader rules, int direction) {
882         parseRules(new RuleReader(rules), direction);
883     }*/
884 
885     //----------------------------------------------------------------------
886     // PRIVATE methods
887     //----------------------------------------------------------------------
888 
889     /**
890      * Parse an array of zero or more rules.  The strings in the array are
891      * treated as if they were concatenated together, with rule terminators
892      * inserted between array elements if not present already.
893      *
894      * Any previous rules are discarded.  Typically this method is called exactly
895      * once, during construction.
896      *
897      * The member this.data will be set to null if there are no rules.
898      *
899      * @exception IllegalIcuArgumentException if there is a syntax error in the
900      * rules
901      */
parseRules(RuleBody ruleArray, int dir)902     void parseRules(RuleBody ruleArray, int dir) {
903         boolean parsingIDs = true;
904         int ruleCount = 0;
905 
906         dataVector = new ArrayList<Data>();
907         idBlockVector = new ArrayList<String>();
908         curData = null;
909         direction = dir;
910         compoundFilter = null;
911         variablesVector = new ArrayList<Object>();
912         variableNames = new HashMap<String, char[]>();
913         parseData = new ParseData();
914 
915         List<RuntimeException> errors = new ArrayList<RuntimeException>();
916         int errorCount = 0;
917 
918         ruleArray.reset();
919 
920         StringBuilder idBlockResult = new StringBuilder();
921 
922         // The compound filter offset is an index into idBlockResult.
923         // If it is 0, then the compound filter occurred at the start,
924         // and it is the offset to the _start_ of the compound filter
925         // pattern.  Otherwise it is the offset to the _limit_ of the
926         // compound filter pattern within idBlockResult.
927         this.compoundFilter = null;
928         int compoundFilterOffset = -1;
929 
930     main:
931         for (;;) {
932             String rule = ruleArray.nextLine();
933             if (rule == null) {
934                 break;
935             }
936             int pos = 0;
937             int limit = rule.length();
938             while (pos < limit) {
939                 char c = rule.charAt(pos++);
940                 if (PatternProps.isWhiteSpace(c)) {
941                     continue;
942                 }
943                 // Skip lines starting with the comment character
944                 if (c == RULE_COMMENT_CHAR) {
945                     pos = rule.indexOf("\n", pos) + 1;
946                     if (pos == 0) {
947                         break; // No "\n" found; rest of rule is a commnet
948                     }
949                     continue; // Either fall out or restart with next line
950                 }
951 
952                 // skip empty rules
953                 if (c == END_OF_RULE)
954                     continue;
955 
956                 // Often a rule file contains multiple errors.  It's
957                 // convenient to the rule author if these are all reported
958                 // at once.  We keep parsing rules even after a failure, up
959                 // to a specified limit, and report all errors at once.
960                 try {
961                     ++ruleCount;
962 
963                     // We've found the start of a rule or ID.  c is its first
964                     // character, and pos points past c.
965                     --pos;
966                     // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
967                     // chars left.
968                     if ((pos + ID_TOKEN_LEN + 1) <= limit &&
969                             rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
970                         pos += ID_TOKEN_LEN;
971                         c = rule.charAt(pos);
972                         while (PatternProps.isWhiteSpace(c) && pos < limit) {
973                             ++pos;
974                             c = rule.charAt(pos);
975                         }
976                         int[] p = new int[] { pos };
977 
978                         if (!parsingIDs) {
979                             if (curData != null) {
980                                 if (direction == Transliterator.FORWARD)
981                                     dataVector.add(curData);
982                                 else
983                                     dataVector.add(0, curData);
984                                 curData = null;
985                             }
986                             parsingIDs = true;
987                         }
988 
989                         TransliteratorIDParser.SingleID id =
990                             TransliteratorIDParser.parseSingleID(
991                                           rule, p, direction);
992                         if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
993                             // Successful ::ID parse.
994 
995                             if (direction == Transliterator.FORWARD) {
996                                 idBlockResult.append(id.canonID).append(END_OF_RULE);
997                             } else {
998                                 idBlockResult.insert(0, id.canonID + END_OF_RULE);
999                             }
1000 
1001                         } else {
1002                             // Couldn't parse an ID.  Try to parse a global filter
1003                             int[] withParens = new int[] { -1 };
1004                             UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
1005                             if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
1006                                 if ((direction == Transliterator.FORWARD) ==
1007                                     (withParens[0] == 0)) {
1008                                     if (compoundFilter != null) {
1009                                         // Multiple compound filters
1010                                         syntaxError("Multiple global filters", rule, pos);
1011                                     }
1012                                     compoundFilter = f;
1013                                     compoundFilterOffset = ruleCount;
1014                                }
1015                             } else {
1016                                 // Invalid ::id
1017                                 // Can be parsed as neither an ID nor a global filter
1018                                 syntaxError("Invalid ::ID", rule, pos);
1019                             }
1020                         }
1021 
1022                         pos = p[0];
1023                     } else {
1024                         if (parsingIDs) {
1025                             if (direction == Transliterator.FORWARD)
1026                                 idBlockVector.add(idBlockResult.toString());
1027                             else
1028                                 idBlockVector.add(0, idBlockResult.toString());
1029                             idBlockResult.delete(0, idBlockResult.length());
1030                             parsingIDs = false;
1031                             curData = new RuleBasedTransliterator.Data();
1032 
1033                             // By default, rules use part of the private use area
1034                             // E000..F8FF for variables and other stand-ins.  Currently
1035                             // the range F000..F8FF is typically sufficient.  The 'use
1036                             // variable range' pragma allows rule sets to modify this.
1037                             setVariableRange(0xF000, 0xF8FF);
1038                         }
1039 
1040                         if (resemblesPragma(rule, pos, limit)) {
1041                             int ppp = parsePragma(rule, pos, limit);
1042                             if (ppp < 0) {
1043                                 syntaxError("Unrecognized pragma", rule, pos);
1044                             }
1045                             pos = ppp;
1046                         // Parse a rule
1047                         } else {
1048                             pos = parseRule(rule, pos, limit);
1049                         }
1050                     }
1051                 } catch (IllegalArgumentException e) {
1052                     if (errorCount == 30) {
1053                         IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched");
1054                         icuEx.initCause(e);
1055                         errors.add(icuEx);
1056                         break main;
1057                     }
1058                     e.fillInStackTrace();
1059                     errors.add(e);
1060                     ++errorCount;
1061                     pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1062                 }
1063             }
1064         }
1065         if (parsingIDs && idBlockResult.length() > 0) {
1066             if (direction == Transliterator.FORWARD)
1067                 idBlockVector.add(idBlockResult.toString());
1068             else
1069                 idBlockVector.add(0, idBlockResult.toString());
1070         }
1071         else if (!parsingIDs && curData != null) {
1072             if (direction == Transliterator.FORWARD)
1073                 dataVector.add(curData);
1074             else
1075                 dataVector.add(0, curData);
1076         }
1077 
1078         // Convert the set vector to an array
1079         for (int i = 0; i < dataVector.size(); i++) {
1080             Data data = dataVector.get(i);
1081             data.variables = new Object[variablesVector.size()];
1082             variablesVector.toArray(data.variables);
1083             data.variableNames = new HashMap<String, char[]>();
1084             data.variableNames.putAll(variableNames);
1085         }
1086         variablesVector = null;
1087 
1088         // Do more syntax checking and index the rules
1089         try {
1090             if (compoundFilter != null) {
1091                 if ((direction == Transliterator.FORWARD &&
1092                      compoundFilterOffset != 1) ||
1093                     (direction == Transliterator.REVERSE &&
1094                      compoundFilterOffset != ruleCount)) {
1095                     throw new IllegalIcuArgumentException("Compound filters misplaced");
1096                 }
1097             }
1098 
1099             for (int i = 0; i < dataVector.size(); i++) {
1100                 Data data = dataVector.get(i);
1101                 data.ruleSet.freeze();
1102             }
1103 
1104             if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
1105                 idBlockVector.remove(0);
1106 
1107         } catch (IllegalArgumentException e) {
1108             e.fillInStackTrace();
1109             errors.add(e);
1110         }
1111 
1112         if (errors.size() != 0) {
1113             for (int i = errors.size()-1; i > 0; --i) {
1114                 RuntimeException previous = errors.get(i-1);
1115                 while (previous.getCause() != null) {
1116                     previous = (RuntimeException) previous.getCause(); // chain specially
1117                 }
1118                 previous.initCause(errors.get(i));
1119             }
1120             throw errors.get(0);
1121             // if initCause not supported: throw new IllegalArgumentException(errors.toString());
1122         }
1123     }
1124 
1125     /**
1126      * MAIN PARSER.  Parse the next rule in the given rule string, starting
1127      * at pos.  Return the index after the last character parsed.  Do not
1128      * parse characters at or after limit.
1129      *
1130      * Important:  The character at pos must be a non-whitespace character
1131      * that is not the comment character.
1132      *
1133      * This method handles quoting, escaping, and whitespace removal.  It
1134      * parses the end-of-rule character.  It recognizes context and cursor
1135      * indicators.  Once it does a lexical breakdown of the rule at pos, it
1136      * creates a rule object and adds it to our rule list.
1137      *
1138      * This method is tightly coupled to the inner class RuleHalf.
1139      */
parseRule(String rule, int pos, int limit)1140     private int parseRule(String rule, int pos, int limit) {
1141         // Locate the left side, operator, and right side
1142         int start = pos;
1143         char operator = 0;
1144 
1145         // Set up segments data
1146         segmentStandins = new StringBuffer();
1147         segmentObjects = new ArrayList<StringMatcher>();
1148 
1149         RuleHalf left  = new RuleHalf();
1150         RuleHalf right = new RuleHalf();
1151 
1152         undefinedVariableName = null;
1153         pos = left.parse(rule, pos, limit, this);
1154 
1155         if (pos == limit ||
1156             OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1157             syntaxError("No operator pos=" + pos, rule, start);
1158         }
1159         ++pos;
1160 
1161         // Found an operator char.  Check for forward-reverse operator.
1162         if (operator == REVERSE_RULE_OP &&
1163             (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1164             ++pos;
1165             operator = FWDREV_RULE_OP;
1166         }
1167 
1168         // Translate alternate op characters.
1169         switch (operator) {
1170         case ALT_FORWARD_RULE_OP:
1171             operator = FORWARD_RULE_OP;
1172             break;
1173         case ALT_REVERSE_RULE_OP:
1174             operator = REVERSE_RULE_OP;
1175             break;
1176         case ALT_FWDREV_RULE_OP:
1177             operator = FWDREV_RULE_OP;
1178             break;
1179         }
1180 
1181         pos = right.parse(rule, pos, limit, this);
1182 
1183         if (pos < limit) {
1184             if (rule.charAt(--pos) == END_OF_RULE) {
1185                 ++pos;
1186             } else {
1187                 // RuleHalf parser must have terminated at an operator
1188                 syntaxError("Unquoted operator", rule, start);
1189             }
1190         }
1191 
1192         if (operator == VARIABLE_DEF_OP) {
1193             // LHS is the name.  RHS is a single character, either a literal
1194             // or a set (already parsed).  If RHS is longer than one
1195             // character, it is either a multi-character string, or multiple
1196             // sets, or a mixture of chars and sets -- syntax error.
1197 
1198             // We expect to see a single undefined variable (the one being
1199             // defined).
1200             if (undefinedVariableName == null) {
1201                 syntaxError("Missing '$' or duplicate definition", rule, start);
1202             }
1203             if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
1204                 syntaxError("Malformed LHS", rule, start);
1205             }
1206             if (left.anchorStart || left.anchorEnd ||
1207                 right.anchorStart || right.anchorEnd) {
1208                 syntaxError("Malformed variable def", rule, start);
1209             }
1210             // We allow anything on the right, including an empty string.
1211             int n = right.text.length();
1212             char[] value = new char[n];
1213             right.text.getChars(0, n, value, 0);
1214             variableNames.put(undefinedVariableName, value);
1215 
1216             ++variableLimit;
1217             return pos;
1218         }
1219 
1220         // If this is not a variable definition rule, we shouldn't have
1221         // any undefined variable names.
1222         if (undefinedVariableName != null) {
1223             syntaxError("Undefined variable $" + undefinedVariableName,
1224                         rule, start);
1225         }
1226 
1227         // Verify segments
1228         if (segmentStandins.length() > segmentObjects.size()) {
1229             syntaxError("Undefined segment reference", rule, start);
1230         }
1231         for (int i=0; i<segmentStandins.length(); ++i) {
1232             if (segmentStandins.charAt(i) == 0) {
1233                 syntaxError("Internal error", rule, start); // will never happen
1234             }
1235         }
1236         for (int i=0; i<segmentObjects.size(); ++i) {
1237             if (segmentObjects.get(i) == null) {
1238                 syntaxError("Internal error", rule, start); // will never happen
1239             }
1240         }
1241 
1242         // If the direction we want doesn't match the rule
1243         // direction, do nothing.
1244         if (operator != FWDREV_RULE_OP &&
1245             ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1246             return pos;
1247         }
1248 
1249         // Transform the rule into a forward rule by swapping the
1250         // sides if necessary.
1251         if (direction == Transliterator.REVERSE) {
1252             RuleHalf temp = left;
1253             left = right;
1254             right = temp;
1255         }
1256 
1257         // Remove non-applicable elements in forward-reverse
1258         // rules.  Bidirectional rules ignore elements that do not
1259         // apply.
1260         if (operator == FWDREV_RULE_OP) {
1261             right.removeContext();
1262             left.cursor = -1;
1263             left.cursorOffset = 0;
1264         }
1265 
1266         // Normalize context
1267         if (left.ante < 0) {
1268             left.ante = 0;
1269         }
1270         if (left.post < 0) {
1271             left.post = left.text.length();
1272         }
1273 
1274         // Context is only allowed on the input side.  Cursors are only
1275         // allowed on the output side.  Segment delimiters can only appear
1276         // on the left, and references on the right.  Cursor offset
1277         // cannot appear without an explicit cursor.  Cursor offset
1278         // cannot place the cursor outside the limits of the context.
1279         // Anchors are only allowed on the input side.
1280         if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
1281             (right.cursorOffset != 0 && right.cursor < 0) ||
1282             // - The following two checks were used to ensure that the
1283             // - the cursor offset stayed within the ante- or postcontext.
1284             // - However, with the addition of quantifiers, we have to
1285             // - allow arbitrary cursor offsets and do runtime checking.
1286             //(right.cursorOffset > (left.text.length() - left.post)) ||
1287             //(-right.cursorOffset > left.ante) ||
1288             right.anchorStart || right.anchorEnd ||
1289             !left.isValidInput(this) || !right.isValidOutput(this) ||
1290             left.ante > left.post) {
1291             syntaxError("Malformed rule", rule, start);
1292         }
1293 
1294         // Flatten segment objects vector to an array
1295         UnicodeMatcher[] segmentsArray = null;
1296         if (segmentObjects.size() > 0) {
1297             segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1298             segmentObjects.toArray(segmentsArray);
1299         }
1300 
1301         curData.ruleSet.addRule(new TransliterationRule(
1302                                      left.text, left.ante, left.post,
1303                                      right.text, right.cursor, right.cursorOffset,
1304                                      segmentsArray,
1305                                      left.anchorStart, left.anchorEnd,
1306                                      curData));
1307 
1308         return pos;
1309     }
1310 
1311     /**
1312      * Set the variable range to [start, end] (inclusive).
1313      */
setVariableRange(int start, int end)1314     private void setVariableRange(int start, int end) {
1315         if (start > end || start < 0 || end > 0xFFFF) {
1316             throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
1317         }
1318 
1319         curData.variablesBase = (char) start; // first private use
1320 
1321         if (dataVector.size() == 0) {
1322             variableNext = (char) start;
1323             variableLimit = (char) (end + 1);
1324         }
1325     }
1326 
1327     /**
1328      * Assert that the given character is NOT within the variable range.
1329      * If it is, signal an error.  This is neccesary to ensure that the
1330      * variable range does not overlap characters used in a rule.
1331      */
checkVariableRange(int ch, String rule, int start)1332     private void checkVariableRange(int ch, String rule, int start) {
1333         if (ch >= curData.variablesBase && ch < variableLimit) {
1334             syntaxError("Variable range character in rule", rule, start);
1335         }
1336     }
1337 
1338     // (The following method is part of an unimplemented feature.
1339     // Remove this clover pragma after the feature is implemented.
1340     // 2003-06-11 ICU 2.6 Alan)
1341     ///CLOVER:OFF
1342     /**
1343      * Set the maximum backup to 'backup', in response to a pragma
1344      * statement.
1345      */
pragmaMaximumBackup(int backup)1346     private void pragmaMaximumBackup(int backup) {
1347         //TODO Finish
1348         throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
1349     }
1350     ///CLOVER:ON
1351 
1352     // (The following method is part of an unimplemented feature.
1353     // Remove this clover pragma after the feature is implemented.
1354     // 2003-06-11 ICU 2.6 Alan)
1355     ///CLOVER:OFF
1356     /**
1357      * Begin normalizing all rules using the given mode, in response
1358      * to a pragma statement.
1359      */
pragmaNormalizeRules(Normalizer.Mode mode)1360     private void pragmaNormalizeRules(Normalizer.Mode mode) {
1361         //TODO Finish
1362         throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
1363     }
1364     ///CLOVER:ON
1365 
1366     /**
1367      * Return true if the given rule looks like a pragma.
1368      * @param pos offset to the first non-whitespace character
1369      * of the rule.
1370      * @param limit pointer past the last character of the rule.
1371      */
resemblesPragma(String rule, int pos, int limit)1372     static boolean resemblesPragma(String rule, int pos, int limit) {
1373         // Must start with /use\s/i
1374         return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1375     }
1376 
1377     /**
1378      * Parse a pragma.  This method assumes resemblesPragma() has
1379      * already returned true.
1380      * @param pos offset to the first non-whitespace character
1381      * of the rule.
1382      * @param limit pointer past the last character of the rule.
1383      * @return the position index after the final ';' of the pragma,
1384      * or -1 on failure.
1385      */
parsePragma(String rule, int pos, int limit)1386     private int parsePragma(String rule, int pos, int limit) {
1387         int[] array = new int[2];
1388 
1389         // resemblesPragma() has already returned true, so we
1390         // know that pos points to /use\s/i; we can skip 4 characters
1391         // immediately
1392         pos += 4;
1393 
1394         // Here are the pragmas we recognize:
1395         // use variable range 0xE000 0xEFFF;
1396         // use maximum backup 16;
1397         // use nfd rules;
1398         int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
1399         if (p >= 0) {
1400             setVariableRange(array[0], array[1]);
1401             return p;
1402         }
1403 
1404         p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
1405         if (p >= 0) {
1406             pragmaMaximumBackup(array[0]);
1407             return p;
1408         }
1409 
1410         p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
1411         if (p >= 0) {
1412             pragmaNormalizeRules(Normalizer.NFD);
1413             return p;
1414         }
1415 
1416         p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
1417         if (p >= 0) {
1418             pragmaNormalizeRules(Normalizer.NFC);
1419             return p;
1420         }
1421 
1422         // Syntax error: unable to parse pragma
1423         return -1;
1424     }
1425 
1426     /**
1427      * Throw an exception indicating a syntax error.  Search the rule string
1428      * for the probable end of the rule.  Of course, if the error is that
1429      * the end of rule marker is missing, then the rule end will not be found.
1430      * In any case the rule start will be correctly reported.
1431      * @param msg error description
1432      * @param rule pattern string
1433      * @param start position of first character of current rule
1434      */
syntaxError(String msg, String rule, int start)1435     static final void syntaxError(String msg, String rule, int start) {
1436         int end = ruleEnd(rule, start, rule.length());
1437         throw new IllegalIcuArgumentException(msg + " in \"" +
1438                                            Utility.escape(rule.substring(start, end)) + '"');
1439     }
1440 
ruleEnd(String rule, int start, int limit)1441     static final int ruleEnd(String rule, int start, int limit) {
1442         int end = Utility.quotedIndexOf(rule, start, limit, ";");
1443         if (end < 0) {
1444             end = limit;
1445         }
1446         return end;
1447     }
1448 
1449     /**
1450      * Parse a UnicodeSet out, store it, and return the stand-in character
1451      * used to represent it.
1452      */
parseSet(String rule, ParsePosition pos)1453     private final char parseSet(String rule, ParsePosition pos) {
1454         UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1455         if (variableNext >= variableLimit) {
1456             throw new RuntimeException("Private use variables exhausted");
1457         }
1458         set.compact();
1459         return generateStandInFor(set);
1460     }
1461 
1462     /**
1463      * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1464      * Store the object.
1465      */
generateStandInFor(Object obj)1466     char generateStandInFor(Object obj) {
1467         // assert(obj != null);
1468 
1469         // Look up previous stand-in, if any.  This is a short list
1470         // (typical n is 0, 1, or 2); linear search is optimal.
1471         for (int i=0; i<variablesVector.size(); ++i) {
1472             if (variablesVector.get(i) == obj) { // [sic] pointer comparison
1473                 return (char) (curData.variablesBase + i);
1474             }
1475         }
1476 
1477         if (variableNext >= variableLimit) {
1478             throw new RuntimeException("Variable range exhausted");
1479         }
1480         variablesVector.add(obj);
1481         return variableNext++;
1482     }
1483 
1484     /**
1485      * Return the standin for segment seg (1-based).
1486      */
getSegmentStandin(int seg)1487     public char getSegmentStandin(int seg) {
1488         if (segmentStandins.length() < seg) {
1489             segmentStandins.setLength(seg);
1490         }
1491         char c = segmentStandins.charAt(seg-1);
1492         if (c == 0) {
1493             if (variableNext >= variableLimit) {
1494                 throw new RuntimeException("Variable range exhausted");
1495             }
1496             c = variableNext++;
1497             // Set a placeholder in the master variables vector that will be
1498             // filled in later by setSegmentObject().  We know that we will get
1499             // called first because setSegmentObject() will call us.
1500             variablesVector.add(null);
1501             segmentStandins.setCharAt(seg-1, c);
1502         }
1503         return c;
1504     }
1505 
1506     /**
1507      * Set the object for segment seg (1-based).
1508      */
setSegmentObject(int seg, StringMatcher obj)1509     public void setSegmentObject(int seg, StringMatcher obj) {
1510         // Since we call parseSection() recursively, nested
1511         // segments will result in segment i+1 getting parsed
1512         // and stored before segment i; be careful with the
1513         // vector handling here.
1514         while (segmentObjects.size() < seg) {
1515             segmentObjects.add(null);
1516         }
1517         int index = getSegmentStandin(seg) - curData.variablesBase;
1518         if (segmentObjects.get(seg-1) != null ||
1519             variablesVector.get(index) != null) {
1520             throw new RuntimeException(); // should never happen
1521         }
1522         segmentObjects.set(seg-1, obj);
1523         variablesVector.set(index, obj);
1524     }
1525 
1526     /**
1527      * Return the stand-in for the dot set.  It is allocated the first
1528      * time and reused thereafter.
1529      */
getDotStandIn()1530     char getDotStandIn() {
1531         if (dotStandIn == -1) {
1532             dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1533         }
1534         return (char) dotStandIn;
1535     }
1536 
1537     /**
1538      * Append the value of the given variable name to the given
1539      * StringBuffer.
1540      * @exception IllegalIcuArgumentException if the name is unknown.
1541      */
appendVariableDef(String name, StringBuffer buf)1542     private void appendVariableDef(String name, StringBuffer buf) {
1543         char[] ch = variableNames.get(name);
1544         if (ch == null) {
1545             // We allow one undefined variable so that variable definition
1546             // statements work.  For the first undefined variable we return
1547             // the special placeholder variableLimit-1, and save the variable
1548             // name.
1549             if (undefinedVariableName == null) {
1550                 undefinedVariableName = name;
1551                 if (variableNext >= variableLimit) {
1552                     throw new RuntimeException("Private use variables exhausted");
1553                 }
1554                 buf.append(--variableLimit);
1555             } else {
1556                 throw new IllegalIcuArgumentException("Undefined variable $"
1557                                                    + name);
1558             }
1559         } else {
1560             buf.append(ch);
1561         }
1562     }
1563 }
1564 
1565 //eof
1566