• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
10 
11 package android.icu.text;
12 
13 import java.io.IOException;
14 import java.nio.ByteBuffer;
15 import java.nio.ByteOrder;
16 
17 import android.icu.impl.CharTrie;
18 import android.icu.impl.ICUBinary;
19 import android.icu.impl.ICUBinary.Authenticate;
20 import android.icu.impl.Trie;
21 
22 /**
23 * <p>Internal class used for Rule Based Break Iterators</p>
24 * <p>This class provides access to the compiled break rule data, as
25 * it is stored in a .brk file.
26 */
27 final class RBBIDataWrapper {
28     //
29     // These fields are the ready-to-use compiled rule data, as
30     //   read from the file.
31     //
32     RBBIDataHeader fHeader;
33     short          fFTable[];
34     short          fRTable[];
35     short          fSFTable[];
36     short          fSRTable[];
37     CharTrie       fTrie;
38     String         fRuleSource;
39     int            fStatusTable[];
40 
41     private boolean isBigEndian;
42 
43     static final int DATA_FORMAT = 0x42726b20;  // "Brk "
44     static final int FORMAT_VERSION = 0x03010000;  // 3.1
45 
46     private static final class IsAcceptable implements Authenticate {
47         // @Override when we switch to Java 6
48         @Override
isDataVersionAcceptable(byte version[])49         public boolean isDataVersionAcceptable(byte version[]) {
50             return version[0] == (FORMAT_VERSION >>> 24);
51         }
52     }
53     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
54 
55     //
56     // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
57     //   Used by the rule compiler when flattening the data.
58     //
59     final static int    DH_SIZE           = 24;
60     final static int    DH_MAGIC          = 0;
61     final static int    DH_FORMATVERSION  = 1;
62     final static int    DH_LENGTH         = 2;
63     final static int    DH_CATCOUNT       = 3;
64     final static int    DH_FTABLE         = 4;
65     final static int    DH_FTABLELEN      = 5;
66     final static int    DH_RTABLE         = 6;
67     final static int    DH_RTABLELEN      = 7;
68     final static int    DH_SFTABLE        = 8;
69     final static int    DH_SFTABLELEN     = 9;
70     final static int    DH_SRTABLE        = 10;
71     final static int    DH_SRTABLELEN     = 11;
72     final static int    DH_TRIE           = 12;
73     final static int    DH_TRIELEN        = 13;
74     final static int    DH_RULESOURCE     = 14;
75     final static int    DH_RULESOURCELEN  = 15;
76     final static int    DH_STATUSTABLE    = 16;
77     final static int    DH_STATUSTABLELEN = 17;
78 
79 
80     // Index offsets to the fields in a state table row.
81     //    Corresponds to struct RBBIStateTableRow in the C version.
82     //
83     final static int      ACCEPTING  = 0;
84     final static int      LOOKAHEAD  = 1;
85     final static int      TAGIDX     = 2;
86     final static int      RESERVED   = 3;
87     final static int      NEXTSTATES = 4;
88 
89     // Index offsets to header fields of a state table
90     //     struct RBBIStateTable {...   in the C version.
91     //
92             static final int NUMSTATES  = 0;
93             static final int ROWLEN     = 2;
94             static final int FLAGS      = 4;
95     //ivate static final int RESERVED_2 = 6;
96     private static final int ROW_DATA   = 8;
97 
98     //  Bit selectors for the "FLAGS" field of the state table header
99     //     enum RBBIStateTableFlags in the C version.
100     //
101     final static int      RBBI_LOOKAHEAD_HARD_BREAK = 1;
102     final static int      RBBI_BOF_REQUIRED         = 2;
103 
104     /**
105      * Data Header.  A struct-like class with the fields from the RBBI data file header.
106      */
107     final static class RBBIDataHeader {
108         int         fMagic;         //  == 0xbla0
109         int         fVersion;       //  == 1 (for ICU 3.2 and earlier.
110         byte[]      fFormatVersion; //  For ICU 3.4 and later.
111         int         fLength;        //  Total length in bytes of this RBBI Data,
112                                        //      including all sections, not just the header.
113         int         fCatCount;      //  Number of character categories.
114 
115         //
116         //  Offsets and sizes of each of the subsections within the RBBI data.
117         //  All offsets are bytes from the start of the RBBIDataHeader.
118         //  All sizes are in bytes.
119         //
120         int         fFTable;         //  forward state transition table.
121         int         fFTableLen;
122         int         fRTable;         //  Offset to the reverse state transition table.
123         int         fRTableLen;
124         int         fSFTable;        //  safe point forward transition table
125         int         fSFTableLen;
126         int         fSRTable;        //  safe point reverse transition table
127         int         fSRTableLen;
128         int         fTrie;           //  Offset to Trie data for character categories
129         int         fTrieLen;
130         int         fRuleSource;     //  Offset to the source for for the break
131         int         fRuleSourceLen;  //    rules.  Stored UChar *.
132         int         fStatusTable;    // Offset to the table of rule status values
133         int         fStatusTableLen;
134 
RBBIDataHeader()135         public RBBIDataHeader() {
136             fMagic = 0;
137             fFormatVersion = new byte[4];
138         }
139     }
140 
141 
142     /**
143      * RBBI State Table Indexing Function.  Given a state number, return the
144      * array index of the start of the state table row for that state.
145      *
146      */
getRowIndex(int state)147     int getRowIndex(int state){
148         return ROW_DATA + state * (fHeader.fCatCount + 4);
149     }
150 
151     static class TrieFoldingFunc implements  Trie.DataManipulate {
152         @Override
getFoldingOffset(int data)153         public int getFoldingOffset(int data) {
154             if ((data & 0x8000) != 0) {
155                 return data & 0x7fff;
156             } else {
157                 return 0;
158             }
159         }
160     }
161     static TrieFoldingFunc  fTrieFoldingFunc = new TrieFoldingFunc();
162 
163 
RBBIDataWrapper()164     RBBIDataWrapper() {
165     }
166 
167     /*
168      *  Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
169      *  of RBBI rules.
170      */
get(ByteBuffer bytes)171     static RBBIDataWrapper get(ByteBuffer bytes) throws IOException {
172         RBBIDataWrapper This = new RBBIDataWrapper();
173 
174         ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
175         This.isBigEndian = bytes.order() == ByteOrder.BIG_ENDIAN;
176 
177         // Read in the RBBI data header...
178         This.fHeader = new  RBBIDataHeader();
179         This.fHeader.fMagic          = bytes.getInt();
180         // Read the same 4 bytes as an int and as a byte array: The data format could be
181         // the old fVersion=1 (TODO: probably not with a real ICU data header?)
182         // or the new fFormatVersion=3.x.
183         This.fHeader.fVersion        = bytes.getInt(bytes.position());
184         This.fHeader.fFormatVersion[0] = bytes.get();
185         This.fHeader.fFormatVersion[1] = bytes.get();
186         This.fHeader.fFormatVersion[2] = bytes.get();
187         This.fHeader.fFormatVersion[3] = bytes.get();
188         This.fHeader.fLength         = bytes.getInt();
189         This.fHeader.fCatCount       = bytes.getInt();
190         This.fHeader.fFTable         = bytes.getInt();
191         This.fHeader.fFTableLen      = bytes.getInt();
192         This.fHeader.fRTable         = bytes.getInt();
193         This.fHeader.fRTableLen      = bytes.getInt();
194         This.fHeader.fSFTable        = bytes.getInt();
195         This.fHeader.fSFTableLen     = bytes.getInt();
196         This.fHeader.fSRTable        = bytes.getInt();
197         This.fHeader.fSRTableLen     = bytes.getInt();
198         This.fHeader.fTrie           = bytes.getInt();
199         This.fHeader.fTrieLen        = bytes.getInt();
200         This.fHeader.fRuleSource     = bytes.getInt();
201         This.fHeader.fRuleSourceLen  = bytes.getInt();
202         This.fHeader.fStatusTable    = bytes.getInt();
203         This.fHeader.fStatusTableLen = bytes.getInt();
204         ICUBinary.skipBytes(bytes, 6 * 4);    // uint32_t  fReserved[6];
205 
206 
207         if (This.fHeader.fMagic != 0xb1a0 ||
208                 ! (This.fHeader.fVersion == 1  ||         // ICU 3.2 and earlier
209                    This.fHeader.fFormatVersion[0] == 3)   // ICU 3.4
210             ) {
211             throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
212         }
213 
214         // Current position in the buffer.
215         int pos = 24 * 4;     // offset of end of header, which has 24 fields, all int32_t (4 bytes)
216 
217         //
218         // Read in the Forward state transition table as an array of shorts.
219         //
220 
221         //   Quick Sanity Check
222         if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) {
223              throw new IOException("Break iterator Rule data corrupt");
224         }
225 
226         //    Skip over any padding preceding this table
227         ICUBinary.skipBytes(bytes, This.fHeader.fFTable - pos);
228         pos = This.fHeader.fFTable;
229 
230         This.fFTable = ICUBinary.getShorts(
231                 bytes, This.fHeader.fFTableLen / 2, This.fHeader.fFTableLen & 1);
232         pos += This.fHeader.fFTableLen;
233 
234         //
235         // Read in the Reverse state table
236         //
237 
238         // Skip over any padding in the file
239         ICUBinary.skipBytes(bytes, This.fHeader.fRTable - pos);
240         pos = This.fHeader.fRTable;
241 
242         // Create & fill the table itself.
243         This.fRTable = ICUBinary.getShorts(
244                 bytes, This.fHeader.fRTableLen / 2, This.fHeader.fRTableLen & 1);
245         pos += This.fHeader.fRTableLen;
246 
247         //
248         // Read in the Safe Forward state table
249         //
250         if (This.fHeader.fSFTableLen > 0) {
251             // Skip over any padding in the file
252             ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
253             pos = This.fHeader.fSFTable;
254 
255             // Create & fill the table itself.
256             This.fSFTable = ICUBinary.getShorts(
257                     bytes, This.fHeader.fSFTableLen / 2, This.fHeader.fSFTableLen & 1);
258             pos += This.fHeader.fSFTableLen;
259         }
260 
261         //
262         // Read in the Safe Reverse state table
263         //
264         if (This.fHeader.fSRTableLen > 0) {
265             // Skip over any padding in the file
266             ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
267             pos = This.fHeader.fSRTable;
268 
269             // Create & fill the table itself.
270             This.fSRTable = ICUBinary.getShorts(
271                     bytes, This.fHeader.fSRTableLen / 2, This.fHeader.fSRTableLen & 1);
272             pos += This.fHeader.fSRTableLen;
273         }
274 
275         //
276         // Unserialize the Character categories TRIE
277         //     Because we can't be absolutely certain where the Trie deserialize will
278         //     leave the buffer, leave position unchanged.
279         //     The seek to the start of the next item following the TRIE will get us
280         //     back in sync.
281         //
282         ICUBinary.skipBytes(bytes, This.fHeader.fTrie - pos);  // seek buffer from end of
283         pos = This.fHeader.fTrie;               // previous section to the start of the trie
284 
285         bytes.mark();                           // Mark position of start of TRIE in the input
286                                                 //  and tell Java to keep the mark valid so long
287                                                 //  as we don't go more than 100 bytes past the
288                                                 //  past the end of the TRIE.
289 
290         This.fTrie = new CharTrie(bytes, fTrieFoldingFunc);  // Deserialize the TRIE, leaving buffer
291                                                 //  at an unknown position, preceding the
292                                                 //  padding between TRIE and following section.
293 
294         bytes.reset();                          // Move buffer back to marked position at
295                                                 //   the start of the serialized TRIE.  Now our
296                                                 //   "pos" variable and the buffer are in
297                                                 //   agreement.
298 
299         //
300         // Read the Rule Status Table
301         //
302         if (pos > This.fHeader.fStatusTable) {
303             throw new IOException("Break iterator Rule data corrupt");
304         }
305         ICUBinary.skipBytes(bytes, This.fHeader.fStatusTable - pos);
306         pos = This.fHeader.fStatusTable;
307         This.fStatusTable = ICUBinary.getInts(
308                 bytes, This.fHeader.fStatusTableLen / 4, This.fHeader.fStatusTableLen & 3);
309         pos += This.fHeader.fStatusTableLen;
310 
311         //
312         // Put the break rule source into a String
313         //
314         if (pos > This.fHeader.fRuleSource) {
315             throw new IOException("Break iterator Rule data corrupt");
316         }
317         ICUBinary.skipBytes(bytes, This.fHeader.fRuleSource - pos);
318         pos = This.fHeader.fRuleSource;
319         This.fRuleSource = ICUBinary.getString(
320                 bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1);
321 
322         if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
323             This.dump(System.out);
324         }
325         return This;
326     }
327 
328     ///CLOVER:OFF
329     //  Getters for fields from the state table header
330     //
getStateTableNumStates(short table[])331     private int getStateTableNumStates(short table[]) {
332         if (isBigEndian) {
333             return (table[NUMSTATES] << 16) | (table[NUMSTATES+1] & 0xffff);
334         } else {
335             return (table[NUMSTATES+1] << 16) | (table[NUMSTATES] & 0xffff);
336         }
337     }
338     ///CLOVER:ON
339 
getStateTableFlags(short table[])340     int getStateTableFlags(short table[]) {
341         // This works for up to 15 flags bits.
342         return table[isBigEndian ? FLAGS + 1 : FLAGS];
343     }
344 
345     ///CLOVER:OFF
346     /* Debug function to display the break iterator data. */
dump(java.io.PrintStream out)347     void dump(java.io.PrintStream out) {
348         if (fFTable.length == 0) {
349             // There is no table. Fail early for testing purposes.
350             throw new NullPointerException();
351         }
352         out.println("RBBI Data Wrapper dump ...");
353         out.println();
354         out.println("Forward State Table");
355         dumpTable(out, fFTable);
356         out.println("Reverse State Table");
357         dumpTable(out, fRTable);
358         out.println("Forward Safe Points Table");
359         dumpTable(out, fSFTable);
360         out.println("Reverse Safe Points Table");
361         dumpTable(out, fSRTable);
362 
363         dumpCharCategories(out);
364         out.println("Source Rules: " + fRuleSource);
365 
366     }
367     ///CLOVER:ON
368 
369     ///CLOVER:OFF
370     /* Fixed width int-to-string conversion. */
intToString(int n, int width)371     static public String intToString(int n, int width) {
372         StringBuilder  dest = new StringBuilder(width);
373         dest.append(n);
374         while (dest.length() < width) {
375            dest.insert(0, ' ');
376         }
377         return dest.toString();
378     }
379     ///CLOVER:ON
380 
381     ///CLOVER:OFF
382     /* Fixed width int-to-string conversion. */
intToHexString(int n, int width)383     static public String intToHexString(int n, int width) {
384         StringBuilder  dest = new StringBuilder(width);
385         dest.append(Integer.toHexString(n));
386         while (dest.length() < width) {
387            dest.insert(0, ' ');
388         }
389         return dest.toString();
390     }
391     ///CLOVER:ON
392 
393     ///CLOVER:OFF
394     /** Dump a state table.  (A full set of RBBI rules has 4 state tables.)  */
dumpTable(java.io.PrintStream out, short table[])395     private void dumpTable(java.io.PrintStream out, short table[]) {
396         if (table == null)   {
397             out.println("  -- null -- ");
398         } else {
399             int n;
400             int state;
401             StringBuilder header = new StringBuilder(" Row  Acc Look  Tag");
402             for (n=0; n<fHeader.fCatCount; n++) {
403                 header.append(intToString(n, 5));
404             }
405             out.println(header.toString());
406             for (n=0; n<header.length(); n++) {
407                 out.print("-");
408             }
409             out.println();
410             for (state=0; state< getStateTableNumStates(table); state++) {
411                 dumpRow(out, table, state);
412             }
413             out.println();
414         }
415     }
416     ///CLOVER:ON
417 
418     ///CLOVER:OFF
419     /**
420      * Dump (for debug) a single row of an RBBI state table
421      * @param table
422      * @param state
423      */
dumpRow(java.io.PrintStream out, short table[], int state)424     private void dumpRow(java.io.PrintStream out, short table[], int   state) {
425         StringBuilder dest = new StringBuilder(fHeader.fCatCount*5 + 20);
426         dest.append(intToString(state, 4));
427         int row = getRowIndex(state);
428         if (table[row+ACCEPTING] != 0) {
429            dest.append(intToString(table[row+ACCEPTING], 5));
430         }else {
431             dest.append("     ");
432         }
433         if (table[row+LOOKAHEAD] != 0) {
434             dest.append(intToString(table[row+LOOKAHEAD], 5));
435         }else {
436             dest.append("     ");
437         }
438         dest.append(intToString(table[row+TAGIDX], 5));
439 
440         for (int col=0; col<fHeader.fCatCount; col++) {
441             dest.append(intToString(table[row+NEXTSTATES+col], 5));
442         }
443 
444         out.println(dest);
445     }
446     ///CLOVER:ON
447 
448     ///CLOVER:OFF
dumpCharCategories(java.io.PrintStream out)449     private void dumpCharCategories(java.io.PrintStream out) {
450         int n = fHeader.fCatCount;
451         String   catStrings[] = new  String[n+1];
452         int      rangeStart = 0;
453         int      rangeEnd = 0;
454         int      lastCat = -1;
455         int      char32;
456         int      category;
457         int      lastNewline[] = new int[n+1];
458 
459         for (category = 0; category <= fHeader.fCatCount; category ++) {
460             catStrings[category] = "";
461         }
462         out.println("\nCharacter Categories");
463         out.println("--------------------");
464         for (char32 = 0; char32<=0x10ffff; char32++) {
465             category = fTrie.getCodePointValue(char32);
466             category &= ~0x4000;            // Mask off dictionary bit.
467             if (category < 0 || category > fHeader.fCatCount) {
468                 out.println("Error, bad category " + Integer.toHexString(category) +
469                         " for char " + Integer.toHexString(char32));
470                 break;
471             }
472             if (category == lastCat ) {
473                 rangeEnd = char32;
474             } else {
475                 if (lastCat >= 0) {
476                     if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
477                         lastNewline[lastCat] = catStrings[lastCat].length() + 10;
478                         catStrings[lastCat] += "\n       ";
479                     }
480 
481                     catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
482                     if (rangeEnd != rangeStart) {
483                         catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
484                     }
485                 }
486                 lastCat = category;
487                 rangeStart = rangeEnd = char32;
488             }
489         }
490         catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
491         if (rangeEnd != rangeStart) {
492             catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
493         }
494 
495         for (category = 0; category <= fHeader.fCatCount; category ++) {
496             out.println (intToString(category, 5) + "  " + catStrings[category]);
497         }
498         out.println();
499     }
500     ///CLOVER:ON
501 
502     /*static RBBIDataWrapper get(String name) throws IOException {
503         String  fullName = "data/" + name;
504         InputStream is = ICUData.getRequiredStream(fullName);
505         return get(is);
506     }
507 
508     public static void main(String[] args) {
509         String s;
510         if (args.length == 0) {
511             s = "char";
512         } else {
513             s = args[0];
514         }
515         System.out.println("RBBIDataWrapper.main(" + s + ") ");
516 
517         String versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";
518 
519         try {
520             RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
521             This.dump();
522         }
523        catch (Exception e) {
524            System.out.println("Exception: " + e.toString());
525        }
526 
527     }*/
528 }
529