• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import java.io.IOException;
13 import java.nio.ByteBuffer;
14 import java.nio.ByteOrder;
15 
16 import com.ibm.icu.impl.CharTrie;
17 import com.ibm.icu.impl.ICUBinary;
18 import com.ibm.icu.impl.ICUBinary.Authenticate;
19 import com.ibm.icu.impl.Trie;
20 
21 /**
22 * <p>Internal class used for Rule Based Break Iterators</p>
23 * <p>This class provides access to the compiled break rule data, as
24 * it is stored in a .brk file.
25 */
26 final class RBBIDataWrapper {
27     //
28     // These fields are the ready-to-use compiled rule data, as
29     //   read from the file.
30     //
31     RBBIDataHeader fHeader;
32     short          fFTable[];
33     short          fRTable[];
34     short          fSFTable[];
35     short          fSRTable[];
36     CharTrie       fTrie;
37     String         fRuleSource;
38     int            fStatusTable[];
39 
40     private boolean isBigEndian;
41 
42     static final int DATA_FORMAT = 0x42726b20;  // "Brk "
43     static final int FORMAT_VERSION = 0x03010000;  // 3.1
44 
45     private static final class IsAcceptable implements Authenticate {
46         // @Override when we switch to Java 6
47         @Override
isDataVersionAcceptable(byte version[])48         public boolean isDataVersionAcceptable(byte version[]) {
49             return version[0] == (FORMAT_VERSION >>> 24);
50         }
51     }
52     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
53 
54     //
55     // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
56     //   Used by the rule compiler when flattening the data.
57     //
58     final static int    DH_SIZE           = 24;
59     final static int    DH_MAGIC          = 0;
60     final static int    DH_FORMATVERSION  = 1;
61     final static int    DH_LENGTH         = 2;
62     final static int    DH_CATCOUNT       = 3;
63     final static int    DH_FTABLE         = 4;
64     final static int    DH_FTABLELEN      = 5;
65     final static int    DH_RTABLE         = 6;
66     final static int    DH_RTABLELEN      = 7;
67     final static int    DH_SFTABLE        = 8;
68     final static int    DH_SFTABLELEN     = 9;
69     final static int    DH_SRTABLE        = 10;
70     final static int    DH_SRTABLELEN     = 11;
71     final static int    DH_TRIE           = 12;
72     final static int    DH_TRIELEN        = 13;
73     final static int    DH_RULESOURCE     = 14;
74     final static int    DH_RULESOURCELEN  = 15;
75     final static int    DH_STATUSTABLE    = 16;
76     final static int    DH_STATUSTABLELEN = 17;
77 
78 
79     // Index offsets to the fields in a state table row.
80     //    Corresponds to struct RBBIStateTableRow in the C version.
81     //
82     final static int      ACCEPTING  = 0;
83     final static int      LOOKAHEAD  = 1;
84     final static int      TAGIDX     = 2;
85     final static int      RESERVED   = 3;
86     final static int      NEXTSTATES = 4;
87 
88     // Index offsets to header fields of a state table
89     //     struct RBBIStateTable {...   in the C version.
90     //
91             static final int NUMSTATES  = 0;
92             static final int ROWLEN     = 2;
93             static final int FLAGS      = 4;
94     //ivate static final int RESERVED_2 = 6;
95     private static final int ROW_DATA   = 8;
96 
97     //  Bit selectors for the "FLAGS" field of the state table header
98     //     enum RBBIStateTableFlags in the C version.
99     //
100     final static int      RBBI_LOOKAHEAD_HARD_BREAK = 1;
101     final static int      RBBI_BOF_REQUIRED         = 2;
102 
103     /**
104      * Data Header.  A struct-like class with the fields from the RBBI data file header.
105      */
106     final static class RBBIDataHeader {
107         int         fMagic;         //  == 0xbla0
108         int         fVersion;       //  == 1 (for ICU 3.2 and earlier.
109         byte[]      fFormatVersion; //  For ICU 3.4 and later.
110         int         fLength;        //  Total length in bytes of this RBBI Data,
111                                        //      including all sections, not just the header.
112         int         fCatCount;      //  Number of character categories.
113 
114         //
115         //  Offsets and sizes of each of the subsections within the RBBI data.
116         //  All offsets are bytes from the start of the RBBIDataHeader.
117         //  All sizes are in bytes.
118         //
119         int         fFTable;         //  forward state transition table.
120         int         fFTableLen;
121         int         fRTable;         //  Offset to the reverse state transition table.
122         int         fRTableLen;
123         int         fSFTable;        //  safe point forward transition table
124         int         fSFTableLen;
125         int         fSRTable;        //  safe point reverse transition table
126         int         fSRTableLen;
127         int         fTrie;           //  Offset to Trie data for character categories
128         int         fTrieLen;
129         int         fRuleSource;     //  Offset to the source for for the break
130         int         fRuleSourceLen;  //    rules.  Stored UChar *.
131         int         fStatusTable;    // Offset to the table of rule status values
132         int         fStatusTableLen;
133 
RBBIDataHeader()134         public RBBIDataHeader() {
135             fMagic = 0;
136             fFormatVersion = new byte[4];
137         }
138     }
139 
140 
141     /**
142      * RBBI State Table Indexing Function.  Given a state number, return the
143      * array index of the start of the state table row for that state.
144      *
145      */
getRowIndex(int state)146     int getRowIndex(int state){
147         return ROW_DATA + state * (fHeader.fCatCount + 4);
148     }
149 
150     static class TrieFoldingFunc implements  Trie.DataManipulate {
151         @Override
getFoldingOffset(int data)152         public int getFoldingOffset(int data) {
153             if ((data & 0x8000) != 0) {
154                 return data & 0x7fff;
155             } else {
156                 return 0;
157             }
158         }
159     }
160     static TrieFoldingFunc  fTrieFoldingFunc = new TrieFoldingFunc();
161 
162 
RBBIDataWrapper()163     RBBIDataWrapper() {
164     }
165 
166     /*
167      *  Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
168      *  of RBBI rules.
169      */
get(ByteBuffer bytes)170     static RBBIDataWrapper get(ByteBuffer bytes) throws IOException {
171         RBBIDataWrapper This = new RBBIDataWrapper();
172 
173         ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
174         This.isBigEndian = bytes.order() == ByteOrder.BIG_ENDIAN;
175 
176         // Read in the RBBI data header...
177         This.fHeader = new  RBBIDataHeader();
178         This.fHeader.fMagic          = bytes.getInt();
179         // Read the same 4 bytes as an int and as a byte array: The data format could be
180         // the old fVersion=1 (TODO: probably not with a real ICU data header?)
181         // or the new fFormatVersion=3.x.
182         This.fHeader.fVersion        = bytes.getInt(bytes.position());
183         This.fHeader.fFormatVersion[0] = bytes.get();
184         This.fHeader.fFormatVersion[1] = bytes.get();
185         This.fHeader.fFormatVersion[2] = bytes.get();
186         This.fHeader.fFormatVersion[3] = bytes.get();
187         This.fHeader.fLength         = bytes.getInt();
188         This.fHeader.fCatCount       = bytes.getInt();
189         This.fHeader.fFTable         = bytes.getInt();
190         This.fHeader.fFTableLen      = bytes.getInt();
191         This.fHeader.fRTable         = bytes.getInt();
192         This.fHeader.fRTableLen      = bytes.getInt();
193         This.fHeader.fSFTable        = bytes.getInt();
194         This.fHeader.fSFTableLen     = bytes.getInt();
195         This.fHeader.fSRTable        = bytes.getInt();
196         This.fHeader.fSRTableLen     = bytes.getInt();
197         This.fHeader.fTrie           = bytes.getInt();
198         This.fHeader.fTrieLen        = bytes.getInt();
199         This.fHeader.fRuleSource     = bytes.getInt();
200         This.fHeader.fRuleSourceLen  = bytes.getInt();
201         This.fHeader.fStatusTable    = bytes.getInt();
202         This.fHeader.fStatusTableLen = bytes.getInt();
203         ICUBinary.skipBytes(bytes, 6 * 4);    // uint32_t  fReserved[6];
204 
205 
206         if (This.fHeader.fMagic != 0xb1a0 ||
207                 ! (This.fHeader.fVersion == 1  ||         // ICU 3.2 and earlier
208                    This.fHeader.fFormatVersion[0] == 3)   // ICU 3.4
209             ) {
210             throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
211         }
212 
213         // Current position in the buffer.
214         int pos = 24 * 4;     // offset of end of header, which has 24 fields, all int32_t (4 bytes)
215 
216         //
217         // Read in the Forward state transition table as an array of shorts.
218         //
219 
220         //   Quick Sanity Check
221         if (This.fHeader.fFTable < pos || This.fHeader.fFTable > This.fHeader.fLength) {
222              throw new IOException("Break iterator Rule data corrupt");
223         }
224 
225         //    Skip over any padding preceding this table
226         ICUBinary.skipBytes(bytes, This.fHeader.fFTable - pos);
227         pos = This.fHeader.fFTable;
228 
229         This.fFTable = ICUBinary.getShorts(
230                 bytes, This.fHeader.fFTableLen / 2, This.fHeader.fFTableLen & 1);
231         pos += This.fHeader.fFTableLen;
232 
233         //
234         // Read in the Reverse state table
235         //
236 
237         // Skip over any padding in the file
238         ICUBinary.skipBytes(bytes, This.fHeader.fRTable - pos);
239         pos = This.fHeader.fRTable;
240 
241         // Create & fill the table itself.
242         This.fRTable = ICUBinary.getShorts(
243                 bytes, This.fHeader.fRTableLen / 2, This.fHeader.fRTableLen & 1);
244         pos += This.fHeader.fRTableLen;
245 
246         //
247         // Read in the Safe Forward state table
248         //
249         if (This.fHeader.fSFTableLen > 0) {
250             // Skip over any padding in the file
251             ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
252             pos = This.fHeader.fSFTable;
253 
254             // Create & fill the table itself.
255             This.fSFTable = ICUBinary.getShorts(
256                     bytes, This.fHeader.fSFTableLen / 2, This.fHeader.fSFTableLen & 1);
257             pos += This.fHeader.fSFTableLen;
258         }
259 
260         //
261         // Read in the Safe Reverse state table
262         //
263         if (This.fHeader.fSRTableLen > 0) {
264             // Skip over any padding in the file
265             ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
266             pos = This.fHeader.fSRTable;
267 
268             // Create & fill the table itself.
269             This.fSRTable = ICUBinary.getShorts(
270                     bytes, This.fHeader.fSRTableLen / 2, This.fHeader.fSRTableLen & 1);
271             pos += This.fHeader.fSRTableLen;
272         }
273 
274         //
275         // Unserialize the Character categories TRIE
276         //     Because we can't be absolutely certain where the Trie deserialize will
277         //     leave the buffer, leave position unchanged.
278         //     The seek to the start of the next item following the TRIE will get us
279         //     back in sync.
280         //
281         ICUBinary.skipBytes(bytes, This.fHeader.fTrie - pos);  // seek buffer from end of
282         pos = This.fHeader.fTrie;               // previous section to the start of the trie
283 
284         bytes.mark();                           // Mark position of start of TRIE in the input
285                                                 //  and tell Java to keep the mark valid so long
286                                                 //  as we don't go more than 100 bytes past the
287                                                 //  past the end of the TRIE.
288 
289         This.fTrie = new CharTrie(bytes, fTrieFoldingFunc);  // Deserialize the TRIE, leaving buffer
290                                                 //  at an unknown position, preceding the
291                                                 //  padding between TRIE and following section.
292 
293         bytes.reset();                          // Move buffer back to marked position at
294                                                 //   the start of the serialized TRIE.  Now our
295                                                 //   "pos" variable and the buffer are in
296                                                 //   agreement.
297 
298         //
299         // Read the Rule Status Table
300         //
301         if (pos > This.fHeader.fStatusTable) {
302             throw new IOException("Break iterator Rule data corrupt");
303         }
304         ICUBinary.skipBytes(bytes, This.fHeader.fStatusTable - pos);
305         pos = This.fHeader.fStatusTable;
306         This.fStatusTable = ICUBinary.getInts(
307                 bytes, This.fHeader.fStatusTableLen / 4, This.fHeader.fStatusTableLen & 3);
308         pos += This.fHeader.fStatusTableLen;
309 
310         //
311         // Put the break rule source into a String
312         //
313         if (pos > This.fHeader.fRuleSource) {
314             throw new IOException("Break iterator Rule data corrupt");
315         }
316         ICUBinary.skipBytes(bytes, This.fHeader.fRuleSource - pos);
317         pos = This.fHeader.fRuleSource;
318         This.fRuleSource = ICUBinary.getString(
319                 bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1);
320 
321         if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) {
322             This.dump(System.out);
323         }
324         return This;
325     }
326 
327     ///CLOVER:OFF
328     //  Getters for fields from the state table header
329     //
getStateTableNumStates(short table[])330     private int getStateTableNumStates(short table[]) {
331         if (isBigEndian) {
332             return (table[NUMSTATES] << 16) | (table[NUMSTATES+1] & 0xffff);
333         } else {
334             return (table[NUMSTATES+1] << 16) | (table[NUMSTATES] & 0xffff);
335         }
336     }
337     ///CLOVER:ON
338 
getStateTableFlags(short table[])339     int getStateTableFlags(short table[]) {
340         // This works for up to 15 flags bits.
341         return table[isBigEndian ? FLAGS + 1 : FLAGS];
342     }
343 
344     ///CLOVER:OFF
345     /* Debug function to display the break iterator data. */
dump(java.io.PrintStream out)346     void dump(java.io.PrintStream out) {
347         if (fFTable.length == 0) {
348             // There is no table. Fail early for testing purposes.
349             throw new NullPointerException();
350         }
351         out.println("RBBI Data Wrapper dump ...");
352         out.println();
353         out.println("Forward State Table");
354         dumpTable(out, fFTable);
355         out.println("Reverse State Table");
356         dumpTable(out, fRTable);
357         out.println("Forward Safe Points Table");
358         dumpTable(out, fSFTable);
359         out.println("Reverse Safe Points Table");
360         dumpTable(out, fSRTable);
361 
362         dumpCharCategories(out);
363         out.println("Source Rules: " + fRuleSource);
364 
365     }
366     ///CLOVER:ON
367 
368     ///CLOVER:OFF
369     /* Fixed width int-to-string conversion. */
intToString(int n, int width)370     static public String intToString(int n, int width) {
371         StringBuilder  dest = new StringBuilder(width);
372         dest.append(n);
373         while (dest.length() < width) {
374            dest.insert(0, ' ');
375         }
376         return dest.toString();
377     }
378     ///CLOVER:ON
379 
380     ///CLOVER:OFF
381     /* Fixed width int-to-string conversion. */
intToHexString(int n, int width)382     static public String intToHexString(int n, int width) {
383         StringBuilder  dest = new StringBuilder(width);
384         dest.append(Integer.toHexString(n));
385         while (dest.length() < width) {
386            dest.insert(0, ' ');
387         }
388         return dest.toString();
389     }
390     ///CLOVER:ON
391 
392     ///CLOVER:OFF
393     /** Dump a state table.  (A full set of RBBI rules has 4 state tables.)  */
dumpTable(java.io.PrintStream out, short table[])394     private void dumpTable(java.io.PrintStream out, short table[]) {
395         if (table == null)   {
396             out.println("  -- null -- ");
397         } else {
398             int n;
399             int state;
400             StringBuilder header = new StringBuilder(" Row  Acc Look  Tag");
401             for (n=0; n<fHeader.fCatCount; n++) {
402                 header.append(intToString(n, 5));
403             }
404             out.println(header.toString());
405             for (n=0; n<header.length(); n++) {
406                 out.print("-");
407             }
408             out.println();
409             for (state=0; state< getStateTableNumStates(table); state++) {
410                 dumpRow(out, table, state);
411             }
412             out.println();
413         }
414     }
415     ///CLOVER:ON
416 
417     ///CLOVER:OFF
418     /**
419      * Dump (for debug) a single row of an RBBI state table
420      * @param table
421      * @param state
422      */
dumpRow(java.io.PrintStream out, short table[], int state)423     private void dumpRow(java.io.PrintStream out, short table[], int   state) {
424         StringBuilder dest = new StringBuilder(fHeader.fCatCount*5 + 20);
425         dest.append(intToString(state, 4));
426         int row = getRowIndex(state);
427         if (table[row+ACCEPTING] != 0) {
428            dest.append(intToString(table[row+ACCEPTING], 5));
429         }else {
430             dest.append("     ");
431         }
432         if (table[row+LOOKAHEAD] != 0) {
433             dest.append(intToString(table[row+LOOKAHEAD], 5));
434         }else {
435             dest.append("     ");
436         }
437         dest.append(intToString(table[row+TAGIDX], 5));
438 
439         for (int col=0; col<fHeader.fCatCount; col++) {
440             dest.append(intToString(table[row+NEXTSTATES+col], 5));
441         }
442 
443         out.println(dest);
444     }
445     ///CLOVER:ON
446 
447     ///CLOVER:OFF
dumpCharCategories(java.io.PrintStream out)448     private void dumpCharCategories(java.io.PrintStream out) {
449         int n = fHeader.fCatCount;
450         String   catStrings[] = new  String[n+1];
451         int      rangeStart = 0;
452         int      rangeEnd = 0;
453         int      lastCat = -1;
454         int      char32;
455         int      category;
456         int      lastNewline[] = new int[n+1];
457 
458         for (category = 0; category <= fHeader.fCatCount; category ++) {
459             catStrings[category] = "";
460         }
461         out.println("\nCharacter Categories");
462         out.println("--------------------");
463         for (char32 = 0; char32<=0x10ffff; char32++) {
464             category = fTrie.getCodePointValue(char32);
465             category &= ~0x4000;            // Mask off dictionary bit.
466             if (category < 0 || category > fHeader.fCatCount) {
467                 out.println("Error, bad category " + Integer.toHexString(category) +
468                         " for char " + Integer.toHexString(char32));
469                 break;
470             }
471             if (category == lastCat ) {
472                 rangeEnd = char32;
473             } else {
474                 if (lastCat >= 0) {
475                     if (catStrings[lastCat].length() > lastNewline[lastCat] + 70) {
476                         lastNewline[lastCat] = catStrings[lastCat].length() + 10;
477                         catStrings[lastCat] += "\n       ";
478                     }
479 
480                     catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
481                     if (rangeEnd != rangeStart) {
482                         catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
483                     }
484                 }
485                 lastCat = category;
486                 rangeStart = rangeEnd = char32;
487             }
488         }
489         catStrings[lastCat] += " " + Integer.toHexString(rangeStart);
490         if (rangeEnd != rangeStart) {
491             catStrings[lastCat] += "-" + Integer.toHexString(rangeEnd);
492         }
493 
494         for (category = 0; category <= fHeader.fCatCount; category ++) {
495             out.println (intToString(category, 5) + "  " + catStrings[category]);
496         }
497         out.println();
498     }
499     ///CLOVER:ON
500 
501     /*static RBBIDataWrapper get(String name) throws IOException {
502         String  fullName = "data/" + name;
503         InputStream is = ICUData.getRequiredStream(fullName);
504         return get(is);
505     }
506 
507     public static void main(String[] args) {
508         String s;
509         if (args.length == 0) {
510             s = "char";
511         } else {
512             s = args[0];
513         }
514         System.out.println("RBBIDataWrapper.main(" + s + ") ");
515 
516         String versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";
517 
518         try {
519             RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
520             This.dump();
521         }
522        catch (Exception e) {
523            System.out.println("Exception: " + e.toString());
524        }
525 
526     }*/
527 }
528