• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  rbbirb.cpp
5 //
6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10 //    building (compiling) break rules into the tables required by the runtime
11 //    RBBI engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/parseerr.h"
27 
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "rbbirb.h"
31 #include "rbbinode.h"
32 #include "rbbiscan.h"
33 #include "rbbisetb.h"
34 #include "rbbitblb.h"
35 #include "rbbidata.h"
36 #include "uassert.h"
37 
38 
39 U_NAMESPACE_BEGIN
40 
41 
42 //----------------------------------------------------------------------------------------
43 //
44 //  Constructor.
45 //
46 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
48                                        UParseError     *parseErr,
49                                        UErrorCode      &status)
50  : fRules(rules), fStrippedRules(rules)
51 {
52     fStatus = &status; // status is checked below
53     fParseError = parseErr;
54     fDebugEnv   = NULL;
55 #ifdef RBBI_DEBUG
56     fDebugEnv   = getenv("U_RBBIDEBUG");
57 #endif
58 
59 
60     fForwardTree        = NULL;
61     fReverseTree        = NULL;
62     fSafeFwdTree        = NULL;
63     fSafeRevTree        = NULL;
64     fDefaultTree        = &fForwardTree;
65     fForwardTable       = NULL;
66     fRuleStatusVals     = NULL;
67     fChainRules         = FALSE;
68     fLBCMNoChain        = FALSE;
69     fLookAheadHardBreak = FALSE;
70     fUSetNodes          = NULL;
71     fRuleStatusVals     = NULL;
72     fScanner            = NULL;
73     fSetBuilder         = NULL;
74     if (parseErr) {
75         uprv_memset(parseErr, 0, sizeof(UParseError));
76     }
77 
78     if (U_FAILURE(status)) {
79         return;
80     }
81 
82     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
83     fRuleStatusVals     = new UVector(status);
84     fScanner            = new RBBIRuleScanner(this);
85     fSetBuilder         = new RBBISetBuilder(this);
86     if (U_FAILURE(status)) {
87         return;
88     }
89     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
90         status = U_MEMORY_ALLOCATION_ERROR;
91     }
92 }
93 
94 
95 
96 //----------------------------------------------------------------------------------------
97 //
98 //  Destructor
99 //
100 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()101 RBBIRuleBuilder::~RBBIRuleBuilder() {
102 
103     int        i;
104     for (i=0; ; i++) {
105         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
106         if (n==NULL) {
107             break;
108         }
109         delete n;
110     }
111 
112     delete fUSetNodes;
113     delete fSetBuilder;
114     delete fForwardTable;
115     delete fForwardTree;
116     delete fReverseTree;
117     delete fSafeFwdTree;
118     delete fSafeRevTree;
119     delete fScanner;
120     delete fRuleStatusVals;
121 }
122 
123 
124 
125 
126 
127 //----------------------------------------------------------------------------------------
128 //
129 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
130 //                    the format for saving in ICU data files,
131 //                    which is also the format needed by the RBBI runtime engine.
132 //
133 //----------------------------------------------------------------------------------------
align8(int32_t i)134 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
135 
flattenData()136 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
137     int32_t    i;
138 
139     if (U_FAILURE(*fStatus)) {
140         return NULL;
141     }
142 
143     // Remove whitespace from the rules to make it smaller.
144     // The rule parser has already removed comments.
145     fStrippedRules = fScanner->stripRules(fStrippedRules);
146 
147     // Calculate the size of each section in the data.
148     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
149     //   Sections sizes actually stored in the header are for the actual data
150     //     without the padding.
151     //
152     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
153     int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
154     int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
155     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
156     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
157     int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
158 
159     int32_t         totalSize = headerSize
160                                 + forwardTableSize
161                                 + reverseTableSize
162                                 + statusTableSize + trieSize + rulesSize;
163 
164 #ifdef RBBI_DEBUG
165     if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
166         RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
167         RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
168         RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
169         RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
170         RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
171         RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
172         RBBIDebugPrintf("-----------------------------\n");
173         RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
174     }
175 #endif
176 
177     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
178     if (data == NULL) {
179         *fStatus = U_MEMORY_ALLOCATION_ERROR;
180         return NULL;
181     }
182     uprv_memset(data, 0, totalSize);
183 
184 
185     data->fMagic            = 0xb1a0;
186     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
187     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
188     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
189     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
190     data->fLength           = totalSize;
191     data->fCatCount         = fSetBuilder->getNumCharCategories();
192 
193     data->fFTable        = headerSize;
194     data->fFTableLen     = forwardTableSize;
195 
196     data->fRTable        = data->fFTable  + data->fFTableLen;
197     data->fRTableLen     = reverseTableSize;
198 
199     data->fTrie          = data->fRTable + data->fRTableLen;
200     data->fTrieLen       = fSetBuilder->getTrieSize();
201     data->fStatusTable   = data->fTrie    + trieSize;
202     data->fStatusTableLen= statusTableSize;
203     data->fRuleSource    = data->fStatusTable + statusTableSize;
204     data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
205 
206     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
207 
208     fForwardTable->exportTable((uint8_t *)data + data->fFTable);
209     fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
210     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
211 
212     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
213     for (i=0; i<fRuleStatusVals->size(); i++) {
214         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
215     }
216 
217     fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
218 
219     return data;
220 }
221 
222 
223 //----------------------------------------------------------------------------------------
224 //
225 //  createRuleBasedBreakIterator    construct from source rules that are passed in
226 //                                  in a UnicodeString
227 //
228 //----------------------------------------------------------------------------------------
229 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)230 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
231                                     UParseError      *parseError,
232                                     UErrorCode       &status)
233 {
234     //
235     // Read the input rules, generate a parse tree, symbol table,
236     // and list of all Unicode Sets referenced by the rules.
237     //
238     RBBIRuleBuilder  builder(rules, parseError, status);
239     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
240         return NULL;
241     }
242 
243     RBBIDataHeader *data = builder.build(status);
244 
245     if (U_FAILURE(status)) {
246         return nullptr;
247     }
248 
249     //
250     //  Create a break iterator from the compiled rules.
251     //     (Identical to creation from stored pre-compiled rules)
252     //
253     // status is checked after init in construction.
254     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
255     if (U_FAILURE(status)) {
256         delete This;
257         This = NULL;
258     }
259     else if(This == NULL) { // test for NULL
260         status = U_MEMORY_ALLOCATION_ERROR;
261     }
262     return This;
263 }
264 
build(UErrorCode & status)265 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
266     if (U_FAILURE(status)) {
267         return nullptr;
268     }
269 
270     fScanner->parse();
271     if (U_FAILURE(status)) {
272         return nullptr;
273     }
274 
275     //
276     // UnicodeSet processing.
277     //    Munge the Unicode Sets to create a set of character categories.
278     //    Generate the mapping tables (TRIE) from input code points to
279     //    the character categories.
280     //
281     fSetBuilder->buildRanges();
282 
283     //
284     //   Generate the DFA state transition table.
285     //
286     fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
287     if (fForwardTable == nullptr) {
288         status = U_MEMORY_ALLOCATION_ERROR;
289         return nullptr;
290     }
291 
292     fForwardTable->buildForwardTable();
293     optimizeTables();
294     fForwardTable->buildSafeReverseTable(status);
295 
296 
297 #ifdef RBBI_DEBUG
298     if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
299         fForwardTable->printStates();
300         fForwardTable->printRuleStatusTable();
301         fForwardTable->printReverseTable();
302     }
303 #endif
304 
305     fSetBuilder->buildTrie();
306 
307     //
308     //   Package up the compiled data into a memory image
309     //      in the run-time format.
310     //
311     RBBIDataHeader *data = flattenData(); // returns NULL if error
312     if (U_FAILURE(status)) {
313         return nullptr;
314     }
315     return data;
316 }
317 
optimizeTables()318 void RBBIRuleBuilder::optimizeTables() {
319     bool didSomething;
320     do {
321         didSomething = false;
322 
323         // Begin looking for duplicates with char class 3.
324         // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
325         // and should not have other categories merged into them.
326         IntPair duplPair = {3, 0};
327         while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
328             fSetBuilder->mergeCategories(duplPair);
329             fForwardTable->removeColumn(duplPair.second);
330             didSomething = true;
331         }
332 
333         while (fForwardTable->removeDuplicateStates() > 0) {
334             didSomething = true;
335         }
336     } while (didSomething);
337 }
338 
339 U_NAMESPACE_END
340 
341 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
342