1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: rbbirb.cpp
5 //
6 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
10 // building (compiling) break rules into the tables required by the runtime
11 // RBBI engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/parsepos.h"
27 #include "unicode/parseerr.h"
28
29 #include "cmemory.h"
30 #include "cstring.h"
31 #include "rbbirb.h"
32 #include "rbbinode.h"
33 #include "rbbiscan.h"
34 #include "rbbisetb.h"
35 #include "rbbitblb.h"
36 #include "rbbidata.h"
37 #include "uassert.h"
38
39
40 U_NAMESPACE_BEGIN
41
42
43 //----------------------------------------------------------------------------------------
44 //
45 // Constructor.
46 //
47 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)48 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
49 UParseError *parseErr,
50 UErrorCode &status)
51 : fRules(rules), fStrippedRules(rules)
52 {
53 fStatus = &status; // status is checked below
54 fParseError = parseErr;
55 fDebugEnv = NULL;
56 #ifdef RBBI_DEBUG
57 fDebugEnv = getenv("U_RBBIDEBUG");
58 #endif
59
60
61 fForwardTree = NULL;
62 fReverseTree = NULL;
63 fSafeFwdTree = NULL;
64 fSafeRevTree = NULL;
65 fDefaultTree = &fForwardTree;
66 fForwardTable = NULL;
67 fRuleStatusVals = NULL;
68 fChainRules = FALSE;
69 fLBCMNoChain = FALSE;
70 fLookAheadHardBreak = FALSE;
71 fUSetNodes = NULL;
72 fRuleStatusVals = NULL;
73 fScanner = NULL;
74 fSetBuilder = NULL;
75 if (parseErr) {
76 uprv_memset(parseErr, 0, sizeof(UParseError));
77 }
78
79 if (U_FAILURE(status)) {
80 return;
81 }
82
83 fUSetNodes = new UVector(status); // bcos status gets overwritten here
84 fRuleStatusVals = new UVector(status);
85 fScanner = new RBBIRuleScanner(this);
86 fSetBuilder = new RBBISetBuilder(this);
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91 status = U_MEMORY_ALLOCATION_ERROR;
92 }
93 }
94
95
96
97 //----------------------------------------------------------------------------------------
98 //
99 // Destructor
100 //
101 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()102 RBBIRuleBuilder::~RBBIRuleBuilder() {
103
104 int i;
105 for (i=0; ; i++) {
106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107 if (n==NULL) {
108 break;
109 }
110 delete n;
111 }
112
113 delete fUSetNodes;
114 delete fSetBuilder;
115 delete fForwardTable;
116 delete fForwardTree;
117 delete fReverseTree;
118 delete fSafeFwdTree;
119 delete fSafeRevTree;
120 delete fScanner;
121 delete fRuleStatusVals;
122 }
123
124
125
126
127
128 //----------------------------------------------------------------------------------------
129 //
130 // flattenData() - Collect up the compiled RBBI rule data and put it into
131 // the format for saving in ICU data files,
132 // which is also the format needed by the RBBI runtime engine.
133 //
134 //----------------------------------------------------------------------------------------
align8(int32_t i)135 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
136
flattenData()137 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
138 int32_t i;
139
140 if (U_FAILURE(*fStatus)) {
141 return NULL;
142 }
143
144 // Remove whitespace from the rules to make it smaller.
145 // The rule parser has already removed comments.
146 fStrippedRules = fScanner->stripRules(fStrippedRules);
147
148 // Calculate the size of each section in the data.
149 // Sizes here are padded up to a multiple of 8 for better memory alignment.
150 // Sections sizes actually stored in the header are for the actual data
151 // without the padding.
152 //
153 int32_t headerSize = align8(sizeof(RBBIDataHeader));
154 int32_t forwardTableSize = align8(fForwardTable->getTableSize());
155 int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
156 int32_t trieSize = align8(fSetBuilder->getTrieSize());
157 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
158
159 int32_t rulesLengthInUTF8 = 0;
160 u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8,
161 fStrippedRules.getBuffer(), fStrippedRules.length(),
162 0xfffd, nullptr, fStatus);
163 *fStatus = U_ZERO_ERROR;
164
165 int32_t rulesSize = align8((rulesLengthInUTF8+1));
166
167 int32_t totalSize = headerSize
168 + forwardTableSize
169 + reverseTableSize
170 + statusTableSize + trieSize + rulesSize;
171
172 #ifdef RBBI_DEBUG
173 if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
174 RBBIDebugPrintf("Header Size: %8d\n", headerSize);
175 RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
176 RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
177 RBBIDebugPrintf("Trie Size: %8d\n", trieSize);
178 RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize);
179 RBBIDebugPrintf("Rules Size: %8d\n", rulesSize);
180 RBBIDebugPrintf("-----------------------------\n");
181 RBBIDebugPrintf("Total Size: %8d\n", totalSize);
182 }
183 #endif
184
185 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
186 if (data == NULL) {
187 *fStatus = U_MEMORY_ALLOCATION_ERROR;
188 return NULL;
189 }
190 uprv_memset(data, 0, totalSize);
191
192
193 data->fMagic = 0xb1a0;
194 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
195 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
196 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
197 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
198 data->fLength = totalSize;
199 data->fCatCount = fSetBuilder->getNumCharCategories();
200
201 data->fFTable = headerSize;
202 data->fFTableLen = forwardTableSize;
203
204 data->fRTable = data->fFTable + data->fFTableLen;
205 data->fRTableLen = reverseTableSize;
206
207 data->fTrie = data->fRTable + data->fRTableLen;
208 data->fTrieLen = trieSize;
209 data->fStatusTable = data->fTrie + data->fTrieLen;
210 data->fStatusTableLen= statusTableSize;
211 data->fRuleSource = data->fStatusTable + statusTableSize;
212 data->fRuleSourceLen = rulesLengthInUTF8;
213
214 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
215
216 fForwardTable->exportTable((uint8_t *)data + data->fFTable);
217 fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
218 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
219
220 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
221 for (i=0; i<fRuleStatusVals->size(); i++) {
222 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
223 }
224
225 u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8,
226 fStrippedRules.getBuffer(), fStrippedRules.length(),
227 0xfffd, nullptr, fStatus);
228 if (U_FAILURE(*fStatus)) {
229 return NULL;
230 }
231
232 return data;
233 }
234
235
236 //----------------------------------------------------------------------------------------
237 //
238 // createRuleBasedBreakIterator construct from source rules that are passed in
239 // in a UnicodeString
240 //
241 //----------------------------------------------------------------------------------------
242 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)243 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
244 UParseError *parseError,
245 UErrorCode &status)
246 {
247 //
248 // Read the input rules, generate a parse tree, symbol table,
249 // and list of all Unicode Sets referenced by the rules.
250 //
251 RBBIRuleBuilder builder(rules, parseError, status);
252 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
253 return NULL;
254 }
255
256 RBBIDataHeader *data = builder.build(status);
257
258 if (U_FAILURE(status)) {
259 return nullptr;
260 }
261
262 //
263 // Create a break iterator from the compiled rules.
264 // (Identical to creation from stored pre-compiled rules)
265 //
266 // status is checked after init in construction.
267 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
268 if (U_FAILURE(status)) {
269 delete This;
270 This = NULL;
271 }
272 else if(This == NULL) { // test for NULL
273 status = U_MEMORY_ALLOCATION_ERROR;
274 }
275 return This;
276 }
277
build(UErrorCode & status)278 RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
279 if (U_FAILURE(status)) {
280 return nullptr;
281 }
282
283 fScanner->parse();
284 if (U_FAILURE(status)) {
285 return nullptr;
286 }
287
288 //
289 // UnicodeSet processing.
290 // Munge the Unicode Sets to create an initial set of character categories.
291 //
292 fSetBuilder->buildRanges();
293
294 //
295 // Generate the DFA state transition table.
296 //
297 fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
298 if (fForwardTable == nullptr) {
299 status = U_MEMORY_ALLOCATION_ERROR;
300 return nullptr;
301 }
302
303 fForwardTable->buildForwardTable();
304
305 // State table and character category optimization.
306 // Merge equivalent rows and columns.
307 // Note that this process alters the initial set of character categories,
308 // causing the representation of UnicodeSets in the parse tree to become invalid.
309
310 optimizeTables();
311 fForwardTable->buildSafeReverseTable(status);
312
313
314 #ifdef RBBI_DEBUG
315 if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
316 fForwardTable->printStates();
317 fForwardTable->printRuleStatusTable();
318 fForwardTable->printReverseTable();
319 }
320 #endif
321
322 // Generate the mapping tables (TRIE) from input code points to
323 // the character categories.
324 //
325 fSetBuilder->buildTrie();
326
327 //
328 // Package up the compiled data into a memory image
329 // in the run-time format.
330 //
331 RBBIDataHeader *data = flattenData(); // returns NULL if error
332 if (U_FAILURE(status)) {
333 return nullptr;
334 }
335 return data;
336 }
337
optimizeTables()338 void RBBIRuleBuilder::optimizeTables() {
339 bool didSomething;
340 do {
341 didSomething = false;
342
343 // Begin looking for duplicates with char class 3.
344 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
345 // and should not have other categories merged into them.
346 IntPair duplPair = {3, 0};
347 while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
348 fSetBuilder->mergeCategories(duplPair);
349 fForwardTable->removeColumn(duplPair.second);
350 didSomething = true;
351 }
352
353 while (fForwardTable->removeDuplicateStates() > 0) {
354 didSomething = true;
355 }
356 } while (didSomething);
357 }
358
359 U_NAMESPACE_END
360
361 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
362