1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: rbbirb.cpp
5 //
6 // Copyright (C) 2002-2011, International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
10 // building (compiling) break rules into the tables required by the runtime
11 // RBBI engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/brkiter.h"
19 #include "unicode/rbbi.h"
20 #include "unicode/ubrk.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uchriter.h"
25 #include "unicode/parsepos.h"
26 #include "unicode/parseerr.h"
27
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "rbbirb.h"
31 #include "rbbinode.h"
32 #include "rbbiscan.h"
33 #include "rbbisetb.h"
34 #include "rbbitblb.h"
35 #include "rbbidata.h"
36 #include "uassert.h"
37
38
39 U_NAMESPACE_BEGIN
40
41
42 //----------------------------------------------------------------------------------------
43 //
44 // Constructor.
45 //
46 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
48 UParseError *parseErr,
49 UErrorCode &status)
50 : fRules(rules)
51 {
52 fStatus = &status; // status is checked below
53 fParseError = parseErr;
54 fDebugEnv = NULL;
55 #ifdef RBBI_DEBUG
56 fDebugEnv = getenv("U_RBBIDEBUG");
57 #endif
58
59
60 fForwardTree = NULL;
61 fReverseTree = NULL;
62 fSafeFwdTree = NULL;
63 fSafeRevTree = NULL;
64 fDefaultTree = &fForwardTree;
65 fForwardTables = NULL;
66 fReverseTables = NULL;
67 fSafeFwdTables = NULL;
68 fSafeRevTables = NULL;
69 fRuleStatusVals = NULL;
70 fChainRules = FALSE;
71 fLBCMNoChain = FALSE;
72 fLookAheadHardBreak = FALSE;
73 fUSetNodes = NULL;
74 fRuleStatusVals = NULL;
75 fScanner = NULL;
76 fSetBuilder = NULL;
77 if (parseErr) {
78 uprv_memset(parseErr, 0, sizeof(UParseError));
79 }
80
81 if (U_FAILURE(status)) {
82 return;
83 }
84
85 fUSetNodes = new UVector(status); // bcos status gets overwritten here
86 fRuleStatusVals = new UVector(status);
87 fScanner = new RBBIRuleScanner(this);
88 fSetBuilder = new RBBISetBuilder(this);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
93 status = U_MEMORY_ALLOCATION_ERROR;
94 }
95 }
96
97
98
99 //----------------------------------------------------------------------------------------
100 //
101 // Destructor
102 //
103 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()104 RBBIRuleBuilder::~RBBIRuleBuilder() {
105
106 int i;
107 for (i=0; ; i++) {
108 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
109 if (n==NULL) {
110 break;
111 }
112 delete n;
113 }
114
115 delete fUSetNodes;
116 delete fSetBuilder;
117 delete fForwardTables;
118 delete fReverseTables;
119 delete fSafeFwdTables;
120 delete fSafeRevTables;
121
122 delete fForwardTree;
123 delete fReverseTree;
124 delete fSafeFwdTree;
125 delete fSafeRevTree;
126 delete fScanner;
127 delete fRuleStatusVals;
128 }
129
130
131
132
133
134 //----------------------------------------------------------------------------------------
135 //
136 // flattenData() - Collect up the compiled RBBI rule data and put it into
137 // the format for saving in ICU data files,
138 // which is also the format needed by the RBBI runtime engine.
139 //
140 //----------------------------------------------------------------------------------------
// Round i up to the next multiple of 8; a value that is already a multiple
// of 8 is returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;     // step past the next boundary...
    return bumped & ~7;               // ...then clear the low three bits.
}
142
flattenData()143 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
144 int32_t i;
145
146 if (U_FAILURE(*fStatus)) {
147 return NULL;
148 }
149
150 // Remove comments and whitespace from the rules to make it smaller.
151 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
152
153 // Calculate the size of each section in the data.
154 // Sizes here are padded up to a multiple of 8 for better memory alignment.
155 // Sections sizes actually stored in the header are for the actual data
156 // without the padding.
157 //
158 int32_t headerSize = align8(sizeof(RBBIDataHeader));
159 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
160 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
161 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
162 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
163 int32_t trieSize = align8(fSetBuilder->getTrieSize());
164 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
165 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
166
167 (void)safeFwdTableSize;
168
169 int32_t totalSize = headerSize
170 + forwardTableSize
171 + /* reverseTableSize */ 0
172 + /* safeFwdTableSize */ 0
173 + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
174 + statusTableSize + trieSize + rulesSize;
175
176 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
177 if (data == NULL) {
178 *fStatus = U_MEMORY_ALLOCATION_ERROR;
179 return NULL;
180 }
181 uprv_memset(data, 0, totalSize);
182
183
184 data->fMagic = 0xb1a0;
185 data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
186 data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
187 data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
188 data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
189 data->fLength = totalSize;
190 data->fCatCount = fSetBuilder->getNumCharCategories();
191
192 // Only save the forward table and the safe reverse table,
193 // because these are the only ones used at run-time.
194 //
195 // For the moment, we still build the other tables if they are present in the rule source files,
196 // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
197 //
198 // Additional backwards compatibility consideration: if no safe rules are provided, consider the
199 // reverse rules to actually be the safe reverse rules.
200
201 data->fFTable = headerSize;
202 data->fFTableLen = forwardTableSize;
203
204 // Do not save Reverse Table.
205 data->fRTable = data->fFTable + forwardTableSize;
206 data->fRTableLen = 0;
207
208 // Do not save the Safe Forward table.
209 data->fSFTable = data->fRTable + 0;
210 data->fSFTableLen = 0;
211
212 data->fSRTable = data->fSFTable + 0;
213 if (safeRevTableSize > 0) {
214 data->fSRTableLen = safeRevTableSize;
215 } else if (reverseTableSize > 0) {
216 data->fSRTableLen = reverseTableSize;
217 } else {
218 U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
219 // before reaching this point.
220 }
221
222
223 data->fTrie = data->fSRTable + data->fSRTableLen;
224 data->fTrieLen = fSetBuilder->getTrieSize();
225 data->fStatusTable = data->fTrie + trieSize;
226 data->fStatusTableLen= statusTableSize;
227 data->fRuleSource = data->fStatusTable + statusTableSize;
228 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
229
230 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
231
232 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
233 // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
234 // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
235 if (safeRevTableSize > 0) {
236 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
237 } else {
238 fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
239 }
240
241 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
242
243 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
244 for (i=0; i<fRuleStatusVals->size(); i++) {
245 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
246 }
247
248 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
249
250 return data;
251 }
252
253
254
255
256
257
258 //----------------------------------------------------------------------------------------
259 //
260 // createRuleBasedBreakIterator construct from source rules that are passed in
261 // in a UnicodeString
262 //
263 //----------------------------------------------------------------------------------------
264 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)265 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
266 UParseError *parseError,
267 UErrorCode &status)
268 {
269 // status checked below
270
271 //
272 // Read the input rules, generate a parse tree, symbol table,
273 // and list of all Unicode Sets referenced by the rules.
274 //
275 RBBIRuleBuilder builder(rules, parseError, status);
276 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
277 return NULL;
278 }
279 builder.fScanner->parse();
280
281 //
282 // UnicodeSet processing.
283 // Munge the Unicode Sets to create a set of character categories.
284 // Generate the mapping tables (TRIE) from input 32-bit characters to
285 // the character categories.
286 //
287 builder.fSetBuilder->build();
288
289
290 //
291 // Generate the DFA state transition table.
292 //
293 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
294 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
295 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
296 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
297 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
298 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
299 {
300 status = U_MEMORY_ALLOCATION_ERROR;
301 delete builder.fForwardTables; builder.fForwardTables = NULL;
302 delete builder.fReverseTables; builder.fReverseTables = NULL;
303 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
304 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
305 return NULL;
306 }
307
308 builder.fForwardTables->build();
309 builder.fReverseTables->build();
310 builder.fSafeFwdTables->build();
311 builder.fSafeRevTables->build();
312
313 #ifdef RBBI_DEBUG
314 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
315 builder.fForwardTables->printRuleStatusTable();
316 }
317 #endif
318
319 //
320 // Package up the compiled data into a memory image
321 // in the run-time format.
322 //
323 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
324 if (U_FAILURE(*builder.fStatus)) {
325 return NULL;
326 }
327
328
329 //
330 // Clean up the compiler related stuff
331 //
332
333
334 //
335 // Create a break iterator from the compiled rules.
336 // (Identical to creation from stored pre-compiled rules)
337 //
338 // status is checked after init in construction.
339 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
340 if (U_FAILURE(status)) {
341 delete This;
342 This = NULL;
343 }
344 else if(This == NULL) { // test for NULL
345 status = U_MEMORY_ALLOCATION_ERROR;
346 }
347 return This;
348 }
349
350 U_NAMESPACE_END
351
352 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
353