1 //
2 // file: rbbirb.cpp
3 //
4 // Copyright (C) 2002-2008, International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for
8 // building (compiling) break rules into the tables required by the runtime
9 // RBBI engine.
10 //
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/brkiter.h"
17 #include "unicode/rbbi.h"
18 #include "unicode/ubrk.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27
28 #include "rbbirb.h"
29 #include "rbbinode.h"
30
31 #include "rbbiscan.h"
32 #include "rbbisetb.h"
33 #include "rbbitblb.h"
34 #include "rbbidata.h"
35
36
37 U_NAMESPACE_BEGIN
38
39
40 //----------------------------------------------------------------------------------------
41 //
42 // Constructor.
43 //
44 //----------------------------------------------------------------------------------------
RBBIRuleBuilder(const UnicodeString & rules,UParseError * parseErr,UErrorCode & status)45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
46 UParseError *parseErr,
47 UErrorCode &status)
48 : fRules(rules)
49 {
50 fStatus = &status; // status is checked below
51 fParseError = parseErr;
52 fDebugEnv = NULL;
53 #ifdef RBBI_DEBUG
54 fDebugEnv = getenv("U_RBBIDEBUG");
55 #endif
56
57
58 fForwardTree = NULL;
59 fReverseTree = NULL;
60 fSafeFwdTree = NULL;
61 fSafeRevTree = NULL;
62 fDefaultTree = &fForwardTree;
63 fForwardTables = NULL;
64 fReverseTables = NULL;
65 fSafeFwdTables = NULL;
66 fSafeRevTables = NULL;
67 fRuleStatusVals = NULL;
68 fChainRules = FALSE;
69 fLBCMNoChain = FALSE;
70 fLookAheadHardBreak = FALSE;
71 fUSetNodes = NULL;
72 fRuleStatusVals = NULL;
73 fScanner = NULL;
74 fSetBuilder = NULL;
75 if (parseErr) {
76 uprv_memset(parseErr, 0, sizeof(UParseError));
77 }
78
79 if (U_FAILURE(status)) {
80 return;
81 }
82
83 fUSetNodes = new UVector(status); // bcos status gets overwritten here
84 fRuleStatusVals = new UVector(status);
85 fScanner = new RBBIRuleScanner(this);
86 fSetBuilder = new RBBISetBuilder(this);
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91 status = U_MEMORY_ALLOCATION_ERROR;
92 }
93 }
94
95
96
97 //----------------------------------------------------------------------------------------
98 //
99 // Destructor
100 //
101 //----------------------------------------------------------------------------------------
~RBBIRuleBuilder()102 RBBIRuleBuilder::~RBBIRuleBuilder() {
103
104 int i;
105 for (i=0; ; i++) {
106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107 if (n==NULL) {
108 break;
109 }
110 delete n;
111 }
112
113 delete fUSetNodes;
114 delete fSetBuilder;
115 delete fForwardTables;
116 delete fReverseTables;
117 delete fSafeFwdTables;
118 delete fSafeRevTables;
119
120 delete fForwardTree;
121 delete fReverseTree;
122 delete fSafeFwdTree;
123 delete fSafeRevTree;
124 delete fScanner;
125 delete fRuleStatusVals;
126 }
127
128
129
130
131
132 //----------------------------------------------------------------------------------------
133 //
134 // flattenData() - Collect up the compiled RBBI rule data and put it into
135 // the format for saving in ICU data files,
136 // which is also the format needed by the RBBI runtime engine.
137 //
138 //----------------------------------------------------------------------------------------
// Round i up to the nearest multiple of 8 (values that are already a multiple
// of 8 are returned unchanged).
static int32_t align8(int32_t i) {
    const int32_t kAlignMask = 7;
    return (i + kAlignMask) & ~kAlignMask;
}
140
flattenData()141 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
142 int32_t i;
143
144 if (U_FAILURE(*fStatus)) {
145 return NULL;
146 }
147
148 // Remove comments and whitespace from the rules to make it smaller.
149 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
150
151 // Calculate the size of each section in the data.
152 // Sizes here are padded up to a multiple of 8 for better memory alignment.
153 // Sections sizes actually stored in the header are for the actual data
154 // without the padding.
155 //
156 int32_t headerSize = align8(sizeof(RBBIDataHeader));
157 int32_t forwardTableSize = align8(fForwardTables->getTableSize());
158 int32_t reverseTableSize = align8(fReverseTables->getTableSize());
159 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
160 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
161 int32_t trieSize = align8(fSetBuilder->getTrieSize());
162 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
163 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
164
165 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
166 + safeFwdTableSize + safeRevTableSize
167 + statusTableSize + trieSize + rulesSize;
168
169 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
170 if (data == NULL) {
171 *fStatus = U_MEMORY_ALLOCATION_ERROR;
172 return NULL;
173 }
174 uprv_memset(data, 0, totalSize);
175
176
177 data->fMagic = 0xb1a0;
178 data->fFormatVersion[0] = 3;
179 data->fFormatVersion[1] = 1;
180 data->fFormatVersion[2] = 0;
181 data->fFormatVersion[3] = 0;
182 data->fLength = totalSize;
183 data->fCatCount = fSetBuilder->getNumCharCategories();
184
185 data->fFTable = headerSize;
186 data->fFTableLen = forwardTableSize;
187 data->fRTable = data->fFTable + forwardTableSize;
188 data->fRTableLen = reverseTableSize;
189 data->fSFTable = data->fRTable + reverseTableSize;
190 data->fSFTableLen = safeFwdTableSize;
191 data->fSRTable = data->fSFTable + safeFwdTableSize;
192 data->fSRTableLen = safeRevTableSize;
193
194 data->fTrie = data->fSRTable + safeRevTableSize;
195 data->fTrieLen = fSetBuilder->getTrieSize();
196 data->fStatusTable = data->fTrie + trieSize;
197 data->fStatusTableLen= statusTableSize;
198 data->fRuleSource = data->fStatusTable + statusTableSize;
199 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
200
201 uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
202
203 fForwardTables->exportTable((uint8_t *)data + data->fFTable);
204 fReverseTables->exportTable((uint8_t *)data + data->fRTable);
205 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
206 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
207 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
208
209 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
210 for (i=0; i<fRuleStatusVals->size(); i++) {
211 ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
212 }
213
214 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
215
216 return data;
217 }
218
219
220
221
222
223
224 //----------------------------------------------------------------------------------------
225 //
226 // createRuleBasedBreakIterator construct from source rules that are passed in
227 // in a UnicodeString
228 //
229 //----------------------------------------------------------------------------------------
230 BreakIterator *
createRuleBasedBreakIterator(const UnicodeString & rules,UParseError * parseError,UErrorCode & status)231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
232 UParseError *parseError,
233 UErrorCode &status)
234 {
235 // status checked below
236
237 //
238 // Read the input rules, generate a parse tree, symbol table,
239 // and list of all Unicode Sets referenced by the rules.
240 //
241 RBBIRuleBuilder builder(rules, parseError, status);
242 if (U_FAILURE(status)) { // status checked here bcos build below doesn't
243 return NULL;
244 }
245 builder.fScanner->parse();
246
247 //
248 // UnicodeSet processing.
249 // Munge the Unicode Sets to create a set of character categories.
250 // Generate the mapping tables (TRIE) from input 32-bit characters to
251 // the character categories.
252 //
253 builder.fSetBuilder->build();
254
255
256 //
257 // Generate the DFA state transition table.
258 //
259 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
260 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
261 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
262 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
263 if (U_SUCCESS(status)
264 && (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
265 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL))
266 {
267 status = U_MEMORY_ALLOCATION_ERROR;
268 }
269
270 // Before building the tables, check to make sure the status is ok.
271 if (U_FAILURE(status)) {
272 delete builder.fForwardTables; builder.fForwardTables = NULL;
273 delete builder.fReverseTables; builder.fReverseTables = NULL;
274 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
275 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
276 return NULL;
277 }
278
279 builder.fForwardTables->build();
280 builder.fReverseTables->build();
281 builder.fSafeFwdTables->build();
282 builder.fSafeRevTables->build();
283
284 #ifdef RBBI_DEBUG
285 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
286 builder.fForwardTables->printRuleStatusTable();
287 }
288 #endif
289
290 //
291 // Package up the compiled data into a memory image
292 // in the run-time format.
293 //
294 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
295 if (U_FAILURE(*builder.fStatus)) {
296 return NULL;
297 }
298
299
300 //
301 // Clean up the compiler related stuff
302 //
303
304
305 //
306 // Create a break iterator from the compiled rules.
307 // (Identical to creation from stored pre-compiled rules)
308 //
309 // status is checked after init in construction.
310 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
311 if (U_FAILURE(status)) {
312 delete This;
313 This = NULL;
314 }
315 else if(This == NULL) { // test for NULL
316 status = U_MEMORY_ALLOCATION_ERROR;
317 }
318 return This;
319 }
320
321 U_NAMESPACE_END
322
323 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
324