1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File genbrk.c
10 */
11
12 //--------------------------------------------------------------------
13 //
14 // Tool for generating RuleBasedBreakIterator data files (.brk files).
15 // .brk files contain the precompiled rules for standard types
16 // of iterators - word, line, sentence, etc.
17 //
18 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk
19 //
20 // options: -v verbose
21 // -? or -h help
22 //
23 // The input rule file is a plain text file containing break rules
24 // in the input format accepted by RuleBasedBreakIterators. The
25 // file can be encoded as UTF-8 or UTF-16 (either endian). Files
26 // encoded as UTF-16 must include a BOM.
27 //
28 //--------------------------------------------------------------------
29
30 #include "unicode/utypes.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/unistr.h"
33 #include "unicode/rbbi.h"
34 #include "unicode/uclean.h"
35 #include "unicode/udata.h"
36 #include "unicode/putil.h"
37
38 #include "uoptions.h"
39 #include "unewdata.h"
40 #include "ucmndata.h"
41 #include "rbbidata.h"
42 #include "cmemory.h"
43
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47
48 U_NAMESPACE_USE
49
50 static char *progName;
51 static UOption options[]={
52 UOPTION_HELP_H, /* 0 */
53 UOPTION_HELP_QUESTION_MARK, /* 1 */
54 UOPTION_VERBOSE, /* 2 */
55 { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
56 { "out", nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */
57 UOPTION_ICUDATADIR, /* 5 */
58 UOPTION_DESTDIR, /* 6 */
59 UOPTION_COPYRIGHT, /* 7 */
60 UOPTION_QUIET, /* 8 */
61 };
62
usageAndDie(int retCode)63 void usageAndDie(int retCode) {
64 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
65 printf("\tRead in break iteration rules text and write out the binary data.\n"
66 "\tIf the rule file does not have a Unicode signature byte sequence, it is assumed\n"
67 "\tto be UTF-8.\n"
68 "options:\n"
69 "\t-h or -? or --help this usage text\n"
70 "\t-V or --version show a version message\n"
71 "\t-c or --copyright include a copyright notice\n"
72 "\t-v or --verbose turn on verbose output\n"
73 "\t-q or --quiet do not display warnings and progress\n"
74 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
75 "\t followed by path, defaults to %s\n"
76 "\t-d or --destdir destination directory, followed by the path\n",
77 u_getDataDirectory());
78 exit (retCode);
79 }
80
81
82 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
83
84 /* dummy UDataInfo cf. udata.h */
85 static UDataInfo dummyDataInfo = {
86 sizeof(UDataInfo),
87 0,
88
89 U_IS_BIG_ENDIAN,
90 U_CHARSET_FAMILY,
91 U_SIZEOF_UCHAR,
92 0,
93
94 { 0, 0, 0, 0 }, /* dummy dataFormat */
95 { 0, 0, 0, 0 }, /* dummy formatVersion */
96 { 0, 0, 0, 0 } /* dummy dataVersion */
97 };
98
99 #else
100
101 //
102 // Set up the ICU data header, defined in ucmndata.h
103 //
104 DataHeader dh ={
105 {sizeof(DataHeader), // Struct MappedData
106 0xda,
107 0x27},
108
109 { // struct UDataInfo
110 sizeof(UDataInfo), // size
111 0, // reserved
112 U_IS_BIG_ENDIAN,
113 U_CHARSET_FAMILY,
114 U_SIZEOF_UCHAR,
115 0, // reserved
116
117 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
118 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
119 // from the RBBI rule builder. The values declared
120 // here should never appear in any real RBBI data.
121 { 4, 1, 0, 0 } // dataVersion (Unicode version)
122 }};
123
124 #endif
125
126 //----------------------------------------------------------------------------
127 //
128 // main for genbrk
129 //
130 //----------------------------------------------------------------------------
main(int argc,char ** argv)131 int main(int argc, char **argv) {
132 UErrorCode status = U_ZERO_ERROR;
133 const char *ruleFileName;
134 const char *outFileName;
135 const char *outDir = nullptr;
136 const char *copyright = nullptr;
137
138 //
139 // Pick up and check the command line arguments,
140 // using the standard ICU tool utils option handling.
141 //
142 U_MAIN_INIT_ARGS(argc, argv);
143 progName = argv[0];
144 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
145 if(argc<0) {
146 // Unrecognized option
147 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
148 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
149 }
150
151 if(options[0].doesOccur || options[1].doesOccur) {
152 // -? or -h for help.
153 usageAndDie(0);
154 }
155
156 if (!(options[3].doesOccur && options[4].doesOccur)) {
157 fprintf(stderr, "rule file and output file must both be specified.\n");
158 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
159 }
160 ruleFileName = options[3].value;
161 outFileName = options[4].value;
162
163 if (options[5].doesOccur) {
164 u_setDataDirectory(options[5].value);
165 }
166
167 status = U_ZERO_ERROR;
168
169 /* Combine the directory with the file name */
170 if(options[6].doesOccur) {
171 outDir = options[6].value;
172 }
173 if (options[7].doesOccur) {
174 copyright = U_COPYRIGHT_STRING;
175 }
176
177 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
178
179 UNewDataMemory *pData;
180 char msg[1024];
181
182 /* write message with just the name */
183 snprintf(msg, sizeof(msg), "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
184 fprintf(stderr, "%s\n", msg);
185
186 /* write the dummy data file */
187 pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status);
188 udata_writeBlock(pData, msg, strlen(msg));
189 udata_finish(pData, &status);
190 return (int)status;
191
192 #else
193 /* Initialize ICU */
194 u_init(&status);
195 if (U_FAILURE(status)) {
196 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
197 argv[0], u_errorName(status));
198 exit(1);
199 }
200 status = U_ZERO_ERROR;
201
202 //
203 // Read in the rule source file
204 //
205 long result;
206 long ruleFileSize;
207 FILE *file;
208 char *ruleBufferC;
209
210 file = fopen(ruleFileName, "rb");
211 if( file == 0 ) {
212 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
213 exit(-1);
214 }
215 fseek(file, 0, SEEK_END);
216 ruleFileSize = ftell(file);
217 fseek(file, 0, SEEK_SET);
218 ruleBufferC = new char[ruleFileSize+10];
219
220 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
221 if (result != ruleFileSize) {
222 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
223 exit (-1);
224 }
225 ruleBufferC[ruleFileSize]=0;
226 fclose(file);
227
228 //
229 // Look for a Unicode Signature (BOM) on the rule file
230 //
231 int32_t signatureLength;
232 const char * ruleSourceC = ruleBufferC;
233 const char* encoding = ucnv_detectUnicodeSignature(
234 ruleSourceC, ruleFileSize, &signatureLength, &status);
235 if (U_FAILURE(status)) {
236 exit(status);
237 }
238 if (encoding == nullptr) {
239 // In the absence of a BOM, assume the rule file is in UTF-8.
240 encoding = "UTF-8";
241 } else {
242 ruleSourceC += signatureLength;
243 ruleFileSize -= signatureLength;
244 }
245
246 //
247 // Open a converter to take the rule file to UTF-16
248 //
249 UConverter* conv;
250 conv = ucnv_open(encoding, &status);
251 if (U_FAILURE(status)) {
252 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
253 exit(status);
254 }
255
256 //
257 // Convert the rules to char16_t.
258 // Preflight first to determine required buffer size.
259 //
260 uint32_t destCap = ucnv_toUChars(conv,
261 nullptr, // dest,
262 0, // destCapacity,
263 ruleSourceC,
264 ruleFileSize,
265 &status);
266 if (status != U_BUFFER_OVERFLOW_ERROR) {
267 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
268 exit(status);
269 }
270
271 status = U_ZERO_ERROR;
272 char16_t *ruleSourceU = new char16_t[destCap+1];
273 ucnv_toUChars(conv,
274 ruleSourceU, // dest,
275 destCap+1,
276 ruleSourceC,
277 ruleFileSize,
278 &status);
279 if (U_FAILURE(status)) {
280 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
281 exit(status);
282 }
283 ucnv_close(conv);
284
285
286 //
287 // Put the source rules into a UnicodeString
288 //
289 UnicodeString ruleSourceS(false, ruleSourceU, destCap);
290
291 //
292 // Create the break iterator from the rules
293 // This will compile the rules.
294 //
295 UParseError parseError;
296 parseError.line = 0;
297 parseError.offset = 0;
298 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
299 if (U_FAILURE(status)) {
300 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
301 u_errorName(status), (int)parseError.line, (int)parseError.offset);
302 exit(status);
303 }
304
305
306 //
307 // Get the compiled rule data from the break iterator.
308 //
309 uint32_t outDataSize;
310 const uint8_t *outData;
311 outData = bi->getBinaryRules(outDataSize);
312
313 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
314 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
315
316 //
317 // Create the output file
318 //
319 size_t bytesWritten;
320 UNewDataMemory *pData;
321 pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status);
322 if(U_FAILURE(status)) {
323 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
324 outFileName, u_errorName(status));
325 exit(status);
326 }
327
328
329 // Write the data itself.
330 udata_writeBlock(pData, outData, outDataSize);
331 // finish up
332 bytesWritten = udata_finish(pData, &status);
333 if(U_FAILURE(status)) {
334 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
335 exit(status);
336 }
337
338 if (bytesWritten != outDataSize) {
339 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
340 exit(-1);
341 }
342
343 delete bi;
344 delete[] ruleSourceU;
345 delete[] ruleBufferC;
346 u_cleanup();
347
348
349 if(!options[8].doesOccur) {
350 printf("genbrk: tool completed successfully.\n");
351 }
352 return 0;
353
354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
355 }
356
357