1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 *
9 * File genbrk.c
10 */
11
12 //--------------------------------------------------------------------
13 //
14 // Tool for generating RuleBasedBreakIterator data files (.brk files).
15 // .brk files contain the precompiled rules for standard types
16 // of iterators - word, line, sentence, etc.
17 //
18 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk
19 //
20 // options: -v verbose
21 // -? or -h help
22 //
23 // The input rule file is a plain text file containing break rules
24 // in the input format accepted by RuleBasedBreakIterators. The
//    file can be encoded as UTF-8, or UTF-16 (either endian), or
//    in the default code page (platform dependent).  UTF-encoded
//    files must include a BOM.
28 //
29 //--------------------------------------------------------------------
30
31 #include "unicode/utypes.h"
32 #include "unicode/ucnv.h"
33 #include "unicode/unistr.h"
34 #include "unicode/rbbi.h"
35 #include "unicode/uclean.h"
36 #include "unicode/udata.h"
37 #include "unicode/putil.h"
38
39 #include "uoptions.h"
40 #include "unewdata.h"
41 #include "ucmndata.h"
42 #include "rbbidata.h"
43 #include "cmemory.h"
44
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48
49 U_NAMESPACE_USE
50
// Name this tool was invoked as. Set from argv[0] in main() and printed
// in the usage text.
static char *progName;
// Command line option table handed to u_parseArgs() in main().
// main() looks the entries up by position, so the order of the entries
// (see the index comments on each line) must not change without updating main().
static UOption options[]={
    UOPTION_HELP_H,             /* 0 */
    UOPTION_HELP_QUESTION_MARK, /* 1 */
    UOPTION_VERBOSE,            /* 2 */
    { "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */  // -r / --rules: rule source file (mandatory)
    { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */  // -o / --out: output file (mandatory)
    UOPTION_ICUDATADIR,         /* 5 */
    UOPTION_DESTDIR,            /* 6 */
    UOPTION_COPYRIGHT,          /* 7 */
    UOPTION_QUIET,              /* 8 */
};
63
usageAndDie(int retCode)64 void usageAndDie(int retCode) {
65 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
66 printf("\tRead in break iteration rules text and write out the binary data\n"
67 "options:\n"
68 "\t-h or -? or --help this usage text\n"
69 "\t-V or --version show a version message\n"
70 "\t-c or --copyright include a copyright notice\n"
71 "\t-v or --verbose turn on verbose output\n"
72 "\t-q or --quiet do not display warnings and progress\n"
73 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
74 "\t followed by path, defaults to %s\n"
75 "\t-d or --destdir destination directory, followed by the path\n",
76 u_getDataDirectory());
77 exit (retCode);
78 }
79
80
81 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
82
/* dummy UDataInfo cf. udata.h */
/*
 * Placeholder data header info, compiled in only when break iteration or
 * file I/O is configured out. main() passes it to udata_create() when it
 * writes the dummy output file.
 */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),      /* size of this struct */
    0,                      /* reserved */

    U_IS_BIG_ENDIAN,        /* endianness of the build platform */
    U_CHARSET_FAMILY,       /* charset family of the build platform */
    U_SIZEOF_UCHAR,         /* size of a UChar on the build platform */
    0,                      /* reserved */

    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};
97
98 #else
99
//
//  Set up the ICU data header, defined in ucmndata.h
//
//  Only the embedded UDataInfo (dh.info) is handed to udata_create() by main().
//  Its formatVersion field is overwritten at run time with the version
//  reported by the compiled break rules, before the output file is created.
//
DataHeader dh ={
    {sizeof(DataHeader),           // Struct MappedData
        0xda,                      // NOTE(review): presumably the two magic bytes that
        0x27},                     //   identify an ICU data file - confirm in ucmndata.h

    {                               // struct UDataInfo
      sizeof(UDataInfo),            //     size
        0,                          //     reserved
        U_IS_BIG_ENDIAN,            //     endianness of the build platform
        U_CHARSET_FAMILY,           //     charset family of the build platform
        U_SIZEOF_UCHAR,             //     size of a UChar on the build platform
        0,                          //     reserved

    { 0x42, 0x72, 0x6b, 0x20 },     // dataFormat="Brk "
    { 0xff, 0, 0, 0 },              // formatVersion.  Filled in later with values
                                    //      from the  RBBI rule builder.  The  values declared
                                    //      here should never appear in any real RBBI data.
        { 4, 1, 0, 0 }              // dataVersion (Unicode version)
    }};
122
123 #endif
124
125 //----------------------------------------------------------------------------
126 //
127 // main for genbrk
128 //
129 //----------------------------------------------------------------------------
main(int argc,char ** argv)130 int main(int argc, char **argv) {
131 UErrorCode status = U_ZERO_ERROR;
132 const char *ruleFileName;
133 const char *outFileName;
134 const char *outDir = NULL;
135 const char *copyright = NULL;
136
137 //
138 // Pick up and check the command line arguments,
139 // using the standard ICU tool utils option handling.
140 //
141 U_MAIN_INIT_ARGS(argc, argv);
142 progName = argv[0];
143 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
144 if(argc<0) {
145 // Unrecognized option
146 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
147 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
148 }
149
150 if(options[0].doesOccur || options[1].doesOccur) {
151 // -? or -h for help.
152 usageAndDie(0);
153 }
154
155 if (!(options[3].doesOccur && options[4].doesOccur)) {
156 fprintf(stderr, "rule file and output file must both be specified.\n");
157 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
158 }
159 ruleFileName = options[3].value;
160 outFileName = options[4].value;
161
162 if (options[5].doesOccur) {
163 u_setDataDirectory(options[5].value);
164 }
165
166 status = U_ZERO_ERROR;
167
168 /* Combine the directory with the file name */
169 if(options[6].doesOccur) {
170 outDir = options[6].value;
171 }
172 if (options[7].doesOccur) {
173 copyright = U_COPYRIGHT_STRING;
174 }
175
176 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
177
178 UNewDataMemory *pData;
179 char msg[1024];
180
181 /* write message with just the name */
182 sprintf(msg, "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
183 fprintf(stderr, "%s\n", msg);
184
185 /* write the dummy data file */
186 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
187 udata_writeBlock(pData, msg, strlen(msg));
188 udata_finish(pData, &status);
189 return (int)status;
190
191 #else
192 /* Initialize ICU */
193 u_init(&status);
194 if (U_FAILURE(status)) {
195 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
196 argv[0], u_errorName(status));
197 exit(1);
198 }
199 status = U_ZERO_ERROR;
200
201 //
202 // Read in the rule source file
203 //
204 long result;
205 long ruleFileSize;
206 FILE *file;
207 char *ruleBufferC;
208
209 file = fopen(ruleFileName, "rb");
210 if( file == 0 ) {
211 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
212 exit(-1);
213 }
214 fseek(file, 0, SEEK_END);
215 ruleFileSize = ftell(file);
216 fseek(file, 0, SEEK_SET);
217 ruleBufferC = new char[ruleFileSize+10];
218
219 result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
220 if (result != ruleFileSize) {
221 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
222 exit (-1);
223 }
224 ruleBufferC[ruleFileSize]=0;
225 fclose(file);
226
227 //
228 // Look for a Unicode Signature (BOM) on the rule file
229 //
230 int32_t signatureLength;
231 const char * ruleSourceC = ruleBufferC;
232 const char* encoding = ucnv_detectUnicodeSignature(
233 ruleSourceC, ruleFileSize, &signatureLength, &status);
234 if (U_FAILURE(status)) {
235 exit(status);
236 }
237 if(encoding!=NULL ){
238 ruleSourceC += signatureLength;
239 ruleFileSize -= signatureLength;
240 }
241
242 //
243 // Open a converter to take the rule file to UTF-16
244 //
245 UConverter* conv;
246 conv = ucnv_open(encoding, &status);
247 if (U_FAILURE(status)) {
248 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
249 exit(status);
250 }
251
252 //
253 // Convert the rules to UChar.
254 // Preflight first to determine required buffer size.
255 //
256 uint32_t destCap = ucnv_toUChars(conv,
257 NULL, // dest,
258 0, // destCapacity,
259 ruleSourceC,
260 ruleFileSize,
261 &status);
262 if (status != U_BUFFER_OVERFLOW_ERROR) {
263 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
264 exit(status);
265 }
266
267 status = U_ZERO_ERROR;
268 UChar *ruleSourceU = new UChar[destCap+1];
269 ucnv_toUChars(conv,
270 ruleSourceU, // dest,
271 destCap+1,
272 ruleSourceC,
273 ruleFileSize,
274 &status);
275 if (U_FAILURE(status)) {
276 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
277 exit(status);
278 }
279 ucnv_close(conv);
280
281
282 //
283 // Put the source rules into a UnicodeString
284 //
285 UnicodeString ruleSourceS(false, ruleSourceU, destCap);
286
287 //
288 // Create the break iterator from the rules
289 // This will compile the rules.
290 //
291 UParseError parseError;
292 parseError.line = 0;
293 parseError.offset = 0;
294 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
295 if (U_FAILURE(status)) {
296 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
297 u_errorName(status), (int)parseError.line, (int)parseError.offset);
298 exit(status);
299 }
300
301
302 //
303 // Get the compiled rule data from the break iterator.
304 //
305 uint32_t outDataSize;
306 const uint8_t *outData;
307 outData = bi->getBinaryRules(outDataSize);
308
309 // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
310 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
311
312 //
313 // Create the output file
314 //
315 size_t bytesWritten;
316 UNewDataMemory *pData;
317 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
318 if(U_FAILURE(status)) {
319 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
320 outFileName, u_errorName(status));
321 exit(status);
322 }
323
324
325 // Write the data itself.
326 udata_writeBlock(pData, outData, outDataSize);
327 // finish up
328 bytesWritten = udata_finish(pData, &status);
329 if(U_FAILURE(status)) {
330 fprintf(stderr, "genbrk: error %d writing the output file\n", status);
331 exit(status);
332 }
333
334 if (bytesWritten != outDataSize) {
335 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
336 exit(-1);
337 }
338
339 delete bi;
340 delete[] ruleSourceU;
341 delete[] ruleBufferC;
342 u_cleanup();
343
344
345 if(!options[8].doesOccur) {
346 printf("genbrk: tool completed successfully.\n");
347 }
348 return 0;
349
350 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
351 }
352
353