• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File genbrk.c
10 */
11 
12 //--------------------------------------------------------------------
13 //
14 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
15 //   .brk files contain the precompiled rules for standard types
16 //   of iterators - word, line, sentence, etc.
17 //
18 //   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
19 //
20 //       options:   -v         verbose
21 //                  -? or -h   help
22 //
23 //   The input rule file is a plain text file containing break rules
24 //    in the input format accepted by RuleBasedBreakIterators.  The
25 //    file can be encoded as UTF-8 or UTF-16 (either endian).  Files
26 //    encoded as UTF-16 must include a BOM.
27 //
28 //--------------------------------------------------------------------
29 
30 #include "unicode/utypes.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/unistr.h"
33 #include "unicode/rbbi.h"
34 #include "unicode/uclean.h"
35 #include "unicode/udata.h"
36 #include "unicode/putil.h"
37 
38 #include "uoptions.h"
39 #include "unewdata.h"
40 #include "ucmndata.h"
41 #include "rbbidata.h"
42 #include "cmemory.h"
43 
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 
48 U_NAMESPACE_USE
49 
50 static char *progName;
51 static UOption options[]={
52     UOPTION_HELP_H,             /* 0 */
53     UOPTION_HELP_QUESTION_MARK, /* 1 */
54     UOPTION_VERBOSE,            /* 2 */
55     { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
56     { "out",   nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */
57     UOPTION_ICUDATADIR,         /* 5 */
58     UOPTION_DESTDIR,            /* 6 */
59     UOPTION_COPYRIGHT,          /* 7 */
60     UOPTION_QUIET,              /* 8 */
61 };
62 
usageAndDie(int retCode)63 void usageAndDie(int retCode) {
64         printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
65         printf("\tRead in break iteration rules text and write out the binary data.\n"
66             "\tIf the rule file does not have a Unicode signature byte sequence, it is assumed\n"
67             "\tto be UTF-8.\n"
68             "options:\n"
69             "\t-h or -? or --help  this usage text\n"
70             "\t-V or --version     show a version message\n"
71             "\t-c or --copyright   include a copyright notice\n"
72             "\t-v or --verbose     turn on verbose output\n"
73             "\t-q or --quiet       do not display warnings and progress\n"
74             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
75             "\t                    followed by path, defaults to %s\n"
76             "\t-d or --destdir     destination directory, followed by the path\n",
77             u_getDataDirectory());
78         exit (retCode);
79 }
80 
81 
82 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
83 
84 /* dummy UDataInfo cf. udata.h */
85 static UDataInfo dummyDataInfo = {
86     sizeof(UDataInfo),
87     0,
88 
89     U_IS_BIG_ENDIAN,
90     U_CHARSET_FAMILY,
91     U_SIZEOF_UCHAR,
92     0,
93 
94     { 0, 0, 0, 0 },                 /* dummy dataFormat */
95     { 0, 0, 0, 0 },                 /* dummy formatVersion */
96     { 0, 0, 0, 0 }                  /* dummy dataVersion */
97 };
98 
99 #else
100 
101 //
102 //  Set up the ICU data header, defined in ucmndata.h
103 //
104 DataHeader dh ={
105     {sizeof(DataHeader),           // Struct MappedData
106         0xda,
107         0x27},
108 
109     {                               // struct UDataInfo
110         sizeof(UDataInfo),          //     size
111         0,                          //     reserved
112         U_IS_BIG_ENDIAN,
113         U_CHARSET_FAMILY,
114         U_SIZEOF_UCHAR,
115         0,                          //     reserved
116 
117     { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
118     { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
119                                     //      from the RBBI rule builder.  The  values declared
120                                     //      here should never appear in any real RBBI data.
121         { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
122     }};
123 
124 #endif
125 
126 //----------------------------------------------------------------------------
127 //
128 //  main      for genbrk
129 //
130 //----------------------------------------------------------------------------
main(int argc,char ** argv)131 int  main(int argc, char **argv) {
132     UErrorCode  status = U_ZERO_ERROR;
133     const char *ruleFileName;
134     const char *outFileName;
135     const char *outDir = nullptr;
136     const char *copyright = nullptr;
137 
138     //
139     // Pick up and check the command line arguments,
140     //    using the standard ICU tool utils option handling.
141     //
142     U_MAIN_INIT_ARGS(argc, argv);
143     progName = argv[0];
144     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
145     if(argc<0) {
146         // Unrecognized option
147         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
148         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
149     }
150 
151     if(options[0].doesOccur || options[1].doesOccur) {
152         //  -? or -h for help.
153         usageAndDie(0);
154     }
155 
156     if (!(options[3].doesOccur && options[4].doesOccur)) {
157         fprintf(stderr, "rule file and output file must both be specified.\n");
158         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
159     }
160     ruleFileName = options[3].value;
161     outFileName  = options[4].value;
162 
163     if (options[5].doesOccur) {
164         u_setDataDirectory(options[5].value);
165     }
166 
167     status = U_ZERO_ERROR;
168 
169     /* Combine the directory with the file name */
170     if(options[6].doesOccur) {
171         outDir = options[6].value;
172     }
173     if (options[7].doesOccur) {
174         copyright = U_COPYRIGHT_STRING;
175     }
176 
177 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
178 
179     UNewDataMemory *pData;
180     char msg[1024];
181 
182     /* write message with just the name */
183     snprintf(msg, sizeof(msg), "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
184     fprintf(stderr, "%s\n", msg);
185 
186     /* write the dummy data file */
187     pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status);
188     udata_writeBlock(pData, msg, strlen(msg));
189     udata_finish(pData, &status);
190     return (int)status;
191 
192 #else
193     /* Initialize ICU */
194     u_init(&status);
195     if (U_FAILURE(status)) {
196         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
197             argv[0], u_errorName(status));
198         exit(1);
199     }
200     status = U_ZERO_ERROR;
201 
202     //
203     //  Read in the rule source file
204     //
205     long        result;
206     long        ruleFileSize;
207     FILE        *file;
208     char        *ruleBufferC;
209 
210     file = fopen(ruleFileName, "rb");
211     if( file == 0 ) {
212         fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
213         exit(-1);
214     }
215     fseek(file, 0, SEEK_END);
216     ruleFileSize = ftell(file);
217     fseek(file, 0, SEEK_SET);
218     ruleBufferC = new char[ruleFileSize+10];
219 
220     result = (long)fread(ruleBufferC, 1, ruleFileSize, file);
221     if (result != ruleFileSize)  {
222         fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
223         exit (-1);
224     }
225     ruleBufferC[ruleFileSize]=0;
226     fclose(file);
227 
228     //
229     // Look for a Unicode Signature (BOM) on the rule file
230     //
231     int32_t        signatureLength;
232     const char *   ruleSourceC = ruleBufferC;
233     const char*    encoding = ucnv_detectUnicodeSignature(
234                            ruleSourceC, ruleFileSize, &signatureLength, &status);
235     if (U_FAILURE(status)) {
236         exit(status);
237     }
238     if (encoding == nullptr) {
239         // In the absence of a BOM, assume the rule file is in UTF-8.
240         encoding = "UTF-8";
241     } else {
242         ruleSourceC  += signatureLength;
243         ruleFileSize -= signatureLength;
244     }
245 
246     //
247     // Open a converter to take the rule file to UTF-16
248     //
249     UConverter* conv;
250     conv = ucnv_open(encoding, &status);
251     if (U_FAILURE(status)) {
252         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
253         exit(status);
254     }
255 
256     //
257     // Convert the rules to char16_t.
258     //  Preflight first to determine required buffer size.
259     //
260     uint32_t destCap = ucnv_toUChars(conv,
261                        nullptr,           //  dest,
262                        0,              //  destCapacity,
263                        ruleSourceC,
264                        ruleFileSize,
265                        &status);
266     if (status != U_BUFFER_OVERFLOW_ERROR) {
267         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
268         exit(status);
269     }
270 
271     status = U_ZERO_ERROR;
272     char16_t *ruleSourceU = new char16_t[destCap+1];
273     ucnv_toUChars(conv,
274                   ruleSourceU,     //  dest,
275                   destCap+1,
276                   ruleSourceC,
277                   ruleFileSize,
278                   &status);
279     if (U_FAILURE(status)) {
280         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
281         exit(status);
282     }
283     ucnv_close(conv);
284 
285 
286     //
287     //  Put the source rules into a UnicodeString
288     //
289     UnicodeString ruleSourceS(false, ruleSourceU, destCap);
290 
291     //
292     //  Create the break iterator from the rules
293     //     This will compile the rules.
294     //
295     UParseError parseError;
296     parseError.line = 0;
297     parseError.offset = 0;
298     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
299     if (U_FAILURE(status)) {
300         fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
301                 u_errorName(status), (int)parseError.line, (int)parseError.offset);
302         exit(status);
303     }
304 
305 
306     //
307     //  Get the compiled rule data from the break iterator.
308     //
309     uint32_t        outDataSize;
310     const uint8_t  *outData;
311     outData = bi->getBinaryRules(outDataSize);
312 
313     // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
314     uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
315 
316     //
317     //  Create the output file
318     //
319     size_t bytesWritten;
320     UNewDataMemory *pData;
321     pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status);
322     if(U_FAILURE(status)) {
323         fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n",
324                          outFileName, u_errorName(status));
325         exit(status);
326     }
327 
328 
329     //  Write the data itself.
330     udata_writeBlock(pData, outData, outDataSize);
331     // finish up
332     bytesWritten = udata_finish(pData, &status);
333     if(U_FAILURE(status)) {
334         fprintf(stderr, "genbrk: error %d writing the output file\n", status);
335         exit(status);
336     }
337 
338     if (bytesWritten != outDataSize) {
339         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
340         exit(-1);
341     }
342 
343     delete bi;
344     delete[] ruleSourceU;
345     delete[] ruleBufferC;
346     u_cleanup();
347 
348 
349     if(!options[8].doesOccur) {
350         printf("genbrk: tool completed successfully.\n");
351     }
352     return 0;
353 
354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
355 }
356 
357