• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *
7 * File genctd.c
8 */
9 
10 //--------------------------------------------------------------------
11 //
12 //   Tool for generating CompactTrieDictionary data files (.ctd files).
13 //
14 //   Usage:  genctd [options] -o output-file.ctd input-file
15 //
16 //       options:   -v         verbose
17 //                  -? or -h   help
18 //
19 //   The input  file is a plain text file containing words, one per line.
20 //    Words end at the first whitespace; lines beginning with whitespace
21 //    are ignored.
//    The file can be encoded as UTF-8 or UTF-16 (either endianness), or
//    in the platform-dependent default code page.  UTF-encoded files
//    must include a BOM.
25 //
26 //--------------------------------------------------------------------
27 
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36 
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "triedict.h"
42 #include "cmemory.h"
43 
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 
48 U_NAMESPACE_USE
49 
50 static char *progName;
51 static UOption options[]={
52     UOPTION_HELP_H,             /* 0 */
53     UOPTION_HELP_QUESTION_MARK, /* 1 */
54     UOPTION_VERBOSE,            /* 2 */
55     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 3 */
56     UOPTION_ICUDATADIR,         /* 4 */
57     UOPTION_DESTDIR,            /* 5 */
58     UOPTION_COPYRIGHT,          /* 6 */
59 };
60 
usageAndDie(int retCode)61 void usageAndDie(int retCode) {
62         printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
63         printf("\tRead in word list and write out compact trie dictionary\n"
64             "options:\n"
65             "\t-h or -? or --help  this usage text\n"
66             "\t-V or --version     show a version message\n"
67             "\t-c or --copyright   include a copyright notice\n"
68             "\t-v or --verbose     turn on verbose output\n"
69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
70             "\t                    followed by path, defaults to %s\n"
71             "\t-d or --destdir     destination directory, followed by the path\n",
72             u_getDataDirectory());
73         exit (retCode);
74 }
75 
76 
77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
78 
79 /* dummy UDataInfo cf. udata.h */
80 static UDataInfo dummyDataInfo = {
81     sizeof(UDataInfo),
82     0,
83 
84     U_IS_BIG_ENDIAN,
85     U_CHARSET_FAMILY,
86     U_SIZEOF_UCHAR,
87     0,
88 
89     { 0, 0, 0, 0 },                 /* dummy dataFormat */
90     { 0, 0, 0, 0 },                 /* dummy formatVersion */
91     { 0, 0, 0, 0 }                  /* dummy dataVersion */
92 };
93 
94 #else
95 
96 //
97 //  Set up the ICU data header, defined in ucmndata.h
98 //
99 DataHeader dh ={
100     {sizeof(DataHeader),           // Struct MappedData
101         0xda,
102         0x27},
103 
104     {                               // struct UDataInfo
105         sizeof(UDataInfo),          //     size
106         0,                          //     reserved
107         U_IS_BIG_ENDIAN,
108         U_CHARSET_FAMILY,
109         U_SIZEOF_UCHAR,
110         0,                          //     reserved
111 
112     { 0x54, 0x72, 0x44, 0x63 },     // "TrDc" Trie Dictionary
113     { 1, 0, 0, 0 },                 // 1.0.0.0
114     { 0, 0, 0, 0 },                 // Irrelevant for this data type
115     }};
116 
117 #endif
118 
119 //----------------------------------------------------------------------------
120 //
121 //  main      for genctd
122 //
123 //----------------------------------------------------------------------------
main(int argc,char ** argv)124 int  main(int argc, char **argv) {
125     UErrorCode  status = U_ZERO_ERROR;
126     const char *wordFileName;
127     const char *outFileName;
128     const char *outDir = NULL;
129     const char *copyright = NULL;
130 
131     //
132     // Pick up and check the command line arguments,
133     //    using the standard ICU tool utils option handling.
134     //
135     U_MAIN_INIT_ARGS(argc, argv);
136     progName = argv[0];
137     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138     if(argc<0) {
139         // Unrecognized option
140         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142     }
143 
144     if(options[0].doesOccur || options[1].doesOccur) {
145         //  -? or -h for help.
146         usageAndDie(0);
147     }
148 
149     if (!options[3].doesOccur || argc < 2) {
150         fprintf(stderr, "input and output file must both be specified.\n");
151         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152     }
153     outFileName  = options[3].value;
154     wordFileName = argv[1];
155 
156     if (options[4].doesOccur) {
157         u_setDataDirectory(options[4].value);
158     }
159 
160     status = U_ZERO_ERROR;
161 
162     /* Combine the directory with the file name */
163     if(options[5].doesOccur) {
164         outDir = options[5].value;
165     }
166     if (options[6].doesOccur) {
167         copyright = U_COPYRIGHT_STRING;
168     }
169 
170 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
171 
172     UNewDataMemory *pData;
173     char msg[1024];
174 
175     /* write message with just the name */
176     sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
177     fprintf(stderr, "%s\n", msg);
178 
179     /* write the dummy data file */
180     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
181     udata_writeBlock(pData, msg, strlen(msg));
182     udata_finish(pData, &status);
183     return (int)status;
184 
185 #else
186     /* Initialize ICU */
187     u_init(&status);
188     if (U_FAILURE(status)) {
189         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
190             argv[0], u_errorName(status));
191         exit(1);
192     }
193     status = U_ZERO_ERROR;
194 
195     //
196     //  Read in the dictionary source file
197     //
198     long        result;
199     long        wordFileSize;
200     FILE        *file;
201     char        *wordBufferC;
202 
203     file = fopen(wordFileName, "rb");
204     if( file == 0 ) {
205         fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
206         exit(-1);
207     }
208     fseek(file, 0, SEEK_END);
209     wordFileSize = ftell(file);
210     fseek(file, 0, SEEK_SET);
211     wordBufferC = new char[wordFileSize+10];
212 
213     result = (long)fread(wordBufferC, 1, wordFileSize, file);
214     if (result != wordFileSize)  {
215         fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
216         exit (-1);
217     }
218     wordBufferC[wordFileSize]=0;
219     fclose(file);
220 
221     //
222     // Look for a Unicode Signature (BOM) on the word file
223     //
224     int32_t        signatureLength;
225     const char *   wordSourceC = wordBufferC;
226     const char*    encoding = ucnv_detectUnicodeSignature(
227                            wordSourceC, wordFileSize, &signatureLength, &status);
228     if (U_FAILURE(status)) {
229         exit(status);
230     }
231     if(encoding!=NULL ){
232         wordSourceC  += signatureLength;
233         wordFileSize -= signatureLength;
234     }
235 
236     //
237     // Open a converter to take the rule file to UTF-16
238     //
239     UConverter* conv;
240     conv = ucnv_open(encoding, &status);
241     if (U_FAILURE(status)) {
242         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
243         exit(status);
244     }
245 
246     //
247     // Convert the words to UChar.
248     //  Preflight first to determine required buffer size.
249     //
250     uint32_t destCap = ucnv_toUChars(conv,
251                        NULL,           //  dest,
252                        0,              //  destCapacity,
253                        wordSourceC,
254                        wordFileSize,
255                        &status);
256     if (status != U_BUFFER_OVERFLOW_ERROR) {
257         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
258         exit(status);
259     };
260 
261     status = U_ZERO_ERROR;
262     UChar *wordSourceU = new UChar[destCap+1];
263     ucnv_toUChars(conv,
264                   wordSourceU,     //  dest,
265                   destCap+1,
266                   wordSourceC,
267                   wordFileSize,
268                   &status);
269     if (U_FAILURE(status)) {
270         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
271         exit(status);
272     };
273     ucnv_close(conv);
274 
275     // Get rid of the original file buffer
276     delete[] wordBufferC;
277 
278     // Create a MutableTrieDictionary, and loop through all the lines, inserting
279     // words.
280 
281     // First, pick a median character.
282     UChar *current = wordSourceU + (destCap/2);
283     UChar uc = *current++;
284     UnicodeSet breaks;
285     breaks.add(0x000A);     // Line Feed
286     breaks.add(0x000D);     // Carriage Return
287     breaks.add(0x2028);     // Line Separator
288     breaks.add(0x2029);     // Paragraph Separator
289 
290     do {
291         // Look for line break
292         while (uc && !breaks.contains(uc)) {
293             uc = *current++;
294         }
295         // Now skip to first non-line-break
296         while (uc && breaks.contains(uc)) {
297             uc = *current++;
298         }
299     }
300     while (uc && (breaks.contains(uc) || u_isspace(uc)));
301 
302     MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
303 
304     if (U_FAILURE(status)) {
305         fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
306         exit(status);
307     }
308 
309     // Now add the words. Words are non-space characters at the beginning of
310     // lines, and must be at least one UChar.
311     current = wordSourceU;
312     UChar *candidate = current;
313     uc = *current++;
314     int32_t length = 0;
315 
316     while (uc) {
317         while (uc && !u_isspace(uc)) {
318             ++length;
319             uc = *current++;
320         }
321         if (length > 0) {
322             mtd->addWord(candidate, length, status);
323             if (U_FAILURE(status)) {
324                 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
325                         u_errorName(status));
326                 exit(status);
327             }
328         }
329         // Find beginning of next line
330         while (uc && !breaks.contains(uc)) {
331             uc = *current++;
332         }
333         while (uc && breaks.contains(uc)) {
334             uc = *current++;
335         }
336         candidate = current-1;
337         length = 0;
338     }
339 
340     // Get rid of the Unicode text buffer
341     delete[] wordSourceU;
342 
343     // Now, create a CompactTrieDictionary from the mutable dictionary
344     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
345     if (U_FAILURE(status)) {
346         fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
347         exit(status);
348     }
349 
350     // Get rid of the MutableTrieDictionary
351     delete mtd;
352 
353     //
354     //  Get the binary data from the dictionary.
355     //
356     uint32_t        outDataSize = ctd->dataSize();
357     const uint8_t  *outData = (const uint8_t *)ctd->data();
358 
359     //
360     //  Create the output file
361     //
362     size_t bytesWritten;
363     UNewDataMemory *pData;
364     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
365     if(U_FAILURE(status)) {
366         fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
367                          outFileName, u_errorName(status));
368         exit(status);
369     }
370 
371 
372     //  Write the data itself.
373     udata_writeBlock(pData, outData, outDataSize);
374     // finish up
375     bytesWritten = udata_finish(pData, &status);
376     if(U_FAILURE(status)) {
377         fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
378         exit(status);
379     }
380 
381     if (bytesWritten != outDataSize) {
382         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
383         exit(-1);
384     }
385 
386     // Get rid of the CompactTrieDictionary
387     delete ctd;
388 
389     u_cleanup();
390 
391     printf("genctd: tool completed successfully.\n");
392     return 0;
393 
394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
395 }
396 
397