• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2006, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *
7 * File genctd.c
8 */
9 
//--------------------------------------------------------------------
//
//   Tool for generating CompactTrieDictionary data files (.ctd files).
//
//   Usage:  genctd [options] -o output-file.ctd input-file
//
//       options:   -v         verbose
//                  -? or -h   help
//
//   The input file is a plain text file containing words, one per line.
//    Words end at the first whitespace; lines beginning with whitespace
//    are ignored.
//    The file can be encoded as UTF-8, or UTF-16 (either endian), or
//    in the default code page (platform dependent).  UTF-encoded
//    files must include a BOM.
//
//--------------------------------------------------------------------
27 
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36 
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "triedict.h"
42 #include "cmemory.h"
43 
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 
48 U_NAMESPACE_USE
49 
50 static char *progName;
51 static UOption options[]={
52     UOPTION_HELP_H,             /* 0 */
53     UOPTION_HELP_QUESTION_MARK, /* 1 */
54     UOPTION_VERBOSE,            /* 2 */
55     { "out",   NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 },   /* 3 */
56     UOPTION_ICUDATADIR,         /* 4 */
57     UOPTION_DESTDIR,            /* 5 */
58     UOPTION_COPYRIGHT,          /* 6 */
59 };
60 
usageAndDie(int retCode)61 void usageAndDie(int retCode) {
62         printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
63         printf("\tRead in word list and write out compact trie dictionary\n"
64             "options:\n"
65             "\t-h or -? or --help  this usage text\n"
66             "\t-V or --version     show a version message\n"
67             "\t-c or --copyright   include a copyright notice\n"
68             "\t-v or --verbose     turn on verbose output\n"
69             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
70             "\t                    followed by path, defaults to %s\n"
71             "\t-d or --destdir     destination directory, followed by the path\n",
72             u_getDataDirectory());
73         exit (retCode);
74 }
75 
76 
77 #if UCONFIG_NO_BREAK_ITERATION
78 
79 /* dummy UDataInfo cf. udata.h */
80 static UDataInfo dummyDataInfo = {
81     sizeof(UDataInfo),
82     0,
83 
84     U_IS_BIG_ENDIAN,
85     U_CHARSET_FAMILY,
86     U_SIZEOF_UCHAR,
87     0,
88 
89     { 0, 0, 0, 0 },                 /* dummy dataFormat */
90     { 0, 0, 0, 0 },                 /* dummy formatVersion */
91     { 0, 0, 0, 0 }                  /* dummy dataVersion */
92 };
93 
94 #else
95 
96 //
97 //  Set up the ICU data header, defined in ucmndata.h
98 //
99 DataHeader dh ={
100     {sizeof(DataHeader),           // Struct MappedData
101         0xda,
102         0x27},
103 
104     {                               // struct UDataInfo
105         sizeof(UDataInfo),          //     size
106         0,                          //     reserved
107         U_IS_BIG_ENDIAN,
108         U_CHARSET_FAMILY,
109         U_SIZEOF_UCHAR,
110         0,                          //     reserved
111 
112     { 0x54, 0x72, 0x44, 0x63 },     // "TrDc" Trie Dictionary
113     { 1, 0, 0, 0 },                 // 1.0.0.0
114     { 0, 0, 0, 0 },                 // Irrelevant for this data type
115     }};
116 
117 #endif
118 
119 //----------------------------------------------------------------------------
120 //
121 //  main      for genctd
122 //
123 //----------------------------------------------------------------------------
main(int argc,char ** argv)124 int  main(int argc, char **argv) {
125     UErrorCode  status = U_ZERO_ERROR;
126     const char *wordFileName;
127     const char *outFileName;
128     const char *outDir = NULL;
129     const char *copyright = NULL;
130 
131     //
132     // Pick up and check the command line arguments,
133     //    using the standard ICU tool utils option handling.
134     //
135     U_MAIN_INIT_ARGS(argc, argv);
136     progName = argv[0];
137     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138     if(argc<0) {
139         // Unrecognized option
140         fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142     }
143 
144     if(options[0].doesOccur || options[1].doesOccur) {
145         //  -? or -h for help.
146         usageAndDie(0);
147     }
148 
149     if (!options[3].doesOccur || argc < 2) {
150         fprintf(stderr, "input and output file must both be specified.\n");
151         usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152     }
153     outFileName  = options[3].value;
154     wordFileName = argv[1];
155 
156     if (options[4].doesOccur) {
157         u_setDataDirectory(options[4].value);
158     }
159 
160     /* Initialize ICU */
161     u_init(&status);
162     if (U_FAILURE(status)) {
163         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
164             argv[0], u_errorName(status));
165         exit(1);
166     }
167     status = U_ZERO_ERROR;
168 
169     /* Combine the directory with the file name */
170     if(options[5].doesOccur) {
171         outDir = options[5].value;
172     }
173     if (options[6].doesOccur) {
174         copyright = U_COPYRIGHT_STRING;
175     }
176 
177 #if UCONFIG_NO_BREAK_ITERATION
178 
179     UNewDataMemory *pData;
180     char msg[1024];
181 
182     /* write message with just the name */
183     sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
184     fprintf(stderr, "%s\n", msg);
185 
186     /* write the dummy data file */
187     pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
188     udata_writeBlock(pData, msg, strlen(msg));
189     udata_finish(pData, &status);
190     return (int)status;
191 
192 #else
193 
194     //
195     //  Read in the dictionary source file
196     //
197     long        result;
198     long        wordFileSize;
199     FILE        *file;
200     char        *wordBufferC;
201 
202     file = fopen(wordFileName, "rb");
203     if( file == 0 ) {
204         fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
205         exit(-1);
206     }
207     fseek(file, 0, SEEK_END);
208     wordFileSize = ftell(file);
209     fseek(file, 0, SEEK_SET);
210     wordBufferC = new char[wordFileSize+10];
211 
212     result = (long)fread(wordBufferC, 1, wordFileSize, file);
213     if (result != wordFileSize)  {
214         fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
215         exit (-1);
216     }
217     wordBufferC[wordFileSize]=0;
218     fclose(file);
219 
220     //
221     // Look for a Unicode Signature (BOM) on the word file
222     //
223     int32_t        signatureLength;
224     const char *   wordSourceC = wordBufferC;
225     const char*    encoding = ucnv_detectUnicodeSignature(
226                            wordSourceC, wordFileSize, &signatureLength, &status);
227     if (U_FAILURE(status)) {
228         exit(status);
229     }
230     if(encoding!=NULL ){
231         wordSourceC  += signatureLength;
232         wordFileSize -= signatureLength;
233     }
234 
235     //
236     // Open a converter to take the rule file to UTF-16
237     //
238     UConverter* conv;
239     conv = ucnv_open(encoding, &status);
240     if (U_FAILURE(status)) {
241         fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
242         exit(status);
243     }
244 
245     //
246     // Convert the words to UChar.
247     //  Preflight first to determine required buffer size.
248     //
249     uint32_t destCap = ucnv_toUChars(conv,
250                        NULL,           //  dest,
251                        0,              //  destCapacity,
252                        wordSourceC,
253                        wordFileSize,
254                        &status);
255     if (status != U_BUFFER_OVERFLOW_ERROR) {
256         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
257         exit(status);
258     };
259 
260     status = U_ZERO_ERROR;
261     UChar *wordSourceU = new UChar[destCap+1];
262     ucnv_toUChars(conv,
263                   wordSourceU,     //  dest,
264                   destCap+1,
265                   wordSourceC,
266                   wordFileSize,
267                   &status);
268     if (U_FAILURE(status)) {
269         fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
270         exit(status);
271     };
272     ucnv_close(conv);
273 
274     // Get rid of the original file buffer
275     delete[] wordBufferC;
276 
277     // Create a MutableTrieDictionary, and loop through all the lines, inserting
278     // words.
279 
280     // First, pick a median character.
281     UChar *current = wordSourceU + (destCap/2);
282     UChar uc = *current++;
283     UnicodeSet breaks;
284     breaks.add(0x000A);     // Line Feed
285     breaks.add(0x000D);     // Carriage Return
286     breaks.add(0x2028);     // Line Separator
287     breaks.add(0x2029);     // Paragraph Separator
288 
289     do {
290         // Look for line break
291         while (uc && !breaks.contains(uc)) {
292             uc = *current++;
293         }
294         // Now skip to first non-line-break
295         while (uc && breaks.contains(uc)) {
296             uc = *current++;
297         }
298     }
299     while (uc && (breaks.contains(uc) || u_isspace(uc)));
300 
301     MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
302 
303     if (U_FAILURE(status)) {
304         fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
305         exit(status);
306     }
307 
308     // Now add the words. Words are non-space characters at the beginning of
309     // lines, and must be at least one UChar.
310     current = wordSourceU;
311     UChar *candidate = current;
312     uc = *current++;
313     int32_t length = 0;
314 
315     while (uc) {
316         while (uc && !u_isspace(uc)) {
317             ++length;
318             uc = *current++;
319         }
320         if (length > 0) {
321             mtd->addWord(candidate, length, status);
322             if (U_FAILURE(status)) {
323                 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
324                         u_errorName(status));
325                 exit(status);
326             }
327         }
328         // Find beginning of next line
329         while (uc && !breaks.contains(uc)) {
330             uc = *current++;
331         }
332         while (uc && breaks.contains(uc)) {
333             uc = *current++;
334         }
335         candidate = current-1;
336         length = 0;
337     }
338 
339     // Get rid of the Unicode text buffer
340     delete[] wordSourceU;
341 
342     // Now, create a CompactTrieDictionary from the mutable dictionary
343     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
344     if (U_FAILURE(status)) {
345         fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
346         exit(status);
347     }
348 
349     // Get rid of the MutableTrieDictionary
350     delete mtd;
351 
352     //
353     //  Get the binary data from the dictionary.
354     //
355     uint32_t        outDataSize = ctd->dataSize();
356     const uint8_t  *outData = (const uint8_t *)ctd->data();
357 
358     //
359     //  Create the output file
360     //
361     size_t bytesWritten;
362     UNewDataMemory *pData;
363     pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
364     if(U_FAILURE(status)) {
365         fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
366                          outFileName, u_errorName(status));
367         exit(status);
368     }
369 
370 
371     //  Write the data itself.
372     udata_writeBlock(pData, outData, outDataSize);
373     // finish up
374     bytesWritten = udata_finish(pData, &status);
375     if(U_FAILURE(status)) {
376         fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
377         exit(status);
378     }
379 
380     if (bytesWritten != outDataSize) {
381         fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
382         exit(-1);
383     }
384 
385     // Get rid of the CompactTrieDictionary
386     delete ctd;
387 
388     u_cleanup();
389 
390     printf("genctd: tool completed successfully.\n");
391     return 0;
392 
393 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
394 }
395 
396