1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File genctd.c
8 */
9
10 //--------------------------------------------------------------------
11 //
12 // Tool for generating CompactTrieDictionary data files (.ctd files).
13 //
14 // Usage: genctd [options] -o output-file.ctd input-file
15 //
16 // options: -v verbose
17 // -? or -h help
18 //
// The input file is a plain text file containing words, one per line.
//    Words end at the first whitespace; lines beginning with whitespace
//    are ignored.
//    The file can be encoded as UTF-8 or UTF-16 (either endian), or
//    in the default code page (platform dependent). UTF-encoded
//    files must include a BOM.
25 //
26 //--------------------------------------------------------------------
27
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "triedict.h"
42 #include "cmemory.h"
43
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47
48 U_NAMESPACE_USE
49
50 static char *progName;
51 static UOption options[]={
52 UOPTION_HELP_H, /* 0 */
53 UOPTION_HELP_QUESTION_MARK, /* 1 */
54 UOPTION_VERBOSE, /* 2 */
55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
56 UOPTION_ICUDATADIR, /* 4 */
57 UOPTION_DESTDIR, /* 5 */
58 UOPTION_COPYRIGHT, /* 6 */
59 };
60
usageAndDie(int retCode)61 void usageAndDie(int retCode) {
62 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
63 printf("\tRead in word list and write out compact trie dictionary\n"
64 "options:\n"
65 "\t-h or -? or --help this usage text\n"
66 "\t-V or --version show a version message\n"
67 "\t-c or --copyright include a copyright notice\n"
68 "\t-v or --verbose turn on verbose output\n"
69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
70 "\t followed by path, defaults to %s\n"
71 "\t-d or --destdir destination directory, followed by the path\n",
72 u_getDataDirectory());
73 exit (retCode);
74 }
75
76
77 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
78
79 /* dummy UDataInfo cf. udata.h */
80 static UDataInfo dummyDataInfo = {
81 sizeof(UDataInfo),
82 0,
83
84 U_IS_BIG_ENDIAN,
85 U_CHARSET_FAMILY,
86 U_SIZEOF_UCHAR,
87 0,
88
89 { 0, 0, 0, 0 }, /* dummy dataFormat */
90 { 0, 0, 0, 0 }, /* dummy formatVersion */
91 { 0, 0, 0, 0 } /* dummy dataVersion */
92 };
93
94 #else
95
96 //
97 // Set up the ICU data header, defined in ucmndata.h
98 //
99 DataHeader dh ={
100 {sizeof(DataHeader), // Struct MappedData
101 0xda,
102 0x27},
103
104 { // struct UDataInfo
105 sizeof(UDataInfo), // size
106 0, // reserved
107 U_IS_BIG_ENDIAN,
108 U_CHARSET_FAMILY,
109 U_SIZEOF_UCHAR,
110 0, // reserved
111
112 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
113 { 1, 0, 0, 0 }, // 1.0.0.0
114 { 0, 0, 0, 0 }, // Irrelevant for this data type
115 }};
116
117 #endif
118
119 //----------------------------------------------------------------------------
120 //
121 // main for genctd
122 //
123 //----------------------------------------------------------------------------
main(int argc,char ** argv)124 int main(int argc, char **argv) {
125 UErrorCode status = U_ZERO_ERROR;
126 const char *wordFileName;
127 const char *outFileName;
128 const char *outDir = NULL;
129 const char *copyright = NULL;
130
131 //
132 // Pick up and check the command line arguments,
133 // using the standard ICU tool utils option handling.
134 //
135 U_MAIN_INIT_ARGS(argc, argv);
136 progName = argv[0];
137 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138 if(argc<0) {
139 // Unrecognized option
140 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142 }
143
144 if(options[0].doesOccur || options[1].doesOccur) {
145 // -? or -h for help.
146 usageAndDie(0);
147 }
148
149 if (!options[3].doesOccur || argc < 2) {
150 fprintf(stderr, "input and output file must both be specified.\n");
151 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152 }
153 outFileName = options[3].value;
154 wordFileName = argv[1];
155
156 if (options[4].doesOccur) {
157 u_setDataDirectory(options[4].value);
158 }
159
160 status = U_ZERO_ERROR;
161
162 /* Combine the directory with the file name */
163 if(options[5].doesOccur) {
164 outDir = options[5].value;
165 }
166 if (options[6].doesOccur) {
167 copyright = U_COPYRIGHT_STRING;
168 }
169
170 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
171
172 UNewDataMemory *pData;
173 char msg[1024];
174
175 /* write message with just the name */
176 sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
177 fprintf(stderr, "%s\n", msg);
178
179 /* write the dummy data file */
180 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
181 udata_writeBlock(pData, msg, strlen(msg));
182 udata_finish(pData, &status);
183 return (int)status;
184
185 #else
186 /* Initialize ICU */
187 u_init(&status);
188 if (U_FAILURE(status)) {
189 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
190 argv[0], u_errorName(status));
191 exit(1);
192 }
193 status = U_ZERO_ERROR;
194
195 //
196 // Read in the dictionary source file
197 //
198 long result;
199 long wordFileSize;
200 FILE *file;
201 char *wordBufferC;
202
203 file = fopen(wordFileName, "rb");
204 if( file == 0 ) {
205 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
206 exit(-1);
207 }
208 fseek(file, 0, SEEK_END);
209 wordFileSize = ftell(file);
210 fseek(file, 0, SEEK_SET);
211 wordBufferC = new char[wordFileSize+10];
212
213 result = (long)fread(wordBufferC, 1, wordFileSize, file);
214 if (result != wordFileSize) {
215 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
216 exit (-1);
217 }
218 wordBufferC[wordFileSize]=0;
219 fclose(file);
220
221 //
222 // Look for a Unicode Signature (BOM) on the word file
223 //
224 int32_t signatureLength;
225 const char * wordSourceC = wordBufferC;
226 const char* encoding = ucnv_detectUnicodeSignature(
227 wordSourceC, wordFileSize, &signatureLength, &status);
228 if (U_FAILURE(status)) {
229 exit(status);
230 }
231 if(encoding!=NULL ){
232 wordSourceC += signatureLength;
233 wordFileSize -= signatureLength;
234 }
235
236 //
237 // Open a converter to take the rule file to UTF-16
238 //
239 UConverter* conv;
240 conv = ucnv_open(encoding, &status);
241 if (U_FAILURE(status)) {
242 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
243 exit(status);
244 }
245
246 //
247 // Convert the words to UChar.
248 // Preflight first to determine required buffer size.
249 //
250 uint32_t destCap = ucnv_toUChars(conv,
251 NULL, // dest,
252 0, // destCapacity,
253 wordSourceC,
254 wordFileSize,
255 &status);
256 if (status != U_BUFFER_OVERFLOW_ERROR) {
257 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
258 exit(status);
259 };
260
261 status = U_ZERO_ERROR;
262 UChar *wordSourceU = new UChar[destCap+1];
263 ucnv_toUChars(conv,
264 wordSourceU, // dest,
265 destCap+1,
266 wordSourceC,
267 wordFileSize,
268 &status);
269 if (U_FAILURE(status)) {
270 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
271 exit(status);
272 };
273 ucnv_close(conv);
274
275 // Get rid of the original file buffer
276 delete[] wordBufferC;
277
278 // Create a MutableTrieDictionary, and loop through all the lines, inserting
279 // words.
280
281 // First, pick a median character.
282 UChar *current = wordSourceU + (destCap/2);
283 UChar uc = *current++;
284 UnicodeSet breaks;
285 breaks.add(0x000A); // Line Feed
286 breaks.add(0x000D); // Carriage Return
287 breaks.add(0x2028); // Line Separator
288 breaks.add(0x2029); // Paragraph Separator
289
290 do {
291 // Look for line break
292 while (uc && !breaks.contains(uc)) {
293 uc = *current++;
294 }
295 // Now skip to first non-line-break
296 while (uc && breaks.contains(uc)) {
297 uc = *current++;
298 }
299 }
300 while (uc && (breaks.contains(uc) || u_isspace(uc)));
301
302 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
303
304 if (U_FAILURE(status)) {
305 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
306 exit(status);
307 }
308
309 // Now add the words. Words are non-space characters at the beginning of
310 // lines, and must be at least one UChar.
311 current = wordSourceU;
312 UChar *candidate = current;
313 uc = *current++;
314 int32_t length = 0;
315
316 while (uc) {
317 while (uc && !u_isspace(uc)) {
318 ++length;
319 uc = *current++;
320 }
321 if (length > 0) {
322 mtd->addWord(candidate, length, status);
323 if (U_FAILURE(status)) {
324 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
325 u_errorName(status));
326 exit(status);
327 }
328 }
329 // Find beginning of next line
330 while (uc && !breaks.contains(uc)) {
331 uc = *current++;
332 }
333 while (uc && breaks.contains(uc)) {
334 uc = *current++;
335 }
336 candidate = current-1;
337 length = 0;
338 }
339
340 // Get rid of the Unicode text buffer
341 delete[] wordSourceU;
342
343 // Now, create a CompactTrieDictionary from the mutable dictionary
344 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
345 if (U_FAILURE(status)) {
346 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
347 exit(status);
348 }
349
350 // Get rid of the MutableTrieDictionary
351 delete mtd;
352
353 //
354 // Get the binary data from the dictionary.
355 //
356 uint32_t outDataSize = ctd->dataSize();
357 const uint8_t *outData = (const uint8_t *)ctd->data();
358
359 //
360 // Create the output file
361 //
362 size_t bytesWritten;
363 UNewDataMemory *pData;
364 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
365 if(U_FAILURE(status)) {
366 fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
367 outFileName, u_errorName(status));
368 exit(status);
369 }
370
371
372 // Write the data itself.
373 udata_writeBlock(pData, outData, outDataSize);
374 // finish up
375 bytesWritten = udata_finish(pData, &status);
376 if(U_FAILURE(status)) {
377 fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
378 exit(status);
379 }
380
381 if (bytesWritten != outDataSize) {
382 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
383 exit(-1);
384 }
385
386 // Get rid of the CompactTrieDictionary
387 delete ctd;
388
389 u_cleanup();
390
391 printf("genctd: tool completed successfully.\n");
392 return 0;
393
394 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
395 }
396
397