1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 *
7 * File genctd.c
8 */
9
10 //--------------------------------------------------------------------
11 //
12 // Tool for generating CompactTrieDictionary data files (.ctd files).
13 //
14 // Usage: genctd [options] -o output-file.ctd input-file
15 //
16 // options: -v verbose
17 // -? or -h help
18 //
19 // The input file is a plain text file containing words, one per line.
20 // Words end at the first whitespace; lines beginning with whitespace
21 // are ignored.
22 // The file can be encoded as utf-8, or utf-16 (either endian), or
23 // in the default code page (platform dependent.). utf encoded
24 // files must include a BOM.
25 //
26 //--------------------------------------------------------------------
27
28 #include "unicode/utypes.h"
29 #include "unicode/uchar.h"
30 #include "unicode/ucnv.h"
31 #include "unicode/uniset.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uclean.h"
34 #include "unicode/udata.h"
35 #include "unicode/putil.h"
36
37 #include "uoptions.h"
38 #include "unewdata.h"
39 #include "ucmndata.h"
40 #include "rbbidata.h"
41 #include "triedict.h"
42 #include "cmemory.h"
43
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47
48 U_NAMESPACE_USE
49
50 static char *progName;
51 static UOption options[]={
52 UOPTION_HELP_H, /* 0 */
53 UOPTION_HELP_QUESTION_MARK, /* 1 */
54 UOPTION_VERBOSE, /* 2 */
55 { "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
56 UOPTION_ICUDATADIR, /* 4 */
57 UOPTION_DESTDIR, /* 5 */
58 UOPTION_COPYRIGHT, /* 6 */
59 };
60
usageAndDie(int retCode)61 void usageAndDie(int retCode) {
62 printf("Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
63 printf("\tRead in word list and write out compact trie dictionary\n"
64 "options:\n"
65 "\t-h or -? or --help this usage text\n"
66 "\t-V or --version show a version message\n"
67 "\t-c or --copyright include a copyright notice\n"
68 "\t-v or --verbose turn on verbose output\n"
69 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
70 "\t followed by path, defaults to %s\n"
71 "\t-d or --destdir destination directory, followed by the path\n",
72 u_getDataDirectory());
73 exit (retCode);
74 }
75
76
77 #if UCONFIG_NO_BREAK_ITERATION
78
79 /* dummy UDataInfo cf. udata.h */
80 static UDataInfo dummyDataInfo = {
81 sizeof(UDataInfo),
82 0,
83
84 U_IS_BIG_ENDIAN,
85 U_CHARSET_FAMILY,
86 U_SIZEOF_UCHAR,
87 0,
88
89 { 0, 0, 0, 0 }, /* dummy dataFormat */
90 { 0, 0, 0, 0 }, /* dummy formatVersion */
91 { 0, 0, 0, 0 } /* dummy dataVersion */
92 };
93
94 #else
95
96 //
97 // Set up the ICU data header, defined in ucmndata.h
98 //
99 DataHeader dh ={
100 {sizeof(DataHeader), // Struct MappedData
101 0xda,
102 0x27},
103
104 { // struct UDataInfo
105 sizeof(UDataInfo), // size
106 0, // reserved
107 U_IS_BIG_ENDIAN,
108 U_CHARSET_FAMILY,
109 U_SIZEOF_UCHAR,
110 0, // reserved
111
112 { 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
113 { 1, 0, 0, 0 }, // 1.0.0.0
114 { 0, 0, 0, 0 }, // Irrelevant for this data type
115 }};
116
117 #endif
118
119 //----------------------------------------------------------------------------
120 //
121 // main for genctd
122 //
123 //----------------------------------------------------------------------------
main(int argc,char ** argv)124 int main(int argc, char **argv) {
125 UErrorCode status = U_ZERO_ERROR;
126 const char *wordFileName;
127 const char *outFileName;
128 const char *outDir = NULL;
129 const char *copyright = NULL;
130
131 //
132 // Pick up and check the command line arguments,
133 // using the standard ICU tool utils option handling.
134 //
135 U_MAIN_INIT_ARGS(argc, argv);
136 progName = argv[0];
137 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
138 if(argc<0) {
139 // Unrecognized option
140 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
141 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
142 }
143
144 if(options[0].doesOccur || options[1].doesOccur) {
145 // -? or -h for help.
146 usageAndDie(0);
147 }
148
149 if (!options[3].doesOccur || argc < 2) {
150 fprintf(stderr, "input and output file must both be specified.\n");
151 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
152 }
153 outFileName = options[3].value;
154 wordFileName = argv[1];
155
156 if (options[4].doesOccur) {
157 u_setDataDirectory(options[4].value);
158 }
159
160 /* Initialize ICU */
161 u_init(&status);
162 if (U_FAILURE(status)) {
163 fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
164 argv[0], u_errorName(status));
165 exit(1);
166 }
167 status = U_ZERO_ERROR;
168
169 /* Combine the directory with the file name */
170 if(options[5].doesOccur) {
171 outDir = options[5].value;
172 }
173 if (options[6].doesOccur) {
174 copyright = U_COPYRIGHT_STRING;
175 }
176
177 #if UCONFIG_NO_BREAK_ITERATION
178
179 UNewDataMemory *pData;
180 char msg[1024];
181
182 /* write message with just the name */
183 sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
184 fprintf(stderr, "%s\n", msg);
185
186 /* write the dummy data file */
187 pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
188 udata_writeBlock(pData, msg, strlen(msg));
189 udata_finish(pData, &status);
190 return (int)status;
191
192 #else
193
194 //
195 // Read in the dictionary source file
196 //
197 long result;
198 long wordFileSize;
199 FILE *file;
200 char *wordBufferC;
201
202 file = fopen(wordFileName, "rb");
203 if( file == 0 ) {
204 fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
205 exit(-1);
206 }
207 fseek(file, 0, SEEK_END);
208 wordFileSize = ftell(file);
209 fseek(file, 0, SEEK_SET);
210 wordBufferC = new char[wordFileSize+10];
211
212 result = (long)fread(wordBufferC, 1, wordFileSize, file);
213 if (result != wordFileSize) {
214 fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
215 exit (-1);
216 }
217 wordBufferC[wordFileSize]=0;
218 fclose(file);
219
220 //
221 // Look for a Unicode Signature (BOM) on the word file
222 //
223 int32_t signatureLength;
224 const char * wordSourceC = wordBufferC;
225 const char* encoding = ucnv_detectUnicodeSignature(
226 wordSourceC, wordFileSize, &signatureLength, &status);
227 if (U_FAILURE(status)) {
228 exit(status);
229 }
230 if(encoding!=NULL ){
231 wordSourceC += signatureLength;
232 wordFileSize -= signatureLength;
233 }
234
235 //
236 // Open a converter to take the rule file to UTF-16
237 //
238 UConverter* conv;
239 conv = ucnv_open(encoding, &status);
240 if (U_FAILURE(status)) {
241 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
242 exit(status);
243 }
244
245 //
246 // Convert the words to UChar.
247 // Preflight first to determine required buffer size.
248 //
249 uint32_t destCap = ucnv_toUChars(conv,
250 NULL, // dest,
251 0, // destCapacity,
252 wordSourceC,
253 wordFileSize,
254 &status);
255 if (status != U_BUFFER_OVERFLOW_ERROR) {
256 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
257 exit(status);
258 };
259
260 status = U_ZERO_ERROR;
261 UChar *wordSourceU = new UChar[destCap+1];
262 ucnv_toUChars(conv,
263 wordSourceU, // dest,
264 destCap+1,
265 wordSourceC,
266 wordFileSize,
267 &status);
268 if (U_FAILURE(status)) {
269 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
270 exit(status);
271 };
272 ucnv_close(conv);
273
274 // Get rid of the original file buffer
275 delete[] wordBufferC;
276
277 // Create a MutableTrieDictionary, and loop through all the lines, inserting
278 // words.
279
280 // First, pick a median character.
281 UChar *current = wordSourceU + (destCap/2);
282 UChar uc = *current++;
283 UnicodeSet breaks;
284 breaks.add(0x000A); // Line Feed
285 breaks.add(0x000D); // Carriage Return
286 breaks.add(0x2028); // Line Separator
287 breaks.add(0x2029); // Paragraph Separator
288
289 do {
290 // Look for line break
291 while (uc && !breaks.contains(uc)) {
292 uc = *current++;
293 }
294 // Now skip to first non-line-break
295 while (uc && breaks.contains(uc)) {
296 uc = *current++;
297 }
298 }
299 while (uc && (breaks.contains(uc) || u_isspace(uc)));
300
301 MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
302
303 if (U_FAILURE(status)) {
304 fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
305 exit(status);
306 }
307
308 // Now add the words. Words are non-space characters at the beginning of
309 // lines, and must be at least one UChar.
310 current = wordSourceU;
311 UChar *candidate = current;
312 uc = *current++;
313 int32_t length = 0;
314
315 while (uc) {
316 while (uc && !u_isspace(uc)) {
317 ++length;
318 uc = *current++;
319 }
320 if (length > 0) {
321 mtd->addWord(candidate, length, status);
322 if (U_FAILURE(status)) {
323 fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
324 u_errorName(status));
325 exit(status);
326 }
327 }
328 // Find beginning of next line
329 while (uc && !breaks.contains(uc)) {
330 uc = *current++;
331 }
332 while (uc && breaks.contains(uc)) {
333 uc = *current++;
334 }
335 candidate = current-1;
336 length = 0;
337 }
338
339 // Get rid of the Unicode text buffer
340 delete[] wordSourceU;
341
342 // Now, create a CompactTrieDictionary from the mutable dictionary
343 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
344 if (U_FAILURE(status)) {
345 fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
346 exit(status);
347 }
348
349 // Get rid of the MutableTrieDictionary
350 delete mtd;
351
352 //
353 // Get the binary data from the dictionary.
354 //
355 uint32_t outDataSize = ctd->dataSize();
356 const uint8_t *outData = (const uint8_t *)ctd->data();
357
358 //
359 // Create the output file
360 //
361 size_t bytesWritten;
362 UNewDataMemory *pData;
363 pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
364 if(U_FAILURE(status)) {
365 fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
366 outFileName, u_errorName(status));
367 exit(status);
368 }
369
370
371 // Write the data itself.
372 udata_writeBlock(pData, outData, outDataSize);
373 // finish up
374 bytesWritten = udata_finish(pData, &status);
375 if(U_FAILURE(status)) {
376 fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
377 exit(status);
378 }
379
380 if (bytesWritten != outDataSize) {
381 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
382 exit(-1);
383 }
384
385 // Get rid of the CompactTrieDictionary
386 delete ctd;
387
388 u_cleanup();
389
390 printf("genctd: tool completed successfully.\n");
391 return 0;
392
393 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
394 }
395
396