1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: gennames.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999sep30
14 * created by: Markus W. Scherer
15 *
16 * This program reads the Unicode character database text file,
17 * parses it, and extracts the character code,
18 * the "modern" character name, and optionally the
19 * Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20 * It then tokenizes and compresses the names and builds
21 * compact binary tables for random-access lookup
22 * in a u_charName() API function.
23 *
24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
25 * (all data is static const)
26 *
27 * UDataInfo fields:
28 * dataFormat "unam"
29 * formatVersion 1.0
30 * dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
31 *
32 * -- data-based names
33 * uint32_t tokenStringOffset,
34 * groupsOffset,
35 * groupStringOffset,
36 * algNamesOffset;
37 *
38 * uint16_t tokenCount;
39 * uint16_t tokenTable[tokenCount];
40 *
41 * char tokenStrings[]; -- padded to even count
42 *
43 * -- strings (groupStrings) are tokenized as follows:
44 * for each character c
45 * if(c>=tokenCount) write that character c directly
46 * else
47 * token=tokenTable[c];
48 * if(token==0xfffe) -- lead byte of double-byte token
49 * token=tokenTable[c<<8|next character];
50 * if(token==-1)
51 * write c directly
52 * else
53 * tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54 * append zero-terminated tokenString;
55 *
56 * Different strings for a code point - normal name, 1.0 name, and ISO comment -
57 * are separated by ';'.
58 *
59 * uint16_t groupCount;
60 * struct {
61 * uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62 * uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
63 * uint16_t offsetLow;
64 * } groupTable[groupCount];
65 *
66 * char groupStrings[]; -- padded to 4-count
67 *
68 * -- The actual, tokenized group strings are not zero-terminated because
69 * that would take up too much space.
70 * Instead, they are preceded by their length, written in a variable-length sequence:
71 * For each of the 32 group strings, one or two nibbles are stored for its length.
72 * Nibbles (4-bit values, half-bytes) are read MSB first.
73 * A nibble with a value of 0..11 directly indicates the length of the name string.
74 * A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75 * by (((n-12)<<4)|m)+12, reaching values of 12..75.
76 * These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77 * For the de-tokenizing, see token description above; the strings immediately follow the
78 * 32 lengths.
79 *
80 * -- algorithmic names
81 *
82 * typedef struct AlgorithmicRange {
83 * uint32_t rangeStart, rangeEnd;
84 * uint8_t algorithmType, algorithmVariant;
85 * uint16_t rangeSize;
86 * } AlgorithmicRange;
87 *
88 * uint32_t algRangesCount; -- number of data blocks for ranges of
89 * algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90 *
91 * struct {
92 * AlgorithmicRange algRange;
93 * uint8_t algRangeData[]; -- padded to 4-count except in last range
94 * } algRanges[algRangesCount];
95 * -- not a real array because each part has a different size
96 * of algRange.rangeSize (including AlgorithmicRange)
97 *
98 * -- algorithmic range types:
99 *
100 * 0 Names are formed from a string prefix that is stored in
101 * the algRangeData (zero-terminated), followed by the Unicode code point
102 * of the character in hexadecimal digits;
103 * algRange.algorithmVariant digits are written
104 *
105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
106 * algRange.algorithmVariant is the count of modulo factors
107 * algRangeData contains
108 * uint16_t factors[algRange.algorithmVariant];
109 * char strings[];
110 * the first zero-terminated string is written as the prefix; then:
111 *
112 * The rangeStart is subtracted; with the difference, here "code":
113 * for(i=algRange.algorithmVariant-1 to 0 step -1)
114 * index[i]=code%factor[i];
115 * code/=factor[i];
116 *
117 * The strings after the prefix are short pieces that are then appended to the result
118 * according to index[0..algRange.algorithmVariant-1].
119 */
120
121 #include <stdio.h>
122 #include "unicode/utypes.h"
123 #include "unicode/putil.h"
124 #include "unicode/uclean.h"
125 #include "unicode/udata.h"
126 #include "cmemory.h"
127 #include "cstring.h"
128 #include "uarrsort.h"
129 #include "unewdata.h"
130 #include "uoptions.h"
131 #include "uparse.h"
132
/* Element count of a true array (not a pointer), as a signed 32-bit value. */
#define LENGTHOF(array)     (int32_t)(sizeof(array)/sizeof((array)[0]))

/* Byte capacities of stringStore[] and groupStore[]. */
#define STRING_STORE_SIZE   1000000
#define GROUP_STORE_SIZE    5000

/*
 * Names are stored in groups of 2^GROUP_SHIFT consecutive code points;
 * GROUP_MASK extracts the position of a code point inside its group.
 */
#define GROUP_SHIFT         5
#define LINES_PER_GROUP     (1UL<<GROUP_SHIFT)
#define GROUP_MASK          (LINES_PER_GROUP-1)

/* Capacities of lines[] and words[]; MAX_GROUP_COUNT is not used in the visible code. */
#define MAX_LINE_COUNT      50000
#define MAX_WORD_COUNT      20000
#define MAX_GROUP_COUNT     5000

/* Output file is DATA_NAME "." DATA_TYPE; the names of one code point are separated by ';'. */
#define DATA_NAME           "unames"
#define DATA_TYPE           "icu"
#define VERSION_STRING      "unam"
#define NAME_SEPARATOR_CHAR ';'

/* Output file base name used when only ISO comments are stored. */
#define ISO_DATA_NAME       "ucomment"
152
153 /* Unicode versions --------------------------------------------------------- */
154
/*
 * Indexes of the known Unicode versions, in ascending version order.
 * Each constant is the index of the matching row in unicodeVersions[];
 * UNI_VER_COUNT is the number of rows, not a version.
 */
enum {
    UNI_1_0, UNI_1_1,
    UNI_2_0,
    UNI_3_0, UNI_3_1, UNI_3_2,
    UNI_4_0, UNI_4_0_1, UNI_4_1,
    UNI_5_0, UNI_5_1, UNI_5_2,
    UNI_VER_COUNT
};
170
171 static const UVersionInfo
172 unicodeVersions[]={
173 { 1, 0, 0, 0 },
174 { 1, 1, 0, 0 },
175 { 2, 0, 0, 0 },
176 { 3, 0, 0, 0 },
177 { 3, 1, 0, 0 },
178 { 3, 2, 0, 0 },
179 { 4, 0, 0, 0 },
180 { 4, 0, 1, 0 },
181 { 4, 1, 0, 0 },
182 { 5, 0, 0, 0 },
183 { 5, 1, 0, 0 },
184 { 5, 2, 0, 0 }
185 };
186
187 static int32_t ucdVersion=UNI_5_2;
188
189 static int32_t
findUnicodeVersion(const UVersionInfo version)190 findUnicodeVersion(const UVersionInfo version) {
191 int32_t i;
192
193 for(i=0; /* while(version>unicodeVersions[i]) {} */
194 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
195 ++i) {}
196 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
197 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
198 }
199 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
200 }
201
202 /* generator data ----------------------------------------------------------- */
203
204 /* UDataInfo cf. udata.h */
205 static UDataInfo dataInfo={
206 sizeof(UDataInfo),
207 0,
208
209 U_IS_BIG_ENDIAN,
210 U_CHARSET_FAMILY,
211 sizeof(UChar),
212 0,
213
214 {0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
215 {1, 0, 0, 0}, /* formatVersion */
216 {3, 0, 0, 0} /* dataVersion */
217 };
218
219 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
220
221 typedef struct Options {
222 UBool storeNames;
223 UBool store10Names;
224 UBool storeISOComments;
225 } Options;
226
/*
 * One name alias: the code point and its NUL-terminated alias string.
 * Try to keep sizeof(CpNameAlias) a multiple of 4 to avoid padding.
 */
typedef struct CpNameAlias {
    uint32_t code;
    char nameAlias[124];
} CpNameAlias;

/* Aliases collected from NameAliases.txt, in ascending code point order. */
static CpNameAlias cpNameAliases[50];

/* Number of entries filled in cpNameAliases[]. */
static uint32_t cpNameAliasesTop=0;

/* Next alias that has not yet been matched to a UnicodeData.txt line. */
static uint32_t cpNameAliasesIndex=0;
239
240 static uint8_t stringStore[STRING_STORE_SIZE],
241 groupStore[GROUP_STORE_SIZE],
242 lineLengths[LINES_PER_GROUP];
243
244 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
245
/* The stored strings of one code point; s is not NUL-terminated, length gives its size. */
typedef struct {
    uint32_t code;      /* code point */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* start of the string data */
} Line;

/* One dictionary word found in the names, a candidate for replacement by a token. */
typedef struct {
    int32_t weight;     /* -(cost for token) + (number of occurrences) * (length-1) */
    int16_t count;      /* number of occurrences */
    int16_t length;     /* number of bytes at s */
    uint8_t *s;         /* start of the word data */
} Word;
258
259 static Line lines[MAX_LINE_COUNT];
260 static Word words[MAX_WORD_COUNT];
261
262 static uint32_t lineCount=0, wordCount=0;
263
264 static int16_t leadByteCount;
265
266 #define LEADBYTE_LIMIT 16
267
268 static int16_t tokens[LEADBYTE_LIMIT*256];
269 static uint32_t tokenCount;
270
271 /* prototypes --------------------------------------------------------------- */
272
273 static void
274 init(void);
275
276 static void
277 parseNameAliases(const char *filename, Options *options);
278
279 static void
280 parseDB(const char *filename, Options *options);
281
282 static void
283 parseName(char *name, int16_t length);
284
285 static int16_t
286 skipNoise(char *line, int16_t start, int16_t limit);
287
288 static int16_t
289 getWord(char *line, int16_t start, int16_t limit);
290
291 static void
292 compress(void);
293
294 static void
295 compressLines(void);
296
297 static int16_t
298 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
299
300 static int32_t
301 compareWords(const void *context, const void *word1, const void *word2);
302
303 static void
304 generateData(const char *dataDir, Options *options);
305
306 static uint32_t
307 generateAlgorithmicData(UNewDataMemory *pData, Options *options);
308
309 static int16_t
310 findToken(uint8_t *s, int16_t length);
311
312 static Word *
313 findWord(char *s, int16_t length);
314
315 static Word *
316 addWord(char *s, int16_t length);
317
318 static void
319 countWord(Word *word);
320
321 static void
322 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
323
324 static void
325 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
326
327 static uint32_t
328 addToken(uint8_t *s, int16_t length);
329
330 static void
331 appendLineLength(int16_t length);
332
333 static void
334 appendLineLengthNibble(uint8_t nibble);
335
336 static uint8_t *
337 allocLine(int32_t length);
338
339 static uint8_t *
340 allocWord(uint32_t length);
341
342 /* -------------------------------------------------------------------------- */
343
344 enum {
345 HELP_H,
346 HELP_QUESTION_MARK,
347 VERBOSE,
348 QUIET,
349 COPYRIGHT,
350 DESTDIR,
351 UNICODE,
352 UNICODE1_NAMES,
353 NO_ISO_COMMENTS,
354 ONLY_ISO_COMMENTS
355 };
356
357 static UOption options[]={
358 UOPTION_HELP_H,
359 UOPTION_HELP_QUESTION_MARK,
360 UOPTION_VERBOSE,
361 UOPTION_QUIET,
362 UOPTION_COPYRIGHT,
363 UOPTION_DESTDIR,
364 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
365 { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
366 { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
367 { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
368 };
369
370 extern int
main(int argc,char * argv[])371 main(int argc, char* argv[]) {
372 UVersionInfo version;
373 Options moreOptions={ TRUE, FALSE, TRUE };
374 UErrorCode errorCode = U_ZERO_ERROR;
375
376 U_MAIN_INIT_ARGS(argc, argv);
377
378 /* Initialize ICU */
379 u_init(&errorCode);
380 if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
381 /* Note: u_init() will try to open ICU property data.
382 * failures here are expected when building ICU from scratch.
383 * ignore them.
384 */
385 fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
386 argv[0], u_errorName(errorCode));
387 exit(1);
388 }
389
390 /* preset then read command line options */
391 options[DESTDIR].value=u_getDataDirectory();
392 options[UNICODE].value="4.1";
393 argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
394
395 /* error handling, printing usage message */
396 if(argc<0) {
397 fprintf(stderr,
398 "error in command line argument \"%s\"\n",
399 argv[-argc]);
400 } else if(argc<2) {
401 argc=-1;
402 }
403 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
404 /*
405 * Broken into chucks because the C89 standard says the minimum
406 * required supported string length is 509 bytes.
407 */
408 fprintf(stderr,
409 "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] [filename_ud [filename_na]]\n"
410 "\n"
411 "Read the UnicodeData.txt file and \n"
412 "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
413 "\n"
414 "\tfilename_ud absolute path/filename for the UnicodeData.txt file\n"
415 "\t (default: standard input)\n"
416 "\tfilename_na absolute path/filename for the NameAliases.txt file\n"
417 "\t (default: no name aliases)\n"
418 "\n",
419 argv[0]);
420 fprintf(stderr,
421 "Options:\n"
422 "\t-h or -? or --help this usage text\n"
423 "\t-v or --verbose verbose output\n"
424 "\t-q or --quiet no output\n"
425 "\t-c or --copyright include a copyright notice\n"
426 "\t-d or --destdir destination directory, followed by the path\n"
427 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
428 fprintf(stderr,
429 "\t-1 or --unicode1-names store Unicode 1.0 character names\n"
430 "\t --no-iso-comments do not store ISO comments\n"
431 "\t --only-iso-comments write ucomment.icu with only ISO comments\n");
432 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
433 }
434
435 /* get the options values */
436 beVerbose=options[VERBOSE].doesOccur;
437 beQuiet=options[QUIET].doesOccur;
438 haveCopyright=options[COPYRIGHT].doesOccur;
439 moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
440 moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
441 if(options[ONLY_ISO_COMMENTS].doesOccur) {
442 moreOptions.storeNames=moreOptions.store10Names=FALSE;
443 moreOptions.storeISOComments=TRUE;
444 }
445
446 /* set the Unicode version */
447 u_versionFromString(version, options[UNICODE].value);
448 uprv_memcpy(dataInfo.dataVersion, version, 4);
449 ucdVersion=findUnicodeVersion(version);
450
451 init();
452 if(argc>=3) {
453 parseNameAliases(argv[2], &moreOptions);
454 }
455 parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
456 compress();
457 generateData(options[DESTDIR].value, &moreOptions);
458
459 u_cleanup();
460 return 0;
461 }
462
463 static void
init()464 init() {
465 int i;
466
467 for(i=0; i<256; ++i) {
468 tokens[i]=0;
469 }
470 }
471
472 /* parsing ------------------------------------------------------------------ */
473
/*
 * Trim a field to the name inside it.
 *
 * On input, *pStart and limit delimit the field. Leading whitespace is
 * skipped with u_skipWhitespace(); trailing spaces and tabs are cut off.
 * On output, *pStart points to the first character of the name.
 * Returns the length of the trimmed name.
 */
static int16_t
getName(char **pStart, char *limit) {
    char *first=(char *)u_skipWhitespace(*pStart);
    char *end=limit;

    /* move the end back over trailing spaces and tabs, but never before first */
    while(first<end) {
        char last=end[-1];
        if(last!=' ' && last!='\t') {
            break;
        }
        --end;
    }

    *pStart=first;
    return (int16_t)(end-first);
}
489
490 static void U_CALLCONV
nameAliasesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)491 nameAliasesLineFn(void *context,
492 char *fields[][2], int32_t fieldCount,
493 UErrorCode *pErrorCode) {
494 char *name;
495 int16_t length=0;
496 static uint32_t prevCode=0;
497 uint32_t code=0;
498
499 if(U_FAILURE(*pErrorCode)) {
500 return;
501 }
502 /* get the character code */
503 code=uprv_strtoul(fields[0][0], NULL, 16);
504
505 /* get the character name */
506 name=fields[1][0];
507 length=getName(&name, fields[1][1]);
508 if(length==0 || length>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
509 fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
510 name, (unsigned long)code);
511 *pErrorCode=U_PARSE_ERROR;
512 exit(U_PARSE_ERROR);
513 }
514
515 /* check for non-character code points */
516 if(!U_IS_UNICODE_CHAR(code)) {
517 fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n",
518 (unsigned long)code);
519 *pErrorCode=U_PARSE_ERROR;
520 exit(U_PARSE_ERROR);
521 }
522
523 /* check that the code points (code) are in ascending order */
524 if(code<=prevCode && code>0) {
525 fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n",
526 (unsigned long)code, (unsigned long)prevCode);
527 *pErrorCode=U_PARSE_ERROR;
528 exit(U_PARSE_ERROR);
529 }
530 prevCode=code;
531
532 if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) {
533 fprintf(stderr, "gennames: error - too many name aliases\n");
534 *pErrorCode=U_PARSE_ERROR;
535 exit(U_PARSE_ERROR);
536 }
537 cpNameAliases[cpNameAliasesTop].code=code;
538 uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length);
539 cpNameAliases[cpNameAliasesTop].nameAlias[length]=0;
540 ++cpNameAliasesTop;
541
542 parseName(name, length);
543 }
544
545 static void U_CALLCONV
lineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)546 lineFn(void *context,
547 char *fields[][2], int32_t fieldCount,
548 UErrorCode *pErrorCode) {
549 Options *storeOptions=(Options *)context;
550 char *names[4];
551 int16_t lengths[4]={ 0, 0, 0, 0 };
552 static uint32_t prevCode=0;
553 uint32_t code=0;
554
555 if(U_FAILURE(*pErrorCode)) {
556 return;
557 }
558 /* get the character code */
559 code=uprv_strtoul(fields[0][0], NULL, 16);
560
561 /* get the character name */
562 if(storeOptions->storeNames) {
563 names[0]=fields[1][0];
564 lengths[0]=getName(names+0, fields[1][1]);
565 if(names[0][0]=='<') {
566 /* do not store pseudo-names in <> brackets */
567 lengths[0]=0;
568 }
569 }
570
571 /* store 1.0 names */
572 /* get the second character name, the one from Unicode 1.0 */
573 if(storeOptions->store10Names) {
574 names[1]=fields[10][0];
575 lengths[1]=getName(names+1, fields[10][1]);
576 if(names[1][0]=='<') {
577 /* do not store pseudo-names in <> brackets */
578 lengths[1]=0;
579 }
580 }
581
582 /* get the ISO 10646 comment */
583 if(storeOptions->storeISOComments) {
584 names[2]=fields[11][0];
585 lengths[2]=getName(names+2, fields[11][1]);
586 }
587
588 if(lengths[0]+lengths[1]+lengths[2]==0) {
589 return;
590 }
591
592 /* check for non-character code points */
593 if(!U_IS_UNICODE_CHAR(code)) {
594 fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
595 (unsigned long)code);
596 *pErrorCode=U_PARSE_ERROR;
597 exit(U_PARSE_ERROR);
598 }
599
600 /* check that the code points (code) are in ascending order */
601 if(code<=prevCode && code>0) {
602 fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
603 (unsigned long)code, (unsigned long)prevCode);
604 *pErrorCode=U_PARSE_ERROR;
605 exit(U_PARSE_ERROR);
606 }
607 prevCode=code;
608
609 parseName(names[0], lengths[0]);
610 parseName(names[1], lengths[1]);
611 parseName(names[2], lengths[2]);
612
613 if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) {
614 if(code==cpNameAliases[cpNameAliasesIndex].code) {
615 names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias;
616 lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias);
617 ++cpNameAliasesIndex;
618 } else {
619 fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
620 (unsigned long)code);
621 *pErrorCode=U_PARSE_ERROR;
622 exit(U_PARSE_ERROR);
623 }
624 }
625
626 /*
627 * set the count argument to
628 * 1: only store regular names, or only store ISO 10646 comments
629 * 2: store regular and 1.0 names
630 * 3: store names and ISO 10646 comment
631 * 4: also store name alias
632 *
633 * addLine() will ignore empty trailing names
634 */
635 if(storeOptions->storeNames) {
636 /* store names and comments as parsed according to storeOptions */
637 addLine(code, names, lengths, LENGTHOF(names));
638 } else {
639 /* store only ISO 10646 comments */
640 addLine(code, names+2, lengths+2, 1);
641 }
642 }
643
644 static void
parseNameAliases(const char * filename,Options * storeOptions)645 parseNameAliases(const char *filename, Options *storeOptions) {
646 char *fields[2][2];
647 UErrorCode errorCode=U_ZERO_ERROR;
648
649 if(!storeOptions->storeNames) {
650 return;
651 }
652 u_parseDelimitedFile(filename, ';', fields, 2, nameAliasesLineFn, NULL, &errorCode);
653 if(U_FAILURE(errorCode)) {
654 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
655 exit(errorCode);
656 }
657
658 if(!beQuiet) {
659 printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop);
660 }
661 }
662
663 static void
parseDB(const char * filename,Options * storeOptions)664 parseDB(const char *filename, Options *storeOptions) {
665 char *fields[15][2];
666 UErrorCode errorCode=U_ZERO_ERROR;
667
668 u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
669 if(U_FAILURE(errorCode)) {
670 fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
671 exit(errorCode);
672 }
673 if(cpNameAliasesIndex<cpNameAliasesTop) {
674 fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
675 (unsigned long)cpNameAliases[cpNameAliasesIndex].code);
676 exit(U_PARSE_ERROR);
677 }
678
679 if(!beQuiet) {
680 printf("size of all names in the database: %lu\n",
681 (unsigned long)lineTop);
682 printf("number of named Unicode characters: %lu\n",
683 (unsigned long)lineCount);
684 printf("number of words in the dictionary from these names: %lu\n",
685 (unsigned long)wordCount);
686 }
687 }
688
689 static void
parseName(char * name,int16_t length)690 parseName(char *name, int16_t length) {
691 int16_t start=0, limit, wordLength/*, prevStart=-1*/;
692 Word *word;
693
694 while(start<length) {
695 /* skip any "noise" characters */
696 limit=skipNoise(name, start, length);
697 if(start<limit) {
698 /*prevStart=-1;*/
699 start=limit;
700 }
701 if(start==length) {
702 break;
703 }
704
705 /* get a word and add it if it is longer than 1 */
706 limit=getWord(name, start, length);
707 wordLength=(int16_t)(limit-start);
708 if(wordLength>1) {
709 word=findWord(name+start, wordLength);
710 if(word==NULL) {
711 word=addWord(name+start, wordLength);
712 }
713 countWord(word);
714 }
715
716 #if 0
717 /*
718 * if there was a word before this
719 * (with no noise in between), then add the pair of words, too
720 */
721 if(prevStart!=-1) {
722 wordLength=limit-prevStart;
723 word=findWord(name+prevStart, wordLength);
724 if(word==NULL) {
725 word=addWord(name+prevStart, wordLength);
726 }
727 countWord(word);
728 }
729 #endif
730
731 /*prevStart=start;*/
732 start=limit;
733 }
734 }
735
736 static UBool U_INLINE
isWordChar(char c)737 isWordChar(char c) {
738 return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
739 ('J'<=c && c<='R') ||
740 ('S'<=c && c<='Z') ||
741
742 ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
743 ('j'<=c && c<='r') ||
744 ('s'<=c && c<='z') ||
745
746 ('0'<=c && c<='9');
747 }
748
/*
 * Return the index of the first word character in line[start..limit[,
 * or limit if there is none. Everything that is not a word character
 * in the sense of isWordChar() counts as noise.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    for(pos=start; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
758
/*
 * Return the index just behind the word that starts at line[start].
 * A word is a run of word characters (see isWordChar()) plus, if present,
 * one directly following space or dash. The result never exceeds limit.
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    int16_t pos=start;

    /* the run of word characters */
    while(pos<limit && isWordChar(line[pos])) {
        ++pos;
    }

    /* include a following space or dash */
    if(pos<limit) {
        char next=line[pos];
        if(next==' ' || next=='-') {
            ++pos;
        }
    }

    return pos;
}
775
776 /* compressing -------------------------------------------------------------- */
777
778 static void
compress()779 compress() {
780 uint32_t i, letterCount;
781 int16_t wordNumber;
782 UErrorCode errorCode;
783
784 /* sort the words in reverse order by weight */
785 errorCode=U_ZERO_ERROR;
786 uprv_sortArray(words, wordCount, sizeof(Word),
787 compareWords, NULL, FALSE, &errorCode);
788
789 /* remove the words that do not save anything */
790 while(wordCount>0 && words[wordCount-1].weight<1) {
791 --wordCount;
792 }
793
794 /* count the letters in the token range */
795 letterCount=0;
796 for(i=LEADBYTE_LIMIT; i<256; ++i) {
797 if(tokens[i]==-1) {
798 ++letterCount;
799 }
800 }
801 if(!beQuiet) {
802 printf("number of letters used in the names: %d\n", (int)letterCount);
803 }
804
805 /* do we need double-byte tokens? */
806 if(wordCount+letterCount<=256) {
807 /* no, single-byte tokens are enough */
808 leadByteCount=0;
809 for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
810 if(tokens[i]!=-1) {
811 tokens[i]=wordNumber;
812 if(beVerbose) {
813 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
814 (int)i, (long)words[wordNumber].weight,
815 words[wordNumber].length, words[wordNumber].s);
816 }
817 ++wordNumber;
818 }
819 }
820 tokenCount=i;
821 } else {
822 /*
823 * The tokens that need two token bytes
824 * get their weight reduced by their count
825 * because they save less.
826 */
827 tokenCount=256-letterCount;
828 for(i=tokenCount; i<wordCount; ++i) {
829 words[i].weight-=words[i].count;
830 }
831
832 /* sort these words in reverse order by weight */
833 errorCode=U_ZERO_ERROR;
834 uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
835 compareWords, NULL, FALSE, &errorCode);
836
837 /* remove the words that do not save anything */
838 while(wordCount>0 && words[wordCount-1].weight<1) {
839 --wordCount;
840 }
841
842 /* how many tokens and lead bytes do we have now? */
843 tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
844 /*
845 * adjust upwards to take into account that
846 * double-byte tokens must not
847 * use NAME_SEPARATOR_CHAR as a second byte
848 */
849 tokenCount+=(tokenCount-256+254)/255;
850
851 leadByteCount=(int16_t)(tokenCount>>8);
852 if(leadByteCount<LEADBYTE_LIMIT) {
853 /* adjust for the real number of lead bytes */
854 tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
855 } else {
856 /* limit the number of lead bytes */
857 leadByteCount=LEADBYTE_LIMIT-1;
858 tokenCount=LEADBYTE_LIMIT*256;
859 wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
860 /* adjust again to skip double-byte tokens with ';' */
861 wordCount-=(tokenCount-256+254)/255;
862 }
863
864 /* set token 0 to word 0 */
865 tokens[0]=0;
866 if(beVerbose) {
867 printf("tokens[0x000]: word%8ld \"%.*s\"\n",
868 (long)words[0].weight,
869 words[0].length, words[0].s);
870 }
871 wordNumber=1;
872
873 /* set the lead byte tokens */
874 for(i=1; (int16_t)i<=leadByteCount; ++i) {
875 tokens[i]=-2;
876 }
877
878 /* set the tokens */
879 for(; i<256; ++i) {
880 /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
881 if(tokens[i]!=-1) {
882 tokens[i]=wordNumber;
883 if(beVerbose) {
884 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
885 (int)i, (long)words[wordNumber].weight,
886 words[wordNumber].length, words[wordNumber].s);
887 }
888 ++wordNumber;
889 }
890 }
891
892 /* continue above 255 where there are no letters */
893 for(; (uint32_t)wordNumber<wordCount; ++i) {
894 if((i&0xff)==NAME_SEPARATOR_CHAR) {
895 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
896 } else {
897 tokens[i]=wordNumber;
898 if(beVerbose) {
899 printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
900 (int)i, (long)words[wordNumber].weight,
901 words[wordNumber].length, words[wordNumber].s);
902 }
903 ++wordNumber;
904 }
905 }
906 tokenCount=i; /* should be already tokenCount={i or i+1} */
907 }
908
909 if(!beQuiet) {
910 printf("number of lead bytes: %d\n", leadByteCount);
911 printf("number of single-byte tokens: %lu\n",
912 (unsigned long)256-letterCount-leadByteCount);
913 printf("number of tokens: %lu\n", (unsigned long)tokenCount);
914 }
915
916 compressLines();
917 }
918
919 static void
compressLines()920 compressLines() {
921 Line *line=NULL;
922 uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
923 groupMSB=0xffff, lineCount2;
924 int16_t groupTop=0;
925
926 /* store the groups like lines, with compressed data after raw strings */
927 groupBottom=lineTop;
928 lineCount2=lineCount;
929 lineCount=0;
930
931 /* loop over all lines */
932 while(i<lineCount2) {
933 line=lines+i++;
934 inLine=line->code;
935
936 /* segment the lines to groups of 32 */
937 if(inLine>>GROUP_SHIFT!=groupMSB) {
938 /* finish the current group with empty lines */
939 while((++outLine&GROUP_MASK)!=0) {
940 appendLineLength(0);
941 }
942
943 /* store the group like a line */
944 if(groupTop>0) {
945 if(groupTop>GROUP_STORE_SIZE) {
946 fprintf(stderr, "gennames: group store overflow\n");
947 exit(U_BUFFER_OVERFLOW_ERROR);
948 }
949 addGroup(groupMSB, groupStore, groupTop);
950 }
951
952 /* start the new group */
953 lineLengthsTop=0;
954 groupTop=0;
955 groupMSB=inLine>>GROUP_SHIFT;
956 outLine=(inLine&~GROUP_MASK)-1;
957 }
958
959 /* write empty lines between the previous line in the group and this one */
960 while(++outLine<inLine) {
961 appendLineLength(0);
962 }
963
964 /* write characters and tokens for this line */
965 appendLineLength(compressLine(line->s, line->length, &groupTop));
966 }
967
968 /* finish and store the last group */
969 if(line && groupMSB!=0xffff) {
970 /* finish the current group with empty lines */
971 while((++outLine&GROUP_MASK)!=0) {
972 appendLineLength(0);
973 }
974
975 /* store the group like a line */
976 if(groupTop>0) {
977 if(groupTop>GROUP_STORE_SIZE) {
978 fprintf(stderr, "gennames: group store overflow\n");
979 exit(U_BUFFER_OVERFLOW_ERROR);
980 }
981 addGroup(groupMSB, groupStore, groupTop);
982 }
983 }
984
985 if(!beQuiet) {
986 printf("number of groups: %lu\n", (unsigned long)lineCount);
987 }
988 }
989
990 static int16_t
compressLine(uint8_t * s,int16_t length,int16_t * pGroupTop)991 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
992 int16_t start, limit, token, groupTop=*pGroupTop;
993
994 start=0;
995 do {
996 /* write any "noise" characters */
997 limit=skipNoise((char *)s, start, length);
998 while(start<limit) {
999 groupStore[groupTop++]=s[start++];
1000 }
1001
1002 if(start==length) {
1003 break;
1004 }
1005
1006 /* write a word, as token or directly */
1007 limit=getWord((char *)s, start, length);
1008 if(limit-start==1) {
1009 groupStore[groupTop++]=s[start++];
1010 } else {
1011 token=findToken(s+start, (int16_t)(limit-start));
1012 if(token!=-1) {
1013 if(token>0xff) {
1014 groupStore[groupTop++]=(uint8_t)(token>>8);
1015 }
1016 groupStore[groupTop++]=(uint8_t)token;
1017 start=limit;
1018 } else {
1019 while(start<limit) {
1020 groupStore[groupTop++]=s[start++];
1021 }
1022 }
1023 }
1024 } while(start<length);
1025
1026 length=(int16_t)(groupTop-*pGroupTop);
1027 *pGroupTop=groupTop;
1028 return length;
1029 }
1030
1031 static int32_t
compareWords(const void * context,const void * word1,const void * word2)1032 compareWords(const void *context, const void *word1, const void *word2) {
1033 /* reverse sort by word weight */
1034 return ((Word *)word2)->weight-((Word *)word1)->weight;
1035 }
1036
1037 /* generate output data ----------------------------------------------------- */
1038
1039 static void
generateData(const char * dataDir,Options * storeOptions)1040 generateData(const char *dataDir, Options *storeOptions) {
1041 UNewDataMemory *pData;
1042 UErrorCode errorCode=U_ZERO_ERROR;
1043 uint16_t groupWords[3];
1044 uint32_t i, groupTop=lineTop, offset, size,
1045 tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
1046 long dataLength;
1047 int16_t token;
1048
1049 pData=udata_create(dataDir,
1050 DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
1051 &dataInfo,
1052 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1053 if(U_FAILURE(errorCode)) {
1054 fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
1055 exit(errorCode);
1056 }
1057
1058 /* first, see how much space we need, and prepare the token strings */
1059 for(i=0; i<tokenCount; ++i) {
1060 token=tokens[i];
1061 if(token!=-1 && token!=-2) {
1062 tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
1063 }
1064 }
1065
1066 /*
1067 * Required padding for data swapping:
1068 * The token table undergoes a permutation during data swapping when the
1069 * input and output charsets are different.
1070 * The token table cannot grow during swapping, so we need to make sure that
1071 * the table is long enough for successful in-place permutation.
1072 *
1073 * We simply round up tokenCount to the next multiple of 256 to account for
1074 * all possible permutations.
1075 *
1076 * An optimization is possible if we only ever swap between ASCII and EBCDIC:
1077 *
1078 * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
1079 * and will be swapped between ASCII and EBCDIC between
1080 * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
1081 * This should be the only -1 entry in tokens[256..511] on which the data
1082 * swapper bases its trail byte permutation map (trailMap[]).
1083 *
1084 * It would be sufficient to increase tokenCount so that its lower 8 bits
1085 * are at least 0x5e+1 to make room for swapping between the two semicolons.
1086 * For values higher than 0x5e, the trail byte permutation map (trailMap[])
1087 * should always be an identity map, where we do not need additional room.
1088 */
1089 i=tokenCount;
1090 tokenCount=(tokenCount+0xff)&~0xff;
1091 if(!beQuiet && i<tokenCount) {
1092 printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
1093 }
1094 for(; i<tokenCount; ++i) {
1095 if((i&0xff)==NAME_SEPARATOR_CHAR) {
1096 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
1097 } else {
1098 tokens[i]=0; /* unused token for padding */
1099 }
1100 }
1101
1102 /*
1103 * Calculate the total size in bytes of the data including:
1104 * - the offset to the token strings, uint32_t (4)
1105 * - the offset to the group table, uint32_t (4)
1106 * - the offset to the group strings, uint32_t (4)
1107 * - the offset to the algorithmic names, uint32_t (4)
1108 *
1109 * - the number of tokens, uint16_t (2)
1110 * - the token table, uint16_t[tokenCount] (2*tokenCount)
1111 *
1112 * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
1113 *
1114 * - the number of groups, uint16_t (2)
1115 * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1116 *
1117 * - the group strings (groupTop-groupBottom), 2-padded
1118 *
1119 * - the size of the data for the algorithmic names
1120 */
1121 tokenStringOffset=4+4+4+4+2+2*tokenCount;
1122 groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
1123 groupStringOffset=groupsOffset+2+6*lineCount;
1124 algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
1125
1126 offset=generateAlgorithmicData(NULL, storeOptions);
1127 size=algNamesOffset+offset;
1128
1129 if(!beQuiet) {
1130 printf("size of the Unicode Names data:\n"
1131 "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1132 (unsigned long)size, (unsigned long)(lineTop-groupTop),
1133 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
1134 }
1135
1136 /* write the data to the file */
1137 /* offsets */
1138 udata_write32(pData, tokenStringOffset);
1139 udata_write32(pData, groupsOffset);
1140 udata_write32(pData, groupStringOffset);
1141 udata_write32(pData, algNamesOffset);
1142
1143 /* token table */
1144 udata_write16(pData, (uint16_t)tokenCount);
1145 udata_writeBlock(pData, tokens, 2*tokenCount);
1146
1147 /* token strings */
1148 udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
1149 if((lineTop-groupTop)&1) {
1150 /* 2-padding */
1151 udata_writePadding(pData, 1);
1152 }
1153
1154 /* group table */
1155 udata_write16(pData, (uint16_t)lineCount);
1156 for(i=0; i<lineCount; ++i) {
1157 /* groupMSB */
1158 groupWords[0]=(uint16_t)lines[i].code;
1159
1160 /* offset */
1161 offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
1162 groupWords[1]=(uint16_t)(offset>>16);
1163 groupWords[2]=(uint16_t)(offset);
1164 udata_writeBlock(pData, groupWords, 6);
1165 }
1166
1167 /* group strings */
1168 udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
1169
1170 /* 4-align the algorithmic names data */
1171 udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
1172
1173 generateAlgorithmicData(pData, storeOptions);
1174
1175 /* finish up */
1176 dataLength=udata_finish(pData, &errorCode);
1177 if(U_FAILURE(errorCode)) {
1178 fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1179 exit(errorCode);
1180 }
1181
1182 if(dataLength!=(long)size) {
1183 fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1184 dataLength, (unsigned long)size);
1185 exit(U_INTERNAL_PROGRAM_ERROR);
1186 }
1187 }
1188
/*
 * Header of one range of algorithmic names.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart;        /* first code point of the range */
    uint32_t rangeEnd;          /* last code point of the range */
    uint8_t algorithmType;
    uint8_t algorithmVariant;
    uint16_t rangeSize;         /* size of this header plus the range data */
} AlgorithmicRange;
1195
1196 static uint32_t
generateAlgorithmicData(UNewDataMemory * pData,Options * storeOptions)1197 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
1198 static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1199 # define PREFIX_LENGTH 23
1200 # define PREFIX_LENGTH_4 24
1201 uint32_t countAlgRanges;
1202
1203 static AlgorithmicRange cjkExtA={
1204 0x3400, 0x4db5,
1205 0, 4,
1206 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1207 };
1208 static AlgorithmicRange cjk={
1209 0x4e00, 0x9fa5,
1210 0, 4,
1211 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1212 };
1213 static AlgorithmicRange cjkExtB={
1214 0x20000, 0x2a6d6,
1215 0, 5,
1216 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1217 };
1218 static AlgorithmicRange cjkExtC={
1219 0x2a700, 0x2b734,
1220 0, 5,
1221 sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1222 };
1223
1224 static char jamo[]=
1225 "HANGUL SYLLABLE \0"
1226
1227 "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1228 "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1229
1230 "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1231 "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1232 "YU\0EU\0YI\0I\0"
1233
1234 "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1235 "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1236 "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1237 ;
1238
1239 static AlgorithmicRange hangul={
1240 0xac00, 0xd7a3,
1241 1, 3,
1242 sizeof(AlgorithmicRange)+6+sizeof(jamo)
1243 };
1244
1245 /* modulo factors, maximum 8 */
1246 /* 3 factors: 19, 21, 28, most-to-least-significant */
1247 static uint16_t hangulFactors[3]={
1248 19, 21, 28
1249 };
1250
1251 uint32_t size;
1252
1253 size=0;
1254
1255 if(ucdVersion>=UNI_5_2) {
1256 /* Unicode 5.2 and up has a longer CJK Unihan range than before */
1257 cjk.rangeEnd=0x9FCB;
1258 } else if(ucdVersion>=UNI_5_1) {
1259 /* Unicode 5.1 and up has a longer CJK Unihan range than before */
1260 cjk.rangeEnd=0x9FC3;
1261 } else if(ucdVersion>=UNI_4_1) {
1262 /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1263 cjk.rangeEnd=0x9FBB;
1264 }
1265
1266 /* number of ranges of algorithmic names */
1267 if(!storeOptions->storeNames) {
1268 countAlgRanges=0;
1269 } else if(ucdVersion>=UNI_5_2) {
1270 /* Unicode 5.2 and up has 5 ranges including CJK Extension C */
1271 countAlgRanges=5;
1272 } else if(ucdVersion>=UNI_3_1) {
1273 /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1274 countAlgRanges=4;
1275 } else if(ucdVersion>=UNI_3_0) {
1276 /* Unicode 3.0 has 3 ranges including CJK Extension A */
1277 countAlgRanges=3;
1278 } else {
1279 /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1280 countAlgRanges=2;
1281 }
1282
1283 if(pData!=NULL) {
1284 udata_write32(pData, countAlgRanges);
1285 } else {
1286 size+=4;
1287 }
1288 if(countAlgRanges==0) {
1289 return size;
1290 }
1291
1292 /*
1293 * each range:
1294 * uint32_t rangeStart
1295 * uint32_t rangeEnd
1296 * uint8_t algorithmType
1297 * uint8_t algorithmVariant
1298 * uint16_t size of range data
1299 * uint8_t[size] data
1300 */
1301
1302 /* range 0: cjk extension a */
1303 if(countAlgRanges>=3) {
1304 if(pData!=NULL) {
1305 udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1306 udata_writeString(pData, prefix, PREFIX_LENGTH);
1307 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1308 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1309 }
1310 } else {
1311 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1312 }
1313 }
1314
1315 /* range 1: cjk */
1316 if(pData!=NULL) {
1317 udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1318 udata_writeString(pData, prefix, PREFIX_LENGTH);
1319 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1320 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1321 }
1322 } else {
1323 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1324 }
1325
1326 /* range 2: hangul syllables */
1327 if(pData!=NULL) {
1328 udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1329 udata_writeBlock(pData, hangulFactors, 6);
1330 udata_writeString(pData, jamo, sizeof(jamo));
1331 } else {
1332 size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1333 }
1334
1335 /* range 3: cjk extension b */
1336 if(countAlgRanges>=4) {
1337 if(pData!=NULL) {
1338 udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1339 udata_writeString(pData, prefix, PREFIX_LENGTH);
1340 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1341 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1342 }
1343 } else {
1344 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1345 }
1346 }
1347
1348 /* range 4: cjk extension c */
1349 if(countAlgRanges>=5) {
1350 if(pData!=NULL) {
1351 udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange));
1352 udata_writeString(pData, prefix, PREFIX_LENGTH);
1353 if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1354 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1355 }
1356 } else {
1357 size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1358 }
1359 }
1360
1361 return size;
1362 }
1363
1364 /* helpers ------------------------------------------------------------------ */
1365
1366 static int16_t
findToken(uint8_t * s,int16_t length)1367 findToken(uint8_t *s, int16_t length) {
1368 int16_t i, token;
1369
1370 for(i=0; i<(int16_t)tokenCount; ++i) {
1371 token=tokens[i];
1372 if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1373 return i;
1374 }
1375 }
1376
1377 return -1;
1378 }
1379
1380 static Word *
findWord(char * s,int16_t length)1381 findWord(char *s, int16_t length) {
1382 uint32_t i;
1383
1384 for(i=0; i<wordCount; ++i) {
1385 if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1386 return words+i;
1387 }
1388 }
1389
1390 return NULL;
1391 }
1392
1393 static Word *
addWord(char * s,int16_t length)1394 addWord(char *s, int16_t length) {
1395 uint8_t *stringStart;
1396 Word *word;
1397
1398 if(wordCount==MAX_WORD_COUNT) {
1399 fprintf(stderr, "gennames: too many words\n");
1400 exit(U_BUFFER_OVERFLOW_ERROR);
1401 }
1402
1403 stringStart=allocWord(length);
1404 uprv_memcpy(stringStart, s, length);
1405
1406 word=words+wordCount;
1407
1408 /*
1409 * Initialize the weight with the costs for this token:
1410 * a zero-terminated string and a 16-bit offset.
1411 */
1412 word->weight=-(length+1+2);
1413 word->count=0;
1414 word->length=length;
1415 word->s=stringStart;
1416
1417 ++wordCount;
1418
1419 return word;
1420 }
1421
1422 static void
countWord(Word * word)1423 countWord(Word *word) {
1424 /* add to the weight the savings: the length of the word minus 1 byte for the token */
1425 word->weight+=word->length-1;
1426 ++word->count;
1427 }
1428
1429 static void
addLine(uint32_t code,char * names[],int16_t lengths[],int16_t count)1430 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1431 uint8_t *stringStart;
1432 Line *line;
1433 int16_t i, length;
1434
1435 if(lineCount==MAX_LINE_COUNT) {
1436 fprintf(stderr, "gennames: too many lines\n");
1437 exit(U_BUFFER_OVERFLOW_ERROR);
1438 }
1439
1440 /* find the last non-empty name */
1441 while(count>0 && lengths[count-1]==0) {
1442 --count;
1443 }
1444 if(count==0) {
1445 return; /* should not occur: caller should not have called */
1446 }
1447
1448 /* there will be (count-1) separator characters */
1449 i=count;
1450 length=count-1;
1451
1452 /* add lengths of strings */
1453 while(i>0) {
1454 length+=lengths[--i];
1455 }
1456
1457 /* allocate line memory */
1458 stringStart=allocLine(length);
1459
1460 /* copy all strings into the line memory */
1461 length=0; /* number of chars copied so far */
1462 for(i=0; i<count; ++i) {
1463 if(i>0) {
1464 stringStart[length++]=NAME_SEPARATOR_CHAR;
1465 }
1466 if(lengths[i]>0) {
1467 uprv_memcpy(stringStart+length, names[i], lengths[i]);
1468 length+=lengths[i];
1469 }
1470 }
1471
1472 line=lines+lineCount;
1473
1474 line->code=code;
1475 line->length=length;
1476 line->s=stringStart;
1477
1478 ++lineCount;
1479
1480 /* prevent a character value that is actually in a name from becoming a token */
1481 while(length>0) {
1482 tokens[stringStart[--length]]=-1;
1483 }
1484 }
1485
1486 static void
addGroup(uint32_t groupMSB,uint8_t * strings,int16_t length)1487 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1488 uint8_t *stringStart;
1489 Line *line;
1490
1491 if(lineCount==MAX_LINE_COUNT) {
1492 fprintf(stderr, "gennames: too many groups\n");
1493 exit(U_BUFFER_OVERFLOW_ERROR);
1494 }
1495
1496 /* store the line lengths first, then the strings */
1497 lineLengthsTop=(lineLengthsTop+1)/2;
1498 stringStart=allocLine(lineLengthsTop+length);
1499 uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1500 uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1501
1502 line=lines+lineCount;
1503
1504 line->code=groupMSB;
1505 line->length=length;
1506 line->s=stringStart;
1507
1508 ++lineCount;
1509 }
1510
1511 static uint32_t
addToken(uint8_t * s,int16_t length)1512 addToken(uint8_t *s, int16_t length) {
1513 uint8_t *stringStart;
1514
1515 stringStart=allocLine(length+1);
1516 uprv_memcpy(stringStart, s, length);
1517 stringStart[length]=0;
1518
1519 return (uint32_t)(stringStart - stringStore);
1520 }
1521
1522 static void
appendLineLength(int16_t length)1523 appendLineLength(int16_t length) {
1524 if(length>=76) {
1525 fprintf(stderr, "gennames: compressed line too long\n");
1526 exit(U_BUFFER_OVERFLOW_ERROR);
1527 }
1528 if(length>=12) {
1529 length-=12;
1530 appendLineLengthNibble((uint8_t)((length>>4)|12));
1531 }
1532 appendLineLengthNibble((uint8_t)length);
1533 }
1534
1535 static void
appendLineLengthNibble(uint8_t nibble)1536 appendLineLengthNibble(uint8_t nibble) {
1537 if((lineLengthsTop&1)==0) {
1538 lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1539 } else {
1540 lineLengths[lineLengthsTop/2]|=nibble&0xf;
1541 }
1542 ++lineLengthsTop;
1543 }
1544
1545 static uint8_t *
allocLine(int32_t length)1546 allocLine(int32_t length) {
1547 uint32_t top=lineTop+length;
1548 uint8_t *p;
1549
1550 if(top>wordBottom) {
1551 fprintf(stderr, "gennames: out of memory\n");
1552 exit(U_MEMORY_ALLOCATION_ERROR);
1553 }
1554 p=stringStore+lineTop;
1555 lineTop=top;
1556 return p;
1557 }
1558
1559 static uint8_t *
allocWord(uint32_t length)1560 allocWord(uint32_t length) {
1561 uint32_t bottom=wordBottom-length;
1562
1563 if(lineTop>bottom) {
1564 fprintf(stderr, "gennames: out of memory\n");
1565 exit(U_MEMORY_ALLOCATION_ERROR);
1566 }
1567 wordBottom=bottom;
1568 return stringStore+bottom;
1569 }
1570
1571 /*
1572 * Hey, Emacs, please set the following:
1573 *
1574 * Local Variables:
1575 * indent-tabs-mode: nil
1576 * End:
1577 *
1578 */
1579