• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2009, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  gennames.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999sep30
14 *   created by: Markus W. Scherer
15 *
16 *   This program reads the Unicode character database text file,
17 *   parses it, and extracts the character code,
18 *   the "modern" character name, and optionally the
19 *   Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20 *   It then tokenizes and compresses the names and builds
21 *   compact binary tables for random-access lookup
22 *   in a u_charName() API function.
23 *
24 * unames.icu file format (after UDataInfo header etc. - see udata.c)
25 * (all data is static const)
26 *
27 * UDataInfo fields:
28 *   dataFormat "unam"
29 *   formatVersion 1.0
30 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
31 *
32 * -- data-based names
33 * uint32_t tokenStringOffset,
34 *          groupsOffset,
35 *          groupStringOffset,
36 *          algNamesOffset;
37 *
38 * uint16_t tokenCount;
39 * uint16_t tokenTable[tokenCount];
40 *
41 * char     tokenStrings[]; -- padded to even count
42 *
43 * -- strings (groupStrings) are tokenized as follows:
44 *   for each character c
45 *       if(c>=tokenCount) write that character c directly
46 *   else
47 *       token=tokenTable[c];
48 *       if(token==0xfffe) -- lead byte of double-byte token
49 *           token=tokenTable[c<<8|next character];
50 *       if(token==-1)
51 *           write c directly
52 *       else
53 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54 *           append zero-terminated tokenString;
55 *
56 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
57 *    are separated by ';'.
58 *
59 * uint16_t groupCount;
60 * struct {
61 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
63 *   uint16_t offsetLow;
64 * } groupTable[groupCount];
65 *
66 * char     groupStrings[]; -- padded to 4-count
67 *
68 * -- The actual, tokenized group strings are not zero-terminated because
69 *   that would take up too much space.
70 *   Instead, they are preceded by their length, written in a variable-length sequence:
71 *   For each of the 32 group strings, one or two nibbles are stored for its length.
72 *   Nibbles (4-bit values, half-bytes) are read MSB first.
73 *   A nibble with a value of 0..11 directly indicates the length of the name string.
74 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
76 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
77 *   For the de-tokenizing, see token description above; the strings immediately follow the
78 *   32 lengths.
79 *
80 * -- algorithmic names
81 *
82 * typedef struct AlgorithmicRange {
83 *     uint32_t rangeStart, rangeEnd;
84 *     uint8_t algorithmType, algorithmVariant;
85 *     uint16_t rangeSize;
86 * } AlgorithmicRange;
87 *
88 * uint32_t algRangesCount; -- number of data blocks for ranges of
89 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
90 *
91 * struct {
92 *     AlgorithmicRange algRange;
93 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
94 * } algRanges[algRangesCount];
95 * -- not a real array because each part has a different size
96 *    of algRange.rangeSize (including AlgorithmicRange)
97 *
98 * -- algorithmic range types:
99 *
100 * 0 Names are formed from a string prefix that is stored in
101 *   the algRangeData (zero-terminated), followed by the Unicode code point
102 *   of the character in hexadecimal digits;
103 *   algRange.algorithmVariant digits are written
104 *
105 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
106 *   algRange.algorithmVariant is the count of modulo factors
107 *   algRangeData contains
108 *       uint16_t factors[algRange.algorithmVariant];
109 *       char strings[];
110 *   the first zero-terminated string is written as the prefix; then:
111 *
112 *   The rangeStart is subtracted; with the difference, here "code":
113 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
114 *       index[i]=code%factor[i];
115 *       code/=factor[i];
116 *
117 *   The strings after the prefix are short pieces that are then appended to the result
118 *   according to index[0..algRange.algorithmVariant-1].
119 */
120 
121 #include <stdio.h>
122 #include "unicode/utypes.h"
123 #include "unicode/putil.h"
124 #include "unicode/uclean.h"
125 #include "unicode/udata.h"
126 #include "cmemory.h"
127 #include "cstring.h"
128 #include "uarrsort.h"
129 #include "unewdata.h"
130 #include "uoptions.h"
131 #include "uparse.h"
132 
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* sizes, in bytes, of the static stringStore[] and groupStore[] buffers */
#define STRING_STORE_SIZE 1000000
#define GROUP_STORE_SIZE 5000

/*
 * Names are stored in groups of LINES_PER_GROUP (32) consecutive code points;
 * code>>GROUP_SHIFT selects the group, code&GROUP_MASK the line within it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)

/* capacities of the static lines[] and words[] tables, and a group limit */
#define MAX_LINE_COUNT 50000
#define MAX_WORD_COUNT 20000
#define MAX_GROUP_COUNT 5000

/* output data item name/type and the separator between names of one code point */
#define DATA_NAME "unames"
#define DATA_TYPE "icu"
#define VERSION_STRING "unam"
#define NAME_SEPARATOR_CHAR ';'

/* data item name used for ISO comments (see --only-iso-comments in the usage text) */
#define ISO_DATA_NAME "ucomment"
152 
153 /* Unicode versions --------------------------------------------------------- */
154 
155 enum {
156     UNI_1_0,
157     UNI_1_1,
158     UNI_2_0,
159     UNI_3_0,
160     UNI_3_1,
161     UNI_3_2,
162     UNI_4_0,
163     UNI_4_0_1,
164     UNI_4_1,
165     UNI_5_0,
166     UNI_5_1,
167     UNI_5_2,
168     UNI_VER_COUNT
169 };
170 
171 static const UVersionInfo
172 unicodeVersions[]={
173     { 1, 0, 0, 0 },
174     { 1, 1, 0, 0 },
175     { 2, 0, 0, 0 },
176     { 3, 0, 0, 0 },
177     { 3, 1, 0, 0 },
178     { 3, 2, 0, 0 },
179     { 4, 0, 0, 0 },
180     { 4, 0, 1, 0 },
181     { 4, 1, 0, 0 },
182     { 5, 0, 0, 0 },
183     { 5, 1, 0, 0 },
184     { 5, 2, 0, 0 }
185 };
186 
187 static int32_t ucdVersion=UNI_5_2;
188 
189 static int32_t
findUnicodeVersion(const UVersionInfo version)190 findUnicodeVersion(const UVersionInfo version) {
191     int32_t i;
192 
193     for(i=0; /* while(version>unicodeVersions[i]) {} */
194         i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
195         ++i) {}
196     if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
197         --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
198     }
199     return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
200 }
201 
202 /* generator data ----------------------------------------------------------- */
203 
204 /* UDataInfo cf. udata.h */
205 static UDataInfo dataInfo={
206     sizeof(UDataInfo),
207     0,
208 
209     U_IS_BIG_ENDIAN,
210     U_CHARSET_FAMILY,
211     sizeof(UChar),
212     0,
213 
214     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
215     {1, 0, 0, 0},                 /* formatVersion */
216     {3, 0, 0, 0}                  /* dataVersion */
217 };
218 
219 static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
220 
221 typedef struct Options {
222     UBool storeNames;
223     UBool store10Names;
224     UBool storeISOComments;
225 } Options;
226 
227 /*
228  * Pair of code point and name alias.
229  * Try to keep sizeof(CpNameAlias) a multiple of 4 to avoid padding.
230  */
231 typedef struct CpNameAlias {
232     uint32_t code;
233     char nameAlias[124];
234 } CpNameAlias;
235 
236 static CpNameAlias cpNameAliases[50];
237 
238 static uint32_t cpNameAliasesIndex=0, cpNameAliasesTop=0;
239 
240 static uint8_t stringStore[STRING_STORE_SIZE],
241                groupStore[GROUP_STORE_SIZE],
242                lineLengths[LINES_PER_GROUP];
243 
244 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
245 
246 typedef struct {
247     uint32_t code;
248     int16_t length;
249     uint8_t *s;
250 } Line;
251 
252 typedef struct {
253     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
254     int16_t count;
255     int16_t length;
256     uint8_t *s;
257 } Word;
258 
259 static Line lines[MAX_LINE_COUNT];
260 static Word words[MAX_WORD_COUNT];
261 
262 static uint32_t lineCount=0, wordCount=0;
263 
264 static int16_t leadByteCount;
265 
266 #define LEADBYTE_LIMIT 16
267 
268 static int16_t tokens[LEADBYTE_LIMIT*256];
269 static uint32_t tokenCount;
270 
271 /* prototypes --------------------------------------------------------------- */
272 
273 static void
274 init(void);
275 
276 static void
277 parseNameAliases(const char *filename, Options *options);
278 
279 static void
280 parseDB(const char *filename, Options *options);
281 
282 static void
283 parseName(char *name, int16_t length);
284 
285 static int16_t
286 skipNoise(char *line, int16_t start, int16_t limit);
287 
288 static int16_t
289 getWord(char *line, int16_t start, int16_t limit);
290 
291 static void
292 compress(void);
293 
294 static void
295 compressLines(void);
296 
297 static int16_t
298 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
299 
300 static int32_t
301 compareWords(const void *context, const void *word1, const void *word2);
302 
303 static void
304 generateData(const char *dataDir, Options *options);
305 
306 static uint32_t
307 generateAlgorithmicData(UNewDataMemory *pData, Options *options);
308 
309 static int16_t
310 findToken(uint8_t *s, int16_t length);
311 
312 static Word *
313 findWord(char *s, int16_t length);
314 
315 static Word *
316 addWord(char *s, int16_t length);
317 
318 static void
319 countWord(Word *word);
320 
321 static void
322 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
323 
324 static void
325 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
326 
327 static uint32_t
328 addToken(uint8_t *s, int16_t length);
329 
330 static void
331 appendLineLength(int16_t length);
332 
333 static void
334 appendLineLengthNibble(uint8_t nibble);
335 
336 static uint8_t *
337 allocLine(int32_t length);
338 
339 static uint8_t *
340 allocWord(uint32_t length);
341 
342 /* -------------------------------------------------------------------------- */
343 
344 enum {
345     HELP_H,
346     HELP_QUESTION_MARK,
347     VERBOSE,
348     QUIET,
349     COPYRIGHT,
350     DESTDIR,
351     UNICODE,
352     UNICODE1_NAMES,
353     NO_ISO_COMMENTS,
354     ONLY_ISO_COMMENTS
355 };
356 
357 static UOption options[]={
358     UOPTION_HELP_H,
359     UOPTION_HELP_QUESTION_MARK,
360     UOPTION_VERBOSE,
361     UOPTION_QUIET,
362     UOPTION_COPYRIGHT,
363     UOPTION_DESTDIR,
364     { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
365     { "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
366     { "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
367     { "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
368 };
369 
370 extern int
main(int argc,char * argv[])371 main(int argc, char* argv[]) {
372     UVersionInfo version;
373     Options moreOptions={ TRUE, FALSE, TRUE };
374     UErrorCode errorCode = U_ZERO_ERROR;
375 
376     U_MAIN_INIT_ARGS(argc, argv);
377 
378     /* Initialize ICU */
379     u_init(&errorCode);
380     if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
381         /* Note: u_init() will try to open ICU property data.
382          *       failures here are expected when building ICU from scratch.
383          *       ignore them.
384          */
385         fprintf(stderr, "%s: can not initialize ICU.  errorCode = %s\n",
386             argv[0], u_errorName(errorCode));
387         exit(1);
388     }
389 
390     /* preset then read command line options */
391     options[DESTDIR].value=u_getDataDirectory();
392     options[UNICODE].value="4.1";
393     argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
394 
395     /* error handling, printing usage message */
396     if(argc<0) {
397         fprintf(stderr,
398             "error in command line argument \"%s\"\n",
399             argv[-argc]);
400     } else if(argc<2) {
401         argc=-1;
402     }
403     if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
404         /*
405          * Broken into chucks because the C89 standard says the minimum
406          * required supported string length is 509 bytes.
407          */
408         fprintf(stderr,
409             "Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] [filename_ud [filename_na]]\n"
410             "\n"
411             "Read the UnicodeData.txt file and \n"
412             "create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
413             "\n"
414             "\tfilename_ud  absolute path/filename for the UnicodeData.txt file\n"
415             "\t             (default: standard input)\n"
416             "\tfilename_na  absolute path/filename for the NameAliases.txt file\n"
417             "\t             (default: no name aliases)\n"
418             "\n",
419             argv[0]);
420         fprintf(stderr,
421             "Options:\n"
422             "\t-h or -? or --help  this usage text\n"
423             "\t-v or --verbose     verbose output\n"
424             "\t-q or --quiet       no output\n"
425             "\t-c or --copyright   include a copyright notice\n"
426             "\t-d or --destdir     destination directory, followed by the path\n"
427             "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n");
428         fprintf(stderr,
429             "\t-1 or --unicode1-names     store Unicode 1.0 character names\n"
430             "\t      --no-iso-comments    do not store ISO comments\n"
431             "\t      --only-iso-comments  write ucomment.icu with only ISO comments\n");
432         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
433     }
434 
435     /* get the options values */
436     beVerbose=options[VERBOSE].doesOccur;
437     beQuiet=options[QUIET].doesOccur;
438     haveCopyright=options[COPYRIGHT].doesOccur;
439     moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
440     moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
441     if(options[ONLY_ISO_COMMENTS].doesOccur) {
442         moreOptions.storeNames=moreOptions.store10Names=FALSE;
443         moreOptions.storeISOComments=TRUE;
444     }
445 
446     /* set the Unicode version */
447     u_versionFromString(version, options[UNICODE].value);
448     uprv_memcpy(dataInfo.dataVersion, version, 4);
449     ucdVersion=findUnicodeVersion(version);
450 
451     init();
452     if(argc>=3) {
453         parseNameAliases(argv[2], &moreOptions);
454     }
455     parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
456     compress();
457     generateData(options[DESTDIR].value, &moreOptions);
458 
459     u_cleanup();
460     return 0;
461 }
462 
463 static void
init()464 init() {
465     int i;
466 
467     for(i=0; i<256; ++i) {
468         tokens[i]=0;
469     }
470 }
471 
472 /* parsing ------------------------------------------------------------------ */
473 
/*
 * Get a name, stripped of leading and trailing whitespace.
 *
 * On entry *pStart points to the start of the field and limit just past
 * its end. On return *pStart points to the first character after the
 * leading whitespace. Trailing spaces and tabs are excluded from the
 * returned length; the text itself is not modified.
 */
static int16_t
getName(char **pStart, char *limit) {
    char *first=(char *)u_skipWhitespace(*pStart);
    char *end=limit;

    /* back up over trailing spaces and tabs */
    while(first<end) {
        char last=*(end-1);
        if(last!=' ' && last!='\t') {
            break;
        }
        --end;
    }

    *pStart=first;
    return (int16_t)(end-first);
}
489 
490 static void U_CALLCONV
nameAliasesLineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)491 nameAliasesLineFn(void *context,
492        char *fields[][2], int32_t fieldCount,
493        UErrorCode *pErrorCode) {
494     char *name;
495     int16_t length=0;
496     static uint32_t prevCode=0;
497     uint32_t code=0;
498 
499     if(U_FAILURE(*pErrorCode)) {
500         return;
501     }
502     /* get the character code */
503     code=uprv_strtoul(fields[0][0], NULL, 16);
504 
505     /* get the character name */
506     name=fields[1][0];
507     length=getName(&name, fields[1][1]);
508     if(length==0 || length>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
509         fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
510                 name, (unsigned long)code);
511         *pErrorCode=U_PARSE_ERROR;
512         exit(U_PARSE_ERROR);
513     }
514 
515     /* check for non-character code points */
516     if(!U_IS_UNICODE_CHAR(code)) {
517         fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n",
518                 (unsigned long)code);
519         *pErrorCode=U_PARSE_ERROR;
520         exit(U_PARSE_ERROR);
521     }
522 
523     /* check that the code points (code) are in ascending order */
524     if(code<=prevCode && code>0) {
525         fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n",
526                 (unsigned long)code, (unsigned long)prevCode);
527         *pErrorCode=U_PARSE_ERROR;
528         exit(U_PARSE_ERROR);
529     }
530     prevCode=code;
531 
532     if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) {
533         fprintf(stderr, "gennames: error - too many name aliases\n");
534         *pErrorCode=U_PARSE_ERROR;
535         exit(U_PARSE_ERROR);
536     }
537     cpNameAliases[cpNameAliasesTop].code=code;
538     uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length);
539     cpNameAliases[cpNameAliasesTop].nameAlias[length]=0;
540     ++cpNameAliasesTop;
541 
542     parseName(name, length);
543 }
544 
545 static void U_CALLCONV
lineFn(void * context,char * fields[][2],int32_t fieldCount,UErrorCode * pErrorCode)546 lineFn(void *context,
547        char *fields[][2], int32_t fieldCount,
548        UErrorCode *pErrorCode) {
549     Options *storeOptions=(Options *)context;
550     char *names[4];
551     int16_t lengths[4]={ 0, 0, 0, 0 };
552     static uint32_t prevCode=0;
553     uint32_t code=0;
554 
555     if(U_FAILURE(*pErrorCode)) {
556         return;
557     }
558     /* get the character code */
559     code=uprv_strtoul(fields[0][0], NULL, 16);
560 
561     /* get the character name */
562     if(storeOptions->storeNames) {
563         names[0]=fields[1][0];
564         lengths[0]=getName(names+0, fields[1][1]);
565         if(names[0][0]=='<') {
566             /* do not store pseudo-names in <> brackets */
567             lengths[0]=0;
568         }
569     }
570 
571     /* store 1.0 names */
572     /* get the second character name, the one from Unicode 1.0 */
573     if(storeOptions->store10Names) {
574         names[1]=fields[10][0];
575         lengths[1]=getName(names+1, fields[10][1]);
576         if(names[1][0]=='<') {
577             /* do not store pseudo-names in <> brackets */
578             lengths[1]=0;
579         }
580     }
581 
582     /* get the ISO 10646 comment */
583     if(storeOptions->storeISOComments) {
584         names[2]=fields[11][0];
585         lengths[2]=getName(names+2, fields[11][1]);
586     }
587 
588     if(lengths[0]+lengths[1]+lengths[2]==0) {
589         return;
590     }
591 
592     /* check for non-character code points */
593     if(!U_IS_UNICODE_CHAR(code)) {
594         fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
595                 (unsigned long)code);
596         *pErrorCode=U_PARSE_ERROR;
597         exit(U_PARSE_ERROR);
598     }
599 
600     /* check that the code points (code) are in ascending order */
601     if(code<=prevCode && code>0) {
602         fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
603                 (unsigned long)code, (unsigned long)prevCode);
604         *pErrorCode=U_PARSE_ERROR;
605         exit(U_PARSE_ERROR);
606     }
607     prevCode=code;
608 
609     parseName(names[0], lengths[0]);
610     parseName(names[1], lengths[1]);
611     parseName(names[2], lengths[2]);
612 
613     if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) {
614         if(code==cpNameAliases[cpNameAliasesIndex].code) {
615             names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias;
616             lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias);
617             ++cpNameAliasesIndex;
618         } else {
619             fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
620                     (unsigned long)code);
621             *pErrorCode=U_PARSE_ERROR;
622             exit(U_PARSE_ERROR);
623         }
624     }
625 
626     /*
627      * set the count argument to
628      * 1: only store regular names, or only store ISO 10646 comments
629      * 2: store regular and 1.0 names
630      * 3: store names and ISO 10646 comment
631      * 4: also store name alias
632      *
633      * addLine() will ignore empty trailing names
634      */
635     if(storeOptions->storeNames) {
636         /* store names and comments as parsed according to storeOptions */
637         addLine(code, names, lengths, LENGTHOF(names));
638     } else {
639         /* store only ISO 10646 comments */
640         addLine(code, names+2, lengths+2, 1);
641     }
642 }
643 
644 static void
parseNameAliases(const char * filename,Options * storeOptions)645 parseNameAliases(const char *filename, Options *storeOptions) {
646     char *fields[2][2];
647     UErrorCode errorCode=U_ZERO_ERROR;
648 
649     if(!storeOptions->storeNames) {
650         return;
651     }
652     u_parseDelimitedFile(filename, ';', fields, 2, nameAliasesLineFn, NULL, &errorCode);
653     if(U_FAILURE(errorCode)) {
654         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
655         exit(errorCode);
656     }
657 
658     if(!beQuiet) {
659         printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop);
660     }
661 }
662 
663 static void
parseDB(const char * filename,Options * storeOptions)664 parseDB(const char *filename, Options *storeOptions) {
665     char *fields[15][2];
666     UErrorCode errorCode=U_ZERO_ERROR;
667 
668     u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
669     if(U_FAILURE(errorCode)) {
670         fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
671         exit(errorCode);
672     }
673     if(cpNameAliasesIndex<cpNameAliasesTop) {
674         fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
675                 (unsigned long)cpNameAliases[cpNameAliasesIndex].code);
676         exit(U_PARSE_ERROR);
677     }
678 
679     if(!beQuiet) {
680         printf("size of all names in the database: %lu\n",
681             (unsigned long)lineTop);
682         printf("number of named Unicode characters: %lu\n",
683             (unsigned long)lineCount);
684         printf("number of words in the dictionary from these names: %lu\n",
685             (unsigned long)wordCount);
686     }
687 }
688 
689 static void
parseName(char * name,int16_t length)690 parseName(char *name, int16_t length) {
691     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
692     Word *word;
693 
694     while(start<length) {
695         /* skip any "noise" characters */
696         limit=skipNoise(name, start, length);
697         if(start<limit) {
698             /*prevStart=-1;*/
699             start=limit;
700         }
701         if(start==length) {
702             break;
703         }
704 
705         /* get a word and add it if it is longer than 1 */
706         limit=getWord(name, start, length);
707         wordLength=(int16_t)(limit-start);
708         if(wordLength>1) {
709             word=findWord(name+start, wordLength);
710             if(word==NULL) {
711                 word=addWord(name+start, wordLength);
712             }
713             countWord(word);
714         }
715 
716 #if 0
717         /*
718          * if there was a word before this
719          * (with no noise in between), then add the pair of words, too
720          */
721         if(prevStart!=-1) {
722             wordLength=limit-prevStart;
723             word=findWord(name+prevStart, wordLength);
724             if(word==NULL) {
725                 word=addWord(name+prevStart, wordLength);
726             }
727             countWord(word);
728         }
729 #endif
730 
731         /*prevStart=start;*/
732         start=limit;
733     }
734 }
735 
736 static UBool U_INLINE
isWordChar(char c)737 isWordChar(char c) {
738     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
739            ('J'<=c && c<='R') ||
740            ('S'<=c && c<='Z') ||
741 
742            ('a'<=c && c<='i') || /* lowercase letters for ISO comments */
743            ('j'<=c && c<='r') ||
744            ('s'<=c && c<='z') ||
745 
746            ('0'<=c && c<='9');
747 }
748 
/*
 * Return the index of the first word character in line[start..limit[,
 * or limit if there is none.
 */
static int16_t
skipNoise(char *line, int16_t start, int16_t limit) {
    int16_t pos;

    /* skip anything that is not part of a word in the isWordChar() sense */
    for(pos=start; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
758 
/*
 * Return the index just past the word that starts at line[start]:
 * the run of word characters (letters and digits) plus, if present
 * before limit, one following space or dash.
 */
static int16_t
getWord(char *line, int16_t start, int16_t limit) {
    int16_t pos=start;

    /* the word proper */
    while(pos<limit && isWordChar(line[pos])) {
        ++pos;
    }

    /* include a following space or dash */
    if(pos<limit) {
        char next=line[pos];
        if(next==' ' || next=='-') {
            ++pos;
        }
    }

    return pos;
}
775 
776 /* compressing -------------------------------------------------------------- */
777 
778 static void
compress()779 compress() {
780     uint32_t i, letterCount;
781     int16_t wordNumber;
782     UErrorCode errorCode;
783 
784     /* sort the words in reverse order by weight */
785     errorCode=U_ZERO_ERROR;
786     uprv_sortArray(words, wordCount, sizeof(Word),
787                     compareWords, NULL, FALSE, &errorCode);
788 
789     /* remove the words that do not save anything */
790     while(wordCount>0 && words[wordCount-1].weight<1) {
791         --wordCount;
792     }
793 
794     /* count the letters in the token range */
795     letterCount=0;
796     for(i=LEADBYTE_LIMIT; i<256; ++i) {
797         if(tokens[i]==-1) {
798             ++letterCount;
799         }
800     }
801     if(!beQuiet) {
802         printf("number of letters used in the names: %d\n", (int)letterCount);
803     }
804 
805     /* do we need double-byte tokens? */
806     if(wordCount+letterCount<=256) {
807         /* no, single-byte tokens are enough */
808         leadByteCount=0;
809         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
810             if(tokens[i]!=-1) {
811                 tokens[i]=wordNumber;
812                 if(beVerbose) {
813                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
814                             (int)i, (long)words[wordNumber].weight,
815                             words[wordNumber].length, words[wordNumber].s);
816                 }
817                 ++wordNumber;
818             }
819         }
820         tokenCount=i;
821     } else {
822         /*
823          * The tokens that need two token bytes
824          * get their weight reduced by their count
825          * because they save less.
826          */
827         tokenCount=256-letterCount;
828         for(i=tokenCount; i<wordCount; ++i) {
829             words[i].weight-=words[i].count;
830         }
831 
832         /* sort these words in reverse order by weight */
833         errorCode=U_ZERO_ERROR;
834         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
835                         compareWords, NULL, FALSE, &errorCode);
836 
837         /* remove the words that do not save anything */
838         while(wordCount>0 && words[wordCount-1].weight<1) {
839             --wordCount;
840         }
841 
842         /* how many tokens and lead bytes do we have now? */
843         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
844         /*
845          * adjust upwards to take into account that
846          * double-byte tokens must not
847          * use NAME_SEPARATOR_CHAR as a second byte
848          */
849         tokenCount+=(tokenCount-256+254)/255;
850 
851         leadByteCount=(int16_t)(tokenCount>>8);
852         if(leadByteCount<LEADBYTE_LIMIT) {
853             /* adjust for the real number of lead bytes */
854             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
855         } else {
856             /* limit the number of lead bytes */
857             leadByteCount=LEADBYTE_LIMIT-1;
858             tokenCount=LEADBYTE_LIMIT*256;
859             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
860             /* adjust again to skip double-byte tokens with ';' */
861             wordCount-=(tokenCount-256+254)/255;
862         }
863 
864         /* set token 0 to word 0 */
865         tokens[0]=0;
866         if(beVerbose) {
867             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
868                     (long)words[0].weight,
869                     words[0].length, words[0].s);
870         }
871         wordNumber=1;
872 
873         /* set the lead byte tokens */
874         for(i=1; (int16_t)i<=leadByteCount; ++i) {
875             tokens[i]=-2;
876         }
877 
878         /* set the tokens */
879         for(; i<256; ++i) {
880             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
881             if(tokens[i]!=-1) {
882                 tokens[i]=wordNumber;
883                 if(beVerbose) {
884                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
885                             (int)i, (long)words[wordNumber].weight,
886                             words[wordNumber].length, words[wordNumber].s);
887                 }
888                 ++wordNumber;
889             }
890         }
891 
892         /* continue above 255 where there are no letters */
893         for(; (uint32_t)wordNumber<wordCount; ++i) {
894             if((i&0xff)==NAME_SEPARATOR_CHAR) {
895                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
896             } else {
897                 tokens[i]=wordNumber;
898                 if(beVerbose) {
899                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
900                             (int)i, (long)words[wordNumber].weight,
901                             words[wordNumber].length, words[wordNumber].s);
902                 }
903                 ++wordNumber;
904             }
905         }
906         tokenCount=i; /* should be already tokenCount={i or i+1} */
907     }
908 
909     if(!beQuiet) {
910         printf("number of lead bytes: %d\n", leadByteCount);
911         printf("number of single-byte tokens: %lu\n",
912             (unsigned long)256-letterCount-leadByteCount);
913         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
914     }
915 
916     compressLines();
917 }
918 
919 static void
compressLines()920 compressLines() {
921     Line *line=NULL;
922     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
923              groupMSB=0xffff, lineCount2;
924     int16_t groupTop=0;
925 
926     /* store the groups like lines, with compressed data after raw strings */
927     groupBottom=lineTop;
928     lineCount2=lineCount;
929     lineCount=0;
930 
931     /* loop over all lines */
932     while(i<lineCount2) {
933         line=lines+i++;
934         inLine=line->code;
935 
936         /* segment the lines to groups of 32 */
937         if(inLine>>GROUP_SHIFT!=groupMSB) {
938             /* finish the current group with empty lines */
939             while((++outLine&GROUP_MASK)!=0) {
940                 appendLineLength(0);
941             }
942 
943             /* store the group like a line */
944             if(groupTop>0) {
945                 if(groupTop>GROUP_STORE_SIZE) {
946                     fprintf(stderr, "gennames: group store overflow\n");
947                     exit(U_BUFFER_OVERFLOW_ERROR);
948                 }
949                 addGroup(groupMSB, groupStore, groupTop);
950             }
951 
952             /* start the new group */
953             lineLengthsTop=0;
954             groupTop=0;
955             groupMSB=inLine>>GROUP_SHIFT;
956             outLine=(inLine&~GROUP_MASK)-1;
957         }
958 
959         /* write empty lines between the previous line in the group and this one */
960         while(++outLine<inLine) {
961             appendLineLength(0);
962         }
963 
964         /* write characters and tokens for this line */
965         appendLineLength(compressLine(line->s, line->length, &groupTop));
966     }
967 
968     /* finish and store the last group */
969     if(line && groupMSB!=0xffff) {
970         /* finish the current group with empty lines */
971         while((++outLine&GROUP_MASK)!=0) {
972             appendLineLength(0);
973         }
974 
975         /* store the group like a line */
976         if(groupTop>0) {
977             if(groupTop>GROUP_STORE_SIZE) {
978                 fprintf(stderr, "gennames: group store overflow\n");
979                 exit(U_BUFFER_OVERFLOW_ERROR);
980             }
981             addGroup(groupMSB, groupStore, groupTop);
982         }
983     }
984 
985     if(!beQuiet) {
986         printf("number of groups: %lu\n", (unsigned long)lineCount);
987     }
988 }
989 
990 static int16_t
compressLine(uint8_t * s,int16_t length,int16_t * pGroupTop)991 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
992     int16_t start, limit, token, groupTop=*pGroupTop;
993 
994     start=0;
995     do {
996         /* write any "noise" characters */
997         limit=skipNoise((char *)s, start, length);
998         while(start<limit) {
999             groupStore[groupTop++]=s[start++];
1000         }
1001 
1002         if(start==length) {
1003             break;
1004         }
1005 
1006         /* write a word, as token or directly */
1007         limit=getWord((char *)s, start, length);
1008         if(limit-start==1) {
1009             groupStore[groupTop++]=s[start++];
1010         } else {
1011             token=findToken(s+start, (int16_t)(limit-start));
1012             if(token!=-1) {
1013                 if(token>0xff) {
1014                     groupStore[groupTop++]=(uint8_t)(token>>8);
1015                 }
1016                 groupStore[groupTop++]=(uint8_t)token;
1017                 start=limit;
1018             } else {
1019                 while(start<limit) {
1020                     groupStore[groupTop++]=s[start++];
1021                 }
1022             }
1023         }
1024     } while(start<length);
1025 
1026     length=(int16_t)(groupTop-*pGroupTop);
1027     *pGroupTop=groupTop;
1028     return length;
1029 }
1030 
1031 static int32_t
compareWords(const void * context,const void * word1,const void * word2)1032 compareWords(const void *context, const void *word1, const void *word2) {
1033     /* reverse sort by word weight */
1034     return ((Word *)word2)->weight-((Word *)word1)->weight;
1035 }
1036 
1037 /* generate output data ----------------------------------------------------- */
1038 
1039 static void
generateData(const char * dataDir,Options * storeOptions)1040 generateData(const char *dataDir, Options *storeOptions) {
1041     UNewDataMemory *pData;
1042     UErrorCode errorCode=U_ZERO_ERROR;
1043     uint16_t groupWords[3];
1044     uint32_t i, groupTop=lineTop, offset, size,
1045              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
1046     long dataLength;
1047     int16_t token;
1048 
1049     pData=udata_create(dataDir,
1050                        DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
1051                        &dataInfo,
1052                        haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1053     if(U_FAILURE(errorCode)) {
1054         fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
1055         exit(errorCode);
1056     }
1057 
1058     /* first, see how much space we need, and prepare the token strings */
1059     for(i=0; i<tokenCount; ++i) {
1060         token=tokens[i];
1061         if(token!=-1 && token!=-2) {
1062             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
1063         }
1064     }
1065 
1066     /*
1067      * Required padding for data swapping:
1068      * The token table undergoes a permutation during data swapping when the
1069      * input and output charsets are different.
1070      * The token table cannot grow during swapping, so we need to make sure that
1071      * the table is long enough for successful in-place permutation.
1072      *
1073      * We simply round up tokenCount to the next multiple of 256 to account for
1074      * all possible permutations.
1075      *
1076      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
1077      *
1078      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
1079      * and will be swapped between ASCII and EBCDIC between
1080      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
1081      * This should be the only -1 entry in tokens[256..511] on which the data
1082      * swapper bases its trail byte permutation map (trailMap[]).
1083      *
1084      * It would be sufficient to increase tokenCount so that its lower 8 bits
1085      * are at least 0x5e+1 to make room for swapping between the two semicolons.
1086      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
1087      * should always be an identity map, where we do not need additional room.
1088      */
1089     i=tokenCount;
1090     tokenCount=(tokenCount+0xff)&~0xff;
1091     if(!beQuiet && i<tokenCount) {
1092         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
1093     }
1094     for(; i<tokenCount; ++i) {
1095         if((i&0xff)==NAME_SEPARATOR_CHAR) {
1096             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
1097         } else {
1098             tokens[i]=0; /* unused token for padding */
1099         }
1100     }
1101 
1102     /*
1103      * Calculate the total size in bytes of the data including:
1104      * - the offset to the token strings, uint32_t (4)
1105      * - the offset to the group table, uint32_t (4)
1106      * - the offset to the group strings, uint32_t (4)
1107      * - the offset to the algorithmic names, uint32_t (4)
1108      *
1109      * - the number of tokens, uint16_t (2)
1110      * - the token table, uint16_t[tokenCount] (2*tokenCount)
1111      *
1112      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
1113      *
1114      * - the number of groups, uint16_t (2)
1115      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1116      *
1117      * - the group strings (groupTop-groupBottom), 2-padded
1118      *
1119      * - the size of the data for the algorithmic names
1120      */
1121     tokenStringOffset=4+4+4+4+2+2*tokenCount;
1122     groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
1123     groupStringOffset=groupsOffset+2+6*lineCount;
1124     algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
1125 
1126     offset=generateAlgorithmicData(NULL, storeOptions);
1127     size=algNamesOffset+offset;
1128 
1129     if(!beQuiet) {
1130         printf("size of the Unicode Names data:\n"
1131                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1132                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
1133                 (unsigned long)(groupTop-groupBottom), (unsigned long)offset);
1134     }
1135 
1136     /* write the data to the file */
1137     /* offsets */
1138     udata_write32(pData, tokenStringOffset);
1139     udata_write32(pData, groupsOffset);
1140     udata_write32(pData, groupStringOffset);
1141     udata_write32(pData, algNamesOffset);
1142 
1143     /* token table */
1144     udata_write16(pData, (uint16_t)tokenCount);
1145     udata_writeBlock(pData, tokens, 2*tokenCount);
1146 
1147     /* token strings */
1148     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
1149     if((lineTop-groupTop)&1) {
1150         /* 2-padding */
1151         udata_writePadding(pData, 1);
1152     }
1153 
1154     /* group table */
1155     udata_write16(pData, (uint16_t)lineCount);
1156     for(i=0; i<lineCount; ++i) {
1157         /* groupMSB */
1158         groupWords[0]=(uint16_t)lines[i].code;
1159 
1160         /* offset */
1161         offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
1162         groupWords[1]=(uint16_t)(offset>>16);
1163         groupWords[2]=(uint16_t)(offset);
1164         udata_writeBlock(pData, groupWords, 6);
1165     }
1166 
1167     /* group strings */
1168     udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
1169 
1170     /* 4-align the algorithmic names data */
1171     udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
1172 
1173     generateAlgorithmicData(pData, storeOptions);
1174 
1175     /* finish up */
1176     dataLength=udata_finish(pData, &errorCode);
1177     if(U_FAILURE(errorCode)) {
1178         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1179         exit(errorCode);
1180     }
1181 
1182     if(dataLength!=(long)size) {
1183         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1184 dataLength, (unsigned long)size);
1185         exit(U_INTERNAL_PROGRAM_ERROR);
1186     }
1187 }
1188 
/*
 * Header of one range of algorithmically named code points.
 * The structure for algorithmic names needs to be 4-aligned.
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart;        /* start of the code point range */
    uint32_t rangeEnd;          /* end of the code point range */
    uint8_t algorithmType;      /* which naming algorithm applies to the range */
    uint8_t algorithmVariant;   /* algorithm-specific variant value */
    uint16_t rangeSize;         /* size of this header plus the range data that follows it */
} AlgorithmicRange;
1195 
1196 static uint32_t
generateAlgorithmicData(UNewDataMemory * pData,Options * storeOptions)1197 generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
1198     static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1199 #   define PREFIX_LENGTH 23
1200 #   define PREFIX_LENGTH_4 24
1201     uint32_t countAlgRanges;
1202 
1203     static AlgorithmicRange cjkExtA={
1204         0x3400, 0x4db5,
1205         0, 4,
1206         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1207     };
1208     static AlgorithmicRange cjk={
1209         0x4e00, 0x9fa5,
1210         0, 4,
1211         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1212     };
1213     static AlgorithmicRange cjkExtB={
1214         0x20000, 0x2a6d6,
1215         0, 5,
1216         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1217     };
1218     static AlgorithmicRange cjkExtC={
1219         0x2a700, 0x2b734,
1220         0, 5,
1221         sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1222     };
1223 
1224     static char jamo[]=
1225         "HANGUL SYLLABLE \0"
1226 
1227         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1228         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1229 
1230         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1231         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1232         "YU\0EU\0YI\0I\0"
1233 
1234         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1235         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1236         "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1237     ;
1238 
1239     static AlgorithmicRange hangul={
1240         0xac00, 0xd7a3,
1241         1, 3,
1242         sizeof(AlgorithmicRange)+6+sizeof(jamo)
1243     };
1244 
1245     /* modulo factors, maximum 8 */
1246     /* 3 factors: 19, 21, 28, most-to-least-significant */
1247     static uint16_t hangulFactors[3]={
1248         19, 21, 28
1249     };
1250 
1251     uint32_t size;
1252 
1253     size=0;
1254 
1255     if(ucdVersion>=UNI_5_2) {
1256         /* Unicode 5.2 and up has a longer CJK Unihan range than before */
1257         cjk.rangeEnd=0x9FCB;
1258     } else if(ucdVersion>=UNI_5_1) {
1259         /* Unicode 5.1 and up has a longer CJK Unihan range than before */
1260         cjk.rangeEnd=0x9FC3;
1261     } else if(ucdVersion>=UNI_4_1) {
1262         /* Unicode 4.1 and up has a longer CJK Unihan range than before */
1263         cjk.rangeEnd=0x9FBB;
1264     }
1265 
1266     /* number of ranges of algorithmic names */
1267     if(!storeOptions->storeNames) {
1268         countAlgRanges=0;
1269     } else if(ucdVersion>=UNI_5_2) {
1270         /* Unicode 5.2 and up has 5 ranges including CJK Extension C */
1271         countAlgRanges=5;
1272     } else if(ucdVersion>=UNI_3_1) {
1273         /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1274         countAlgRanges=4;
1275     } else if(ucdVersion>=UNI_3_0) {
1276         /* Unicode 3.0 has 3 ranges including CJK Extension A */
1277         countAlgRanges=3;
1278     } else {
1279         /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1280         countAlgRanges=2;
1281     }
1282 
1283     if(pData!=NULL) {
1284         udata_write32(pData, countAlgRanges);
1285     } else {
1286         size+=4;
1287     }
1288     if(countAlgRanges==0) {
1289         return size;
1290     }
1291 
1292     /*
1293      * each range:
1294      * uint32_t rangeStart
1295      * uint32_t rangeEnd
1296      * uint8_t algorithmType
1297      * uint8_t algorithmVariant
1298      * uint16_t size of range data
1299      * uint8_t[size] data
1300      */
1301 
1302     /* range 0: cjk extension a */
1303     if(countAlgRanges>=3) {
1304         if(pData!=NULL) {
1305             udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1306             udata_writeString(pData, prefix, PREFIX_LENGTH);
1307             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1308                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1309             }
1310         } else {
1311             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1312         }
1313     }
1314 
1315     /* range 1: cjk */
1316     if(pData!=NULL) {
1317         udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1318         udata_writeString(pData, prefix, PREFIX_LENGTH);
1319         if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1320             udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1321         }
1322     } else {
1323         size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1324     }
1325 
1326     /* range 2: hangul syllables */
1327     if(pData!=NULL) {
1328         udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1329         udata_writeBlock(pData, hangulFactors, 6);
1330         udata_writeString(pData, jamo, sizeof(jamo));
1331     } else {
1332         size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1333     }
1334 
1335     /* range 3: cjk extension b */
1336     if(countAlgRanges>=4) {
1337         if(pData!=NULL) {
1338             udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1339             udata_writeString(pData, prefix, PREFIX_LENGTH);
1340             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1341                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1342             }
1343         } else {
1344             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1345         }
1346     }
1347 
1348     /* range 4: cjk extension c */
1349     if(countAlgRanges>=5) {
1350         if(pData!=NULL) {
1351             udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange));
1352             udata_writeString(pData, prefix, PREFIX_LENGTH);
1353             if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1354                 udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1355             }
1356         } else {
1357             size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1358         }
1359     }
1360 
1361     return size;
1362 }
1363 
1364 /* helpers ------------------------------------------------------------------ */
1365 
1366 static int16_t
findToken(uint8_t * s,int16_t length)1367 findToken(uint8_t *s, int16_t length) {
1368     int16_t i, token;
1369 
1370     for(i=0; i<(int16_t)tokenCount; ++i) {
1371         token=tokens[i];
1372         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1373             return i;
1374         }
1375     }
1376 
1377     return -1;
1378 }
1379 
1380 static Word *
findWord(char * s,int16_t length)1381 findWord(char *s, int16_t length) {
1382     uint32_t i;
1383 
1384     for(i=0; i<wordCount; ++i) {
1385         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1386             return words+i;
1387         }
1388     }
1389 
1390     return NULL;
1391 }
1392 
1393 static Word *
addWord(char * s,int16_t length)1394 addWord(char *s, int16_t length) {
1395     uint8_t *stringStart;
1396     Word *word;
1397 
1398     if(wordCount==MAX_WORD_COUNT) {
1399         fprintf(stderr, "gennames: too many words\n");
1400         exit(U_BUFFER_OVERFLOW_ERROR);
1401     }
1402 
1403     stringStart=allocWord(length);
1404     uprv_memcpy(stringStart, s, length);
1405 
1406     word=words+wordCount;
1407 
1408     /*
1409      * Initialize the weight with the costs for this token:
1410      * a zero-terminated string and a 16-bit offset.
1411      */
1412     word->weight=-(length+1+2);
1413     word->count=0;
1414     word->length=length;
1415     word->s=stringStart;
1416 
1417     ++wordCount;
1418 
1419     return word;
1420 }
1421 
1422 static void
countWord(Word * word)1423 countWord(Word *word) {
1424     /* add to the weight the savings: the length of the word minus 1 byte for the token */
1425     word->weight+=word->length-1;
1426     ++word->count;
1427 }
1428 
1429 static void
addLine(uint32_t code,char * names[],int16_t lengths[],int16_t count)1430 addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1431     uint8_t *stringStart;
1432     Line *line;
1433     int16_t i, length;
1434 
1435     if(lineCount==MAX_LINE_COUNT) {
1436         fprintf(stderr, "gennames: too many lines\n");
1437         exit(U_BUFFER_OVERFLOW_ERROR);
1438     }
1439 
1440     /* find the last non-empty name */
1441     while(count>0 && lengths[count-1]==0) {
1442         --count;
1443     }
1444     if(count==0) {
1445         return; /* should not occur: caller should not have called */
1446     }
1447 
1448     /* there will be (count-1) separator characters */
1449     i=count;
1450     length=count-1;
1451 
1452     /* add lengths of strings */
1453     while(i>0) {
1454         length+=lengths[--i];
1455     }
1456 
1457     /* allocate line memory */
1458     stringStart=allocLine(length);
1459 
1460     /* copy all strings into the line memory */
1461     length=0; /* number of chars copied so far */
1462     for(i=0; i<count; ++i) {
1463         if(i>0) {
1464             stringStart[length++]=NAME_SEPARATOR_CHAR;
1465         }
1466         if(lengths[i]>0) {
1467             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1468             length+=lengths[i];
1469         }
1470     }
1471 
1472     line=lines+lineCount;
1473 
1474     line->code=code;
1475     line->length=length;
1476     line->s=stringStart;
1477 
1478     ++lineCount;
1479 
1480     /* prevent a character value that is actually in a name from becoming a token */
1481     while(length>0) {
1482         tokens[stringStart[--length]]=-1;
1483     }
1484 }
1485 
1486 static void
addGroup(uint32_t groupMSB,uint8_t * strings,int16_t length)1487 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1488     uint8_t *stringStart;
1489     Line *line;
1490 
1491     if(lineCount==MAX_LINE_COUNT) {
1492         fprintf(stderr, "gennames: too many groups\n");
1493         exit(U_BUFFER_OVERFLOW_ERROR);
1494     }
1495 
1496     /* store the line lengths first, then the strings */
1497     lineLengthsTop=(lineLengthsTop+1)/2;
1498     stringStart=allocLine(lineLengthsTop+length);
1499     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1500     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1501 
1502     line=lines+lineCount;
1503 
1504     line->code=groupMSB;
1505     line->length=length;
1506     line->s=stringStart;
1507 
1508     ++lineCount;
1509 }
1510 
1511 static uint32_t
addToken(uint8_t * s,int16_t length)1512 addToken(uint8_t *s, int16_t length) {
1513     uint8_t *stringStart;
1514 
1515     stringStart=allocLine(length+1);
1516     uprv_memcpy(stringStart, s, length);
1517     stringStart[length]=0;
1518 
1519     return (uint32_t)(stringStart - stringStore);
1520 }
1521 
1522 static void
appendLineLength(int16_t length)1523 appendLineLength(int16_t length) {
1524     if(length>=76) {
1525         fprintf(stderr, "gennames: compressed line too long\n");
1526         exit(U_BUFFER_OVERFLOW_ERROR);
1527     }
1528     if(length>=12) {
1529         length-=12;
1530         appendLineLengthNibble((uint8_t)((length>>4)|12));
1531     }
1532     appendLineLengthNibble((uint8_t)length);
1533 }
1534 
1535 static void
appendLineLengthNibble(uint8_t nibble)1536 appendLineLengthNibble(uint8_t nibble) {
1537     if((lineLengthsTop&1)==0) {
1538         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1539     } else {
1540         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1541     }
1542     ++lineLengthsTop;
1543 }
1544 
1545 static uint8_t *
allocLine(int32_t length)1546 allocLine(int32_t length) {
1547     uint32_t top=lineTop+length;
1548     uint8_t *p;
1549 
1550     if(top>wordBottom) {
1551         fprintf(stderr, "gennames: out of memory\n");
1552         exit(U_MEMORY_ALLOCATION_ERROR);
1553     }
1554     p=stringStore+lineTop;
1555     lineTop=top;
1556     return p;
1557 }
1558 
1559 static uint8_t *
allocWord(uint32_t length)1560 allocWord(uint32_t length) {
1561     uint32_t bottom=wordBottom-length;
1562 
1563     if(lineTop>bottom) {
1564         fprintf(stderr, "gennames: out of memory\n");
1565         exit(U_MEMORY_ALLOCATION_ERROR);
1566     }
1567     wordBottom=bottom;
1568     return stringStore+bottom;
1569 }
1570 
1571 /*
1572  * Hey, Emacs, please set the following:
1573  *
1574  * Local Variables:
1575  * indent-tabs-mode: nil
1576  * End:
1577  *
1578  */
1579