• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  namespropsbuilder.cpp (was gennames/gennames.c)
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999sep30
16 *   created by: Markus W. Scherer
17 *
18 *   This builder reads Unicode character names and aliases,
19 *   tokenizes and compresses them, and builds
20 *   compact binary tables for random-access lookup
21 *   in a u_charName() API function.
22 *
23 * unames.icu file format (after UDataInfo header etc. - see udata.c)
24 * (all data is static const)
25 *
26 * UDataInfo fields:
27 *   dataFormat "unam"
28 *   formatVersion 1.0
29 *   dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
30 *
31 * -- data-based names
32 * uint32_t tokenStringOffset,
33 *          groupsOffset,
34 *          groupStringOffset,
35 *          algNamesOffset;
36 *
37 * uint16_t tokenCount;
38 * uint16_t tokenTable[tokenCount];
39 *
40 * char     tokenStrings[]; -- padded to even count
41 *
42 * -- strings (groupStrings) are tokenized as follows:
43 *   for each character c
44 *       if(c>=tokenCount) write that character c directly
45 *   else
46 *       token=tokenTable[c];
47 *       if(token==0xfffe) -- lead byte of double-byte token
48 *           token=tokenTable[c<<8|next character];
49 *       if(token==-1)
50 *           write c directly
51 *       else
52 *           tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
53 *           append zero-terminated tokenString;
54 *
55 *    Different strings for a code point - normal name, 1.0 name, and ISO comment -
56 *    are separated by ';'.
57 *
58 * uint16_t groupCount;
59 * struct {
60 *   uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
61 *   uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
62 *   uint16_t offsetLow;
63 * } groupTable[groupCount];
64 *
65 * char     groupStrings[]; -- padded to 4-count
66 *
67 * -- The actual, tokenized group strings are not zero-terminated because
68 *   that would take up too much space.
69 *   Instead, they are preceeded by their length, written in a variable-length sequence:
70 *   For each of the 32 group strings, one or two nibbles are stored for its length.
71 *   Nibbles (4-bit values, half-bytes) are read MSB first.
72 *   A nibble with a value of 0..11 directly indicates the length of the name string.
73 *   A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
74 *   by (((n-12)<<4)|m)+12, reaching values of 12..75.
75 *   These lengths are sequentially for each tokenized string, not for the de-tokenized result.
76 *   For the de-tokenizing, see token description above; the strings immediately follow the
77 *   32 lengths.
78 *
79 * -- algorithmic names
80 *
81 * typedef struct AlgorithmicRange {
82 *     uint32_t rangeStart, rangeEnd;
83 *     uint8_t algorithmType, algorithmVariant;
84 *     uint16_t rangeSize;
85 * } AlgorithmicRange;
86 *
87 * uint32_t algRangesCount; -- number of data blocks for ranges of
88 *               algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
89 *
90 * struct {
91 *     AlgorithmicRange algRange;
92 *     uint8_t algRangeData[]; -- padded to 4-count except in last range
93 * } algRanges[algNamesCount];
94 * -- not a real array because each part has a different size
95 *    of algRange.rangeSize (including AlgorithmicRange)
96 *
97 * -- algorithmic range types:
98 *
99 * 0 Names are formed from a string prefix that is stored in
100 *   the algRangeData (zero-terminated), followed by the Unicode code point
101 *   of the character in hexadecimal digits;
102 *   algRange.algorithmVariant digits are written
103 *
104 * 1 Names are formed by calculating modulo-factors of the code point value as follows:
105 *   algRange.algorithmVariant is the count of modulo factors
106 *   algRangeData contains
107 *       uint16_t factors[algRange.algorithmVariant];
108 *       char strings[];
109 *   the first zero-terminated string is written as the prefix; then:
110 *
111 *   The rangeStart is subtracted; with the difference, here "code":
112 *   for(i=algRange.algorithmVariant-1 to 0 step -1)
113 *       index[i]=code%factor[i];
114 *       code/=factor[i];
115 *
116 *   The strings after the prefix are short pieces that are then appended to the result
117 *   according to index[0..algRange.algorithmVariant-1].
118 */
119 
120 #include <stdio.h>
121 #include "unicode/utypes.h"
122 #include "unicode/putil.h"
123 #include "unicode/udata.h"
124 #include "charstr.h"
125 #include "cmemory.h"
126 #include "cstring.h"
127 #include "genprops.h"
128 #include "ppucd.h"
129 #include "uarrsort.h"
130 #include "uassert.h"
131 #include "unewdata.h"
132 #include "uoptions.h"
133 
134 #define STRING_STORE_SIZE 2000000
135 #define GROUP_STORE_SIZE 5000
136 
137 #define GROUP_SHIFT 5
138 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
139 #define GROUP_MASK (LINES_PER_GROUP-1)
140 
141 #define MAX_LINE_COUNT 50000
142 #define MAX_WORD_COUNT 20000
143 #define MAX_GROUP_COUNT 5000
144 
145 #define NAME_SEPARATOR_CHAR ';'
146 
147 /* generator data ----------------------------------------------------------- */
148 
149 U_NAMESPACE_USE
150 
151 /* UDataInfo cf. udata.h */
152 static UDataInfo dataInfo={
153     sizeof(UDataInfo),
154     0,
155 
156     U_IS_BIG_ENDIAN,
157     U_CHARSET_FAMILY,
158     sizeof(UChar),
159     0,
160 
161     {0x75, 0x6e, 0x61, 0x6d},     /* dataFormat="unam" */
162     {1, 0, 0, 0},                 /* formatVersion */
163     {3, 0, 0, 0}                  /* dataVersion */
164 };
165 
166 static uint8_t stringStore[STRING_STORE_SIZE],
167                groupStore[GROUP_STORE_SIZE],
168                lineLengths[LINES_PER_GROUP];
169 
170 static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
171 
172 typedef struct {
173     uint32_t code;
174     int16_t length;
175     uint8_t *s;
176 } Line;
177 
178 typedef struct {
179     int32_t weight; /* -(cost for token) + (number of occurences) * (length-1) */
180     int16_t count;
181     int16_t length;
182     uint8_t *s;
183 } Word;
184 
185 static Line lines[MAX_LINE_COUNT];
186 static Word words[MAX_WORD_COUNT];
187 
188 static uint32_t lineCount=0, wordCount=0;
189 
190 static int16_t leadByteCount;
191 
192 #define LEADBYTE_LIMIT 16
193 
194 static int16_t tokens[LEADBYTE_LIMIT*256];
195 static uint32_t tokenCount;
196 
197 /* the structure for algorithmic names needs to be 4-aligned */
198 struct AlgorithmicRange {
199     UChar32 start, end;
200     uint8_t type, variant;
201     uint16_t size;
202 };
203 
204 class NamesPropsBuilder : public PropsBuilder {
205 public:
206     NamesPropsBuilder(UErrorCode &errorCode);
207     virtual ~NamesPropsBuilder();
208 
209     virtual void setUnicodeVersion(const UVersionInfo version);
210     virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
211     virtual void build(UErrorCode &errorCode);
212     virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
213 
214 private:
215     virtual void setAlgNamesRange(UChar32 start, UChar32 end,
216                                   const char *type, const char *prefix, UErrorCode &errorCode);
217 
218     CharString algRanges;
219     int32_t countAlgRanges;
220 };
221 
NamesPropsBuilder(UErrorCode & errorCode)222 NamesPropsBuilder::NamesPropsBuilder(UErrorCode &errorCode)
223         : countAlgRanges(0) {
224     for(int i=0; i<256; ++i) {
225         tokens[i]=0;
226     }
227 }
228 
~NamesPropsBuilder()229 NamesPropsBuilder::~NamesPropsBuilder() {
230 }
231 
232 void
setUnicodeVersion(const UVersionInfo version)233 NamesPropsBuilder::setUnicodeVersion(const UVersionInfo version) {
234     uprv_memcpy(dataInfo.dataVersion, version, 4);
235 }
236 
237 /* prototypes --------------------------------------------------------------- */
238 
239 static void
240 parseName(const char *name, int16_t length);
241 
242 static int16_t
243 skipNoise(const char *line, int16_t start, int16_t limit);
244 
245 static int16_t
246 getWord(const char *line, int16_t start, int16_t limit);
247 
248 static void
249 compress(UErrorCode &errorCode);
250 
251 static void
252 compressLines(void);
253 
254 static int16_t
255 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
256 
257 static int32_t
258 compareWords(const void *context, const void *word1, const void *word2);
259 
260 static int16_t
261 findToken(uint8_t *s, int16_t length);
262 
263 static Word *
264 findWord(const char *s, int16_t length);
265 
266 static Word *
267 addWord(const char *s, int16_t length);
268 
269 static void
270 countWord(Word *word);
271 
272 static void
273 addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count);
274 
275 static void
276 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
277 
278 static uint32_t
279 addToken(uint8_t *s, int16_t length);
280 
281 static void
282 appendLineLength(int16_t length);
283 
284 static void
285 appendLineLengthNibble(uint8_t nibble);
286 
287 static uint8_t *
288 allocLine(int32_t length);
289 
290 static uint8_t *
291 allocWord(uint32_t length);
292 
293 /* parsing ------------------------------------------------------------------ */
294 
295 void
setProps(const UniProps & props,const UnicodeSet & newValues,UErrorCode & errorCode)296 NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
297                             UErrorCode &errorCode) {
298     if(U_FAILURE(errorCode)) { return; }
299     if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) {
300         return;
301     }
302 
303     U_ASSERT(props.start==props.end);
304 
305     const char *names[4]={ NULL, NULL, NULL, NULL };
306     int16_t lengths[4]={ 0, 0, 0, 0 };
307 
308     /* get the character name */
309     if(props.name!=NULL) {
310         names[0]=props.name;
311         lengths[0]=(int16_t)uprv_strlen(props.name);
312         parseName(names[0], lengths[0]);
313     }
314 
315     CharString buffer;
316     if(props.nameAlias!=NULL) {
317         /*
318          * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
319          * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
320          */
321         const char *corr=uprv_strstr(props.nameAlias, "correction=");
322         if(corr!=NULL) {
323             corr+=11;  // skip "correction="
324             const char *limit=uprv_strchr(corr, ',');
325             if(limit!=NULL) {
326                 buffer.append(corr, limit-corr, errorCode);
327                 names[3]=buffer.data();
328                 lengths[3]=(int16_t)(limit-corr);
329             } else {
330                 names[3]=corr;
331                 lengths[3]=(int16_t)uprv_strlen(corr);
332             }
333             parseName(names[3], lengths[3]);
334         }
335     }
336 
337     addLine(props.start, names, lengths, LENGTHOF(names));
338 }
339 
340 static void
parseName(const char * name,int16_t length)341 parseName(const char *name, int16_t length) {
342     int16_t start=0, limit, wordLength/*, prevStart=-1*/;
343     Word *word;
344 
345     while(start<length) {
346         /* skip any "noise" characters */
347         limit=skipNoise(name, start, length);
348         if(start<limit) {
349             /*prevStart=-1;*/
350             start=limit;
351         }
352         if(start==length) {
353             break;
354         }
355 
356         /* get a word and add it if it is longer than 1 */
357         limit=getWord(name, start, length);
358         wordLength=(int16_t)(limit-start);
359         if(wordLength>1) {
360             word=findWord(name+start, wordLength);
361             if(word==NULL) {
362                 word=addWord(name+start, wordLength);
363             }
364             countWord(word);
365         }
366 
367 #if 0
368         /*
369          * if there was a word before this
370          * (with no noise in between), then add the pair of words, too
371          */
372         if(prevStart!=-1) {
373             wordLength=limit-prevStart;
374             word=findWord(name+prevStart, wordLength);
375             if(word==NULL) {
376                 word=addWord(name+prevStart, wordLength);
377             }
378             countWord(word);
379         }
380 #endif
381 
382         /*prevStart=start;*/
383         start=limit;
384     }
385 }
386 
387 static UBool
isWordChar(char c)388 isWordChar(char c) {
389     return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
390            ('J'<=c && c<='R') ||
391            ('S'<=c && c<='Z') ||
392 
393            ('0'<=c && c<='9');
394 }
395 
396 static int16_t
skipNoise(const char * line,int16_t start,int16_t limit)397 skipNoise(const char *line, int16_t start, int16_t limit) {
398     /* skip anything that is not part of a word in this sense */
399     while(start<limit && !isWordChar(line[start])) {
400         ++start;
401     }
402 
403     return start;
404 }
405 
406 static int16_t
getWord(const char * line,int16_t start,int16_t limit)407 getWord(const char *line, int16_t start, int16_t limit) {
408     char c=0; /* initialize to avoid a compiler warning although the code was safe */
409 
410     /* a unicode character name word consists of A-Z0-9 */
411     while(start<limit && isWordChar(line[start])) {
412         ++start;
413     }
414 
415     /* include a following space or dash */
416     if(start<limit && ((c=line[start])==' ' || c=='-')) {
417         ++start;
418     }
419 
420     return start;
421 }
422 
423 void
setAlgNamesRange(UChar32 start,UChar32 end,const char * type,const char * prefix,UErrorCode & errorCode)424 NamesPropsBuilder::setAlgNamesRange(UChar32 start, UChar32 end,
425                                     const char *type,
426                                     const char *prefix,  // number of hex digits
427                                     UErrorCode &errorCode) {
428     /* modulo factors, maximum 8 */
429     /* 3 factors: 19, 21, 28, most-to-least-significant */
430     static const uint16_t hangulFactors[3]={
431         19, 21, 28
432     };
433 
434     static const char jamo[]=
435         "HANGUL SYLLABLE \0"
436 
437         "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
438         "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
439 
440         "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
441         "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
442         "YU\0EU\0YI\0I\0"
443 
444         "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
445         "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
446         "S\0SS\0NG\0J\0C\0K\0T\0P\0H";
447 
448     int32_t prefixLength=0;
449     AlgorithmicRange range;
450     uprv_memset(&range, 0, sizeof(AlgorithmicRange));
451     int32_t rangeSize=(int32_t)sizeof(AlgorithmicRange);
452     range.start=start;
453     range.end=end;
454     if(0==uprv_strcmp(type, "han")) {
455         range.type=0;
456         range.variant= end<=0xffff ? 4 : 5;
457         prefixLength=uprv_strlen(prefix)+1;
458         rangeSize+=prefixLength;
459     } else if(0==uprv_strcmp(type, "hangul")) {
460         range.type=1;
461         range.variant=(uint8_t)LENGTHOF(hangulFactors);
462         rangeSize+=(int32_t)sizeof(hangulFactors);
463         rangeSize+=(int32_t)sizeof(jamo);
464     } else {
465         fprintf(stderr, "genprops error: unknown algnamesrange type '%s'\n", prefix);
466         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
467         return;
468     }
469     int32_t paddingLength=rangeSize&3;
470     if(paddingLength) {
471         paddingLength=4-paddingLength;
472         rangeSize+=paddingLength;
473     }
474     range.size=(uint16_t)rangeSize;
475     algRanges.append((char *)&range, (int32_t)sizeof(AlgorithmicRange), errorCode);
476     if(range.type==0) {  // han
477         algRanges.append(prefix, prefixLength, errorCode);
478     } else /* type==1 */ {  // hangul
479         algRanges.append((char *)hangulFactors, (int32_t)sizeof(hangulFactors), errorCode);
480         algRanges.append(jamo, (int32_t)sizeof(jamo), errorCode);
481     }
482     while(paddingLength) {
483         algRanges.append((char)0xaa, errorCode);
484         --paddingLength;
485     }
486     ++countAlgRanges;
487 }
488 
489 /* compressing -------------------------------------------------------------- */
490 
491 static void
compress(UErrorCode & errorCode)492 compress(UErrorCode &errorCode) {
493     uint32_t i, letterCount;
494     int16_t wordNumber;
495 
496     /* sort the words in reverse order by weight */
497     uprv_sortArray(words, wordCount, sizeof(Word),
498                    compareWords, NULL, false, &errorCode);
499 
500     /* remove the words that do not save anything */
501     while(wordCount>0 && words[wordCount-1].weight<1) {
502         --wordCount;
503     }
504 
505     /* count the letters in the token range */
506     letterCount=0;
507     for(i=LEADBYTE_LIMIT; i<256; ++i) {
508         if(tokens[i]==-1) {
509             ++letterCount;
510         }
511     }
512     if(!beQuiet) {
513         printf("number of letters used in the names: %d\n", (int)letterCount);
514     }
515 
516     /* do we need double-byte tokens? */
517     if(wordCount+letterCount<=256) {
518         /* no, single-byte tokens are enough */
519         leadByteCount=0;
520         for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
521             if(tokens[i]!=-1) {
522                 tokens[i]=wordNumber;
523                 if(beVerbose) {
524                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
525                             (int)i, (long)words[wordNumber].weight,
526                             words[wordNumber].length, words[wordNumber].s);
527                 }
528                 ++wordNumber;
529             }
530         }
531         tokenCount=i;
532     } else {
533         /*
534          * The tokens that need two token bytes
535          * get their weight reduced by their count
536          * because they save less.
537          */
538         tokenCount=256-letterCount;
539         for(i=tokenCount; i<wordCount; ++i) {
540             words[i].weight-=words[i].count;
541         }
542 
543         /* sort these words in reverse order by weight */
544         errorCode=U_ZERO_ERROR;
545         uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
546                         compareWords, NULL, false, &errorCode);
547 
548         /* remove the words that do not save anything */
549         while(wordCount>0 && words[wordCount-1].weight<1) {
550             --wordCount;
551         }
552 
553         /* how many tokens and lead bytes do we have now? */
554         tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
555         /*
556          * adjust upwards to take into account that
557          * double-byte tokens must not
558          * use NAME_SEPARATOR_CHAR as a second byte
559          */
560         tokenCount+=(tokenCount-256+254)/255;
561 
562         leadByteCount=(int16_t)(tokenCount>>8);
563         if(leadByteCount<LEADBYTE_LIMIT) {
564             /* adjust for the real number of lead bytes */
565             tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
566         } else {
567             /* limit the number of lead bytes */
568             leadByteCount=LEADBYTE_LIMIT-1;
569             tokenCount=LEADBYTE_LIMIT*256;
570             wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
571             /* adjust again to skip double-byte tokens with ';' */
572             wordCount-=(tokenCount-256+254)/255;
573         }
574 
575         /* set token 0 to word 0 */
576         tokens[0]=0;
577         if(beVerbose) {
578             printf("tokens[0x000]: word%8ld \"%.*s\"\n",
579                     (long)words[0].weight,
580                     words[0].length, words[0].s);
581         }
582         wordNumber=1;
583 
584         /* set the lead byte tokens */
585         for(i=1; (int16_t)i<=leadByteCount; ++i) {
586             tokens[i]=-2;
587         }
588 
589         /* set the tokens */
590         for(; i<256; ++i) {
591             /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
592             if(tokens[i]!=-1) {
593                 tokens[i]=wordNumber;
594                 if(beVerbose) {
595                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
596                             (int)i, (long)words[wordNumber].weight,
597                             words[wordNumber].length, words[wordNumber].s);
598                 }
599                 ++wordNumber;
600             }
601         }
602 
603         /* continue above 255 where there are no letters */
604         for(; (uint32_t)wordNumber<wordCount; ++i) {
605             if((i&0xff)==NAME_SEPARATOR_CHAR) {
606                 tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
607             } else {
608                 tokens[i]=wordNumber;
609                 if(beVerbose) {
610                     printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
611                             (int)i, (long)words[wordNumber].weight,
612                             words[wordNumber].length, words[wordNumber].s);
613                 }
614                 ++wordNumber;
615             }
616         }
617         tokenCount=i; /* should be already tokenCount={i or i+1} */
618     }
619 
620     if(!beQuiet) {
621         printf("number of lead bytes: %d\n", leadByteCount);
622         printf("number of single-byte tokens: %lu\n",
623             (unsigned long)256-letterCount-leadByteCount);
624         printf("number of tokens: %lu\n", (unsigned long)tokenCount);
625     }
626 
627     compressLines();
628 }
629 
630 static void
compressLines()631 compressLines() {
632     Line *line=NULL;
633     uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
634              groupMSB=0xffff, lineCount2;
635     int16_t groupTop=0;
636 
637     /* store the groups like lines, with compressed data after raw strings */
638     groupBottom=lineTop;
639     lineCount2=lineCount;
640     lineCount=0;
641 
642     /* loop over all lines */
643     while(i<lineCount2) {
644         line=lines+i++;
645         inLine=line->code;
646 
647         /* segment the lines to groups of 32 */
648         if(inLine>>GROUP_SHIFT!=groupMSB) {
649             /* finish the current group with empty lines */
650             while((++outLine&GROUP_MASK)!=0) {
651                 appendLineLength(0);
652             }
653 
654             /* store the group like a line */
655             if(groupTop>0) {
656                 if(groupTop>GROUP_STORE_SIZE) {
657                     fprintf(stderr, "gennames: group store overflow\n");
658                     exit(U_BUFFER_OVERFLOW_ERROR);
659                 }
660                 addGroup(groupMSB, groupStore, groupTop);
661             }
662 
663             /* start the new group */
664             lineLengthsTop=0;
665             groupTop=0;
666             groupMSB=inLine>>GROUP_SHIFT;
667             outLine=(inLine&~GROUP_MASK)-1;
668         }
669 
670         /* write empty lines between the previous line in the group and this one */
671         while(++outLine<inLine) {
672             appendLineLength(0);
673         }
674 
675         /* write characters and tokens for this line */
676         appendLineLength(compressLine(line->s, line->length, &groupTop));
677     }
678 
679     /* finish and store the last group */
680     if(line && groupMSB!=0xffff) {
681         /* finish the current group with empty lines */
682         while((++outLine&GROUP_MASK)!=0) {
683             appendLineLength(0);
684         }
685 
686         /* store the group like a line */
687         if(groupTop>0) {
688             if(groupTop>GROUP_STORE_SIZE) {
689                 fprintf(stderr, "gennames: group store overflow\n");
690                 exit(U_BUFFER_OVERFLOW_ERROR);
691             }
692             addGroup(groupMSB, groupStore, groupTop);
693         }
694     }
695 
696     if(!beQuiet) {
697         printf("number of groups: %lu\n", (unsigned long)lineCount);
698     }
699 }
700 
701 static int16_t
compressLine(uint8_t * s,int16_t length,int16_t * pGroupTop)702 compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
703     int16_t start, limit, token, groupTop=*pGroupTop;
704 
705     start=0;
706     do {
707         /* write any "noise" characters */
708         limit=skipNoise((char *)s, start, length);
709         while(start<limit) {
710             groupStore[groupTop++]=s[start++];
711         }
712 
713         if(start==length) {
714             break;
715         }
716 
717         /* write a word, as token or directly */
718         limit=getWord((char *)s, start, length);
719         if(limit-start==1) {
720             groupStore[groupTop++]=s[start++];
721         } else {
722             token=findToken(s+start, (int16_t)(limit-start));
723             if(token!=-1) {
724                 if(token>0xff) {
725                     groupStore[groupTop++]=(uint8_t)(token>>8);
726                 }
727                 groupStore[groupTop++]=(uint8_t)token;
728                 start=limit;
729             } else {
730                 while(start<limit) {
731                     groupStore[groupTop++]=s[start++];
732                 }
733             }
734         }
735     } while(start<length);
736 
737     length=(int16_t)(groupTop-*pGroupTop);
738     *pGroupTop=groupTop;
739     return length;
740 }
741 
742 static int32_t
compareWords(const void * context,const void * word1,const void * word2)743 compareWords(const void *context, const void *word1, const void *word2) {
744     /* reverse sort by word weight */
745     return ((Word *)word2)->weight-((Word *)word1)->weight;
746 }
747 
748 void
build(UErrorCode & errorCode)749 NamesPropsBuilder::build(UErrorCode &errorCode) {
750     if(U_FAILURE(errorCode)) { return; }
751 
752     if(!beQuiet) {
753         puts("* unames.icu stats *");
754         printf("size of all names in the database: %lu\n",
755             (unsigned long)lineTop);
756         printf("number of named Unicode characters: %lu\n",
757             (unsigned long)lineCount);
758         printf("number of words in the dictionary from these names: %lu\n",
759             (unsigned long)wordCount);
760     }
761     compress(errorCode);
762 }
763 
764 /* generate output data ----------------------------------------------------- */
765 
766 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)767 NamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
768     if(U_FAILURE(errorCode)) { return; }
769 
770     UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo,
771                                        withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
772     if(U_FAILURE(errorCode)) {
773         fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n",
774                 path, u_errorName(errorCode));
775         return;
776     }
777 
778     uint16_t groupWords[3];
779     uint32_t i, groupTop=lineTop, size,
780              tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
781     long dataLength;
782     int16_t token;
783 
784     /* first, see how much space we need, and prepare the token strings */
785     for(i=0; i<tokenCount; ++i) {
786         token=tokens[i];
787         if(token!=-1 && token!=-2) {
788             tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
789         }
790     }
791 
792     /*
793      * Required padding for data swapping:
794      * The token table undergoes a permutation during data swapping when the
795      * input and output charsets are different.
796      * The token table cannot grow during swapping, so we need to make sure that
797      * the table is long enough for successful in-place permutation.
798      *
799      * We simply round up tokenCount to the next multiple of 256 to account for
800      * all possible permutations.
801      *
802      * An optimization is possible if we only ever swap between ASCII and EBCDIC:
803      *
804      * If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
805      * and will be swapped between ASCII and EBCDIC between
806      * positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
807      * This should be the only -1 entry in tokens[256..511] on which the data
808      * swapper bases its trail byte permutation map (trailMap[]).
809      *
810      * It would be sufficient to increase tokenCount so that its lower 8 bits
811      * are at least 0x5e+1 to make room for swapping between the two semicolons.
812      * For values higher than 0x5e, the trail byte permutation map (trailMap[])
813      * should always be an identity map, where we do not need additional room.
814      */
815     i=tokenCount;
816     tokenCount=(tokenCount+0xff)&~0xff;
817     if(!beQuiet && i<tokenCount) {
818         printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
819     }
820     for(; i<tokenCount; ++i) {
821         if((i&0xff)==NAME_SEPARATOR_CHAR) {
822             tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
823         } else {
824             tokens[i]=0; /* unused token for padding */
825         }
826     }
827 
828     /*
829      * Calculate the total size in bytes of the data including:
830      * - the offset to the token strings, uint32_t (4)
831      * - the offset to the group table, uint32_t (4)
832      * - the offset to the group strings, uint32_t (4)
833      * - the offset to the algorithmic names, uint32_t (4)
834      *
835      * - the number of tokens, uint16_t (2)
836      * - the token table, uint16_t[tokenCount] (2*tokenCount)
837      *
838      * - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
839      *
840      * - the number of groups, uint16_t (2)
841      * - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
842      *
843      * - the group strings (groupTop-groupBottom), 2-padded
844      *
845      * - the size of the data for the algorithmic names
846      */
847     tokenStringOffset=4+4+4+4+2+2*tokenCount;
848     groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
849     groupStringOffset=groupsOffset+2+6*lineCount;
850     algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
851 
852     size=algNamesOffset+4+algRanges.length();
853 
854     if(!beQuiet) {
855         printf("size of the Unicode Names data:\n"
856                "total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
857                 (unsigned long)size, (unsigned long)(lineTop-groupTop),
858                 (unsigned long)(groupTop-groupBottom), (unsigned long)(4+algRanges.length()));
859     }
860 
861     /* write the data to the file */
862     /* offsets */
863     udata_write32(pData, tokenStringOffset);
864     udata_write32(pData, groupsOffset);
865     udata_write32(pData, groupStringOffset);
866     udata_write32(pData, algNamesOffset);
867 
868     /* token table */
869     udata_write16(pData, (uint16_t)tokenCount);
870     udata_writeBlock(pData, tokens, 2*tokenCount);
871 
872     /* token strings */
873     udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
874     if((lineTop-groupTop)&1) {
875         /* 2-padding */
876         udata_writePadding(pData, 1);
877     }
878 
879     /* group table */
880     udata_write16(pData, (uint16_t)lineCount);
881     for(i=0; i<lineCount; ++i) {
882         /* groupMSB */
883         groupWords[0]=(uint16_t)lines[i].code;
884 
885         /* offset */
886         uint32_t offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
887         groupWords[1]=(uint16_t)(offset>>16);
888         groupWords[2]=(uint16_t)(offset);
889         udata_writeBlock(pData, groupWords, 6);
890     }
891 
892     /* group strings */
893     udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
894 
895     /* 4-align the algorithmic names data */
896     udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
897 
898     udata_write32(pData, countAlgRanges);
899     udata_writeBlock(pData, algRanges.data(), algRanges.length());
900 
901     /* finish up */
902     dataLength=udata_finish(pData, &errorCode);
903     if(U_FAILURE(errorCode)) {
904         fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
905         exit(errorCode);
906     }
907 
908     if(dataLength!=(long)size) {
909         fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
910 dataLength, (unsigned long)size);
911         exit(U_INTERNAL_PROGRAM_ERROR);
912     }
913 }
914 
915 /* helpers ------------------------------------------------------------------ */
916 
917 static int16_t
findToken(uint8_t * s,int16_t length)918 findToken(uint8_t *s, int16_t length) {
919     int16_t i, token;
920 
921     for(i=0; i<(int16_t)tokenCount; ++i) {
922         token=tokens[i];
923         if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
924             return i;
925         }
926     }
927 
928     return -1;
929 }
930 
931 static Word *
findWord(const char * s,int16_t length)932 findWord(const char *s, int16_t length) {
933     uint32_t i;
934 
935     for(i=0; i<wordCount; ++i) {
936         if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
937             return words+i;
938         }
939     }
940 
941     return NULL;
942 }
943 
944 static Word *
addWord(const char * s,int16_t length)945 addWord(const char *s, int16_t length) {
946     uint8_t *stringStart;
947     Word *word;
948 
949     if(wordCount==MAX_WORD_COUNT) {
950         fprintf(stderr, "gennames: too many words\n");
951         exit(U_BUFFER_OVERFLOW_ERROR);
952     }
953 
954     stringStart=allocWord(length);
955     uprv_memcpy(stringStart, s, length);
956 
957     word=words+wordCount;
958 
959     /*
960      * Initialize the weight with the costs for this token:
961      * a zero-terminated string and a 16-bit offset.
962      */
963     word->weight=-(length+1+2);
964     word->count=0;
965     word->length=length;
966     word->s=stringStart;
967 
968     ++wordCount;
969 
970     return word;
971 }
972 
973 static void
countWord(Word * word)974 countWord(Word *word) {
975     /* add to the weight the savings: the length of the word minus 1 byte for the token */
976     word->weight+=word->length-1;
977     ++word->count;
978 }
979 
980 static void
addLine(UChar32 code,const char * names[],int16_t lengths[],int16_t count)981 addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count) {
982     uint8_t *stringStart;
983     Line *line;
984     int16_t i, length;
985 
986     if(lineCount==MAX_LINE_COUNT) {
987         fprintf(stderr, "gennames: too many lines\n");
988         exit(U_BUFFER_OVERFLOW_ERROR);
989     }
990 
991     /* find the last non-empty name */
992     while(count>0 && lengths[count-1]==0) {
993         --count;
994     }
995     if(count==0) {
996         return; /* should not occur: caller should not have called */
997     }
998 
999     /* there will be (count-1) separator characters */
1000     i=count;
1001     length=count-1;
1002 
1003     /* add lengths of strings */
1004     while(i>0) {
1005         length+=lengths[--i];
1006     }
1007 
1008     /* allocate line memory */
1009     stringStart=allocLine(length);
1010 
1011     /* copy all strings into the line memory */
1012     length=0; /* number of chars copied so far */
1013     for(i=0; i<count; ++i) {
1014         if(i>0) {
1015             stringStart[length++]=NAME_SEPARATOR_CHAR;
1016         }
1017         if(lengths[i]>0) {
1018             uprv_memcpy(stringStart+length, names[i], lengths[i]);
1019             length+=lengths[i];
1020         }
1021     }
1022 
1023     line=lines+lineCount;
1024 
1025     line->code=code;
1026     line->length=length;
1027     line->s=stringStart;
1028 
1029     ++lineCount;
1030 
1031     /* prevent a character value that is actually in a name from becoming a token */
1032     while(length>0) {
1033         tokens[stringStart[--length]]=-1;
1034     }
1035 }
1036 
1037 static void
addGroup(uint32_t groupMSB,uint8_t * strings,int16_t length)1038 addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1039     uint8_t *stringStart;
1040     Line *line;
1041 
1042     if(lineCount==MAX_LINE_COUNT) {
1043         fprintf(stderr, "gennames: too many groups\n");
1044         exit(U_BUFFER_OVERFLOW_ERROR);
1045     }
1046 
1047     /* store the line lengths first, then the strings */
1048     lineLengthsTop=(lineLengthsTop+1)/2;
1049     stringStart=allocLine(lineLengthsTop+length);
1050     uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1051     uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1052 
1053     line=lines+lineCount;
1054 
1055     line->code=groupMSB;
1056     line->length=length;
1057     line->s=stringStart;
1058 
1059     ++lineCount;
1060 }
1061 
1062 static uint32_t
addToken(uint8_t * s,int16_t length)1063 addToken(uint8_t *s, int16_t length) {
1064     uint8_t *stringStart;
1065 
1066     stringStart=allocLine(length+1);
1067     uprv_memcpy(stringStart, s, length);
1068     stringStart[length]=0;
1069 
1070     return (uint32_t)(stringStart - stringStore);
1071 }
1072 
1073 static void
appendLineLength(int16_t length)1074 appendLineLength(int16_t length) {
1075     if(length>=76) {
1076         fprintf(stderr, "gennames: compressed line too long\n");
1077         exit(U_BUFFER_OVERFLOW_ERROR);
1078     }
1079     if(length>=12) {
1080         length-=12;
1081         appendLineLengthNibble((uint8_t)((length>>4)|12));
1082     }
1083     appendLineLengthNibble((uint8_t)length);
1084 }
1085 
1086 static void
appendLineLengthNibble(uint8_t nibble)1087 appendLineLengthNibble(uint8_t nibble) {
1088     if((lineLengthsTop&1)==0) {
1089         lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1090     } else {
1091         lineLengths[lineLengthsTop/2]|=nibble&0xf;
1092     }
1093     ++lineLengthsTop;
1094 }
1095 
1096 static uint8_t *
allocLine(int32_t length)1097 allocLine(int32_t length) {
1098     uint32_t top=lineTop+length;
1099     uint8_t *p;
1100 
1101     if(top>wordBottom) {
1102         fprintf(stderr, "gennames allocLine(): out of memory\n");
1103         exit(U_MEMORY_ALLOCATION_ERROR);
1104     }
1105     p=stringStore+lineTop;
1106     lineTop=top;
1107     return p;
1108 }
1109 
1110 static uint8_t *
allocWord(uint32_t length)1111 allocWord(uint32_t length) {
1112     uint32_t bottom=wordBottom-length;
1113 
1114     if(lineTop>bottom) {
1115         fprintf(stderr, "gennames allocWord(): out of memory\n");
1116         exit(U_MEMORY_ALLOCATION_ERROR);
1117     }
1118     wordBottom=bottom;
1119     return stringStore+bottom;
1120 }
1121 
1122 PropsBuilder *
createNamesPropsBuilder(UErrorCode & errorCode)1123 createNamesPropsBuilder(UErrorCode &errorCode) {
1124     if(U_FAILURE(errorCode)) { return NULL; }
1125     PropsBuilder *pb=new NamesPropsBuilder(errorCode);
1126     if(pb==NULL) {
1127         errorCode=U_MEMORY_ALLOCATION_ERROR;
1128     }
1129     return pb;
1130 }
1131 
1132 /*
1133  * Hey, Emacs, please set the following:
1134  *
1135  * Local Variables:
1136  * indent-tabs-mode: nil
1137  * End:
1138  *
1139  */
1140