• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  store.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2001may25
14 *   created by: Markus W. Scherer
15 *
16 *   Store Unicode normalization data in a memory-mappable file.
17 */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "filestrm.h"
27 #include "unicode/udata.h"
28 #include "utrie.h"
29 #include "unicode/uset.h"
30 #include "toolutil.h"
31 #include "unewdata.h"
32 #include "writesrc.h"
33 #include "unormimp.h"
34 #include "gennorm.h"
35 
36 #define DO_DEBUG_OUT 0
37 
38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39 
40 /*
41  * The new implementation of the normalization code loads its data from
42  * unorm.icu, which is generated with this gennorm tool.
43  * The format of that file is described in unormimp.h .
44  */
45 
46 /* file data ---------------------------------------------------------------- */
47 
48 #if UCONFIG_NO_NORMALIZATION
49 
50 /* dummy UDataInfo cf. udata.h */
51 static UDataInfo dataInfo = {
52     sizeof(UDataInfo),
53     0,
54 
55     U_IS_BIG_ENDIAN,
56     U_CHARSET_FAMILY,
57     U_SIZEOF_UCHAR,
58     0,
59 
60     { 0, 0, 0, 0 },                 /* dummy dataFormat */
61     { 0, 0, 0, 0 },                 /* dummy formatVersion */
62     { 0, 0, 0, 0 }                  /* dummy dataVersion */
63 };
64 
65 #else
66 
67 /* UDataInfo cf. udata.h */
68 static UDataInfo dataInfo={
69     sizeof(UDataInfo),
70     0,
71 
72     U_IS_BIG_ENDIAN,
73     U_CHARSET_FAMILY,
74     U_SIZEOF_UCHAR,
75     0,
76 
77     { 0x4e, 0x6f, 0x72, 0x6d },   /* dataFormat="Norm" */
78     { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
79     { 3, 2, 0, 0 }                /* dataVersion (Unicode version) */
80 };
81 
82 extern void
setUnicodeVersion(const char * v)83 setUnicodeVersion(const char *v) {
84     UVersionInfo version;
85     u_versionFromString(version, v);
86     uprv_memcpy(dataInfo.dataVersion, version, 4);
87 }
88 
89 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
90 
91 /* builder data ------------------------------------------------------------- */
92 
93 /* modularization flags, see gennorm.h (default to "store everything") */
94 uint32_t gStoreFlags=0xffffffff;
95 
96 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
97 
98 static UNewTrie
99     *normTrie,
100     *norm32Trie,
101     *fcdTrie,
102     *auxTrie;
103 
104 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
105 
106 static Norm *norms;
107 
/*
 * One flag per code point that has been seen in a stored decomposition.
 * A newly stored code point is only applied to earlier decompositions
 * if its flag is set, i.e., if it has actually been used before.
 */
112 static uint32_t haveSeenFlags[256];
113 
114 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
115 static USet *nfdQCNoSet;
116 
117 /* see addCombiningCP() for details */
118 static uint32_t combiningCPs[2000];
119 
120 /*
121  * after processCombining() this contains for each code point in combiningCPs[]
122  * the runtime combining index
123  */
124 static uint16_t combiningIndexes[2000];
125 
126 /* section limits for combiningCPs[], see addCombiningCP() */
127 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
128 
/**
 * Structure for a triple of code points, stored in combiningTriplesMem.
 * The lead and trail code points combine into the combined one,
 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
 *
 * Before processCombining() is called, leadIndex and trailIndex are 0.
 * After processCombining(), they contain the indexes of the lead and trail
 * code point in the combiningCPs[] array.
 * The triples are then sorted by leadIndex, then trailIndex.
 * They are not sorted by code points.
 */
typedef struct CombiningTriple {
    /* indexes into combiningCPs[], set by processCombining() */
    uint16_t leadIndex;
    uint16_t trailIndex;

    /* the code points: combined decomposes canonically into <lead, trail> */
    uint32_t lead;
    uint32_t trail;
    uint32_t combined;
} CombiningTriple;
144 
145 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
146 static uint16_t combiningTable[0x8000];
147 static uint16_t combiningTableTop=0;
148 
149 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
150 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
151                                +10000]; /* +10000 for exclusion sets */
152 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
153 static int32_t canonSetsCount=0;
154 
155 /* allocate and initialize a Norm unit */
156 static Norm *
allocNorm()157 allocNorm() {
158     /* allocate Norm */
159     Norm *p=(Norm *)utm_alloc(normMem);
160     /*
161      * The combiningIndex must not be initialized to 0 because 0 is the
162      * combiningIndex of the first forward-combining character.
163      */
164     p->combiningIndex=0xffff;
165     return p;
166 }
167 
168 extern void
init()169 init() {
170     uint16_t *p16;
171 
172     normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
173     uprv_memset(normTrie, 0, sizeof(UNewTrie));
174     norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
175     uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
176     fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
177     uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
178     auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
179     uprv_memset(auxTrie, 0, sizeof(UNewTrie));
180 
181     /* initialize the two tries */
182     if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
183         fprintf(stderr, "error: failed to initialize tries\n");
184         exit(U_MEMORY_ALLOCATION_ERROR);
185     }
186 
187     /* allocate Norm structures and reset the first one */
188     normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
189     norms=allocNorm();
190 
191     /* allocate UTF-32 string memory */
192     utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
193 
194     /* reset all "have seen" flags */
195     uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
196 
197     /* open an empty set */
198     nfdQCNoSet=uset_open(1, 0);
199 
200     /* allocate extra data memory for UTF-16 decomposition strings and other values */
201     extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
202     /* initialize the extraMem counter for the top of FNC strings */
203     p16=(uint16_t *)utm_alloc(extraMem);
204     *p16=1;
205 
206     /* allocate temporary memory for combining triples */
207     combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
208 
209     /* set the minimum code points for no/maybe quick check values to the end of the BMP */
210     indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
211     indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
212     indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
213     indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
214 
215     /* preset the indexes portion of canonStartSets */
216     uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
217 }
218 
219 /*
220  * get or create a Norm unit;
221  * get or create the intermediate trie entries for it as well
222  */
223 static Norm *
createNorm(uint32_t code)224 createNorm(uint32_t code) {
225     Norm *p;
226     uint32_t i;
227 
228     i=utrie_get32(normTrie, (UChar32)code, NULL);
229     if(i!=0) {
230         p=norms+i;
231     } else {
232         /* allocate Norm */
233         p=allocNorm();
234         if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
235             fprintf(stderr, "error: too many normalization entries\n");
236             exit(U_BUFFER_OVERFLOW_ERROR);
237         }
238     }
239     return p;
240 }
241 
242 /* get an existing Norm unit */
243 static Norm *
getNorm(uint32_t code)244 getNorm(uint32_t code) {
245     uint32_t i;
246 
247     i=utrie_get32(normTrie, (UChar32)code, NULL);
248     if(i==0) {
249         return NULL;
250     }
251     return norms+i;
252 }
253 
254 /* get the canonical combining class of a character */
255 static uint8_t
getCCFromCP(uint32_t code)256 getCCFromCP(uint32_t code) {
257     Norm *norm=getNorm(code);
258     if(norm==NULL) {
259         return 0;
260     } else {
261         return norm->udataCC;
262     }
263 }
264 
265 /*
266  * enumerate all code points with their Norm structs and call a function for each
267  * return the number of code points with data
268  */
269 static uint32_t
enumTrie(EnumTrieFn * fn,void * context)270 enumTrie(EnumTrieFn *fn, void *context) {
271     uint32_t count, i;
272     UChar32 code;
273     UBool isInBlockZero;
274 
275     count=0;
276     for(code=0; code<=0x10ffff;) {
277         i=utrie_get32(normTrie, code, &isInBlockZero);
278         if(isInBlockZero) {
279             code+=UTRIE_DATA_BLOCK_LENGTH;
280         } else {
281             if(i!=0) {
282                 fn(context, (uint32_t)code, norms+i);
283                 ++count;
284             }
285             ++code;
286         }
287     }
288     return count;
289 }
290 
291 static void
setHaveSeenString(const uint32_t * s,int32_t length)292 setHaveSeenString(const uint32_t *s, int32_t length) {
293     uint32_t c;
294 
295     while(length>0) {
296         c=*s++;
297         haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
298         --length;
299     }
300 }
301 
302 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
303 
304 /* handle combining data ---------------------------------------------------- */
305 
306 /*
307  * Insert an entry into combiningCPs[] for the new code point code with its flags.
308  * The flags indicate if code combines forward, backward, or both.
309  *
310  * combiningCPs[] contains three sections:
311  * 1. code points that combine forward
312  * 2. code points that combine forward and backward
313  * 3. code points that combine backward
314  *
315  * Search for code in the entire array.
316  * If it is found and already is in the right section (old flags==new flags)
317  * then we are done.
318  * If it is found but the flags are different, then remove it,
319  * union the old and new flags, and reinsert it into its correct section.
320  * If it is not found, then just insert it.
321  *
322  * Within each section, the code points are not sorted.
323  */
324 static void
addCombiningCP(uint32_t code,uint8_t flags)325 addCombiningCP(uint32_t code, uint8_t flags) {
326     uint32_t newEntry;
327     uint16_t i;
328 
329     newEntry=code|((uint32_t)flags<<24);
330 
331     /* search for this code point */
332     for(i=0; i<combineBackTop; ++i) {
333         if(code==(combiningCPs[i]&0xffffff)) {
334             /* found it */
335             if(newEntry==combiningCPs[i]) {
336                 return; /* no change */
337             }
338 
339             /* combine the flags, remove the old entry from the old place, and insert the new one */
340             newEntry|=combiningCPs[i];
341             if(i!=--combineBackTop) {
342                 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
343             }
344             if(i<combineBothTop) {
345                 --combineBothTop;
346             }
347             if(i<combineFwdTop) {
348                 --combineFwdTop;
349             }
350             break;
351         }
352     }
353 
354     /* not found or modified, insert it */
355     if(combineBackTop>=sizeof(combiningCPs)/4) {
356         fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
357                 (long)(sizeof(combiningCPs)/4));
358         exit(U_MEMORY_ALLOCATION_ERROR);
359     }
360 
361     /* set i to the insertion point */
362     flags=(uint8_t)(newEntry>>24);
363     if(flags==1) {
364         i=combineFwdTop++;
365         ++combineBothTop;
366     } else if(flags==3) {
367         i=combineBothTop++;
368     } else /* flags==2 */ {
369         i=combineBackTop;
370     }
371 
372     /* move the following code points up one and insert newEntry at i */
373     if(i<combineBackTop) {
374         uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
375     }
376     combiningCPs[i]=newEntry;
377 
378     /* finally increment the total counter */
379     ++combineBackTop;
380 }
381 
382 /**
383  * Find the index in combiningCPs[] where code point code is stored.
384  * @param code code point to look for
385  * @param isLead is code a forward combining code point?
386  * @return index in combiningCPs[] where code is stored
387  */
388 static uint16_t
findCombiningCP(uint32_t code,UBool isLead)389 findCombiningCP(uint32_t code, UBool isLead) {
390     uint16_t i, limit;
391 
392     if(isLead) {
393         i=0;
394         limit=combineBothTop;
395     } else {
396         i=combineFwdTop;
397         limit=combineBackTop;
398     }
399 
400     /* search for this code point */
401     for(; i<limit; ++i) {
402         if(code==(combiningCPs[i]&0xffffff)) {
403             /* found it */
404             return i;
405         }
406     }
407 
408     /* not found */
409     return 0xffff;
410 }
411 
412 static void
addCombiningTriple(uint32_t lead,uint32_t trail,uint32_t combined)413 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
414     CombiningTriple *triple;
415 
416     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
417         return;
418     }
419 
420     /*
421      * set combiningFlags for the two code points
422      * do this after decomposition so that getNorm() above returns NULL
423      * if we do not have actual sub-decomposition data for the initial NFD here
424      */
425     createNorm(lead)->combiningFlags|=1;    /* combines forward */
426     createNorm(trail)->combiningFlags|=2;    /* combines backward */
427 
428     addCombiningCP(lead, 1);
429     addCombiningCP(trail, 2);
430 
431     triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
432     triple->lead=lead;
433     triple->trail=trail;
434     triple->combined=combined;
435 }
436 
437 static int
compareTriples(const void * l,const void * r)438 compareTriples(const void *l, const void *r) {
439     int diff;
440     diff=(int)((CombiningTriple *)l)->leadIndex-
441          (int)((CombiningTriple *)r)->leadIndex;
442     if(diff==0) {
443         diff=(int)((CombiningTriple *)l)->trailIndex-
444              (int)((CombiningTriple *)r)->trailIndex;
445     }
446     return diff;
447 }
448 
449 static void
processCombining()450 processCombining() {
451     CombiningTriple *triples;
452     uint16_t *p;
453     uint32_t combined;
454     uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
455 
456     triples=utm_getStart(combiningTriplesMem);
457 
458     /* add lead and trail indexes to the triples for sorting */
459     count=(uint16_t)utm_countItems(combiningTriplesMem);
460     for(i=0; i<count; ++i) {
461         /* findCombiningCP() must always find the code point */
462         triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
463         triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
464     }
465 
466     /* sort them by leadIndex, trailIndex */
467     qsort(triples, count, sizeof(CombiningTriple), compareTriples);
468 
469     /* calculate final combining indexes and store them in the Norm entries */
470     tableTop=0;
471     j=0; /* triples counter */
472 
473     /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
474     for(i=0; i<combineBothTop; ++i) {
475         /* start a new table */
476 
477         /* assign combining index */
478         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
479 
480         /* calculate the length of the combining data for this lead code point in the combiningTable */
481         while(j<count && i==triples[j].leadIndex) {
482             /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
483             combined=triples[j++].combined;
484             if(combined<=0x1fff) {
485                 tableTop+=2;
486             } else {
487                 tableTop+=3;
488             }
489         }
490     }
491 
492     /* second, combining indexes of back-only characters are simply incremented from here to be unique */
493     finalIndex=tableTop;
494     for(; i<combineBackTop; ++i) {
495         createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
496     }
497 
498     /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
499     if(finalIndex>0x8000) {
500         fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
501                 tableTop, (long)(sizeof(combiningTable)/4));
502         exit(U_MEMORY_ALLOCATION_ERROR);
503     }
504 
505     combiningTableTop=tableTop;
506 
507     /* store the combining data in the combiningTable, with the final indexes from above */
508     p=combiningTable;
509     j=0; /* triples counter */
510 
511     /*
512      * this is essentially the same loop as above, but
513      * it writes the table data instead of calculating and setting the final indexes;
514      * it is necessary to have two passes so that all the final indexes are known before
515      * they are written into the table
516      */
517     for(i=0; i<combineBothTop; ++i) {
518         /* start a new table */
519 
520         combined=0; /* avoid compiler warning */
521 
522         /* store the combining data for this lead code point in the combiningTable */
523         while(j<count && i==triples[j].leadIndex) {
524             finalIndex=combiningIndexes[triples[j].trailIndex];
525             combined=triples[j++].combined;
526 
527             /* is combined a starter? (i.e., cc==0 && combines forward) */
528             combinesFwd=(uint16_t)((getNorm(combined)->combiningFlags&1)<<13);
529 
530             *p++=finalIndex;
531             if(combined<=0x1fff) {
532                 *p++=(uint16_t)(combinesFwd|combined);
533             } else if(combined<=0xffff) {
534                 *p++=(uint16_t)(0x8000|combinesFwd);
535                 *p++=(uint16_t)combined;
536             } else {
537                 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
538                 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
539             }
540         }
541 
542         /* set a marker on the last final trail index in this lead's table */
543         if(combined<=0x1fff) {
544             *(p-2)|=0x8000;
545         } else {
546             *(p-3)|=0x8000;
547         }
548     }
549 
550     /* post condition: tableTop==(p-combiningTable) */
551 }
552 
553 /* processing incoming normalization data ----------------------------------- */
554 
555 /*
556  * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
557  * c must be a Hangul syllable code point.
558  */
559 static void
getHangulDecomposition(uint32_t c,Norm * pHangulNorm,uint32_t hangulBuffer[3])560 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
561     /* Hangul syllable: decompose algorithmically */
562     uint32_t c2;
563     uint8_t length;
564 
565     uprv_memset(pHangulNorm, 0, sizeof(Norm));
566 
567     c-=HANGUL_BASE;
568 
569     c2=c%JAMO_T_COUNT;
570     c/=JAMO_T_COUNT;
571     if(c2>0) {
572         hangulBuffer[2]=JAMO_T_BASE+c2;
573         length=3;
574     } else {
575         hangulBuffer[2]=0;
576         length=2;
577     }
578 
579     hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
580     hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
581 
582     pHangulNorm->nfd=hangulBuffer;
583     pHangulNorm->lenNFD=length;
584     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
585         pHangulNorm->nfkd=hangulBuffer;
586         pHangulNorm->lenNFKD=length;
587     }
588 }
589 
590 /*
591  * decompose the one decomposition further, may generate two decompositions
592  * apply all previous characters' decompositions to this one
593  */
594 static void
decompStoreNewNF(uint32_t code,Norm * norm)595 decompStoreNewNF(uint32_t code, Norm *norm) {
596     uint32_t nfd[40], nfkd[40], hangulBuffer[3];
597     Norm hangulNorm;
598 
599     uint32_t *s32;
600     Norm *p;
601     uint32_t c;
602     int32_t i, length;
603     uint8_t lenNFD=0, lenNFKD=0;
604     UBool changedNFD=FALSE, changedNFKD=FALSE;
605 
606     if((length=norm->lenNFD)!=0) {
607         /* always allocate the original string */
608         changedNFD=TRUE;
609         s32=norm->nfd;
610     } else if((length=norm->lenNFKD)!=0) {
611         /* always allocate the original string */
612         changedNFKD=TRUE;
613         s32=norm->nfkd;
614     } else {
615         /* no decomposition here, nothing to do */
616         return;
617     }
618 
619     /* decompose each code point */
620     for(i=0; i<length; ++i) {
621         c=s32[i];
622         p=getNorm(c);
623         if(p==NULL) {
624             if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
625                 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
626                 p=&hangulNorm;
627             } else {
628                 /* no data, no decomposition */
629                 nfd[lenNFD++]=c;
630                 nfkd[lenNFKD++]=c;
631                 continue;
632             }
633         }
634 
635         /* canonically decompose c */
636         if(changedNFD) {
637             if(p->lenNFD!=0) {
638                 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
639                 lenNFD+=p->lenNFD;
640             } else {
641                 nfd[lenNFD++]=c;
642             }
643         }
644 
645         /* compatibility-decompose c */
646         if(p->lenNFKD!=0) {
647             uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
648             lenNFKD+=p->lenNFKD;
649             changedNFKD=TRUE;
650         } else if(p->lenNFD!=0) {
651             uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
652             lenNFKD+=p->lenNFD;
653             /*
654              * not  changedNFKD=TRUE;
655              * so that we do not store a new nfkd if there was no nfkd string before
656              * and we only see canonical decompositions
657              */
658         } else {
659             nfkd[lenNFKD++]=c;
660         }
661     }
662 
663     /* assume that norm->lenNFD==1 or ==2 */
664     if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
665         addCombiningTriple(s32[0], s32[1], code);
666     }
667 
668     if(changedNFD) {
669         if(lenNFD!=0) {
670             s32=utm_allocN(utf32Mem, lenNFD);
671             uprv_memcpy(s32, nfd, lenNFD*4);
672         } else {
673             s32=NULL;
674         }
675         norm->lenNFD=lenNFD;
676         norm->nfd=s32;
677         setHaveSeenString(nfd, lenNFD);
678     }
679     if(changedNFKD) {
680         if(lenNFKD!=0) {
681             s32=utm_allocN(utf32Mem, lenNFKD);
682             uprv_memcpy(s32, nfkd, lenNFKD*4);
683         } else {
684             s32=NULL;
685         }
686         norm->lenNFKD=lenNFKD;
687         norm->nfkd=s32;
688         setHaveSeenString(nfkd, lenNFKD);
689     }
690 }
691 
692 typedef struct DecompSingle {
693     uint32_t c;
694     Norm *norm;
695 } DecompSingle;
696 
697 /*
698  * apply this one character's decompositions (there is at least one!) to
699  * all previous characters' decompositions to decompose them further
700  */
701 static void
decompWithSingleFn(void * context,uint32_t code,Norm * norm)702 decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
703     uint32_t nfd[40], nfkd[40];
704     uint32_t *s32;
705     DecompSingle *me=(DecompSingle *)context;
706     uint32_t c, myC;
707     int32_t i, length;
708     uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
709     UBool changedNFD=FALSE, changedNFKD=FALSE;
710 
711     /* get the new character's data */
712     myC=me->c;
713     myLenNFD=me->norm->lenNFD;
714     myLenNFKD=me->norm->lenNFKD;
715     /* assume that myC has at least one decomposition */
716 
717     if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
718         /* apply NFD(myC) to norm->nfd */
719         s32=norm->nfd;
720         for(i=0; i<length; ++i) {
721             c=s32[i];
722             if(c==myC) {
723                 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
724                 lenNFD+=myLenNFD;
725                 changedNFD=TRUE;
726             } else {
727                 nfd[lenNFD++]=c;
728             }
729         }
730     }
731 
732     if((length=norm->lenNFKD)!=0) {
733         /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
734         s32=norm->nfkd;
735         for(i=0; i<length; ++i) {
736             c=s32[i];
737             if(c==myC) {
738                 if(myLenNFKD!=0) {
739                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
740                     lenNFKD+=myLenNFKD;
741                 } else /* assume myLenNFD!=0 */ {
742                     uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
743                     lenNFKD+=myLenNFD;
744                 }
745                 changedNFKD=TRUE;
746             } else {
747                 nfkd[lenNFKD++]=c;
748             }
749         }
750     } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
751         /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
752         s32=norm->nfd;
753         for(i=0; i<length; ++i) {
754             c=s32[i];
755             if(c==myC) {
756                 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
757                 lenNFKD+=myLenNFKD;
758                 changedNFKD=TRUE;
759             } else {
760                 nfkd[lenNFKD++]=c;
761             }
762         }
763     }
764 
765     /* set the new decompositions, forget the old ones */
766     if(changedNFD) {
767         if(lenNFD!=0) {
768             if(lenNFD>norm->lenNFD) {
769                 s32=utm_allocN(utf32Mem, lenNFD);
770             } else {
771                 s32=norm->nfd;
772             }
773             uprv_memcpy(s32, nfd, lenNFD*4);
774         } else {
775             s32=NULL;
776         }
777         norm->lenNFD=lenNFD;
778         norm->nfd=s32;
779     }
780     if(changedNFKD) {
781         if(lenNFKD!=0) {
782             if(lenNFKD>norm->lenNFKD) {
783                 s32=utm_allocN(utf32Mem, lenNFKD);
784             } else {
785                 s32=norm->nfkd;
786             }
787             uprv_memcpy(s32, nfkd, lenNFKD*4);
788         } else {
789             s32=NULL;
790         }
791         norm->lenNFKD=lenNFKD;
792         norm->nfkd=s32;
793     }
794 }
795 
796 /*
797  * process the data for one code point listed in UnicodeData;
798  * UnicodeData itself never maps a code point to both NFD and NFKD
799  */
800 extern void
storeNorm(uint32_t code,Norm * norm)801 storeNorm(uint32_t code, Norm *norm) {
802     DecompSingle decompSingle;
803     Norm *p;
804 
805     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
806         /* ignore compatibility decomposition */
807         norm->lenNFKD=0;
808     }
809 
810     /* copy existing derived normalization properties */
811     p=createNorm(code);
812     norm->qcFlags=p->qcFlags;
813     norm->combiningFlags=p->combiningFlags;
814     norm->fncIndex=p->fncIndex;
815 
816     /* process the decomposition if there is one here */
817     if((norm->lenNFD|norm->lenNFKD)!=0) {
818         /* decompose this one decomposition further, may generate two decompositions */
819         decompStoreNewNF(code, norm);
820 
821         /* has this code point been used in previous decompositions? */
822         if(HAVE_SEEN(code)) {
823             /* use this decomposition to decompose other decompositions further */
824             decompSingle.c=code;
825             decompSingle.norm=norm;
826             enumTrie(decompWithSingleFn, &decompSingle);
827         }
828     }
829 
830     /* store the data */
831     uprv_memcpy(p, norm, sizeof(Norm));
832 }
833 
834 extern void
setQCFlags(uint32_t code,uint8_t qcFlags)835 setQCFlags(uint32_t code, uint8_t qcFlags) {
836     if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
837         /* ignore compatibility decomposition: unset the KC/KD flags */
838         qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD);
839 
840         /* set the KC/KD flags to the same values as the C/D flags */
841         qcFlags|=qcFlags<<1;
842     }
843     if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
844         /* ignore composition data: unset the C/KC flags */
845         qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC);
846 
847         /* set the C/KC flags to the same values as the D/KD flags */
848         qcFlags|=qcFlags>>2;
849     }
850 
851     createNorm(code)->qcFlags|=qcFlags;
852 
853     /* adjust the minimum code point for quick check no/maybe */
854     if(code<0xffff) {
855         if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
856             indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
857         }
858         if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
859             indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
860         }
861         if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
862             indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
863         }
864         if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
865             indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
866         }
867     }
868 
869     if(qcFlags&_NORM_QC_NFD) {
870         uset_add(nfdQCNoSet, (UChar32)code);
871     }
872 }
873 
874 extern void
setCompositionExclusion(uint32_t code)875 setCompositionExclusion(uint32_t code) {
876     if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
877         createNorm(code)->combiningFlags|=0x80;
878     }
879 }
880 
881 static void
setHangulJamoSpecials()882 setHangulJamoSpecials() {
883     Norm *norm;
884     uint32_t c, hangul;
885 
886     /*
887      * Hangul syllables are algorithmically decomposed into Jamos,
888      * and Jamos are algorithmically composed into Hangul syllables.
889      * The quick check flags are parsed, except for Hangul.
890      */
891 
892     /* set Jamo L specials */
893     hangul=0xac00;
894     for(c=0x1100; c<=0x1112; ++c) {
895         norm=createNorm(c);
896         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
897         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
898             norm->combiningFlags=1;
899         }
900 
901         /* for each Jamo L create a set with its associated Hangul block */
902         norm->canonStart=uset_open(hangul, hangul+21*28-1);
903         hangul+=21*28;
904     }
905 
906     /* set Jamo V specials */
907     for(c=0x1161; c<=0x1175; ++c) {
908         norm=createNorm(c);
909         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
910         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
911             norm->combiningFlags=2;
912         }
913         norm->unsafeStart=TRUE;
914     }
915 
916     /* set Jamo T specials */
917     for(c=0x11a8; c<=0x11c2; ++c) {
918         norm=createNorm(c);
919         norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
920         if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
921             norm->combiningFlags=2;
922         }
923         norm->unsafeStart=TRUE;
924     }
925 
926     /* set Hangul specials, precompacted */
927     norm=allocNorm();
928     norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
929     if(DO_STORE(UGENNORM_STORE_COMPAT)) {
930         norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
931     } else {
932         norm->qcFlags=_NORM_QC_NFD;
933     }
934 
935     if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
936         fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
937         exit(U_BUFFER_OVERFLOW_ERROR);
938     }
939 }
940 
941 /*
942  * set FC-NFKC-Closure string
943  * s contains the closure string; s[0]==length, s[1..length] is the actual string
944  * may modify s[0]
945  */
946 U_CFUNC void
setFNC(uint32_t c,UChar * s)947 setFNC(uint32_t c, UChar *s) {
948     uint16_t *p;
949     int32_t length, i, count;
950     UChar first;
951 
952     if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) ||
953         DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) ||
954         DO_NOT_STORE(UGENNORM_STORE_AUX)
955     ) {
956         return;
957     }
958 
959     count=utm_countItems(extraMem);
960     length=s[0];
961     first=s[1];
962 
963     /* try to overlay single-unit strings with existing ones */
964     if(length==1 && first<0xff00) {
965         p=utm_getStart(extraMem);
966         for(i=1; i<count; ++i) {
967             if(first==p[i]) {
968                 break;
969             }
970         }
971     } else {
972         i=count;
973     }
974 
975     /* append the new string if it cannot be overlayed with an old one */
976     if(i==count) {
977         if(count>_NORM_AUX_MAX_FNC) {
978             fprintf(stderr, "gennorm error: too many FNC strings\n");
979             exit(U_INDEX_OUTOFBOUNDS_ERROR);
980         }
981 
982         /* prepend 0xffxx with xx==length */
983         s[0]=(uint16_t)(0xff00+length);
984         ++length;
985         p=(uint16_t *)utm_allocN(extraMem, length);
986         uprv_memcpy(p, s, length*2);
987 
988         /* update the top index in extraMem[0] */
989         count+=length;
990         ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
991     }
992 
993     /* store the index to the string */
994     createNorm(c)->fncIndex=i;
995 }
996 
997 /* build runtime structures ------------------------------------------------- */
998 
999 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
1000 static uint16_t
reorderString(uint32_t * s,int32_t length)1001 reorderString(uint32_t *s, int32_t length) {
1002     uint8_t ccs[40];
1003     uint32_t c;
1004     int32_t i, j;
1005     uint8_t cc, prevCC;
1006 
1007     if(length<=0) {
1008         return 0;
1009     }
1010 
1011     for(i=0; i<length; ++i) {
1012         /* get the i-th code point and its combining class */
1013         c=s[i];
1014         cc=getCCFromCP(c);
1015         if(cc!=0 && i!=0) {
1016             /* it is a combining mark, see if it needs to be moved back */
1017             j=i;
1018             do {
1019                 prevCC=ccs[j-1];
1020                 if(prevCC<=cc) {
1021                     break;  /* found the right place */
1022                 }
1023                 /* move the previous code point here and go back */
1024                 s[j]=s[j-1];
1025                 ccs[j]=prevCC;
1026             } while(--j!=0);
1027             s[j]=c;
1028             ccs[j]=cc;
1029         } else {
1030             /* just store the combining class */
1031             ccs[i]=cc;
1032         }
1033     }
1034 
1035     return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
1036 }
1037 
/*
 * Debugging aid, compiled out:
 * table of the combinations of NF*C quick check flags and combining flags
 * that occur in the data. The (also disabled) code in postParseFn() sets
 * entry (qcFlags&0x33)|((combiningFlags&3)<<2) and the (also disabled) code
 * in processData() prints the entries that were set.
 */
#if 0
static UBool combineAndQC[64]={ 0 };
#endif
1041 
1042 /*
1043  * canonically reorder the up to two decompositions
1044  * and store the leading and trailing combining classes accordingly
1045  *
1046  * also process canonical decompositions for canonical closure
1047  */
1048 static void
postParseFn(void * context,uint32_t code,Norm * norm)1049 postParseFn(void *context, uint32_t code, Norm *norm) {
1050     int32_t length;
1051 
1052     /* canonically order the NFD */
1053     length=norm->lenNFD;
1054     if(length>0) {
1055         norm->canonBothCCs=reorderString(norm->nfd, length);
1056     }
1057 
1058     /* canonically reorder the NFKD */
1059     length=norm->lenNFKD;
1060     if(length>0) {
1061         norm->compatBothCCs=reorderString(norm->nfkd, length);
1062     }
1063 
1064     /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
1065     if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
1066         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
1067     }
1068     if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
1069         fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
1070     }
1071 
1072     /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1073 #if 0
1074     combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
1075 #endif
1076 
1077     if(norm->combiningFlags&1) {
1078         if(norm->udataCC!=0) {
1079             /* illegal - data-derivable composition exclusion */
1080             fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
1081         }
1082     }
1083     if(norm->combiningFlags&2) {
1084         if((norm->qcFlags&0x11)==0) {
1085             fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
1086         }
1087 #if 0
1088         /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1089         if(norm->udataCC==0) {
1090             printf("U+%04lx combines backward but udataCC==0\n", (long)code);
1091         }
1092 #endif
1093     }
1094     if((norm->combiningFlags&3)==3 && beVerbose) {
1095         printf("U+%04lx combines both ways\n", (long)code);
1096     }
1097 
1098     /*
1099      * process canonical decompositions for canonical closure
1100      *
1101      * in each canonical decomposition:
1102      *   add the current character (code) to the set of canonical starters of its norm->nfd[0]
1103      *   set the "unsafe starter" flag for each norm->nfd[1..]
1104      */
1105     length=norm->lenNFD;
1106     if(length>0) {
1107         Norm *otherNorm;
1108         UChar32 c;
1109         int32_t i;
1110 
1111         /* nfd[0].canonStart.add(code) */
1112         c=norm->nfd[0];
1113         otherNorm=createNorm(c);
1114         if(otherNorm->canonStart==NULL) {
1115             otherNorm->canonStart=uset_open(code, code);
1116             if(otherNorm->canonStart==NULL) {
1117                 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
1118                 exit(U_MEMORY_ALLOCATION_ERROR);
1119             }
1120         } else {
1121             uset_add(otherNorm->canonStart, code);
1122             if(!uset_contains(otherNorm->canonStart, code)) {
1123                 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
1124                 exit(U_INTERNAL_PROGRAM_ERROR);
1125             }
1126         }
1127 
1128         /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1129         for(i=1; i<length; ++i) {
1130             createNorm(norm->nfd[i])->unsafeStart=TRUE;
1131         }
1132     }
1133 }
1134 
1135 static uint32_t
make32BitNorm(Norm * norm)1136 make32BitNorm(Norm *norm) {
1137     UChar extra[100];
1138     const Norm *other;
1139     uint32_t word;
1140     int32_t i, length, beforeZero=0, count, start;
1141 
1142     /*
1143      * Check for assumptions:
1144      *
1145      * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1146      * then the decomposition also begins with a true starter.
1147      */
1148     if(norm->udataCC==0) {
1149         /* this is a starter */
1150         if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
1151             /* a "true" NFC starter with a canonical decomposition */
1152             if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1153                 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
1154             ) {
1155                 fprintf(stderr,
1156                     "error: true NFC starter canonical decomposition[%u] does not begin\n"
1157                     "    with a true NFC starter: U+%04lx U+%04lx%s\n",
1158                     norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1159                     norm->lenNFD<=2 ? "" : " ...");
1160                 exit(U_INVALID_TABLE_FILE);
1161             }
1162         }
1163 
1164         if((norm->qcFlags&_NORM_QC_NFKC)==0) {
1165             if(norm->lenNFKD>0) {
1166                 /* a "true" NFKC starter with a compatibility decomposition */
1167                 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
1168                     ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
1169                 ) {
1170                     fprintf(stderr,
1171                         "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1172                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1173                         norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
1174                         norm->lenNFKD<=2 ? "" : " ...");
1175                     exit(U_INVALID_TABLE_FILE);
1176                 }
1177             } else if(norm->lenNFD>0) {
1178                 /* a "true" NFKC starter with only a canonical decomposition */
1179                 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1180                     ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
1181                 ) {
1182                     fprintf(stderr,
1183                         "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1184                         "    with a true NFKC starter: U+%04lx U+%04lx%s\n",
1185                         norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1186                         norm->lenNFD<=2 ? "" : " ...");
1187                     exit(U_INVALID_TABLE_FILE);
1188                 }
1189             }
1190         }
1191     }
1192 
1193     /* reset the 32-bit word and set the quick check flags */
1194     word=norm->qcFlags;
1195 
1196     /* set the UnicodeData combining class */
1197     word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
1198 
1199     /* set the combining flag and index */
1200     if(norm->combiningFlags&3) {
1201         word|=(uint32_t)(norm->combiningFlags&3)<<6;
1202     }
1203 
1204     /* set the combining index value into the extra data */
1205     /* 0xffff: no combining index; 0..0x7fff: combining index */
1206     if(norm->combiningIndex!=0xffff) {
1207         extra[0]=norm->combiningIndex;
1208         beforeZero=1;
1209     }
1210 
1211     count=beforeZero;
1212 
1213     /* write the decompositions */
1214     if((norm->lenNFD|norm->lenNFKD)!=0) {
1215         extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
1216 
1217         length=norm->lenNFD;
1218         if(length>0) {
1219             if(norm->canonBothCCs!=0) {
1220                 extra[beforeZero]|=0x80;
1221                 extra[count++]=norm->canonBothCCs;
1222             }
1223             start=count;
1224             for(i=0; i<length; ++i) {
1225                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
1226             }
1227             extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
1228         }
1229 
1230         length=norm->lenNFKD;
1231         if(length>0) {
1232             if(norm->compatBothCCs!=0) {
1233                 extra[beforeZero]|=0x8000;
1234                 extra[count++]=norm->compatBothCCs;
1235             }
1236             start=count;
1237             for(i=0; i<length; ++i) {
1238                 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
1239             }
1240             extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
1241         }
1242     }
1243 
1244     /* allocate and copy the extra data */
1245     if(count!=0) {
1246         UChar *p;
1247 
1248         if(norm->specialTag!=0) {
1249             fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
1250             exit(U_ILLEGAL_ARGUMENT_ERROR);
1251         }
1252 
1253         p=(UChar *)utm_allocN(extraMem, count);
1254         uprv_memcpy(p, extra, count*2);
1255 
1256         /* set the extra index, offset by beforeZero */
1257         word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
1258     } else if(norm->specialTag!=0) {
1259         /* set a special tag instead of an extra index */
1260         word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
1261     }
1262 
1263     return word;
1264 }
1265 
1266 /* turn all Norm structs into corresponding 32-bit norm values */
1267 static void
makeAll32()1268 makeAll32() {
1269     uint32_t *pNormData;
1270     uint32_t n;
1271     int32_t i, normLength, count;
1272 
1273     count=(int32_t)utm_countItems(normMem);
1274     for(i=0; i<count; ++i) {
1275         norms[i].value32=make32BitNorm(norms+i);
1276     }
1277 
1278     pNormData=utrie_getData(norm32Trie, &normLength);
1279 
1280     count=0; /* count is now just used for debugging */
1281     for(i=0; i<normLength; ++i) {
1282         n=pNormData[i];
1283         if(0!=(pNormData[i]=norms[n].value32)) {
1284             ++count;
1285         }
1286     }
1287 }
1288 
1289 /*
1290  * extract all Norm.canonBothCCs into the FCD table
1291  * set 32-bit values to use the common fold and compact functions
1292  */
1293 static void
makeFCD()1294 makeFCD() {
1295     uint32_t *pFCDData;
1296     uint32_t n;
1297     int32_t i, count, fcdLength;
1298     uint16_t bothCCs;
1299 
1300     count=utm_countItems(normMem);
1301     for(i=0; i<count; ++i) {
1302         bothCCs=norms[i].canonBothCCs;
1303         if(bothCCs==0) {
1304             /* if there are no decomposition cc's then use the udataCC twice */
1305             bothCCs=norms[i].udataCC;
1306             bothCCs|=bothCCs<<8;
1307         }
1308         norms[i].value32=bothCCs;
1309     }
1310 
1311     pFCDData=utrie_getData(fcdTrie, &fcdLength);
1312 
1313     for(i=0; i<fcdLength; ++i) {
1314         n=pFCDData[i];
1315         pFCDData[i]=norms[n].value32;
1316     }
1317 }
1318 
1319 /**
1320  * If the given set contains exactly one character, then return it.
1321  * Otherwise return -1.
1322  */
1323 static int32_t
usetContainsOne(const USet * set)1324 usetContainsOne(const USet* set) {
1325     if(uset_getItemCount(set)==1) {
1326         /* there is a single item (a single range) */
1327         UChar32 start, end;
1328         UErrorCode ec=U_ZERO_ERROR;
1329         int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
1330         if (len==0 && start==end) { /* a range (len==0) with a single code point */
1331             return start;
1332         }
1333     }
1334     return -1;
1335 }
1336 
1337 static void
makeCanonSetFn(void * context,uint32_t code,Norm * norm)1338 makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
1339     if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
1340         uint16_t *table;
1341         int32_t c, tableLength;
1342         UErrorCode errorCode=U_ZERO_ERROR;
1343 
1344         /* does the set contain exactly one code point? */
1345         c=usetContainsOne(norm->canonStart);
1346 
1347         /* add an entry to the BMP or supplementary search table */
1348         if(code<=0xffff) {
1349             table=canonStartSets+_NORM_MAX_CANON_SETS;
1350             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1351 
1352             table[tableLength++]=(uint16_t)code;
1353 
1354             if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
1355                 /* single-code point BMP result for BMP code point */
1356                 table[tableLength++]=(uint16_t)c;
1357             } else {
1358                 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
1359                 c=-1;
1360             }
1361             canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
1362         } else {
1363             table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
1364             tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1365 
1366             table[tableLength++]=(uint16_t)(code>>16);
1367             table[tableLength++]=(uint16_t)code;
1368 
1369             if(c>=0) {
1370                 /* single-code point result for supplementary code point */
1371                 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
1372                 table[tableLength++]=(uint16_t)c;
1373             } else {
1374                 table[tableLength++]=(uint16_t)canonStartSetsTop;
1375             }
1376             canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
1377         }
1378 
1379         if(c<0) {
1380             /* write a USerializedSet */
1381             ++canonSetsCount;
1382             canonStartSetsTop+=
1383                     uset_serialize(norm->canonStart,
1384                             canonStartSets+canonStartSetsTop,
1385                             _NORM_MAX_CANON_SETS-canonStartSetsTop,
1386                             &errorCode);
1387         }
1388         canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1389 
1390         if(U_FAILURE(errorCode)) {
1391             fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
1392             exit(errorCode);
1393         }
1394         if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
1395             fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
1396             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1397         }
1398     }
1399 }
1400 
1401 /* for getSkippableFlags ---------------------------------------------------- */
1402 
1403 /* combine the lead and trail code points; return <0 if they do not combine */
1404 static int32_t
combine(uint32_t lead,uint32_t trail)1405 combine(uint32_t lead, uint32_t trail) {
1406     CombiningTriple *triples;
1407     uint32_t i, count;
1408 
1409     /* search for all triples with c as lead code point */
1410     triples=utm_getStart(combiningTriplesMem);
1411     count=utm_countItems(combiningTriplesMem);
1412 
1413     /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1414     for(i=0; i<count && lead!=triples[i].lead; ++i) {}
1415 
1416     /* check each triple for this code point */
1417     for(; i<count && lead==triples[i].lead; ++i) {
1418         if(trail==triples[i].trail) {
1419             return (int32_t)triples[i].combined;
1420         }
1421     }
1422 
1423     return -1;
1424 }
1425 
1426 /*
1427  * Starting from the canonical decomposition s[0..length[ of a single code point,
1428  * is the code point c consumed in an NFC/FCC recomposition?
1429  *
1430  * No need to handle discontiguous composition because that would not consume some
1431  * intermediate character, so would not compose back to the original character.
1432  * See comments in canChangeWithFollowing().
1433  *
1434  * No need to compose beyond where c canonically orders because if it is consumed
1435  * then the result differs from the original anyway.
1436  *
1437  * Possible optimization:
1438  * - Verify that there are no cases of the same combining mark stacking twice.
1439  * - return FALSE right away if c inserts after a copy of itself
1440  *   without attempting to recompose; will happen because each mark in
1441  *   the decomposition will be enumerated and passed in as c.
1442  *   More complicated and fragile though than it is already.
1443  *
1444  * markus 2002nov04
1445  */
1446 static UBool
doesComposeConsume(const uint32_t * s,int32_t length,uint32_t c,uint8_t cc)1447 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
1448     int32_t starter, i;
1449 
1450     /* ignore trailing characters where cc<prevCC */
1451     while(length>1 && cc<getCCFromCP(s[length-1])) {
1452         --length;
1453     }
1454 
1455     /* start consuming/combining from the beginning */
1456     starter=(int32_t)s[0];
1457     for(i=1; i<length; ++i) {
1458         starter=combine((uint32_t)starter, s[i]);
1459         if(starter<0) {
1460             fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1461                 (int)s[0], (int)s[1], (int)length, (int)c, cc);
1462             exit(U_INTERNAL_PROGRAM_ERROR);
1463         }
1464     }
1465 
1466     /* try to combine/consume c, return TRUE if it is consumed */
1467     return combine((uint32_t)starter, c)>=0;
1468 }
1469 
1470 /* does the starter s[0] combine forward with another char that is below trailCC? */
1471 static UBool
canChangeWithFollowing(const uint32_t * s,int32_t length,uint8_t trailCC)1472 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
1473     if(trailCC<=1) {
1474         /* no character will combine ahead of the trailing char of the decomposition */
1475         return FALSE;
1476     }
1477 
1478     /*
1479      * We are only checking skippable condition (f).
1480      * Therefore, the original character does not have quick check flag NFC_NO (c),
1481      * i.e., the decomposition recomposes completely back into the original code point.
1482      * So s[0] must be a true starter with cc==0 and
1483      * combining with following code points.
1484      *
1485      * Similarly, length==1 is not possible because that would be a singleton
1486      * decomposition which is marked with NFC_NO and does not pass (c).
1487      *
1488      * Only a character with cc<trailCC can change the composition.
1489      * Reason: A char with cc>=trailCC would order after decomposition s[],
1490      * composition would consume all of the decomposition, and here we know that
1491      * the original char passed check d), i.e., it does not combine forward,
1492      * therefore does not combine with anything after the decomposition is consumed.
1493      *
1494      * Now see if there is a character that
1495      * 1. combines backward
1496      * 2. has cc<trailCC
1497      * 3. is consumed in recomposition
1498      *
1499      * length==2 is simple:
1500      *
1501      * Characters that fulfill these conditions are exactly the ones that combine directly
1502      * with the starter c==s[0] because there is no intervening character after
1503      * reordering.
1504      * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1505      * and see if one has cc<trailCC (passes 2.).
1506      *
1507      * length>2 is a little harder:
1508      *
1509      * Since we will get different starters during recomposition, we need to
1510      * enumerate each backward-combining character (1.)
1511      * with cc<trailCC (2.) and
1512      * see if it gets consumed in recomposition. (3.)
1513      * No need to enumerate both-ways combining characters because they must have cc==0.
1514      */
1515     if(length==2) {
1516         /* enumerate all chars that combine with this one and check their cc */
1517         CombiningTriple *triples;
1518         uint32_t c, i, count;
1519         uint8_t cc;
1520 
1521         /* search for all triples with c as lead code point */
1522         triples=utm_getStart(combiningTriplesMem);
1523         count=utm_countItems(combiningTriplesMem);
1524         c=s[0];
1525 
1526         /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1527         for(i=0; i<count && c!=triples[i].lead; ++i) {}
1528 
1529         /* check each triple for this code point */
1530         for(; i<count && c==triples[i].lead; ++i) {
1531             cc=getCCFromCP(triples[i].trail);
1532             if(cc>0 && cc<trailCC) {
1533                 /* this trail code point combines with c and has cc<trailCC */
1534                 return TRUE;
1535             }
1536         }
1537     } else {
1538         /* enumerate all chars that combine backward */
1539         uint32_t c2;
1540         uint16_t i;
1541         uint8_t cc;
1542 
1543         for(i=combineBothTop; i<combineBackTop; ++i) {
1544             c2=combiningCPs[i]&0xffffff;
1545             cc=getCCFromCP(c2);
1546             /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1547             if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
1548                 return TRUE;
1549             }
1550         }
1551     }
1552 
1553     /* this decomposition is not modified by any appended character */
1554     return FALSE;
1555 }
1556 
1557 /* see unormimp.h for details on NF*C Skippable flags */
1558 static uint32_t
getSkippableFlags(const Norm * norm)1559 getSkippableFlags(const Norm *norm) {
1560     /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1561 
1562     /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1563     if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
1564         return 0;
1565     }
1566 
1567     /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1568 
1569     /*
1570      * Note:
1571      * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1572      *
1573      * This means that (a)..(e) must always be derived from the runtime norm32 value,
1574      * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1575      * the form is NF*C and there is a canonical decomposition (NFD_NO).
1576      *
1577      * (a) unassigned code points get "not skippable"==false because they
1578      * don't have a Norm struct so they won't get here
1579      */
1580 
1581     /* (b) not skippable if cc!=0 */
1582     if(norm->udataCC!=0) {
1583         return 0; /* non-zero flag for (f) only */
1584     }
1585 
1586     /*
1587      * not NFC_Skippable if
1588      * (c) quick check flag == NO  or
1589      * (d) combines forward  or
1590      * (e) combines back or
1591      * (f) can change if another character is added
1592      *
1593      * for (f):
1594      * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1595      *           check its composition list,
1596      *           see if any of the second code points in the list
1597      *           has cc less than the trailCC of the decomposition.
1598      *
1599      * For FCC: Test at runtime if the decomposition has a trailCC>1
1600      *          -> there are characters with cc==1, they would order before the trail char
1601      *          and prevent contiguous combination with the trail char.
1602      */
1603     if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
1604         (norm->combiningFlags&3)!=0) {
1605         return 0; /* non-zero flag for (f) only */
1606     }
1607     if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
1608         return _NORM_AUX_NFC_SKIP_F_MASK;
1609     }
1610 
1611     return 0; /* skippable */
1612 }
1613 
1614 static void
makeAux()1615 makeAux() {
1616     Norm *norm;
1617     uint32_t *pData;
1618     int32_t i, length;
1619 
1620     pData=utrie_getData(auxTrie, &length);
1621 
1622     for(i=0; i<length; ++i) {
1623         norm=norms+pData[i];
1624         /*
1625          * 16-bit auxiliary normalization properties
1626          * see unormimp.h
1627          */
1628         pData[i]=
1629             ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
1630             (uint32_t)norm->fncIndex;
1631 
1632         if(norm->unsafeStart || norm->udataCC!=0) {
1633             pData[i]|=_NORM_AUX_UNSAFE_MASK;
1634         }
1635 
1636         pData[i]|=getSkippableFlags(norm);
1637     }
1638 }
1639 
1640 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1641 static uint32_t U_CALLCONV
getFoldedNormValue(UNewTrie * trie,UChar32 start,int32_t offset)1642 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1643     uint32_t value, leadNorm32=0;
1644     UChar32 limit;
1645     UBool inBlockZero;
1646 
1647     limit=start+0x400;
1648     while(start<limit) {
1649         value=utrie_get32(trie, start, &inBlockZero);
1650         if(inBlockZero) {
1651             start+=UTRIE_DATA_BLOCK_LENGTH;
1652         } else {
1653             if(value!=0) {
1654                 leadNorm32|=value;
1655             }
1656             ++start;
1657         }
1658     }
1659 
1660     /* turn multi-bit fields into the worst-case value */
1661     if(leadNorm32&_NORM_CC_MASK) {
1662         leadNorm32|=_NORM_CC_MASK;
1663     }
1664 
1665     /* clean up unnecessarily ored bit fields */
1666     leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
1667 
1668     if(leadNorm32==0) {
1669         /* nothing to do (only composition exclusions?) */
1670         return 0;
1671     }
1672 
1673     /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1674     leadNorm32|=(
1675         (uint32_t)_NORM_EXTRA_INDEX_TOP+
1676         (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
1677     )<<_NORM_EXTRA_SHIFT;
1678 
1679     return leadNorm32;
1680 }
1681 
1682 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
1683 
1684 /*
1685  * folding value for auxiliary data:
1686  * store the non-zero offset in bits 9..0 (FNC bits)
1687  * if there is any non-0 entry;
1688  * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1689  */
1690 static uint32_t U_CALLCONV
getFoldedAuxValue(UNewTrie * trie,UChar32 start,int32_t offset)1691 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1692     uint32_t value, oredValues;
1693     UChar32 limit;
1694     UBool inBlockZero;
1695 
1696     oredValues=0;
1697     limit=start+0x400;
1698     while(start<limit) {
1699         value=utrie_get32(trie, start, &inBlockZero);
1700         if(inBlockZero) {
1701             start+=UTRIE_DATA_BLOCK_LENGTH;
1702         } else {
1703             oredValues|=value;
1704             ++start;
1705         }
1706     }
1707 
1708     if(oredValues!=0) {
1709         /* move the 10 significant offset bits into bits 9..0 */
1710         offset>>=UTRIE_SURROGATE_BLOCK_BITS;
1711         if(offset>_NORM_AUX_FNC_MASK) {
1712             fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
1713             exit(U_INDEX_OUTOFBOUNDS_ERROR);
1714         }
1715         return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
1716     } else {
1717         return 0;
1718     }
1719 }
1720 
1721 extern void
processData()1722 processData() {
1723 #if 0
1724     uint16_t i;
1725 #endif
1726 
1727     processCombining();
1728 
1729     /* canonically reorder decompositions and assign combining classes for decompositions */
1730     enumTrie(postParseFn, NULL);
1731 
1732 #if 0
1733     for(i=1; i<64; ++i) {
1734         if(combineAndQC[i]) {
1735             printf("combiningFlags==0x%02x  qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
1736         }
1737     }
1738 #endif
1739 
1740     /* add hangul/jamo specials */
1741     setHangulJamoSpecials();
1742 
1743     /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
1744     canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1745 
1746     /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1747     if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
1748         enumTrie(makeCanonSetFn, NULL);
1749     }
1750 
1751     /* clone the normalization builder trie to make the final data tries */
1752     if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
1753         NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
1754         NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
1755     ) {
1756         fprintf(stderr, "error: unable to clone the normalization trie\n");
1757         exit(U_MEMORY_ALLOCATION_ERROR);
1758     }
1759 
1760     /* --- finalize data for quick checks & normalization --- */
1761 
1762     /* turn the Norm structs (stage2, norms) into 32-bit data words */
1763     makeAll32();
1764 
1765     /* --- finalize data for FCD checks --- */
1766 
1767     /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1768     makeFCD();
1769 
1770     /* --- finalize auxiliary normalization data --- */
1771     makeAux();
1772 
1773     if(beVerbose) {
1774 #if 0
1775         printf("number of stage 2 entries: %ld\n", stage2Mem->index);
1776         printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
1777 #endif
1778         printf("combining CPs tops: fwd %u  both %u  back %u\n", combineFwdTop, combineBothTop, combineBackTop);
1779         printf("combining table count: %u\n", combiningTableTop);
1780     }
1781 }
1782 
1783 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1784 
1785 extern void
generateData(const char * dataDir,UBool csource)1786 generateData(const char *dataDir, UBool csource) {
1787     static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
1788 
1789     UNewDataMemory *pData;
1790     UErrorCode errorCode=U_ZERO_ERROR;
1791     int32_t size, dataLength;
1792 
1793 #if UCONFIG_NO_NORMALIZATION
1794 
1795     size=0;
1796 
1797 #else
1798 
1799     U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
1800     U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1801     USet *set;
1802     int32_t normTrieSize, fcdTrieSize, auxTrieSize;
1803 
1804     normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
1805     if(U_FAILURE(errorCode)) {
1806         fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
1807         exit(errorCode);
1808     }
1809 
1810     if(DO_STORE(UGENNORM_STORE_FCD)) {
1811         fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
1812         if(U_FAILURE(errorCode)) {
1813             fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
1814             exit(errorCode);
1815         }
1816     } else {
1817         fcdTrieSize=0;
1818     }
1819 
1820     if(DO_STORE(UGENNORM_STORE_AUX)) {
1821         auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
1822         if(U_FAILURE(errorCode)) {
1823             fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
1824             exit(errorCode);
1825         }
1826     } else {
1827         auxTrieSize=0;
1828     }
1829 
1830     /* move the parts of canonStartSets[] together into a contiguous block */
1831     if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
1832         canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
1833     ) {
1834         uprv_memmove(canonStartSets+canonStartSetsTop,
1835                      canonStartSets+_NORM_MAX_CANON_SETS,
1836                      canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
1837     }
1838     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1839 
1840     if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
1841         canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
1842     ) {
1843         uprv_memmove(canonStartSets+canonStartSetsTop,
1844                      canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
1845                      canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
1846     }
1847     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1848 
1849     /* create the normalization exclusion sets */
1850     /*
1851      * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1852      * but we cannot use NFD_QC from the pattern because that would require
1853      * unorm.icu which we are just going to generate.
1854      * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1855      * with that.
1856      */
1857     U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
1858     U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1859 
1860     canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
1861     set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
1862     if(U_FAILURE(errorCode)) {
1863         fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1864         exit(errorCode);
1865     }
1866     uset_retainAll(set, nfdQCNoSet);
1867     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1868         uset_clear(set);
1869     }
1870     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1871     if(U_FAILURE(errorCode)) {
1872         fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1873         exit(errorCode);
1874     }
1875     uset_close(set);
1876 
1877     canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
1878     set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
1879     if(U_FAILURE(errorCode)) {
1880         fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1881         exit(errorCode);
1882     }
1883     if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1884         uset_clear(set);
1885     }
1886     canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1887     if(U_FAILURE(errorCode)) {
1888         fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1889         exit(errorCode);
1890     }
1891     uset_close(set);
1892 
1893     canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
1894 
1895     /* make sure that the FCD trie is 4-aligned */
1896     if((utm_countItems(extraMem)+combiningTableTop)&1) {
1897         combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
1898     }
1899 
1900     /* pad canonStartSets to 4-alignment, too */
1901     if(canonStartSetsTop&1) {
1902         canonStartSets[canonStartSetsTop++]=0x1235;
1903     }
1904 
1905     size=
1906         _NORM_INDEX_TOP*4+
1907         normTrieSize+
1908         utm_countItems(extraMem)*2+
1909         combiningTableTop*2+
1910         fcdTrieSize+
1911         auxTrieSize+
1912         canonStartSetsTop*2;
1913 
1914     if(beVerbose) {
1915         printf("size of normalization trie              %5u bytes\n", (int)normTrieSize);
1916         printf("size of 16-bit extra memory             %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
1917         printf("  of that: FC_NFKC_Closure size         %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
1918         printf("size of combining table                 %5u uint16_t\n", combiningTableTop);
1919         printf("size of FCD trie                        %5u bytes\n", (int)fcdTrieSize);
1920         printf("size of auxiliary trie                  %5u bytes\n", (int)auxTrieSize);
1921         printf("size of canonStartSets[]                %5u uint16_t\n", (int)canonStartSetsTop);
1922         printf("  number of indexes                     %5u uint16_t\n", _NORM_SET_INDEX_TOP);
1923         printf("  size of sets                          %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
1924         printf("  number of sets                        %5d\n", (int)canonSetsCount);
1925         printf("  size of BMP search table              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
1926         printf("  size of supplementary search table    %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
1927         printf("  length of exclusion sets              %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
1928         printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
1929     }
1930 
1931     indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
1932     indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
1933 
1934     indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
1935     indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
1936     indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
1937     indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
1938 
1939     /* the quick check minimum code points are already set */
1940 
1941     indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
1942     indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
1943     indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
1944 
1945 #endif
1946 
1947     if(csource) {
1948 #if UCONFIG_NO_NORMALIZATION
1949     /* no csource for dummy mode..? */
1950     fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
1951     exit(1);
1952 #else
1953         /* write .c file for hardcoded data */
1954         UTrie normTrie2={ NULL }, fcdTrie2={ NULL }, auxTrie2={ NULL };
1955         FILE *f;
1956 
1957         utrie_unserialize(&normTrie2, normTrieBlock, normTrieSize, &errorCode);
1958         if(fcdTrieSize>0) {
1959             utrie_unserialize(&fcdTrie2, fcdTrieBlock, fcdTrieSize, &errorCode);
1960         }
1961         if(auxTrieSize>0) {
1962             utrie_unserialize(&auxTrie2, auxTrieBlock, auxTrieSize, &errorCode);
1963         }
1964         if(U_FAILURE(errorCode)) {
1965             fprintf(
1966                 stderr,
1967                 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
1968                 u_errorName(errorCode));
1969             exit(errorCode);
1970         }
1971 
1972         f=usrc_create(dataDir, "unorm_props_data.c");
1973         if(f!=NULL) {
1974             usrc_writeArray(f,
1975                 "static const UVersionInfo formatVersion={ ",
1976                 dataInfo.formatVersion, 8, 4,
1977                 " };\n\n");
1978             usrc_writeArray(f,
1979                 "static const UVersionInfo dataVersion={ ",
1980                 dataInfo.dataVersion, 8, 4,
1981                 " };\n\n");
1982             usrc_writeArray(f,
1983                 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
1984                 indexes, 32, _NORM_INDEX_TOP,
1985                 "\n};\n\n");
1986             usrc_writeUTrieArrays(f,
1987                 "static const uint16_t normTrie_index[%ld]={\n",
1988                 "static const uint32_t normTrie_data32[%ld]={\n",
1989                 &normTrie2,
1990                 "\n};\n\n");
1991             usrc_writeUTrieStruct(f,
1992                 "static const UTrie normTrie={\n",
1993                 &normTrie2, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
1994                 "};\n\n");
1995             usrc_writeArray(f,
1996                 "static const uint16_t extraData[%ld]={\n",
1997                 utm_getStart(extraMem), 16, utm_countItems(extraMem),
1998                 "\n};\n\n");
1999             usrc_writeArray(f,
2000                 "static const uint16_t combiningTable[%ld]={\n",
2001                 combiningTable, 16, combiningTableTop,
2002                 "\n};\n\n");
2003             if(fcdTrieSize>0) {
2004                 usrc_writeUTrieArrays(f,
2005                     "static const uint16_t fcdTrie_index[%ld]={\n", NULL,
2006                     &fcdTrie2,
2007                     "\n};\n\n");
2008                 usrc_writeUTrieStruct(f,
2009                     "static const UTrie fcdTrie={\n",
2010                     &fcdTrie2, "fcdTrie_index", NULL, NULL,
2011                     "};\n\n");
2012             } else {
2013                 fputs( "static const UTrie fcdTrie={ NULL };\n\n", f);
2014             }
2015             if(auxTrieSize>0) {
2016                 usrc_writeUTrieArrays(f,
2017                     "static const uint16_t auxTrie_index[%ld]={\n", NULL,
2018                     &auxTrie2,
2019                     "\n};\n\n");
2020                 usrc_writeUTrieStruct(f,
2021                     "static const UTrie auxTrie={\n",
2022                     &auxTrie2, "auxTrie_index", NULL, "getFoldingAuxOffset",
2023                     "};\n\n");
2024             } else {
2025                 fputs( "static const UTrie auxTrie={ NULL };\n\n", f);
2026             }
2027             usrc_writeArray(f,
2028                 "static const uint16_t canonStartSets[%ld]={\n",
2029                 canonStartSets, 16, canonStartSetsTop,
2030                 "\n};\n\n");
2031             fclose(f);
2032         }
2033 #endif
2034     } else {
2035         /* write the data */
2036         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
2037                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
2038         if(U_FAILURE(errorCode)) {
2039             fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
2040             exit(errorCode);
2041         }
2042 
2043 #if !UCONFIG_NO_NORMALIZATION
2044 
2045         udata_writeBlock(pData, indexes, sizeof(indexes));
2046         udata_writeBlock(pData, normTrieBlock, normTrieSize);
2047         udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
2048         udata_writeBlock(pData, combiningTable, combiningTableTop*2);
2049         udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
2050         udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
2051         udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
2052 
2053 #endif
2054 
2055         /* finish up */
2056         dataLength=udata_finish(pData, &errorCode);
2057         if(U_FAILURE(errorCode)) {
2058             fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
2059             exit(errorCode);
2060         }
2061 
2062         if(dataLength!=size) {
2063             fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
2064                 (long)dataLength, (long)size);
2065             exit(U_INTERNAL_PROGRAM_ERROR);
2066         }
2067     }
2068 }
2069 
2070 #if !UCONFIG_NO_NORMALIZATION
2071 
2072 extern void
cleanUpData(void)2073 cleanUpData(void) {
2074     int32_t i, count;
2075 
2076     count=utm_countItems(normMem);
2077     for(i=0; i<count; ++i) {
2078         uset_close(norms[i].canonStart);
2079     }
2080 
2081     utm_close(normMem);
2082     utm_close(utf32Mem);
2083     utm_close(extraMem);
2084     utm_close(combiningTriplesMem);
2085     utrie_close(normTrie);
2086     utrie_close(norm32Trie);
2087     utrie_close(fcdTrie);
2088     utrie_close(auxTrie);
2089 
2090     uset_close(nfdQCNoSet);
2091 
2092     uprv_free(normTrie);
2093     uprv_free(norm32Trie);
2094     uprv_free(fcdTrie);
2095     uprv_free(auxTrie);
2096 }
2097 
2098 #endif /* #if !UCONFIG_NO_NORMALIZATION */
2099 
2100 /*
2101  * Hey, Emacs, please set the following:
2102  *
2103  * Local Variables:
2104  * indent-tabs-mode: nil
2105  * End:
2106  *
2107  */
2108