1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: store.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001may25
14 * created by: Markus W. Scherer
15 *
16 * Store Unicode normalization data in a memory-mappable file.
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include "unicode/utypes.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "filestrm.h"
27 #include "unicode/udata.h"
28 #include "utrie.h"
29 #include "unicode/uset.h"
30 #include "toolutil.h"
31 #include "unewdata.h"
32 #include "writesrc.h"
33 #include "unormimp.h"
34 #include "gennorm.h"
35
/* debugging switch; NOTE(review): its uses are not visible in this part of the file */
#define DO_DEBUG_OUT 0

/*
 * Number of elements of a true array (not of a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as one operand
 * wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
39
40 /*
41 * The new implementation of the normalization code loads its data from
42 * unorm.icu, which is generated with this gennorm tool.
43 * The format of that file is described in unormimp.h .
44 */
45
46 /* file data ---------------------------------------------------------------- */
47
48 #if UCONFIG_NO_NORMALIZATION
49
50 /* dummy UDataInfo cf. udata.h */
51 static UDataInfo dataInfo = {
52 sizeof(UDataInfo),
53 0,
54
55 U_IS_BIG_ENDIAN,
56 U_CHARSET_FAMILY,
57 U_SIZEOF_UCHAR,
58 0,
59
60 { 0, 0, 0, 0 }, /* dummy dataFormat */
61 { 0, 0, 0, 0 }, /* dummy formatVersion */
62 { 0, 0, 0, 0 } /* dummy dataVersion */
63 };
64
65 #else
66
67 /* UDataInfo cf. udata.h */
68 static UDataInfo dataInfo={
69 sizeof(UDataInfo),
70 0,
71
72 U_IS_BIG_ENDIAN,
73 U_CHARSET_FAMILY,
74 U_SIZEOF_UCHAR,
75 0,
76
77 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
78 { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
79 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
80 };
81
82 extern void
setUnicodeVersion(const char * v)83 setUnicodeVersion(const char *v) {
84 UVersionInfo version;
85 u_versionFromString(version, v);
86 uprv_memcpy(dataInfo.dataVersion, version, 4);
87 }
88
89 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
90
91 /* builder data ------------------------------------------------------------- */
92
93 /* modularization flags, see gennorm.h (default to "store everything") */
94 uint32_t gStoreFlags=0xffffffff;
95
96 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
97
98 static UNewTrie
99 *normTrie,
100 *norm32Trie,
101 *fcdTrie,
102 *auxTrie;
103
104 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
105
106 static Norm *norms;
107
108 /*
109 * set a flag for each code point that was seen in decompositions -
110 * avoid to decompose ones that have not been used before
111 */
112 static uint32_t haveSeenFlags[256];
113
114 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
115 static USet *nfdQCNoSet;
116
117 /* see addCombiningCP() for details */
118 static uint32_t combiningCPs[2000];
119
120 /*
121 * after processCombining() this contains for each code point in combiningCPs[]
122 * the runtime combining index
123 */
124 static uint16_t combiningIndexes[2000];
125
126 /* section limits for combiningCPs[], see addCombiningCP() */
127 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
128
/**
 * A triple of code points, stored in combiningTriplesMem.
 * The lead and trail code points combine into the combined one,
 * i.e., there is a canonical decomposition of combined-> <lead, trail>.
 *
 * Before processCombining() is called, leadIndex and trailIndex are 0.
 * processCombining() sets them to the indexes of the lead and trail
 * code points in the combiningCPs[] array and then sorts the triples
 * by leadIndex, then trailIndex.
 * The triples are not sorted by code points.
 */
typedef struct CombiningTriple {
    uint16_t leadIndex;     /* index of lead in combiningCPs[] */
    uint16_t trailIndex;    /* index of trail in combiningCPs[] */
    uint32_t lead;
    uint32_t trail;
    uint32_t combined;
} CombiningTriple;
144
145 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
146 static uint16_t combiningTable[0x8000];
147 static uint16_t combiningTableTop=0;
148
149 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
150 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
151 +10000]; /* +10000 for exclusion sets */
152 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
153 static int32_t canonSetsCount=0;
154
155 /* allocate and initialize a Norm unit */
156 static Norm *
allocNorm()157 allocNorm() {
158 /* allocate Norm */
159 Norm *p=(Norm *)utm_alloc(normMem);
160 /*
161 * The combiningIndex must not be initialized to 0 because 0 is the
162 * combiningIndex of the first forward-combining character.
163 */
164 p->combiningIndex=0xffff;
165 return p;
166 }
167
168 extern void
init()169 init() {
170 uint16_t *p16;
171
172 normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
173 uprv_memset(normTrie, 0, sizeof(UNewTrie));
174 norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
175 uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
176 fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
177 uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
178 auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
179 uprv_memset(auxTrie, 0, sizeof(UNewTrie));
180
181 /* initialize the two tries */
182 if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
183 fprintf(stderr, "error: failed to initialize tries\n");
184 exit(U_MEMORY_ALLOCATION_ERROR);
185 }
186
187 /* allocate Norm structures and reset the first one */
188 normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
189 norms=allocNorm();
190
191 /* allocate UTF-32 string memory */
192 utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
193
194 /* reset all "have seen" flags */
195 uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
196
197 /* open an empty set */
198 nfdQCNoSet=uset_open(1, 0);
199
200 /* allocate extra data memory for UTF-16 decomposition strings and other values */
201 extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
202 /* initialize the extraMem counter for the top of FNC strings */
203 p16=(uint16_t *)utm_alloc(extraMem);
204 *p16=1;
205
206 /* allocate temporary memory for combining triples */
207 combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
208
209 /* set the minimum code points for no/maybe quick check values to the end of the BMP */
210 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
211 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
212 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
213 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
214
215 /* preset the indexes portion of canonStartSets */
216 uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
217 }
218
219 /*
220 * get or create a Norm unit;
221 * get or create the intermediate trie entries for it as well
222 */
223 static Norm *
createNorm(uint32_t code)224 createNorm(uint32_t code) {
225 Norm *p;
226 uint32_t i;
227
228 i=utrie_get32(normTrie, (UChar32)code, NULL);
229 if(i!=0) {
230 p=norms+i;
231 } else {
232 /* allocate Norm */
233 p=allocNorm();
234 if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
235 fprintf(stderr, "error: too many normalization entries\n");
236 exit(U_BUFFER_OVERFLOW_ERROR);
237 }
238 }
239 return p;
240 }
241
242 /* get an existing Norm unit */
243 static Norm *
getNorm(uint32_t code)244 getNorm(uint32_t code) {
245 uint32_t i;
246
247 i=utrie_get32(normTrie, (UChar32)code, NULL);
248 if(i==0) {
249 return NULL;
250 }
251 return norms+i;
252 }
253
254 /* get the canonical combining class of a character */
255 static uint8_t
getCCFromCP(uint32_t code)256 getCCFromCP(uint32_t code) {
257 Norm *norm=getNorm(code);
258 if(norm==NULL) {
259 return 0;
260 } else {
261 return norm->udataCC;
262 }
263 }
264
265 /*
266 * enumerate all code points with their Norm structs and call a function for each
267 * return the number of code points with data
268 */
269 static uint32_t
enumTrie(EnumTrieFn * fn,void * context)270 enumTrie(EnumTrieFn *fn, void *context) {
271 uint32_t count, i;
272 UChar32 code;
273 UBool isInBlockZero;
274
275 count=0;
276 for(code=0; code<=0x10ffff;) {
277 i=utrie_get32(normTrie, code, &isInBlockZero);
278 if(isInBlockZero) {
279 code+=UTRIE_DATA_BLOCK_LENGTH;
280 } else {
281 if(i!=0) {
282 fn(context, (uint32_t)code, norms+i);
283 ++count;
284 }
285 ++code;
286 }
287 }
288 return count;
289 }
290
291 static void
setHaveSeenString(const uint32_t * s,int32_t length)292 setHaveSeenString(const uint32_t *s, int32_t length) {
293 uint32_t c;
294
295 while(length>0) {
296 c=*s++;
297 haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
298 --length;
299 }
300 }
301
302 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
303
304 /* handle combining data ---------------------------------------------------- */
305
306 /*
307 * Insert an entry into combiningCPs[] for the new code point code with its flags.
308 * The flags indicate if code combines forward, backward, or both.
309 *
310 * combiningCPs[] contains three sections:
311 * 1. code points that combine forward
312 * 2. code points that combine forward and backward
313 * 3. code points that combine backward
314 *
315 * Search for code in the entire array.
316 * If it is found and already is in the right section (old flags==new flags)
317 * then we are done.
318 * If it is found but the flags are different, then remove it,
319 * union the old and new flags, and reinsert it into its correct section.
320 * If it is not found, then just insert it.
321 *
322 * Within each section, the code points are not sorted.
323 */
324 static void
addCombiningCP(uint32_t code,uint8_t flags)325 addCombiningCP(uint32_t code, uint8_t flags) {
326 uint32_t newEntry;
327 uint16_t i;
328
329 newEntry=code|((uint32_t)flags<<24);
330
331 /* search for this code point */
332 for(i=0; i<combineBackTop; ++i) {
333 if(code==(combiningCPs[i]&0xffffff)) {
334 /* found it */
335 if(newEntry==combiningCPs[i]) {
336 return; /* no change */
337 }
338
339 /* combine the flags, remove the old entry from the old place, and insert the new one */
340 newEntry|=combiningCPs[i];
341 if(i!=--combineBackTop) {
342 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
343 }
344 if(i<combineBothTop) {
345 --combineBothTop;
346 }
347 if(i<combineFwdTop) {
348 --combineFwdTop;
349 }
350 break;
351 }
352 }
353
354 /* not found or modified, insert it */
355 if(combineBackTop>=sizeof(combiningCPs)/4) {
356 fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
357 (long)(sizeof(combiningCPs)/4));
358 exit(U_MEMORY_ALLOCATION_ERROR);
359 }
360
361 /* set i to the insertion point */
362 flags=(uint8_t)(newEntry>>24);
363 if(flags==1) {
364 i=combineFwdTop++;
365 ++combineBothTop;
366 } else if(flags==3) {
367 i=combineBothTop++;
368 } else /* flags==2 */ {
369 i=combineBackTop;
370 }
371
372 /* move the following code points up one and insert newEntry at i */
373 if(i<combineBackTop) {
374 uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
375 }
376 combiningCPs[i]=newEntry;
377
378 /* finally increment the total counter */
379 ++combineBackTop;
380 }
381
382 /**
383 * Find the index in combiningCPs[] where code point code is stored.
384 * @param code code point to look for
385 * @param isLead is code a forward combining code point?
386 * @return index in combiningCPs[] where code is stored
387 */
388 static uint16_t
findCombiningCP(uint32_t code,UBool isLead)389 findCombiningCP(uint32_t code, UBool isLead) {
390 uint16_t i, limit;
391
392 if(isLead) {
393 i=0;
394 limit=combineBothTop;
395 } else {
396 i=combineFwdTop;
397 limit=combineBackTop;
398 }
399
400 /* search for this code point */
401 for(; i<limit; ++i) {
402 if(code==(combiningCPs[i]&0xffffff)) {
403 /* found it */
404 return i;
405 }
406 }
407
408 /* not found */
409 return 0xffff;
410 }
411
412 static void
addCombiningTriple(uint32_t lead,uint32_t trail,uint32_t combined)413 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
414 CombiningTriple *triple;
415
416 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
417 return;
418 }
419
420 /*
421 * set combiningFlags for the two code points
422 * do this after decomposition so that getNorm() above returns NULL
423 * if we do not have actual sub-decomposition data for the initial NFD here
424 */
425 createNorm(lead)->combiningFlags|=1; /* combines forward */
426 createNorm(trail)->combiningFlags|=2; /* combines backward */
427
428 addCombiningCP(lead, 1);
429 addCombiningCP(trail, 2);
430
431 triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
432 triple->lead=lead;
433 triple->trail=trail;
434 triple->combined=combined;
435 }
436
437 static int
compareTriples(const void * l,const void * r)438 compareTriples(const void *l, const void *r) {
439 int diff;
440 diff=(int)((CombiningTriple *)l)->leadIndex-
441 (int)((CombiningTriple *)r)->leadIndex;
442 if(diff==0) {
443 diff=(int)((CombiningTriple *)l)->trailIndex-
444 (int)((CombiningTriple *)r)->trailIndex;
445 }
446 return diff;
447 }
448
449 static void
processCombining()450 processCombining() {
451 CombiningTriple *triples;
452 uint16_t *p;
453 uint32_t combined;
454 uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
455
456 triples=utm_getStart(combiningTriplesMem);
457
458 /* add lead and trail indexes to the triples for sorting */
459 count=(uint16_t)utm_countItems(combiningTriplesMem);
460 for(i=0; i<count; ++i) {
461 /* findCombiningCP() must always find the code point */
462 triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
463 triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
464 }
465
466 /* sort them by leadIndex, trailIndex */
467 qsort(triples, count, sizeof(CombiningTriple), compareTriples);
468
469 /* calculate final combining indexes and store them in the Norm entries */
470 tableTop=0;
471 j=0; /* triples counter */
472
473 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */
474 for(i=0; i<combineBothTop; ++i) {
475 /* start a new table */
476
477 /* assign combining index */
478 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
479
480 /* calculate the length of the combining data for this lead code point in the combiningTable */
481 while(j<count && i==triples[j].leadIndex) {
482 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */
483 combined=triples[j++].combined;
484 if(combined<=0x1fff) {
485 tableTop+=2;
486 } else {
487 tableTop+=3;
488 }
489 }
490 }
491
492 /* second, combining indexes of back-only characters are simply incremented from here to be unique */
493 finalIndex=tableTop;
494 for(; i<combineBackTop; ++i) {
495 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
496 }
497
498 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
499 if(finalIndex>0x8000) {
500 fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
501 tableTop, (long)(sizeof(combiningTable)/4));
502 exit(U_MEMORY_ALLOCATION_ERROR);
503 }
504
505 combiningTableTop=tableTop;
506
507 /* store the combining data in the combiningTable, with the final indexes from above */
508 p=combiningTable;
509 j=0; /* triples counter */
510
511 /*
512 * this is essentially the same loop as above, but
513 * it writes the table data instead of calculating and setting the final indexes;
514 * it is necessary to have two passes so that all the final indexes are known before
515 * they are written into the table
516 */
517 for(i=0; i<combineBothTop; ++i) {
518 /* start a new table */
519
520 combined=0; /* avoid compiler warning */
521
522 /* store the combining data for this lead code point in the combiningTable */
523 while(j<count && i==triples[j].leadIndex) {
524 finalIndex=combiningIndexes[triples[j].trailIndex];
525 combined=triples[j++].combined;
526
527 /* is combined a starter? (i.e., cc==0 && combines forward) */
528 combinesFwd=(uint16_t)((getNorm(combined)->combiningFlags&1)<<13);
529
530 *p++=finalIndex;
531 if(combined<=0x1fff) {
532 *p++=(uint16_t)(combinesFwd|combined);
533 } else if(combined<=0xffff) {
534 *p++=(uint16_t)(0x8000|combinesFwd);
535 *p++=(uint16_t)combined;
536 } else {
537 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
538 *p++=(uint16_t)(0xdc00|(combined&0x3ff));
539 }
540 }
541
542 /* set a marker on the last final trail index in this lead's table */
543 if(combined<=0x1fff) {
544 *(p-2)|=0x8000;
545 } else {
546 *(p-3)|=0x8000;
547 }
548 }
549
550 /* post condition: tableTop==(p-combiningTable) */
551 }
552
553 /* processing incoming normalization data ----------------------------------- */
554
555 /*
556 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
557 * c must be a Hangul syllable code point.
558 */
559 static void
getHangulDecomposition(uint32_t c,Norm * pHangulNorm,uint32_t hangulBuffer[3])560 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
561 /* Hangul syllable: decompose algorithmically */
562 uint32_t c2;
563 uint8_t length;
564
565 uprv_memset(pHangulNorm, 0, sizeof(Norm));
566
567 c-=HANGUL_BASE;
568
569 c2=c%JAMO_T_COUNT;
570 c/=JAMO_T_COUNT;
571 if(c2>0) {
572 hangulBuffer[2]=JAMO_T_BASE+c2;
573 length=3;
574 } else {
575 hangulBuffer[2]=0;
576 length=2;
577 }
578
579 hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
580 hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
581
582 pHangulNorm->nfd=hangulBuffer;
583 pHangulNorm->lenNFD=length;
584 if(DO_STORE(UGENNORM_STORE_COMPAT)) {
585 pHangulNorm->nfkd=hangulBuffer;
586 pHangulNorm->lenNFKD=length;
587 }
588 }
589
590 /*
591 * decompose the one decomposition further, may generate two decompositions
592 * apply all previous characters' decompositions to this one
593 */
594 static void
decompStoreNewNF(uint32_t code,Norm * norm)595 decompStoreNewNF(uint32_t code, Norm *norm) {
596 uint32_t nfd[40], nfkd[40], hangulBuffer[3];
597 Norm hangulNorm;
598
599 uint32_t *s32;
600 Norm *p;
601 uint32_t c;
602 int32_t i, length;
603 uint8_t lenNFD=0, lenNFKD=0;
604 UBool changedNFD=FALSE, changedNFKD=FALSE;
605
606 if((length=norm->lenNFD)!=0) {
607 /* always allocate the original string */
608 changedNFD=TRUE;
609 s32=norm->nfd;
610 } else if((length=norm->lenNFKD)!=0) {
611 /* always allocate the original string */
612 changedNFKD=TRUE;
613 s32=norm->nfkd;
614 } else {
615 /* no decomposition here, nothing to do */
616 return;
617 }
618
619 /* decompose each code point */
620 for(i=0; i<length; ++i) {
621 c=s32[i];
622 p=getNorm(c);
623 if(p==NULL) {
624 if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
625 getHangulDecomposition(c, &hangulNorm, hangulBuffer);
626 p=&hangulNorm;
627 } else {
628 /* no data, no decomposition */
629 nfd[lenNFD++]=c;
630 nfkd[lenNFKD++]=c;
631 continue;
632 }
633 }
634
635 /* canonically decompose c */
636 if(changedNFD) {
637 if(p->lenNFD!=0) {
638 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
639 lenNFD+=p->lenNFD;
640 } else {
641 nfd[lenNFD++]=c;
642 }
643 }
644
645 /* compatibility-decompose c */
646 if(p->lenNFKD!=0) {
647 uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
648 lenNFKD+=p->lenNFKD;
649 changedNFKD=TRUE;
650 } else if(p->lenNFD!=0) {
651 uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
652 lenNFKD+=p->lenNFD;
653 /*
654 * not changedNFKD=TRUE;
655 * so that we do not store a new nfkd if there was no nfkd string before
656 * and we only see canonical decompositions
657 */
658 } else {
659 nfkd[lenNFKD++]=c;
660 }
661 }
662
663 /* assume that norm->lenNFD==1 or ==2 */
664 if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
665 addCombiningTriple(s32[0], s32[1], code);
666 }
667
668 if(changedNFD) {
669 if(lenNFD!=0) {
670 s32=utm_allocN(utf32Mem, lenNFD);
671 uprv_memcpy(s32, nfd, lenNFD*4);
672 } else {
673 s32=NULL;
674 }
675 norm->lenNFD=lenNFD;
676 norm->nfd=s32;
677 setHaveSeenString(nfd, lenNFD);
678 }
679 if(changedNFKD) {
680 if(lenNFKD!=0) {
681 s32=utm_allocN(utf32Mem, lenNFKD);
682 uprv_memcpy(s32, nfkd, lenNFKD*4);
683 } else {
684 s32=NULL;
685 }
686 norm->lenNFKD=lenNFKD;
687 norm->nfkd=s32;
688 setHaveSeenString(nfkd, lenNFKD);
689 }
690 }
691
692 typedef struct DecompSingle {
693 uint32_t c;
694 Norm *norm;
695 } DecompSingle;
696
697 /*
698 * apply this one character's decompositions (there is at least one!) to
699 * all previous characters' decompositions to decompose them further
700 */
701 static void
decompWithSingleFn(void * context,uint32_t code,Norm * norm)702 decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
703 uint32_t nfd[40], nfkd[40];
704 uint32_t *s32;
705 DecompSingle *me=(DecompSingle *)context;
706 uint32_t c, myC;
707 int32_t i, length;
708 uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
709 UBool changedNFD=FALSE, changedNFKD=FALSE;
710
711 /* get the new character's data */
712 myC=me->c;
713 myLenNFD=me->norm->lenNFD;
714 myLenNFKD=me->norm->lenNFKD;
715 /* assume that myC has at least one decomposition */
716
717 if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
718 /* apply NFD(myC) to norm->nfd */
719 s32=norm->nfd;
720 for(i=0; i<length; ++i) {
721 c=s32[i];
722 if(c==myC) {
723 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
724 lenNFD+=myLenNFD;
725 changedNFD=TRUE;
726 } else {
727 nfd[lenNFD++]=c;
728 }
729 }
730 }
731
732 if((length=norm->lenNFKD)!=0) {
733 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */
734 s32=norm->nfkd;
735 for(i=0; i<length; ++i) {
736 c=s32[i];
737 if(c==myC) {
738 if(myLenNFKD!=0) {
739 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
740 lenNFKD+=myLenNFKD;
741 } else /* assume myLenNFD!=0 */ {
742 uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
743 lenNFKD+=myLenNFD;
744 }
745 changedNFKD=TRUE;
746 } else {
747 nfkd[lenNFKD++]=c;
748 }
749 }
750 } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
751 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
752 s32=norm->nfd;
753 for(i=0; i<length; ++i) {
754 c=s32[i];
755 if(c==myC) {
756 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
757 lenNFKD+=myLenNFKD;
758 changedNFKD=TRUE;
759 } else {
760 nfkd[lenNFKD++]=c;
761 }
762 }
763 }
764
765 /* set the new decompositions, forget the old ones */
766 if(changedNFD) {
767 if(lenNFD!=0) {
768 if(lenNFD>norm->lenNFD) {
769 s32=utm_allocN(utf32Mem, lenNFD);
770 } else {
771 s32=norm->nfd;
772 }
773 uprv_memcpy(s32, nfd, lenNFD*4);
774 } else {
775 s32=NULL;
776 }
777 norm->lenNFD=lenNFD;
778 norm->nfd=s32;
779 }
780 if(changedNFKD) {
781 if(lenNFKD!=0) {
782 if(lenNFKD>norm->lenNFKD) {
783 s32=utm_allocN(utf32Mem, lenNFKD);
784 } else {
785 s32=norm->nfkd;
786 }
787 uprv_memcpy(s32, nfkd, lenNFKD*4);
788 } else {
789 s32=NULL;
790 }
791 norm->lenNFKD=lenNFKD;
792 norm->nfkd=s32;
793 }
794 }
795
796 /*
797 * process the data for one code point listed in UnicodeData;
798 * UnicodeData itself never maps a code point to both NFD and NFKD
799 */
800 extern void
storeNorm(uint32_t code,Norm * norm)801 storeNorm(uint32_t code, Norm *norm) {
802 DecompSingle decompSingle;
803 Norm *p;
804
805 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
806 /* ignore compatibility decomposition */
807 norm->lenNFKD=0;
808 }
809
810 /* copy existing derived normalization properties */
811 p=createNorm(code);
812 norm->qcFlags=p->qcFlags;
813 norm->combiningFlags=p->combiningFlags;
814 norm->fncIndex=p->fncIndex;
815
816 /* process the decomposition if there is one here */
817 if((norm->lenNFD|norm->lenNFKD)!=0) {
818 /* decompose this one decomposition further, may generate two decompositions */
819 decompStoreNewNF(code, norm);
820
821 /* has this code point been used in previous decompositions? */
822 if(HAVE_SEEN(code)) {
823 /* use this decomposition to decompose other decompositions further */
824 decompSingle.c=code;
825 decompSingle.norm=norm;
826 enumTrie(decompWithSingleFn, &decompSingle);
827 }
828 }
829
830 /* store the data */
831 uprv_memcpy(p, norm, sizeof(Norm));
832 }
833
834 extern void
setQCFlags(uint32_t code,uint8_t qcFlags)835 setQCFlags(uint32_t code, uint8_t qcFlags) {
836 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
837 /* ignore compatibility decomposition: unset the KC/KD flags */
838 qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD);
839
840 /* set the KC/KD flags to the same values as the C/D flags */
841 qcFlags|=qcFlags<<1;
842 }
843 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
844 /* ignore composition data: unset the C/KC flags */
845 qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC);
846
847 /* set the C/KC flags to the same values as the D/KD flags */
848 qcFlags|=qcFlags>>2;
849 }
850
851 createNorm(code)->qcFlags|=qcFlags;
852
853 /* adjust the minimum code point for quick check no/maybe */
854 if(code<0xffff) {
855 if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
856 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
857 }
858 if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
859 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
860 }
861 if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
862 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
863 }
864 if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
865 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
866 }
867 }
868
869 if(qcFlags&_NORM_QC_NFD) {
870 uset_add(nfdQCNoSet, (UChar32)code);
871 }
872 }
873
874 extern void
setCompositionExclusion(uint32_t code)875 setCompositionExclusion(uint32_t code) {
876 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
877 createNorm(code)->combiningFlags|=0x80;
878 }
879 }
880
881 static void
setHangulJamoSpecials()882 setHangulJamoSpecials() {
883 Norm *norm;
884 uint32_t c, hangul;
885
886 /*
887 * Hangul syllables are algorithmically decomposed into Jamos,
888 * and Jamos are algorithmically composed into Hangul syllables.
889 * The quick check flags are parsed, except for Hangul.
890 */
891
892 /* set Jamo L specials */
893 hangul=0xac00;
894 for(c=0x1100; c<=0x1112; ++c) {
895 norm=createNorm(c);
896 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
897 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
898 norm->combiningFlags=1;
899 }
900
901 /* for each Jamo L create a set with its associated Hangul block */
902 norm->canonStart=uset_open(hangul, hangul+21*28-1);
903 hangul+=21*28;
904 }
905
906 /* set Jamo V specials */
907 for(c=0x1161; c<=0x1175; ++c) {
908 norm=createNorm(c);
909 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
910 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
911 norm->combiningFlags=2;
912 }
913 norm->unsafeStart=TRUE;
914 }
915
916 /* set Jamo T specials */
917 for(c=0x11a8; c<=0x11c2; ++c) {
918 norm=createNorm(c);
919 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
920 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
921 norm->combiningFlags=2;
922 }
923 norm->unsafeStart=TRUE;
924 }
925
926 /* set Hangul specials, precompacted */
927 norm=allocNorm();
928 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
929 if(DO_STORE(UGENNORM_STORE_COMPAT)) {
930 norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
931 } else {
932 norm->qcFlags=_NORM_QC_NFD;
933 }
934
935 if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
936 fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
937 exit(U_BUFFER_OVERFLOW_ERROR);
938 }
939 }
940
941 /*
942 * set FC-NFKC-Closure string
943 * s contains the closure string; s[0]==length, s[1..length] is the actual string
944 * may modify s[0]
945 */
946 U_CFUNC void
setFNC(uint32_t c,UChar * s)947 setFNC(uint32_t c, UChar *s) {
948 uint16_t *p;
949 int32_t length, i, count;
950 UChar first;
951
952 if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) ||
953 DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) ||
954 DO_NOT_STORE(UGENNORM_STORE_AUX)
955 ) {
956 return;
957 }
958
959 count=utm_countItems(extraMem);
960 length=s[0];
961 first=s[1];
962
963 /* try to overlay single-unit strings with existing ones */
964 if(length==1 && first<0xff00) {
965 p=utm_getStart(extraMem);
966 for(i=1; i<count; ++i) {
967 if(first==p[i]) {
968 break;
969 }
970 }
971 } else {
972 i=count;
973 }
974
975 /* append the new string if it cannot be overlayed with an old one */
976 if(i==count) {
977 if(count>_NORM_AUX_MAX_FNC) {
978 fprintf(stderr, "gennorm error: too many FNC strings\n");
979 exit(U_INDEX_OUTOFBOUNDS_ERROR);
980 }
981
982 /* prepend 0xffxx with xx==length */
983 s[0]=(uint16_t)(0xff00+length);
984 ++length;
985 p=(uint16_t *)utm_allocN(extraMem, length);
986 uprv_memcpy(p, s, length*2);
987
988 /* update the top index in extraMem[0] */
989 count+=length;
990 ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
991 }
992
993 /* store the index to the string */
994 createNorm(c)->fncIndex=i;
995 }
996
997 /* build runtime structures ------------------------------------------------- */
998
/*
 * Canonically reorder a UTF-32 string in place: a stable insertion sort
 * that moves each code point with a non-zero combining class back before
 * all directly preceding code points with a greater combining class.
 * Returns the combining classes of the first and the last code point of the
 * reordered string as (leadCC<<8)|trailCC, or 0 if length<=0.
 *
 * NOTE(review): the combining class buffer holds 40 values and length is not checked against it.
 */
static uint16_t
reorderString(uint32_t *s, int32_t length) {
    uint8_t classes[40];
    uint32_t cp;
    int32_t pos, slot;
    uint8_t cc;

    if(length<=0) {
        return 0;
    }

    for(pos=0; pos<length; ++pos) {
        /* get the next code point and its combining class */
        cp=s[pos];
        cc=getCCFromCP(cp);

        /* find the slot for it: only combining marks are moved back */
        slot=pos;
        if(cc!=0) {
            while(slot>0 && classes[slot-1]>cc) {
                /* move the previous code point up and go back */
                s[slot]=s[slot-1];
                classes[slot]=classes[slot-1];
                --slot;
            }
        }
        s[slot]=cp;
        classes[slot]=cc;
    }

    return (uint16_t)(((uint16_t)classes[0]<<8)|classes[length-1]);
}
1037
1038 #if 0
1039 static UBool combineAndQC[64]={ 0 };
1040 #endif
1041
1042 /*
1043 * canonically reorder the up to two decompositions
1044 * and store the leading and trailing combining classes accordingly
1045 *
1046 * also process canonical decompositions for canonical closure
1047 */
1048 static void
postParseFn(void * context,uint32_t code,Norm * norm)1049 postParseFn(void *context, uint32_t code, Norm *norm) {
1050 int32_t length;
1051
1052 /* canonically order the NFD */
1053 length=norm->lenNFD;
1054 if(length>0) {
1055 norm->canonBothCCs=reorderString(norm->nfd, length);
1056 }
1057
1058 /* canonically reorder the NFKD */
1059 length=norm->lenNFKD;
1060 if(length>0) {
1061 norm->compatBothCCs=reorderString(norm->nfkd, length);
1062 }
1063
1064 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
1065 if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
1066 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
1067 }
1068 if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
1069 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
1070 }
1071
1072 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
1073 #if 0
1074 combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
1075 #endif
1076
1077 if(norm->combiningFlags&1) {
1078 if(norm->udataCC!=0) {
1079 /* illegal - data-derivable composition exclusion */
1080 fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
1081 }
1082 }
1083 if(norm->combiningFlags&2) {
1084 if((norm->qcFlags&0x11)==0) {
1085 fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
1086 }
1087 #if 0
1088 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
1089 if(norm->udataCC==0) {
1090 printf("U+%04lx combines backward but udataCC==0\n", (long)code);
1091 }
1092 #endif
1093 }
1094 if((norm->combiningFlags&3)==3 && beVerbose) {
1095 printf("U+%04lx combines both ways\n", (long)code);
1096 }
1097
1098 /*
1099 * process canonical decompositions for canonical closure
1100 *
1101 * in each canonical decomposition:
1102 * add the current character (code) to the set of canonical starters of its norm->nfd[0]
1103 * set the "unsafe starter" flag for each norm->nfd[1..]
1104 */
1105 length=norm->lenNFD;
1106 if(length>0) {
1107 Norm *otherNorm;
1108 UChar32 c;
1109 int32_t i;
1110
1111 /* nfd[0].canonStart.add(code) */
1112 c=norm->nfd[0];
1113 otherNorm=createNorm(c);
1114 if(otherNorm->canonStart==NULL) {
1115 otherNorm->canonStart=uset_open(code, code);
1116 if(otherNorm->canonStart==NULL) {
1117 fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
1118 exit(U_MEMORY_ALLOCATION_ERROR);
1119 }
1120 } else {
1121 uset_add(otherNorm->canonStart, code);
1122 if(!uset_contains(otherNorm->canonStart, code)) {
1123 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
1124 exit(U_INTERNAL_PROGRAM_ERROR);
1125 }
1126 }
1127
1128 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
1129 for(i=1; i<length; ++i) {
1130 createNorm(norm->nfd[i])->unsafeStart=TRUE;
1131 }
1132 }
1133 }
1134
1135 static uint32_t
make32BitNorm(Norm * norm)1136 make32BitNorm(Norm *norm) {
1137 UChar extra[100];
1138 const Norm *other;
1139 uint32_t word;
1140 int32_t i, length, beforeZero=0, count, start;
1141
1142 /*
1143 * Check for assumptions:
1144 *
1145 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
1146 * then the decomposition also begins with a true starter.
1147 */
1148 if(norm->udataCC==0) {
1149 /* this is a starter */
1150 if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
1151 /* a "true" NFC starter with a canonical decomposition */
1152 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1153 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
1154 ) {
1155 fprintf(stderr,
1156 "error: true NFC starter canonical decomposition[%u] does not begin\n"
1157 " with a true NFC starter: U+%04lx U+%04lx%s\n",
1158 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1159 norm->lenNFD<=2 ? "" : " ...");
1160 exit(U_INVALID_TABLE_FILE);
1161 }
1162 }
1163
1164 if((norm->qcFlags&_NORM_QC_NFKC)==0) {
1165 if(norm->lenNFKD>0) {
1166 /* a "true" NFKC starter with a compatibility decomposition */
1167 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
1168 ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
1169 ) {
1170 fprintf(stderr,
1171 "error: true NFKC starter compatibility decomposition[%u] does not begin\n"
1172 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1173 norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
1174 norm->lenNFKD<=2 ? "" : " ...");
1175 exit(U_INVALID_TABLE_FILE);
1176 }
1177 } else if(norm->lenNFD>0) {
1178 /* a "true" NFKC starter with only a canonical decomposition */
1179 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
1180 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
1181 ) {
1182 fprintf(stderr,
1183 "error: true NFKC starter canonical decomposition[%u] does not begin\n"
1184 " with a true NFKC starter: U+%04lx U+%04lx%s\n",
1185 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
1186 norm->lenNFD<=2 ? "" : " ...");
1187 exit(U_INVALID_TABLE_FILE);
1188 }
1189 }
1190 }
1191 }
1192
1193 /* reset the 32-bit word and set the quick check flags */
1194 word=norm->qcFlags;
1195
1196 /* set the UnicodeData combining class */
1197 word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
1198
1199 /* set the combining flag and index */
1200 if(norm->combiningFlags&3) {
1201 word|=(uint32_t)(norm->combiningFlags&3)<<6;
1202 }
1203
1204 /* set the combining index value into the extra data */
1205 /* 0xffff: no combining index; 0..0x7fff: combining index */
1206 if(norm->combiningIndex!=0xffff) {
1207 extra[0]=norm->combiningIndex;
1208 beforeZero=1;
1209 }
1210
1211 count=beforeZero;
1212
1213 /* write the decompositions */
1214 if((norm->lenNFD|norm->lenNFKD)!=0) {
1215 extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
1216
1217 length=norm->lenNFD;
1218 if(length>0) {
1219 if(norm->canonBothCCs!=0) {
1220 extra[beforeZero]|=0x80;
1221 extra[count++]=norm->canonBothCCs;
1222 }
1223 start=count;
1224 for(i=0; i<length; ++i) {
1225 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
1226 }
1227 extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
1228 }
1229
1230 length=norm->lenNFKD;
1231 if(length>0) {
1232 if(norm->compatBothCCs!=0) {
1233 extra[beforeZero]|=0x8000;
1234 extra[count++]=norm->compatBothCCs;
1235 }
1236 start=count;
1237 for(i=0; i<length; ++i) {
1238 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
1239 }
1240 extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
1241 }
1242 }
1243
1244 /* allocate and copy the extra data */
1245 if(count!=0) {
1246 UChar *p;
1247
1248 if(norm->specialTag!=0) {
1249 fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
1250 exit(U_ILLEGAL_ARGUMENT_ERROR);
1251 }
1252
1253 p=(UChar *)utm_allocN(extraMem, count);
1254 uprv_memcpy(p, extra, count*2);
1255
1256 /* set the extra index, offset by beforeZero */
1257 word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
1258 } else if(norm->specialTag!=0) {
1259 /* set a special tag instead of an extra index */
1260 word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
1261 }
1262
1263 return word;
1264 }
1265
1266 /* turn all Norm structs into corresponding 32-bit norm values */
1267 static void
makeAll32()1268 makeAll32() {
1269 uint32_t *pNormData;
1270 uint32_t n;
1271 int32_t i, normLength, count;
1272
1273 count=(int32_t)utm_countItems(normMem);
1274 for(i=0; i<count; ++i) {
1275 norms[i].value32=make32BitNorm(norms+i);
1276 }
1277
1278 pNormData=utrie_getData(norm32Trie, &normLength);
1279
1280 count=0; /* count is now just used for debugging */
1281 for(i=0; i<normLength; ++i) {
1282 n=pNormData[i];
1283 if(0!=(pNormData[i]=norms[n].value32)) {
1284 ++count;
1285 }
1286 }
1287 }
1288
1289 /*
1290 * extract all Norm.canonBothCCs into the FCD table
1291 * set 32-bit values to use the common fold and compact functions
1292 */
1293 static void
makeFCD()1294 makeFCD() {
1295 uint32_t *pFCDData;
1296 uint32_t n;
1297 int32_t i, count, fcdLength;
1298 uint16_t bothCCs;
1299
1300 count=utm_countItems(normMem);
1301 for(i=0; i<count; ++i) {
1302 bothCCs=norms[i].canonBothCCs;
1303 if(bothCCs==0) {
1304 /* if there are no decomposition cc's then use the udataCC twice */
1305 bothCCs=norms[i].udataCC;
1306 bothCCs|=bothCCs<<8;
1307 }
1308 norms[i].value32=bothCCs;
1309 }
1310
1311 pFCDData=utrie_getData(fcdTrie, &fcdLength);
1312
1313 for(i=0; i<fcdLength; ++i) {
1314 n=pFCDData[i];
1315 pFCDData[i]=norms[n].value32;
1316 }
1317 }
1318
1319 /**
1320 * If the given set contains exactly one character, then return it.
1321 * Otherwise return -1.
1322 */
1323 static int32_t
usetContainsOne(const USet * set)1324 usetContainsOne(const USet* set) {
1325 if(uset_getItemCount(set)==1) {
1326 /* there is a single item (a single range) */
1327 UChar32 start, end;
1328 UErrorCode ec=U_ZERO_ERROR;
1329 int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
1330 if (len==0 && start==end) { /* a range (len==0) with a single code point */
1331 return start;
1332 }
1333 }
1334 return -1;
1335 }
1336
1337 static void
makeCanonSetFn(void * context,uint32_t code,Norm * norm)1338 makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
1339 if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
1340 uint16_t *table;
1341 int32_t c, tableLength;
1342 UErrorCode errorCode=U_ZERO_ERROR;
1343
1344 /* does the set contain exactly one code point? */
1345 c=usetContainsOne(norm->canonStart);
1346
1347 /* add an entry to the BMP or supplementary search table */
1348 if(code<=0xffff) {
1349 table=canonStartSets+_NORM_MAX_CANON_SETS;
1350 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1351
1352 table[tableLength++]=(uint16_t)code;
1353
1354 if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
1355 /* single-code point BMP result for BMP code point */
1356 table[tableLength++]=(uint16_t)c;
1357 } else {
1358 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
1359 c=-1;
1360 }
1361 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
1362 } else {
1363 table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
1364 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1365
1366 table[tableLength++]=(uint16_t)(code>>16);
1367 table[tableLength++]=(uint16_t)code;
1368
1369 if(c>=0) {
1370 /* single-code point result for supplementary code point */
1371 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
1372 table[tableLength++]=(uint16_t)c;
1373 } else {
1374 table[tableLength++]=(uint16_t)canonStartSetsTop;
1375 }
1376 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
1377 }
1378
1379 if(c<0) {
1380 /* write a USerializedSet */
1381 ++canonSetsCount;
1382 canonStartSetsTop+=
1383 uset_serialize(norm->canonStart,
1384 canonStartSets+canonStartSetsTop,
1385 _NORM_MAX_CANON_SETS-canonStartSetsTop,
1386 &errorCode);
1387 }
1388 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1389
1390 if(U_FAILURE(errorCode)) {
1391 fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
1392 exit(errorCode);
1393 }
1394 if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
1395 fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
1396 exit(U_INDEX_OUTOFBOUNDS_ERROR);
1397 }
1398 }
1399 }
1400
1401 /* for getSkippableFlags ---------------------------------------------------- */
1402
1403 /* combine the lead and trail code points; return <0 if they do not combine */
1404 static int32_t
combine(uint32_t lead,uint32_t trail)1405 combine(uint32_t lead, uint32_t trail) {
1406 CombiningTriple *triples;
1407 uint32_t i, count;
1408
1409 /* search for all triples with c as lead code point */
1410 triples=utm_getStart(combiningTriplesMem);
1411 count=utm_countItems(combiningTriplesMem);
1412
1413 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1414 for(i=0; i<count && lead!=triples[i].lead; ++i) {}
1415
1416 /* check each triple for this code point */
1417 for(; i<count && lead==triples[i].lead; ++i) {
1418 if(trail==triples[i].trail) {
1419 return (int32_t)triples[i].combined;
1420 }
1421 }
1422
1423 return -1;
1424 }
1425
1426 /*
1427 * Starting from the canonical decomposition s[0..length[ of a single code point,
1428 * is the code point c consumed in an NFC/FCC recomposition?
1429 *
1430 * No need to handle discontiguous composition because that would not consume some
1431 * intermediate character, so would not compose back to the original character.
1432 * See comments in canChangeWithFollowing().
1433 *
1434 * No need to compose beyond where c canonically orders because if it is consumed
1435 * then the result differs from the original anyway.
1436 *
1437 * Possible optimization:
1438 * - Verify that there are no cases of the same combining mark stacking twice.
1439 * - return FALSE right away if c inserts after a copy of itself
1440 * without attempting to recompose; will happen because each mark in
1441 * the decomposition will be enumerated and passed in as c.
1442 * More complicated and fragile though than it is already.
1443 *
1444 * markus 2002nov04
1445 */
1446 static UBool
doesComposeConsume(const uint32_t * s,int32_t length,uint32_t c,uint8_t cc)1447 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
1448 int32_t starter, i;
1449
1450 /* ignore trailing characters where cc<prevCC */
1451 while(length>1 && cc<getCCFromCP(s[length-1])) {
1452 --length;
1453 }
1454
1455 /* start consuming/combining from the beginning */
1456 starter=(int32_t)s[0];
1457 for(i=1; i<length; ++i) {
1458 starter=combine((uint32_t)starter, s[i]);
1459 if(starter<0) {
1460 fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
1461 (int)s[0], (int)s[1], (int)length, (int)c, cc);
1462 exit(U_INTERNAL_PROGRAM_ERROR);
1463 }
1464 }
1465
1466 /* try to combine/consume c, return TRUE if it is consumed */
1467 return combine((uint32_t)starter, c)>=0;
1468 }
1469
1470 /* does the starter s[0] combine forward with another char that is below trailCC? */
1471 static UBool
canChangeWithFollowing(const uint32_t * s,int32_t length,uint8_t trailCC)1472 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
1473 if(trailCC<=1) {
1474 /* no character will combine ahead of the trailing char of the decomposition */
1475 return FALSE;
1476 }
1477
1478 /*
1479 * We are only checking skippable condition (f).
1480 * Therefore, the original character does not have quick check flag NFC_NO (c),
1481 * i.e., the decomposition recomposes completely back into the original code point.
1482 * So s[0] must be a true starter with cc==0 and
1483 * combining with following code points.
1484 *
1485 * Similarly, length==1 is not possible because that would be a singleton
1486 * decomposition which is marked with NFC_NO and does not pass (c).
1487 *
1488 * Only a character with cc<trailCC can change the composition.
1489 * Reason: A char with cc>=trailCC would order after decomposition s[],
1490 * composition would consume all of the decomposition, and here we know that
1491 * the original char passed check d), i.e., it does not combine forward,
1492 * therefore does not combine with anything after the decomposition is consumed.
1493 *
1494 * Now see if there is a character that
1495 * 1. combines backward
1496 * 2. has cc<trailCC
1497 * 3. is consumed in recomposition
1498 *
1499 * length==2 is simple:
1500 *
1501 * Characters that fulfill these conditions are exactly the ones that combine directly
1502 * with the starter c==s[0] because there is no intervening character after
1503 * reordering.
1504 * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
1505 * and see if one has cc<trailCC (passes 2.).
1506 *
1507 * length>2 is a little harder:
1508 *
1509 * Since we will get different starters during recomposition, we need to
1510 * enumerate each backward-combining character (1.)
1511 * with cc<trailCC (2.) and
1512 * see if it gets consumed in recomposition. (3.)
1513 * No need to enumerate both-ways combining characters because they must have cc==0.
1514 */
1515 if(length==2) {
1516 /* enumerate all chars that combine with this one and check their cc */
1517 CombiningTriple *triples;
1518 uint32_t c, i, count;
1519 uint8_t cc;
1520
1521 /* search for all triples with c as lead code point */
1522 triples=utm_getStart(combiningTriplesMem);
1523 count=utm_countItems(combiningTriplesMem);
1524 c=s[0];
1525
1526 /* triples are not sorted by code point but for each lead CP there is one contiguous block */
1527 for(i=0; i<count && c!=triples[i].lead; ++i) {}
1528
1529 /* check each triple for this code point */
1530 for(; i<count && c==triples[i].lead; ++i) {
1531 cc=getCCFromCP(triples[i].trail);
1532 if(cc>0 && cc<trailCC) {
1533 /* this trail code point combines with c and has cc<trailCC */
1534 return TRUE;
1535 }
1536 }
1537 } else {
1538 /* enumerate all chars that combine backward */
1539 uint32_t c2;
1540 uint16_t i;
1541 uint8_t cc;
1542
1543 for(i=combineBothTop; i<combineBackTop; ++i) {
1544 c2=combiningCPs[i]&0xffffff;
1545 cc=getCCFromCP(c2);
1546 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
1547 if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
1548 return TRUE;
1549 }
1550 }
1551 }
1552
1553 /* this decomposition is not modified by any appended character */
1554 return FALSE;
1555 }
1556
1557 /* see unormimp.h for details on NF*C Skippable flags */
1558 static uint32_t
getSkippableFlags(const Norm * norm)1559 getSkippableFlags(const Norm *norm) {
1560 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
1561
1562 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
1563 if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
1564 return 0;
1565 }
1566
1567 /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
1568
1569 /*
1570 * Note:
1571 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
1572 *
1573 * This means that (a)..(e) must always be derived from the runtime norm32 value,
1574 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
1575 * the form is NF*C and there is a canonical decomposition (NFD_NO).
1576 *
1577 * (a) unassigned code points get "not skippable"==false because they
1578 * don't have a Norm struct so they won't get here
1579 */
1580
1581 /* (b) not skippable if cc!=0 */
1582 if(norm->udataCC!=0) {
1583 return 0; /* non-zero flag for (f) only */
1584 }
1585
1586 /*
1587 * not NFC_Skippable if
1588 * (c) quick check flag == NO or
1589 * (d) combines forward or
1590 * (e) combines back or
1591 * (f) can change if another character is added
1592 *
1593 * for (f):
1594 * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
1595 * check its composition list,
1596 * see if any of the second code points in the list
1597 * has cc less than the trailCC of the decomposition.
1598 *
1599 * For FCC: Test at runtime if the decomposition has a trailCC>1
1600 * -> there are characters with cc==1, they would order before the trail char
1601 * and prevent contiguous combination with the trail char.
1602 */
1603 if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
1604 (norm->combiningFlags&3)!=0) {
1605 return 0; /* non-zero flag for (f) only */
1606 }
1607 if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
1608 return _NORM_AUX_NFC_SKIP_F_MASK;
1609 }
1610
1611 return 0; /* skippable */
1612 }
1613
1614 static void
makeAux()1615 makeAux() {
1616 Norm *norm;
1617 uint32_t *pData;
1618 int32_t i, length;
1619
1620 pData=utrie_getData(auxTrie, &length);
1621
1622 for(i=0; i<length; ++i) {
1623 norm=norms+pData[i];
1624 /*
1625 * 16-bit auxiliary normalization properties
1626 * see unormimp.h
1627 */
1628 pData[i]=
1629 ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
1630 (uint32_t)norm->fncIndex;
1631
1632 if(norm->unsafeStart || norm->udataCC!=0) {
1633 pData[i]|=_NORM_AUX_UNSAFE_MASK;
1634 }
1635
1636 pData[i]|=getSkippableFlags(norm);
1637 }
1638 }
1639
1640 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
1641 static uint32_t U_CALLCONV
getFoldedNormValue(UNewTrie * trie,UChar32 start,int32_t offset)1642 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1643 uint32_t value, leadNorm32=0;
1644 UChar32 limit;
1645 UBool inBlockZero;
1646
1647 limit=start+0x400;
1648 while(start<limit) {
1649 value=utrie_get32(trie, start, &inBlockZero);
1650 if(inBlockZero) {
1651 start+=UTRIE_DATA_BLOCK_LENGTH;
1652 } else {
1653 if(value!=0) {
1654 leadNorm32|=value;
1655 }
1656 ++start;
1657 }
1658 }
1659
1660 /* turn multi-bit fields into the worst-case value */
1661 if(leadNorm32&_NORM_CC_MASK) {
1662 leadNorm32|=_NORM_CC_MASK;
1663 }
1664
1665 /* clean up unnecessarily ored bit fields */
1666 leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
1667
1668 if(leadNorm32==0) {
1669 /* nothing to do (only composition exclusions?) */
1670 return 0;
1671 }
1672
1673 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
1674 leadNorm32|=(
1675 (uint32_t)_NORM_EXTRA_INDEX_TOP+
1676 (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
1677 )<<_NORM_EXTRA_SHIFT;
1678
1679 return leadNorm32;
1680 }
1681
1682 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
1683
1684 /*
1685 * folding value for auxiliary data:
1686 * store the non-zero offset in bits 9..0 (FNC bits)
1687 * if there is any non-0 entry;
1688 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
1689 */
1690 static uint32_t U_CALLCONV
getFoldedAuxValue(UNewTrie * trie,UChar32 start,int32_t offset)1691 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
1692 uint32_t value, oredValues;
1693 UChar32 limit;
1694 UBool inBlockZero;
1695
1696 oredValues=0;
1697 limit=start+0x400;
1698 while(start<limit) {
1699 value=utrie_get32(trie, start, &inBlockZero);
1700 if(inBlockZero) {
1701 start+=UTRIE_DATA_BLOCK_LENGTH;
1702 } else {
1703 oredValues|=value;
1704 ++start;
1705 }
1706 }
1707
1708 if(oredValues!=0) {
1709 /* move the 10 significant offset bits into bits 9..0 */
1710 offset>>=UTRIE_SURROGATE_BLOCK_BITS;
1711 if(offset>_NORM_AUX_FNC_MASK) {
1712 fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
1713 exit(U_INDEX_OUTOFBOUNDS_ERROR);
1714 }
1715 return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
1716 } else {
1717 return 0;
1718 }
1719 }
1720
1721 extern void
processData()1722 processData() {
1723 #if 0
1724 uint16_t i;
1725 #endif
1726
1727 processCombining();
1728
1729 /* canonically reorder decompositions and assign combining classes for decompositions */
1730 enumTrie(postParseFn, NULL);
1731
1732 #if 0
1733 for(i=1; i<64; ++i) {
1734 if(combineAndQC[i]) {
1735 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
1736 }
1737 }
1738 #endif
1739
1740 /* add hangul/jamo specials */
1741 setHangulJamoSpecials();
1742
1743 /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
1744 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
1745
1746 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
1747 if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
1748 enumTrie(makeCanonSetFn, NULL);
1749 }
1750
1751 /* clone the normalization builder trie to make the final data tries */
1752 if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
1753 NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
1754 NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
1755 ) {
1756 fprintf(stderr, "error: unable to clone the normalization trie\n");
1757 exit(U_MEMORY_ALLOCATION_ERROR);
1758 }
1759
1760 /* --- finalize data for quick checks & normalization --- */
1761
1762 /* turn the Norm structs (stage2, norms) into 32-bit data words */
1763 makeAll32();
1764
1765 /* --- finalize data for FCD checks --- */
1766
1767 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */
1768 makeFCD();
1769
1770 /* --- finalize auxiliary normalization data --- */
1771 makeAux();
1772
1773 if(beVerbose) {
1774 #if 0
1775 printf("number of stage 2 entries: %ld\n", stage2Mem->index);
1776 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
1777 #endif
1778 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
1779 printf("combining table count: %u\n", combiningTableTop);
1780 }
1781 }
1782
1783 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1784
1785 extern void
generateData(const char * dataDir,UBool csource)1786 generateData(const char *dataDir, UBool csource) {
1787 static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
1788
1789 UNewDataMemory *pData;
1790 UErrorCode errorCode=U_ZERO_ERROR;
1791 int32_t size, dataLength;
1792
1793 #if UCONFIG_NO_NORMALIZATION
1794
1795 size=0;
1796
1797 #else
1798
1799 U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
1800 U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1801 USet *set;
1802 int32_t normTrieSize, fcdTrieSize, auxTrieSize;
1803
1804 normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
1805 if(U_FAILURE(errorCode)) {
1806 fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
1807 exit(errorCode);
1808 }
1809
1810 if(DO_STORE(UGENNORM_STORE_FCD)) {
1811 fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
1812 if(U_FAILURE(errorCode)) {
1813 fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
1814 exit(errorCode);
1815 }
1816 } else {
1817 fcdTrieSize=0;
1818 }
1819
1820 if(DO_STORE(UGENNORM_STORE_AUX)) {
1821 auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
1822 if(U_FAILURE(errorCode)) {
1823 fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
1824 exit(errorCode);
1825 }
1826 } else {
1827 auxTrieSize=0;
1828 }
1829
1830 /* move the parts of canonStartSets[] together into a contiguous block */
1831 if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
1832 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
1833 ) {
1834 uprv_memmove(canonStartSets+canonStartSetsTop,
1835 canonStartSets+_NORM_MAX_CANON_SETS,
1836 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
1837 }
1838 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1839
1840 if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
1841 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
1842 ) {
1843 uprv_memmove(canonStartSets+canonStartSetsTop,
1844 canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
1845 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
1846 }
1847 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1848
1849 /* create the normalization exclusion sets */
1850 /*
1851 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
1852 * but we cannot use NFD_QC from the pattern because that would require
1853 * unorm.icu which we are just going to generate.
1854 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic
1855 * with that.
1856 */
1857 U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
1858 U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
1859
1860 canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
1861 set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
1862 if(U_FAILURE(errorCode)) {
1863 fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1864 exit(errorCode);
1865 }
1866 uset_retainAll(set, nfdQCNoSet);
1867 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1868 uset_clear(set);
1869 }
1870 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1871 if(U_FAILURE(errorCode)) {
1872 fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
1873 exit(errorCode);
1874 }
1875 uset_close(set);
1876
1877 canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
1878 set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
1879 if(U_FAILURE(errorCode)) {
1880 fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1881 exit(errorCode);
1882 }
1883 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
1884 uset_clear(set);
1885 }
1886 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
1887 if(U_FAILURE(errorCode)) {
1888 fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
1889 exit(errorCode);
1890 }
1891 uset_close(set);
1892
1893 canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
1894
1895 /* make sure that the FCD trie is 4-aligned */
1896 if((utm_countItems(extraMem)+combiningTableTop)&1) {
1897 combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
1898 }
1899
1900 /* pad canonStartSets to 4-alignment, too */
1901 if(canonStartSetsTop&1) {
1902 canonStartSets[canonStartSetsTop++]=0x1235;
1903 }
1904
1905 size=
1906 _NORM_INDEX_TOP*4+
1907 normTrieSize+
1908 utm_countItems(extraMem)*2+
1909 combiningTableTop*2+
1910 fcdTrieSize+
1911 auxTrieSize+
1912 canonStartSetsTop*2;
1913
1914 if(beVerbose) {
1915 printf("size of normalization trie %5u bytes\n", (int)normTrieSize);
1916 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
1917 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
1918 printf("size of combining table %5u uint16_t\n", combiningTableTop);
1919 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize);
1920 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize);
1921 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop);
1922 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
1923 printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
1924 printf(" number of sets %5d\n", (int)canonSetsCount);
1925 printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
1926 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
1927 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
1928 printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
1929 }
1930
1931 indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
1932 indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
1933
1934 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
1935 indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
1936 indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
1937 indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
1938
1939 /* the quick check minimum code points are already set */
1940
1941 indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
1942 indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
1943 indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
1944
1945 #endif
1946
1947 if(csource) {
1948 #if UCONFIG_NO_NORMALIZATION
1949 /* no csource for dummy mode..? */
1950 fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
1951 exit(1);
1952 #else
1953 /* write .c file for hardcoded data */
1954 UTrie normTrie2={ NULL }, fcdTrie2={ NULL }, auxTrie2={ NULL };
1955 FILE *f;
1956
1957 utrie_unserialize(&normTrie2, normTrieBlock, normTrieSize, &errorCode);
1958 if(fcdTrieSize>0) {
1959 utrie_unserialize(&fcdTrie2, fcdTrieBlock, fcdTrieSize, &errorCode);
1960 }
1961 if(auxTrieSize>0) {
1962 utrie_unserialize(&auxTrie2, auxTrieBlock, auxTrieSize, &errorCode);
1963 }
1964 if(U_FAILURE(errorCode)) {
1965 fprintf(
1966 stderr,
1967 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
1968 u_errorName(errorCode));
1969 exit(errorCode);
1970 }
1971
1972 f=usrc_create(dataDir, "unorm_props_data.c");
1973 if(f!=NULL) {
1974 usrc_writeArray(f,
1975 "static const UVersionInfo formatVersion={ ",
1976 dataInfo.formatVersion, 8, 4,
1977 " };\n\n");
1978 usrc_writeArray(f,
1979 "static const UVersionInfo dataVersion={ ",
1980 dataInfo.dataVersion, 8, 4,
1981 " };\n\n");
1982 usrc_writeArray(f,
1983 "static const int32_t indexes[_NORM_INDEX_TOP]={\n",
1984 indexes, 32, _NORM_INDEX_TOP,
1985 "\n};\n\n");
1986 usrc_writeUTrieArrays(f,
1987 "static const uint16_t normTrie_index[%ld]={\n",
1988 "static const uint32_t normTrie_data32[%ld]={\n",
1989 &normTrie2,
1990 "\n};\n\n");
1991 usrc_writeUTrieStruct(f,
1992 "static const UTrie normTrie={\n",
1993 &normTrie2, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
1994 "};\n\n");
1995 usrc_writeArray(f,
1996 "static const uint16_t extraData[%ld]={\n",
1997 utm_getStart(extraMem), 16, utm_countItems(extraMem),
1998 "\n};\n\n");
1999 usrc_writeArray(f,
2000 "static const uint16_t combiningTable[%ld]={\n",
2001 combiningTable, 16, combiningTableTop,
2002 "\n};\n\n");
2003 if(fcdTrieSize>0) {
2004 usrc_writeUTrieArrays(f,
2005 "static const uint16_t fcdTrie_index[%ld]={\n", NULL,
2006 &fcdTrie2,
2007 "\n};\n\n");
2008 usrc_writeUTrieStruct(f,
2009 "static const UTrie fcdTrie={\n",
2010 &fcdTrie2, "fcdTrie_index", NULL, NULL,
2011 "};\n\n");
2012 } else {
2013 fputs( "static const UTrie fcdTrie={ NULL };\n\n", f);
2014 }
2015 if(auxTrieSize>0) {
2016 usrc_writeUTrieArrays(f,
2017 "static const uint16_t auxTrie_index[%ld]={\n", NULL,
2018 &auxTrie2,
2019 "\n};\n\n");
2020 usrc_writeUTrieStruct(f,
2021 "static const UTrie auxTrie={\n",
2022 &auxTrie2, "auxTrie_index", NULL, "getFoldingAuxOffset",
2023 "};\n\n");
2024 } else {
2025 fputs( "static const UTrie auxTrie={ NULL };\n\n", f);
2026 }
2027 usrc_writeArray(f,
2028 "static const uint16_t canonStartSets[%ld]={\n",
2029 canonStartSets, 16, canonStartSetsTop,
2030 "\n};\n\n");
2031 fclose(f);
2032 }
2033 #endif
2034 } else {
2035 /* write the data */
2036 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
2037 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
2038 if(U_FAILURE(errorCode)) {
2039 fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
2040 exit(errorCode);
2041 }
2042
2043 #if !UCONFIG_NO_NORMALIZATION
2044
2045 udata_writeBlock(pData, indexes, sizeof(indexes));
2046 udata_writeBlock(pData, normTrieBlock, normTrieSize);
2047 udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
2048 udata_writeBlock(pData, combiningTable, combiningTableTop*2);
2049 udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
2050 udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
2051 udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
2052
2053 #endif
2054
2055 /* finish up */
2056 dataLength=udata_finish(pData, &errorCode);
2057 if(U_FAILURE(errorCode)) {
2058 fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
2059 exit(errorCode);
2060 }
2061
2062 if(dataLength!=size) {
2063 fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
2064 (long)dataLength, (long)size);
2065 exit(U_INTERNAL_PROGRAM_ERROR);
2066 }
2067 }
2068 }
2069
2070 #if !UCONFIG_NO_NORMALIZATION
2071
2072 extern void
cleanUpData(void)2073 cleanUpData(void) {
2074 int32_t i, count;
2075
2076 count=utm_countItems(normMem);
2077 for(i=0; i<count; ++i) {
2078 uset_close(norms[i].canonStart);
2079 }
2080
2081 utm_close(normMem);
2082 utm_close(utf32Mem);
2083 utm_close(extraMem);
2084 utm_close(combiningTriplesMem);
2085 utrie_close(normTrie);
2086 utrie_close(norm32Trie);
2087 utrie_close(fcdTrie);
2088 utrie_close(auxTrie);
2089
2090 uset_close(nfdQCNoSet);
2091
2092 uprv_free(normTrie);
2093 uprv_free(norm32Trie);
2094 uprv_free(fcdTrie);
2095 uprv_free(auxTrie);
2096 }
2097
2098 #endif /* #if !UCONFIG_NO_NORMALIZATION */
2099
2100 /*
2101 * Hey, Emacs, please set the following:
2102 *
2103 * Local Variables:
2104 * indent-tabs-mode: nil
2105 * End:
2106 *
2107 */
2108