• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date        Name        Description
13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
15 *                         Added private method for doing FCD checks
16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
17 *                         string for codepoints < 0x300 for the normalization
18 *                         mode NFC.
19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
20 *                         instead of just wrappers around normlzr.cpp,
21 *                         load unorm.dat, support Unicode 3.1 with
22 *                         supplementary code points, etc.
23 */
24 
25 #include "unicode/utypes.h"
26 
27 #if !UCONFIG_NO_NORMALIZATION
28 
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
36 #include "ucln_cmn.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "cmemory.h"
40 #include "umutex.h"
41 #include "utrie.h"
42 #include "unicode/uset.h"
43 #include "udataswp.h"
44 #include "putilimp.h"
45 
46 /*
47  * Status of tailored normalization
48  *
49  * This was done initially for investigation on Unicode public review issue 7
50  * (http://www.unicode.org/review/). See Jitterbug 2481.
51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
53  * Unicode 3.2 normalization.
54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
55  *
56  * Tailored normalization as implemented here allows to "normalize less"
57  * than full Unicode normalization would.
58  * Based internally on a UnicodeSet of code points that are
59  * "excluded from normalization", the normalization functions leave those
60  * code points alone ("inert"). This means that tailored normalization
61  * still transforms text into a canonically equivalent form.
62  * It does not add decompositions to code points that do not have any or
63  * change decomposition results.
64  *
65  * Any function that searches for a safe boundary has not been touched,
66  * which means that these functions will be over-pessimistic when
67  * exclusions are applied.
68  * This should not matter because subsequent checks and normalizations
69  * do apply the exclusions; only a little more of the text may be processed
70  * than necessary under exclusions.
71  *
72  * Normalization exclusions have the following effect on excluded code points c:
73  * - c is not decomposed
74  * - c is not a composition target
75  * - c does not combine forward or backward for composition
76  *   except that this is not implemented for Jamo
77  * - c is treated as having a combining class of 0
78  */
79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
80 
81 U_NAMESPACE_USE
82 
83 /*
84  * This new implementation of the normalization code loads its data from
85  * unorm.dat, which is generated with the gennorm tool.
86  * The format of that file is described in unormimp.h .
87  */
88 
89 /* -------------------------------------------------------------------------- */
90 
91 enum {
92     _STACK_BUFFER_CAPACITY=100
93 };
94 
95 /*
96  * Constants for the bit fields in the options bit set parameter.
97  * These need not be public.
98  * A user only needs to know the currently assigned values.
99  * The number and positions of reserved bits per field can remain private
100  * and may change in future implementations.
101  */
102 enum {
103     _NORM_OPTIONS_NX_MASK=0x1f,
104     _NORM_OPTIONS_UNICODE_MASK=0x60,
105     _NORM_OPTIONS_SETS_MASK=0x7f,
106 
107     _NORM_OPTIONS_UNICODE_SHIFT=5,
108 
109     /*
110      * The following options are used only in some composition functions.
111      * They use bits 12 and up to preserve lower bits for the available options
112      * space in unorm_compare() -
113      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
114      */
115 
116     /** Options bit 12, for compatibility vs. canonical decomposition. */
117     _NORM_OPTIONS_COMPAT=0x1000,
118     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
119     _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
120 };
121 
122 U_CDECL_BEGIN
123 static inline UBool
isHangulWithoutJamoT(UChar c)124 isHangulWithoutJamoT(UChar c) {
125     c-=HANGUL_BASE;
126     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
127 }
128 
129 /* norm32 helpers */
130 
131 /* is this a norm32 with a regular index? */
132 static inline UBool
isNorm32Regular(uint32_t norm32)133 isNorm32Regular(uint32_t norm32) {
134     return norm32<_NORM_MIN_SPECIAL;
135 }
136 
137 /* is this a norm32 with a special index for a lead surrogate? */
138 static inline UBool
isNorm32LeadSurrogate(uint32_t norm32)139 isNorm32LeadSurrogate(uint32_t norm32) {
140     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
141 }
142 
143 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
144 static inline UBool
isNorm32HangulOrJamo(uint32_t norm32)145 isNorm32HangulOrJamo(uint32_t norm32) {
146     return norm32>=_NORM_MIN_HANGUL;
147 }
148 
149 /*
150  * Given isNorm32HangulOrJamo(),
151  * is this a Hangul syllable or a Jamo?
152  */
153 /*static inline UBool
154 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
155     return norm32<_NORM_MIN_JAMO_V;
156 }*/
157 
158 /*
159  * Given norm32 for Jamo V or T,
160  * is this a Jamo V?
161  */
162 static inline UBool
isJamoVTNorm32JamoV(uint32_t norm32)163 isJamoVTNorm32JamoV(uint32_t norm32) {
164     return norm32<_NORM_JAMO_V_TOP;
165 }
166 
167 /* load unorm.dat ----------------------------------------------------------- */
168 
169 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
170 static int32_t U_CALLCONV
getFoldingNormOffset(uint32_t norm32)171 getFoldingNormOffset(uint32_t norm32) {
172     if(isNorm32LeadSurrogate(norm32)) {
173         return
174             UTRIE_BMP_INDEX_LENGTH+
175                 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
176                  (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
177     } else {
178         return 0;
179     }
180 }
181 
182 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
183 static int32_t U_CALLCONV
getFoldingAuxOffset(uint32_t data)184 getFoldingAuxOffset(uint32_t data) {
185     return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
186 }
187 U_CDECL_END
188 
189 #define UNORM_HARDCODE_DATA 1
190 
191 #if UNORM_HARDCODE_DATA
192 
193 /* unorm_props_data.c is machine-generated by gennorm --csource */
194 #include "unorm_props_data.c"
195 
196 static const UBool formatVersion_2_2=TRUE;
197 
198 #else
199 
200 #define DATA_NAME "unorm"
201 #define DATA_TYPE "icu"
202 
203 static UDataMemory *normData=NULL;
204 static UErrorCode dataErrorCode=U_ZERO_ERROR;
205 static int8_t haveNormData=0;
206 
207 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
208 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
209 
210 /*
211  * pointers into the memory-mapped unorm.icu
212  */
213 static const uint16_t *extraData=NULL,
214                       *combiningTable=NULL,
215                       *canonStartSets=NULL;
216 
217 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
218 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
219 
220 /* the Unicode version of the normalization data */
221 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
222 
223 #endif
224 
225 /* cache UnicodeSets for each combination of exclusion flags */
226 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
227 
228 U_CDECL_BEGIN
229 
230 static UBool U_CALLCONV
unorm_cleanup(void)231 unorm_cleanup(void) {
232     int32_t i;
233 
234 #if !UNORM_HARDCODE_DATA
235     if(normData!=NULL) {
236         udata_close(normData);
237         normData=NULL;
238     }
239     dataErrorCode=U_ZERO_ERROR;
240     haveNormData=0;
241 #endif
242 
243     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
244         if (nxCache[i]) {
245             delete nxCache[i];
246             nxCache[i] = 0;
247         }
248     }
249 
250     return TRUE;
251 }
252 
253 #if !UNORM_HARDCODE_DATA
254 
255 static UBool U_CALLCONV
isAcceptable(void *,const char *,const char *,const UDataInfo * pInfo)256 isAcceptable(void * /* context */,
257              const char * /* type */, const char * /* name */,
258              const UDataInfo *pInfo) {
259     if(
260         pInfo->size>=20 &&
261         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
262         pInfo->charsetFamily==U_CHARSET_FAMILY &&
263         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
264         pInfo->dataFormat[1]==0x6f &&
265         pInfo->dataFormat[2]==0x72 &&
266         pInfo->dataFormat[3]==0x6d &&
267         pInfo->formatVersion[0]==2 &&
268         pInfo->formatVersion[2]==UTRIE_SHIFT &&
269         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
270     ) {
271         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
272         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
273         return TRUE;
274     } else {
275         return FALSE;
276     }
277 }
278 
279 #endif
280 
281 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)282 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
283     /* add the start code point to the USet */
284     const USetAdder *sa=(const USetAdder *)context;
285     sa->add(sa->set, start);
286     return TRUE;
287 }
288 
289 U_CDECL_END
290 
291 #if !UNORM_HARDCODE_DATA
292 
293 static int8_t
loadNormData(UErrorCode & errorCode)294 loadNormData(UErrorCode &errorCode) {
295     /* load Unicode normalization data from file */
296 
297     /*
298      * This lazy intialization with double-checked locking (without mutex protection for
299      * haveNormData==0) is transiently unsafe under certain circumstances.
300      * Check the readme and use u_init() if necessary.
301      *
302      * While u_init() initializes the main normalization data via this functions,
303      * it does not do so for exclusion sets (which are fully mutexed).
304      * This is because
305      * - there can be many exclusion sets
306      * - they are rarely used
307      * - they are not usually used in execution paths that are
308      *   as performance-sensitive as others
309      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
310      */
311     if(haveNormData==0) {
312         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
313         UDataMemory *data;
314 
315         const int32_t *p=NULL;
316         const uint8_t *pb;
317 
318         if(&errorCode==NULL || U_FAILURE(errorCode)) {
319             return 0;
320         }
321 
322         /* open the data outside the mutex block */
323         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
324         dataErrorCode=errorCode;
325         if(U_FAILURE(errorCode)) {
326             return haveNormData=-1;
327         }
328 
329         p=(const int32_t *)udata_getMemory(data);
330         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
331         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
332         _normTrie.getFoldingOffset=getFoldingNormOffset;
333 
334         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
335         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
336             utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
337         }
338         pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
339 
340         if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
341             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
342             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
343         }
344 
345         if(U_FAILURE(errorCode)) {
346             dataErrorCode=errorCode;
347             udata_close(data);
348             return haveNormData=-1;
349         }
350 
351         /* in the mutex block, set the data for this process */
352         umtx_lock(NULL);
353         if(normData==NULL) {
354             normData=data;
355             data=NULL;
356 
357             uprv_memcpy(&indexes, p, sizeof(indexes));
358             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
359             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
360             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
361         } else {
362             p=(const int32_t *)udata_getMemory(normData);
363         }
364 
365         /* initialize some variables */
366         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
367         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
368         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
369         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
370         if(formatVersion_2_1) {
371             canonStartSets=combiningTable+
372                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
373                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
374         }
375         haveNormData=1;
376         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
377         umtx_unlock(NULL);
378 
379         /* if a different thread set it first, then close the extra data */
380         if(data!=NULL) {
381             udata_close(data); /* NULL if it was set correctly */
382         }
383     }
384 
385     return haveNormData;
386 }
387 
388 #endif
389 
390 static inline UBool
_haveData(UErrorCode & errorCode)391 _haveData(UErrorCode &errorCode) {
392 #if UNORM_HARDCODE_DATA
393     return U_SUCCESS(errorCode);
394 #else
395     if(U_FAILURE(errorCode)) {
396         return FALSE;
397     } else if(haveNormData>0) {
398         return TRUE;
399     } else if(haveNormData<0) {
400         errorCode=dataErrorCode;
401         return FALSE;
402     } else /* haveNormData==0 */ {
403         return (UBool)(loadNormData(errorCode)>0);
404     }
405 #endif
406 }
407 
408 U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode * pErrorCode)409 unorm_haveData(UErrorCode *pErrorCode) {
410     return _haveData(*pErrorCode);
411 }
412 
413 U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrie(UErrorCode * pErrorCode)414 unorm_getFCDTrie(UErrorCode *pErrorCode) {
415     if(_haveData(*pErrorCode)) {
416         return fcdTrie.index;
417     } else {
418         return NULL;
419     }
420 }
421 
422 /* data access primitives --------------------------------------------------- */
423 
424 static inline uint32_t
_getNorm32(UChar c)425 _getNorm32(UChar c) {
426     return UTRIE_GET32_FROM_LEAD(&normTrie, c);
427 }
428 
429 static inline uint32_t
_getNorm32FromSurrogatePair(uint32_t norm32,UChar c2)430 _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
431     /*
432      * the surrogate index in norm32 stores only the number of the surrogate index block
433      * see gennorm/store.c/getFoldedNormValue()
434      */
435     norm32=
436         UTRIE_BMP_INDEX_LENGTH+
437             ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
438              (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
439     return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
440 }
441 
442 /*
443  * get a norm32 from text with complete code points
444  * (like from decompositions)
445  */
446 static inline uint32_t
_getNorm32(const UChar * p,uint32_t mask)447 _getNorm32(const UChar *p, uint32_t mask) {
448     uint32_t norm32=_getNorm32(*p);
449     if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
450         /* *p is a lead surrogate, get the real norm32 */
451         norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
452     }
453     return norm32;
454 }
455 
456 static inline uint16_t
_getFCD16(UChar c)457 _getFCD16(UChar c) {
458     return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
459 }
460 
461 static inline uint16_t
_getFCD16FromSurrogatePair(uint16_t fcd16,UChar c2)462 _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
463     /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
464     return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
465 }
466 
467 static inline const uint16_t *
_getExtraData(uint32_t norm32)468 _getExtraData(uint32_t norm32) {
469     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
470 }
471 
472 #if 0
473 /*
474  * It is possible to get the FCD data from the main trie if unorm.icu
475  * was built without the FCD trie, although it is slower.
476  * This is not implemented because it is hard to test, and because it seems
477  * unusual to want to use FCD and not build the data file for it.
478  *
479  * Untested sample code:
480  */
481 static inline uint16_t
482 _getFCD16FromNormData(UChar32 c) {
483     uint32_t norm32, fcd;
484 
485     norm32=_getNorm32(c);
486     if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
487         /* get the lead/trail cc from the decomposition data */
488         const uint16_t *nfd=_getExtraData(norm32);
489         if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
490             fcd=nfd[1];
491         }
492     } else {
493         fcd=norm32&_NORM_CC_MASK;
494         if(fcd!=0) {
495             /* use the code point cc value for both lead and trail cc's */
496             fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
497         }
498     }
499 
500     return (uint16_t)fcd;
501 }
502 #endif
503 
504 /* normalization exclusion sets --------------------------------------------- */
505 
506 /*
507  * Normalization exclusion UnicodeSets are used for tailored normalization;
508  * see the comment near the beginning of this file.
509  *
510  * By specifying one or several sets of code points,
511  * those code points become inert for normalization.
512  */
513 
514 static const UnicodeSet *
internalGetNXHangul(UErrorCode & errorCode)515 internalGetNXHangul(UErrorCode &errorCode) {
516     /* internal function, does not check for incoming U_FAILURE */
517     UBool isCached;
518 
519     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
520 
521     if(!isCached) {
522         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
523         if(set==NULL) {
524             errorCode=U_MEMORY_ALLOCATION_ERROR;
525             return NULL;
526         }
527         // Compact the set for caching.
528         set->compact();
529 
530         umtx_lock(NULL);
531         if(nxCache[UNORM_NX_HANGUL]==NULL) {
532             nxCache[UNORM_NX_HANGUL]=set;
533             set=NULL;
534             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
535         }
536         umtx_unlock(NULL);
537 
538         delete set;
539     }
540 
541     return nxCache[UNORM_NX_HANGUL];
542 }
543 
544 /* unorm.cpp 1.116 had and used
545 static const UnicodeSet *
546 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
547     ...
548 }
549 */
550 
551 /* get and set an exclusion set from a serialized UnicodeSet */
552 static const UnicodeSet *
internalGetSerializedNX(int32_t options,int32_t nxIndex,UErrorCode & errorCode)553 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
554     /* internal function, does not check for incoming U_FAILURE */
555     UBool isCached;
556 
557     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
558 
559     if( !isCached &&
560         canonStartSets!=NULL &&
561         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
562     ) {
563         USerializedSet sset;
564         UnicodeSet *set;
565         UChar32 start, end;
566         int32_t i;
567 
568         if( !uset_getSerializedSet(
569                     &sset,
570                     canonStartSets+canonStartSets[nxIndex],
571                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
572         ) {
573             errorCode=U_INVALID_FORMAT_ERROR;
574             return NULL;
575         }
576 
577         /* turn the serialized set into a UnicodeSet */
578         set=new UnicodeSet();
579         if(set==NULL) {
580             errorCode=U_MEMORY_ALLOCATION_ERROR;
581             return NULL;
582         }
583         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
584             set->add(start, end);
585         }
586         // Compact the set for caching.
587         set->compact();
588 
589         umtx_lock(NULL);
590         if(nxCache[options]==NULL) {
591             nxCache[options]=set;
592             set=NULL;
593             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
594         }
595         umtx_unlock(NULL);
596 
597         delete set;
598     }
599 
600     return nxCache[options];
601 }
602 
603 static const UnicodeSet *
internalGetNXCJKCompat(UErrorCode & errorCode)604 internalGetNXCJKCompat(UErrorCode &errorCode) {
605     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
606     return internalGetSerializedNX(
607                 UNORM_NX_CJK_COMPAT,
608                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
609                 errorCode);
610 }
611 
612 static const UnicodeSet *
internalGetNXUnicode(uint32_t options,UErrorCode & errorCode)613 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
614     /* internal function, does not check for incoming U_FAILURE */
615     int32_t nxIndex;
616 
617     options&=_NORM_OPTIONS_UNICODE_MASK;
618     switch(options) {
619     case 0:
620         return NULL;
621     case UNORM_UNICODE_3_2:
622         /* [:^Age=3.2:] */
623         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
624         break;
625     default:
626         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
627         return NULL;
628     }
629 
630     /* build a set with all code points that were not designated by the specified Unicode version */
631     return internalGetSerializedNX(options, nxIndex, errorCode);
632 }
633 
634 /* Get a decomposition exclusion set. The data must be loaded. */
635 static const UnicodeSet *
internalGetNX(int32_t options,UErrorCode & errorCode)636 internalGetNX(int32_t options, UErrorCode &errorCode) {
637     options&=_NORM_OPTIONS_SETS_MASK;
638 
639     UBool isCached;
640 
641     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
642 
643     if(!isCached) {
644         /* return basic sets */
645         if(options==UNORM_NX_HANGUL) {
646             return internalGetNXHangul(errorCode);
647         }
648         if(options==UNORM_NX_CJK_COMPAT) {
649             return internalGetNXCJKCompat(errorCode);
650         }
651         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
652             return internalGetNXUnicode(options, errorCode);
653         }
654 
655         /* build a set from multiple subsets */
656         UnicodeSet *set;
657         const UnicodeSet *other;
658 
659         set=new UnicodeSet();
660         if(set==NULL) {
661             errorCode=U_MEMORY_ALLOCATION_ERROR;
662             return NULL;
663         }
664 
665         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
666             set->addAll(*other);
667         }
668         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
669             set->addAll(*other);
670         }
671         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
672             set->addAll(*other);
673         }
674 
675         if(U_FAILURE(errorCode)) {
676             delete set;
677             return NULL;
678         }
679         // Compact the set for caching.
680         set->compact();
681 
682         umtx_lock(NULL);
683         if(nxCache[options]==NULL) {
684             nxCache[options]=set;
685             set=NULL;
686             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
687         }
688         umtx_unlock(NULL);
689 
690         delete set;
691     }
692 
693     return nxCache[options];
694 }
695 
696 static inline const UnicodeSet *
getNX(int32_t options,UErrorCode & errorCode)697 getNX(int32_t options, UErrorCode &errorCode) {
698     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
699         /* incoming failure, or no decomposition exclusions requested */
700         return NULL;
701     } else {
702         return internalGetNX(options, errorCode);
703     }
704 }
705 
706 U_CFUNC const UnicodeSet *
unorm_getNX(int32_t options,UErrorCode * pErrorCode)707 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
708     return getNX(options, *pErrorCode);
709 }
710 
711 static inline UBool
nx_contains(const UnicodeSet * nx,UChar32 c)712 nx_contains(const UnicodeSet *nx, UChar32 c) {
713     return nx!=NULL && nx->contains(c);
714 }
715 
716 static inline UBool
nx_contains(const UnicodeSet * nx,UChar c,UChar c2)717 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
718     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
719 }
720 
721 /* other normalization primitives ------------------------------------------- */
722 
723 /* get the canonical or compatibility decomposition for one character */
724 static inline const UChar *
_decompose(uint32_t norm32,uint32_t qcMask,int32_t & length,uint8_t & cc,uint8_t & trailCC)725 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
726            uint8_t &cc, uint8_t &trailCC) {
727     const UChar *p=(const UChar *)_getExtraData(norm32);
728     length=*p++;
729 
730     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
731         /* use compatibility decomposition, skip canonical data */
732         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
733         length>>=8;
734     }
735 
736     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
737         /* get the lead and trail cc's */
738         UChar bothCCs=*p++;
739         cc=(uint8_t)(bothCCs>>8);
740         trailCC=(uint8_t)bothCCs;
741     } else {
742         /* lead and trail cc's are both 0 */
743         cc=trailCC=0;
744     }
745 
746     length&=_NORM_DECOMP_LENGTH_MASK;
747     return p;
748 }
749 
750 /* get the canonical decomposition for one character */
751 static inline const UChar *
_decompose(uint32_t norm32,int32_t & length,uint8_t & cc,uint8_t & trailCC)752 _decompose(uint32_t norm32, int32_t &length,
753            uint8_t &cc, uint8_t &trailCC) {
754     const UChar *p=(const UChar *)_getExtraData(norm32);
755     length=*p++;
756 
757     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
758         /* get the lead and trail cc's */
759         UChar bothCCs=*p++;
760         cc=(uint8_t)(bothCCs>>8);
761         trailCC=(uint8_t)bothCCs;
762     } else {
763         /* lead and trail cc's are both 0 */
764         cc=trailCC=0;
765     }
766 
767     length&=_NORM_DECOMP_LENGTH_MASK;
768     return p;
769 }
770 
771 /**
772  * Get the canonical decomposition for one code point.
773  * @param c code point
774  * @param buffer out-only buffer for algorithmic decompositions of Hangul
775  * @param length out-only, takes the length of the decomposition, if any
776  * @return pointer to decomposition, or 0 if none
777  * @internal
778  */
779 U_CFUNC const UChar *
unorm_getCanonicalDecomposition(UChar32 c,UChar buffer[4],int32_t * pLength)780 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
781     uint32_t norm32;
782 
783     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
784         /* trivial case */
785         return NULL;
786     }
787 
788     UTRIE_GET32(&normTrie, c, norm32);
789     if(norm32&_NORM_QC_NFD) {
790         if(isNorm32HangulOrJamo(norm32)) {
791             /* Hangul syllable: decompose algorithmically */
792             UChar c2;
793 
794             c-=HANGUL_BASE;
795 
796             c2=(UChar)(c%JAMO_T_COUNT);
797             c/=JAMO_T_COUNT;
798             if(c2>0) {
799                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
800                 *pLength=3;
801             } else {
802                 *pLength=2;
803             }
804 
805             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
806             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
807             return buffer;
808         } else {
809             /* normal decomposition */
810             uint8_t cc, trailCC;
811             return _decompose(norm32, *pLength, cc, trailCC);
812         }
813     } else {
814         return 0;
815     }
816 }
817 
818 /*
819  * get the combining class of (c, c2)=*p++
820  * before: p<limit  after: p<=limit
821  * if only one code unit is used, then c2==0
822  */
823 static inline uint8_t
_getNextCC(const UChar * & p,const UChar * limit,UChar & c,UChar & c2)824 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
825     uint32_t norm32;
826 
827     c=*p++;
828     norm32=_getNorm32(c);
829     if((norm32&_NORM_CC_MASK)==0) {
830         c2=0;
831         return 0;
832     } else {
833         if(!isNorm32LeadSurrogate(norm32)) {
834             c2=0;
835         } else {
836             /* c is a lead surrogate, get the real norm32 */
837             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
838                 ++p;
839                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
840             } else {
841                 c2=0;
842                 return 0;
843             }
844         }
845 
846         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
847     }
848 }
849 
850 /*
851  * read backwards and get norm32
852  * return 0 if the character is <minC
853  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
854  */
855 static inline uint32_t
_getPrevNorm32(const UChar * start,const UChar * & src,uint32_t minC,uint32_t mask,UChar & c,UChar & c2)856 _getPrevNorm32(const UChar *start, const UChar *&src,
857                uint32_t minC, uint32_t mask,
858                UChar &c, UChar &c2) {
859     uint32_t norm32;
860 
861     c=*--src;
862     c2=0;
863 
864     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
865     if(c<minC) {
866         return 0;
867     } else if(!UTF_IS_SURROGATE(c)) {
868         return _getNorm32(c);
869     } else if(UTF_IS_SURROGATE_FIRST(c)) {
870         /* unpaired first surrogate */
871         return 0;
872     } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
873         --src;
874         norm32=_getNorm32(c2);
875 
876         if((norm32&mask)==0) {
877             /* all surrogate pairs with this lead surrogate have only irrelevant data */
878             return 0;
879         } else {
880             /* norm32 must be a surrogate special */
881             return _getNorm32FromSurrogatePair(norm32, c);
882         }
883     } else {
884         /* unpaired second surrogate */
885         c2=0;
886         return 0;
887     }
888 }
889 
890 /*
891  * get the combining class of (c, c2)=*--p
892  * before: start<p  after: start<=p
893  */
894 static inline uint8_t
_getPrevCC(const UChar * start,const UChar * & p)895 _getPrevCC(const UChar *start, const UChar *&p) {
896     UChar c, c2;
897 
898     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
899 }
900 
901 /*
902  * is this a safe boundary character for NF*D?
903  * (lead cc==0)
904  */
905 static inline UBool
_isNFDSafe(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)906 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
907     if((norm32&ccOrQCMask)==0) {
908         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
909     }
910 
911     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
912     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
913         int32_t length;
914         uint8_t cc, trailCC;
915 
916         /* decomposes, get everything from the variable-length extra data */
917         _decompose(norm32, decompQCMask, length, cc, trailCC);
918         return cc==0;
919     } else {
920         /* no decomposition (or Hangul), test the cc directly */
921         return (norm32&_NORM_CC_MASK)==0;
922     }
923 }
924 
925 /*
926  * is this (or does its decomposition begin with) a "true starter"?
927  * (cc==0 and NF*C_YES)
928  */
929 static inline UBool
_isTrueStarter(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)930 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
931     if((norm32&ccOrQCMask)==0) {
932         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
933     }
934 
935     /* inspect its decomposition - not a Hangul or a surrogate here */
936     if((norm32&decompQCMask)!=0) {
937         const UChar *p;
938         int32_t length;
939         uint8_t cc, trailCC;
940 
941         /* decomposes, get everything from the variable-length extra data */
942         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
943         if(cc==0) {
944             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
945 
946             /* does it begin with NFC_YES? */
947             if((_getNorm32(p, qcMask)&qcMask)==0) {
948                 /* yes, the decomposition begins with a true starter */
949                 return TRUE;
950             }
951         }
952     }
953     return FALSE;
954 }
955 
956 /* uchar.h */
957 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)958 u_getCombiningClass(UChar32 c) {
959 #if !UNORM_HARDCODE_DATA
960     UErrorCode errorCode=U_ZERO_ERROR;
961     if(_haveData(errorCode)) {
962 #endif
963         uint32_t norm32;
964 
965         UTRIE_GET32(&normTrie, c, norm32);
966         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
967 #if !UNORM_HARDCODE_DATA
968     } else {
969         return 0;
970     }
971 #endif
972 }
973 
974 U_CFUNC UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c)975 unorm_internalIsFullCompositionExclusion(UChar32 c) {
976 #if UNORM_HARDCODE_DATA
977     if(auxTrie.index!=NULL) {
978 #else
979     UErrorCode errorCode=U_ZERO_ERROR;
980     if(_haveData(errorCode) && auxTrie.index!=NULL) {
981 #endif
982         uint16_t aux;
983 
984         UTRIE_GET16(&auxTrie, c, aux);
985         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
986     } else {
987         return FALSE;
988     }
989 }
990 
991 U_CFUNC UBool U_EXPORT2
992 unorm_isCanonSafeStart(UChar32 c) {
993 #if UNORM_HARDCODE_DATA
994     if(auxTrie.index!=NULL) {
995 #else
996     UErrorCode errorCode=U_ZERO_ERROR;
997     if(_haveData(errorCode) && auxTrie.index!=NULL) {
998 #endif
999         uint16_t aux;
1000 
1001         UTRIE_GET16(&auxTrie, c, aux);
1002         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
1003     } else {
1004         return FALSE;
1005     }
1006 }
1007 
1008 U_CAPI void U_EXPORT2
1009 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
1010     if(unorm_haveData(pErrorCode)){
1011         uprv_memcpy(*versionInfo, dataVersion, 4);
1012     }
1013 }
1014 
1015 
1016 U_CAPI UBool U_EXPORT2
1017 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
1018 #if !UNORM_HARDCODE_DATA
1019     UErrorCode errorCode=U_ZERO_ERROR;
1020 #endif
1021     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
1022 #if !UNORM_HARDCODE_DATA
1023         _haveData(errorCode) &&
1024 #endif
1025         canonStartSets!=NULL
1026     ) {
1027         const uint16_t *table;
1028         int32_t i, start, limit;
1029 
1030         /*
1031          * binary search for c
1032          *
1033          * There are two search tables,
1034          * one for BMP code points and one for supplementary ones.
1035          * See unormimp.h for details.
1036          */
1037         if(c<=0xffff) {
1038             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
1039             start=0;
1040             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1041 
1042             /* each entry is a pair { c, result } */
1043             while(start<limit-2) {
1044                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1045                 if(c<table[i]) {
1046                     limit=i;
1047                 } else {
1048                     start=i;
1049                 }
1050             }
1051 
1052             /* found? */
1053             if(c==table[start]) {
1054                 i=table[start+1];
1055                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1056                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1057                     i&=(_NORM_MAX_CANON_SETS-1);
1058                     return uset_getSerializedSet(fillSet,
1059                                             canonStartSets+i,
1060                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1061                 } else {
1062                     /* other result values are BMP code points for single-code point sets */
1063                     uset_setSerializedToOne(fillSet, (UChar32)i);
1064                     return TRUE;
1065                 }
1066             }
1067         } else {
1068             uint16_t high, low, h;
1069 
1070             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1071                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1072             start=0;
1073             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1074 
1075             high=(uint16_t)(c>>16);
1076             low=(uint16_t)c;
1077 
1078             /* each entry is a triplet { high(c), low(c), result } */
1079             while(start<limit-3) {
1080                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1081                 h=table[i]&0x1f; /* high word */
1082                 if(high<h || (high==h && low<table[i+1])) {
1083                     limit=i;
1084                 } else {
1085                     start=i;
1086                 }
1087             }
1088 
1089             /* found? */
1090             h=table[start];
1091             if(high==(h&0x1f) && low==table[start+1]) {
1092                 i=table[start+2];
1093                 if((h&0x8000)==0) {
1094                     /* the result is an index to a USerializedSet */
1095                     return uset_getSerializedSet(fillSet,
1096                                             canonStartSets+i,
1097                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1098                 } else {
1099                     /*
1100                      * single-code point set {x} in
1101                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
1102                      */
1103                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1104                     uset_setSerializedToOne(fillSet, (UChar32)i);
1105                     return TRUE;
1106                 }
1107             }
1108         }
1109     }
1110 
1111     return FALSE; /* not found */
1112 }
1113 
1114 U_CAPI int32_t U_EXPORT2
1115 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1116     uint16_t aux;
1117 
1118     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1119         return 0;
1120     }
1121     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1122         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1123         return 0;
1124     }
1125     if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
1126         return 0;
1127     }
1128 
1129     UTRIE_GET16(&auxTrie, c, aux);
1130     aux&=_NORM_AUX_FNC_MASK;
1131     if(aux!=0) {
1132         const UChar *s;
1133         int32_t length;
1134 
1135         s=(const UChar *)(extraData+aux);
1136         if(*s<0xff00) {
1137             /* s points to the single-unit string */
1138             length=1;
1139         } else {
1140             length=*s&0xff;
1141             ++s;
1142         }
1143         if(0<length && length<=destCapacity) {
1144             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1145         }
1146         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1147     } else {
1148         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1149     }
1150 }
1151 
1152 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1153 U_CAPI UBool U_EXPORT2
1154 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1155     uint32_t norm32, mask;
1156     uint16_t aux, fcd;
1157 
1158 #if !UNORM_HARDCODE_DATA
1159     UErrorCode errorCode=U_ZERO_ERROR;
1160     if(!_haveData(errorCode)) {
1161         return FALSE;
1162     }
1163 #endif
1164 
1165     /* handle trivial cases; set the comparison mask for the normal ones */
1166     switch(mode) {
1167     case UNORM_NONE:
1168         return TRUE;
1169     case UNORM_NFD:
1170         mask=_NORM_CC_MASK|_NORM_QC_NFD;
1171         break;
1172     case UNORM_NFKD:
1173         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1174         break;
1175     case UNORM_NFC:
1176     /* case UNORM_FCC: */
1177         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1178         break;
1179     case UNORM_NFKC:
1180         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1181         break;
1182     case UNORM_FCD:
1183         /* FCD: skippable if lead cc==0 and trail cc<=1 */
1184         if(fcdTrie.index!=NULL) {
1185             UTRIE_GET16(&fcdTrie, c, fcd);
1186             return fcd<=1;
1187         } else {
1188             return FALSE;
1189         }
1190     default:
1191         return FALSE;
1192     }
1193 
1194     /* check conditions (a)..(e), see unormimp.h */
1195     UTRIE_GET32(&normTrie, c, norm32);
1196     if((norm32&mask)!=0) {
1197         return FALSE; /* fails (a)..(e), not skippable */
1198     }
1199 
1200     if(mode<UNORM_NFC) {
1201         return TRUE; /* NF*D, passed (a)..(c), is skippable */
1202     }
1203 
1204     /* NF*C/FCC, passed (a)..(e) */
1205     if((norm32&_NORM_QC_NFD)==0) {
1206         return TRUE; /* no canonical decomposition, is skippable */
1207     }
1208 
1209     /* check Hangul syllables algorithmically */
1210     if(isNorm32HangulOrJamo(norm32)) {
1211         /* Jamo passed (a)..(e) above, must be Hangul */
1212         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1213     }
1214 
1215     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1216     /* NF*C, test (f) flag */
1217     if(!formatVersion_2_2 || auxTrie.index==NULL) {
1218         return FALSE; /* no (f) data, say not skippable to be safe */
1219     }
1220 
1221     UTRIE_GET16(&auxTrie, c, aux);
1222     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1223 
1224     /* } else { FCC, test fcd<=1 instead of the above } */
1225 }
1226 
1227 U_CAPI void U_EXPORT2
1228 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
1229     UChar c;
1230 
1231     if(!_haveData(*pErrorCode)) {
1232         return;
1233     }
1234 
1235     /* add the start code point of each same-value range of each trie */
1236     utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1237     if(fcdTrie.index!=NULL) {
1238         utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1239     }
1240     if(auxTrie.index!=NULL) {
1241         utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1242     }
1243 
1244     /* add Hangul LV syllables and LV+1 because of skippables */
1245     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1246         sa->add(sa->set, c);
1247         sa->add(sa->set, c+1);
1248     }
1249     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1250 }
1251 
1252 U_CFUNC UNormalizationCheckResult U_EXPORT2
1253 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1254     static const uint32_t qcMask[UNORM_MODE_COUNT]={
1255         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1256     };
1257 
1258     uint32_t norm32;
1259 
1260 #if !UNORM_HARDCODE_DATA
1261     UErrorCode errorCode=U_ZERO_ERROR;
1262     if(!_haveData(errorCode)) {
1263         return UNORM_YES;
1264     }
1265 #endif
1266 
1267     UTRIE_GET32(&normTrie, c, norm32);
1268     norm32&=qcMask[mode];
1269 
1270     if(norm32==0) {
1271         return UNORM_YES;
1272     } else if(norm32&_NORM_QC_ANY_NO) {
1273         return UNORM_NO;
1274     } else /* _NORM_QC_ANY_MAYBE */ {
1275         return UNORM_MAYBE;
1276     }
1277 }
1278 
1279 U_CFUNC uint16_t U_EXPORT2
1280 unorm_getFCD16FromCodePoint(UChar32 c) {
1281     uint16_t fcd;
1282 #if !UNORM_HARDCODE_DATA
1283     UErrorCode errorCode;
1284     errorCode=U_ZERO_ERROR;
1285 #endif
1286 
1287     if(
1288 #if !UNORM_HARDCODE_DATA
1289         !_haveData(errorCode) ||
1290 #endif
1291         fcdTrie.index==NULL
1292     ) {
1293         return 0;
1294     }
1295 
1296     UTRIE_GET16(&fcdTrie, c, fcd);
1297     return fcd;
1298 }
1299 
1300 /* reorder UTF-16 in-place -------------------------------------------------- */
1301 
1302 /*
1303  * simpler, single-character version of _mergeOrdered() -
1304  * bubble-insert one single code point into the preceding string
1305  * which is already canonically ordered
1306  * (c, c2) may or may not yet have been inserted at [current..p[
1307  *
1308  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1309  *
1310  * before: [start..current[ is already ordered, and
1311  *         [current..p[     may or may not hold (c, c2) but
1312  *                          must be exactly the same length as (c, c2)
1313  * after: [start..p[ is ordered
1314  *
1315  * returns the trailing combining class
1316  */
1317 static uint8_t
1318 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1319                UChar c, UChar c2, uint8_t cc) {
1320     const UChar *pBack, *pPreBack;
1321     UChar *r;
1322     uint8_t prevCC, trailCC=cc;
1323 
1324     if(start<current && cc!=0) {
1325         /* search for the insertion point where cc>=prevCC */
1326         pPreBack=pBack=current;
1327         prevCC=_getPrevCC(start, pPreBack);
1328         if(cc<prevCC) {
1329             /* this will be the last code point, so keep its cc */
1330             trailCC=prevCC;
1331             pBack=pPreBack;
1332             while(start<pPreBack) {
1333                 prevCC=_getPrevCC(start, pPreBack);
1334                 if(cc>=prevCC) {
1335                     break;
1336                 }
1337                 pBack=pPreBack;
1338             }
1339 
1340             /*
1341              * this is where we are right now with all these pointers:
1342              * [start..pPreBack[ 0..? code points that we can ignore
1343              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1344              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
1345              * [current..p[         1 code point (c, c2) with cc
1346              */
1347 
1348             /* move the code units in between up */
1349             r=p;
1350             do {
1351                 *--r=*--current;
1352             } while(pBack!=current);
1353         }
1354     }
1355 
1356     /* insert (c, c2) */
1357     *current=c;
1358     if(c2!=0) {
1359         *(current+1)=c2;
1360     }
1361 
1362     /* we know the cc of the last code point */
1363     return trailCC;
1364 }
1365 
1366 /*
1367  * merge two UTF-16 string parts together
1368  * to canonically order (order by combining classes) their concatenation
1369  *
1370  * the two strings may already be adjacent, so that the merging is done in-place
1371  * if the two strings are not adjacent, then the buffer holding the first one
1372  * must be large enough
1373  * the second string may or may not be ordered in itself
1374  *
1375  * before: [start..current[ is already ordered, and
1376  *         [next..limit[    may be ordered in itself, but
1377  *                          is not in relation to [start..current[
1378  * after: [start..current+(limit-next)[ is ordered
1379  *
1380  * the algorithm is a simple bubble-sort that takes the characters from *next++
1381  * and inserts them in correct combining class order into the preceding part
1382  * of the string
1383  *
1384  * since this function is called much less often than the single-code point
1385  * _insertOrdered(), it just uses that for easier maintenance
1386  * (see file version from before 2001aug31 for a more optimized version)
1387  *
1388  * returns the trailing combining class
1389  */
1390 static uint8_t
1391 _mergeOrdered(UChar *start, UChar *current,
1392               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1393     UChar *r;
1394     UChar c, c2;
1395     uint8_t cc, trailCC=0;
1396     UBool adjacent;
1397 
1398     adjacent= current==next;
1399 
1400     if(start!=current || !isOrdered) {
1401         while(next<limit) {
1402             cc=_getNextCC(next, limit, c, c2);
1403             if(cc==0) {
1404                 /* does not bubble back */
1405                 trailCC=0;
1406                 if(adjacent) {
1407                     current=(UChar *)next;
1408                 } else {
1409                     *current++=c;
1410                     if(c2!=0) {
1411                         *current++=c2;
1412                     }
1413                 }
1414                 if(isOrdered) {
1415                     break;
1416                 } else {
1417                     start=current;
1418                 }
1419             } else {
1420                 r=current+(c2==0 ? 1 : 2);
1421                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1422                 current=r;
1423             }
1424         }
1425     }
1426 
1427     if(next==limit) {
1428         /* we know the cc of the last code point */
1429         return trailCC;
1430     } else {
1431         if(!adjacent) {
1432             /* copy the second string part */
1433             do {
1434                 *current++=*next++;
1435             } while(next!=limit);
1436             limit=current;
1437         }
1438         return _getPrevCC(start, limit);
1439     }
1440 }
1441 
1442 /* find the last true starter in [start..src[ and return the pointer to it */
1443 static const UChar *
1444 _findPreviousStarter(const UChar *start, const UChar *src,
1445                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1446     uint32_t norm32;
1447     UChar c, c2;
1448 
1449     while(start<src) {
1450         norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1451         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1452             break;
1453         }
1454     }
1455     return src;
1456 }
1457 
1458 /* find the first true starter in [src..limit[ and return the pointer to it */
1459 static const UChar *
1460 _findNextStarter(const UChar *src, const UChar *limit,
1461                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1462     const UChar *p;
1463     uint32_t norm32, ccOrQCMask;
1464     int32_t length;
1465     UChar c, c2;
1466     uint8_t cc, trailCC;
1467 
1468     ccOrQCMask=_NORM_CC_MASK|qcMask;
1469 
1470     for(;;) {
1471         if(src==limit) {
1472             break; /* end of string */
1473         }
1474         c=*src;
1475         if(c<minNoMaybe) {
1476             break; /* catches NUL terminater, too */
1477         }
1478 
1479         norm32=_getNorm32(c);
1480         if((norm32&ccOrQCMask)==0) {
1481             break; /* true starter */
1482         }
1483 
1484         if(isNorm32LeadSurrogate(norm32)) {
1485             /* c is a lead surrogate, get the real norm32 */
1486             if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1487                 break; /* unmatched first surrogate: counts as a true starter */
1488             }
1489             norm32=_getNorm32FromSurrogatePair(norm32, c2);
1490 
1491             if((norm32&ccOrQCMask)==0) {
1492                 break; /* true starter */
1493             }
1494         } else {
1495             c2=0;
1496         }
1497 
1498         /* (c, c2) is not a true starter but its decomposition may be */
1499         if(norm32&decompQCMask) {
1500             /* (c, c2) decomposes, get everything from the variable-length extra data */
1501             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1502 
1503             /* get the first character's norm32 to check if it is a true starter */
1504             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1505                 break; /* true starter */
1506             }
1507         }
1508 
1509         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1510     }
1511 
1512     return src;
1513 }
1514 
1515 /* make NFD & NFKD ---------------------------------------------------------- */
1516 
1517 U_CAPI int32_t U_EXPORT2
1518 unorm_getDecomposition(UChar32 c, UBool compat,
1519                        UChar *dest, int32_t destCapacity) {
1520 #if !UNORM_HARDCODE_DATA
1521     UErrorCode errorCode=U_ZERO_ERROR;
1522 #endif
1523     if( (uint32_t)c<=0x10ffff &&
1524 #if !UNORM_HARDCODE_DATA
1525         _haveData(errorCode) &&
1526 #endif
1527         ((dest!=NULL && destCapacity>0) || destCapacity==0)
1528     ) {
1529         uint32_t norm32, qcMask;
1530         UChar32 minNoMaybe;
1531         int32_t length;
1532 
1533         /* initialize */
1534         if(!compat) {
1535             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1536             qcMask=_NORM_QC_NFD;
1537         } else {
1538             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1539             qcMask=_NORM_QC_NFKD;
1540         }
1541 
1542         if(c<minNoMaybe) {
1543             /* trivial case */
1544             if(destCapacity>0) {
1545                 dest[0]=(UChar)c;
1546             }
1547             return -1;
1548         }
1549 
1550         /* data lookup */
1551         UTRIE_GET32(&normTrie, c, norm32);
1552         if((norm32&qcMask)==0) {
1553             /* simple case: no decomposition */
1554             if(c<=0xffff) {
1555                 if(destCapacity>0) {
1556                     dest[0]=(UChar)c;
1557                 }
1558                 return -1;
1559             } else {
1560                 if(destCapacity>=2) {
1561                     dest[0]=UTF16_LEAD(c);
1562                     dest[1]=UTF16_TRAIL(c);
1563                 }
1564                 return -2;
1565             }
1566         } else if(isNorm32HangulOrJamo(norm32)) {
1567             /* Hangul syllable: decompose algorithmically */
1568             UChar c2;
1569 
1570             c-=HANGUL_BASE;
1571 
1572             c2=(UChar)(c%JAMO_T_COUNT);
1573             c/=JAMO_T_COUNT;
1574             if(c2>0) {
1575                 if(destCapacity>=3) {
1576                     dest[2]=(UChar)(JAMO_T_BASE+c2);
1577                 }
1578                 length=3;
1579             } else {
1580                 length=2;
1581             }
1582 
1583             if(destCapacity>=2) {
1584                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1585                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1586             }
1587             return length;
1588         } else {
1589             /* c decomposes, get everything from the variable-length extra data */
1590             const UChar *p, *limit;
1591             uint8_t cc, trailCC;
1592 
1593             p=_decompose(norm32, qcMask, length, cc, trailCC);
1594             if(length<=destCapacity) {
1595                 limit=p+length;
1596                 do {
1597                     *dest++=*p++;
1598                 } while(p<limit);
1599             }
1600             return length;
1601         }
1602     } else {
1603         return 0;
1604     }
1605 }
1606 
1607 static int32_t
1608 _decompose(UChar *dest, int32_t destCapacity,
1609            const UChar *src, int32_t srcLength,
1610            UBool compat, const UnicodeSet *nx,
1611            uint8_t &outTrailCC) {
1612     UChar buffer[3];
1613     const UChar *limit, *prevSrc, *p;
1614     uint32_t norm32, ccOrQCMask, qcMask;
1615     int32_t destIndex, reorderStartIndex, length;
1616     UChar c, c2, minNoMaybe;
1617     uint8_t cc, prevCC, trailCC;
1618 
1619     if(!compat) {
1620         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1621         qcMask=_NORM_QC_NFD;
1622     } else {
1623         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1624         qcMask=_NORM_QC_NFKD;
1625     }
1626 
1627     /* initialize */
1628     ccOrQCMask=_NORM_CC_MASK|qcMask;
1629     destIndex=reorderStartIndex=0;
1630     prevCC=0;
1631 
1632     /* avoid compiler warnings */
1633     norm32=0;
1634     c=0;
1635     cc=0;
1636     trailCC=0;
1637 
1638     if(srcLength>=0) {
1639         /* string with length */
1640         limit=src+srcLength;
1641     } else /* srcLength==-1 */ {
1642         /* zero-terminated string */
1643         limit=NULL;
1644     }
1645 
1646     U_ALIGN_CODE(16);
1647 
1648     for(;;) {
1649         /* count code units below the minimum or with irrelevant data for the quick check */
1650         prevSrc=src;
1651         if(limit==NULL) {
1652             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1653                 prevCC=0;
1654                 ++src;
1655             }
1656         } else {
1657             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1658                 prevCC=0;
1659                 ++src;
1660             }
1661         }
1662 
1663         /* copy these code units all at once */
1664         if(src!=prevSrc) {
1665             length=(int32_t)(src-prevSrc);
1666             if((destIndex+length)<=destCapacity) {
1667                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1668             }
1669             destIndex+=length;
1670             reorderStartIndex=destIndex;
1671         }
1672 
1673         /* end of source reached? */
1674         if(limit==NULL ? c==0 : src==limit) {
1675             break;
1676         }
1677 
1678         /* c already contains *src and norm32 is set for it, increment src */
1679         ++src;
1680 
1681         /* check one above-minimum, relevant code unit */
1682         /*
1683          * generally, set p and length to the decomposition string
1684          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1685          * in all cases, set cc to the lead and trailCC to the trail combining class
1686          *
1687          * the following merge-sort of the current character into the preceding,
1688          * canonically ordered result text will use the optimized _insertOrdered()
1689          * if there is only one single code point to process;
1690          * this is indicated with p==NULL, and (c, c2) is the character to insert
1691          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1692          * for a supplementary character)
1693          * otherwise, p[length] is merged in with _mergeOrdered()
1694          */
1695         if(isNorm32HangulOrJamo(norm32)) {
1696             if(nx_contains(nx, c)) {
1697                 c2=0;
1698                 p=NULL;
1699                 length=1;
1700             } else {
1701                 /* Hangul syllable: decompose algorithmically */
1702                 p=buffer;
1703                 cc=trailCC=0;
1704 
1705                 c-=HANGUL_BASE;
1706 
1707                 c2=(UChar)(c%JAMO_T_COUNT);
1708                 c/=JAMO_T_COUNT;
1709                 if(c2>0) {
1710                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
1711                     length=3;
1712                 } else {
1713                     length=2;
1714                 }
1715 
1716                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1717                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1718             }
1719         } else {
1720             if(isNorm32Regular(norm32)) {
1721                 c2=0;
1722                 length=1;
1723             } else {
1724                 /* c is a lead surrogate, get the real norm32 */
1725                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1726                     ++src;
1727                     length=2;
1728                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
1729                 } else {
1730                     c2=0;
1731                     length=1;
1732                     norm32=0;
1733                 }
1734             }
1735 
1736             /* get the decomposition and the lead and trail cc's */
1737             if(nx_contains(nx, c, c2)) {
1738                 /* excluded: norm32==0 */
1739                 cc=trailCC=0;
1740                 p=NULL;
1741             } else if((norm32&qcMask)==0) {
1742                 /* c does not decompose */
1743                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1744                 p=NULL;
1745             } else {
1746                 /* c decomposes, get everything from the variable-length extra data */
1747                 p=_decompose(norm32, qcMask, length, cc, trailCC);
1748                 if(length==1) {
1749                     /* fastpath a single code unit from decomposition */
1750                     c=*p;
1751                     c2=0;
1752                     p=NULL;
1753                 }
1754             }
1755         }
1756 
1757         /* append the decomposition to the destination buffer, assume length>0 */
1758         if((destIndex+length)<=destCapacity) {
1759             UChar *reorderSplit=dest+destIndex;
1760             if(p==NULL) {
1761                 /* fastpath: single code point */
1762                 if(cc!=0 && cc<prevCC) {
1763                     /* (c, c2) is out of order with respect to the preceding text */
1764                     destIndex+=length;
1765                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1766                 } else {
1767                     /* just append (c, c2) */
1768                     dest[destIndex++]=c;
1769                     if(c2!=0) {
1770                         dest[destIndex++]=c2;
1771                     }
1772                 }
1773             } else {
1774                 /* general: multiple code points (ordered by themselves) from decomposition */
1775                 if(cc!=0 && cc<prevCC) {
1776                     /* the decomposition is out of order with respect to the preceding text */
1777                     destIndex+=length;
1778                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1779                 } else {
1780                     /* just append the decomposition */
1781                     do {
1782                         dest[destIndex++]=*p++;
1783                     } while(--length>0);
1784                 }
1785             }
1786         } else {
1787             /* buffer overflow */
1788             /* keep incrementing the destIndex for preflighting */
1789             destIndex+=length;
1790         }
1791 
1792         prevCC=trailCC;
1793         if(prevCC==0) {
1794             reorderStartIndex=destIndex;
1795         }
1796     }
1797 
1798     outTrailCC=prevCC;
1799     return destIndex;
1800 }
1801 
1802 U_CAPI int32_t U_EXPORT2
1803 unorm_decompose(UChar *dest, int32_t destCapacity,
1804                 const UChar *src, int32_t srcLength,
1805                 UBool compat, int32_t options,
1806                 UErrorCode *pErrorCode) {
1807     const UnicodeSet *nx;
1808     int32_t destIndex;
1809     uint8_t trailCC;
1810 
1811     if(!_haveData(*pErrorCode)) {
1812         return 0;
1813     }
1814 
1815     nx=getNX(options, *pErrorCode);
1816     if(U_FAILURE(*pErrorCode)) {
1817         return 0;
1818     }
1819 
1820     destIndex=_decompose(dest, destCapacity,
1821                          src, srcLength,
1822                          compat, nx,
1823                          trailCC);
1824 
1825     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1826 }
1827 
1828 /* make NFC & NFKC ---------------------------------------------------------- */
1829 
1830 /* get the composition properties of the next character */
1831 static inline uint32_t
1832 _getNextCombining(UChar *&p, const UChar *limit,
1833                   UChar &c, UChar &c2,
1834                   uint16_t &combiningIndex, uint8_t &cc,
1835                   const UnicodeSet *nx) {
1836     uint32_t norm32, combineFlags;
1837 
1838     /* get properties */
1839     c=*p++;
1840     norm32=_getNorm32(c);
1841 
1842     /* preset output values for most characters */
1843     c2=0;
1844     combiningIndex=0;
1845     cc=0;
1846 
1847     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1848         return 0;
1849     } else {
1850         if(isNorm32Regular(norm32)) {
1851             /* set cc etc. below */
1852         } else if(isNorm32HangulOrJamo(norm32)) {
1853             /* a compatibility decomposition contained Jamos */
1854             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1855             return norm32&_NORM_COMBINES_ANY;
1856         } else {
1857             /* c is a lead surrogate, get the real norm32 */
1858             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1859                 ++p;
1860                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1861             } else {
1862                 c2=0;
1863                 return 0;
1864             }
1865         }
1866 
1867         if(nx_contains(nx, c, c2)) {
1868             return 0; /* excluded: norm32==0 */
1869         }
1870 
1871         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1872 
1873         combineFlags=norm32&_NORM_COMBINES_ANY;
1874         if(combineFlags!=0) {
1875             combiningIndex=*(_getExtraData(norm32)-1);
1876         }
1877         return combineFlags;
1878     }
1879 }
1880 
1881 /*
1882  * given a composition-result starter (c, c2) - which means its cc==0,
1883  * it combines forward, it has extra data, its norm32!=0,
1884  * it is not a Hangul or Jamo,
1885  * get just its combineFwdIndex
1886  *
1887  * norm32(c) is special if and only if c2!=0
1888  */
1889 static inline uint16_t
1890 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1891     uint32_t norm32;
1892 
1893     norm32=_getNorm32(c);
1894     if(c2!=0) {
1895         norm32=_getNorm32FromSurrogatePair(norm32, c2);
1896     }
1897     return *(_getExtraData(norm32)-1);
1898 }
1899 
1900 /*
1901  * Find the recomposition result for
1902  * a forward-combining character
1903  * (specified with a pointer to its part of the combiningTable[])
1904  * and a backward-combining character
1905  * (specified with its combineBackIndex).
1906  *
1907  * If these two characters combine, then set (value, value2)
1908  * with the code unit(s) of the composition character.
1909  *
1910  * Return value:
1911  * 0    do not combine
1912  * 1    combine
1913  * >1   combine, and the composition is a forward-combining starter
1914  *
1915  * See unormimp.h for a description of the composition table format.
1916  */
1917 static inline uint16_t
1918 _combine(const uint16_t *table, uint16_t combineBackIndex,
1919          uint16_t &value, uint16_t &value2) {
1920     uint16_t key;
1921 
1922     /* search in the starter's composition table */
1923     for(;;) {
1924         key=*table++;
1925         if(key>=combineBackIndex) {
1926             break;
1927         }
1928         table+= *table&0x8000 ? 2 : 1;
1929     }
1930 
1931     /* mask off bit 15, the last-entry-in-the-list flag */
1932     if((key&0x7fff)==combineBackIndex) {
1933         /* found! combine! */
1934         value=*table;
1935 
1936         /* is the composition a starter that combines forward? */
1937         key=(uint16_t)((value&0x2000)+1);
1938 
1939         /* get the composition result code point from the variable-length result value */
1940         if(value&0x8000) {
1941             if(value&0x4000) {
1942                 /* surrogate pair composition result */
1943                 value=(uint16_t)((value&0x3ff)|0xd800);
1944                 value2=*(table+1);
1945             } else {
1946                 /* BMP composition result U+2000..U+ffff */
1947                 value=*(table+1);
1948                 value2=0;
1949             }
1950         } else {
1951             /* BMP composition result U+0000..U+1fff */
1952             value&=0x1fff;
1953             value2=0;
1954         }
1955 
1956         return key;
1957     } else {
1958         /* not found */
1959         return 0;
1960     }
1961 }
1962 
1963 static inline UBool
1964 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1965                UBool compat, UChar *dest, const UnicodeSet *nx) {
1966     if(isJamoVTNorm32JamoV(norm32)) {
1967         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1968         prev=(UChar)(prev-JAMO_L_BASE);
1969         if(prev<JAMO_L_COUNT) {
1970             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1971 
1972             /* check if the next character is a Jamo T (normal or compatibility) */
1973             if(src!=limit) {
1974                 UChar next, t;
1975 
1976                 next=*src;
1977                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1978                     /* normal Jamo T */
1979                     ++src;
1980                     c+=t;
1981                 } else if(compat) {
1982                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
1983                     norm32=_getNorm32(next);
1984                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1985                         const UChar *p;
1986                         int32_t length;
1987                         uint8_t cc, trailCC;
1988 
1989                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1990                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1991                             /* compatibility Jamo T */
1992                             ++src;
1993                             c+=t;
1994                         }
1995                     }
1996                 }
1997             }
1998             if(nx_contains(nx, c)) {
1999                 if(!isHangulWithoutJamoT(c)) {
2000                     --src; /* undo ++src from reading the Jamo T */
2001                 }
2002                 return FALSE;
2003             }
2004             if(dest!=0) {
2005                 *dest=c;
2006             }
2007             return TRUE;
2008         }
2009     } else if(isHangulWithoutJamoT(prev)) {
2010         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2011         c=(UChar)(prev+(c-JAMO_T_BASE));
2012         if(nx_contains(nx, c)) {
2013             return FALSE;
2014         }
2015         if(dest!=0) {
2016             *dest=c;
2017         }
2018         return TRUE;
2019     }
2020     return FALSE;
2021 }
2022 
2023 /*
2024  * recompose the characters in [p..limit[
2025  * (which is in NFD - decomposed and canonically ordered),
2026  * adjust limit, and return the trailing cc
2027  *
2028  * since for NFKC we may get Jamos in decompositions, we need to
2029  * recompose those too
2030  *
2031  * note that recomposition never lengthens the text:
2032  * any character consists of either one or two code units;
2033  * a composition may contain at most one more code unit than the original starter,
2034  * while the combining mark that is removed has at least one code unit
2035  */
2036 static uint8_t
2037 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
2038     UChar *starter, *pRemove, *q, *r;
2039     uint32_t combineFlags;
2040     UChar c, c2;
2041     uint16_t combineFwdIndex, combineBackIndex;
2042     uint16_t result, value, value2;
2043     uint8_t cc, prevCC;
2044     UBool starterIsSupplementary;
2045 
2046     starter=NULL;                   /* no starter */
2047     combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
2048     combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
2049     value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
2050     starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
2051     prevCC=0;
2052 
2053     for(;;) {
2054         combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
2055         if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
2056             if(combineBackIndex&0x8000) {
2057                 /* c is a Jamo V/T, see if we can compose it with the previous character */
2058                 /* for the PRI #29 fix, check that there is no intervening combining mark */
2059                 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
2060                     pRemove=NULL; /* NULL while no Hangul composition */
2061                     combineFlags=0;
2062                     c2=*starter;
2063                     if(combineBackIndex==0xfff2) {
2064                         /* Jamo V, compose with previous Jamo L and following Jamo T */
2065                         c2=(UChar)(c2-JAMO_L_BASE);
2066                         if(c2<JAMO_L_COUNT) {
2067                             pRemove=p-1;
2068                             c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
2069                             if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
2070                                 ++p;
2071                                 c+=c2;
2072                             } else {
2073                                 /* the result is an LV syllable, which is a starter (unlike LVT) */
2074                                 combineFlags=_NORM_COMBINES_FWD;
2075                             }
2076                             if(!nx_contains(nx, c)) {
2077                                 *starter=c;
2078                             } else {
2079                                 /* excluded */
2080                                 if(!isHangulWithoutJamoT(c)) {
2081                                     --p; /* undo the ++p from reading the Jamo T */
2082                                 }
2083                                 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2084                                 pRemove=NULL;
2085                             }
2086                         }
2087 
2088                     /*
2089                      * Normally, the following can not occur:
2090                      * Since the input is in NFD, there are no Hangul LV syllables that
2091                      * a Jamo T could combine with.
2092                      * All Jamo Ts are combined above when handling Jamo Vs.
2093                      *
2094                      * However, before the PRI #29 fix, this can occur due to
2095                      * an intervening combining mark between the Hangul LV and the Jamo T.
2096                      */
2097                     } else {
2098                         /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2099                         if(isHangulWithoutJamoT(c2)) {
2100                             c2+=(UChar)(c-JAMO_T_BASE);
2101                             if(!nx_contains(nx, c2)) {
2102                                 pRemove=p-1;
2103                                 *starter=c2;
2104                             }
2105                         }
2106                     }
2107 
2108                     if(pRemove!=NULL) {
2109                         /* remove the Jamo(s) */
2110                         q=pRemove;
2111                         r=p;
2112                         while(r<limit) {
2113                             *q++=*r++;
2114                         }
2115                         p=pRemove;
2116                         limit=q;
2117                     }
2118 
2119                     c2=0; /* c2 held *starter temporarily */
2120 
2121                     if(combineFlags!=0) {
2122                         /*
2123                          * not starter=NULL because the composition is a Hangul LV syllable
2124                          * and might combine once more (but only before the PRI #29 fix)
2125                          */
2126 
2127                         /* done? */
2128                         if(p==limit) {
2129                             return prevCC;
2130                         }
2131 
2132                         /* the composition is a Hangul LV syllable which is a starter that combines forward */
2133                         combineFwdIndex=0xfff0;
2134 
2135                         /* we combined; continue with looking for compositions */
2136                         continue;
2137                     }
2138                 }
2139 
2140                 /*
2141                  * now: cc==0 and the combining index does not include "forward" ->
2142                  * the rest of the loop body will reset starter to NULL;
2143                  * technically, a composed Hangul syllable is a starter, but it
2144                  * does not combine forward now that we have consumed all eligible Jamos;
2145                  * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2146                  */
2147 
2148             } else if(
2149                 /* the starter is not a Hangul LV or Jamo V/T and */
2150                 !(combineFwdIndex&0x8000) &&
2151                 /* the combining mark is not blocked and */
2152                 ((options&UNORM_BEFORE_PRI_29) ?
2153                     (prevCC!=cc || prevCC==0) :
2154                     (prevCC<cc || prevCC==0)) &&
2155                 /* the starter and the combining mark (c, c2) do combine and */
2156                 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2157                 /* the composition result is not excluded */
2158                 !nx_contains(nx, value, value2)
2159             ) {
2160                 /* replace the starter with the composition, remove the combining mark */
2161                 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
2162 
2163                 /* replace the starter with the composition */
2164                 *starter=(UChar)value;
2165                 if(starterIsSupplementary) {
2166                     if(value2!=0) {
2167                         /* both are supplementary */
2168                         *(starter+1)=(UChar)value2;
2169                     } else {
2170                         /* the composition is shorter than the starter, move the intermediate characters forward one */
2171                         starterIsSupplementary=FALSE;
2172                         q=starter+1;
2173                         r=q+1;
2174                         while(r<pRemove) {
2175                             *q++=*r++;
2176                         }
2177                         --pRemove;
2178                     }
2179                 } else if(value2!=0) {
2180                     /* the composition is longer than the starter, move the intermediate characters back one */
2181                     starterIsSupplementary=TRUE;
2182                     ++starter; /* temporarily increment for the loop boundary */
2183                     q=pRemove;
2184                     r=++pRemove;
2185                     while(starter<q) {
2186                         *--r=*--q;
2187                     }
2188                     *starter=(UChar)value2;
2189                     --starter; /* undo the temporary increment */
2190                 /* } else { both are on the BMP, nothing more to do */
2191                 }
2192 
2193                 /* remove the combining mark by moving the following text over it */
2194                 if(pRemove<p) {
2195                     q=pRemove;
2196                     r=p;
2197                     while(r<limit) {
2198                         *q++=*r++;
2199                     }
2200                     p=pRemove;
2201                     limit=q;
2202                 }
2203 
2204                 /* keep prevCC because we removed the combining mark */
2205 
2206                 /* done? */
2207                 if(p==limit) {
2208                     return prevCC;
2209                 }
2210 
2211                 /* is the composition a starter that combines forward? */
2212                 if(result>1) {
2213                     combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2214                 } else {
2215                     starter=NULL;
2216                 }
2217 
2218                 /* we combined; continue with looking for compositions */
2219                 continue;
2220             }
2221         }
2222 
2223         /* no combination this time */
2224         prevCC=cc;
2225         if(p==limit) {
2226             return prevCC;
2227         }
2228 
2229         /* if (c, c2) did not combine, then check if it is a starter */
2230         if(cc==0) {
2231             /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2232             if(combineFlags&_NORM_COMBINES_FWD) {
2233                 /* it may combine with something, prepare for it */
2234                 if(c2==0) {
2235                     starterIsSupplementary=FALSE;
2236                     starter=p-1;
2237                 } else {
2238                     starterIsSupplementary=TRUE;
2239                     starter=p-2;
2240                 }
2241                 combineFwdIndex=combineBackIndex;
2242             } else {
2243                 /* it will not combine with anything */
2244                 starter=NULL;
2245             }
2246         } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2247             /* FCC: no discontiguous compositions; any intervening character blocks */
2248             starter=NULL;
2249         }
2250     }
2251 }
2252 
2253 /* decompose and recompose [prevStarter..src[ */
2254 static const UChar *
2255 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2256              const UChar *prevStarter, const UChar *src,
2257              uint8_t &prevCC,
2258              int32_t options, const UnicodeSet *nx,
2259              UErrorCode *pErrorCode) {
2260     UChar *recomposeLimit;
2261     uint8_t trailCC;
2262     UBool compat;
2263 
2264     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2265 
2266     /* decompose [prevStarter..src[ */
2267     length=_decompose(buffer, bufferCapacity,
2268                       prevStarter, (int32_t)(src-prevStarter),
2269                       compat, nx,
2270                       trailCC);
2271     if(length>bufferCapacity) {
2272         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2273             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2274             return NULL;
2275         }
2276         length=_decompose(buffer, bufferCapacity,
2277                           prevStarter, (int32_t)(src-prevStarter),
2278                           compat, nx,
2279                           trailCC);
2280     }
2281 
2282     /* recompose the decomposition */
2283     recomposeLimit=buffer+length;
2284     if(length>=2) {
2285         prevCC=_recompose(buffer, recomposeLimit, options, nx);
2286     }
2287 
2288     /* return with a pointer to the recomposition and its length */
2289     length=(int32_t)(recomposeLimit-buffer);
2290     return buffer;
2291 }
2292 
2293 static int32_t
2294 _compose(UChar *dest, int32_t destCapacity,
2295          const UChar *src, int32_t srcLength,
2296          int32_t options, const UnicodeSet *nx,
2297          UErrorCode *pErrorCode) {
2298     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2299     UChar *buffer;
2300     int32_t bufferCapacity;
2301 
2302     const UChar *limit, *prevSrc, *prevStarter;
2303     uint32_t norm32, ccOrQCMask, qcMask;
2304     int32_t destIndex, reorderStartIndex, length;
2305     UChar c, c2, minNoMaybe;
2306     uint8_t cc, prevCC;
2307 
2308     if(options&_NORM_OPTIONS_COMPAT) {
2309         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2310         qcMask=_NORM_QC_NFKC;
2311     } else {
2312         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2313         qcMask=_NORM_QC_NFC;
2314     }
2315 
2316     /* initialize */
2317     buffer=stackBuffer;
2318     bufferCapacity=_STACK_BUFFER_CAPACITY;
2319 
2320     /*
2321      * prevStarter points to the last character before the current one
2322      * that is a "true" starter with cc==0 and quick check "yes".
2323      *
2324      * prevStarter will be used instead of looking for a true starter
2325      * while incrementally decomposing [prevStarter..prevSrc[
2326      * in _composePart(). Having a good prevStarter allows to just decompose
2327      * the entire [prevStarter..prevSrc[.
2328      *
2329      * When _composePart() backs out from prevSrc back to prevStarter,
2330      * then it also backs out destIndex by the same amount.
2331      * Therefore, at all times, the (prevSrc-prevStarter) source units
2332      * must correspond 1:1 to destination units counted with destIndex,
2333      * except for reordering.
2334      * This is true for the qc "yes" characters copied in the fast loop,
2335      * and for pure reordering.
2336      * prevStarter must be set forward to src when this is not true:
2337      * In _composePart() and after composing a Hangul syllable.
2338      *
2339      * This mechanism relies on the assumption that the decomposition of a true starter
2340      * also begins with a true starter. gennorm/store.c checks for this.
2341      */
2342     prevStarter=src;
2343 
2344     ccOrQCMask=_NORM_CC_MASK|qcMask;
2345     destIndex=reorderStartIndex=0;
2346     prevCC=0;
2347 
2348     /* avoid compiler warnings */
2349     norm32=0;
2350     c=0;
2351 
2352     if(srcLength>=0) {
2353         /* string with length */
2354         limit=src+srcLength;
2355     } else /* srcLength==-1 */ {
2356         /* zero-terminated string */
2357         limit=NULL;
2358     }
2359 
2360     U_ALIGN_CODE(16);
2361 
2362     for(;;) {
2363         /* count code units below the minimum or with irrelevant data for the quick check */
2364         prevSrc=src;
2365         if(limit==NULL) {
2366             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2367                 prevCC=0;
2368                 ++src;
2369             }
2370         } else {
2371             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2372                 prevCC=0;
2373                 ++src;
2374             }
2375         }
2376 
2377         /* copy these code units all at once */
2378         if(src!=prevSrc) {
2379             length=(int32_t)(src-prevSrc);
2380             if((destIndex+length)<=destCapacity) {
2381                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2382             }
2383             destIndex+=length;
2384             reorderStartIndex=destIndex;
2385 
2386             /* set prevStarter to the last character in the quick check loop */
2387             prevStarter=src-1;
2388             if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2389                 --prevStarter;
2390             }
2391 
2392             prevSrc=src;
2393         }
2394 
2395         /* end of source reached? */
2396         if(limit==NULL ? c==0 : src==limit) {
2397             break;
2398         }
2399 
2400         /* c already contains *src and norm32 is set for it, increment src */
2401         ++src;
2402 
2403         /*
2404          * source buffer pointers:
2405          *
2406          *  all done      quick check   current char  not yet
2407          *                "yes" but     (c, c2)       processed
2408          *                may combine
2409          *                forward
2410          * [-------------[-------------[-------------[-------------[
2411          * |             |             |             |             |
2412          * start         prevStarter   prevSrc       src           limit
2413          *
2414          *
2415          * destination buffer pointers and indexes:
2416          *
2417          *  all done      might take    not filled yet
2418          *                characters for
2419          *                reordering
2420          * [-------------[-------------[-------------[
2421          * |             |             |             |
2422          * dest      reorderStartIndex destIndex     destCapacity
2423          */
2424 
2425         /* check one above-minimum, relevant code unit */
2426         /*
2427          * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2428          * check for Jamo V/T, then for surrogates and regular characters
2429          * c is not a Hangul syllable or Jamo L because
2430          * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2431          */
2432         if(isNorm32HangulOrJamo(norm32)) {
2433             /*
2434              * c is a Jamo V/T:
2435              * try to compose with the previous character, Jamo V also with a following Jamo T,
2436              * and set values here right now in case we just continue with the main loop
2437              */
2438             prevCC=cc=0;
2439             reorderStartIndex=destIndex;
2440 
2441             if(
2442                 destIndex>0 &&
2443                 _composeHangul(
2444                     *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2445                     destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2446                     nx)
2447             ) {
2448                 prevStarter=src;
2449                 continue;
2450             }
2451 
2452             /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2453             c2=0;
2454             length=1;
2455             prevStarter=prevSrc;
2456         } else {
2457             if(isNorm32Regular(norm32)) {
2458                 c2=0;
2459                 length=1;
2460             } else {
2461                 /* c is a lead surrogate, get the real norm32 */
2462                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2463                     ++src;
2464                     length=2;
2465                     norm32=_getNorm32FromSurrogatePair(norm32, c2);
2466                 } else {
2467                     /* c is an unpaired lead surrogate, nothing to do */
2468                     c2=0;
2469                     length=1;
2470                     norm32=0;
2471                 }
2472             }
2473 
2474             /* we are looking at the character (c, c2) at [prevSrc..src[ */
2475             if(nx_contains(nx, c, c2)) {
2476                 /* excluded: norm32==0 */
2477                 cc=0;
2478             } else if((norm32&qcMask)==0) {
2479                 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2480             } else {
2481                 const UChar *p;
2482                 uint32_t decompQCMask;
2483 
2484                 /*
2485                  * find appropriate boundaries around this character,
2486                  * decompose the source text from between the boundaries,
2487                  * and recompose it
2488                  *
2489                  * this puts the intermediate text into the side buffer because
2490                  * it might be longer than the recomposition end result,
2491                  * or the destination buffer may be too short or missing
2492                  *
2493                  * note that destIndex may be adjusted backwards to account
2494                  * for source text that passed the quick check but needed to
2495                  * take part in the recomposition
2496                  */
2497                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2498 
2499                 /*
2500                  * find the last true starter in [prevStarter..src[
2501                  * it is either the decomposition of the current character (at prevSrc),
2502                  * or prevStarter
2503                  */
2504                 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2505                     prevStarter=prevSrc;
2506                 } else {
2507                     /* adjust destIndex: back out what had been copied with qc "yes" */
2508                     destIndex-=(int32_t)(prevSrc-prevStarter);
2509                 }
2510 
2511                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2512                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2513 
2514                 /* compose [prevStarter..src[ */
2515                 p=_composePart(stackBuffer, buffer, bufferCapacity,
2516                                length,          /* output */
2517                                prevStarter, src,
2518                                prevCC,          /* output */
2519                                options, nx,
2520                                pErrorCode);
2521 
2522                 if(p==NULL) {
2523                     destIndex=0;   /* an error occurred (out of memory) */
2524                     break;
2525                 }
2526 
2527                 /* append the recomposed buffer contents to the destination buffer */
2528                 if((destIndex+length)<=destCapacity) {
2529                     while(length>0) {
2530                         dest[destIndex++]=*p++;
2531                         --length;
2532                     }
2533                 } else {
2534                     /* buffer overflow */
2535                     /* keep incrementing the destIndex for preflighting */
2536                     destIndex+=length;
2537                 }
2538 
2539                 /* set the next starter */
2540                 prevStarter=src;
2541 
2542                 continue;
2543             }
2544         }
2545 
2546         /* append the single code point (c, c2) to the destination buffer */
2547         if((destIndex+length)<=destCapacity) {
2548             if(cc!=0 && cc<prevCC) {
2549                 /* (c, c2) is out of order with respect to the preceding text */
2550                 UChar *reorderSplit=dest+destIndex;
2551                 destIndex+=length;
2552                 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2553             } else {
2554                 /* just append (c, c2) */
2555                 dest[destIndex++]=c;
2556                 if(c2!=0) {
2557                     dest[destIndex++]=c2;
2558                 }
2559                 prevCC=cc;
2560             }
2561         } else {
2562             /* buffer overflow */
2563             /* keep incrementing the destIndex for preflighting */
2564             destIndex+=length;
2565             prevCC=cc;
2566         }
2567     }
2568 
2569     /* cleanup */
2570     if(buffer!=stackBuffer) {
2571         uprv_free(buffer);
2572     }
2573 
2574     return destIndex;
2575 }
2576 
2577 U_CAPI int32_t U_EXPORT2
2578 unorm_compose(UChar *dest, int32_t destCapacity,
2579               const UChar *src, int32_t srcLength,
2580               UBool compat, int32_t options,
2581               UErrorCode *pErrorCode) {
2582     const UnicodeSet *nx;
2583     int32_t destIndex;
2584 
2585     if(!_haveData(*pErrorCode)) {
2586         return 0;
2587     }
2588 
2589     nx=getNX(options, *pErrorCode);
2590     if(U_FAILURE(*pErrorCode)) {
2591         return 0;
2592     }
2593 
2594     /* reset options bits that should only be set here or inside _compose() */
2595     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2596 
2597     if(compat) {
2598         options|=_NORM_OPTIONS_COMPAT;
2599     }
2600 
2601     destIndex=_compose(dest, destCapacity,
2602                        src, srcLength,
2603                        options, nx,
2604                        pErrorCode);
2605 
2606     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2607 }
2608 
2609 /* make FCD ----------------------------------------------------------------- */
2610 
2611 static const UChar *
2612 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2613     UChar c, c2;
2614 
2615     /*
2616      * find the first position in [src..limit[ after some cc==0 according to FCD data
2617      *
2618      * at the beginning of the loop, we have fcd16 from before src
2619      *
2620      * stop at positions:
2621      * - after trail cc==0
2622      * - at the end of the source
2623      * - before lead cc==0
2624      */
2625     for(;;) {
2626         /* stop if trail cc==0 for the previous character */
2627         if((fcd16&0xff)==0) {
2628             break;
2629         }
2630 
2631         /* get c=*src - stop at end of string */
2632         if(src==limit) {
2633             break;
2634         }
2635         c=*src;
2636 
2637         /* stop if lead cc==0 for this character */
2638         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2639             break; /* catches terminating NUL, too */
2640         }
2641 
2642         if(!UTF_IS_FIRST_SURROGATE(c)) {
2643             if(fcd16<=0xff) {
2644                 break;
2645             }
2646             ++src;
2647         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2648             /* c is a lead surrogate, get the real fcd16 */
2649             fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2650             if(fcd16<=0xff) {
2651                 break;
2652             }
2653             src+=2;
2654         } else {
2655             /* c is an unpaired first surrogate, lead cc==0 */
2656             break;
2657         }
2658     }
2659 
2660     return src;
2661 }
2662 
2663 static uint8_t
2664 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2665               UChar *dest, int32_t &destIndex, int32_t destCapacity,
2666               const UnicodeSet *nx) {
2667     const UChar *p;
2668     uint32_t norm32;
2669     int32_t reorderStartIndex, length;
2670     UChar c, c2;
2671     uint8_t cc, prevCC, trailCC;
2672 
2673     /*
2674      * canonically decompose [src..decompLimit[
2675      *
2676      * all characters in this range have some non-zero cc,
2677      * directly or in decomposition,
2678      * so that we do not need to check in the following for quick-check limits etc.
2679      *
2680      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2681      *
2682      * we also do not need to check for c==0 because we have an established decompLimit
2683      */
2684     reorderStartIndex=destIndex;
2685     prevCC=0;
2686 
2687     while(src<decompLimit) {
2688         c=*src++;
2689         norm32=_getNorm32(c);
2690         if(isNorm32Regular(norm32)) {
2691             c2=0;
2692             length=1;
2693         } else {
2694             /*
2695              * reminder: this function is called with [src..decompLimit[
2696              * not containing any Hangul/Jamo characters,
2697              * therefore the only specials are lead surrogates
2698              */
2699             /* c is a lead surrogate, get the real norm32 */
2700             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2701                 ++src;
2702                 length=2;
2703                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2704             } else {
2705                 c2=0;
2706                 length=1;
2707                 norm32=0;
2708             }
2709         }
2710 
2711         /* get the decomposition and the lead and trail cc's */
2712         if(nx_contains(nx, c, c2)) {
2713             /* excluded: norm32==0 */
2714             cc=trailCC=0;
2715             p=NULL;
2716         } else if((norm32&_NORM_QC_NFD)==0) {
2717             /* c does not decompose */
2718             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2719             p=NULL;
2720         } else {
2721             /* c decomposes, get everything from the variable-length extra data */
2722             p=_decompose(norm32, length, cc, trailCC);
2723             if(length==1) {
2724                 /* fastpath a single code unit from decomposition */
2725                 c=*p;
2726                 c2=0;
2727                 p=NULL;
2728             }
2729         }
2730 
2731         /* append the decomposition to the destination buffer, assume length>0 */
2732         if((destIndex+length)<=destCapacity) {
2733             UChar *reorderSplit=dest+destIndex;
2734             if(p==NULL) {
2735                 /* fastpath: single code point */
2736                 if(cc!=0 && cc<prevCC) {
2737                     /* (c, c2) is out of order with respect to the preceding text */
2738                     destIndex+=length;
2739                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2740                 } else {
2741                     /* just append (c, c2) */
2742                     dest[destIndex++]=c;
2743                     if(c2!=0) {
2744                         dest[destIndex++]=c2;
2745                     }
2746                 }
2747             } else {
2748                 /* general: multiple code points (ordered by themselves) from decomposition */
2749                 if(cc!=0 && cc<prevCC) {
2750                     /* the decomposition is out of order with respect to the preceding text */
2751                     destIndex+=length;
2752                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2753                 } else {
2754                     /* just append the decomposition */
2755                     do {
2756                         dest[destIndex++]=*p++;
2757                     } while(--length>0);
2758                 }
2759             }
2760         } else {
2761             /* buffer overflow */
2762             /* keep incrementing the destIndex for preflighting */
2763             destIndex+=length;
2764         }
2765 
2766         prevCC=trailCC;
2767         if(prevCC==0) {
2768             reorderStartIndex=destIndex;
2769         }
2770     }
2771 
2772     return prevCC;
2773 }
2774 
2775 static int32_t
2776 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2777               const UChar *src, int32_t srcLength,
2778               const UnicodeSet *nx,
2779               UErrorCode *pErrorCode) {
2780     const UChar *limit, *prevSrc, *decompStart;
2781     int32_t destIndex, length;
2782     UChar c, c2;
2783     uint16_t fcd16;
2784     int16_t prevCC, cc;
2785 
2786     if(!_haveData(*pErrorCode)) {
2787         return 0;
2788     }
2789 
2790     /* initialize */
2791     decompStart=src;
2792     destIndex=0;
2793     prevCC=0;
2794 
2795     /* avoid compiler warnings */
2796     c=0;
2797     fcd16=0;
2798 
2799     if(srcLength>=0) {
2800         /* string with length */
2801         limit=src+srcLength;
2802     } else /* srcLength==-1 */ {
2803         /* zero-terminated string */
2804         limit=NULL;
2805     }
2806 
2807     U_ALIGN_CODE(16);
2808 
2809     for(;;) {
2810         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2811         prevSrc=src;
2812         if(limit==NULL) {
2813             for(;;) {
2814                 c=*src;
2815                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2816                     if(c==0) {
2817                         break;
2818                     }
2819                     prevCC=(int16_t)-c;
2820                 } else if((fcd16=_getFCD16(c))==0) {
2821                     prevCC=0;
2822                 } else {
2823                     break;
2824                 }
2825                 ++src;
2826             }
2827         } else {
2828             for(;;) {
2829                 if(src==limit) {
2830                     break;
2831                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2832                     prevCC=(int16_t)-c;
2833                 } else if((fcd16=_getFCD16(c))==0) {
2834                     prevCC=0;
2835                 } else {
2836                     break;
2837                 }
2838                 ++src;
2839             }
2840         }
2841 
2842         /*
2843          * prevCC has values from the following ranges:
2844          * 0..0xff - the previous trail combining class
2845          * <0      - the negative value of the previous code unit;
2846          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2847          *           was deferred so that average text is checked faster
2848          */
2849 
2850         /* copy these code units all at once */
2851         if(src!=prevSrc) {
2852             length=(int32_t)(src-prevSrc);
2853             if((destIndex+length)<=destCapacity) {
2854                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2855             }
2856             destIndex+=length;
2857             prevSrc=src;
2858 
2859             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2860             if(prevCC<0) {
2861                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2862                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2863                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2864                 } else {
2865                     prevCC=0; /* excluded: fcd16==0 */
2866                 }
2867 
2868                 /*
2869                  * set a pointer to this below-U+0300 character;
2870                  * if prevCC==0 then it will moved to after this character below
2871                  */
2872                 decompStart=prevSrc-1;
2873             }
2874         }
2875         /*
2876          * now:
2877          * prevSrc==src - used later to adjust destIndex before decomposition
2878          * prevCC>=0
2879          */
2880 
2881         /* end of source reached? */
2882         if(limit==NULL ? c==0 : src==limit) {
2883             break;
2884         }
2885 
2886         /* set a pointer to after the last source position where prevCC==0 */
2887         if(prevCC==0) {
2888             decompStart=prevSrc;
2889         }
2890 
2891         /* c already contains *src and fcd16 is set for it, increment src */
2892         ++src;
2893 
2894         /* check one above-minimum, relevant code unit */
2895         if(UTF_IS_FIRST_SURROGATE(c)) {
2896             /* c is a lead surrogate, get the real fcd16 */
2897             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2898                 ++src;
2899                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2900             } else {
2901                 c2=0;
2902                 fcd16=0;
2903             }
2904         } else {
2905             c2=0;
2906         }
2907 
2908         /* we are looking at the character (c, c2) at [prevSrc..src[ */
2909         if(nx_contains(nx, c, c2)) {
2910             fcd16=0; /* excluded: fcd16==0 */
2911         }
2912 
2913         /* check the combining order, get the lead cc */
2914         cc=(int16_t)(fcd16>>8);
2915         if(cc==0 || cc>=prevCC) {
2916             /* the order is ok */
2917             if(cc==0) {
2918                 decompStart=prevSrc;
2919             }
2920             prevCC=(int16_t)(fcd16&0xff);
2921 
2922             /* just append (c, c2) */
2923             length= c2==0 ? 1 : 2;
2924             if((destIndex+length)<=destCapacity) {
2925                 dest[destIndex++]=c;
2926                 if(c2!=0) {
2927                     dest[destIndex++]=c2;
2928                 }
2929             } else {
2930                 destIndex+=length;
2931             }
2932         } else {
2933             /*
2934              * back out the part of the source that we copied already but
2935              * is now going to be decomposed;
2936              * prevSrc is set to after what was copied
2937              */
2938             destIndex-=(int32_t)(prevSrc-decompStart);
2939 
2940             /*
2941              * find the part of the source that needs to be decomposed;
2942              * to be safe and simple, decompose to before the next character with lead cc==0
2943              */
2944             src=_findSafeFCD(src, limit, fcd16);
2945 
2946             /*
2947              * the source text does not fulfill the conditions for FCD;
2948              * decompose and reorder a limited piece of the text
2949              */
2950             prevCC=_decomposeFCD(decompStart, src,
2951                                  dest, destIndex, destCapacity,
2952                                  nx);
2953             decompStart=src;
2954         }
2955     }
2956 
2957     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2958 }
2959 
2960 /* quick check functions ---------------------------------------------------- */
2961 
2962 static UBool
2963 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2964     const UChar *limit;
2965     UChar c, c2;
2966     uint16_t fcd16;
2967     int16_t prevCC, cc;
2968 
2969     /* initialize */
2970     prevCC=0;
2971 
2972     if(srcLength>=0) {
2973         /* string with length */
2974         limit=src+srcLength;
2975     } else /* srcLength==-1 */ {
2976         /* zero-terminated string */
2977         limit=NULL;
2978     }
2979 
2980     U_ALIGN_CODE(16);
2981 
2982     for(;;) {
2983         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2984         if(limit==NULL) {
2985             for(;;) {
2986                 c=*src++;
2987                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2988                     if(c==0) {
2989                         return TRUE;
2990                     }
2991                     /*
2992                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2993                      * because chances are good that the next one will have
2994                      * a leading cc of 0;
2995                      * _getFCD16(-prevCC) is later called when necessary -
2996                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2997                      */
2998                     prevCC=(int16_t)-c;
2999                 } else if((fcd16=_getFCD16(c))==0) {
3000                     prevCC=0;
3001                 } else {
3002                     break;
3003                 }
3004             }
3005         } else {
3006             for(;;) {
3007                 if(src==limit) {
3008                     return TRUE;
3009                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
3010                     prevCC=(int16_t)-c;
3011                 } else if((fcd16=_getFCD16(c))==0) {
3012                     prevCC=0;
3013                 } else {
3014                     break;
3015                 }
3016             }
3017         }
3018 
3019         /* check one above-minimum, relevant code unit */
3020         if(UTF_IS_FIRST_SURROGATE(c)) {
3021             /* c is a lead surrogate, get the real fcd16 */
3022             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3023                 ++src;
3024                 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
3025             } else {
3026                 c2=0;
3027                 fcd16=0;
3028             }
3029         } else {
3030             c2=0;
3031         }
3032 
3033         if(nx_contains(nx, c, c2)) {
3034             prevCC=0; /* excluded: fcd16==0 */
3035             continue;
3036         }
3037 
3038         /*
3039          * prevCC has values from the following ranges:
3040          * 0..0xff - the previous trail combining class
3041          * <0      - the negative value of the previous code unit;
3042          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3043          *           was deferred so that average text is checked faster
3044          */
3045 
3046         /* check the combining order */
3047         cc=(int16_t)(fcd16>>8);
3048         if(cc!=0) {
3049             if(prevCC<0) {
3050                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3051                 if(!nx_contains(nx, (UChar32)-prevCC)) {
3052                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3053                 } else {
3054                     prevCC=0; /* excluded: fcd16==0 */
3055                 }
3056             }
3057 
3058             if(cc<prevCC) {
3059                 return FALSE;
3060             }
3061         }
3062         prevCC=(int16_t)(fcd16&0xff);
3063     }
3064 }
3065 
3066 static UNormalizationCheckResult
3067 _quickCheck(const UChar *src,
3068             int32_t srcLength,
3069             UNormalizationMode mode,
3070             UBool allowMaybe,
3071             const UnicodeSet *nx,
3072             UErrorCode *pErrorCode) {
3073     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3074     UChar *buffer;
3075     int32_t bufferCapacity;
3076 
3077     const UChar *start, *limit;
3078     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3079     int32_t options;
3080     UChar c, c2, minNoMaybe;
3081     uint8_t cc, prevCC;
3082     UNormalizationCheckResult result;
3083 
3084     /* check arguments */
3085     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3086         return UNORM_MAYBE;
3087     }
3088 
3089     if(src==NULL || srcLength<-1) {
3090         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3091         return UNORM_MAYBE;
3092     }
3093 
3094     if(!_haveData(*pErrorCode)) {
3095         return UNORM_MAYBE;
3096     }
3097 
3098     /* check for a valid mode and set the quick check minimum and mask */
3099     switch(mode) {
3100     case UNORM_NFC:
3101         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3102         qcMask=_NORM_QC_NFC;
3103         options=0;
3104         break;
3105     case UNORM_NFKC:
3106         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3107         qcMask=_NORM_QC_NFKC;
3108         options=_NORM_OPTIONS_COMPAT;
3109         break;
3110     case UNORM_NFD:
3111         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3112         qcMask=_NORM_QC_NFD;
3113         options=0;
3114         break;
3115     case UNORM_NFKD:
3116         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3117         qcMask=_NORM_QC_NFKD;
3118         options=_NORM_OPTIONS_COMPAT;
3119         break;
3120     case UNORM_FCD:
3121         if(fcdTrie.index==NULL) {
3122             *pErrorCode=U_UNSUPPORTED_ERROR;
3123             return UNORM_MAYBE;
3124         }
3125         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3126     default:
3127         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3128         return UNORM_MAYBE;
3129     }
3130 
3131     /* initialize */
3132     buffer=stackBuffer;
3133     bufferCapacity=_STACK_BUFFER_CAPACITY;
3134 
3135     ccOrQCMask=_NORM_CC_MASK|qcMask;
3136     result=UNORM_YES;
3137     prevCC=0;
3138 
3139     start=src;
3140     if(srcLength>=0) {
3141         /* string with length */
3142         limit=src+srcLength;
3143     } else /* srcLength==-1 */ {
3144         /* zero-terminated string */
3145         limit=NULL;
3146     }
3147 
3148     U_ALIGN_CODE(16);
3149 
3150     for(;;) {
3151         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3152         if(limit==NULL) {
3153             for(;;) {
3154                 c=*src++;
3155                 if(c<minNoMaybe) {
3156                     if(c==0) {
3157                         goto endloop; /* break out of outer loop */
3158                     }
3159                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3160                     break;
3161                 }
3162                 prevCC=0;
3163             }
3164         } else {
3165             for(;;) {
3166                 if(src==limit) {
3167                     goto endloop; /* break out of outer loop */
3168                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3169                     break;
3170                 }
3171                 prevCC=0;
3172             }
3173         }
3174 
3175         /* check one above-minimum, relevant code unit */
3176         if(isNorm32LeadSurrogate(norm32)) {
3177             /* c is a lead surrogate, get the real norm32 */
3178             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3179                 ++src;
3180                 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3181             } else {
3182                 c2=0;
3183                 norm32=0;
3184             }
3185         } else {
3186             c2=0;
3187         }
3188 
3189         if(nx_contains(nx, c, c2)) {
3190             /* excluded: norm32==0 */
3191             norm32=0;
3192         }
3193 
3194         /* check the combining order */
3195         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3196         if(cc!=0 && cc<prevCC) {
3197             result=UNORM_NO;
3198             break;
3199         }
3200         prevCC=cc;
3201 
3202         /* check for "no" or "maybe" quick check flags */
3203         qcNorm32=norm32&qcMask;
3204         if(qcNorm32&_NORM_QC_ANY_NO) {
3205             result=UNORM_NO;
3206             break;
3207         } else if(qcNorm32!=0) {
3208             /* "maybe" can only occur for NFC and NFKC */
3209             if(allowMaybe) {
3210                 result=UNORM_MAYBE;
3211             } else {
3212                 /* normalize a section around here to see if it is really normalized or not */
3213                 const UChar *prevStarter;
3214                 uint32_t decompQCMask;
3215                 int32_t length;
3216 
3217                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3218 
3219                 /* find the previous starter */
3220                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3221                 if(UTF_IS_TRAIL(*prevStarter)) {
3222                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3223                 }
3224                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3225 
3226                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3227                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3228 
3229                 /* decompose and recompose [prevStarter..src[ */
3230                 _composePart(stackBuffer, buffer, bufferCapacity,
3231                              length,
3232                              prevStarter,
3233                              src,
3234                              prevCC,
3235                              options, nx, pErrorCode);
3236                 if(U_FAILURE(*pErrorCode)) {
3237                     result=UNORM_MAYBE; /* error (out of memory) */
3238                     break;
3239                 }
3240 
3241                 /* compare the normalized version with the original */
3242                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3243                     result=UNORM_NO; /* normalization differs */
3244                     break;
3245                 }
3246 
3247                 /* continue after the next starter */
3248             }
3249         }
3250     }
3251 endloop:
3252 
3253     if(buffer!=stackBuffer) {
3254         uprv_free(buffer);
3255     }
3256 
3257     return result;
3258 }
3259 
3260 U_CAPI UNormalizationCheckResult U_EXPORT2
3261 unorm_quickCheck(const UChar *src,
3262                  int32_t srcLength,
3263                  UNormalizationMode mode,
3264                  UErrorCode *pErrorCode) {
3265     return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3266 }
3267 
3268 U_CAPI UNormalizationCheckResult U_EXPORT2
3269 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3270                             UNormalizationMode mode, int32_t options,
3271                             UErrorCode *pErrorCode) {
3272     return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3273 }
3274 
3275 U_CFUNC UNormalizationCheckResult
3276 unorm_internalQuickCheck(const UChar *src,
3277                          int32_t srcLength,
3278                          UNormalizationMode mode,
3279                          UBool allowMaybe,
3280                          const UnicodeSet *nx,
3281                          UErrorCode *pErrorCode) {
3282     return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3283 }
3284 
3285 U_CAPI UBool U_EXPORT2
3286 unorm_isNormalized(const UChar *src, int32_t srcLength,
3287                    UNormalizationMode mode,
3288                    UErrorCode *pErrorCode) {
3289     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3290 }
3291 
3292 U_CAPI UBool U_EXPORT2
3293 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3294                               UNormalizationMode mode, int32_t options,
3295                               UErrorCode *pErrorCode) {
3296     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3297 }
3298 
3299 /* normalize() API ---------------------------------------------------------- */
3300 
3301 /**
3302  * Internal API for normalizing.
3303  * Does not check for bad input.
3304  * Requires _haveData() to be true.
3305  * @internal
3306  */
3307 U_CFUNC int32_t
3308 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3309                               const UChar *src, int32_t srcLength,
3310                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3311                               UErrorCode *pErrorCode) {
3312     int32_t destLength;
3313     uint8_t trailCC;
3314 
3315     switch(mode) {
3316     case UNORM_NFD:
3317         destLength=_decompose(dest, destCapacity,
3318                               src, srcLength,
3319                               FALSE, nx, trailCC);
3320         break;
3321     case UNORM_NFKD:
3322         destLength=_decompose(dest, destCapacity,
3323                               src, srcLength,
3324                               TRUE, nx, trailCC);
3325         break;
3326     case UNORM_NFC:
3327         destLength=_compose(dest, destCapacity,
3328                             src, srcLength,
3329                             options, nx, pErrorCode);
3330         break;
3331     case UNORM_NFKC:
3332         destLength=_compose(dest, destCapacity,
3333                             src, srcLength,
3334                             options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3335         break;
3336     case UNORM_FCD:
3337         if(fcdTrie.index==NULL) {
3338             *pErrorCode=U_UNSUPPORTED_ERROR;
3339             return 0;
3340         }
3341         return unorm_makeFCD(dest, destCapacity,
3342                              src, srcLength,
3343                              nx,
3344                              pErrorCode);
3345 #if 0
3346     case UNORM_FCC:
3347         destLength=_compose(dest, destCapacity,
3348                             src, srcLength,
3349                             options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3350         break;
3351 #endif
3352     case UNORM_NONE:
3353         /* just copy the string */
3354         if(srcLength==-1) {
3355             srcLength=u_strlen(src);
3356         }
3357         if(srcLength>0 && srcLength<=destCapacity) {
3358             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3359         }
3360         destLength=srcLength;
3361         break;
3362     default:
3363         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3364         return 0;
3365     }
3366 
3367     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3368 }
3369 
3370 /**
3371  * Internal API for normalizing.
3372  * Does not check for bad input.
3373  * @internal
3374  */
3375 U_CAPI int32_t U_EXPORT2
3376 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3377                         const UChar *src, int32_t srcLength,
3378                         UNormalizationMode mode, int32_t options,
3379                         UErrorCode *pErrorCode) {
3380     const UnicodeSet *nx;
3381 
3382     if(!_haveData(*pErrorCode)) {
3383         return 0;
3384     }
3385 
3386     nx=getNX(options, *pErrorCode);
3387     if(U_FAILURE(*pErrorCode)) {
3388         return 0;
3389     }
3390 
3391     /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3392     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3393 
3394     return unorm_internalNormalizeWithNX(dest, destCapacity,
3395                                          src, srcLength,
3396                                          mode, options, nx,
3397                                          pErrorCode);
3398 }
3399 
3400 /** Public API for normalizing. */
3401 U_CAPI int32_t U_EXPORT2
3402 unorm_normalize(const UChar *src, int32_t srcLength,
3403                 UNormalizationMode mode, int32_t options,
3404                 UChar *dest, int32_t destCapacity,
3405                 UErrorCode *pErrorCode) {
3406     /* check argument values */
3407     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3408         return 0;
3409     }
3410 
3411     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3412         src==NULL || srcLength<-1
3413     ) {
3414         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3415         return 0;
3416     }
3417 
3418     /* check for overlapping src and destination */
3419     if( dest!=NULL &&
3420         ((src>=dest && src<(dest+destCapacity)) ||
3421          (srcLength>0 && dest>=src && dest<(src+srcLength)))
3422     ) {
3423         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3424         return 0;
3425     }
3426 
3427     return unorm_internalNormalize(dest, destCapacity,
3428                                    src, srcLength,
3429                                    mode, options,
3430                                    pErrorCode);
3431 }
3432 
3433 
3434 /* iteration functions ------------------------------------------------------ */
3435 
3436 /*
3437  * These iteration functions are the core implementations of the
3438  * Normalizer class iteration API.
3439  * They read from a UCharIterator into their own buffer
3440  * and normalize into the Normalizer iteration buffer.
3441  * Normalizer itself then iterates over its buffer until that needs to be
3442  * filled again.
3443  */
3444 
3445 /*
3446  * ### TODO:
3447  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3448  * if iteration bounds are reached,
3449  * try to not call hasNext/hasPrevious and instead check for >=0.
3450  */
3451 
3452 /* backward iteration ------------------------------------------------------- */
3453 
3454 /*
3455  * read backwards and get norm32
3456  * return 0 if the character is <minC
3457  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3458  */
3459 static inline uint32_t
3460 _getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3461     uint32_t norm32;
3462 
3463     /* need src.hasPrevious() */
3464     c=(UChar)src.previous(&src);
3465     c2=0;
3466 
3467     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3468     if(c<minC) {
3469         return 0;
3470     } else if(!UTF_IS_SURROGATE(c)) {
3471         return _getNorm32(c);
3472     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3473         /* unpaired surrogate */
3474         return 0;
3475     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3476         norm32=_getNorm32(c2);
3477         if((norm32&mask)==0) {
3478             /* all surrogate pairs with this lead surrogate have irrelevant data */
3479             return 0;
3480         } else {
3481             /* norm32 must be a surrogate special */
3482             return _getNorm32FromSurrogatePair(norm32, c);
3483         }
3484     } else {
3485         /* unpaired second surrogate, undo the c2=src.previous() movement */
3486         src.move(&src, 1, UITER_CURRENT);
3487         c2=0;
3488         return 0;
3489     }
3490 }
3491 
3492 /*
3493  * read backwards and check if the character is a previous-iteration boundary
3494  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3495  */
3496 typedef UBool
3497 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3498 
3499 /*
3500  * for NF*D:
3501  * read backwards and check if the lead combining class is 0
3502  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3503  */
3504 static UBool
3505 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3506     return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3507 }
3508 
3509 /*
3510  * read backwards and check if the character is (or its decomposition begins with)
3511  * a "true starter" (cc==0 and NF*C_YES)
3512  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3513  */
3514 static UBool
3515 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3516     uint32_t norm32, decompQCMask;
3517 
3518     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3519     norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3520     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3521 }
3522 
3523 static int32_t
3524 _findPreviousIterationBoundary(UCharIterator &src,
3525                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3526                                UChar *&buffer, int32_t &bufferCapacity,
3527                                int32_t &startIndex,
3528                                UErrorCode *pErrorCode) {
3529     UChar *stackBuffer;
3530     UChar c, c2;
3531     UBool isBoundary;
3532 
3533     /* initialize */
3534     stackBuffer=buffer;
3535     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3536 
3537     while(src.hasPrevious(&src)) {
3538         isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3539 
3540         /* always write this character to the front of the buffer */
3541         /* make sure there is enough space in the buffer */
3542         if(startIndex < (c2==0 ? 1 : 2)) {
3543             int32_t bufferLength=bufferCapacity;
3544 
3545             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3546                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3547                 src.move(&src, 0, UITER_START);
3548                 return 0;
3549             }
3550 
3551             /* move the current buffer contents up */
3552             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3553             startIndex+=bufferCapacity-bufferLength;
3554         }
3555 
3556         buffer[--startIndex]=c;
3557         if(c2!=0) {
3558             buffer[--startIndex]=c2;
3559         }
3560 
3561         /* stop if this just-copied character is a boundary */
3562         if(isBoundary) {
3563             break;
3564         }
3565     }
3566 
3567     /* return the length of the buffer contents */
3568     return bufferCapacity-startIndex;
3569 }
3570 
3571 U_CAPI int32_t U_EXPORT2
3572 unorm_previous(UCharIterator *src,
3573                UChar *dest, int32_t destCapacity,
3574                UNormalizationMode mode, int32_t options,
3575                UBool doNormalize, UBool *pNeededToNormalize,
3576                UErrorCode *pErrorCode) {
3577     UChar stackBuffer[100];
3578     UChar *buffer=NULL;
3579     IsPrevBoundaryFn *isPreviousBoundary=NULL;
3580     uint32_t mask=0;
3581     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3582     int32_t c=0, c2=0;
3583     UChar minC=0;
3584 
3585     /* check argument values */
3586     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3587         return 0;
3588     }
3589 
3590     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3591         src==NULL
3592     ) {
3593         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3594         return 0;
3595     }
3596 
3597     if(!_haveData(*pErrorCode)) {
3598         return 0;
3599     }
3600 
3601     if(pNeededToNormalize!=NULL) {
3602         *pNeededToNormalize=FALSE;
3603     }
3604 
3605     switch(mode) {
3606     case UNORM_FCD:
3607         if(fcdTrie.index==NULL) {
3608             *pErrorCode=U_UNSUPPORTED_ERROR;
3609             return 0;
3610         }
3611         /* fall through to NFD */
3612     case UNORM_NFD:
3613         isPreviousBoundary=_isPrevNFDSafe;
3614         minC=_NORM_MIN_WITH_LEAD_CC;
3615         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3616         break;
3617     case UNORM_NFKD:
3618         isPreviousBoundary=_isPrevNFDSafe;
3619         minC=_NORM_MIN_WITH_LEAD_CC;
3620         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3621         break;
3622     case UNORM_NFC:
3623         isPreviousBoundary=_isPrevTrueStarter;
3624         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3625         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3626         break;
3627     case UNORM_NFKC:
3628         isPreviousBoundary=_isPrevTrueStarter;
3629         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3630         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3631         break;
3632     case UNORM_NONE:
3633         destLength=0;
3634         if((c=src->previous(src))>=0) {
3635             destLength=1;
3636             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3637                 if(UTF_IS_LEAD(c2)) {
3638                     if(destCapacity>=2) {
3639                         dest[1]=(UChar)c; /* trail surrogate */
3640                         destLength=2;
3641                     }
3642                     c=c2; /* lead surrogate to be written below */
3643                 } else {
3644                     src->move(src, 1, UITER_CURRENT);
3645                 }
3646             }
3647 
3648             if(destCapacity>0) {
3649                 dest[0]=(UChar)c;
3650             }
3651         }
3652         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3653     default:
3654         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3655         return 0;
3656     }
3657 
3658     buffer=stackBuffer;
3659     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3660     bufferLength=_findPreviousIterationBoundary(*src,
3661                                                 isPreviousBoundary, minC, mask,
3662                                                 buffer, bufferCapacity,
3663                                                 startIndex,
3664                                                 pErrorCode);
3665     if(bufferLength>0) {
3666         if(doNormalize) {
3667             destLength=unorm_internalNormalize(dest, destCapacity,
3668                                                buffer+startIndex, bufferLength,
3669                                                mode, options,
3670                                                pErrorCode);
3671             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3672                 *pNeededToNormalize=
3673                     (UBool)(destLength!=bufferLength ||
3674                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3675             }
3676         } else {
3677             /* just copy the source characters */
3678             if(destCapacity>0) {
3679                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3680             }
3681             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3682         }
3683     } else {
3684         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3685     }
3686 
3687     /* cleanup */
3688     if(buffer!=stackBuffer) {
3689         uprv_free(buffer);
3690     }
3691 
3692     return destLength;
3693 }
3694 
3695 /* forward iteration -------------------------------------------------------- */
3696 
3697 /*
3698  * read forward and get norm32
3699  * return 0 if the character is <minC
3700  * if c2!=0 then (c2, c) is a surrogate pair
3701  * always reads complete characters
3702  */
3703 static inline uint32_t
3704 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3705     uint32_t norm32;
3706 
3707     /* need src.hasNext() to be true */
3708     c=(UChar)src.next(&src);
3709     c2=0;
3710 
3711     if(c<minC) {
3712         return 0;
3713     }
3714 
3715     norm32=_getNorm32(c);
3716     if(UTF_IS_FIRST_SURROGATE(c)) {
3717         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3718             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3719             if((norm32&mask)==0) {
3720                 /* irrelevant data */
3721                 return 0;
3722             } else {
3723                 /* norm32 must be a surrogate special */
3724                 return _getNorm32FromSurrogatePair(norm32, c2);
3725             }
3726         } else {
3727             /* unmatched surrogate */
3728             c2=0;
3729             return 0;
3730         }
3731     }
3732     return norm32;
3733 }
3734 
3735 /*
3736  * read forward and check if the character is a next-iteration boundary
3737  * if c2!=0 then (c, c2) is a surrogate pair
3738  */
3739 typedef UBool
3740 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3741 
3742 /*
3743  * for NF*D:
3744  * read forward and check if the lead combining class is 0
3745  * if c2!=0 then (c, c2) is a surrogate pair
3746  */
3747 static UBool
3748 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3749     return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3750 }
3751 
3752 /*
3753  * for NF*C:
3754  * read forward and check if the character is (or its decomposition begins with)
3755  * a "true starter" (cc==0 and NF*C_YES)
3756  * if c2!=0 then (c, c2) is a surrogate pair
3757  */
3758 static UBool
3759 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3760     uint32_t norm32, decompQCMask;
3761 
3762     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3763     norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3764     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3765 }
3766 
3767 static int32_t
3768 _findNextIterationBoundary(UCharIterator &src,
3769                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3770                            UChar *&buffer, int32_t &bufferCapacity,
3771                            UErrorCode *pErrorCode) {
3772     UChar *stackBuffer;
3773     int32_t bufferIndex;
3774     UChar c, c2;
3775 
3776     if(!src.hasNext(&src)) {
3777         return 0;
3778     }
3779 
3780     /* initialize */
3781     stackBuffer=buffer;
3782 
3783     /* get one character and ignore its properties */
3784     buffer[0]=c=(UChar)src.next(&src);
3785     bufferIndex=1;
3786     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3787         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3788             buffer[bufferIndex++]=c2;
3789         } else {
3790             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3791         }
3792     }
3793 
3794     /* get all following characters until we see a boundary */
3795     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3796     while(src.hasNext(&src)) {
3797         if(isNextBoundary(src, minC, mask, c, c2)) {
3798             /* back out the latest movement to stop at the boundary */
3799             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3800             break;
3801         } else {
3802             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3803                 /* attempt to grow the buffer */
3804                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3805                                        2*bufferCapacity,
3806                                        bufferIndex)
3807             ) {
3808                 buffer[bufferIndex++]=c;
3809                 if(c2!=0) {
3810                     buffer[bufferIndex++]=c2;
3811                 }
3812             } else {
3813                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3814                 src.move(&src, 0, UITER_LIMIT);
3815                 return 0;
3816             }
3817         }
3818     }
3819 
3820     /* return the length of the buffer contents */
3821     return bufferIndex;
3822 }
3823 
3824 U_CAPI int32_t U_EXPORT2
3825 unorm_next(UCharIterator *src,
3826            UChar *dest, int32_t destCapacity,
3827            UNormalizationMode mode, int32_t options,
3828            UBool doNormalize, UBool *pNeededToNormalize,
3829            UErrorCode *pErrorCode) {
3830     UChar stackBuffer[100];
3831     UChar *buffer;
3832     IsNextBoundaryFn *isNextBoundary;
3833     uint32_t mask;
3834     int32_t bufferLength, bufferCapacity, destLength;
3835     int32_t c, c2;
3836     UChar minC;
3837 
3838     /* check argument values */
3839     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3840         return 0;
3841     }
3842 
3843     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3844         src==NULL
3845     ) {
3846         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3847         return 0;
3848     }
3849 
3850     if(!_haveData(*pErrorCode)) {
3851         return 0;
3852     }
3853 
3854     if(pNeededToNormalize!=NULL) {
3855         *pNeededToNormalize=FALSE;
3856     }
3857 
3858     switch(mode) {
3859     case UNORM_FCD:
3860         if(fcdTrie.index==NULL) {
3861             *pErrorCode=U_UNSUPPORTED_ERROR;
3862             return 0;
3863         }
3864         /* fall through to NFD */
3865     case UNORM_NFD:
3866         isNextBoundary=_isNextNFDSafe;
3867         minC=_NORM_MIN_WITH_LEAD_CC;
3868         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3869         break;
3870     case UNORM_NFKD:
3871         isNextBoundary=_isNextNFDSafe;
3872         minC=_NORM_MIN_WITH_LEAD_CC;
3873         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3874         break;
3875     case UNORM_NFC:
3876         isNextBoundary=_isNextTrueStarter;
3877         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3878         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3879         break;
3880     case UNORM_NFKC:
3881         isNextBoundary=_isNextTrueStarter;
3882         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3883         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3884         break;
3885     case UNORM_NONE:
3886         destLength=0;
3887         if((c=src->next(src))>=0) {
3888             destLength=1;
3889             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3890                 if(UTF_IS_TRAIL(c2)) {
3891                     if(destCapacity>=2) {
3892                         dest[1]=(UChar)c2; /* trail surrogate */
3893                         destLength=2;
3894                     }
3895                     /* lead surrogate to be written below */
3896                 } else {
3897                     src->move(src, -1, UITER_CURRENT);
3898                 }
3899             }
3900 
3901             if(destCapacity>0) {
3902                 dest[0]=(UChar)c;
3903             }
3904         }
3905         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3906     default:
3907         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3908         return 0;
3909     }
3910 
3911     buffer=stackBuffer;
3912     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3913     bufferLength=_findNextIterationBoundary(*src,
3914                                             isNextBoundary, minC, mask,
3915                                             buffer, bufferCapacity,
3916                                             pErrorCode);
3917     if(bufferLength>0) {
3918         if(doNormalize) {
3919             destLength=unorm_internalNormalize(dest, destCapacity,
3920                                                buffer, bufferLength,
3921                                                mode, options,
3922                                                pErrorCode);
3923             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3924                 *pNeededToNormalize=
3925                     (UBool)(destLength!=bufferLength ||
3926                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3927             }
3928         } else {
3929             /* just copy the source characters */
3930             if(destCapacity>0) {
3931                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3932             }
3933             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3934         }
3935     } else {
3936         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3937     }
3938 
3939     /* cleanup */
3940     if(buffer!=stackBuffer) {
3941         uprv_free(buffer);
3942     }
3943 
3944     return destLength;
3945 }
3946 
3947 /*
3948  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3949  * and if not, how hard it would be to improve it.
3950  * For example, see _findSafeFCD().
3951  */
3952 
3953 /* Concatenation of normalized strings -------------------------------------- */
3954 
3955 U_CAPI int32_t U_EXPORT2
3956 unorm_concatenate(const UChar *left, int32_t leftLength,
3957                   const UChar *right, int32_t rightLength,
3958                   UChar *dest, int32_t destCapacity,
3959                   UNormalizationMode mode, int32_t options,
3960                   UErrorCode *pErrorCode) {
3961     UChar stackBuffer[100];
3962     UChar *buffer;
3963     int32_t bufferLength, bufferCapacity;
3964 
3965     UCharIterator iter;
3966     int32_t leftBoundary, rightBoundary, destLength;
3967 
3968     /* check argument values */
3969     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3970         return 0;
3971     }
3972 
3973     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3974         left==NULL || leftLength<-1 ||
3975         right==NULL || rightLength<-1
3976     ) {
3977         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3978         return 0;
3979     }
3980 
3981     /* check for overlapping right and destination */
3982     if( dest!=NULL &&
3983         ((right>=dest && right<(dest+destCapacity)) ||
3984          (rightLength>0 && dest>=right && dest<(right+rightLength)))
3985     ) {
3986         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3987         return 0;
3988     }
3989 
3990     /* allow left==dest */
3991 
3992     /* set up intermediate buffer */
3993     buffer=stackBuffer;
3994     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3995 
3996     /*
3997      * Input: left[0..leftLength[ + right[0..rightLength[
3998      *
3999      * Find normalization-safe boundaries leftBoundary and rightBoundary
4000      * and copy the end parts together:
4001      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
4002      *
4003      * dest=left[0..leftBoundary[ +
4004      *      normalize(buffer) +
4005      *      right[rightBoundary..rightLength[
4006      */
4007 
4008     /*
4009      * find a normalization boundary at the end of the left string
4010      * and copy the end part into the buffer
4011      */
4012     uiter_setString(&iter, left, leftLength);
4013     iter.index=leftLength=iter.length; /* end of left string */
4014 
4015     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
4016                                 mode, options,
4017                                 FALSE, NULL,
4018                                 pErrorCode);
4019     leftBoundary=iter.index;
4020     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4021         *pErrorCode=U_ZERO_ERROR;
4022         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
4023             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4024             /* dont need to cleanup here since
4025              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4026              */
4027             return 0;
4028         }
4029 
4030         /* just copy from the left string: we know the boundary already */
4031         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
4032     }
4033 
4034     /*
4035      * find a normalization boundary at the beginning of the right string
4036      * and concatenate the beginning part to the buffer
4037      */
4038     uiter_setString(&iter, right, rightLength);
4039     rightLength=iter.length; /* in case it was -1 */
4040 
4041     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
4042                              mode, options,
4043                              FALSE, NULL,
4044                              pErrorCode);
4045     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4046         *pErrorCode=U_ZERO_ERROR;
4047         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
4048             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4049             /* dont need to cleanup here since
4050              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4051              */
4052             return 0;
4053         }
4054 
4055         /* just copy from the right string: we know the boundary already */
4056         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4057     }
4058 
4059     bufferLength+=rightBoundary;
4060 
4061     /* copy left[0..leftBoundary[ to dest */
4062     if(left!=dest && leftBoundary>0 && destCapacity>0) {
4063         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4064     }
4065     destLength=leftBoundary;
4066 
4067     /* concatenate the normalization of the buffer to dest */
4068     if(destCapacity>destLength) {
4069         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4070                                             buffer, bufferLength,
4071                                             mode, options,
4072                                             pErrorCode);
4073     } else {
4074         destLength+=unorm_internalNormalize(NULL, 0,
4075                                             buffer, bufferLength,
4076                                             mode, options,
4077                                             pErrorCode);
4078     }
4079     /*
4080      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4081      * so we dont check for the error code here..just let it pass through
4082      */
4083     /* concatenate right[rightBoundary..rightLength[ to dest */
4084     right+=rightBoundary;
4085     rightLength-=rightBoundary;
4086     if(rightLength>0 && destCapacity>destLength) {
4087         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4088     }
4089     destLength+=rightLength;
4090 
4091     /* cleanup */
4092     if(buffer!=stackBuffer) {
4093         uprv_free(buffer);
4094     }
4095 
4096     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4097 }
4098 
4099 #endif /* #if !UCONFIG_NO_NORMALIZATION */
4100