• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date        Name        Description
13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
15 *                         Added private method for doing FCD checks
16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
17 *                         string for codepoints < 0x300 for the normalization
18 *                         mode NFC.
19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
20 *                         instead of just wrappers around normlzr.cpp,
21 *                         load unorm.dat, support Unicode 3.1 with
22 *                         supplementary code points, etc.
23 */
24 
25 #include "unicode/utypes.h"
26 
27 #if !UCONFIG_NO_NORMALIZATION
28 
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
36 #include "ucln_cmn.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "cmemory.h"
40 #include "umutex.h"
41 #include "utrie2.h"
42 #include "unicode/uset.h"
43 #include "udataswp.h"
44 #include "putilimp.h"
45 
46 /*
47  * Status of tailored normalization
48  *
49  * This was done initially for investigation on Unicode public review issue 7
50  * (http://www.unicode.org/review/). See Jitterbug 2481.
51  * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52  * a permanent feature in ICU 2.6 in support of IDNA which requires true
53  * Unicode 3.2 normalization.
54  * (NormalizationCorrections are rolled into IDNA mapping tables.)
55  *
 * Tailored normalization as implemented here makes it possible to "normalize less"
 * than full Unicode normalization would.
58  * Based internally on a UnicodeSet of code points that are
59  * "excluded from normalization", the normalization functions leave those
60  * code points alone ("inert"). This means that tailored normalization
61  * still transforms text into a canonically equivalent form.
62  * It does not add decompositions to code points that do not have any or
63  * change decomposition results.
64  *
65  * Any function that searches for a safe boundary has not been touched,
66  * which means that these functions will be over-pessimistic when
67  * exclusions are applied.
68  * This should not matter because subsequent checks and normalizations
69  * do apply the exclusions; only a little more of the text may be processed
70  * than necessary under exclusions.
71  *
72  * Normalization exclusions have the following effect on excluded code points c:
73  * - c is not decomposed
74  * - c is not a composition target
75  * - c does not combine forward or backward for composition
76  *   except that this is not implemented for Jamo
77  * - c is treated as having a combining class of 0
78  */
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
80 
81 U_NAMESPACE_USE
82 
83 /*
84  * This new implementation of the normalization code loads its data from
85  * unorm.dat, which is generated with the gennorm tool.
86  * The format of that file is described in unormimp.h .
87  */
88 
89 /* -------------------------------------------------------------------------- */
90 
/* NOTE(review): presumably the capacity of stack-allocated scratch buffers; no use is visible in this section. */
enum { _STACK_BUFFER_CAPACITY=100 };
94 
/*
 * Layout of the bit fields inside the "options" bit set parameter.
 * This layout is private: users only need the currently assigned values,
 * while the number and positions of reserved bits per field
 * may change in future implementations.
 */
enum {
    /* low field: selects normalization exclusion sets (UNORM_NX_... flags) */
    _NORM_OPTIONS_NX_MASK=0x1f,
    /* next field: selects a Unicode version */
    _NORM_OPTIONS_UNICODE_MASK=0x60,
    /* both set-selecting fields together */
    _NORM_OPTIONS_SETS_MASK=_NORM_OPTIONS_NX_MASK|_NORM_OPTIONS_UNICODE_MASK,

    /* position of the lowest bit of the Unicode version field */
    _NORM_OPTIONS_UNICODE_SHIFT=5,

    /*
     * The options below are used only in some composition functions.
     * They occupy bits 12 and up so that the lower bits stay available
     * for the options space in unorm_compare() -
     * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    _NORM_OPTIONS_COMPAT=1<<12,
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    _NORM_OPTIONS_COMPOSE_CONTIGUOUS=1<<13
};
121 
122 U_CDECL_BEGIN
123 static inline UBool
isHangulWithoutJamoT(UChar c)124 isHangulWithoutJamoT(UChar c) {
125     c-=HANGUL_BASE;
126     return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
127 }
128 
129 /* norm32 helpers */
130 
131 /* is this a norm32 with a regular index? */
132 static inline UBool
isNorm32Regular(uint32_t norm32)133 isNorm32Regular(uint32_t norm32) {
134     return norm32<_NORM_MIN_SPECIAL;
135 }
136 
137 #if 0  // Code changed to use U16_IS_LEAD(c) instead.
138 /* is this a norm32 with a special index for a lead surrogate? */
139 static inline UBool
140 isNorm32LeadSurrogate(uint32_t norm32) {
141     return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
142 }
143 #endif
144 
145 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
146 static inline UBool
isNorm32HangulOrJamo(uint32_t norm32)147 isNorm32HangulOrJamo(uint32_t norm32) {
148     return norm32>=_NORM_MIN_HANGUL;
149 }
150 
151 /*
152  * Given isNorm32HangulOrJamo(),
153  * is this a Hangul syllable or a Jamo?
154  */
155 /*static inline UBool
156 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
157     return norm32<_NORM_MIN_JAMO_V;
158 }*/
159 
160 /*
161  * Given norm32 for Jamo V or T,
162  * is this a Jamo V?
163  */
164 static inline UBool
isJamoVTNorm32JamoV(uint32_t norm32)165 isJamoVTNorm32JamoV(uint32_t norm32) {
166     return norm32<_NORM_JAMO_V_TOP;
167 }
168 U_CDECL_END
169 
170 /* load unorm.dat ----------------------------------------------------------- */
171 
/*
 * Build-time switch: when non-zero, the normalization data comes from the
 * included unorm_props_data.c; otherwise the variables in the #else branch
 * are filled at runtime by loadNormData().
 */
#define UNORM_HARDCODE_DATA 1

#if UNORM_HARDCODE_DATA

/* unorm_props_data.c is machine-generated by gennorm --csource */
#include "unorm_props_data.c"

/* with included data, format version 2.2 is always reported as available */
static const UBool formatVersion_2_2=TRUE;

#else

/* name and type of the data item passed to udata_openChoice() */
#define DATA_NAME "unorm"
#define DATA_TYPE "icu"

/* the opened data item; closed again in unorm_cleanup() */
static UDataMemory *normData=NULL;
/* error code remembered from the load attempt, handed out again by _haveData() */
static UErrorCode dataErrorCode=U_ZERO_ERROR;
/* load state: 0=not attempted yet, >0=loaded, <0=loading failed */
static int8_t haveNormData=0;

/* copy of the indexes array at the start of the data, and the unserialized tries */
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };

/*
 * pointers into the memory-mapped unorm.icu
 */
static const uint16_t *extraData=NULL,
                      *combiningTable=NULL,
                      *canonStartSets=NULL;

/* format version copied from the data header by isAcceptable(), and flags derived from it */
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;

/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };

#endif
207 
/*
 * Cache of UnicodeSets, one slot for each combination of exclusion flags.
 * Slots are indexed by the options value masked with _NORM_OPTIONS_SETS_MASK,
 * filled lazily by the internalGetNX...() functions,
 * and deleted again in unorm_cleanup().
 */
static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
210 
211 U_CDECL_BEGIN
212 
213 static UBool U_CALLCONV
unorm_cleanup(void)214 unorm_cleanup(void) {
215     int32_t i;
216 
217 #if !UNORM_HARDCODE_DATA
218     if(normData!=NULL) {
219         udata_close(normData);
220         normData=NULL;
221     }
222     dataErrorCode=U_ZERO_ERROR;
223     haveNormData=0;
224 #endif
225 
226     for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
227         if (nxCache[i]) {
228             delete nxCache[i];
229             nxCache[i] = 0;
230         }
231     }
232 
233     return TRUE;
234 }
235 
236 #if !UNORM_HARDCODE_DATA
237 
238 static UBool U_CALLCONV
isAcceptable(void *,const char *,const char *,const UDataInfo * pInfo)239 isAcceptable(void * /* context */,
240              const char * /* type */, const char * /* name */,
241              const UDataInfo *pInfo) {
242     if(
243         pInfo->size>=20 &&
244         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
245         pInfo->charsetFamily==U_CHARSET_FAMILY &&
246         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
247         pInfo->dataFormat[1]==0x6f &&
248         pInfo->dataFormat[2]==0x72 &&
249         pInfo->dataFormat[3]==0x6d &&
250         pInfo->formatVersion[0]==2 &&
251         pInfo->formatVersion[2]==UTRIE_SHIFT &&
252         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
253     ) {
254         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
255         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
256         return TRUE;
257     } else {
258         return FALSE;
259     }
260 }
261 
262 #endif
263 
264 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)265 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
266     /* add the start code point to the USet */
267     const USetAdder *sa=(const USetAdder *)context;
268     sa->add(sa->set, start);
269     return TRUE;
270 }
271 
272 U_CDECL_END
273 
274 #if !UNORM_HARDCODE_DATA
275 
276 static int8_t
loadNormData(UErrorCode & errorCode)277 loadNormData(UErrorCode &errorCode) {
278     /* load Unicode normalization data from file */
279 
280     /*
281      * This lazy intialization with double-checked locking (without mutex protection for
282      * haveNormData==0) is transiently unsafe under certain circumstances.
283      * Check the readme and use u_init() if necessary.
284      *
285      * While u_init() initializes the main normalization data via this functions,
286      * it does not do so for exclusion sets (which are fully mutexed).
287      * This is because
288      * - there can be many exclusion sets
289      * - they are rarely used
290      * - they are not usually used in execution paths that are
291      *   as performance-sensitive as others
292      *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
293      */
294     if(haveNormData==0) {
295         UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
296         UDataMemory *data;
297 
298         const int32_t *p=NULL;
299         const uint8_t *pb;
300 
301         if(&errorCode==NULL || U_FAILURE(errorCode)) {
302             return 0;
303         }
304 
305         /* open the data outside the mutex block */
306         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
307         dataErrorCode=errorCode;
308         if(U_FAILURE(errorCode)) {
309             return haveNormData=-1;
310         }
311 
312         p=(const int32_t *)udata_getMemory(data);
313         pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
314         utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
315         _normTrie.getFoldingOffset=getFoldingNormOffset;
316 
317         pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
318         if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
319             utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
320         }
321         pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
322 
323         if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
324             utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
325             _auxTrie.getFoldingOffset=getFoldingAuxOffset;
326         }
327 
328         if(U_FAILURE(errorCode)) {
329             dataErrorCode=errorCode;
330             udata_close(data);
331             return haveNormData=-1;
332         }
333 
334         /* in the mutex block, set the data for this process */
335         umtx_lock(NULL);
336         if(normData==NULL) {
337             normData=data;
338             data=NULL;
339 
340             uprv_memcpy(&indexes, p, sizeof(indexes));
341             uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
342             uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
343             uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
344         } else {
345             p=(const int32_t *)udata_getMemory(normData);
346         }
347 
348         /* initialize some variables */
349         extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
350         combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
351         formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
352         formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
353         if(formatVersion_2_1) {
354             canonStartSets=combiningTable+
355                 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
356                 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
357         }
358         haveNormData=1;
359         ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
360         umtx_unlock(NULL);
361 
362         /* if a different thread set it first, then close the extra data */
363         if(data!=NULL) {
364             udata_close(data); /* NULL if it was set correctly */
365         }
366     }
367 
368     return haveNormData;
369 }
370 
371 #endif
372 
373 static inline UBool
_haveData(UErrorCode & errorCode)374 _haveData(UErrorCode &errorCode) {
375 #if UNORM_HARDCODE_DATA
376     return U_SUCCESS(errorCode);
377 #else
378     if(U_FAILURE(errorCode)) {
379         return FALSE;
380     } else if(haveNormData>0) {
381         return TRUE;
382     } else if(haveNormData<0) {
383         errorCode=dataErrorCode;
384         return FALSE;
385     } else /* haveNormData==0 */ {
386         return (UBool)(loadNormData(errorCode)>0);
387     }
388 #endif
389 }
390 
391 U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode * pErrorCode)392 unorm_haveData(UErrorCode *pErrorCode) {
393     return _haveData(*pErrorCode);
394 }
395 
396 U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 & fcdHighStart,UErrorCode * pErrorCode)397 unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
398     if(_haveData(*pErrorCode)) {
399         fcdHighStart=fcdTrie.highStart;
400         return fcdTrie.index;
401     } else {
402         return NULL;
403     }
404 }
405 
406 /* data access primitives --------------------------------------------------- */
407 
408 static inline uint32_t
_getNorm32(UChar c)409 _getNorm32(UChar c) {
410     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(&normTrie, c);
411 }
412 
413 static inline uint32_t
_getNorm32FromSurrogatePair(UChar c,UChar c2)414 _getNorm32FromSurrogatePair(UChar c, UChar c2) {
415     UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
416     return UTRIE2_GET32_FROM_SUPP(&normTrie, cp);
417 }
418 
419 /*
420  * get a norm32 from text with complete code points
421  * (like from decompositions)
422  */
423 static inline uint32_t
_getNorm32(const UChar * p,uint32_t mask)424 _getNorm32(const UChar *p, uint32_t mask) {
425     UChar c=*p;
426     uint32_t norm32=_getNorm32(c);
427     if((norm32&mask) && U16_IS_LEAD(c)) {
428         /* c is a lead surrogate, get the real norm32 */
429         norm32=_getNorm32FromSurrogatePair(c, *(p+1));
430     }
431     return norm32;
432 }
433 
434 static inline uint16_t
_getFCD16(UChar c)435 _getFCD16(UChar c) {
436     return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(&fcdTrie, c);
437 }
438 
439 static inline uint16_t
_getFCD16FromSurrogatePair(UChar c,UChar c2)440 _getFCD16FromSurrogatePair(UChar c, UChar c2) {
441     UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
442     return UTRIE2_GET16_FROM_SUPP(&fcdTrie, cp);
443 }
444 
445 static inline const uint16_t *
_getExtraData(uint32_t norm32)446 _getExtraData(uint32_t norm32) {
447     return extraData+(norm32>>_NORM_EXTRA_SHIFT);
448 }
449 
450 /*
451  * TODO(markus): Revisit if it makes sense for functions like _getNextCC()
452  * and their call sites, and a fair bit of other code here, to work with UTF-16 code units,
453  * or whether code simplification would suggest just using UChar32 and maybe UTRIE2_NEXT32().
454  */
455 
456 #if 0
457 /*
458  * It is possible to get the FCD data from the main trie if unorm.icu
459  * was built without the FCD trie, although it is slower.
460  * This is not implemented because it is hard to test, and because it seems
461  * unusual to want to use FCD and not build the data file for it.
462  *
463  * Untested sample code:
464  */
465 static inline uint16_t
466 _getFCD16FromNormData(UChar32 c) {
467     uint32_t norm32, fcd;
468 
469     norm32=_getNorm32(c);
470     if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
471         /* get the lead/trail cc from the decomposition data */
472         const uint16_t *nfd=_getExtraData(norm32);
473         if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
474             fcd=nfd[1];
475         }
476     } else {
477         fcd=norm32&_NORM_CC_MASK;
478         if(fcd!=0) {
479             /* use the code point cc value for both lead and trail cc's */
480             fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
481         }
482     }
483 
484     return (uint16_t)fcd;
485 }
486 #endif
487 
488 /* normalization exclusion sets --------------------------------------------- */
489 
490 /*
491  * Normalization exclusion UnicodeSets are used for tailored normalization;
492  * see the comment near the beginning of this file.
493  *
494  * By specifying one or several sets of code points,
495  * those code points become inert for normalization.
496  */
497 
498 static const UnicodeSet *
internalGetNXHangul(UErrorCode & errorCode)499 internalGetNXHangul(UErrorCode &errorCode) {
500     /* internal function, does not check for incoming U_FAILURE */
501     UBool isCached;
502 
503     UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
504 
505     if(!isCached) {
506         UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
507         if(set==NULL) {
508             errorCode=U_MEMORY_ALLOCATION_ERROR;
509             return NULL;
510         }
511         // Compact the set for caching.
512         set->compact();
513 
514         umtx_lock(NULL);
515         if(nxCache[UNORM_NX_HANGUL]==NULL) {
516             nxCache[UNORM_NX_HANGUL]=set;
517             set=NULL;
518             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
519         }
520         umtx_unlock(NULL);
521 
522         delete set;
523     }
524 
525     return nxCache[UNORM_NX_HANGUL];
526 }
527 
528 /* unorm.cpp 1.116 had and used
529 static const UnicodeSet *
530 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
531     ...
532 }
533 */
534 
535 /* get and set an exclusion set from a serialized UnicodeSet */
536 static const UnicodeSet *
internalGetSerializedNX(int32_t options,int32_t nxIndex,UErrorCode & errorCode)537 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
538     /* internal function, does not check for incoming U_FAILURE */
539     UBool isCached;
540 
541     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
542 
543     if( !isCached &&
544         canonStartSets!=NULL &&
545         canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
546     ) {
547         USerializedSet sset;
548         UnicodeSet *set;
549         UChar32 start, end;
550         int32_t i;
551 
552         if( !uset_getSerializedSet(
553                     &sset,
554                     canonStartSets+canonStartSets[nxIndex],
555                     canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
556         ) {
557             errorCode=U_INVALID_FORMAT_ERROR;
558             return NULL;
559         }
560 
561         /* turn the serialized set into a UnicodeSet */
562         set=new UnicodeSet();
563         if(set==NULL) {
564             errorCode=U_MEMORY_ALLOCATION_ERROR;
565             return NULL;
566         }
567         for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
568             set->add(start, end);
569         }
570         // Compact the set for caching.
571         set->compact();
572 
573         umtx_lock(NULL);
574         if(nxCache[options]==NULL) {
575             nxCache[options]=set;
576             set=NULL;
577             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
578         }
579         umtx_unlock(NULL);
580 
581         delete set;
582     }
583 
584     return nxCache[options];
585 }
586 
587 static const UnicodeSet *
internalGetNXCJKCompat(UErrorCode & errorCode)588 internalGetNXCJKCompat(UErrorCode &errorCode) {
589     /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
590     return internalGetSerializedNX(
591                 UNORM_NX_CJK_COMPAT,
592                 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
593                 errorCode);
594 }
595 
596 static const UnicodeSet *
internalGetNXUnicode(uint32_t options,UErrorCode & errorCode)597 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
598     /* internal function, does not check for incoming U_FAILURE */
599     int32_t nxIndex;
600 
601     options&=_NORM_OPTIONS_UNICODE_MASK;
602     switch(options) {
603     case 0:
604         return NULL;
605     case UNORM_UNICODE_3_2:
606         /* [:^Age=3.2:] */
607         nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
608         break;
609     default:
610         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
611         return NULL;
612     }
613 
614     /* build a set with all code points that were not designated by the specified Unicode version */
615     return internalGetSerializedNX(options, nxIndex, errorCode);
616 }
617 
618 /* Get a decomposition exclusion set. The data must be loaded. */
619 static const UnicodeSet *
internalGetNX(int32_t options,UErrorCode & errorCode)620 internalGetNX(int32_t options, UErrorCode &errorCode) {
621     options&=_NORM_OPTIONS_SETS_MASK;
622 
623     UBool isCached;
624 
625     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
626 
627     if(!isCached) {
628         /* return basic sets */
629         if(options==UNORM_NX_HANGUL) {
630             return internalGetNXHangul(errorCode);
631         }
632         if(options==UNORM_NX_CJK_COMPAT) {
633             return internalGetNXCJKCompat(errorCode);
634         }
635         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
636             return internalGetNXUnicode(options, errorCode);
637         }
638 
639         /* build a set from multiple subsets */
640         UnicodeSet *set;
641         const UnicodeSet *other;
642 
643         set=new UnicodeSet();
644         if(set==NULL) {
645             errorCode=U_MEMORY_ALLOCATION_ERROR;
646             return NULL;
647         }
648 
649         if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
650             set->addAll(*other);
651         }
652         if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
653             set->addAll(*other);
654         }
655         if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
656             set->addAll(*other);
657         }
658 
659         if(U_FAILURE(errorCode)) {
660             delete set;
661             return NULL;
662         }
663         // Compact the set for caching.
664         set->compact();
665 
666         umtx_lock(NULL);
667         if(nxCache[options]==NULL) {
668             nxCache[options]=set;
669             set=NULL;
670             ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
671         }
672         umtx_unlock(NULL);
673 
674         delete set;
675     }
676 
677     return nxCache[options];
678 }
679 
680 static inline const UnicodeSet *
getNX(int32_t options,UErrorCode & errorCode)681 getNX(int32_t options, UErrorCode &errorCode) {
682     if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
683         /* incoming failure, or no decomposition exclusions requested */
684         return NULL;
685     } else {
686         return internalGetNX(options, errorCode);
687     }
688 }
689 
690 U_CFUNC const UnicodeSet *
unorm_getNX(int32_t options,UErrorCode * pErrorCode)691 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
692     return getNX(options, *pErrorCode);
693 }
694 
695 static inline UBool
nx_contains(const UnicodeSet * nx,UChar32 c)696 nx_contains(const UnicodeSet *nx, UChar32 c) {
697     return nx!=NULL && nx->contains(c);
698 }
699 
700 static inline UBool
nx_contains(const UnicodeSet * nx,UChar c,UChar c2)701 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
702     return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
703 }
704 
705 /* other normalization primitives ------------------------------------------- */
706 
707 /* get the canonical or compatibility decomposition for one character */
708 static inline const UChar *
_decompose(uint32_t norm32,uint32_t qcMask,int32_t & length,uint8_t & cc,uint8_t & trailCC)709 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
710            uint8_t &cc, uint8_t &trailCC) {
711     const UChar *p=(const UChar *)_getExtraData(norm32);
712     length=*p++;
713 
714     if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
715         /* use compatibility decomposition, skip canonical data */
716         p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
717         length>>=8;
718     }
719 
720     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
721         /* get the lead and trail cc's */
722         UChar bothCCs=*p++;
723         cc=(uint8_t)(bothCCs>>8);
724         trailCC=(uint8_t)bothCCs;
725     } else {
726         /* lead and trail cc's are both 0 */
727         cc=trailCC=0;
728     }
729 
730     length&=_NORM_DECOMP_LENGTH_MASK;
731     return p;
732 }
733 
734 /* get the canonical decomposition for one character */
735 static inline const UChar *
_decompose(uint32_t norm32,int32_t & length,uint8_t & cc,uint8_t & trailCC)736 _decompose(uint32_t norm32, int32_t &length,
737            uint8_t &cc, uint8_t &trailCC) {
738     const UChar *p=(const UChar *)_getExtraData(norm32);
739     length=*p++;
740 
741     if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
742         /* get the lead and trail cc's */
743         UChar bothCCs=*p++;
744         cc=(uint8_t)(bothCCs>>8);
745         trailCC=(uint8_t)bothCCs;
746     } else {
747         /* lead and trail cc's are both 0 */
748         cc=trailCC=0;
749     }
750 
751     length&=_NORM_DECOMP_LENGTH_MASK;
752     return p;
753 }
754 
755 /**
756  * Get the canonical decomposition for one code point.
757  * @param c code point
758  * @param buffer out-only buffer for algorithmic decompositions of Hangul
759  * @param length out-only, takes the length of the decomposition, if any
760  * @return pointer to decomposition, or 0 if none
761  * @internal
762  */
763 U_CFUNC const UChar *
unorm_getCanonicalDecomposition(UChar32 c,UChar buffer[4],int32_t * pLength)764 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
765     uint32_t norm32;
766 
767     if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
768         /* trivial case */
769         return NULL;
770     }
771 
772     norm32=UTRIE2_GET32(&normTrie, c);
773     if(norm32&_NORM_QC_NFD) {
774         if(isNorm32HangulOrJamo(norm32)) {
775             /* Hangul syllable: decompose algorithmically */
776             UChar c2;
777 
778             c-=HANGUL_BASE;
779 
780             c2=(UChar)(c%JAMO_T_COUNT);
781             c/=JAMO_T_COUNT;
782             if(c2>0) {
783                 buffer[2]=(UChar)(JAMO_T_BASE+c2);
784                 *pLength=3;
785             } else {
786                 *pLength=2;
787             }
788 
789             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
790             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
791             return buffer;
792         } else {
793             /* normal decomposition */
794             uint8_t cc, trailCC;
795             return _decompose(norm32, *pLength, cc, trailCC);
796         }
797     } else {
798         return 0;
799     }
800 }
801 
802 /*
803  * get the combining class of (c, c2)=*p++
804  * before: p<limit  after: p<=limit
805  * if only one code unit is used, then c2==0
806  */
807 static inline uint8_t
_getNextCC(const UChar * & p,const UChar * limit,UChar & c,UChar & c2)808 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
809     uint32_t norm32;
810 
811     c=*p++;
812     c2=0;
813     norm32=_getNorm32(c);
814     if((norm32&_NORM_CC_MASK)==0) {
815         return 0;
816     } else if(U16_IS_LEAD(c)) {
817         /* c is a lead surrogate, get the real norm32 */
818         if(p!=limit && U16_IS_TRAIL(c2=*p)) {
819             ++p;
820             norm32=_getNorm32FromSurrogatePair(c, c2);
821         } else {
822             c2=0;
823             return 0;
824         }
825     }
826     return (uint8_t)(norm32>>_NORM_CC_SHIFT);
827 }
828 
829 /*
830  * read backwards and get norm32
831  * return 0 if the character is <minC
832  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
833  */
834 static inline uint32_t
_getPrevNorm32(const UChar * start,const UChar * & src,uint32_t minC,UChar & c,UChar & c2)835 _getPrevNorm32(const UChar *start, const UChar *&src,
836                uint32_t minC,
837                UChar &c, UChar &c2) {
838     c=*--src;
839     c2=0;
840 
841     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
842     if(c<minC) {
843         return 0;
844     } else if(!U_IS_SURROGATE(c)) {
845         return _getNorm32(c);
846     } else if(U16_IS_SURROGATE_TRAIL(c) && src!=start && U16_IS_LEAD(c2=*(src-1))) {
847         --src;
848         return _getNorm32FromSurrogatePair(c2, c);
849     } else {
850         /* unpaired surrogate */
851         c2=0;
852         return 0;
853     }
854 }
855 
856 /*
857  * get the combining class of (c, c2)=*--p
858  * before: start<p  after: start<=p
859  */
860 static inline uint8_t
_getPrevCC(const UChar * start,const UChar * & p)861 _getPrevCC(const UChar *start, const UChar *&p) {
862     UChar c, c2;
863 
864     return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, c, c2)>>_NORM_CC_SHIFT);
865 }
866 
867 /*
868  * is this a safe boundary character for NF*D?
869  * (lead cc==0)
870  */
871 static inline UBool
_isNFDSafe(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)872 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
873     if((norm32&ccOrQCMask)==0) {
874         return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
875     }
876 
877     /* inspect its decomposition - maybe a Hangul but not a surrogate here */
878     if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
879         int32_t length;
880         uint8_t cc, trailCC;
881 
882         /* decomposes, get everything from the variable-length extra data */
883         _decompose(norm32, decompQCMask, length, cc, trailCC);
884         return cc==0;
885     } else {
886         /* no decomposition (or Hangul), test the cc directly */
887         return (norm32&_NORM_CC_MASK)==0;
888     }
889 }
890 
891 /*
892  * is this (or does its decomposition begin with) a "true starter"?
893  * (cc==0 and NF*C_YES)
894  */
895 static inline UBool
_isTrueStarter(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)896 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
897     if((norm32&ccOrQCMask)==0) {
898         return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
899     }
900 
901     /* inspect its decomposition - not a Hangul or a surrogate here */
902     if((norm32&decompQCMask)!=0) {
903         const UChar *p;
904         int32_t length;
905         uint8_t cc, trailCC;
906 
907         /* decomposes, get everything from the variable-length extra data */
908         p=_decompose(norm32, decompQCMask, length, cc, trailCC);
909         if(cc==0) {
910             uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
911 
912             /* does it begin with NFC_YES? */
913             if((_getNorm32(p, qcMask)&qcMask)==0) {
914                 /* yes, the decomposition begins with a true starter */
915                 return TRUE;
916             }
917         }
918     }
919     return FALSE;
920 }
921 
922 /* uchar.h */
923 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)924 u_getCombiningClass(UChar32 c) {
925 #if !UNORM_HARDCODE_DATA
926     UErrorCode errorCode=U_ZERO_ERROR;
927     if(_haveData(errorCode)) {
928 #endif
929         uint32_t norm32=UTRIE2_GET32(&normTrie, c);
930         return (uint8_t)(norm32>>_NORM_CC_SHIFT);
931 #if !UNORM_HARDCODE_DATA
932     } else {
933         return 0;
934     }
935 #endif
936 }
937 
938 U_CFUNC UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c)939 unorm_internalIsFullCompositionExclusion(UChar32 c) {
940 #if UNORM_HARDCODE_DATA
941     if(auxTrie.index!=NULL) {
942 #else
943     UErrorCode errorCode=U_ZERO_ERROR;
944     if(_haveData(errorCode) && auxTrie.index!=NULL) {
945 #endif
946         uint16_t aux=UTRIE2_GET16(&auxTrie, c);
947         return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
948     } else {
949         return FALSE;
950     }
951 }
952 
953 U_CFUNC UBool U_EXPORT2
954 unorm_isCanonSafeStart(UChar32 c) {
955 #if UNORM_HARDCODE_DATA
956     if(auxTrie.index!=NULL) {
957 #else
958     UErrorCode errorCode=U_ZERO_ERROR;
959     if(_haveData(errorCode) && auxTrie.index!=NULL) {
960 #endif
961         uint16_t aux=UTRIE2_GET16(&auxTrie, c);
962         return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
963     } else {
964         return FALSE;
965     }
966 }
967 
968 U_CAPI void U_EXPORT2
969 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
970     if(unorm_haveData(pErrorCode)){
971         uprv_memcpy(*versionInfo, dataVersion, 4);
972     }
973 }
974 
975 
976 U_CAPI UBool U_EXPORT2
977 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
978 #if !UNORM_HARDCODE_DATA
979     UErrorCode errorCode=U_ZERO_ERROR;
980 #endif
981     if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
982 #if !UNORM_HARDCODE_DATA
983         _haveData(errorCode) &&
984 #endif
985         canonStartSets!=NULL
986     ) {
987         const uint16_t *table;
988         int32_t i, start, limit;
989 
990         /*
991          * binary search for c
992          *
993          * There are two search tables,
994          * one for BMP code points and one for supplementary ones.
995          * See unormimp.h for details.
996          */
997         if(c<=0xffff) {
998             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
999             start=0;
1000             limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1001 
1002             /* each entry is a pair { c, result } */
1003             while(start<limit-2) {
1004                 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1005                 if(c<table[i]) {
1006                     limit=i;
1007                 } else {
1008                     start=i;
1009                 }
1010             }
1011 
1012             /* found? */
1013             if(c==table[start]) {
1014                 i=table[start+1];
1015                 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1016                     /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1017                     i&=(_NORM_MAX_CANON_SETS-1);
1018                     return uset_getSerializedSet(fillSet,
1019                                             canonStartSets+i,
1020                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1021                 } else {
1022                     /* other result values are BMP code points for single-code point sets */
1023                     uset_setSerializedToOne(fillSet, (UChar32)i);
1024                     return TRUE;
1025                 }
1026             }
1027         } else {
1028             uint16_t high, low, h;
1029 
1030             table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1031                                  canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1032             start=0;
1033             limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1034 
1035             high=(uint16_t)(c>>16);
1036             low=(uint16_t)c;
1037 
1038             /* each entry is a triplet { high(c), low(c), result } */
1039             while(start<limit-3) {
1040                 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1041                 h=table[i]&0x1f; /* high word */
1042                 if(high<h || (high==h && low<table[i+1])) {
1043                     limit=i;
1044                 } else {
1045                     start=i;
1046                 }
1047             }
1048 
1049             /* found? */
1050             h=table[start];
1051             if(high==(h&0x1f) && low==table[start+1]) {
1052                 i=table[start+2];
1053                 if((h&0x8000)==0) {
1054                     /* the result is an index to a USerializedSet */
1055                     return uset_getSerializedSet(fillSet,
1056                                             canonStartSets+i,
1057                                             canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1058                 } else {
1059                     /*
1060                      * single-code point set {x} in
1061                      * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
1062                      */
1063                     i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1064                     uset_setSerializedToOne(fillSet, (UChar32)i);
1065                     return TRUE;
1066                 }
1067             }
1068         }
1069     }
1070 
1071     return FALSE; /* not found */
1072 }
1073 
1074 U_CAPI int32_t U_EXPORT2
1075 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1076     uint16_t aux;
1077 
1078     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1079         return 0;
1080     }
1081     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1082         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1083         return 0;
1084     }
1085     if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
1086         aux=UTRIE2_GET16(&auxTrie, c);
1087         aux&=_NORM_AUX_FNC_MASK;
1088     } else {
1089         aux=0;
1090     }
1091     if(aux!=0) {
1092         const UChar *s;
1093         int32_t length;
1094 
1095         s=(const UChar *)(extraData+aux);
1096         if(*s<0xff00) {
1097             /* s points to the single-unit string */
1098             length=1;
1099         } else {
1100             length=*s&0xff;
1101             ++s;
1102         }
1103         if(0<length && length<=destCapacity) {
1104             uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1105         }
1106         return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1107     } else {
1108         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1109     }
1110 }
1111 
1112 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1113 U_CAPI UBool U_EXPORT2
1114 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1115     uint32_t norm32, mask;
1116     uint16_t aux;
1117 
1118 #if !UNORM_HARDCODE_DATA
1119     UErrorCode errorCode=U_ZERO_ERROR;
1120     if(!_haveData(errorCode)) {
1121         return FALSE;
1122     }
1123 #endif
1124 
1125     /* handle trivial cases; set the comparison mask for the normal ones */
1126     switch(mode) {
1127     case UNORM_NONE:
1128         return TRUE;
1129     case UNORM_NFD:
1130         mask=_NORM_CC_MASK|_NORM_QC_NFD;
1131         break;
1132     case UNORM_NFKD:
1133         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1134         break;
1135     case UNORM_NFC:
1136     /* case UNORM_FCC: */
1137         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1138         break;
1139     case UNORM_NFKC:
1140         mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1141         break;
1142     case UNORM_FCD:
1143         /* FCD: skippable if lead cc==0 and trail cc<=1 */
1144         return fcdTrie.index!=NULL && UTRIE2_GET16(&fcdTrie, c)<=1;
1145     default:
1146         return FALSE;
1147     }
1148 
1149     /* check conditions (a)..(e), see unormimp.h */
1150     norm32=UTRIE2_GET32(&normTrie, c);
1151     if((norm32&mask)!=0) {
1152         return FALSE; /* fails (a)..(e), not skippable */
1153     }
1154 
1155     if(mode<UNORM_NFC) {
1156         return TRUE; /* NF*D, passed (a)..(c), is skippable */
1157     }
1158 
1159     /* NF*C/FCC, passed (a)..(e) */
1160     if((norm32&_NORM_QC_NFD)==0) {
1161         return TRUE; /* no canonical decomposition, is skippable */
1162     }
1163 
1164     /* check Hangul syllables algorithmically */
1165     if(isNorm32HangulOrJamo(norm32)) {
1166         /* Jamo passed (a)..(e) above, must be Hangul */
1167         return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1168     }
1169 
1170     /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1171     /* NF*C, test (f) flag */
1172     if(!formatVersion_2_2 || auxTrie.index==NULL) {
1173         return FALSE; /* no (f) data, say not skippable to be safe */
1174     }
1175 
1176     aux=UTRIE2_GET16(&auxTrie, c);
1177     return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1178 
1179     /* } else { FCC, test fcd<=1 instead of the above } */
1180 }
1181 
1182 U_CAPI void U_EXPORT2
1183 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
1184     UChar c;
1185 
1186     if(!_haveData(*pErrorCode)) {
1187         return;
1188     }
1189 
1190     /* add the start code point of each same-value range of each trie */
1191     utrie2_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1192     if(fcdTrie.index!=NULL) {
1193         utrie2_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1194     }
1195     if(auxTrie.index!=NULL) {
1196         utrie2_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1197     }
1198 
1199     /* add Hangul LV syllables and LV+1 because of skippables */
1200     for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1201         sa->add(sa->set, c);
1202         sa->add(sa->set, c+1);
1203     }
1204     sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1205 }
1206 
1207 U_CFUNC UNormalizationCheckResult U_EXPORT2
1208 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1209     static const uint32_t qcMask[UNORM_MODE_COUNT]={
1210         0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1211     };
1212 
1213     uint32_t norm32;
1214 
1215 #if !UNORM_HARDCODE_DATA
1216     UErrorCode errorCode=U_ZERO_ERROR;
1217     if(!_haveData(errorCode)) {
1218         return UNORM_YES;
1219     }
1220 #endif
1221 
1222     norm32=UTRIE2_GET32(&normTrie, c);
1223     norm32&=qcMask[mode];
1224 
1225     if(norm32==0) {
1226         return UNORM_YES;
1227     } else if(norm32&_NORM_QC_ANY_NO) {
1228         return UNORM_NO;
1229     } else /* _NORM_QC_ANY_MAYBE */ {
1230         return UNORM_MAYBE;
1231     }
1232 }
1233 
1234 U_CFUNC uint16_t U_EXPORT2
1235 unorm_getFCD16FromCodePoint(UChar32 c) {
1236 #if !UNORM_HARDCODE_DATA
1237     UErrorCode errorCode;
1238     errorCode=U_ZERO_ERROR;
1239 #endif
1240 
1241     if(
1242 #if !UNORM_HARDCODE_DATA
1243         !_haveData(errorCode) ||
1244 #endif
1245         fcdTrie.index==NULL
1246     ) {
1247         return 0;
1248     }
1249     return UTRIE2_GET16(&fcdTrie, c);
1250 }
1251 
1252 /* reorder UTF-16 in-place -------------------------------------------------- */
1253 
1254 /*
1255  * simpler, single-character version of _mergeOrdered() -
1256  * bubble-insert one single code point into the preceding string
1257  * which is already canonically ordered
1258  * (c, c2) may or may not yet have been inserted at [current..p[
1259  *
1260  * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1261  *
1262  * before: [start..current[ is already ordered, and
1263  *         [current..p[     may or may not hold (c, c2) but
1264  *                          must be exactly the same length as (c, c2)
1265  * after: [start..p[ is ordered
1266  *
1267  * returns the trailing combining class
1268  */
1269 static uint8_t
1270 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1271                UChar c, UChar c2, uint8_t cc) {
1272     const UChar *pBack, *pPreBack;
1273     UChar *r;
1274     uint8_t prevCC, trailCC=cc;
1275 
1276     if(start<current && cc!=0) {
1277         /* search for the insertion point where cc>=prevCC */
1278         pPreBack=pBack=current;
1279         prevCC=_getPrevCC(start, pPreBack);
1280         if(cc<prevCC) {
1281             /* this will be the last code point, so keep its cc */
1282             trailCC=prevCC;
1283             pBack=pPreBack;
1284             while(start<pPreBack) {
1285                 prevCC=_getPrevCC(start, pPreBack);
1286                 if(cc>=prevCC) {
1287                     break;
1288                 }
1289                 pBack=pPreBack;
1290             }
1291 
1292             /*
1293              * this is where we are right now with all these pointers:
1294              * [start..pPreBack[ 0..? code points that we can ignore
1295              * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1296              * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
1297              * [current..p[         1 code point (c, c2) with cc
1298              */
1299 
1300             /* move the code units in between up */
1301             r=p;
1302             do {
1303                 *--r=*--current;
1304             } while(pBack!=current);
1305         }
1306     }
1307 
1308     /* insert (c, c2) */
1309     *current=c;
1310     if(c2!=0) {
1311         *(current+1)=c2;
1312     }
1313 
1314     /* we know the cc of the last code point */
1315     return trailCC;
1316 }
1317 
1318 /*
1319  * merge two UTF-16 string parts together
1320  * to canonically order (order by combining classes) their concatenation
1321  *
1322  * the two strings may already be adjacent, so that the merging is done in-place
1323  * if the two strings are not adjacent, then the buffer holding the first one
1324  * must be large enough
1325  * the second string may or may not be ordered in itself
1326  *
1327  * before: [start..current[ is already ordered, and
1328  *         [next..limit[    may be ordered in itself, but
1329  *                          is not in relation to [start..current[
1330  * after: [start..current+(limit-next)[ is ordered
1331  *
1332  * the algorithm is a simple bubble-sort that takes the characters from *next++
1333  * and inserts them in correct combining class order into the preceding part
1334  * of the string
1335  *
1336  * since this function is called much less often than the single-code point
1337  * _insertOrdered(), it just uses that for easier maintenance
1338  * (see file version from before 2001aug31 for a more optimized version)
1339  *
1340  * returns the trailing combining class
1341  */
1342 static uint8_t
1343 _mergeOrdered(UChar *start, UChar *current,
1344               const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1345     UChar *r;
1346     UChar c, c2;
1347     uint8_t cc, trailCC=0;
1348     UBool adjacent;
1349 
1350     adjacent= current==next;
1351 
1352     if(start!=current || !isOrdered) {
1353         while(next<limit) {
1354             cc=_getNextCC(next, limit, c, c2);
1355             if(cc==0) {
1356                 /* does not bubble back */
1357                 trailCC=0;
1358                 if(adjacent) {
1359                     current=(UChar *)next;
1360                 } else {
1361                     *current++=c;
1362                     if(c2!=0) {
1363                         *current++=c2;
1364                     }
1365                 }
1366                 if(isOrdered) {
1367                     break;
1368                 } else {
1369                     start=current;
1370                 }
1371             } else {
1372                 r=current+(c2==0 ? 1 : 2);
1373                 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1374                 current=r;
1375             }
1376         }
1377     }
1378 
1379     if(next==limit) {
1380         /* we know the cc of the last code point */
1381         return trailCC;
1382     } else {
1383         if(!adjacent) {
1384             /* copy the second string part */
1385             do {
1386                 *current++=*next++;
1387             } while(next!=limit);
1388             limit=current;
1389         }
1390         return _getPrevCC(start, limit);
1391     }
1392 }
1393 
1394 /* find the last true starter in [start..src[ and return the pointer to it */
1395 static const UChar *
1396 _findPreviousStarter(const UChar *start, const UChar *src,
1397                      uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1398     uint32_t norm32;
1399     UChar c, c2;
1400 
1401     while(start<src) {
1402         norm32=_getPrevNorm32(start, src, minNoMaybe, c, c2);
1403         if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1404             break;
1405         }
1406     }
1407     return src;
1408 }
1409 
1410 /* find the first true starter in [src..limit[ and return the pointer to it */
1411 static const UChar *
1412 _findNextStarter(const UChar *src, const UChar *limit,
1413                  uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1414     const UChar *p;
1415     uint32_t norm32, ccOrQCMask;
1416     int32_t length;
1417     UChar c, c2;
1418     uint8_t cc, trailCC;
1419 
1420     ccOrQCMask=_NORM_CC_MASK|qcMask;
1421 
1422     for(;;) {
1423         if(src==limit) {
1424             break; /* end of string */
1425         }
1426         c=*src;
1427         if(c<minNoMaybe) {
1428             break; /* catches NUL terminater, too */
1429         }
1430 
1431         norm32=_getNorm32(c);
1432         if((norm32&ccOrQCMask)==0) {
1433             break; /* true starter */
1434         }
1435 
1436         if(U16_IS_LEAD(c)) {
1437             /* c is a lead surrogate, get the real norm32 */
1438             if((src+1)==limit || !U16_IS_TRAIL(c2=*(src+1))) {
1439                 break; /* unmatched first surrogate: counts as a true starter */
1440             }
1441             norm32=_getNorm32FromSurrogatePair(c, c2);
1442 
1443             if((norm32&ccOrQCMask)==0) {
1444                 break; /* true starter */
1445             }
1446         } else {
1447             c2=0;
1448         }
1449 
1450         /* (c, c2) is not a true starter but its decomposition may be */
1451         if(norm32&decompQCMask) {
1452             /* (c, c2) decomposes, get everything from the variable-length extra data */
1453             p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1454 
1455             /* get the first character's norm32 to check if it is a true starter */
1456             if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1457                 break; /* true starter */
1458             }
1459         }
1460 
1461         src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1462     }
1463 
1464     return src;
1465 }
1466 
1467 /* make NFD & NFKD ---------------------------------------------------------- */
1468 
1469 U_CAPI int32_t U_EXPORT2
1470 unorm_getDecomposition(UChar32 c, UBool compat,
1471                        UChar *dest, int32_t destCapacity) {
1472 #if !UNORM_HARDCODE_DATA
1473     UErrorCode errorCode=U_ZERO_ERROR;
1474 #endif
1475     if( (uint32_t)c<=0x10ffff &&
1476 #if !UNORM_HARDCODE_DATA
1477         _haveData(errorCode) &&
1478 #endif
1479         ((dest!=NULL && destCapacity>0) || destCapacity==0)
1480     ) {
1481         uint32_t norm32, qcMask;
1482         UChar32 minNoMaybe;
1483         int32_t length;
1484 
1485         /* initialize */
1486         if(!compat) {
1487             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1488             qcMask=_NORM_QC_NFD;
1489         } else {
1490             minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1491             qcMask=_NORM_QC_NFKD;
1492         }
1493 
1494         if(c<minNoMaybe) {
1495             /* trivial case */
1496             if(destCapacity>0) {
1497                 dest[0]=(UChar)c;
1498             }
1499             return -1;
1500         }
1501 
1502         /* data lookup */
1503         norm32=UTRIE2_GET32(&normTrie, c);
1504         if((norm32&qcMask)==0) {
1505             /* simple case: no decomposition */
1506             if(c<=0xffff) {
1507                 if(destCapacity>0) {
1508                     dest[0]=(UChar)c;
1509                 }
1510                 return -1;
1511             } else {
1512                 if(destCapacity>=2) {
1513                     dest[0]=UTF16_LEAD(c);
1514                     dest[1]=UTF16_TRAIL(c);
1515                 }
1516                 return -2;
1517             }
1518         } else if(isNorm32HangulOrJamo(norm32)) {
1519             /* Hangul syllable: decompose algorithmically */
1520             UChar c2;
1521 
1522             c-=HANGUL_BASE;
1523 
1524             c2=(UChar)(c%JAMO_T_COUNT);
1525             c/=JAMO_T_COUNT;
1526             if(c2>0) {
1527                 if(destCapacity>=3) {
1528                     dest[2]=(UChar)(JAMO_T_BASE+c2);
1529                 }
1530                 length=3;
1531             } else {
1532                 length=2;
1533             }
1534 
1535             if(destCapacity>=2) {
1536                 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1537                 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1538             }
1539             return length;
1540         } else {
1541             /* c decomposes, get everything from the variable-length extra data */
1542             const UChar *p, *limit;
1543             uint8_t cc, trailCC;
1544 
1545             p=_decompose(norm32, qcMask, length, cc, trailCC);
1546             if(length<=destCapacity) {
1547                 limit=p+length;
1548                 do {
1549                     *dest++=*p++;
1550                 } while(p<limit);
1551             }
1552             return length;
1553         }
1554     } else {
1555         return 0;
1556     }
1557 }
1558 
1559 static int32_t
1560 _decompose(UChar *dest, int32_t destCapacity,
1561            const UChar *src, int32_t srcLength,
1562            UBool compat, const UnicodeSet *nx,
1563            uint8_t &outTrailCC) {
1564     UChar buffer[3];
1565     const UChar *limit, *prevSrc, *p;
1566     uint32_t norm32, ccOrQCMask, qcMask;
1567     int32_t destIndex, reorderStartIndex, length;
1568     UChar c, c2, minNoMaybe;
1569     uint8_t cc, prevCC, trailCC;
1570 
1571     if(!compat) {
1572         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1573         qcMask=_NORM_QC_NFD;
1574     } else {
1575         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1576         qcMask=_NORM_QC_NFKD;
1577     }
1578 
1579     /* initialize */
1580     ccOrQCMask=_NORM_CC_MASK|qcMask;
1581     destIndex=reorderStartIndex=0;
1582     prevCC=0;
1583 
1584     /* avoid compiler warnings */
1585     norm32=0;
1586     c=0;
1587     cc=0;
1588     trailCC=0;
1589 
1590     if(srcLength>=0) {
1591         /* string with length */
1592         limit=src+srcLength;
1593     } else /* srcLength==-1 */ {
1594         /* zero-terminated string */
1595         limit=NULL;
1596     }
1597 
1598     U_ALIGN_CODE(16);
1599 
1600     for(;;) {
1601         /* count code units below the minimum or with irrelevant data for the quick check */
1602         prevSrc=src;
1603         if(limit==NULL) {
1604             while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1605                 prevCC=0;
1606                 ++src;
1607             }
1608         } else {
1609             while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1610                 prevCC=0;
1611                 ++src;
1612             }
1613         }
1614 
1615         /* copy these code units all at once */
1616         if(src!=prevSrc) {
1617             length=(int32_t)(src-prevSrc);
1618             if((destIndex+length)<=destCapacity) {
1619                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1620             }
1621             destIndex+=length;
1622             reorderStartIndex=destIndex;
1623         }
1624 
1625         /* end of source reached? */
1626         if(limit==NULL ? c==0 : src==limit) {
1627             break;
1628         }
1629 
1630         /* c already contains *src and norm32 is set for it, increment src */
1631         ++src;
1632 
1633         /* check one above-minimum, relevant code unit */
1634         /*
1635          * generally, set p and length to the decomposition string
1636          * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1637          * in all cases, set cc to the lead and trailCC to the trail combining class
1638          *
1639          * the following merge-sort of the current character into the preceding,
1640          * canonically ordered result text will use the optimized _insertOrdered()
1641          * if there is only one single code point to process;
1642          * this is indicated with p==NULL, and (c, c2) is the character to insert
1643          * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1644          * for a supplementary character)
1645          * otherwise, p[length] is merged in with _mergeOrdered()
1646          */
1647         if(isNorm32HangulOrJamo(norm32)) {
1648             if(nx_contains(nx, c)) {
1649                 c2=0;
1650                 p=NULL;
1651                 length=1;
1652             } else {
1653                 /* Hangul syllable: decompose algorithmically */
1654                 p=buffer;
1655                 cc=trailCC=0;
1656 
1657                 c-=HANGUL_BASE;
1658 
1659                 c2=(UChar)(c%JAMO_T_COUNT);
1660                 c/=JAMO_T_COUNT;
1661                 if(c2>0) {
1662                     buffer[2]=(UChar)(JAMO_T_BASE+c2);
1663                     length=3;
1664                 } else {
1665                     length=2;
1666                 }
1667 
1668                 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1669                 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1670             }
1671         } else {
1672             if(isNorm32Regular(norm32)) {
1673                 c2=0;
1674                 length=1;
1675             } else {
1676                 /* c is a lead surrogate, get the real norm32 */
1677                 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1678                     ++src;
1679                     length=2;
1680                     norm32=_getNorm32FromSurrogatePair(c, c2);
1681                 } else {
1682                     c2=0;
1683                     length=1;
1684                     norm32=0;
1685                 }
1686             }
1687 
1688             /* get the decomposition and the lead and trail cc's */
1689             if(nx_contains(nx, c, c2)) {
1690                 /* excluded: norm32==0 */
1691                 cc=trailCC=0;
1692                 p=NULL;
1693             } else if((norm32&qcMask)==0) {
1694                 /* c does not decompose */
1695                 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1696                 p=NULL;
1697             } else {
1698                 /* c decomposes, get everything from the variable-length extra data */
1699                 p=_decompose(norm32, qcMask, length, cc, trailCC);
1700                 if(length==1) {
1701                     /* fastpath a single code unit from decomposition */
1702                     c=*p;
1703                     c2=0;
1704                     p=NULL;
1705                 }
1706             }
1707         }
1708 
1709         /* append the decomposition to the destination buffer, assume length>0 */
1710         if((destIndex+length)<=destCapacity) {
1711             UChar *reorderSplit=dest+destIndex;
1712             if(p==NULL) {
1713                 /* fastpath: single code point */
1714                 if(cc!=0 && cc<prevCC) {
1715                     /* (c, c2) is out of order with respect to the preceding text */
1716                     destIndex+=length;
1717                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1718                 } else {
1719                     /* just append (c, c2) */
1720                     dest[destIndex++]=c;
1721                     if(c2!=0) {
1722                         dest[destIndex++]=c2;
1723                     }
1724                 }
1725             } else {
1726                 /* general: multiple code points (ordered by themselves) from decomposition */
1727                 if(cc!=0 && cc<prevCC) {
1728                     /* the decomposition is out of order with respect to the preceding text */
1729                     destIndex+=length;
1730                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1731                 } else {
1732                     /* just append the decomposition */
1733                     do {
1734                         dest[destIndex++]=*p++;
1735                     } while(--length>0);
1736                 }
1737             }
1738         } else {
1739             /* buffer overflow */
1740             /* keep incrementing the destIndex for preflighting */
1741             destIndex+=length;
1742         }
1743 
1744         prevCC=trailCC;
1745         if(prevCC==0) {
1746             reorderStartIndex=destIndex;
1747         }
1748     }
1749 
1750     outTrailCC=prevCC;
1751     return destIndex;
1752 }
1753 
1754 U_CAPI int32_t U_EXPORT2
1755 unorm_decompose(UChar *dest, int32_t destCapacity,
1756                 const UChar *src, int32_t srcLength,
1757                 UBool compat, int32_t options,
1758                 UErrorCode *pErrorCode) {
1759     const UnicodeSet *nx;
1760     int32_t destIndex;
1761     uint8_t trailCC;
1762 
1763     if(!_haveData(*pErrorCode)) {
1764         return 0;
1765     }
1766 
1767     nx=getNX(options, *pErrorCode);
1768     if(U_FAILURE(*pErrorCode)) {
1769         return 0;
1770     }
1771 
1772     destIndex=_decompose(dest, destCapacity,
1773                          src, srcLength,
1774                          compat, nx,
1775                          trailCC);
1776 
1777     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1778 }
1779 
1780 /* make NFC & NFKC ---------------------------------------------------------- */
1781 
1782 /* get the composition properties of the next character */
1783 static inline uint32_t
1784 _getNextCombining(UChar *&p, const UChar *limit,
1785                   UChar &c, UChar &c2,
1786                   uint16_t &combiningIndex, uint8_t &cc,
1787                   const UnicodeSet *nx) {
1788     uint32_t norm32, combineFlags;
1789 
1790     /* get properties */
1791     c=*p++;
1792     norm32=_getNorm32(c);
1793 
1794     /* preset output values for most characters */
1795     c2=0;
1796     combiningIndex=0;
1797     cc=0;
1798 
1799     if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1800         return 0;
1801     } else {
1802         if(isNorm32Regular(norm32)) {
1803             /* set cc etc. below */
1804         } else if(isNorm32HangulOrJamo(norm32)) {
1805             /* a compatibility decomposition contained Jamos */
1806             combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1807             return norm32&_NORM_COMBINES_ANY;
1808         } else {
1809             /* c is a lead surrogate, get the real norm32 */
1810             if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1811                 ++p;
1812                 norm32=_getNorm32FromSurrogatePair(c, c2);
1813             } else {
1814                 c2=0;
1815                 return 0;
1816             }
1817         }
1818 
1819         if(nx_contains(nx, c, c2)) {
1820             return 0; /* excluded: norm32==0 */
1821         }
1822 
1823         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1824 
1825         combineFlags=norm32&_NORM_COMBINES_ANY;
1826         if(combineFlags!=0) {
1827             combiningIndex=*(_getExtraData(norm32)-1);
1828         }
1829         return combineFlags;
1830     }
1831 }
1832 
1833 /*
1834  * given a composition-result starter (c, c2) - which means its cc==0,
1835  * it combines forward, it has extra data, its norm32!=0,
1836  * it is not a Hangul or Jamo,
1837  * get just its combineFwdIndex
1838  *
1839  * norm32(c) is special if and only if c2!=0
1840  */
1841 static inline uint16_t
1842 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1843     uint32_t norm32;
1844 
1845     if(c2==0) {
1846         norm32=_getNorm32(c);
1847     } else {
1848         norm32=_getNorm32FromSurrogatePair(c, c2);
1849     }
1850     return *(_getExtraData(norm32)-1);
1851 }
1852 
/*
 * Look up whether a forward-combining character and a backward-combining
 * character compose.
 *
 * table             the forward-combining character's part of combiningTable[]
 * combineBackIndex  identifies the backward-combining character
 * value, value2     on success, receive the code unit(s) of the composition;
 *                   value2==0 for a BMP result; untouched on failure
 *
 * Return value:
 * 0    the two characters do not combine
 * 1    they combine
 * >1   they combine, and the composition is itself a forward-combining starter
 *
 * See unormimp.h for a description of the composition table format.
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    /*
     * Walk the starter's list until an entry key is not less than the index.
     * Bit 15 of a key marks the last entry, which makes that key
     * not less than any index and so ends the walk.
     */
    uint16_t entryKey=*table++;
    while(entryKey<combineBackIndex) {
        /* step over this entry's result: two units if its bit 15 is set, else one */
        table+= (*table&0x8000)!=0 ? 2 : 1;
        entryKey=*table++;
    }

    /* compare without bit 15, the last-entry-in-the-list flag */
    if((entryKey&0x7fff)!=combineBackIndex) {
        /* not found */
        return 0;
    }

    /* found: table now points to the variable-length result value */
    const uint16_t lead=*table;

    /* bit 13 tells whether the composition is a starter that combines forward */
    const uint16_t outcome=(uint16_t)((lead&0x2000)+1);

    if((lead&0x8000)==0) {
        /* BMP composition result U+0000..U+1fff, stored inline */
        value=(uint16_t)(lead&0x1fff);
        value2=0;
    } else if((lead&0x4000)==0) {
        /* BMP composition result U+2000..U+ffff, stored in the second unit */
        value=*(table+1);
        value2=0;
    } else {
        /* surrogate pair composition result */
        value=(uint16_t)((lead&0x3ff)|0xd800);
        value2=*(table+1);
    }

    return outcome;
}
1915 
1916 static inline UBool
1917 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1918                UBool compat, UChar *dest, const UnicodeSet *nx) {
1919     if(isJamoVTNorm32JamoV(norm32)) {
1920         /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1921         prev=(UChar)(prev-JAMO_L_BASE);
1922         if(prev<JAMO_L_COUNT) {
1923             c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1924 
1925             /* check if the next character is a Jamo T (normal or compatibility) */
1926             if(src!=limit) {
1927                 UChar next, t;
1928 
1929                 next=*src;
1930                 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1931                     /* normal Jamo T */
1932                     ++src;
1933                     c+=t;
1934                 } else if(compat) {
1935                     /* if NFKC, then check for compatibility Jamo T (BMP only) */
1936                     norm32=_getNorm32(next);
1937                     if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1938                         const UChar *p;
1939                         int32_t length;
1940                         uint8_t cc, trailCC;
1941 
1942                         p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1943                         if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1944                             /* compatibility Jamo T */
1945                             ++src;
1946                             c+=t;
1947                         }
1948                     }
1949                 }
1950             }
1951             if(nx_contains(nx, c)) {
1952                 if(!isHangulWithoutJamoT(c)) {
1953                     --src; /* undo ++src from reading the Jamo T */
1954                 }
1955                 return FALSE;
1956             }
1957             if(dest!=0) {
1958                 *dest=c;
1959             }
1960             return TRUE;
1961         }
1962     } else if(isHangulWithoutJamoT(prev)) {
1963         /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
1964         c=(UChar)(prev+(c-JAMO_T_BASE));
1965         if(nx_contains(nx, c)) {
1966             return FALSE;
1967         }
1968         if(dest!=0) {
1969             *dest=c;
1970         }
1971         return TRUE;
1972     }
1973     return FALSE;
1974 }
1975 
/*
 * Recompose the characters in [p..limit[ in place.
 * The text must be in NFD (decomposed and canonically ordered).
 *
 * p        start of the text; used as the local read position
 * limit    in/out: end of the text; moved back whenever code units are
 *          removed by a composition
 * options  UNORM_BEFORE_PRI_29 and _NORM_OPTIONS_COMPOSE_CONTIGUOUS
 *          are tested here
 * nx       exclusion set: excluded characters and excluded composition
 *          results do not take part in compositions
 *
 * Returns the trailing cc, that is, the combining class of the last
 * character that was read and not combined away.
 *
 * Since for NFKC we may get Jamos in decompositions, we need to
 * recompose those too.
 *
 * Note that recomposition never lengthens the text:
 * any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 */
static uint8_t
_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
    UChar *starter, *pRemove, *q, *r;
    uint32_t combineFlags;
    UChar c, c2;
    uint16_t combineFwdIndex, combineBackIndex;
    uint16_t result, value, value2;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    starter=NULL;                   /* no starter */
    combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
    combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
    value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
    starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
    prevCC=0;

    /* each iteration reads one character (c, c2); the loop is left only via return when p reaches limit */
    for(;;) {
        combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
        if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
            /* bit 15 is set in the 0xfffx indexes that _getNextCombining() assigns to Hangul/Jamo */
            if(combineBackIndex&0x8000) {
                /* c is a Jamo V/T, see if we can compose it with the previous character */
                /* for the PRI #29 fix, check that there is no intervening combining mark */
                if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
                    pRemove=NULL; /* NULL while no Hangul composition */
                    combineFlags=0;
                    c2=*starter;
                    if(combineBackIndex==0xfff2) {
                        /* Jamo V, compose with previous Jamo L and following Jamo T */
                        c2=(UChar)(c2-JAMO_L_BASE);
                        if(c2<JAMO_L_COUNT) {
                            pRemove=p-1;
                            c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
                            if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
                                ++p;
                                c+=c2;
                            } else {
                                /* the result is an LV syllable, which is a starter (unlike LVT) */
                                combineFlags=_NORM_COMBINES_FWD;
                            }
                            if(!nx_contains(nx, c)) {
                                *starter=c;
                            } else {
                                /* excluded */
                                if(!isHangulWithoutJamoT(c)) {
                                    --p; /* undo the ++p from reading the Jamo T */
                                }
                                /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
                                pRemove=NULL;
                            }
                        }

                    /*
                     * Normally, the following can not occur:
                     * Since the input is in NFD, there are no Hangul LV syllables that
                     * a Jamo T could combine with.
                     * All Jamo Ts are combined above when handling Jamo Vs.
                     *
                     * However, before the PRI #29 fix, this can occur due to
                     * an intervening combining mark between the Hangul LV and the Jamo T.
                     */
                    } else {
                        /* Jamo T, compose with previous Hangul that does not have a Jamo T */
                        if(isHangulWithoutJamoT(c2)) {
                            c2+=(UChar)(c-JAMO_T_BASE);
                            if(!nx_contains(nx, c2)) {
                                pRemove=p-1;
                                *starter=c2;
                            }
                        }
                    }

                    if(pRemove!=NULL) {
                        /* remove the Jamo(s) */
                        /* close the gap [pRemove..p[ by moving [p..limit[ down, then resume reading at pRemove */
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        p=pRemove;
                        limit=q;
                    }

                    c2=0; /* c2 held *starter temporarily */

                    if(combineFlags!=0) {
                        /*
                         * not starter=NULL because the composition is a Hangul LV syllable
                         * and might combine once more (but only before the PRI #29 fix)
                         */

                        /* done? */
                        if(p==limit) {
                            return prevCC;
                        }

                        /* the composition is a Hangul LV syllable which is a starter that combines forward */
                        combineFwdIndex=0xfff0;

                        /* we combined; continue with looking for compositions */
                        continue;
                    }
                }

                /*
                 * now: cc==0 and the combining index does not include "forward" ->
                 * the rest of the loop body will reset starter to NULL;
                 * technically, a composed Hangul syllable is a starter, but it
                 * does not combine forward now that we have consumed all eligible Jamos;
                 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
                 */

            } else if(
                /* the starter is not a Hangul LV or Jamo V/T and */
                !(combineFwdIndex&0x8000) &&
                /* the combining mark is not blocked and */
                ((options&UNORM_BEFORE_PRI_29) ?
                    (prevCC!=cc || prevCC==0) :
                    (prevCC<cc || prevCC==0)) &&
                /* the starter and the combining mark (c, c2) do combine and */
                0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
                /* the composition result is not excluded */
                !nx_contains(nx, value, value2)
            ) {
                /* replace the starter with the composition, remove the combining mark */
                pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */

                /* replace the starter with the composition */
                *starter=(UChar)value;
                if(starterIsSupplementary) {
                    if(value2!=0) {
                        /* both are supplementary */
                        *(starter+1)=(UChar)value2;
                    } else {
                        /* the composition is shorter than the starter, move the intermediate characters forward one */
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        /* the gap to be closed below now starts one unit earlier */
                        --pRemove;
                    }
                } else if(value2!=0) {
                    /* the composition is longer than the starter, move the intermediate characters back one */
                    starterIsSupplementary=TRUE;
                    ++starter; /* temporarily increment for the loop boundary */
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=(UChar)value2;
                    --starter; /* undo the temporary increment */
                /* } else { both are on the BMP, nothing more to do */
                }

                /* remove the combining mark by moving the following text over it */
                /* pRemove==p is possible after the ++pRemove above; then there is no gap left to close */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    p=pRemove;
                    limit=q;
                }

                /* keep prevCC because we removed the combining mark */

                /* done? */
                if(p==limit) {
                    return prevCC;
                }

                /* is the composition a starter that combines forward? */
                if(result>1) {
                    combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
                } else {
                    starter=NULL;
                }

                /* we combined; continue with looking for compositions */
                continue;
            }
        }

        /* no combination this time */
        prevCC=cc;
        if(p==limit) {
            return prevCC;
        }

        /* if (c, c2) did not combine, then check if it is a starter */
        if(cc==0) {
            /* found a new starter; combineFlags==0 if (c, c2) is excluded */
            if(combineFlags&_NORM_COMBINES_FWD) {
                /* it may combine with something, prepare for it */
                if(c2==0) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
                /* for a starter, the index that _getNextCombining() returned is used as its forward index */
                combineFwdIndex=combineBackIndex;
            } else {
                /* it will not combine with anything */
                starter=NULL;
            }
        } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
            /* FCC: no discontiguous compositions; any intervening character blocks */
            starter=NULL;
        }
    }
}
2205 
2206 /* decompose and recompose [prevStarter..src[ */
2207 static const UChar *
2208 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2209              const UChar *prevStarter, const UChar *src,
2210              uint8_t &prevCC,
2211              int32_t options, const UnicodeSet *nx,
2212              UErrorCode *pErrorCode) {
2213     UChar *recomposeLimit;
2214     uint8_t trailCC;
2215     UBool compat;
2216 
2217     compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2218 
2219     /* decompose [prevStarter..src[ */
2220     length=_decompose(buffer, bufferCapacity,
2221                       prevStarter, (int32_t)(src-prevStarter),
2222                       compat, nx,
2223                       trailCC);
2224     if(length>bufferCapacity) {
2225         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2226             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2227             return NULL;
2228         }
2229         length=_decompose(buffer, bufferCapacity,
2230                           prevStarter, (int32_t)(src-prevStarter),
2231                           compat, nx,
2232                           trailCC);
2233     }
2234 
2235     /* recompose the decomposition */
2236     recomposeLimit=buffer+length;
2237     if(length>=2) {
2238         prevCC=_recompose(buffer, recomposeLimit, options, nx);
2239     }
2240 
2241     /* return with a pointer to the recomposition and its length */
2242     length=(int32_t)(recomposeLimit-buffer);
2243     return buffer;
2244 }
2245 
/*
 * Compose src into dest; the form is NFKC if _NORM_OPTIONS_COMPAT is set
 * in options, otherwise NFC.
 *
 * dest, destCapacity   output buffer; writes are guarded by capacity checks,
 *                      while destIndex keeps counting for preflighting
 * src, srcLength       input text; a negative srcLength means that src is
 *                      NUL-terminated
 * options              _NORM_OPTIONS_COMPAT is tested here; the options are
 *                      also passed on to _composePart()
 * nx                   exclusion set, tested with nx_contains()
 * pErrorCode           passed to _composePart(), which sets
 *                      U_MEMORY_ALLOCATION_ERROR if the side buffer cannot grow
 *
 * Returns the number of code units of the result, which may exceed
 * destCapacity, or 0 if _composePart() failed.
 * dest is not terminated here.
 */
static int32_t
_compose(UChar *dest, int32_t destCapacity,
         const UChar *src, int32_t srcLength,
         int32_t options, const UnicodeSet *nx,
         UErrorCode *pErrorCode) {
    UChar stackBuffer[_STACK_BUFFER_CAPACITY];
    UChar *buffer;
    int32_t bufferCapacity;

    const UChar *limit, *prevSrc, *prevStarter;
    uint32_t norm32, ccOrQCMask, qcMask;
    int32_t destIndex, reorderStartIndex, length;
    UChar c, c2, minNoMaybe;
    uint8_t cc, prevCC;

    /* pick the fast-loop threshold and the quick check mask for NFKC or NFC */
    if(options&_NORM_OPTIONS_COMPAT) {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
        qcMask=_NORM_QC_NFKC;
    } else {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
        qcMask=_NORM_QC_NFC;
    }

    /* initialize */
    buffer=stackBuffer;
    bufferCapacity=_STACK_BUFFER_CAPACITY;

    /*
     * prevStarter points to the last character before the current one
     * that is a "true" starter with cc==0 and quick check "yes".
     *
     * prevStarter will be used instead of looking for a true starter
     * while incrementally decomposing [prevStarter..prevSrc[
     * in _composePart(). Having a good prevStarter makes it possible to
     * just decompose the entire [prevStarter..prevSrc[.
     *
     * When _composePart() backs out from prevSrc back to prevStarter,
     * then it also backs out destIndex by the same amount.
     * Therefore, at all times, the (prevSrc-prevStarter) source units
     * must correspond 1:1 to destination units counted with destIndex,
     * except for reordering.
     * This is true for the qc "yes" characters copied in the fast loop,
     * and for pure reordering.
     * prevStarter must be set forward to src when this is not true:
     * In _composePart() and after composing a Hangul syllable.
     *
     * This mechanism relies on the assumption that the decomposition of a true starter
     * also begins with a true starter. gennorm/store.c checks for this.
     */
    prevStarter=src;

    ccOrQCMask=_NORM_CC_MASK|qcMask;
    destIndex=reorderStartIndex=0;
    prevCC=0;

    /* avoid compiler warnings */
    norm32=0;
    c=0;

    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* count code units below the minimum or with irrelevant data for the quick check */
        prevSrc=src;
        if(limit==NULL) {
            /* below minNoMaybe only the terminating NUL stops the loop; above it, any cc or quick check flag does */
            while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
                prevCC=0;
                ++src;
            }
        } else {
            while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
                prevCC=0;
                ++src;
            }
        }

        /* copy these code units all at once */
        if(src!=prevSrc) {
            length=(int32_t)(src-prevSrc);
            if((destIndex+length)<=destCapacity) {
                uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
            }
            destIndex+=length;
            reorderStartIndex=destIndex;

            /* set prevStarter to the last character in the quick check loop */
            prevStarter=src-1;
            /* if that is the trail unit of a surrogate pair copied in this run, point to its lead unit */
            if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
                --prevStarter;
            }

            prevSrc=src;
        }

        /* end of source reached? */
        if(limit==NULL ? c==0 : src==limit) {
            break;
        }

        /* c already contains *src and norm32 is set for it, increment src */
        ++src;

        /*
         * source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c, c2)       processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * start         prevStarter   prevSrc       src           limit
         *
         *
         * destination buffer pointers and indexes:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * dest      reorderStartIndex destIndex     destCapacity
         */

        /* check one above-minimum, relevant code unit */
        /*
         * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
         * check for Jamo V/T, then for surrogates and regular characters
         * c is not a Hangul syllable or Jamo L because
         * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
         */
        if(isNorm32HangulOrJamo(norm32)) {
            /*
             * c is a Jamo V/T:
             * try to compose with the previous character, Jamo V also with a following Jamo T,
             * and set values here right now in case we just continue with the main loop
             */
            prevCC=cc=0;
            reorderStartIndex=destIndex;

            /*
             * the syllable overwrites the previous destination unit, which
             * is why destIndex does not change; a null dest pointer is passed
             * when that unit does not fit into the destination buffer
             */
            if(
                destIndex>0 &&
                _composeHangul(
                    *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
                    destIndex<=destCapacity ? dest+(destIndex-1) : 0,
                    nx)
            ) {
                prevStarter=src;
                continue;
            }

            /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
            c2=0;
            length=1;
            prevStarter=prevSrc;
        } else {
            if(isNorm32Regular(norm32)) {
                c2=0;
                length=1;
            } else {
                /* c is a lead surrogate, get the real norm32 */
                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(c, c2);
                } else {
                    /* c is an unpaired lead surrogate, nothing to do */
                    c2=0;
                    length=1;
                    norm32=0;
                }
            }

            /* we are looking at the character (c, c2) at [prevSrc..src[ */
            if(nx_contains(nx, c, c2)) {
                /* excluded: norm32==0 */
                cc=0;
            } else if((norm32&qcMask)==0) {
                /* quick check "yes": only the combining class matters, for ordering below */
                cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
            } else {
                const UChar *p;
                uint32_t decompQCMask;

                /*
                 * find appropriate boundaries around this character,
                 * decompose the source text from between the boundaries,
                 * and recompose it
                 *
                 * this puts the intermediate text into the side buffer because
                 * it might be longer than the recomposition end result,
                 * or the destination buffer may be too short or missing
                 *
                 * note that destIndex may be adjusted backwards to account
                 * for source text that passed the quick check but needed to
                 * take part in the recomposition
                 */
                decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */

                /*
                 * find the last true starter in [prevStarter..src[
                 * it is either the decomposition of the current character (at prevSrc),
                 * or prevStarter
                 */
                if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
                    prevStarter=prevSrc;
                } else {
                    /* adjust destIndex: back out what had been copied with qc "yes" */
                    destIndex-=(int32_t)(prevSrc-prevStarter);
                }

                /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
                src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);

                /* compose [prevStarter..src[ */
                p=_composePart(stackBuffer, buffer, bufferCapacity,
                               length,          /* output */
                               prevStarter, src,
                               prevCC,          /* output */
                               options, nx,
                               pErrorCode);

                if(p==NULL) {
                    destIndex=0;   /* an error occurred (out of memory) */
                    break;
                }

                /* append the recomposed buffer contents to the destination buffer */
                if((destIndex+length)<=destCapacity) {
                    while(length>0) {
                        dest[destIndex++]=*p++;
                        --length;
                    }
                } else {
                    /* buffer overflow */
                    /* keep incrementing the destIndex for preflighting */
                    destIndex+=length;
                }

                /* set the next starter */
                prevStarter=src;

                continue;
            }
        }

        /* append the single code point (c, c2) to the destination buffer */
        if((destIndex+length)<=destCapacity) {
            if(cc!=0 && cc<prevCC) {
                /* (c, c2) is out of order with respect to the preceding text */
                UChar *reorderSplit=dest+destIndex;
                destIndex+=length;
                prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
            } else {
                /* just append (c, c2) */
                dest[destIndex++]=c;
                if(c2!=0) {
                    dest[destIndex++]=c2;
                }
                prevCC=cc;
            }
        } else {
            /* buffer overflow */
            /* keep incrementing the destIndex for preflighting */
            destIndex+=length;
            prevCC=cc;
        }
    }

    /* cleanup */
    /* the side buffer was replaced by u_growBufferFromStatic() in _composePart() if it differs from stackBuffer */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return destIndex;
}
2529 
2530 U_CAPI int32_t U_EXPORT2
2531 unorm_compose(UChar *dest, int32_t destCapacity,
2532               const UChar *src, int32_t srcLength,
2533               UBool compat, int32_t options,
2534               UErrorCode *pErrorCode) {
2535     const UnicodeSet *nx;
2536     int32_t destIndex;
2537 
2538     if(!_haveData(*pErrorCode)) {
2539         return 0;
2540     }
2541 
2542     nx=getNX(options, *pErrorCode);
2543     if(U_FAILURE(*pErrorCode)) {
2544         return 0;
2545     }
2546 
2547     /* reset options bits that should only be set here or inside _compose() */
2548     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2549 
2550     if(compat) {
2551         options|=_NORM_OPTIONS_COMPAT;
2552     }
2553 
2554     destIndex=_compose(dest, destCapacity,
2555                        src, srcLength,
2556                        options, nx,
2557                        pErrorCode);
2558 
2559     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2560 }
2561 
2562 /* make FCD ----------------------------------------------------------------- */
2563 
2564 static const UChar *
2565 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2566     UChar c, c2;
2567 
2568     /*
2569      * find the first position in [src..limit[ after some cc==0 according to FCD data
2570      *
2571      * at the beginning of the loop, we have fcd16 from before src
2572      *
2573      * stop at positions:
2574      * - after trail cc==0
2575      * - at the end of the source
2576      * - before lead cc==0
2577      */
2578     for(;;) {
2579         /* stop if trail cc==0 for the previous character */
2580         if((fcd16&0xff)==0) {
2581             break;
2582         }
2583 
2584         /* get c=*src - stop at end of string */
2585         if(src==limit) {
2586             break;
2587         }
2588         c=*src;
2589 
2590         /* stop if lead cc==0 for this character */
2591         if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2592             break; /* catches terminating NUL, too */
2593         }
2594 
2595         if(!UTF_IS_FIRST_SURROGATE(c)) {
2596             if(fcd16<=0xff) {
2597                 break;
2598             }
2599             ++src;
2600         } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2601             /* c is a lead surrogate, get the real fcd16 */
2602             fcd16=_getFCD16FromSurrogatePair(c, c2);
2603             if(fcd16<=0xff) {
2604                 break;
2605             }
2606             src+=2;
2607         } else {
2608             /* c is an unpaired first surrogate, lead cc==0 */
2609             break;
2610         }
2611     }
2612 
2613     return src;
2614 }
2615 
2616 static uint8_t
2617 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2618               UChar *dest, int32_t &destIndex, int32_t destCapacity,
2619               const UnicodeSet *nx) {
2620     const UChar *p;
2621     uint32_t norm32;
2622     int32_t reorderStartIndex, length;
2623     UChar c, c2;
2624     uint8_t cc, prevCC, trailCC;
2625 
2626     /*
2627      * canonically decompose [src..decompLimit[
2628      *
2629      * all characters in this range have some non-zero cc,
2630      * directly or in decomposition,
2631      * so that we do not need to check in the following for quick-check limits etc.
2632      *
2633      * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2634      *
2635      * we also do not need to check for c==0 because we have an established decompLimit
2636      */
2637     reorderStartIndex=destIndex;
2638     prevCC=0;
2639 
2640     while(src<decompLimit) {
2641         c=*src++;
2642         norm32=_getNorm32(c);
2643         if(isNorm32Regular(norm32)) {
2644             c2=0;
2645             length=1;
2646         } else {
2647             /*
2648              * reminder: this function is called with [src..decompLimit[
2649              * not containing any Hangul/Jamo characters,
2650              * therefore the only specials are lead surrogates
2651              */
2652             /* c is a lead surrogate, get the real norm32 */
2653             if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2654                 ++src;
2655                 length=2;
2656                 norm32=_getNorm32FromSurrogatePair(c, c2);
2657             } else {
2658                 c2=0;
2659                 length=1;
2660                 norm32=0;
2661             }
2662         }
2663 
2664         /* get the decomposition and the lead and trail cc's */
2665         if(nx_contains(nx, c, c2)) {
2666             /* excluded: norm32==0 */
2667             cc=trailCC=0;
2668             p=NULL;
2669         } else if((norm32&_NORM_QC_NFD)==0) {
2670             /* c does not decompose */
2671             cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2672             p=NULL;
2673         } else {
2674             /* c decomposes, get everything from the variable-length extra data */
2675             p=_decompose(norm32, length, cc, trailCC);
2676             if(length==1) {
2677                 /* fastpath a single code unit from decomposition */
2678                 c=*p;
2679                 c2=0;
2680                 p=NULL;
2681             }
2682         }
2683 
2684         /* append the decomposition to the destination buffer, assume length>0 */
2685         if((destIndex+length)<=destCapacity) {
2686             UChar *reorderSplit=dest+destIndex;
2687             if(p==NULL) {
2688                 /* fastpath: single code point */
2689                 if(cc!=0 && cc<prevCC) {
2690                     /* (c, c2) is out of order with respect to the preceding text */
2691                     destIndex+=length;
2692                     trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2693                 } else {
2694                     /* just append (c, c2) */
2695                     dest[destIndex++]=c;
2696                     if(c2!=0) {
2697                         dest[destIndex++]=c2;
2698                     }
2699                 }
2700             } else {
2701                 /* general: multiple code points (ordered by themselves) from decomposition */
2702                 if(cc!=0 && cc<prevCC) {
2703                     /* the decomposition is out of order with respect to the preceding text */
2704                     destIndex+=length;
2705                     trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2706                 } else {
2707                     /* just append the decomposition */
2708                     do {
2709                         dest[destIndex++]=*p++;
2710                     } while(--length>0);
2711                 }
2712             }
2713         } else {
2714             /* buffer overflow */
2715             /* keep incrementing the destIndex for preflighting */
2716             destIndex+=length;
2717         }
2718 
2719         prevCC=trailCC;
2720         if(prevCC==0) {
2721             reorderStartIndex=destIndex;
2722         }
2723     }
2724 
2725     return prevCC;
2726 }
2727 
2728 static int32_t
2729 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2730               const UChar *src, int32_t srcLength,
2731               const UnicodeSet *nx,
2732               UErrorCode *pErrorCode) {
2733     const UChar *limit, *prevSrc, *decompStart;
2734     int32_t destIndex, length;
2735     UChar c, c2;
2736     uint16_t fcd16;
2737     int16_t prevCC, cc;
2738 
2739     if(!_haveData(*pErrorCode)) {
2740         return 0;
2741     }
2742 
2743     /* initialize */
2744     decompStart=src;
2745     destIndex=0;
2746     prevCC=0;
2747 
2748     /* avoid compiler warnings */
2749     c=0;
2750     fcd16=0;
2751 
2752     if(srcLength>=0) {
2753         /* string with length */
2754         limit=src+srcLength;
2755     } else /* srcLength==-1 */ {
2756         /* zero-terminated string */
2757         limit=NULL;
2758     }
2759 
2760     U_ALIGN_CODE(16);
2761 
2762     for(;;) {
2763         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2764         prevSrc=src;
2765         if(limit==NULL) {
2766             for(;;) {
2767                 c=*src;
2768                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2769                     if(c==0) {
2770                         break;
2771                     }
2772                     prevCC=(int16_t)-c;
2773                 } else if((fcd16=_getFCD16(c))==0) {
2774                     prevCC=0;
2775                 } else {
2776                     break;
2777                 }
2778                 ++src;
2779             }
2780         } else {
2781             for(;;) {
2782                 if(src==limit) {
2783                     break;
2784                 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2785                     prevCC=(int16_t)-c;
2786                 } else if((fcd16=_getFCD16(c))==0) {
2787                     prevCC=0;
2788                 } else {
2789                     break;
2790                 }
2791                 ++src;
2792             }
2793         }
2794 
2795         /*
2796          * prevCC has values from the following ranges:
2797          * 0..0xff - the previous trail combining class
2798          * <0      - the negative value of the previous code unit;
2799          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2800          *           was deferred so that average text is checked faster
2801          */
2802 
2803         /* copy these code units all at once */
2804         if(src!=prevSrc) {
2805             length=(int32_t)(src-prevSrc);
2806             if((destIndex+length)<=destCapacity) {
2807                 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2808             }
2809             destIndex+=length;
2810             prevSrc=src;
2811 
2812             /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2813             if(prevCC<0) {
2814                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2815                 if(!nx_contains(nx, (UChar32)-prevCC)) {
2816                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2817                 } else {
2818                     prevCC=0; /* excluded: fcd16==0 */
2819                 }
2820 
2821                 /*
2822                  * set a pointer to this below-U+0300 character;
2823                  * if prevCC==0 then it will moved to after this character below
2824                  */
2825                 decompStart=prevSrc-1;
2826             }
2827         }
2828         /*
2829          * now:
2830          * prevSrc==src - used later to adjust destIndex before decomposition
2831          * prevCC>=0
2832          */
2833 
2834         /* end of source reached? */
2835         if(limit==NULL ? c==0 : src==limit) {
2836             break;
2837         }
2838 
2839         /* set a pointer to after the last source position where prevCC==0 */
2840         if(prevCC==0) {
2841             decompStart=prevSrc;
2842         }
2843 
2844         /* c already contains *src and fcd16 is set for it, increment src */
2845         ++src;
2846 
2847         /* check one above-minimum, relevant code unit */
2848         if(UTF_IS_FIRST_SURROGATE(c)) {
2849             /* c is a lead surrogate, get the real fcd16 */
2850             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2851                 ++src;
2852                 fcd16=_getFCD16FromSurrogatePair(c, c2);
2853             } else {
2854                 c2=0;
2855                 fcd16=0;
2856             }
2857         } else {
2858             c2=0;
2859         }
2860 
2861         /* we are looking at the character (c, c2) at [prevSrc..src[ */
2862         if(nx_contains(nx, c, c2)) {
2863             fcd16=0; /* excluded: fcd16==0 */
2864         }
2865 
2866         /* check the combining order, get the lead cc */
2867         cc=(int16_t)(fcd16>>8);
2868         if(cc==0 || cc>=prevCC) {
2869             /* the order is ok */
2870             if(cc==0) {
2871                 decompStart=prevSrc;
2872             }
2873             prevCC=(int16_t)(fcd16&0xff);
2874 
2875             /* just append (c, c2) */
2876             length= c2==0 ? 1 : 2;
2877             if((destIndex+length)<=destCapacity) {
2878                 dest[destIndex++]=c;
2879                 if(c2!=0) {
2880                     dest[destIndex++]=c2;
2881                 }
2882             } else {
2883                 destIndex+=length;
2884             }
2885         } else {
2886             /*
2887              * back out the part of the source that we copied already but
2888              * is now going to be decomposed;
2889              * prevSrc is set to after what was copied
2890              */
2891             destIndex-=(int32_t)(prevSrc-decompStart);
2892 
2893             /*
2894              * find the part of the source that needs to be decomposed;
2895              * to be safe and simple, decompose to before the next character with lead cc==0
2896              */
2897             src=_findSafeFCD(src, limit, fcd16);
2898 
2899             /*
2900              * the source text does not fulfill the conditions for FCD;
2901              * decompose and reorder a limited piece of the text
2902              */
2903             prevCC=_decomposeFCD(decompStart, src,
2904                                  dest, destIndex, destCapacity,
2905                                  nx);
2906             decompStart=src;
2907         }
2908     }
2909 
2910     return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2911 }
2912 
2913 /* quick check functions ---------------------------------------------------- */
2914 
2915 static UBool
2916 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2917     const UChar *limit;
2918     UChar c, c2;
2919     uint16_t fcd16;
2920     int16_t prevCC, cc;
2921 
2922     /* initialize */
2923     prevCC=0;
2924 
2925     if(srcLength>=0) {
2926         /* string with length */
2927         limit=src+srcLength;
2928     } else /* srcLength==-1 */ {
2929         /* zero-terminated string */
2930         limit=NULL;
2931     }
2932 
2933     U_ALIGN_CODE(16);
2934 
2935     for(;;) {
2936         /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2937         if(limit==NULL) {
2938             for(;;) {
2939                 c=*src++;
2940                 if(c<_NORM_MIN_WITH_LEAD_CC) {
2941                     if(c==0) {
2942                         return TRUE;
2943                     }
2944                     /*
2945                      * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2946                      * because chances are good that the next one will have
2947                      * a leading cc of 0;
2948                      * _getFCD16(-prevCC) is later called when necessary -
2949                      * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2950                      */
2951                     prevCC=(int16_t)-c;
2952                 } else if((fcd16=_getFCD16(c))==0) {
2953                     prevCC=0;
2954                 } else {
2955                     break;
2956                 }
2957             }
2958         } else {
2959             for(;;) {
2960                 if(src==limit) {
2961                     return TRUE;
2962                 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
2963                     prevCC=(int16_t)-c;
2964                 } else if((fcd16=_getFCD16(c))==0) {
2965                     prevCC=0;
2966                 } else {
2967                     break;
2968                 }
2969             }
2970         }
2971 
2972         /* check one above-minimum, relevant code unit */
2973         if(UTF_IS_FIRST_SURROGATE(c)) {
2974             /* c is a lead surrogate, get the real fcd16 */
2975             if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2976                 ++src;
2977                 fcd16=_getFCD16FromSurrogatePair(c, c2);
2978             } else {
2979                 c2=0;
2980                 fcd16=0;
2981             }
2982         } else {
2983             c2=0;
2984         }
2985 
2986         if(nx_contains(nx, c, c2)) {
2987             prevCC=0; /* excluded: fcd16==0 */
2988             continue;
2989         }
2990 
2991         /*
2992          * prevCC has values from the following ranges:
2993          * 0..0xff - the previous trail combining class
2994          * <0      - the negative value of the previous code unit;
2995          *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2996          *           was deferred so that average text is checked faster
2997          */
2998 
2999         /* check the combining order */
3000         cc=(int16_t)(fcd16>>8);
3001         if(cc!=0) {
3002             if(prevCC<0) {
3003                 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3004                 if(!nx_contains(nx, (UChar32)-prevCC)) {
3005                     prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3006                 } else {
3007                     prevCC=0; /* excluded: fcd16==0 */
3008                 }
3009             }
3010 
3011             if(cc<prevCC) {
3012                 return FALSE;
3013             }
3014         }
3015         prevCC=(int16_t)(fcd16&0xff);
3016     }
3017 }
3018 
3019 static UNormalizationCheckResult
3020 _quickCheck(const UChar *src,
3021             int32_t srcLength,
3022             UNormalizationMode mode,
3023             UBool allowMaybe,
3024             const UnicodeSet *nx,
3025             UErrorCode *pErrorCode) {
3026     UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3027     UChar *buffer;
3028     int32_t bufferCapacity;
3029 
3030     const UChar *start, *limit;
3031     uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3032     int32_t options;
3033     UChar c, c2, minNoMaybe;
3034     uint8_t cc, prevCC;
3035     UNormalizationCheckResult result;
3036 
3037     /* check arguments */
3038     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3039         return UNORM_MAYBE;
3040     }
3041 
3042     if(src==NULL || srcLength<-1) {
3043         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3044         return UNORM_MAYBE;
3045     }
3046 
3047     if(!_haveData(*pErrorCode)) {
3048         return UNORM_MAYBE;
3049     }
3050 
3051     /* check for a valid mode and set the quick check minimum and mask */
3052     switch(mode) {
3053     case UNORM_NFC:
3054         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3055         qcMask=_NORM_QC_NFC;
3056         options=0;
3057         break;
3058     case UNORM_NFKC:
3059         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3060         qcMask=_NORM_QC_NFKC;
3061         options=_NORM_OPTIONS_COMPAT;
3062         break;
3063     case UNORM_NFD:
3064         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3065         qcMask=_NORM_QC_NFD;
3066         options=0;
3067         break;
3068     case UNORM_NFKD:
3069         minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3070         qcMask=_NORM_QC_NFKD;
3071         options=_NORM_OPTIONS_COMPAT;
3072         break;
3073     case UNORM_FCD:
3074         if(fcdTrie.index==NULL) {
3075             *pErrorCode=U_UNSUPPORTED_ERROR;
3076             return UNORM_MAYBE;
3077         }
3078         return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3079     default:
3080         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3081         return UNORM_MAYBE;
3082     }
3083 
3084     /* initialize */
3085     buffer=stackBuffer;
3086     bufferCapacity=_STACK_BUFFER_CAPACITY;
3087 
3088     ccOrQCMask=_NORM_CC_MASK|qcMask;
3089     result=UNORM_YES;
3090     prevCC=0;
3091 
3092     start=src;
3093     if(srcLength>=0) {
3094         /* string with length */
3095         limit=src+srcLength;
3096     } else /* srcLength==-1 */ {
3097         /* zero-terminated string */
3098         limit=NULL;
3099     }
3100 
3101     U_ALIGN_CODE(16);
3102 
3103     for(;;) {
3104         /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3105         if(limit==NULL) {
3106             for(;;) {
3107                 c=*src++;
3108                 if(c<minNoMaybe) {
3109                     if(c==0) {
3110                         goto endloop; /* break out of outer loop */
3111                     }
3112                 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3113                     break;
3114                 }
3115                 prevCC=0;
3116             }
3117         } else {
3118             for(;;) {
3119                 if(src==limit) {
3120                     goto endloop; /* break out of outer loop */
3121                 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3122                     break;
3123                 }
3124                 prevCC=0;
3125             }
3126         }
3127 
3128         /* check one above-minimum, relevant code unit */
3129         if(U16_IS_LEAD(c)) {
3130             /* c is a lead surrogate, get the real norm32 */
3131             if(src!=limit && U16_IS_TRAIL(c2=*src)) {
3132                 ++src;
3133                 norm32=_getNorm32FromSurrogatePair(c, c2);
3134             } else {
3135                 c2=0;
3136                 norm32=0;
3137             }
3138         } else {
3139             c2=0;
3140         }
3141 
3142         if(nx_contains(nx, c, c2)) {
3143             /* excluded: norm32==0 */
3144             norm32=0;
3145         }
3146 
3147         /* check the combining order */
3148         cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3149         if(cc!=0 && cc<prevCC) {
3150             result=UNORM_NO;
3151             break;
3152         }
3153         prevCC=cc;
3154 
3155         /* check for "no" or "maybe" quick check flags */
3156         qcNorm32=norm32&qcMask;
3157         if(qcNorm32&_NORM_QC_ANY_NO) {
3158             result=UNORM_NO;
3159             break;
3160         } else if(qcNorm32!=0) {
3161             /* "maybe" can only occur for NFC and NFKC */
3162             if(allowMaybe) {
3163                 result=UNORM_MAYBE;
3164             } else {
3165                 /* normalize a section around here to see if it is really normalized or not */
3166                 const UChar *prevStarter;
3167                 uint32_t decompQCMask;
3168                 int32_t length;
3169 
3170                 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3171 
3172                 /* find the previous starter */
3173                 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3174                 if(UTF_IS_TRAIL(*prevStarter)) {
3175                     --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3176                 }
3177                 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3178 
3179                 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3180                 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3181 
3182                 /* decompose and recompose [prevStarter..src[ */
3183                 _composePart(stackBuffer, buffer, bufferCapacity,
3184                              length,
3185                              prevStarter,
3186                              src,
3187                              prevCC,
3188                              options, nx, pErrorCode);
3189                 if(U_FAILURE(*pErrorCode)) {
3190                     result=UNORM_MAYBE; /* error (out of memory) */
3191                     break;
3192                 }
3193 
3194                 /* compare the normalized version with the original */
3195                 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3196                     result=UNORM_NO; /* normalization differs */
3197                     break;
3198                 }
3199 
3200                 /* continue after the next starter */
3201             }
3202         }
3203     }
3204 endloop:
3205 
3206     if(buffer!=stackBuffer) {
3207         uprv_free(buffer);
3208     }
3209 
3210     return result;
3211 }
3212 
3213 U_CAPI UNormalizationCheckResult U_EXPORT2
3214 unorm_quickCheck(const UChar *src,
3215                  int32_t srcLength,
3216                  UNormalizationMode mode,
3217                  UErrorCode *pErrorCode) {
3218     return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3219 }
3220 
3221 U_CAPI UNormalizationCheckResult U_EXPORT2
3222 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3223                             UNormalizationMode mode, int32_t options,
3224                             UErrorCode *pErrorCode) {
3225     return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3226 }
3227 
3228 U_CFUNC UNormalizationCheckResult
3229 unorm_internalQuickCheck(const UChar *src,
3230                          int32_t srcLength,
3231                          UNormalizationMode mode,
3232                          UBool allowMaybe,
3233                          const UnicodeSet *nx,
3234                          UErrorCode *pErrorCode) {
3235     return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3236 }
3237 
3238 U_CAPI UBool U_EXPORT2
3239 unorm_isNormalized(const UChar *src, int32_t srcLength,
3240                    UNormalizationMode mode,
3241                    UErrorCode *pErrorCode) {
3242     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3243 }
3244 
3245 U_CAPI UBool U_EXPORT2
3246 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3247                               UNormalizationMode mode, int32_t options,
3248                               UErrorCode *pErrorCode) {
3249     return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3250 }
3251 
3252 /* normalize() API ---------------------------------------------------------- */
3253 
3254 /**
3255  * Internal API for normalizing.
3256  * Does not check for bad input.
3257  * Requires _haveData() to be true.
3258  * @internal
3259  */
3260 U_CFUNC int32_t
3261 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3262                               const UChar *src, int32_t srcLength,
3263                               UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3264                               UErrorCode *pErrorCode) {
3265     int32_t destLength;
3266     uint8_t trailCC;
3267 
3268     switch(mode) {
3269     case UNORM_NFD:
3270         destLength=_decompose(dest, destCapacity,
3271                               src, srcLength,
3272                               FALSE, nx, trailCC);
3273         break;
3274     case UNORM_NFKD:
3275         destLength=_decompose(dest, destCapacity,
3276                               src, srcLength,
3277                               TRUE, nx, trailCC);
3278         break;
3279     case UNORM_NFC:
3280         destLength=_compose(dest, destCapacity,
3281                             src, srcLength,
3282                             options, nx, pErrorCode);
3283         break;
3284     case UNORM_NFKC:
3285         destLength=_compose(dest, destCapacity,
3286                             src, srcLength,
3287                             options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3288         break;
3289     case UNORM_FCD:
3290         if(fcdTrie.index==NULL) {
3291             *pErrorCode=U_UNSUPPORTED_ERROR;
3292             return 0;
3293         }
3294         return unorm_makeFCD(dest, destCapacity,
3295                              src, srcLength,
3296                              nx,
3297                              pErrorCode);
3298 #if 0
3299     case UNORM_FCC:
3300         destLength=_compose(dest, destCapacity,
3301                             src, srcLength,
3302                             options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3303         break;
3304 #endif
3305     case UNORM_NONE:
3306         /* just copy the string */
3307         if(srcLength==-1) {
3308             srcLength=u_strlen(src);
3309         }
3310         if(srcLength>0 && srcLength<=destCapacity) {
3311             uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3312         }
3313         destLength=srcLength;
3314         break;
3315     default:
3316         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3317         return 0;
3318     }
3319 
3320     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3321 }
3322 
3323 /**
3324  * Internal API for normalizing.
3325  * Does not check for bad input.
3326  * @internal
3327  */
3328 U_CAPI int32_t U_EXPORT2
3329 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3330                         const UChar *src, int32_t srcLength,
3331                         UNormalizationMode mode, int32_t options,
3332                         UErrorCode *pErrorCode) {
3333     const UnicodeSet *nx;
3334 
3335     if(!_haveData(*pErrorCode)) {
3336         return 0;
3337     }
3338 
3339     nx=getNX(options, *pErrorCode);
3340     if(U_FAILURE(*pErrorCode)) {
3341         return 0;
3342     }
3343 
3344     /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3345     options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3346 
3347     return unorm_internalNormalizeWithNX(dest, destCapacity,
3348                                          src, srcLength,
3349                                          mode, options, nx,
3350                                          pErrorCode);
3351 }
3352 
3353 /** Public API for normalizing. */
3354 U_CAPI int32_t U_EXPORT2
3355 unorm_normalize(const UChar *src, int32_t srcLength,
3356                 UNormalizationMode mode, int32_t options,
3357                 UChar *dest, int32_t destCapacity,
3358                 UErrorCode *pErrorCode) {
3359     /* check argument values */
3360     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3361         return 0;
3362     }
3363 
3364     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3365         src==NULL || srcLength<-1
3366     ) {
3367         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3368         return 0;
3369     }
3370 
3371     /* check for overlapping src and destination */
3372     if( dest!=NULL &&
3373         ((src>=dest && src<(dest+destCapacity)) ||
3374          (srcLength>0 && dest>=src && dest<(src+srcLength)))
3375     ) {
3376         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3377         return 0;
3378     }
3379 
3380     return unorm_internalNormalize(dest, destCapacity,
3381                                    src, srcLength,
3382                                    mode, options,
3383                                    pErrorCode);
3384 }
3385 
3386 
3387 /* iteration functions ------------------------------------------------------ */
3388 
3389 /*
3390  * These iteration functions are the core implementations of the
3391  * Normalizer class iteration API.
3392  * They read from a UCharIterator into their own buffer
3393  * and normalize into the Normalizer iteration buffer.
3394  * Normalizer itself then iterates over its buffer until that needs to be
3395  * filled again.
3396  */
3397 
3398 /*
3399  * ### TODO:
3400  * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3401  * if iteration bounds are reached,
3402  * try to not call hasNext/hasPrevious and instead check for >=0.
3403  */
3404 
3405 /* backward iteration ------------------------------------------------------- */
3406 
3407 /*
3408  * read backwards and get norm32
3409  * return 0 if the character is <minC
3410  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3411  */
3412 static inline uint32_t
3413 _getPrevNorm32(UCharIterator &src, uint32_t minC, UChar &c, UChar &c2) {
3414     /* need src.hasPrevious() */
3415     c=(UChar)src.previous(&src);
3416     c2=0;
3417 
3418     /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3419     if(c<minC) {
3420         return 0;
3421     } else if(!UTF_IS_SURROGATE(c)) {
3422         return _getNorm32(c);
3423     } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3424         /* unpaired surrogate */
3425         return 0;
3426     } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3427         return _getNorm32FromSurrogatePair(c2, c);
3428     } else {
3429         /* unpaired second surrogate, undo the c2=src.previous() movement */
3430         src.move(&src, 1, UITER_CURRENT);
3431         c2=0;
3432         return 0;
3433     }
3434 }
3435 
3436 /*
3437  * read backwards and check if the character is a previous-iteration boundary
3438  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3439  */
3440 typedef UBool
3441 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3442 
3443 /*
3444  * for NF*D:
3445  * read backwards and check if the lead combining class is 0
3446  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3447  */
3448 static UBool
3449 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3450     return _isNFDSafe(_getPrevNorm32(src, minC, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3451 }
3452 
3453 /*
3454  * read backwards and check if the character is (or its decomposition begins with)
3455  * a "true starter" (cc==0 and NF*C_YES)
3456  * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3457  */
3458 static UBool
3459 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3460     uint32_t norm32, decompQCMask;
3461 
3462     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3463     norm32=_getPrevNorm32(src, minC, c, c2);
3464     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3465 }
3466 
3467 static int32_t
3468 _findPreviousIterationBoundary(UCharIterator &src,
3469                                IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3470                                UChar *&buffer, int32_t &bufferCapacity,
3471                                int32_t &startIndex,
3472                                UErrorCode *pErrorCode) {
3473     UChar *stackBuffer;
3474     UChar c, c2;
3475     UBool isBoundary;
3476 
3477     /* initialize */
3478     stackBuffer=buffer;
3479     startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3480 
3481     while(src.hasPrevious(&src)) {
3482         isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3483 
3484         /* always write this character to the front of the buffer */
3485         /* make sure there is enough space in the buffer */
3486         if(startIndex < (c2==0 ? 1 : 2)) {
3487             int32_t bufferLength=bufferCapacity;
3488 
3489             if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3490                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3491                 src.move(&src, 0, UITER_START);
3492                 return 0;
3493             }
3494 
3495             /* move the current buffer contents up */
3496             uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3497             startIndex+=bufferCapacity-bufferLength;
3498         }
3499 
3500         buffer[--startIndex]=c;
3501         if(c2!=0) {
3502             buffer[--startIndex]=c2;
3503         }
3504 
3505         /* stop if this just-copied character is a boundary */
3506         if(isBoundary) {
3507             break;
3508         }
3509     }
3510 
3511     /* return the length of the buffer contents */
3512     return bufferCapacity-startIndex;
3513 }
3514 
3515 U_CAPI int32_t U_EXPORT2
3516 unorm_previous(UCharIterator *src,
3517                UChar *dest, int32_t destCapacity,
3518                UNormalizationMode mode, int32_t options,
3519                UBool doNormalize, UBool *pNeededToNormalize,
3520                UErrorCode *pErrorCode) {
3521     UChar stackBuffer[100];
3522     UChar *buffer=NULL;
3523     IsPrevBoundaryFn *isPreviousBoundary=NULL;
3524     uint32_t mask=0;
3525     int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3526     int32_t c=0, c2=0;
3527     UChar minC=0;
3528 
3529     /* check argument values */
3530     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3531         return 0;
3532     }
3533 
3534     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3535         src==NULL
3536     ) {
3537         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3538         return 0;
3539     }
3540 
3541     if(!_haveData(*pErrorCode)) {
3542         return 0;
3543     }
3544 
3545     if(pNeededToNormalize!=NULL) {
3546         *pNeededToNormalize=FALSE;
3547     }
3548 
3549     switch(mode) {
3550     case UNORM_FCD:
3551         if(fcdTrie.index==NULL) {
3552             *pErrorCode=U_UNSUPPORTED_ERROR;
3553             return 0;
3554         }
3555         /* fall through to NFD */
3556     case UNORM_NFD:
3557         isPreviousBoundary=_isPrevNFDSafe;
3558         minC=_NORM_MIN_WITH_LEAD_CC;
3559         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3560         break;
3561     case UNORM_NFKD:
3562         isPreviousBoundary=_isPrevNFDSafe;
3563         minC=_NORM_MIN_WITH_LEAD_CC;
3564         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3565         break;
3566     case UNORM_NFC:
3567         isPreviousBoundary=_isPrevTrueStarter;
3568         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3569         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3570         break;
3571     case UNORM_NFKC:
3572         isPreviousBoundary=_isPrevTrueStarter;
3573         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3574         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3575         break;
3576     case UNORM_NONE:
3577         destLength=0;
3578         if((c=src->previous(src))>=0) {
3579             destLength=1;
3580             if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3581                 if(UTF_IS_LEAD(c2)) {
3582                     if(destCapacity>=2) {
3583                         dest[1]=(UChar)c; /* trail surrogate */
3584                         destLength=2;
3585                     }
3586                     c=c2; /* lead surrogate to be written below */
3587                 } else {
3588                     src->move(src, 1, UITER_CURRENT);
3589                 }
3590             }
3591 
3592             if(destCapacity>0) {
3593                 dest[0]=(UChar)c;
3594             }
3595         }
3596         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3597     default:
3598         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3599         return 0;
3600     }
3601 
3602     buffer=stackBuffer;
3603     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3604     bufferLength=_findPreviousIterationBoundary(*src,
3605                                                 isPreviousBoundary, minC, mask,
3606                                                 buffer, bufferCapacity,
3607                                                 startIndex,
3608                                                 pErrorCode);
3609     if(bufferLength>0) {
3610         if(doNormalize) {
3611             destLength=unorm_internalNormalize(dest, destCapacity,
3612                                                buffer+startIndex, bufferLength,
3613                                                mode, options,
3614                                                pErrorCode);
3615             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3616                 *pNeededToNormalize=
3617                     (UBool)(destLength!=bufferLength ||
3618                             0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3619             }
3620         } else {
3621             /* just copy the source characters */
3622             if(destCapacity>0) {
3623                 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3624             }
3625             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3626         }
3627     } else {
3628         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3629     }
3630 
3631     /* cleanup */
3632     if(buffer!=stackBuffer) {
3633         uprv_free(buffer);
3634     }
3635 
3636     return destLength;
3637 }
3638 
3639 /* forward iteration -------------------------------------------------------- */
3640 
3641 /*
3642  * read forward and get norm32
3643  * return 0 if the character is <minC
3644  * if c2!=0 then (c2, c) is a surrogate pair
3645  * always reads complete characters
3646  */
3647 static inline uint32_t
3648 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3649     uint32_t norm32;
3650 
3651     /* need src.hasNext() to be true */
3652     c=(UChar)src.next(&src);
3653     c2=0;
3654 
3655     if(c<minC) {
3656         return 0;
3657     }
3658 
3659     norm32=_getNorm32(c);
3660     if(UTF_IS_FIRST_SURROGATE(c)) {
3661         if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3662             src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3663             if((norm32&mask)==0) {
3664                 /* irrelevant data */
3665                 return 0;
3666             } else {
3667                 /* norm32 must be a surrogate special */
3668                 return _getNorm32FromSurrogatePair(c, c2);
3669             }
3670         } else {
3671             /* unmatched surrogate */
3672             c2=0;
3673             return 0;
3674         }
3675     }
3676     return norm32;
3677 }
3678 
3679 /*
3680  * read forward and check if the character is a next-iteration boundary
3681  * if c2!=0 then (c, c2) is a surrogate pair
3682  */
3683 typedef UBool
3684 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3685 
3686 /*
3687  * for NF*D:
3688  * read forward and check if the lead combining class is 0
3689  * if c2!=0 then (c, c2) is a surrogate pair
3690  */
3691 static UBool
3692 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3693     return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3694 }
3695 
3696 /*
3697  * for NF*C:
3698  * read forward and check if the character is (or its decomposition begins with)
3699  * a "true starter" (cc==0 and NF*C_YES)
3700  * if c2!=0 then (c, c2) is a surrogate pair
3701  */
3702 static UBool
3703 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3704     uint32_t norm32, decompQCMask;
3705 
3706     decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3707     norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3708     return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3709 }
3710 
3711 static int32_t
3712 _findNextIterationBoundary(UCharIterator &src,
3713                            IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3714                            UChar *&buffer, int32_t &bufferCapacity,
3715                            UErrorCode *pErrorCode) {
3716     UChar *stackBuffer;
3717     int32_t bufferIndex;
3718     UChar c, c2;
3719 
3720     if(!src.hasNext(&src)) {
3721         return 0;
3722     }
3723 
3724     /* initialize */
3725     stackBuffer=buffer;
3726 
3727     /* get one character and ignore its properties */
3728     buffer[0]=c=(UChar)src.next(&src);
3729     bufferIndex=1;
3730     if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3731         if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3732             buffer[bufferIndex++]=c2;
3733         } else {
3734             src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3735         }
3736     }
3737 
3738     /* get all following characters until we see a boundary */
3739     /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3740     while(src.hasNext(&src)) {
3741         if(isNextBoundary(src, minC, mask, c, c2)) {
3742             /* back out the latest movement to stop at the boundary */
3743             src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3744             break;
3745         } else {
3746             if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3747                 /* attempt to grow the buffer */
3748                 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3749                                        2*bufferCapacity,
3750                                        bufferIndex)
3751             ) {
3752                 buffer[bufferIndex++]=c;
3753                 if(c2!=0) {
3754                     buffer[bufferIndex++]=c2;
3755                 }
3756             } else {
3757                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3758                 src.move(&src, 0, UITER_LIMIT);
3759                 return 0;
3760             }
3761         }
3762     }
3763 
3764     /* return the length of the buffer contents */
3765     return bufferIndex;
3766 }
3767 
3768 U_CAPI int32_t U_EXPORT2
3769 unorm_next(UCharIterator *src,
3770            UChar *dest, int32_t destCapacity,
3771            UNormalizationMode mode, int32_t options,
3772            UBool doNormalize, UBool *pNeededToNormalize,
3773            UErrorCode *pErrorCode) {
3774     UChar stackBuffer[100];
3775     UChar *buffer;
3776     IsNextBoundaryFn *isNextBoundary;
3777     uint32_t mask;
3778     int32_t bufferLength, bufferCapacity, destLength;
3779     int32_t c, c2;
3780     UChar minC;
3781 
3782     /* check argument values */
3783     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3784         return 0;
3785     }
3786 
3787     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3788         src==NULL
3789     ) {
3790         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3791         return 0;
3792     }
3793 
3794     if(!_haveData(*pErrorCode)) {
3795         return 0;
3796     }
3797 
3798     if(pNeededToNormalize!=NULL) {
3799         *pNeededToNormalize=FALSE;
3800     }
3801 
3802     switch(mode) {
3803     case UNORM_FCD:
3804         if(fcdTrie.index==NULL) {
3805             *pErrorCode=U_UNSUPPORTED_ERROR;
3806             return 0;
3807         }
3808         /* fall through to NFD */
3809     case UNORM_NFD:
3810         isNextBoundary=_isNextNFDSafe;
3811         minC=_NORM_MIN_WITH_LEAD_CC;
3812         mask=_NORM_CC_MASK|_NORM_QC_NFD;
3813         break;
3814     case UNORM_NFKD:
3815         isNextBoundary=_isNextNFDSafe;
3816         minC=_NORM_MIN_WITH_LEAD_CC;
3817         mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3818         break;
3819     case UNORM_NFC:
3820         isNextBoundary=_isNextTrueStarter;
3821         minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3822         mask=_NORM_CC_MASK|_NORM_QC_NFC;
3823         break;
3824     case UNORM_NFKC:
3825         isNextBoundary=_isNextTrueStarter;
3826         minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3827         mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3828         break;
3829     case UNORM_NONE:
3830         destLength=0;
3831         if((c=src->next(src))>=0) {
3832             destLength=1;
3833             if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3834                 if(UTF_IS_TRAIL(c2)) {
3835                     if(destCapacity>=2) {
3836                         dest[1]=(UChar)c2; /* trail surrogate */
3837                         destLength=2;
3838                     }
3839                     /* lead surrogate to be written below */
3840                 } else {
3841                     src->move(src, -1, UITER_CURRENT);
3842                 }
3843             }
3844 
3845             if(destCapacity>0) {
3846                 dest[0]=(UChar)c;
3847             }
3848         }
3849         return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3850     default:
3851         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3852         return 0;
3853     }
3854 
3855     buffer=stackBuffer;
3856     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3857     bufferLength=_findNextIterationBoundary(*src,
3858                                             isNextBoundary, minC, mask,
3859                                             buffer, bufferCapacity,
3860                                             pErrorCode);
3861     if(bufferLength>0) {
3862         if(doNormalize) {
3863             destLength=unorm_internalNormalize(dest, destCapacity,
3864                                                buffer, bufferLength,
3865                                                mode, options,
3866                                                pErrorCode);
3867             if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3868                 *pNeededToNormalize=
3869                     (UBool)(destLength!=bufferLength ||
3870                             0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3871             }
3872         } else {
3873             /* just copy the source characters */
3874             if(destCapacity>0) {
3875                 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3876             }
3877             destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3878         }
3879     } else {
3880         destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3881     }
3882 
3883     /* cleanup */
3884     if(buffer!=stackBuffer) {
3885         uprv_free(buffer);
3886     }
3887 
3888     return destLength;
3889 }
3890 
3891 /*
3892  * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3893  * and if not, how hard it would be to improve it.
3894  * For example, see _findSafeFCD().
3895  */
3896 
3897 /* Concatenation of normalized strings -------------------------------------- */
3898 
3899 U_CAPI int32_t U_EXPORT2
3900 unorm_concatenate(const UChar *left, int32_t leftLength,
3901                   const UChar *right, int32_t rightLength,
3902                   UChar *dest, int32_t destCapacity,
3903                   UNormalizationMode mode, int32_t options,
3904                   UErrorCode *pErrorCode) {
3905     UChar stackBuffer[100];
3906     UChar *buffer;
3907     int32_t bufferLength, bufferCapacity;
3908 
3909     UCharIterator iter;
3910     int32_t leftBoundary, rightBoundary, destLength;
3911 
3912     /* check argument values */
3913     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3914         return 0;
3915     }
3916 
3917     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3918         left==NULL || leftLength<-1 ||
3919         right==NULL || rightLength<-1
3920     ) {
3921         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3922         return 0;
3923     }
3924 
3925     /* check for overlapping right and destination */
3926     if( dest!=NULL &&
3927         ((right>=dest && right<(dest+destCapacity)) ||
3928          (rightLength>0 && dest>=right && dest<(right+rightLength)))
3929     ) {
3930         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3931         return 0;
3932     }
3933 
3934     /* allow left==dest */
3935 
3936     /* set up intermediate buffer */
3937     buffer=stackBuffer;
3938     bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3939 
3940     /*
3941      * Input: left[0..leftLength[ + right[0..rightLength[
3942      *
3943      * Find normalization-safe boundaries leftBoundary and rightBoundary
3944      * and copy the end parts together:
3945      * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3946      *
3947      * dest=left[0..leftBoundary[ +
3948      *      normalize(buffer) +
3949      *      right[rightBoundary..rightLength[
3950      */
3951 
3952     /*
3953      * find a normalization boundary at the end of the left string
3954      * and copy the end part into the buffer
3955      */
3956     uiter_setString(&iter, left, leftLength);
3957     iter.index=leftLength=iter.length; /* end of left string */
3958 
3959     bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
3960                                 mode, options,
3961                                 FALSE, NULL,
3962                                 pErrorCode);
3963     leftBoundary=iter.index;
3964     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3965         *pErrorCode=U_ZERO_ERROR;
3966         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
3967             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3968             /* dont need to cleanup here since
3969              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3970              */
3971             return 0;
3972         }
3973 
3974         /* just copy from the left string: we know the boundary already */
3975         uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
3976     }
3977 
3978     /*
3979      * find a normalization boundary at the beginning of the right string
3980      * and concatenate the beginning part to the buffer
3981      */
3982     uiter_setString(&iter, right, rightLength);
3983     rightLength=iter.length; /* in case it was -1 */
3984 
3985     rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
3986                              mode, options,
3987                              FALSE, NULL,
3988                              pErrorCode);
3989     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3990         *pErrorCode=U_ZERO_ERROR;
3991         if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
3992             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3993             /* dont need to cleanup here since
3994              * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3995              */
3996             return 0;
3997         }
3998 
3999         /* just copy from the right string: we know the boundary already */
4000         uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4001     }
4002 
4003     bufferLength+=rightBoundary;
4004 
4005     /* copy left[0..leftBoundary[ to dest */
4006     if(left!=dest && leftBoundary>0 && destCapacity>0) {
4007         uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4008     }
4009     destLength=leftBoundary;
4010 
4011     /* concatenate the normalization of the buffer to dest */
4012     if(destCapacity>destLength) {
4013         destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4014                                             buffer, bufferLength,
4015                                             mode, options,
4016                                             pErrorCode);
4017     } else {
4018         destLength+=unorm_internalNormalize(NULL, 0,
4019                                             buffer, bufferLength,
4020                                             mode, options,
4021                                             pErrorCode);
4022     }
4023     /*
4024      * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4025      * so we dont check for the error code here..just let it pass through
4026      */
4027     /* concatenate right[rightBoundary..rightLength[ to dest */
4028     right+=rightBoundary;
4029     rightLength-=rightBoundary;
4030     if(rightLength>0 && destCapacity>destLength) {
4031         uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4032     }
4033     destLength+=rightLength;
4034 
4035     /* cleanup */
4036     if(buffer!=stackBuffer) {
4037         uprv_free(buffer);
4038     }
4039 
4040     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4041 }
4042 
4043 #endif /* #if !UCONFIG_NO_NORMALIZATION */
4044