1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
18 * mode NFC.
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
23 */
24
25 #include "unicode/utypes.h"
26
27 #if !UCONFIG_NO_NORMALIZATION
28
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
36 #include "ucln_cmn.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "cmemory.h"
40 #include "umutex.h"
41 #include "utrie2.h"
42 #include "unicode/uset.h"
43 #include "udataswp.h"
44 #include "putilimp.h"
45
46 /*
47 * Status of tailored normalization
48 *
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
55 *
 * Tailored normalization as implemented here makes it possible to
 * "normalize less" than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
64 *
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
71 *
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
78 */
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot be split
 * from its operand by a neighboring operator (e.g. sizeof LENGTHOF(a)).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
80
81 U_NAMESPACE_USE
82
83 /*
84 * This new implementation of the normalization code loads its data from
85 * unorm.dat, which is generated with the gennorm tool.
86 * The format of that file is described in unormimp.h .
87 */
88
89 /* -------------------------------------------------------------------------- */
90
/*
 * Private size constant.
 * NOTE(review): presumably the capacity (in elements) of on-stack scratch
 * buffers used further down in this file - confirm against the call sites.
 */
enum {
    _STACK_BUFFER_CAPACITY=100
};
94
95 /*
96 * Constants for the bit fields in the options bit set parameter.
97 * These need not be public.
98 * A user only needs to know the currently assigned values.
99 * The number and positions of reserved bits per field can remain private
100 * and may change in future implementations.
101 */
enum {
    /* bits 0..4: the exclusion-set (NX) selector bits such as UNORM_NX_HANGUL */
    _NORM_OPTIONS_NX_MASK=0x1f,
    /* bits 5..6: the Unicode-version selector, e.g. UNORM_UNICODE_3_2 */
    _NORM_OPTIONS_UNICODE_MASK=0x60,
    /* union of the two masks above; masked options index the exclusion set cache */
    _NORM_OPTIONS_SETS_MASK=0x7f,

    /* position of the lowest bit of _NORM_OPTIONS_UNICODE_MASK */
    _NORM_OPTIONS_UNICODE_SHIFT=5,

    /*
     * The following options are used only in some composition functions.
     * They use bits 12 and up to preserve lower bits for the available options
     * space in unorm_compare() -
     * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    _NORM_OPTIONS_COMPAT=0x1000,
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    _NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
};
121
122 U_CDECL_BEGIN
123 static inline UBool
isHangulWithoutJamoT(UChar c)124 isHangulWithoutJamoT(UChar c) {
125 c-=HANGUL_BASE;
126 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
127 }
128
129 /* norm32 helpers */
130
131 /* is this a norm32 with a regular index? */
132 static inline UBool
isNorm32Regular(uint32_t norm32)133 isNorm32Regular(uint32_t norm32) {
134 return norm32<_NORM_MIN_SPECIAL;
135 }
136
137 #if 0 // Code changed to use U16_IS_LEAD(c) instead.
138 /* is this a norm32 with a special index for a lead surrogate? */
139 static inline UBool
140 isNorm32LeadSurrogate(uint32_t norm32) {
141 return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
142 }
143 #endif
144
145 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
146 static inline UBool
isNorm32HangulOrJamo(uint32_t norm32)147 isNorm32HangulOrJamo(uint32_t norm32) {
148 return norm32>=_NORM_MIN_HANGUL;
149 }
150
151 /*
152 * Given isNorm32HangulOrJamo(),
153 * is this a Hangul syllable or a Jamo?
154 */
155 /*static inline UBool
156 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
157 return norm32<_NORM_MIN_JAMO_V;
158 }*/
159
160 /*
161 * Given norm32 for Jamo V or T,
162 * is this a Jamo V?
163 */
164 static inline UBool
isJamoVTNorm32JamoV(uint32_t norm32)165 isJamoVTNorm32JamoV(uint32_t norm32) {
166 return norm32<_NORM_JAMO_V_TOP;
167 }
168 U_CDECL_END
169
170 /* load unorm.dat ----------------------------------------------------------- */
171
/*
 * Compile-time switch for the source of the normalization data:
 * 1 - use the tables compiled in from unorm_props_data.c
 * 0 - load the "unorm" data item of type "icu" at run time (see loadNormData())
 */
#define UNORM_HARDCODE_DATA 1

#if UNORM_HARDCODE_DATA

/* unorm_props_data.c is machine-generated by gennorm --csource */
#include "unorm_props_data.c"

/* the compiled-in data is always treated as having the format 2.2 features */
static const UBool formatVersion_2_2=TRUE;

#else

#define DATA_NAME "unorm"
#define DATA_TYPE "icu"

/* handle of the loaded data item; closed in unorm_cleanup() */
static UDataMemory *normData=NULL;
/* error code remembered from the load attempt, reported again by _haveData() */
static UErrorCode dataErrorCode=U_ZERO_ERROR;
/* load state: 0=not tried yet, 1=data loaded, -1=loading failed */
static int8_t haveNormData=0;

/* copies of the index array and of the unserialized tries of the data item */
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };

/*
 * pointers into the memory-mapped unorm.icu
 */
static const uint16_t *extraData=NULL,
                      *combiningTable=NULL,
                      *canonStartSets=NULL;

/* format version of the loaded data, copied from its UDataInfo in isAcceptable() */
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;

/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 0, 0, 0, 0 };

#endif
207
/*
 * cache UnicodeSets for each combination of exclusion flags;
 * indexed by the options value masked with _NORM_OPTIONS_SETS_MASK,
 * entries are created lazily by the internalGetNX...() functions
 * and deleted in unorm_cleanup()
 */
static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
210
211 U_CDECL_BEGIN
212
213 static UBool U_CALLCONV
unorm_cleanup(void)214 unorm_cleanup(void) {
215 int32_t i;
216
217 #if !UNORM_HARDCODE_DATA
218 if(normData!=NULL) {
219 udata_close(normData);
220 normData=NULL;
221 }
222 dataErrorCode=U_ZERO_ERROR;
223 haveNormData=0;
224 #endif
225
226 for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
227 if (nxCache[i]) {
228 delete nxCache[i];
229 nxCache[i] = 0;
230 }
231 }
232
233 return TRUE;
234 }
235
236 #if !UNORM_HARDCODE_DATA
237
238 static UBool U_CALLCONV
isAcceptable(void *,const char *,const char *,const UDataInfo * pInfo)239 isAcceptable(void * /* context */,
240 const char * /* type */, const char * /* name */,
241 const UDataInfo *pInfo) {
242 if(
243 pInfo->size>=20 &&
244 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
245 pInfo->charsetFamily==U_CHARSET_FAMILY &&
246 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
247 pInfo->dataFormat[1]==0x6f &&
248 pInfo->dataFormat[2]==0x72 &&
249 pInfo->dataFormat[3]==0x6d &&
250 pInfo->formatVersion[0]==2 &&
251 pInfo->formatVersion[2]==UTRIE_SHIFT &&
252 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
253 ) {
254 uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
255 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
256 return TRUE;
257 } else {
258 return FALSE;
259 }
260 }
261
262 #endif
263
264 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)265 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
266 /* add the start code point to the USet */
267 const USetAdder *sa=(const USetAdder *)context;
268 sa->add(sa->set, start);
269 return TRUE;
270 }
271
272 U_CDECL_END
273
274 #if !UNORM_HARDCODE_DATA
275
/*
 * Load the normalization data item and publish it in the file-level variables.
 *
 * @param errorCode in/out ICU error code; set when opening or unserializing fails
 * @return the resulting haveNormData value: 1 if the data is available,
 *         -1 if loading failed, 0 if errorCode already indicated a failure
 *         on entry (nothing is attempted in that case)
 */
static int8_t
loadNormData(UErrorCode &errorCode) {
    /* load Unicode normalization data from file */

    /*
     * This lazy initialization with double-checked locking (without mutex protection for
     * haveNormData==0) is transiently unsafe under certain circumstances.
     * Check the readme and use u_init() if necessary.
     *
     * While u_init() initializes the main normalization data via this function,
     * it does not do so for exclusion sets (which are fully mutexed).
     * This is because
     * - there can be many exclusion sets
     * - they are rarely used
     * - they are not usually used in execution paths that are
     *   as performance-sensitive as others
     *   (e.g., IDNA takes more time than unorm_quickCheck() anyway)
     */
    if(haveNormData==0) {
        /* unserialize into locals first; they are copied to the globals under the mutex */
        UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
        UDataMemory *data;

        const int32_t *p=NULL;
        const uint8_t *pb;

        /*
         * NOTE(review): errorCode is a reference, so &errorCode==NULL can
         * only be true after undefined behavior in the caller - confirm
         * whether this test can be dropped.
         */
        if(&errorCode==NULL || U_FAILURE(errorCode)) {
            return 0;
        }

        /* open the data outside the mutex block */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
        dataErrorCode=errorCode;
        if(U_FAILURE(errorCode)) {
            /* note: haveNormData is written here without holding the mutex */
            return haveNormData=-1;
        }

        /* the data starts with the int32_t index array, followed by the main trie */
        p=(const int32_t *)udata_getMemory(data);
        pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
        utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
        _normTrie.getFoldingOffset=getFoldingNormOffset;

        /* skip the main trie (byte size) and the two 16-bit tables (unit counts, hence *2) */
        pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
        if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
            utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
        }
        pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];

        if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
            utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
            _auxTrie.getFoldingOffset=getFoldingAuxOffset;
        }

        /* a single check covers all of the utrie_unserialize() calls above */
        if(U_FAILURE(errorCode)) {
            dataErrorCode=errorCode;
            udata_close(data);
            /* note: haveNormData is written here without holding the mutex */
            return haveNormData=-1;
        }

        /* in the mutex block, set the data for this process */
        umtx_lock(NULL);
        if(normData==NULL) {
            /* this thread's copy is used; data=NULL keeps it from being closed below */
            normData=data;
            data=NULL;

            uprv_memcpy(&indexes, p, sizeof(indexes));
            uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
            uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
            uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
        } else {
            /* normData was already set: point p at the published data instead */
            p=(const int32_t *)udata_getMemory(normData);
        }

        /* initialize some variables */
        extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
        combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
        formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
        formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
        if(formatVersion_2_1) {
            /* the trie sizes are in bytes, canonStartSets counts 16-bit units, hence /2 */
            canonStartSets=combiningTable+
                indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
                (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
        }
        haveNormData=1;
        ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
        umtx_unlock(NULL);

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }

    return haveNormData;
}
370
371 #endif
372
373 static inline UBool
_haveData(UErrorCode & errorCode)374 _haveData(UErrorCode &errorCode) {
375 #if UNORM_HARDCODE_DATA
376 return U_SUCCESS(errorCode);
377 #else
378 if(U_FAILURE(errorCode)) {
379 return FALSE;
380 } else if(haveNormData>0) {
381 return TRUE;
382 } else if(haveNormData<0) {
383 errorCode=dataErrorCode;
384 return FALSE;
385 } else /* haveNormData==0 */ {
386 return (UBool)(loadNormData(errorCode)>0);
387 }
388 #endif
389 }
390
391 U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode * pErrorCode)392 unorm_haveData(UErrorCode *pErrorCode) {
393 return _haveData(*pErrorCode);
394 }
395
396 U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 & fcdHighStart,UErrorCode * pErrorCode)397 unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
398 if(_haveData(*pErrorCode)) {
399 fcdHighStart=fcdTrie.highStart;
400 return fcdTrie.index;
401 } else {
402 return NULL;
403 }
404 }
405
406 /* data access primitives --------------------------------------------------- */
407
408 static inline uint32_t
_getNorm32(UChar c)409 _getNorm32(UChar c) {
410 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(&normTrie, c);
411 }
412
413 static inline uint32_t
_getNorm32FromSurrogatePair(UChar c,UChar c2)414 _getNorm32FromSurrogatePair(UChar c, UChar c2) {
415 UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
416 return UTRIE2_GET32_FROM_SUPP(&normTrie, cp);
417 }
418
419 /*
420 * get a norm32 from text with complete code points
421 * (like from decompositions)
422 */
423 static inline uint32_t
_getNorm32(const UChar * p,uint32_t mask)424 _getNorm32(const UChar *p, uint32_t mask) {
425 UChar c=*p;
426 uint32_t norm32=_getNorm32(c);
427 if((norm32&mask) && U16_IS_LEAD(c)) {
428 /* c is a lead surrogate, get the real norm32 */
429 norm32=_getNorm32FromSurrogatePair(c, *(p+1));
430 }
431 return norm32;
432 }
433
434 static inline uint16_t
_getFCD16(UChar c)435 _getFCD16(UChar c) {
436 return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(&fcdTrie, c);
437 }
438
439 static inline uint16_t
_getFCD16FromSurrogatePair(UChar c,UChar c2)440 _getFCD16FromSurrogatePair(UChar c, UChar c2) {
441 UChar32 cp=U16_GET_SUPPLEMENTARY(c, c2);
442 return UTRIE2_GET16_FROM_SUPP(&fcdTrie, cp);
443 }
444
445 static inline const uint16_t *
_getExtraData(uint32_t norm32)446 _getExtraData(uint32_t norm32) {
447 return extraData+(norm32>>_NORM_EXTRA_SHIFT);
448 }
449
450 /*
451 * TODO(markus): Revisit if it makes sense for functions like _getNextCC()
452 * and their call sites, and a fair bit of other code here, to work with UTF-16 code units,
453 * or whether code simplification would suggest just using UChar32 and maybe UTRIE2_NEXT32().
454 */
455
456 #if 0
457 /*
458 * It is possible to get the FCD data from the main trie if unorm.icu
459 * was built without the FCD trie, although it is slower.
460 * This is not implemented because it is hard to test, and because it seems
461 * unusual to want to use FCD and not build the data file for it.
462 *
463 * Untested sample code:
464 */
465 static inline uint16_t
466 _getFCD16FromNormData(UChar32 c) {
467 uint32_t norm32, fcd;
468
469 norm32=_getNorm32(c);
470 if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
471 /* get the lead/trail cc from the decomposition data */
472 const uint16_t *nfd=_getExtraData(norm32);
473 if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
474 fcd=nfd[1];
475 }
476 } else {
477 fcd=norm32&_NORM_CC_MASK;
478 if(fcd!=0) {
479 /* use the code point cc value for both lead and trail cc's */
480 fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
481 }
482 }
483
484 return (uint16_t)fcd;
485 }
486 #endif
487
488 /* normalization exclusion sets --------------------------------------------- */
489
490 /*
491 * Normalization exclusion UnicodeSets are used for tailored normalization;
492 * see the comment near the beginning of this file.
493 *
494 * By specifying one or several sets of code points,
495 * those code points become inert for normalization.
496 */
497
498 static const UnicodeSet *
internalGetNXHangul(UErrorCode & errorCode)499 internalGetNXHangul(UErrorCode &errorCode) {
500 /* internal function, does not check for incoming U_FAILURE */
501 UBool isCached;
502
503 UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
504
505 if(!isCached) {
506 UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
507 if(set==NULL) {
508 errorCode=U_MEMORY_ALLOCATION_ERROR;
509 return NULL;
510 }
511 // Compact the set for caching.
512 set->compact();
513
514 umtx_lock(NULL);
515 if(nxCache[UNORM_NX_HANGUL]==NULL) {
516 nxCache[UNORM_NX_HANGUL]=set;
517 set=NULL;
518 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
519 }
520 umtx_unlock(NULL);
521
522 delete set;
523 }
524
525 return nxCache[UNORM_NX_HANGUL];
526 }
527
528 /* unorm.cpp 1.116 had and used
529 static const UnicodeSet *
530 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
531 ...
532 }
533 */
534
535 /* get and set an exclusion set from a serialized UnicodeSet */
536 static const UnicodeSet *
internalGetSerializedNX(int32_t options,int32_t nxIndex,UErrorCode & errorCode)537 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
538 /* internal function, does not check for incoming U_FAILURE */
539 UBool isCached;
540
541 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
542
543 if( !isCached &&
544 canonStartSets!=NULL &&
545 canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
546 ) {
547 USerializedSet sset;
548 UnicodeSet *set;
549 UChar32 start, end;
550 int32_t i;
551
552 if( !uset_getSerializedSet(
553 &sset,
554 canonStartSets+canonStartSets[nxIndex],
555 canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
556 ) {
557 errorCode=U_INVALID_FORMAT_ERROR;
558 return NULL;
559 }
560
561 /* turn the serialized set into a UnicodeSet */
562 set=new UnicodeSet();
563 if(set==NULL) {
564 errorCode=U_MEMORY_ALLOCATION_ERROR;
565 return NULL;
566 }
567 for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
568 set->add(start, end);
569 }
570 // Compact the set for caching.
571 set->compact();
572
573 umtx_lock(NULL);
574 if(nxCache[options]==NULL) {
575 nxCache[options]=set;
576 set=NULL;
577 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
578 }
579 umtx_unlock(NULL);
580
581 delete set;
582 }
583
584 return nxCache[options];
585 }
586
587 static const UnicodeSet *
internalGetNXCJKCompat(UErrorCode & errorCode)588 internalGetNXCJKCompat(UErrorCode &errorCode) {
589 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
590 return internalGetSerializedNX(
591 UNORM_NX_CJK_COMPAT,
592 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
593 errorCode);
594 }
595
596 static const UnicodeSet *
internalGetNXUnicode(uint32_t options,UErrorCode & errorCode)597 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
598 /* internal function, does not check for incoming U_FAILURE */
599 int32_t nxIndex;
600
601 options&=_NORM_OPTIONS_UNICODE_MASK;
602 switch(options) {
603 case 0:
604 return NULL;
605 case UNORM_UNICODE_3_2:
606 /* [:^Age=3.2:] */
607 nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
608 break;
609 default:
610 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
611 return NULL;
612 }
613
614 /* build a set with all code points that were not designated by the specified Unicode version */
615 return internalGetSerializedNX(options, nxIndex, errorCode);
616 }
617
618 /* Get a decomposition exclusion set. The data must be loaded. */
619 static const UnicodeSet *
internalGetNX(int32_t options,UErrorCode & errorCode)620 internalGetNX(int32_t options, UErrorCode &errorCode) {
621 options&=_NORM_OPTIONS_SETS_MASK;
622
623 UBool isCached;
624
625 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
626
627 if(!isCached) {
628 /* return basic sets */
629 if(options==UNORM_NX_HANGUL) {
630 return internalGetNXHangul(errorCode);
631 }
632 if(options==UNORM_NX_CJK_COMPAT) {
633 return internalGetNXCJKCompat(errorCode);
634 }
635 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
636 return internalGetNXUnicode(options, errorCode);
637 }
638
639 /* build a set from multiple subsets */
640 UnicodeSet *set;
641 const UnicodeSet *other;
642
643 set=new UnicodeSet();
644 if(set==NULL) {
645 errorCode=U_MEMORY_ALLOCATION_ERROR;
646 return NULL;
647 }
648
649 if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
650 set->addAll(*other);
651 }
652 if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
653 set->addAll(*other);
654 }
655 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
656 set->addAll(*other);
657 }
658
659 if(U_FAILURE(errorCode)) {
660 delete set;
661 return NULL;
662 }
663 // Compact the set for caching.
664 set->compact();
665
666 umtx_lock(NULL);
667 if(nxCache[options]==NULL) {
668 nxCache[options]=set;
669 set=NULL;
670 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
671 }
672 umtx_unlock(NULL);
673
674 delete set;
675 }
676
677 return nxCache[options];
678 }
679
680 static inline const UnicodeSet *
getNX(int32_t options,UErrorCode & errorCode)681 getNX(int32_t options, UErrorCode &errorCode) {
682 if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
683 /* incoming failure, or no decomposition exclusions requested */
684 return NULL;
685 } else {
686 return internalGetNX(options, errorCode);
687 }
688 }
689
690 U_CFUNC const UnicodeSet *
unorm_getNX(int32_t options,UErrorCode * pErrorCode)691 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
692 return getNX(options, *pErrorCode);
693 }
694
695 static inline UBool
nx_contains(const UnicodeSet * nx,UChar32 c)696 nx_contains(const UnicodeSet *nx, UChar32 c) {
697 return nx!=NULL && nx->contains(c);
698 }
699
700 static inline UBool
nx_contains(const UnicodeSet * nx,UChar c,UChar c2)701 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
702 return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
703 }
704
705 /* other normalization primitives ------------------------------------------- */
706
707 /* get the canonical or compatibility decomposition for one character */
708 static inline const UChar *
_decompose(uint32_t norm32,uint32_t qcMask,int32_t & length,uint8_t & cc,uint8_t & trailCC)709 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
710 uint8_t &cc, uint8_t &trailCC) {
711 const UChar *p=(const UChar *)_getExtraData(norm32);
712 length=*p++;
713
714 if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
715 /* use compatibility decomposition, skip canonical data */
716 p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
717 length>>=8;
718 }
719
720 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
721 /* get the lead and trail cc's */
722 UChar bothCCs=*p++;
723 cc=(uint8_t)(bothCCs>>8);
724 trailCC=(uint8_t)bothCCs;
725 } else {
726 /* lead and trail cc's are both 0 */
727 cc=trailCC=0;
728 }
729
730 length&=_NORM_DECOMP_LENGTH_MASK;
731 return p;
732 }
733
734 /* get the canonical decomposition for one character */
735 static inline const UChar *
_decompose(uint32_t norm32,int32_t & length,uint8_t & cc,uint8_t & trailCC)736 _decompose(uint32_t norm32, int32_t &length,
737 uint8_t &cc, uint8_t &trailCC) {
738 const UChar *p=(const UChar *)_getExtraData(norm32);
739 length=*p++;
740
741 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
742 /* get the lead and trail cc's */
743 UChar bothCCs=*p++;
744 cc=(uint8_t)(bothCCs>>8);
745 trailCC=(uint8_t)bothCCs;
746 } else {
747 /* lead and trail cc's are both 0 */
748 cc=trailCC=0;
749 }
750
751 length&=_NORM_DECOMP_LENGTH_MASK;
752 return p;
753 }
754
755 /**
756 * Get the canonical decomposition for one code point.
757 * @param c code point
758 * @param buffer out-only buffer for algorithmic decompositions of Hangul
759 * @param length out-only, takes the length of the decomposition, if any
760 * @return pointer to decomposition, or 0 if none
761 * @internal
762 */
763 U_CFUNC const UChar *
unorm_getCanonicalDecomposition(UChar32 c,UChar buffer[4],int32_t * pLength)764 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
765 uint32_t norm32;
766
767 if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
768 /* trivial case */
769 return NULL;
770 }
771
772 norm32=UTRIE2_GET32(&normTrie, c);
773 if(norm32&_NORM_QC_NFD) {
774 if(isNorm32HangulOrJamo(norm32)) {
775 /* Hangul syllable: decompose algorithmically */
776 UChar c2;
777
778 c-=HANGUL_BASE;
779
780 c2=(UChar)(c%JAMO_T_COUNT);
781 c/=JAMO_T_COUNT;
782 if(c2>0) {
783 buffer[2]=(UChar)(JAMO_T_BASE+c2);
784 *pLength=3;
785 } else {
786 *pLength=2;
787 }
788
789 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
790 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
791 return buffer;
792 } else {
793 /* normal decomposition */
794 uint8_t cc, trailCC;
795 return _decompose(norm32, *pLength, cc, trailCC);
796 }
797 } else {
798 return 0;
799 }
800 }
801
802 /*
803 * get the combining class of (c, c2)=*p++
804 * before: p<limit after: p<=limit
805 * if only one code unit is used, then c2==0
806 */
807 static inline uint8_t
_getNextCC(const UChar * & p,const UChar * limit,UChar & c,UChar & c2)808 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
809 uint32_t norm32;
810
811 c=*p++;
812 c2=0;
813 norm32=_getNorm32(c);
814 if((norm32&_NORM_CC_MASK)==0) {
815 return 0;
816 } else if(U16_IS_LEAD(c)) {
817 /* c is a lead surrogate, get the real norm32 */
818 if(p!=limit && U16_IS_TRAIL(c2=*p)) {
819 ++p;
820 norm32=_getNorm32FromSurrogatePair(c, c2);
821 } else {
822 c2=0;
823 return 0;
824 }
825 }
826 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
827 }
828
829 /*
830 * read backwards and get norm32
831 * return 0 if the character is <minC
832 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
833 */
834 static inline uint32_t
_getPrevNorm32(const UChar * start,const UChar * & src,uint32_t minC,UChar & c,UChar & c2)835 _getPrevNorm32(const UChar *start, const UChar *&src,
836 uint32_t minC,
837 UChar &c, UChar &c2) {
838 c=*--src;
839 c2=0;
840
841 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
842 if(c<minC) {
843 return 0;
844 } else if(!U_IS_SURROGATE(c)) {
845 return _getNorm32(c);
846 } else if(U16_IS_SURROGATE_TRAIL(c) && src!=start && U16_IS_LEAD(c2=*(src-1))) {
847 --src;
848 return _getNorm32FromSurrogatePair(c2, c);
849 } else {
850 /* unpaired surrogate */
851 c2=0;
852 return 0;
853 }
854 }
855
856 /*
857 * get the combining class of (c, c2)=*--p
858 * before: start<p after: start<=p
859 */
860 static inline uint8_t
_getPrevCC(const UChar * start,const UChar * & p)861 _getPrevCC(const UChar *start, const UChar *&p) {
862 UChar c, c2;
863
864 return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, c, c2)>>_NORM_CC_SHIFT);
865 }
866
867 /*
868 * is this a safe boundary character for NF*D?
869 * (lead cc==0)
870 */
871 static inline UBool
_isNFDSafe(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)872 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
873 if((norm32&ccOrQCMask)==0) {
874 return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
875 }
876
877 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
878 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
879 int32_t length;
880 uint8_t cc, trailCC;
881
882 /* decomposes, get everything from the variable-length extra data */
883 _decompose(norm32, decompQCMask, length, cc, trailCC);
884 return cc==0;
885 } else {
886 /* no decomposition (or Hangul), test the cc directly */
887 return (norm32&_NORM_CC_MASK)==0;
888 }
889 }
890
891 /*
892 * is this (or does its decomposition begin with) a "true starter"?
893 * (cc==0 and NF*C_YES)
894 */
895 static inline UBool
_isTrueStarter(uint32_t norm32,uint32_t ccOrQCMask,uint32_t decompQCMask)896 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
897 if((norm32&ccOrQCMask)==0) {
898 return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
899 }
900
901 /* inspect its decomposition - not a Hangul or a surrogate here */
902 if((norm32&decompQCMask)!=0) {
903 const UChar *p;
904 int32_t length;
905 uint8_t cc, trailCC;
906
907 /* decomposes, get everything from the variable-length extra data */
908 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
909 if(cc==0) {
910 uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
911
912 /* does it begin with NFC_YES? */
913 if((_getNorm32(p, qcMask)&qcMask)==0) {
914 /* yes, the decomposition begins with a true starter */
915 return TRUE;
916 }
917 }
918 }
919 return FALSE;
920 }
921
922 /* uchar.h */
923 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)924 u_getCombiningClass(UChar32 c) {
925 #if !UNORM_HARDCODE_DATA
926 UErrorCode errorCode=U_ZERO_ERROR;
927 if(_haveData(errorCode)) {
928 #endif
929 uint32_t norm32=UTRIE2_GET32(&normTrie, c);
930 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
931 #if !UNORM_HARDCODE_DATA
932 } else {
933 return 0;
934 }
935 #endif
936 }
937
938 U_CFUNC UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c)939 unorm_internalIsFullCompositionExclusion(UChar32 c) {
940 #if UNORM_HARDCODE_DATA
941 if(auxTrie.index!=NULL) {
942 #else
943 UErrorCode errorCode=U_ZERO_ERROR;
944 if(_haveData(errorCode) && auxTrie.index!=NULL) {
945 #endif
946 uint16_t aux=UTRIE2_GET16(&auxTrie, c);
947 return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
948 } else {
949 return FALSE;
950 }
951 }
952
953 U_CFUNC UBool U_EXPORT2
954 unorm_isCanonSafeStart(UChar32 c) {
955 #if UNORM_HARDCODE_DATA
956 if(auxTrie.index!=NULL) {
957 #else
958 UErrorCode errorCode=U_ZERO_ERROR;
959 if(_haveData(errorCode) && auxTrie.index!=NULL) {
960 #endif
961 uint16_t aux=UTRIE2_GET16(&auxTrie, c);
962 return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
963 } else {
964 return FALSE;
965 }
966 }
967
968 U_CAPI void U_EXPORT2
969 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
970 if(unorm_haveData(pErrorCode)){
971 uprv_memcpy(*versionInfo, dataVersion, 4);
972 }
973 }
974
975
/*
 * Look up the serialized set that the canonStartSets data stores for c.
 *
 * @param c code point; values above 0x10ffff (and negative ones, via the
 *          unsigned comparison) are rejected
 * @param fillSet out: receives the set, either one that is stored in the
 *          data or a single-code point set
 * @return TRUE if an entry for c was found and fillSet was set
 *         (for a stored set: the result of uset_getSerializedSet()),
 *         FALSE if fillSet is NULL, c is out of range, the data is not
 *         available, or there is no entry for c
 */
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
#if !UNORM_HARDCODE_DATA
    UErrorCode errorCode=U_ZERO_ERROR;
#endif
    if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
#if !UNORM_HARDCODE_DATA
        _haveData(errorCode) &&
#endif
        canonStartSets!=NULL
    ) {
        const uint16_t *table;
        int32_t i, start, limit;

        /*
         * binary search for c
         *
         * There are two search tables,
         * one for BMP code points and one for supplementary ones.
         * See unormimp.h for details.
         */
        if(c<=0xffff) {
            /* the BMP table directly follows the serialized sets */
            table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
            start=0;
            limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];

            /* each entry is a pair { c, result } */
            /* narrow [start, limit) until at most one pair is left at start */
            while(start<limit-2) {
                i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
                if(c<table[i]) {
                    limit=i;
                } else {
                    start=i;
                }
            }

            /* found? */
            if(c==table[start]) {
                i=table[start+1];
                if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
                    /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
                    i&=(_NORM_MAX_CANON_SETS-1);
                    return uset_getSerializedSet(fillSet,
                                            canonStartSets+i,
                                            canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
                } else {
                    /* other result values are BMP code points for single-code point sets */
                    uset_setSerializedToOne(fillSet, (UChar32)i);
                    return TRUE;
                }
            }
        } else {
            uint16_t high, low, h;

            /* the supplementary table follows the BMP table */
            table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
                                 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
            start=0;
            limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];

            /* split c into the 16-bit halves that are stored in the table */
            high=(uint16_t)(c>>16);
            low=(uint16_t)c;

            /* each entry is a triplet { high(c), low(c), result } */
            /* narrow [start, limit) until at most one triplet is left at start */
            while(start<limit-3) {
                i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
                h=table[i]&0x1f; /* high word */
                if(high<h || (high==h && low<table[i+1])) {
                    limit=i;
                } else {
                    start=i;
                }
            }

            /* found? */
            h=table[start];
            if(high==(h&0x1f) && low==table[start+1]) {
                i=table[start+2];
                if((h&0x8000)==0) {
                    /* the result is an index to a USerializedSet */
                    return uset_getSerializedSet(fillSet,
                                            canonStartSets+i,
                                            canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
                } else {
                    /*
                     * single-code point set {x} in
                     * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
                     */
                    /* bits 8..12 of h become bits 16..20 of the code point x */
                    i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
                    uset_setSerializedToOne(fillSet, (UChar32)i);
                    return TRUE;
                }
            }
        }
    }

    return FALSE; /* not found */
}
1073
1074 U_CAPI int32_t U_EXPORT2
1075 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1076 uint16_t aux;
1077
1078 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1079 return 0;
1080 }
1081 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1082 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1083 return 0;
1084 }
1085 if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
1086 aux=UTRIE2_GET16(&auxTrie, c);
1087 aux&=_NORM_AUX_FNC_MASK;
1088 } else {
1089 aux=0;
1090 }
1091 if(aux!=0) {
1092 const UChar *s;
1093 int32_t length;
1094
1095 s=(const UChar *)(extraData+aux);
1096 if(*s<0xff00) {
1097 /* s points to the single-unit string */
1098 length=1;
1099 } else {
1100 length=*s&0xff;
1101 ++s;
1102 }
1103 if(0<length && length<=destCapacity) {
1104 uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1105 }
1106 return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1107 } else {
1108 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1109 }
1110 }
1111
1112 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1113 U_CAPI UBool U_EXPORT2
1114 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1115 uint32_t norm32, mask;
1116 uint16_t aux;
1117
1118 #if !UNORM_HARDCODE_DATA
1119 UErrorCode errorCode=U_ZERO_ERROR;
1120 if(!_haveData(errorCode)) {
1121 return FALSE;
1122 }
1123 #endif
1124
1125 /* handle trivial cases; set the comparison mask for the normal ones */
1126 switch(mode) {
1127 case UNORM_NONE:
1128 return TRUE;
1129 case UNORM_NFD:
1130 mask=_NORM_CC_MASK|_NORM_QC_NFD;
1131 break;
1132 case UNORM_NFKD:
1133 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1134 break;
1135 case UNORM_NFC:
1136 /* case UNORM_FCC: */
1137 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1138 break;
1139 case UNORM_NFKC:
1140 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1141 break;
1142 case UNORM_FCD:
1143 /* FCD: skippable if lead cc==0 and trail cc<=1 */
1144 return fcdTrie.index!=NULL && UTRIE2_GET16(&fcdTrie, c)<=1;
1145 default:
1146 return FALSE;
1147 }
1148
1149 /* check conditions (a)..(e), see unormimp.h */
1150 norm32=UTRIE2_GET32(&normTrie, c);
1151 if((norm32&mask)!=0) {
1152 return FALSE; /* fails (a)..(e), not skippable */
1153 }
1154
1155 if(mode<UNORM_NFC) {
1156 return TRUE; /* NF*D, passed (a)..(c), is skippable */
1157 }
1158
1159 /* NF*C/FCC, passed (a)..(e) */
1160 if((norm32&_NORM_QC_NFD)==0) {
1161 return TRUE; /* no canonical decomposition, is skippable */
1162 }
1163
1164 /* check Hangul syllables algorithmically */
1165 if(isNorm32HangulOrJamo(norm32)) {
1166 /* Jamo passed (a)..(e) above, must be Hangul */
1167 return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1168 }
1169
1170 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1171 /* NF*C, test (f) flag */
1172 if(!formatVersion_2_2 || auxTrie.index==NULL) {
1173 return FALSE; /* no (f) data, say not skippable to be safe */
1174 }
1175
1176 aux=UTRIE2_GET16(&auxTrie, c);
1177 return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1178
1179 /* } else { FCC, test fcd<=1 instead of the above } */
1180 }
1181
1182 U_CAPI void U_EXPORT2
1183 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
1184 UChar c;
1185
1186 if(!_haveData(*pErrorCode)) {
1187 return;
1188 }
1189
1190 /* add the start code point of each same-value range of each trie */
1191 utrie2_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1192 if(fcdTrie.index!=NULL) {
1193 utrie2_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1194 }
1195 if(auxTrie.index!=NULL) {
1196 utrie2_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1197 }
1198
1199 /* add Hangul LV syllables and LV+1 because of skippables */
1200 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1201 sa->add(sa->set, c);
1202 sa->add(sa->set, c+1);
1203 }
1204 sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1205 }
1206
1207 U_CFUNC UNormalizationCheckResult U_EXPORT2
1208 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1209 static const uint32_t qcMask[UNORM_MODE_COUNT]={
1210 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1211 };
1212
1213 uint32_t norm32;
1214
1215 #if !UNORM_HARDCODE_DATA
1216 UErrorCode errorCode=U_ZERO_ERROR;
1217 if(!_haveData(errorCode)) {
1218 return UNORM_YES;
1219 }
1220 #endif
1221
1222 norm32=UTRIE2_GET32(&normTrie, c);
1223 norm32&=qcMask[mode];
1224
1225 if(norm32==0) {
1226 return UNORM_YES;
1227 } else if(norm32&_NORM_QC_ANY_NO) {
1228 return UNORM_NO;
1229 } else /* _NORM_QC_ANY_MAYBE */ {
1230 return UNORM_MAYBE;
1231 }
1232 }
1233
1234 U_CFUNC uint16_t U_EXPORT2
1235 unorm_getFCD16FromCodePoint(UChar32 c) {
1236 #if !UNORM_HARDCODE_DATA
1237 UErrorCode errorCode;
1238 errorCode=U_ZERO_ERROR;
1239 #endif
1240
1241 if(
1242 #if !UNORM_HARDCODE_DATA
1243 !_haveData(errorCode) ||
1244 #endif
1245 fcdTrie.index==NULL
1246 ) {
1247 return 0;
1248 }
1249 return UTRIE2_GET16(&fcdTrie, c);
1250 }
1251
1252 /* reorder UTF-16 in-place -------------------------------------------------- */
1253
1254 /*
1255 * simpler, single-character version of _mergeOrdered() -
1256 * bubble-insert one single code point into the preceding string
1257 * which is already canonically ordered
1258 * (c, c2) may or may not yet have been inserted at [current..p[
1259 *
1260 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1261 *
1262 * before: [start..current[ is already ordered, and
1263 * [current..p[ may or may not hold (c, c2) but
1264 * must be exactly the same length as (c, c2)
1265 * after: [start..p[ is ordered
1266 *
1267 * returns the trailing combining class
1268 */
1269 static uint8_t
1270 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1271 UChar c, UChar c2, uint8_t cc) {
1272 const UChar *pBack, *pPreBack;
1273 UChar *r;
1274 uint8_t prevCC, trailCC=cc;
1275
1276 if(start<current && cc!=0) {
1277 /* search for the insertion point where cc>=prevCC */
1278 pPreBack=pBack=current;
1279 prevCC=_getPrevCC(start, pPreBack);
1280 if(cc<prevCC) {
1281 /* this will be the last code point, so keep its cc */
1282 trailCC=prevCC;
1283 pBack=pPreBack;
1284 while(start<pPreBack) {
1285 prevCC=_getPrevCC(start, pPreBack);
1286 if(cc>=prevCC) {
1287 break;
1288 }
1289 pBack=pPreBack;
1290 }
1291
1292 /*
1293 * this is where we are right now with all these pointers:
1294 * [start..pPreBack[ 0..? code points that we can ignore
1295 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1296 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1297 * [current..p[ 1 code point (c, c2) with cc
1298 */
1299
1300 /* move the code units in between up */
1301 r=p;
1302 do {
1303 *--r=*--current;
1304 } while(pBack!=current);
1305 }
1306 }
1307
1308 /* insert (c, c2) */
1309 *current=c;
1310 if(c2!=0) {
1311 *(current+1)=c2;
1312 }
1313
1314 /* we know the cc of the last code point */
1315 return trailCC;
1316 }
1317
1318 /*
1319 * merge two UTF-16 string parts together
1320 * to canonically order (order by combining classes) their concatenation
1321 *
1322 * the two strings may already be adjacent, so that the merging is done in-place
1323 * if the two strings are not adjacent, then the buffer holding the first one
1324 * must be large enough
1325 * the second string may or may not be ordered in itself
1326 *
1327 * before: [start..current[ is already ordered, and
1328 * [next..limit[ may be ordered in itself, but
1329 * is not in relation to [start..current[
1330 * after: [start..current+(limit-next)[ is ordered
1331 *
1332 * the algorithm is a simple bubble-sort that takes the characters from *next++
1333 * and inserts them in correct combining class order into the preceding part
1334 * of the string
1335 *
1336 * since this function is called much less often than the single-code point
1337 * _insertOrdered(), it just uses that for easier maintenance
1338 * (see file version from before 2001aug31 for a more optimized version)
1339 *
1340 * returns the trailing combining class
1341 */
1342 static uint8_t
1343 _mergeOrdered(UChar *start, UChar *current,
1344 const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1345 UChar *r;
1346 UChar c, c2;
1347 uint8_t cc, trailCC=0;
1348 UBool adjacent;
1349
1350 adjacent= current==next;
1351
1352 if(start!=current || !isOrdered) {
1353 while(next<limit) {
1354 cc=_getNextCC(next, limit, c, c2);
1355 if(cc==0) {
1356 /* does not bubble back */
1357 trailCC=0;
1358 if(adjacent) {
1359 current=(UChar *)next;
1360 } else {
1361 *current++=c;
1362 if(c2!=0) {
1363 *current++=c2;
1364 }
1365 }
1366 if(isOrdered) {
1367 break;
1368 } else {
1369 start=current;
1370 }
1371 } else {
1372 r=current+(c2==0 ? 1 : 2);
1373 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1374 current=r;
1375 }
1376 }
1377 }
1378
1379 if(next==limit) {
1380 /* we know the cc of the last code point */
1381 return trailCC;
1382 } else {
1383 if(!adjacent) {
1384 /* copy the second string part */
1385 do {
1386 *current++=*next++;
1387 } while(next!=limit);
1388 limit=current;
1389 }
1390 return _getPrevCC(start, limit);
1391 }
1392 }
1393
1394 /* find the last true starter in [start..src[ and return the pointer to it */
1395 static const UChar *
1396 _findPreviousStarter(const UChar *start, const UChar *src,
1397 uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1398 uint32_t norm32;
1399 UChar c, c2;
1400
1401 while(start<src) {
1402 norm32=_getPrevNorm32(start, src, minNoMaybe, c, c2);
1403 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1404 break;
1405 }
1406 }
1407 return src;
1408 }
1409
1410 /* find the first true starter in [src..limit[ and return the pointer to it */
1411 static const UChar *
1412 _findNextStarter(const UChar *src, const UChar *limit,
1413 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1414 const UChar *p;
1415 uint32_t norm32, ccOrQCMask;
1416 int32_t length;
1417 UChar c, c2;
1418 uint8_t cc, trailCC;
1419
1420 ccOrQCMask=_NORM_CC_MASK|qcMask;
1421
1422 for(;;) {
1423 if(src==limit) {
1424 break; /* end of string */
1425 }
1426 c=*src;
1427 if(c<minNoMaybe) {
1428 break; /* catches NUL terminater, too */
1429 }
1430
1431 norm32=_getNorm32(c);
1432 if((norm32&ccOrQCMask)==0) {
1433 break; /* true starter */
1434 }
1435
1436 if(U16_IS_LEAD(c)) {
1437 /* c is a lead surrogate, get the real norm32 */
1438 if((src+1)==limit || !U16_IS_TRAIL(c2=*(src+1))) {
1439 break; /* unmatched first surrogate: counts as a true starter */
1440 }
1441 norm32=_getNorm32FromSurrogatePair(c, c2);
1442
1443 if((norm32&ccOrQCMask)==0) {
1444 break; /* true starter */
1445 }
1446 } else {
1447 c2=0;
1448 }
1449
1450 /* (c, c2) is not a true starter but its decomposition may be */
1451 if(norm32&decompQCMask) {
1452 /* (c, c2) decomposes, get everything from the variable-length extra data */
1453 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1454
1455 /* get the first character's norm32 to check if it is a true starter */
1456 if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1457 break; /* true starter */
1458 }
1459 }
1460
1461 src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1462 }
1463
1464 return src;
1465 }
1466
1467 /* make NFD & NFKD ---------------------------------------------------------- */
1468
1469 U_CAPI int32_t U_EXPORT2
1470 unorm_getDecomposition(UChar32 c, UBool compat,
1471 UChar *dest, int32_t destCapacity) {
1472 #if !UNORM_HARDCODE_DATA
1473 UErrorCode errorCode=U_ZERO_ERROR;
1474 #endif
1475 if( (uint32_t)c<=0x10ffff &&
1476 #if !UNORM_HARDCODE_DATA
1477 _haveData(errorCode) &&
1478 #endif
1479 ((dest!=NULL && destCapacity>0) || destCapacity==0)
1480 ) {
1481 uint32_t norm32, qcMask;
1482 UChar32 minNoMaybe;
1483 int32_t length;
1484
1485 /* initialize */
1486 if(!compat) {
1487 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1488 qcMask=_NORM_QC_NFD;
1489 } else {
1490 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1491 qcMask=_NORM_QC_NFKD;
1492 }
1493
1494 if(c<minNoMaybe) {
1495 /* trivial case */
1496 if(destCapacity>0) {
1497 dest[0]=(UChar)c;
1498 }
1499 return -1;
1500 }
1501
1502 /* data lookup */
1503 norm32=UTRIE2_GET32(&normTrie, c);
1504 if((norm32&qcMask)==0) {
1505 /* simple case: no decomposition */
1506 if(c<=0xffff) {
1507 if(destCapacity>0) {
1508 dest[0]=(UChar)c;
1509 }
1510 return -1;
1511 } else {
1512 if(destCapacity>=2) {
1513 dest[0]=UTF16_LEAD(c);
1514 dest[1]=UTF16_TRAIL(c);
1515 }
1516 return -2;
1517 }
1518 } else if(isNorm32HangulOrJamo(norm32)) {
1519 /* Hangul syllable: decompose algorithmically */
1520 UChar c2;
1521
1522 c-=HANGUL_BASE;
1523
1524 c2=(UChar)(c%JAMO_T_COUNT);
1525 c/=JAMO_T_COUNT;
1526 if(c2>0) {
1527 if(destCapacity>=3) {
1528 dest[2]=(UChar)(JAMO_T_BASE+c2);
1529 }
1530 length=3;
1531 } else {
1532 length=2;
1533 }
1534
1535 if(destCapacity>=2) {
1536 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1537 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1538 }
1539 return length;
1540 } else {
1541 /* c decomposes, get everything from the variable-length extra data */
1542 const UChar *p, *limit;
1543 uint8_t cc, trailCC;
1544
1545 p=_decompose(norm32, qcMask, length, cc, trailCC);
1546 if(length<=destCapacity) {
1547 limit=p+length;
1548 do {
1549 *dest++=*p++;
1550 } while(p<limit);
1551 }
1552 return length;
1553 }
1554 } else {
1555 return 0;
1556 }
1557 }
1558
/*
 * Decompose the text src[0..srcLength[ (or the NUL-terminated src if
 * srcLength<0) into dest, canonically reordering the output while it is
 * being appended.
 *
 * dest, destCapacity  output buffer; once the output no longer fits, nothing
 *                     more is written but the output index keeps growing
 *                     (preflighting)
 * compat              FALSE: use the NFD quick check bit and minimum;
 *                     TRUE: use the NFKD ones
 * nx                  passed to nx_contains(); characters it reports are
 *                     appended without being decomposed
 * outTrailCC          receives the trailing combining class after the last
 *                     processed piece of text
 *
 * Returns the length of the complete output, which may exceed destCapacity.
 * The output is not NUL-terminated here.
 */
static int32_t
_decompose(UChar *dest, int32_t destCapacity,
           const UChar *src, int32_t srcLength,
           UBool compat, const UnicodeSet *nx,
           uint8_t &outTrailCC) {
    UChar buffer[3]; /* holds an algorithmic Hangul decomposition (L, V, optional T) */
    const UChar *limit, *prevSrc, *p;
    uint32_t norm32, ccOrQCMask, qcMask;
    int32_t destIndex, reorderStartIndex, length;
    UChar c, c2, minNoMaybe;
    uint8_t cc, prevCC, trailCC;

    /* code units below minNoMaybe are copied without any data lookup */
    if(!compat) {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
        qcMask=_NORM_QC_NFD;
    } else {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
        qcMask=_NORM_QC_NFKD;
    }

    /* initialize */
    ccOrQCMask=_NORM_CC_MASK|qcMask;
    destIndex=reorderStartIndex=0;
    prevCC=0;

    /* avoid compiler warnings */
    norm32=0;
    c=0;
    cc=0;
    trailCC=0;

    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* count code units below the minimum or with irrelevant data for the quick check */
        prevSrc=src;
        if(limit==NULL) {
            /* NUL-terminated: stop at c==0 or at the first code unit with cc or quick check bits */
            while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
                prevCC=0;
                ++src;
            }
        } else {
            while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
                prevCC=0;
                ++src;
            }
        }

        /* copy these code units all at once */
        if(src!=prevSrc) {
            length=(int32_t)(src-prevSrc);
            if((destIndex+length)<=destCapacity) {
                uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
            }
            destIndex+=length;
            /* everything copied here has cc==0, so reordering cannot reach back past it */
            reorderStartIndex=destIndex;
        }

        /* end of source reached? */
        if(limit==NULL ? c==0 : src==limit) {
            break;
        }

        /* c already contains *src and norm32 is set for it, increment src */
        ++src;

        /* check one above-minimum, relevant code unit */
        /*
         * generally, set p and length to the decomposition string
         * in simple cases, p==NULL and (c, c2) will hold the length code units to append
         * in all cases, set cc to the lead and trailCC to the trail combining class
         *
         * the following merge-sort of the current character into the preceding,
         * canonically ordered result text will use the optimized _insertOrdered()
         * if there is only one single code point to process;
         * this is indicated with p==NULL, and (c, c2) is the character to insert
         * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
         * for a supplementary character)
         * otherwise, p[length] is merged in with _mergeOrdered()
         */
        if(isNorm32HangulOrJamo(norm32)) {
            if(nx_contains(nx, c)) {
                /* excluded: append c itself as a single code unit */
                /* NOTE(review): cc and trailCC are not reset on this path and keep their previous values -- confirm this is intended */
                c2=0;
                p=NULL;
                length=1;
            } else {
                /* Hangul syllable: decompose algorithmically */
                p=buffer;
                cc=trailCC=0;

                c-=HANGUL_BASE;

                c2=(UChar)(c%JAMO_T_COUNT);
                c/=JAMO_T_COUNT;
                if(c2>0) {
                    buffer[2]=(UChar)(JAMO_T_BASE+c2);
                    length=3;
                } else {
                    length=2;
                }

                buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
                buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
            }
        } else {
            if(isNorm32Regular(norm32)) {
                c2=0;
                length=1;
            } else {
                /* c is a lead surrogate, get the real norm32 */
                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(c, c2);
                } else {
                    /* unpaired lead surrogate: handled like a character without data */
                    c2=0;
                    length=1;
                    norm32=0;
                }
            }

            /* get the decomposition and the lead and trail cc's */
            if(nx_contains(nx, c, c2)) {
                /* excluded: norm32==0 */
                cc=trailCC=0;
                p=NULL;
            } else if((norm32&qcMask)==0) {
                /* c does not decompose */
                cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
                p=NULL;
            } else {
                /* c decomposes, get everything from the variable-length extra data */
                p=_decompose(norm32, qcMask, length, cc, trailCC);
                if(length==1) {
                    /* fastpath a single code unit from decomposition */
                    c=*p;
                    c2=0;
                    p=NULL;
                }
            }
        }

        /* append the decomposition to the destination buffer, assume length>0 */
        if((destIndex+length)<=destCapacity) {
            /* reorderSplit: end of the existing output, where the new text starts */
            UChar *reorderSplit=dest+destIndex;
            if(p==NULL) {
                /* fastpath: single code point */
                if(cc!=0 && cc<prevCC) {
                    /* (c, c2) is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
                } else {
                    /* just append (c, c2) */
                    dest[destIndex++]=c;
                    if(c2!=0) {
                        dest[destIndex++]=c2;
                    }
                }
            } else {
                /* general: multiple code points (ordered by themselves) from decomposition */
                if(cc!=0 && cc<prevCC) {
                    /* the decomposition is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
                } else {
                    /* just append the decomposition */
                    do {
                        dest[destIndex++]=*p++;
                    } while(--length>0);
                }
            }
        } else {
            /* buffer overflow */
            /* keep incrementing the destIndex for preflighting */
            destIndex+=length;
        }

        /* a trailing cc of 0 ends the current reordering segment */
        prevCC=trailCC;
        if(prevCC==0) {
            reorderStartIndex=destIndex;
        }
    }

    outTrailCC=prevCC;
    return destIndex;
}
1753
1754 U_CAPI int32_t U_EXPORT2
1755 unorm_decompose(UChar *dest, int32_t destCapacity,
1756 const UChar *src, int32_t srcLength,
1757 UBool compat, int32_t options,
1758 UErrorCode *pErrorCode) {
1759 const UnicodeSet *nx;
1760 int32_t destIndex;
1761 uint8_t trailCC;
1762
1763 if(!_haveData(*pErrorCode)) {
1764 return 0;
1765 }
1766
1767 nx=getNX(options, *pErrorCode);
1768 if(U_FAILURE(*pErrorCode)) {
1769 return 0;
1770 }
1771
1772 destIndex=_decompose(dest, destCapacity,
1773 src, srcLength,
1774 compat, nx,
1775 trailCC);
1776
1777 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1778 }
1779
1780 /* make NFC & NFKC ---------------------------------------------------------- */
1781
1782 /* get the composition properties of the next character */
1783 static inline uint32_t
1784 _getNextCombining(UChar *&p, const UChar *limit,
1785 UChar &c, UChar &c2,
1786 uint16_t &combiningIndex, uint8_t &cc,
1787 const UnicodeSet *nx) {
1788 uint32_t norm32, combineFlags;
1789
1790 /* get properties */
1791 c=*p++;
1792 norm32=_getNorm32(c);
1793
1794 /* preset output values for most characters */
1795 c2=0;
1796 combiningIndex=0;
1797 cc=0;
1798
1799 if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1800 return 0;
1801 } else {
1802 if(isNorm32Regular(norm32)) {
1803 /* set cc etc. below */
1804 } else if(isNorm32HangulOrJamo(norm32)) {
1805 /* a compatibility decomposition contained Jamos */
1806 combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1807 return norm32&_NORM_COMBINES_ANY;
1808 } else {
1809 /* c is a lead surrogate, get the real norm32 */
1810 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1811 ++p;
1812 norm32=_getNorm32FromSurrogatePair(c, c2);
1813 } else {
1814 c2=0;
1815 return 0;
1816 }
1817 }
1818
1819 if(nx_contains(nx, c, c2)) {
1820 return 0; /* excluded: norm32==0 */
1821 }
1822
1823 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1824
1825 combineFlags=norm32&_NORM_COMBINES_ANY;
1826 if(combineFlags!=0) {
1827 combiningIndex=*(_getExtraData(norm32)-1);
1828 }
1829 return combineFlags;
1830 }
1831 }
1832
1833 /*
1834 * given a composition-result starter (c, c2) - which means its cc==0,
1835 * it combines forward, it has extra data, its norm32!=0,
1836 * it is not a Hangul or Jamo,
1837 * get just its combineFwdIndex
1838 *
1839 * norm32(c) is special if and only if c2!=0
1840 */
1841 static inline uint16_t
1842 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1843 uint32_t norm32;
1844
1845 if(c2==0) {
1846 norm32=_getNorm32(c);
1847 } else {
1848 norm32=_getNorm32FromSurrogatePair(c, c2);
1849 }
1850 return *(_getExtraData(norm32)-1);
1851 }
1852
1853 /*
1854 * Find the recomposition result for
1855 * a forward-combining character
1856 * (specified with a pointer to its part of the combiningTable[])
1857 * and a backward-combining character
1858 * (specified with its combineBackIndex).
1859 *
1860 * If these two characters combine, then set (value, value2)
1861 * with the code unit(s) of the composition character.
1862 *
1863 * Return value:
1864 * 0 do not combine
1865 * 1 combine
1866 * >1 combine, and the composition is a forward-combining starter
1867 *
1868 * See unormimp.h for a description of the composition table format.
1869 */
1870 static inline uint16_t
1871 _combine(const uint16_t *table, uint16_t combineBackIndex,
1872 uint16_t &value, uint16_t &value2) {
1873 uint16_t key;
1874
1875 /* search in the starter's composition table */
1876 for(;;) {
1877 key=*table++;
1878 if(key>=combineBackIndex) {
1879 break;
1880 }
1881 table+= *table&0x8000 ? 2 : 1;
1882 }
1883
1884 /* mask off bit 15, the last-entry-in-the-list flag */
1885 if((key&0x7fff)==combineBackIndex) {
1886 /* found! combine! */
1887 value=*table;
1888
1889 /* is the composition a starter that combines forward? */
1890 key=(uint16_t)((value&0x2000)+1);
1891
1892 /* get the composition result code point from the variable-length result value */
1893 if(value&0x8000) {
1894 if(value&0x4000) {
1895 /* surrogate pair composition result */
1896 value=(uint16_t)((value&0x3ff)|0xd800);
1897 value2=*(table+1);
1898 } else {
1899 /* BMP composition result U+2000..U+ffff */
1900 value=*(table+1);
1901 value2=0;
1902 }
1903 } else {
1904 /* BMP composition result U+0000..U+1fff */
1905 value&=0x1fff;
1906 value2=0;
1907 }
1908
1909 return key;
1910 } else {
1911 /* not found */
1912 return 0;
1913 }
1914 }
1915
1916 static inline UBool
1917 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1918 UBool compat, UChar *dest, const UnicodeSet *nx) {
1919 if(isJamoVTNorm32JamoV(norm32)) {
1920 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1921 prev=(UChar)(prev-JAMO_L_BASE);
1922 if(prev<JAMO_L_COUNT) {
1923 c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1924
1925 /* check if the next character is a Jamo T (normal or compatibility) */
1926 if(src!=limit) {
1927 UChar next, t;
1928
1929 next=*src;
1930 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1931 /* normal Jamo T */
1932 ++src;
1933 c+=t;
1934 } else if(compat) {
1935 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1936 norm32=_getNorm32(next);
1937 if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1938 const UChar *p;
1939 int32_t length;
1940 uint8_t cc, trailCC;
1941
1942 p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1943 if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1944 /* compatibility Jamo T */
1945 ++src;
1946 c+=t;
1947 }
1948 }
1949 }
1950 }
1951 if(nx_contains(nx, c)) {
1952 if(!isHangulWithoutJamoT(c)) {
1953 --src; /* undo ++src from reading the Jamo T */
1954 }
1955 return FALSE;
1956 }
1957 if(dest!=0) {
1958 *dest=c;
1959 }
1960 return TRUE;
1961 }
1962 } else if(isHangulWithoutJamoT(prev)) {
1963 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
1964 c=(UChar)(prev+(c-JAMO_T_BASE));
1965 if(nx_contains(nx, c)) {
1966 return FALSE;
1967 }
1968 if(dest!=0) {
1969 *dest=c;
1970 }
1971 return TRUE;
1972 }
1973 return FALSE;
1974 }
1975
/*
 * recompose the characters in [p..limit[
 * (which is in NFD - decomposed and canonically ordered),
 * adjust limit, and return the trailing cc
 *
 * since for NFKC we may get Jamos in decompositions, we need to
 * recompose those too
 *
 * note that recomposition never lengthens the text:
 * any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit
 *
 * parameters:
 * p        start of the text, modified in place
 * limit    end of the text, passed by reference and moved down
 *          whenever a combining mark or Jamo is removed
 * options  the bits UNORM_BEFORE_PRI_29 and _NORM_OPTIONS_COMPOSE_CONTIGUOUS
 *          are tested here
 * nx       passed to nx_contains(); compositions it reports are not formed
 *
 * NOTE(review): the loop reads *p (via _getNextCombining()) before it ever
 * compares p with limit, so callers apparently must pass a non-empty
 * range -- confirm.
 */
static uint8_t
_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
    UChar *starter, *pRemove, *q, *r;
    uint32_t combineFlags;
    UChar c, c2;
    uint16_t combineFwdIndex, combineBackIndex;
    uint16_t result, value, value2;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    starter=NULL;                   /* no starter */
    combineFwdIndex=0;              /* will not be used until starter!=NULL - avoid compiler warnings */
    combineBackIndex=0;             /* will always be set if combineFlags!=0 - avoid compiler warnings */
    value=value2=0;                 /* always set by _combine() before used - avoid compiler warnings */
    starterIsSupplementary=FALSE;   /* will not be used until starter!=NULL - avoid compiler warnings */
    prevCC=0;

    for(;;) {
        /* read the next character; p now points behind it */
        combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
        if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
            if(combineBackIndex&0x8000) {
                /* c is a Jamo V/T, see if we can compose it with the previous character */
                /* for the PRI #29 fix, check that there is no intervening combining mark */
                if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
                    pRemove=NULL; /* NULL while no Hangul composition */
                    combineFlags=0;
                    c2=*starter;
                    if(combineBackIndex==0xfff2) {
                        /* Jamo V, compose with previous Jamo L and following Jamo T */
                        c2=(UChar)(c2-JAMO_L_BASE);
                        if(c2<JAMO_L_COUNT) {
                            pRemove=p-1;
                            c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
                            if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
                                ++p;
                                c+=c2;
                            } else {
                                /* the result is an LV syllable, which is a starter (unlike LVT) */
                                combineFlags=_NORM_COMBINES_FWD;
                            }
                            if(!nx_contains(nx, c)) {
                                *starter=c;
                            } else {
                                /* excluded */
                                if(!isHangulWithoutJamoT(c)) {
                                    --p; /* undo the ++p from reading the Jamo T */
                                }
                                /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
                                pRemove=NULL;
                            }
                        }

                    /*
                     * Normally, the following can not occur:
                     * Since the input is in NFD, there are no Hangul LV syllables that
                     * a Jamo T could combine with.
                     * All Jamo Ts are combined above when handling Jamo Vs.
                     *
                     * However, before the PRI #29 fix, this can occur due to
                     * an intervening combining mark between the Hangul LV and the Jamo T.
                     */
                    } else {
                        /* Jamo T, compose with previous Hangul that does not have a Jamo T */
                        if(isHangulWithoutJamoT(c2)) {
                            c2+=(UChar)(c-JAMO_T_BASE);
                            if(!nx_contains(nx, c2)) {
                                pRemove=p-1;
                                *starter=c2;
                            }
                        }
                    }

                    if(pRemove!=NULL) {
                        /* remove the Jamo(s) */
                        /* by moving [p..limit[ down to pRemove; limit shrinks accordingly */
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        p=pRemove;
                        limit=q;
                    }

                    c2=0; /* c2 held *starter temporarily */

                    if(combineFlags!=0) {
                        /*
                         * not starter=NULL because the composition is a Hangul LV syllable
                         * and might combine once more (but only before the PRI #29 fix)
                         */

                        /* done? */
                        if(p==limit) {
                            return prevCC;
                        }

                        /* the composition is a Hangul LV syllable which is a starter that combines forward */
                        combineFwdIndex=0xfff0;

                        /* we combined; continue with looking for compositions */
                        continue;
                    }
                }

                /*
                 * now: cc==0 and the combining index does not include "forward" ->
                 * the rest of the loop body will reset starter to NULL;
                 * technically, a composed Hangul syllable is a starter, but it
                 * does not combine forward now that we have consumed all eligible Jamos;
                 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
                 */

            } else if(
                /* the starter is not a Hangul LV or Jamo V/T and */
                !(combineFwdIndex&0x8000) &&
                /* the combining mark is not blocked and */
                ((options&UNORM_BEFORE_PRI_29) ?
                    (prevCC!=cc || prevCC==0) :
                    (prevCC<cc || prevCC==0)) &&
                /* the starter and the combining mark (c, c2) do combine and */
                0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
                /* the composition result is not excluded */
                !nx_contains(nx, value, value2)
            ) {
                /* replace the starter with the composition, remove the combining mark */
                pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */

                /* replace the starter with the composition */
                *starter=(UChar)value;
                if(starterIsSupplementary) {
                    if(value2!=0) {
                        /* both are supplementary */
                        *(starter+1)=(UChar)value2;
                    } else {
                        /* the composition is shorter than the starter, move the intermediate characters forward one */
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(value2!=0) {
                    /* the composition is longer than the starter, move the intermediate characters back one */
                    starterIsSupplementary=TRUE;
                    ++starter; /* temporarily increment for the loop boundary */
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=(UChar)value2;
                    --starter; /* undo the temporary increment */
                /* } else { both are on the BMP, nothing more to do */
                }

                /* remove the combining mark by moving the following text over it */
                /* (pRemove==p when the longer composition has used up the mark's single code unit) */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    p=pRemove;
                    limit=q;
                }

                /* keep prevCC because we removed the combining mark */

                /* done? */
                if(p==limit) {
                    return prevCC;
                }

                /* is the composition a starter that combines forward? */
                if(result>1) {
                    combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
                } else {
                    starter=NULL;
                }

                /* we combined; continue with looking for compositions */
                continue;
            }
        }

        /* no combination this time */
        prevCC=cc;
        if(p==limit) {
            return prevCC;
        }

        /* if (c, c2) did not combine, then check if it is a starter */
        if(cc==0) {
            /* found a new starter; combineFlags==0 if (c, c2) is excluded */
            if(combineFlags&_NORM_COMBINES_FWD) {
                /* it may combine with something, prepare for it */
                if(c2==0) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
                /* _getNextCombining() returned this character's combining index in combineBackIndex */
                combineFwdIndex=combineBackIndex;
            } else {
                /* it will not combine with anything */
                starter=NULL;
            }
        } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
            /* FCC: no discontiguous compositions; any intervening character blocks */
            starter=NULL;
        }
    }
}
2205
2206 /* decompose and recompose [prevStarter..src[ */
2207 static const UChar *
2208 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2209 const UChar *prevStarter, const UChar *src,
2210 uint8_t &prevCC,
2211 int32_t options, const UnicodeSet *nx,
2212 UErrorCode *pErrorCode) {
2213 UChar *recomposeLimit;
2214 uint8_t trailCC;
2215 UBool compat;
2216
2217 compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2218
2219 /* decompose [prevStarter..src[ */
2220 length=_decompose(buffer, bufferCapacity,
2221 prevStarter, (int32_t)(src-prevStarter),
2222 compat, nx,
2223 trailCC);
2224 if(length>bufferCapacity) {
2225 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2226 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2227 return NULL;
2228 }
2229 length=_decompose(buffer, bufferCapacity,
2230 prevStarter, (int32_t)(src-prevStarter),
2231 compat, nx,
2232 trailCC);
2233 }
2234
2235 /* recompose the decomposition */
2236 recomposeLimit=buffer+length;
2237 if(length>=2) {
2238 prevCC=_recompose(buffer, recomposeLimit, options, nx);
2239 }
2240
2241 /* return with a pointer to the recomposition and its length */
2242 length=(int32_t)(recomposeLimit-buffer);
2243 return buffer;
2244 }
2245
/*
 * Compose src into dest (NFC-style, or NFKC-style when options contains
 * _NORM_OPTIONS_COMPAT, which selects the minNoMaybe index and quick check mask).
 *
 * src/srcLength: the source text; srcLength>=0 gives an explicit length,
 *                otherwise the text is treated as NUL-terminated.
 * dest/destCapacity: the output buffer; when the output does not fit,
 *                nothing more is written but destIndex keeps counting
 *                so that the caller can preflight the required length.
 * options, nx:   passed through to _composePart() and nx_contains().
 * pErrorCode:    set by _composePart() if the side buffer cannot be grown.
 *
 * Returns the number of code units of the result (destIndex), or 0 if
 * _composePart() failed. This function does not NUL-terminate dest.
 */
static int32_t
_compose(UChar *dest, int32_t destCapacity,
         const UChar *src, int32_t srcLength,
         int32_t options, const UnicodeSet *nx,
         UErrorCode *pErrorCode) {
    UChar stackBuffer[_STACK_BUFFER_CAPACITY];
    UChar *buffer;
    int32_t bufferCapacity;

    const UChar *limit, *prevSrc, *prevStarter;
    uint32_t norm32, ccOrQCMask, qcMask;
    int32_t destIndex, reorderStartIndex, length;
    UChar c, c2, minNoMaybe;
    uint8_t cc, prevCC;

    /* pick the quick check minimum and mask for compatibility vs. canonical composition */
    if(options&_NORM_OPTIONS_COMPAT) {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
        qcMask=_NORM_QC_NFKC;
    } else {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
        qcMask=_NORM_QC_NFC;
    }

    /* initialize */
    buffer=stackBuffer;
    bufferCapacity=_STACK_BUFFER_CAPACITY;

    /*
     * prevStarter points to the last character before the current one
     * that is a "true" starter with cc==0 and quick check "yes".
     *
     * prevStarter will be used instead of looking for a true starter
     * while incrementally decomposing [prevStarter..prevSrc[
     * in _composePart(). Having a good prevStarter allows to just decompose
     * the entire [prevStarter..prevSrc[.
     *
     * When _composePart() backs out from prevSrc back to prevStarter,
     * then it also backs out destIndex by the same amount.
     * Therefore, at all times, the (prevSrc-prevStarter) source units
     * must correspond 1:1 to destination units counted with destIndex,
     * except for reordering.
     * This is true for the qc "yes" characters copied in the fast loop,
     * and for pure reordering.
     * prevStarter must be set forward to src when this is not true:
     * In _composePart() and after composing a Hangul syllable.
     *
     * This mechanism relies on the assumption that the decomposition of a true starter
     * also begins with a true starter. gennorm/store.c checks for this.
     */
    prevStarter=src;

    ccOrQCMask=_NORM_CC_MASK|qcMask;
    destIndex=reorderStartIndex=0;
    prevCC=0;

    /* avoid compiler warnings */
    norm32=0;
    c=0;

    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* count code units below the minimum or with irrelevant data for the quick check */
        prevSrc=src;
        if(limit==NULL) {
            /* NUL-terminated: the c!=0 test doubles as the end-of-string check */
            while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
                prevCC=0;
                ++src;
            }
        } else {
            while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
                prevCC=0;
                ++src;
            }
        }

        /* copy these code units all at once */
        if(src!=prevSrc) {
            length=(int32_t)(src-prevSrc);
            if((destIndex+length)<=destCapacity) {
                uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
            }
            destIndex+=length;
            reorderStartIndex=destIndex;

            /* set prevStarter to the last character in the quick check loop */
            prevStarter=src-1;
            if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
                /* the last character is a surrogate pair: point to its lead surrogate */
                --prevStarter;
            }

            prevSrc=src;
        }

        /* end of source reached? */
        if(limit==NULL ? c==0 : src==limit) {
            break;
        }

        /* c already contains *src and norm32 is set for it, increment src */
        ++src;

        /*
         * source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c, c2)       processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * start         prevStarter   prevSrc       src           limit
         *
         *
         * destination buffer pointers and indexes:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * dest      reorderStartIndex destIndex     destCapacity
         */

        /* check one above-minimum, relevant code unit */
        /*
         * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
         * check for Jamo V/T, then for surrogates and regular characters
         * c is not a Hangul syllable or Jamo L because
         * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
         */
        if(isNorm32HangulOrJamo(norm32)) {
            /*
             * c is a Jamo V/T:
             * try to compose with the previous character, Jamo V also with a following Jamo T,
             * and set values here right now in case we just continue with the main loop
             */
            prevCC=cc=0;
            reorderStartIndex=destIndex;

            /*
             * destIndex>0 guarantees that there is a previous character at prevSrc-1;
             * the destination pointer is 0 when the previous character did not fit into dest
             */
            if(
                destIndex>0 &&
                _composeHangul(
                    *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
                    destIndex<=destCapacity ? dest+(destIndex-1) : 0,
                    nx)
            ) {
                prevStarter=src;
                continue;
            }

            /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
            c2=0;
            length=1;
            prevStarter=prevSrc;
        } else {
            if(isNorm32Regular(norm32)) {
                c2=0;
                length=1;
            } else {
                /* c is a lead surrogate, get the real norm32 */
                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(c, c2);
                } else {
                    /* c is an unpaired lead surrogate, nothing to do */
                    c2=0;
                    length=1;
                    norm32=0;
                }
            }

            /* we are looking at the character (c, c2) at [prevSrc..src[ */
            if(nx_contains(nx, c, c2)) {
                /* excluded: norm32==0 */
                cc=0;
            } else if((norm32&qcMask)==0) {
                /* quick check "yes": only the combining class matters */
                cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
            } else {
                const UChar *p;
                uint32_t decompQCMask;

                /*
                 * find appropriate boundaries around this character,
                 * decompose the source text from between the boundaries,
                 * and recompose it
                 *
                 * this puts the intermediate text into the side buffer because
                 * it might be longer than the recomposition end result,
                 * or the destination buffer may be too short or missing
                 *
                 * note that destIndex may be adjusted backwards to account
                 * for source text that passed the quick check but needed to
                 * take part in the recomposition
                 */
                decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */

                /*
                 * find the last true starter in [prevStarter..src[
                 * it is either the decomposition of the current character (at prevSrc),
                 * or prevStarter
                 */
                if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
                    prevStarter=prevSrc;
                } else {
                    /* adjust destIndex: back out what had been copied with qc "yes" */
                    destIndex-=(int32_t)(prevSrc-prevStarter);
                }

                /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
                src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);

                /* compose [prevStarter..src[ */
                p=_composePart(stackBuffer, buffer, bufferCapacity,
                               length, /* output */
                               prevStarter, src,
                               prevCC, /* output */
                               options, nx,
                               pErrorCode);

                if(p==NULL) {
                    destIndex=0; /* an error occurred (out of memory) */
                    break;
                }

                /* append the recomposed buffer contents to the destination buffer */
                if((destIndex+length)<=destCapacity) {
                    while(length>0) {
                        dest[destIndex++]=*p++;
                        --length;
                    }
                } else {
                    /* buffer overflow */
                    /* keep incrementing the destIndex for preflighting */
                    destIndex+=length;
                }

                /* set the next starter */
                prevStarter=src;

                continue;
            }
        }

        /* append the single code point (c, c2) to the destination buffer */
        if((destIndex+length)<=destCapacity) {
            if(cc!=0 && cc<prevCC) {
                /* (c, c2) is out of order with respect to the preceding text */
                UChar *reorderSplit=dest+destIndex;
                destIndex+=length;
                prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
            } else {
                /* just append (c, c2) */
                dest[destIndex++]=c;
                if(c2!=0) {
                    dest[destIndex++]=c2;
                }
                prevCC=cc;
            }
        } else {
            /* buffer overflow */
            /* keep incrementing the destIndex for preflighting */
            destIndex+=length;
            prevCC=cc;
        }
    }

    /* cleanup: release the side buffer if _composePart() had to replace the stack buffer */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return destIndex;
}
2529
2530 U_CAPI int32_t U_EXPORT2
2531 unorm_compose(UChar *dest, int32_t destCapacity,
2532 const UChar *src, int32_t srcLength,
2533 UBool compat, int32_t options,
2534 UErrorCode *pErrorCode) {
2535 const UnicodeSet *nx;
2536 int32_t destIndex;
2537
2538 if(!_haveData(*pErrorCode)) {
2539 return 0;
2540 }
2541
2542 nx=getNX(options, *pErrorCode);
2543 if(U_FAILURE(*pErrorCode)) {
2544 return 0;
2545 }
2546
2547 /* reset options bits that should only be set here or inside _compose() */
2548 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2549
2550 if(compat) {
2551 options|=_NORM_OPTIONS_COMPAT;
2552 }
2553
2554 destIndex=_compose(dest, destCapacity,
2555 src, srcLength,
2556 options, nx,
2557 pErrorCode);
2558
2559 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2560 }
2561
2562 /* make FCD ----------------------------------------------------------------- */
2563
2564 static const UChar *
2565 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2566 UChar c, c2;
2567
2568 /*
2569 * find the first position in [src..limit[ after some cc==0 according to FCD data
2570 *
2571 * at the beginning of the loop, we have fcd16 from before src
2572 *
2573 * stop at positions:
2574 * - after trail cc==0
2575 * - at the end of the source
2576 * - before lead cc==0
2577 */
2578 for(;;) {
2579 /* stop if trail cc==0 for the previous character */
2580 if((fcd16&0xff)==0) {
2581 break;
2582 }
2583
2584 /* get c=*src - stop at end of string */
2585 if(src==limit) {
2586 break;
2587 }
2588 c=*src;
2589
2590 /* stop if lead cc==0 for this character */
2591 if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2592 break; /* catches terminating NUL, too */
2593 }
2594
2595 if(!UTF_IS_FIRST_SURROGATE(c)) {
2596 if(fcd16<=0xff) {
2597 break;
2598 }
2599 ++src;
2600 } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2601 /* c is a lead surrogate, get the real fcd16 */
2602 fcd16=_getFCD16FromSurrogatePair(c, c2);
2603 if(fcd16<=0xff) {
2604 break;
2605 }
2606 src+=2;
2607 } else {
2608 /* c is an unpaired first surrogate, lead cc==0 */
2609 break;
2610 }
2611 }
2612
2613 return src;
2614 }
2615
/*
 * Canonically decompose [src..decompLimit[ and append the result to dest,
 * keeping the appended text in combining-class order.
 *
 * destIndex is advanced by the number of code units produced; when the
 * output does not fit into destCapacity, nothing is written for that
 * character but destIndex keeps counting for preflighting.
 * Characters contained in nx are treated as if they had norm32==0.
 *
 * Returns the trail combining class of the last character processed
 * (0 if the range is empty).
 */
static uint8_t
_decomposeFCD(const UChar *src, const UChar *decompLimit,
              UChar *dest, int32_t &destIndex, int32_t destCapacity,
              const UnicodeSet *nx) {
    const UChar *p;
    uint32_t norm32;
    int32_t reorderStartIndex, length;
    UChar c, c2;
    uint8_t cc, prevCC, trailCC;

    /*
     * canonically decompose [src..decompLimit[
     *
     * all characters in this range have some non-zero cc,
     * directly or in decomposition,
     * so that we do not need to check in the following for quick-check limits etc.
     *
     * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
     *
     * we also do not need to check for c==0 because we have an established decompLimit
     */
    reorderStartIndex=destIndex;
    prevCC=0;

    while(src<decompLimit) {
        c=*src++;
        norm32=_getNorm32(c);
        if(isNorm32Regular(norm32)) {
            c2=0;
            length=1;
        } else {
            /*
             * reminder: this function is called with [src..decompLimit[
             * not containing any Hangul/Jamo characters,
             * therefore the only specials are lead surrogates
             */
            /* c is a lead surrogate, get the real norm32 */
            if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                length=2;
                norm32=_getNorm32FromSurrogatePair(c, c2);
            } else {
                /* unpaired lead surrogate: treat as a single inert code unit */
                c2=0;
                length=1;
                norm32=0;
            }
        }

        /* get the decomposition and the lead and trail cc's */
        if(nx_contains(nx, c, c2)) {
            /* excluded: norm32==0 */
            cc=trailCC=0;
            p=NULL;
        } else if((norm32&_NORM_QC_NFD)==0) {
            /* c does not decompose */
            cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
            p=NULL;
        } else {
            /* c decomposes, get everything from the variable-length extra data */
            p=_decompose(norm32, length, cc, trailCC);
            if(length==1) {
                /* fastpath a single code unit from decomposition */
                c=*p;
                c2=0;
                p=NULL;
            }
        }

        /* append the decomposition to the destination buffer, assume length>0 */
        if((destIndex+length)<=destCapacity) {
            /* remember where the new text starts before destIndex is advanced */
            UChar *reorderSplit=dest+destIndex;
            if(p==NULL) {
                /* fastpath: single code point */
                if(cc!=0 && cc<prevCC) {
                    /* (c, c2) is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
                } else {
                    /* just append (c, c2) */
                    dest[destIndex++]=c;
                    if(c2!=0) {
                        dest[destIndex++]=c2;
                    }
                }
            } else {
                /* general: multiple code points (ordered by themselves) from decomposition */
                if(cc!=0 && cc<prevCC) {
                    /* the decomposition is out of order with respect to the preceding text */
                    destIndex+=length;
                    trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
                } else {
                    /* just append the decomposition */
                    do {
                        dest[destIndex++]=*p++;
                    } while(--length>0);
                }
            }
        } else {
            /* buffer overflow */
            /* keep incrementing the destIndex for preflighting */
            destIndex+=length;
        }

        /* a trail cc of 0 ends the reordering window: later marks cannot move before it */
        prevCC=trailCC;
        if(prevCC==0) {
            reorderStartIndex=destIndex;
        }
    }

    return prevCC;
}
2727
/*
 * Write an FCD version of src into dest.
 *
 * Text whose lead/trail combining classes are already in order is copied
 * unchanged; where a character's lead cc is lower than the previous trail cc,
 * the piece from the last safe position (decompStart) up to the position
 * found by _findSafeFCD() is decomposed and reordered with _decomposeFCD().
 *
 * srcLength>=0 gives an explicit length, otherwise src is NUL-terminated.
 * Characters contained in nx are treated as if their fcd16 were 0.
 * When the output does not fit, destIndex keeps counting for preflighting.
 *
 * Returns 0 if _haveData() fails, otherwise the result of
 * u_terminateUChars() for the produced length.
 */
static int32_t
unorm_makeFCD(UChar *dest, int32_t destCapacity,
              const UChar *src, int32_t srcLength,
              const UnicodeSet *nx,
              UErrorCode *pErrorCode) {
    const UChar *limit, *prevSrc, *decompStart;
    int32_t destIndex, length;
    UChar c, c2;
    uint16_t fcd16;
    int16_t prevCC, cc;

    if(!_haveData(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    decompStart=src;
    destIndex=0;
    prevCC=0;

    /* avoid compiler warnings */
    c=0;
    fcd16=0;

    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
        prevSrc=src;
        if(limit==NULL) {
            for(;;) {
                c=*src;
                if(c<_NORM_MIN_WITH_LEAD_CC) {
                    if(c==0) {
                        break;
                    }
                    /* defer the FCD lookup: remember the code unit as a negative value */
                    prevCC=(int16_t)-c;
                } else if((fcd16=_getFCD16(c))==0) {
                    prevCC=0;
                } else {
                    break;
                }
                ++src;
            }
        } else {
            for(;;) {
                if(src==limit) {
                    break;
                } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
                    /* defer the FCD lookup: remember the code unit as a negative value */
                    prevCC=(int16_t)-c;
                } else if((fcd16=_getFCD16(c))==0) {
                    prevCC=0;
                } else {
                    break;
                }
                ++src;
            }
        }

        /*
         * prevCC has values from the following ranges:
         * 0..0xff - the previous trail combining class
         * <0      - the negative value of the previous code unit;
         *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
         *           was deferred so that average text is checked faster
         */

        /* copy these code units all at once */
        if(src!=prevSrc) {
            length=(int32_t)(src-prevSrc);
            if((destIndex+length)<=destCapacity) {
                uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
            }
            destIndex+=length;
            prevSrc=src;

            /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
            if(prevCC<0) {
                /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
                if(!nx_contains(nx, (UChar32)-prevCC)) {
                    prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
                } else {
                    prevCC=0; /* excluded: fcd16==0 */
                }

                /*
                 * set a pointer to this below-U+0300 character;
                 * if prevCC==0 then it will be moved to after this character below
                 */
                decompStart=prevSrc-1;
            }
        }
        /*
         * now:
         * prevSrc==src - used later to adjust destIndex before decomposition
         * prevCC>=0
         */

        /* end of source reached? */
        if(limit==NULL ? c==0 : src==limit) {
            break;
        }

        /* set a pointer to after the last source position where prevCC==0 */
        if(prevCC==0) {
            decompStart=prevSrc;
        }

        /* c already contains *src and fcd16 is set for it, increment src */
        ++src;

        /* check one above-minimum, relevant code unit */
        if(UTF_IS_FIRST_SURROGATE(c)) {
            /* c is a lead surrogate, get the real fcd16 */
            if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                fcd16=_getFCD16FromSurrogatePair(c, c2);
            } else {
                /* unpaired lead surrogate: no combining classes */
                c2=0;
                fcd16=0;
            }
        } else {
            c2=0;
        }

        /* we are looking at the character (c, c2) at [prevSrc..src[ */
        if(nx_contains(nx, c, c2)) {
            fcd16=0; /* excluded: fcd16==0 */
        }

        /* check the combining order, get the lead cc */
        cc=(int16_t)(fcd16>>8);
        if(cc==0 || cc>=prevCC) {
            /* the order is ok */
            if(cc==0) {
                decompStart=prevSrc;
            }
            prevCC=(int16_t)(fcd16&0xff);

            /* just append (c, c2) */
            length= c2==0 ? 1 : 2;
            if((destIndex+length)<=destCapacity) {
                dest[destIndex++]=c;
                if(c2!=0) {
                    dest[destIndex++]=c2;
                }
            } else {
                /* buffer overflow: keep counting for preflighting */
                destIndex+=length;
            }
        } else {
            /*
             * back out the part of the source that we copied already but
             * is now going to be decomposed;
             * prevSrc is set to after what was copied
             */
            destIndex-=(int32_t)(prevSrc-decompStart);

            /*
             * find the part of the source that needs to be decomposed;
             * to be safe and simple, decompose to before the next character with lead cc==0
             */
            src=_findSafeFCD(src, limit, fcd16);

            /*
             * the source text does not fulfill the conditions for FCD;
             * decompose and reorder a limited piece of the text
             */
            prevCC=_decomposeFCD(decompStart, src,
                                 dest, destIndex, destCapacity,
                                 nx);
            decompStart=src;
        }
    }

    return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
}
2912
2913 /* quick check functions ---------------------------------------------------- */
2914
/*
 * Check whether src is in FCD form.
 *
 * Walks the text and compares each character's lead combining class with
 * the trail combining class of the character before it.
 * srcLength>=0 gives an explicit length, otherwise src is NUL-terminated.
 * Characters contained in nx are treated as if their fcd16 were 0.
 *
 * Returns FALSE as soon as a non-zero lead cc is lower than the previous
 * trail cc, TRUE if the end of the text is reached without such a case.
 */
static UBool
unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
    const UChar *limit;
    UChar c, c2;
    uint16_t fcd16;
    int16_t prevCC, cc;

    /* initialize */
    prevCC=0;

    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
        if(limit==NULL) {
            for(;;) {
                c=*src++;
                if(c<_NORM_MIN_WITH_LEAD_CC) {
                    if(c==0) {
                        /* reached the terminating NUL without finding a violation */
                        return TRUE;
                    }
                    /*
                     * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
                     * because chances are good that the next one will have
                     * a leading cc of 0;
                     * _getFCD16(-prevCC) is later called when necessary -
                     * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
                     */
                    prevCC=(int16_t)-c;
                } else if((fcd16=_getFCD16(c))==0) {
                    prevCC=0;
                } else {
                    break;
                }
            }
        } else {
            for(;;) {
                if(src==limit) {
                    /* reached the end of the text without finding a violation */
                    return TRUE;
                } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
                    /* deferred lookup, see the NUL-terminated branch */
                    prevCC=(int16_t)-c;
                } else if((fcd16=_getFCD16(c))==0) {
                    prevCC=0;
                } else {
                    break;
                }
            }
        }

        /* check one above-minimum, relevant code unit */
        if(UTF_IS_FIRST_SURROGATE(c)) {
            /* c is a lead surrogate, get the real fcd16 */
            if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                fcd16=_getFCD16FromSurrogatePair(c, c2);
            } else {
                /* unpaired lead surrogate: no combining classes */
                c2=0;
                fcd16=0;
            }
        } else {
            c2=0;
        }

        if(nx_contains(nx, c, c2)) {
            prevCC=0; /* excluded: fcd16==0 */
            continue;
        }

        /*
         * prevCC has values from the following ranges:
         * 0..0xff - the previous trail combining class
         * <0      - the negative value of the previous code unit;
         *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
         *           was deferred so that average text is checked faster
         */

        /* check the combining order */
        cc=(int16_t)(fcd16>>8);
        if(cc!=0) {
            if(prevCC<0) {
                /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
                if(!nx_contains(nx, (UChar32)-prevCC)) {
                    prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
                } else {
                    prevCC=0; /* excluded: fcd16==0 */
                }
            }

            if(cc<prevCC) {
                return FALSE;
            }
        }
        /* the trail cc of this character is what the next lead cc is compared against */
        prevCC=(int16_t)(fcd16&0xff);
    }
}
3018
/*
 * Quick check (and, optionally, full check) whether src is normalized
 * according to mode.
 *
 * src/srcLength: the text; srcLength>=0 gives an explicit length,
 *                -1 means NUL-terminated, anything below -1 is an error.
 * mode:          NFC/NFKC/NFD/NFKD are checked here; FCD is delegated to
 *                unorm_checkFCD(); any other value is an illegal argument.
 * allowMaybe:    if TRUE, a "maybe" quick check flag makes the result
 *                UNORM_MAYBE (unless a later character yields UNORM_NO);
 *                if FALSE, the text around such a character is actually
 *                decomposed and recomposed with _composePart() and compared
 *                with the original, so that the result is YES or NO.
 * nx:            characters in this set are treated as if norm32==0.
 * pErrorCode:    must not indicate a failure on input; set to
 *                U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                U_UNSUPPORTED_ERROR if FCD data is missing, or to the
 *                error reported by _composePart().
 *
 * Returns UNORM_YES, UNORM_NO or UNORM_MAYBE; UNORM_MAYBE is also the
 * return value for all error cases.
 */
static UNormalizationCheckResult
_quickCheck(const UChar *src,
            int32_t srcLength,
            UNormalizationMode mode,
            UBool allowMaybe,
            const UnicodeSet *nx,
            UErrorCode *pErrorCode) {
    UChar stackBuffer[_STACK_BUFFER_CAPACITY];
    UChar *buffer;
    int32_t bufferCapacity;

    const UChar *start, *limit;
    uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
    int32_t options;
    UChar c, c2, minNoMaybe;
    uint8_t cc, prevCC;
    UNormalizationCheckResult result;

    /* check arguments */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return UNORM_MAYBE;
    }

    if(src==NULL || srcLength<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_MAYBE;
    }

    if(!_haveData(*pErrorCode)) {
        return UNORM_MAYBE;
    }

    /* check for a valid mode and set the quick check minimum and mask */
    switch(mode) {
    case UNORM_NFC:
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
        qcMask=_NORM_QC_NFC;
        options=0;
        break;
    case UNORM_NFKC:
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
        qcMask=_NORM_QC_NFKC;
        options=_NORM_OPTIONS_COMPAT;
        break;
    case UNORM_NFD:
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
        qcMask=_NORM_QC_NFD;
        options=0;
        break;
    case UNORM_NFKD:
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
        qcMask=_NORM_QC_NFKD;
        options=_NORM_OPTIONS_COMPAT;
        break;
    case UNORM_FCD:
        /* FCD has its own data and its own checking function */
        if(fcdTrie.index==NULL) {
            *pErrorCode=U_UNSUPPORTED_ERROR;
            return UNORM_MAYBE;
        }
        return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
    default:
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_MAYBE;
    }

    /* initialize */
    buffer=stackBuffer;
    bufferCapacity=_STACK_BUFFER_CAPACITY;

    ccOrQCMask=_NORM_CC_MASK|qcMask;
    result=UNORM_YES;
    prevCC=0;

    start=src;
    if(srcLength>=0) {
        /* string with length */
        limit=src+srcLength;
    } else /* srcLength==-1 */ {
        /* zero-terminated string */
        limit=NULL;
    }

    U_ALIGN_CODE(16);

    for(;;) {
        /* skip a run of code units below the minimum or with irrelevant data for the quick check */
        if(limit==NULL) {
            for(;;) {
                c=*src++;
                if(c<minNoMaybe) {
                    if(c==0) {
                        goto endloop; /* break out of outer loop */
                    }
                } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
                    break;
                }
                prevCC=0;
            }
        } else {
            for(;;) {
                if(src==limit) {
                    goto endloop; /* break out of outer loop */
                } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
                    break;
                }
                prevCC=0;
            }
        }

        /* check one above-minimum, relevant code unit */
        if(U16_IS_LEAD(c)) {
            /* c is a lead surrogate, get the real norm32 */
            if(src!=limit && U16_IS_TRAIL(c2=*src)) {
                ++src;
                norm32=_getNorm32FromSurrogatePair(c, c2);
            } else {
                /* unpaired lead surrogate: no normalization data */
                c2=0;
                norm32=0;
            }
        } else {
            c2=0;
        }

        if(nx_contains(nx, c, c2)) {
            /* excluded: norm32==0 */
            norm32=0;
        }

        /* check the combining order */
        cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
        if(cc!=0 && cc<prevCC) {
            result=UNORM_NO;
            break;
        }
        prevCC=cc;

        /* check for "no" or "maybe" quick check flags */
        qcNorm32=norm32&qcMask;
        if(qcNorm32&_NORM_QC_ANY_NO) {
            result=UNORM_NO;
            break;
        } else if(qcNorm32!=0) {
            /* "maybe" can only occur for NFC and NFKC */
            if(allowMaybe) {
                /* remember the "maybe" but keep scanning: a later "no" still wins */
                result=UNORM_MAYBE;
            } else {
                /* normalize a section around here to see if it is really normalized or not */
                const UChar *prevStarter;
                uint32_t decompQCMask;
                int32_t length;

                decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */

                /* find the previous starter */
                prevStarter=src-1; /* set prevStarter to the beginning of the current character */
                if(UTF_IS_TRAIL(*prevStarter)) {
                    --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
                }
                prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);

                /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
                src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);

                /* decompose and recompose [prevStarter..src[ */
                _composePart(stackBuffer, buffer, bufferCapacity,
                             length,
                             prevStarter,
                             src,
                             prevCC,
                             options, nx, pErrorCode);
                if(U_FAILURE(*pErrorCode)) {
                    result=UNORM_MAYBE; /* error (out of memory) */
                    break;
                }

                /* compare the normalized version with the original */
                if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
                    result=UNORM_NO; /* normalization differs */
                    break;
                }

                /* continue after the next starter */
            }
        }
    }
endloop:

    /* release the side buffer if _composePart() had to replace the stack buffer */
    if(buffer!=stackBuffer) {
        uprv_free(buffer);
    }

    return result;
}
3212
3213 U_CAPI UNormalizationCheckResult U_EXPORT2
3214 unorm_quickCheck(const UChar *src,
3215 int32_t srcLength,
3216 UNormalizationMode mode,
3217 UErrorCode *pErrorCode) {
3218 return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3219 }
3220
3221 U_CAPI UNormalizationCheckResult U_EXPORT2
3222 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3223 UNormalizationMode mode, int32_t options,
3224 UErrorCode *pErrorCode) {
3225 return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3226 }
3227
3228 U_CFUNC UNormalizationCheckResult
3229 unorm_internalQuickCheck(const UChar *src,
3230 int32_t srcLength,
3231 UNormalizationMode mode,
3232 UBool allowMaybe,
3233 const UnicodeSet *nx,
3234 UErrorCode *pErrorCode) {
3235 return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3236 }
3237
3238 U_CAPI UBool U_EXPORT2
3239 unorm_isNormalized(const UChar *src, int32_t srcLength,
3240 UNormalizationMode mode,
3241 UErrorCode *pErrorCode) {
3242 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3243 }
3244
3245 U_CAPI UBool U_EXPORT2
3246 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3247 UNormalizationMode mode, int32_t options,
3248 UErrorCode *pErrorCode) {
3249 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3250 }
3251
3252 /* normalize() API ---------------------------------------------------------- */
3253
3254 /**
3255 * Internal API for normalizing.
3256 * Does not check for bad input.
3257 * Requires _haveData() to be true.
3258 * @internal
3259 */
3260 U_CFUNC int32_t
3261 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3262 const UChar *src, int32_t srcLength,
3263 UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3264 UErrorCode *pErrorCode) {
3265 int32_t destLength;
3266 uint8_t trailCC;
3267
3268 switch(mode) {
3269 case UNORM_NFD:
3270 destLength=_decompose(dest, destCapacity,
3271 src, srcLength,
3272 FALSE, nx, trailCC);
3273 break;
3274 case UNORM_NFKD:
3275 destLength=_decompose(dest, destCapacity,
3276 src, srcLength,
3277 TRUE, nx, trailCC);
3278 break;
3279 case UNORM_NFC:
3280 destLength=_compose(dest, destCapacity,
3281 src, srcLength,
3282 options, nx, pErrorCode);
3283 break;
3284 case UNORM_NFKC:
3285 destLength=_compose(dest, destCapacity,
3286 src, srcLength,
3287 options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3288 break;
3289 case UNORM_FCD:
3290 if(fcdTrie.index==NULL) {
3291 *pErrorCode=U_UNSUPPORTED_ERROR;
3292 return 0;
3293 }
3294 return unorm_makeFCD(dest, destCapacity,
3295 src, srcLength,
3296 nx,
3297 pErrorCode);
3298 #if 0
3299 case UNORM_FCC:
3300 destLength=_compose(dest, destCapacity,
3301 src, srcLength,
3302 options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3303 break;
3304 #endif
3305 case UNORM_NONE:
3306 /* just copy the string */
3307 if(srcLength==-1) {
3308 srcLength=u_strlen(src);
3309 }
3310 if(srcLength>0 && srcLength<=destCapacity) {
3311 uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3312 }
3313 destLength=srcLength;
3314 break;
3315 default:
3316 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3317 return 0;
3318 }
3319
3320 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3321 }
3322
3323 /**
3324 * Internal API for normalizing.
3325 * Does not check for bad input.
3326 * @internal
3327 */
3328 U_CAPI int32_t U_EXPORT2
3329 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3330 const UChar *src, int32_t srcLength,
3331 UNormalizationMode mode, int32_t options,
3332 UErrorCode *pErrorCode) {
3333 const UnicodeSet *nx;
3334
3335 if(!_haveData(*pErrorCode)) {
3336 return 0;
3337 }
3338
3339 nx=getNX(options, *pErrorCode);
3340 if(U_FAILURE(*pErrorCode)) {
3341 return 0;
3342 }
3343
3344 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3345 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3346
3347 return unorm_internalNormalizeWithNX(dest, destCapacity,
3348 src, srcLength,
3349 mode, options, nx,
3350 pErrorCode);
3351 }
3352
3353 /** Public API for normalizing. */
3354 U_CAPI int32_t U_EXPORT2
3355 unorm_normalize(const UChar *src, int32_t srcLength,
3356 UNormalizationMode mode, int32_t options,
3357 UChar *dest, int32_t destCapacity,
3358 UErrorCode *pErrorCode) {
3359 /* check argument values */
3360 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3361 return 0;
3362 }
3363
3364 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3365 src==NULL || srcLength<-1
3366 ) {
3367 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3368 return 0;
3369 }
3370
3371 /* check for overlapping src and destination */
3372 if( dest!=NULL &&
3373 ((src>=dest && src<(dest+destCapacity)) ||
3374 (srcLength>0 && dest>=src && dest<(src+srcLength)))
3375 ) {
3376 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3377 return 0;
3378 }
3379
3380 return unorm_internalNormalize(dest, destCapacity,
3381 src, srcLength,
3382 mode, options,
3383 pErrorCode);
3384 }
3385
3386
3387 /* iteration functions ------------------------------------------------------ */
3388
3389 /*
3390 * These iteration functions are the core implementations of the
3391 * Normalizer class iteration API.
3392 * They read from a UCharIterator into their own buffer
3393 * and normalize into the Normalizer iteration buffer.
3394 * Normalizer itself then iterates over its buffer until that needs to be
3395 * filled again.
3396 */
3397
3398 /*
3399 * ### TODO:
3400 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3401 * if iteration bounds are reached,
3402 * try to not call hasNext/hasPrevious and instead check for >=0.
3403 */
3404
3405 /* backward iteration ------------------------------------------------------- */
3406
3407 /*
3408 * read backwards and get norm32
3409 * return 0 if the character is <minC
3410 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3411 */
3412 static inline uint32_t
3413 _getPrevNorm32(UCharIterator &src, uint32_t minC, UChar &c, UChar &c2) {
3414 /* need src.hasPrevious() */
3415 c=(UChar)src.previous(&src);
3416 c2=0;
3417
3418 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3419 if(c<minC) {
3420 return 0;
3421 } else if(!UTF_IS_SURROGATE(c)) {
3422 return _getNorm32(c);
3423 } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3424 /* unpaired surrogate */
3425 return 0;
3426 } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3427 return _getNorm32FromSurrogatePair(c2, c);
3428 } else {
3429 /* unpaired second surrogate, undo the c2=src.previous() movement */
3430 src.move(&src, 1, UITER_CURRENT);
3431 c2=0;
3432 return 0;
3433 }
3434 }
3435
3436 /*
3437 * read backwards and check if the character is a previous-iteration boundary
3438 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3439 */
3440 typedef UBool
3441 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3442
3443 /*
3444 * for NF*D:
3445 * read backwards and check if the lead combining class is 0
3446 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3447 */
3448 static UBool
3449 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3450 return _isNFDSafe(_getPrevNorm32(src, minC, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3451 }
3452
3453 /*
3454 * read backwards and check if the character is (or its decomposition begins with)
3455 * a "true starter" (cc==0 and NF*C_YES)
3456 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3457 */
3458 static UBool
3459 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3460 uint32_t norm32, decompQCMask;
3461
3462 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3463 norm32=_getPrevNorm32(src, minC, c, c2);
3464 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3465 }
3466
3467 static int32_t
3468 _findPreviousIterationBoundary(UCharIterator &src,
3469 IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3470 UChar *&buffer, int32_t &bufferCapacity,
3471 int32_t &startIndex,
3472 UErrorCode *pErrorCode) {
3473 UChar *stackBuffer;
3474 UChar c, c2;
3475 UBool isBoundary;
3476
3477 /* initialize */
3478 stackBuffer=buffer;
3479 startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3480
3481 while(src.hasPrevious(&src)) {
3482 isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3483
3484 /* always write this character to the front of the buffer */
3485 /* make sure there is enough space in the buffer */
3486 if(startIndex < (c2==0 ? 1 : 2)) {
3487 int32_t bufferLength=bufferCapacity;
3488
3489 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3490 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3491 src.move(&src, 0, UITER_START);
3492 return 0;
3493 }
3494
3495 /* move the current buffer contents up */
3496 uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3497 startIndex+=bufferCapacity-bufferLength;
3498 }
3499
3500 buffer[--startIndex]=c;
3501 if(c2!=0) {
3502 buffer[--startIndex]=c2;
3503 }
3504
3505 /* stop if this just-copied character is a boundary */
3506 if(isBoundary) {
3507 break;
3508 }
3509 }
3510
3511 /* return the length of the buffer contents */
3512 return bufferCapacity-startIndex;
3513 }
3514
3515 U_CAPI int32_t U_EXPORT2
3516 unorm_previous(UCharIterator *src,
3517 UChar *dest, int32_t destCapacity,
3518 UNormalizationMode mode, int32_t options,
3519 UBool doNormalize, UBool *pNeededToNormalize,
3520 UErrorCode *pErrorCode) {
3521 UChar stackBuffer[100];
3522 UChar *buffer=NULL;
3523 IsPrevBoundaryFn *isPreviousBoundary=NULL;
3524 uint32_t mask=0;
3525 int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3526 int32_t c=0, c2=0;
3527 UChar minC=0;
3528
3529 /* check argument values */
3530 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3531 return 0;
3532 }
3533
3534 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3535 src==NULL
3536 ) {
3537 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3538 return 0;
3539 }
3540
3541 if(!_haveData(*pErrorCode)) {
3542 return 0;
3543 }
3544
3545 if(pNeededToNormalize!=NULL) {
3546 *pNeededToNormalize=FALSE;
3547 }
3548
3549 switch(mode) {
3550 case UNORM_FCD:
3551 if(fcdTrie.index==NULL) {
3552 *pErrorCode=U_UNSUPPORTED_ERROR;
3553 return 0;
3554 }
3555 /* fall through to NFD */
3556 case UNORM_NFD:
3557 isPreviousBoundary=_isPrevNFDSafe;
3558 minC=_NORM_MIN_WITH_LEAD_CC;
3559 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3560 break;
3561 case UNORM_NFKD:
3562 isPreviousBoundary=_isPrevNFDSafe;
3563 minC=_NORM_MIN_WITH_LEAD_CC;
3564 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3565 break;
3566 case UNORM_NFC:
3567 isPreviousBoundary=_isPrevTrueStarter;
3568 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3569 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3570 break;
3571 case UNORM_NFKC:
3572 isPreviousBoundary=_isPrevTrueStarter;
3573 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3574 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3575 break;
3576 case UNORM_NONE:
3577 destLength=0;
3578 if((c=src->previous(src))>=0) {
3579 destLength=1;
3580 if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3581 if(UTF_IS_LEAD(c2)) {
3582 if(destCapacity>=2) {
3583 dest[1]=(UChar)c; /* trail surrogate */
3584 destLength=2;
3585 }
3586 c=c2; /* lead surrogate to be written below */
3587 } else {
3588 src->move(src, 1, UITER_CURRENT);
3589 }
3590 }
3591
3592 if(destCapacity>0) {
3593 dest[0]=(UChar)c;
3594 }
3595 }
3596 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3597 default:
3598 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3599 return 0;
3600 }
3601
3602 buffer=stackBuffer;
3603 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3604 bufferLength=_findPreviousIterationBoundary(*src,
3605 isPreviousBoundary, minC, mask,
3606 buffer, bufferCapacity,
3607 startIndex,
3608 pErrorCode);
3609 if(bufferLength>0) {
3610 if(doNormalize) {
3611 destLength=unorm_internalNormalize(dest, destCapacity,
3612 buffer+startIndex, bufferLength,
3613 mode, options,
3614 pErrorCode);
3615 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3616 *pNeededToNormalize=
3617 (UBool)(destLength!=bufferLength ||
3618 0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3619 }
3620 } else {
3621 /* just copy the source characters */
3622 if(destCapacity>0) {
3623 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3624 }
3625 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3626 }
3627 } else {
3628 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3629 }
3630
3631 /* cleanup */
3632 if(buffer!=stackBuffer) {
3633 uprv_free(buffer);
3634 }
3635
3636 return destLength;
3637 }
3638
3639 /* forward iteration -------------------------------------------------------- */
3640
3641 /*
3642 * read forward and get norm32
3643 * return 0 if the character is <minC
3644 * if c2!=0 then (c2, c) is a surrogate pair
3645 * always reads complete characters
3646 */
3647 static inline uint32_t
3648 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3649 uint32_t norm32;
3650
3651 /* need src.hasNext() to be true */
3652 c=(UChar)src.next(&src);
3653 c2=0;
3654
3655 if(c<minC) {
3656 return 0;
3657 }
3658
3659 norm32=_getNorm32(c);
3660 if(UTF_IS_FIRST_SURROGATE(c)) {
3661 if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3662 src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3663 if((norm32&mask)==0) {
3664 /* irrelevant data */
3665 return 0;
3666 } else {
3667 /* norm32 must be a surrogate special */
3668 return _getNorm32FromSurrogatePair(c, c2);
3669 }
3670 } else {
3671 /* unmatched surrogate */
3672 c2=0;
3673 return 0;
3674 }
3675 }
3676 return norm32;
3677 }
3678
3679 /*
3680 * read forward and check if the character is a next-iteration boundary
3681 * if c2!=0 then (c, c2) is a surrogate pair
3682 */
3683 typedef UBool
3684 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3685
3686 /*
3687 * for NF*D:
3688 * read forward and check if the lead combining class is 0
3689 * if c2!=0 then (c, c2) is a surrogate pair
3690 */
3691 static UBool
3692 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3693 return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3694 }
3695
3696 /*
3697 * for NF*C:
3698 * read forward and check if the character is (or its decomposition begins with)
3699 * a "true starter" (cc==0 and NF*C_YES)
3700 * if c2!=0 then (c, c2) is a surrogate pair
3701 */
3702 static UBool
3703 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3704 uint32_t norm32, decompQCMask;
3705
3706 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3707 norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3708 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3709 }
3710
3711 static int32_t
3712 _findNextIterationBoundary(UCharIterator &src,
3713 IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3714 UChar *&buffer, int32_t &bufferCapacity,
3715 UErrorCode *pErrorCode) {
3716 UChar *stackBuffer;
3717 int32_t bufferIndex;
3718 UChar c, c2;
3719
3720 if(!src.hasNext(&src)) {
3721 return 0;
3722 }
3723
3724 /* initialize */
3725 stackBuffer=buffer;
3726
3727 /* get one character and ignore its properties */
3728 buffer[0]=c=(UChar)src.next(&src);
3729 bufferIndex=1;
3730 if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3731 if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3732 buffer[bufferIndex++]=c2;
3733 } else {
3734 src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3735 }
3736 }
3737
3738 /* get all following characters until we see a boundary */
3739 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3740 while(src.hasNext(&src)) {
3741 if(isNextBoundary(src, minC, mask, c, c2)) {
3742 /* back out the latest movement to stop at the boundary */
3743 src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3744 break;
3745 } else {
3746 if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3747 /* attempt to grow the buffer */
3748 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3749 2*bufferCapacity,
3750 bufferIndex)
3751 ) {
3752 buffer[bufferIndex++]=c;
3753 if(c2!=0) {
3754 buffer[bufferIndex++]=c2;
3755 }
3756 } else {
3757 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3758 src.move(&src, 0, UITER_LIMIT);
3759 return 0;
3760 }
3761 }
3762 }
3763
3764 /* return the length of the buffer contents */
3765 return bufferIndex;
3766 }
3767
3768 U_CAPI int32_t U_EXPORT2
3769 unorm_next(UCharIterator *src,
3770 UChar *dest, int32_t destCapacity,
3771 UNormalizationMode mode, int32_t options,
3772 UBool doNormalize, UBool *pNeededToNormalize,
3773 UErrorCode *pErrorCode) {
3774 UChar stackBuffer[100];
3775 UChar *buffer;
3776 IsNextBoundaryFn *isNextBoundary;
3777 uint32_t mask;
3778 int32_t bufferLength, bufferCapacity, destLength;
3779 int32_t c, c2;
3780 UChar minC;
3781
3782 /* check argument values */
3783 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3784 return 0;
3785 }
3786
3787 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3788 src==NULL
3789 ) {
3790 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3791 return 0;
3792 }
3793
3794 if(!_haveData(*pErrorCode)) {
3795 return 0;
3796 }
3797
3798 if(pNeededToNormalize!=NULL) {
3799 *pNeededToNormalize=FALSE;
3800 }
3801
3802 switch(mode) {
3803 case UNORM_FCD:
3804 if(fcdTrie.index==NULL) {
3805 *pErrorCode=U_UNSUPPORTED_ERROR;
3806 return 0;
3807 }
3808 /* fall through to NFD */
3809 case UNORM_NFD:
3810 isNextBoundary=_isNextNFDSafe;
3811 minC=_NORM_MIN_WITH_LEAD_CC;
3812 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3813 break;
3814 case UNORM_NFKD:
3815 isNextBoundary=_isNextNFDSafe;
3816 minC=_NORM_MIN_WITH_LEAD_CC;
3817 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3818 break;
3819 case UNORM_NFC:
3820 isNextBoundary=_isNextTrueStarter;
3821 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3822 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3823 break;
3824 case UNORM_NFKC:
3825 isNextBoundary=_isNextTrueStarter;
3826 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3827 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3828 break;
3829 case UNORM_NONE:
3830 destLength=0;
3831 if((c=src->next(src))>=0) {
3832 destLength=1;
3833 if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3834 if(UTF_IS_TRAIL(c2)) {
3835 if(destCapacity>=2) {
3836 dest[1]=(UChar)c2; /* trail surrogate */
3837 destLength=2;
3838 }
3839 /* lead surrogate to be written below */
3840 } else {
3841 src->move(src, -1, UITER_CURRENT);
3842 }
3843 }
3844
3845 if(destCapacity>0) {
3846 dest[0]=(UChar)c;
3847 }
3848 }
3849 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3850 default:
3851 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3852 return 0;
3853 }
3854
3855 buffer=stackBuffer;
3856 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3857 bufferLength=_findNextIterationBoundary(*src,
3858 isNextBoundary, minC, mask,
3859 buffer, bufferCapacity,
3860 pErrorCode);
3861 if(bufferLength>0) {
3862 if(doNormalize) {
3863 destLength=unorm_internalNormalize(dest, destCapacity,
3864 buffer, bufferLength,
3865 mode, options,
3866 pErrorCode);
3867 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3868 *pNeededToNormalize=
3869 (UBool)(destLength!=bufferLength ||
3870 0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3871 }
3872 } else {
3873 /* just copy the source characters */
3874 if(destCapacity>0) {
3875 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3876 }
3877 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3878 }
3879 } else {
3880 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3881 }
3882
3883 /* cleanup */
3884 if(buffer!=stackBuffer) {
3885 uprv_free(buffer);
3886 }
3887
3888 return destLength;
3889 }
3890
3891 /*
3892 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3893 * and if not, how hard it would be to improve it.
3894 * For example, see _findSafeFCD().
3895 */
3896
3897 /* Concatenation of normalized strings -------------------------------------- */
3898
3899 U_CAPI int32_t U_EXPORT2
3900 unorm_concatenate(const UChar *left, int32_t leftLength,
3901 const UChar *right, int32_t rightLength,
3902 UChar *dest, int32_t destCapacity,
3903 UNormalizationMode mode, int32_t options,
3904 UErrorCode *pErrorCode) {
3905 UChar stackBuffer[100];
3906 UChar *buffer;
3907 int32_t bufferLength, bufferCapacity;
3908
3909 UCharIterator iter;
3910 int32_t leftBoundary, rightBoundary, destLength;
3911
3912 /* check argument values */
3913 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3914 return 0;
3915 }
3916
3917 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3918 left==NULL || leftLength<-1 ||
3919 right==NULL || rightLength<-1
3920 ) {
3921 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3922 return 0;
3923 }
3924
3925 /* check for overlapping right and destination */
3926 if( dest!=NULL &&
3927 ((right>=dest && right<(dest+destCapacity)) ||
3928 (rightLength>0 && dest>=right && dest<(right+rightLength)))
3929 ) {
3930 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3931 return 0;
3932 }
3933
3934 /* allow left==dest */
3935
3936 /* set up intermediate buffer */
3937 buffer=stackBuffer;
3938 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3939
3940 /*
3941 * Input: left[0..leftLength[ + right[0..rightLength[
3942 *
3943 * Find normalization-safe boundaries leftBoundary and rightBoundary
3944 * and copy the end parts together:
3945 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3946 *
3947 * dest=left[0..leftBoundary[ +
3948 * normalize(buffer) +
3949 * right[rightBoundary..rightLength[
3950 */
3951
3952 /*
3953 * find a normalization boundary at the end of the left string
3954 * and copy the end part into the buffer
3955 */
3956 uiter_setString(&iter, left, leftLength);
3957 iter.index=leftLength=iter.length; /* end of left string */
3958
3959 bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
3960 mode, options,
3961 FALSE, NULL,
3962 pErrorCode);
3963 leftBoundary=iter.index;
3964 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3965 *pErrorCode=U_ZERO_ERROR;
3966 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
3967 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3968 /* dont need to cleanup here since
3969 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3970 */
3971 return 0;
3972 }
3973
3974 /* just copy from the left string: we know the boundary already */
3975 uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
3976 }
3977
3978 /*
3979 * find a normalization boundary at the beginning of the right string
3980 * and concatenate the beginning part to the buffer
3981 */
3982 uiter_setString(&iter, right, rightLength);
3983 rightLength=iter.length; /* in case it was -1 */
3984
3985 rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
3986 mode, options,
3987 FALSE, NULL,
3988 pErrorCode);
3989 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3990 *pErrorCode=U_ZERO_ERROR;
3991 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
3992 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3993 /* dont need to cleanup here since
3994 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3995 */
3996 return 0;
3997 }
3998
3999 /* just copy from the right string: we know the boundary already */
4000 uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4001 }
4002
4003 bufferLength+=rightBoundary;
4004
4005 /* copy left[0..leftBoundary[ to dest */
4006 if(left!=dest && leftBoundary>0 && destCapacity>0) {
4007 uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4008 }
4009 destLength=leftBoundary;
4010
4011 /* concatenate the normalization of the buffer to dest */
4012 if(destCapacity>destLength) {
4013 destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4014 buffer, bufferLength,
4015 mode, options,
4016 pErrorCode);
4017 } else {
4018 destLength+=unorm_internalNormalize(NULL, 0,
4019 buffer, bufferLength,
4020 mode, options,
4021 pErrorCode);
4022 }
4023 /*
4024 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4025 * so we dont check for the error code here..just let it pass through
4026 */
4027 /* concatenate right[rightBoundary..rightLength[ to dest */
4028 right+=rightBoundary;
4029 rightLength-=rightBoundary;
4030 if(rightLength>0 && destCapacity>destLength) {
4031 uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4032 }
4033 destLength+=rightLength;
4034
4035 /* cleanup */
4036 if(buffer!=stackBuffer) {
4037 uprv_free(buffer);
4038 }
4039
4040 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4041 }
4042
4043 #endif /* #if !UCONFIG_NO_NORMALIZATION */
4044