/*
*******************************************************************************
*
*   Copyright (C) 2009-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  normalizer2impl.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "mutex.h"
26 #include "normalizer2impl.h"
27 #include "uassert.h"
28 #include "uhash.h"
29 #include "uset_imp.h"
30 #include "utrie2.h"
31 #include "uvector.h"
32 
33 U_NAMESPACE_BEGIN
34 
35 // ReorderingBuffer -------------------------------------------------------- ***
36 
init(int32_t destCapacity,UErrorCode & errorCode)37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
38     int32_t length=str.length();
39     start=str.getBuffer(destCapacity);
40     if(start==NULL) {
41         // getBuffer() already did str.setToBogus()
42         errorCode=U_MEMORY_ALLOCATION_ERROR;
43         return FALSE;
44     }
45     limit=start+length;
46     remainingCapacity=str.getCapacity()-length;
47     reorderStart=start;
48     if(start==limit) {
49         lastCC=0;
50     } else {
51         setIterator();
52         lastCC=previousCC();
53         // Set reorderStart after the last code point with cc<=1 if there is one.
54         if(lastCC>1) {
55             while(previousCC()>1) {}
56         }
57         reorderStart=codePointLimit;
58     }
59     return TRUE;
60 }
61 
equals(const UChar * otherStart,const UChar * otherLimit) const62 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
63     int32_t length=(int32_t)(limit-start);
64     return
65         length==(int32_t)(otherLimit-otherStart) &&
66         0==u_memcmp(start, otherStart, length);
67 }
68 
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
70     if(remainingCapacity<2 && !resize(2, errorCode)) {
71         return FALSE;
72     }
73     if(lastCC<=cc || cc==0) {
74         limit[0]=U16_LEAD(c);
75         limit[1]=U16_TRAIL(c);
76         limit+=2;
77         lastCC=cc;
78         if(cc<=1) {
79             reorderStart=limit;
80         }
81     } else {
82         insert(c, cc);
83     }
84     remainingCapacity-=2;
85     return TRUE;
86 }
87 
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)88 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
89                                uint8_t leadCC, uint8_t trailCC,
90                                UErrorCode &errorCode) {
91     if(length==0) {
92         return TRUE;
93     }
94     if(remainingCapacity<length && !resize(length, errorCode)) {
95         return FALSE;
96     }
97     remainingCapacity-=length;
98     if(lastCC<=leadCC || leadCC==0) {
99         if(trailCC<=1) {
100             reorderStart=limit+length;
101         } else if(leadCC<=1) {
102             reorderStart=limit+1;  // Ok if not a code point boundary.
103         }
104         const UChar *sLimit=s+length;
105         do { *limit++=*s++; } while(s!=sLimit);
106         lastCC=trailCC;
107     } else {
108         int32_t i=0;
109         UChar32 c;
110         U16_NEXT(s, i, length, c);
111         insert(c, leadCC);  // insert first code point
112         while(i<length) {
113             U16_NEXT(s, i, length, c);
114             if(i<length) {
115                 // s must be in NFD, otherwise we need to use getCC().
116                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
117             } else {
118                 leadCC=trailCC;
119             }
120             append(c, leadCC, errorCode);
121         }
122     }
123     return TRUE;
124 }
125 
appendZeroCC(UChar32 c,UErrorCode & errorCode)126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
127     int32_t cpLength=U16_LENGTH(c);
128     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
129         return FALSE;
130     }
131     remainingCapacity-=cpLength;
132     if(cpLength==1) {
133         *limit++=(UChar)c;
134     } else {
135         limit[0]=U16_LEAD(c);
136         limit[1]=U16_TRAIL(c);
137         limit+=2;
138     }
139     lastCC=0;
140     reorderStart=limit;
141     return TRUE;
142 }
143 
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)144 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
145     if(s==sLimit) {
146         return TRUE;
147     }
148     int32_t length=(int32_t)(sLimit-s);
149     if(remainingCapacity<length && !resize(length, errorCode)) {
150         return FALSE;
151     }
152     u_memcpy(limit, s, length);
153     limit+=length;
154     remainingCapacity-=length;
155     lastCC=0;
156     reorderStart=limit;
157     return TRUE;
158 }
159 
remove()160 void ReorderingBuffer::remove() {
161     reorderStart=limit=start;
162     remainingCapacity=str.getCapacity();
163     lastCC=0;
164 }
165 
removeSuffix(int32_t suffixLength)166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
167     if(suffixLength<(limit-start)) {
168         limit-=suffixLength;
169         remainingCapacity+=suffixLength;
170     } else {
171         limit=start;
172         remainingCapacity=str.getCapacity();
173     }
174     lastCC=0;
175     reorderStart=limit;
176 }
177 
resize(int32_t appendLength,UErrorCode & errorCode)178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
179     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
180     int32_t length=(int32_t)(limit-start);
181     str.releaseBuffer(length);
182     int32_t newCapacity=length+appendLength;
183     int32_t doubleCapacity=2*str.getCapacity();
184     if(newCapacity<doubleCapacity) {
185         newCapacity=doubleCapacity;
186     }
187     if(newCapacity<256) {
188         newCapacity=256;
189     }
190     start=str.getBuffer(newCapacity);
191     if(start==NULL) {
192         // getBuffer() already did str.setToBogus()
193         errorCode=U_MEMORY_ALLOCATION_ERROR;
194         return FALSE;
195     }
196     reorderStart=start+reorderStartIndex;
197     limit=start+length;
198     remainingCapacity=str.getCapacity()-length;
199     return TRUE;
200 }
201 
skipPrevious()202 void ReorderingBuffer::skipPrevious() {
203     codePointLimit=codePointStart;
204     UChar c=*--codePointStart;
205     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
206         --codePointStart;
207     }
208 }
209 
previousCC()210 uint8_t ReorderingBuffer::previousCC() {
211     codePointLimit=codePointStart;
212     if(reorderStart>=codePointStart) {
213         return 0;
214     }
215     UChar32 c=*--codePointStart;
216     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
217         return 0;
218     }
219 
220     UChar c2;
221     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
222         --codePointStart;
223         c=U16_GET_SUPPLEMENTARY(c2, c);
224     }
225     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
226 }
227 
228 // Inserts c somewhere before the last character.
229 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
231     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
232     // insert c at codePointLimit, after the character with prevCC<=cc
233     UChar *q=limit;
234     UChar *r=limit+=U16_LENGTH(c);
235     do {
236         *--r=*--q;
237     } while(codePointLimit!=q);
238     writeCodePoint(q, c);
239     if(cc<=1) {
240         reorderStart=r;
241     }
242 }
243 
244 // Normalizer2Impl --------------------------------------------------------- ***
245 
// Data used for canonical iteration.
// In this file, an instance is reached through
// Normalizer2Impl::canonIterDataSingleton.fInstance: it is deleted in
// ~Normalizer2Impl(), and its trie is enumerated in
// addCanonIterPropertyStarts().
// The constructor, destructor and addToStartSet() are only declared here.
246 struct CanonIterData : public UMemory {
247     CanonIterData(UErrorCode &errorCode);
248     ~CanonIterData();
249     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    // Trie whose values are masked with CANON_NOT_SEGMENT_STARTER when
    // enumerating property starts.
250     UTrie2 *trie;
251     UVector canonStartSets;  // contains UnicodeSet *
252 };
253 
~Normalizer2Impl()254 Normalizer2Impl::~Normalizer2Impl() {
255     udata_close(memory);
256     utrie2_close(normTrie);
257     UTrie2Singleton(fcdTrieSingleton).deleteInstance();
258     delete (CanonIterData *)canonIterDataSingleton.fInstance;
259 }
260 
261 UBool U_CALLCONV
isAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)262 Normalizer2Impl::isAcceptable(void *context,
263                               const char * /* type */, const char * /*name*/,
264                               const UDataInfo *pInfo) {
265     if(
266         pInfo->size>=20 &&
267         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268         pInfo->charsetFamily==U_CHARSET_FAMILY &&
269         pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
270         pInfo->dataFormat[1]==0x72 &&
271         pInfo->dataFormat[2]==0x6d &&
272         pInfo->dataFormat[3]==0x32 &&
273         pInfo->formatVersion[0]==1
274     ) {
275         Normalizer2Impl *me=(Normalizer2Impl *)context;
276         uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277         return TRUE;
278     } else {
279         return FALSE;
280     }
281 }
282 
283 void
load(const char * packageName,const char * name,UErrorCode & errorCode)284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285     if(U_FAILURE(errorCode)) {
286         return;
287     }
288     memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289     if(U_FAILURE(errorCode)) {
290         return;
291     }
292     const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293     const int32_t *inIndexes=(const int32_t *)inBytes;
294     int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295     if(indexesLength<=IX_MIN_MAYBE_YES) {
296         errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
297         return;
298     }
299 
300     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302 
303     minYesNo=inIndexes[IX_MIN_YES_NO];
304     minNoNo=inIndexes[IX_MIN_NO_NO];
305     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
306     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
307 
308     int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
309     int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
310     normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
311                                        inBytes+offset, nextOffset-offset, NULL,
312                                        &errorCode);
313     if(U_FAILURE(errorCode)) {
314         return;
315     }
316 
317     offset=nextOffset;
318     maybeYesCompositions=(const uint16_t *)(inBytes+offset);
319     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
320 }
321 
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
323     UChar32 c;
324     if(cpStart==(cpLimit-1)) {
325         c=*cpStart;
326     } else {
327         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
328     }
329     uint16_t prevNorm16=getNorm16(c);
330     if(prevNorm16<=minYesNo) {
331         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
332     } else {
333         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
334     }
335 }
336 
337 U_CDECL_BEGIN
338 
339 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)340 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
341     /* add the start code point to the USet */
342     const USetAdder *sa=(const USetAdder *)context;
343     sa->add(sa->set, start);
344     return TRUE;
345 }
346 
347 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)348 segmentStarterMapper(const void * /*context*/, uint32_t value) {
349     return value&CANON_NOT_SEGMENT_STARTER;
350 }
351 
352 U_CDECL_END
353 
354 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const355 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
356     /* add the start code point of each same-value range of each trie */
357     utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
358 
359     /* add Hangul LV syllables and LV+1 because of skippables */
360     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
361         sa->add(sa->set, c);
362         sa->add(sa->set, c+1);
363     }
364     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
365 }
366 
367 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
369     /* add the start code point of each same-value range of the canonical iterator data trie */
370     if(ensureCanonIterData(errorCode)) {
371         // currently only used for the SEGMENT_STARTER property
372         utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
373                     segmentStarterMapper, enumPropertyStartsRange, sa);
374     }
375 }
376 
377 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
379                                                 UChar32 minNeedDataCP,
380                                                 ReorderingBuffer *buffer,
381                                                 UErrorCode &errorCode) const {
382     // Make some effort to support NUL-terminated strings reasonably.
383     // Take the part of the fast quick check loop that does not look up
384     // data and check the first part of the string.
385     // After this prefix, determine the string length to simplify the rest
386     // of the code.
387     const UChar *prevSrc=src;
388     UChar c;
389     while((c=*src++)<minNeedDataCP && c!=0) {}
390     // Back out the last character for full processing.
391     // Copy this prefix.
392     if(--src!=prevSrc) {
393         if(buffer!=NULL) {
394             buffer->appendZeroCC(prevSrc, src, errorCode);
395         }
396     }
397     return src;
398 }
399 
400 // Dual functionality:
401 // buffer!=NULL: normalize
402 // buffer==NULL: isNormalized/spanQuickCheckYes
403 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const404 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
405                            ReorderingBuffer *buffer,
406                            UErrorCode &errorCode) const {
407     UChar32 minNoCP=minDecompNoCP;
408     if(limit==NULL) {
409         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
410         if(U_FAILURE(errorCode)) {
411             return src;
412         }
413         limit=u_strchr(src, 0);
414     }
415 
416     const UChar *prevSrc;
417     UChar32 c=0;
418     uint16_t norm16=0;
419 
420     // only for quick check
421     const UChar *prevBoundary=src;
422     uint8_t prevCC=0;
423 
424     for(;;) {
425         // count code units below the minimum or with irrelevant data for the quick check
426         for(prevSrc=src; src!=limit;) {
427             if( (c=*src)<minNoCP ||
428                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
429             ) {
430                 ++src;
431             } else if(!U16_IS_SURROGATE(c)) {
432                 break;
433             } else {
434                 UChar c2;
435                 if(U16_IS_SURROGATE_LEAD(c)) {
436                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
437                         c=U16_GET_SUPPLEMENTARY(c, c2);
438                     }
439                 } else /* trail surrogate */ {
440                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
441                         --src;
442                         c=U16_GET_SUPPLEMENTARY(c2, c);
443                     }
444                 }
445                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
446                     src+=U16_LENGTH(c);
447                 } else {
448                     break;
449                 }
450             }
451         }
452         // copy these code units all at once
453         if(src!=prevSrc) {
454             if(buffer!=NULL) {
455                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
456                     break;
457                 }
458             } else {
459                 prevCC=0;
460                 prevBoundary=src;
461             }
462         }
463         if(src==limit) {
464             break;
465         }
466 
467         // Check one above-minimum, relevant code point.
468         src+=U16_LENGTH(c);
469         if(buffer!=NULL) {
470             if(!decompose(c, norm16, *buffer, errorCode)) {
471                 break;
472             }
473         } else {
474             if(isDecompYes(norm16)) {
475                 uint8_t cc=getCCFromYesOrMaybe(norm16);
476                 if(prevCC<=cc || cc==0) {
477                     prevCC=cc;
478                     if(cc<=1) {
479                         prevBoundary=src;
480                     }
481                     continue;
482                 }
483             }
484             return prevBoundary;  // "no" or cc out of order
485         }
486     }
487     return src;
488 }
489 
490 // Decompose a short piece of text which is likely to contain characters that
491 // fail the quick check loop and/or where the quick check loop's overhead
492 // is unlikely to be amortized.
493 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const494 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
495                                       ReorderingBuffer &buffer,
496                                       UErrorCode &errorCode) const {
497     while(src<limit) {
498         UChar32 c;
499         uint16_t norm16;
500         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
501         if(!decompose(c, norm16, buffer, errorCode)) {
502             return FALSE;
503         }
504     }
505     return TRUE;
506 }
507 
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
509                                  ReorderingBuffer &buffer,
510                                  UErrorCode &errorCode) const {
511     // Only loops for 1:1 algorithmic mappings.
512     for(;;) {
513         // get the decomposition and the lead and trail cc's
514         if(isDecompYes(norm16)) {
515             // c does not decompose
516             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
517         } else if(isHangul(norm16)) {
518             // Hangul syllable: decompose algorithmically
519             UChar jamos[3];
520             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
521         } else if(isDecompNoAlgorithmic(norm16)) {
522             c=mapAlgorithmic(c, norm16);
523             norm16=getNorm16(c);
524         } else {
525             // c decomposes, get everything from the variable-length extra data
526             const uint16_t *mapping=getMapping(norm16);
527             uint16_t firstUnit=*mapping++;
528             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
529             uint8_t leadCC, trailCC;
530             trailCC=(uint8_t)(firstUnit>>8);
531             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
532                 leadCC=(uint8_t)(*mapping++>>8);
533             } else {
534                 leadCC=0;
535             }
536             return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
537         }
538     }
539 }
540 
541 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
543     const UChar *decomp=NULL;
544     uint16_t norm16;
545     for(;;) {
546         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
547             // c does not decompose
548             return decomp;
549         } else if(isHangul(norm16)) {
550             // Hangul syllable: decompose algorithmically
551             length=Hangul::decompose(c, buffer);
552             return buffer;
553         } else if(isDecompNoAlgorithmic(norm16)) {
554             c=mapAlgorithmic(c, norm16);
555             decomp=buffer;
556             length=0;
557             U16_APPEND_UNSAFE(buffer, length, c);
558         } else {
559             // c decomposes, get everything from the variable-length extra data
560             const uint16_t *mapping=getMapping(norm16);
561             uint16_t firstUnit=*mapping++;
562             length=firstUnit&MAPPING_LENGTH_MASK;
563             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
564                 ++mapping;
565             }
566             return (const UChar *)mapping;
567         }
568     }
569 }
570 
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const571 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
572                                          UBool doDecompose,
573                                          UnicodeString &safeMiddle,
574                                          ReorderingBuffer &buffer,
575                                          UErrorCode &errorCode) const {
576     buffer.copyReorderableSuffixTo(safeMiddle);
577     if(doDecompose) {
578         decompose(src, limit, &buffer, errorCode);
579         return;
580     }
581     // Just merge the strings at the boundary.
582     ForwardUTrie2StringIterator iter(normTrie, src, limit);
583     uint8_t firstCC, prevCC, cc;
584     firstCC=prevCC=cc=getCC(iter.next16());
585     while(cc!=0) {
586         prevCC=cc;
587         cc=getCC(iter.next16());
588     };
589     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
590         limit=u_strchr(iter.codePointStart, 0);
591     }
592     buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
593         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
594 }
595 
596 // Note: hasDecompBoundary() could be implemented as aliases to
597 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
598 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const599 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
600     for(;;) {
601         if(c<minDecompNoCP) {
602             return TRUE;
603         }
604         uint16_t norm16=getNorm16(c);
605         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
606             return TRUE;
607         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
608             return FALSE;  // ccc!=0
609         } else if(isDecompNoAlgorithmic(norm16)) {
610             c=mapAlgorithmic(c, norm16);
611         } else {
612             // c decomposes, get everything from the variable-length extra data
613             const uint16_t *mapping=getMapping(norm16);
614             uint16_t firstUnit=*mapping++;
615             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
616                 return FALSE;
617             }
618             if(!before) {
619                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
620                 // fcd16<=1 || trailCC==0
621                 if(firstUnit>0x1ff) {
622                     return FALSE;  // trailCC>1
623                 }
624                 if(firstUnit<=0xff) {
625                     return TRUE;  // trailCC==0
626                 }
627                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
628             }
629             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
630             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
631         }
632     }
633 }
634 
635 /*
636  * Finds the recomposition result for
637  * a forward-combining "lead" character,
638  * specified with a pointer to its compositions list,
639  * and a backward-combining "trail" character.
640  *
641  * If the lead and trail characters combine, then this function returns
642  * the following "compositeAndFwd" value:
643  * Bits 21..1  composite character
644  * Bit      0  set if the composite is a forward-combining starter
645  * otherwise it returns -1.
646  *
647  * The compositions list has (trail, compositeAndFwd) pair entries,
648  * encoded as either pairs or triples of 16-bit units.
649  * The last entry has the high bit of its first unit set.
650  *
651  * The list is sorted by ascending trail characters (there are no duplicates).
652  * A linear search is used.
653  *
654  * See normalizer2impl.h for a more detailed description
655  * of the compositions list format.
656  */
combine(const uint16_t * list,UChar32 trail)657 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
658     uint16_t key1, firstUnit;
659     if(trail<COMP_1_TRAIL_LIMIT) {
660         // trail character is 0..33FF
661         // result entry may have 2 or 3 units
662         key1=(uint16_t)(trail<<1);
663         while(key1>(firstUnit=*list)) {
664             list+=2+(firstUnit&COMP_1_TRIPLE);
665         }
666         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
667             if(firstUnit&COMP_1_TRIPLE) {
668                 return ((int32_t)list[1]<<16)|list[2];
669             } else {
670                 return list[1];
671             }
672         }
673     } else {
674         // trail character is 3400..10FFFF
675         // result entry has 3 units
676         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
677                         (((trail>>COMP_1_TRAIL_SHIFT))&
678                           ~COMP_1_TRIPLE));
679         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
680         uint16_t secondUnit;
681         for(;;) {
682             if(key1>(firstUnit=*list)) {
683                 list+=2+(firstUnit&COMP_1_TRIPLE);
684             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
685                 if(key2>(secondUnit=list[1])) {
686                     if(firstUnit&COMP_1_LAST_TUPLE) {
687                         break;
688                     } else {
689                         list+=3;
690                     }
691                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
692                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
693                 } else {
694                     break;
695                 }
696             } else {
697                 break;
698             }
699         }
700     }
701     return -1;
702 }
703 
704 /**
705   * @param list some character's compositions list
706   * @param set recursively receives the composites from these compositions
707   */
addComposites(const uint16_t * list,UnicodeSet & set) const708 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
709     uint16_t firstUnit;
710     int32_t compositeAndFwd;
711     do {
712         firstUnit=*list;
713         if((firstUnit&COMP_1_TRIPLE)==0) {
714             compositeAndFwd=list[1];
715             list+=2;
716         } else {
717             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
718             list+=3;
719         }
720         UChar32 composite=compositeAndFwd>>1;
721         if((compositeAndFwd&1)!=0) {
722             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
723         }
724         set.add(composite);
725     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
726 }
727 
728 /*
729  * Recomposes the buffer text starting at recomposeStartIndex
730  * (which is in NFD - decomposed and canonically ordered),
731  * and truncates the buffer contents.
732  *
733  * Note that recomposition never lengthens the text:
734  * Any character consists of either one or two code units;
735  * a composition may contain at most one more code unit than the original starter,
736  * while the combining mark that is removed has at least one code unit.
737  */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const738 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
739                                 UBool onlyContiguous) const {
740     UChar *p=buffer.getStart()+recomposeStartIndex;
741     UChar *limit=buffer.getLimit();
742     if(p==limit) {
743         return;
744     }
745 
746     UChar *starter, *pRemove, *q, *r;
747     const uint16_t *compositionsList;
748     UChar32 c, compositeAndFwd;
749     uint16_t norm16;
750     uint8_t cc, prevCC;
751     UBool starterIsSupplementary;
752 
753     // Some of the following variables are not used until we have a forward-combining starter
754     // and are only initialized now to avoid compiler warnings.
755     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
756     starter=NULL;
757     starterIsSupplementary=FALSE;
758     prevCC=0;
759 
760     for(;;) {
761         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
762         cc=getCCFromYesOrMaybe(norm16);
763         if( // this character combines backward and
764             isMaybe(norm16) &&
765             // we have seen a starter that combines forward and
766             compositionsList!=NULL &&
767             // the backward-combining character is not blocked
768             (prevCC<cc || prevCC==0)
769         ) {
770             if(isJamoVT(norm16)) {
771                 // c is a Jamo V/T, see if we can compose it with the previous character.
772                 if(c<Hangul::JAMO_T_BASE) {
773                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
774                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
775                     if(prev<Hangul::JAMO_L_COUNT) {
776                         pRemove=p-1;
777                         UChar syllable=(UChar)
778                             (Hangul::HANGUL_BASE+
779                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
780                              Hangul::JAMO_T_COUNT);
781                         UChar t;
782                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
783                             ++p;
784                             syllable+=t;  // The next character was a Jamo T.
785                         }
786                         *starter=syllable;
787                         // remove the Jamo V/T
788                         q=pRemove;
789                         r=p;
790                         while(r<limit) {
791                             *q++=*r++;
792                         }
793                         limit=q;
794                         p=pRemove;
795                     }
796                 }
797                 /*
798                  * No "else" for Jamo T:
799                  * Since the input is in NFD, there are no Hangul LV syllables that
800                  * a Jamo T could combine with.
801                  * All Jamo Ts are combined above when handling Jamo Vs.
802                  */
803                 if(p==limit) {
804                     break;
805                 }
806                 compositionsList=NULL;
807                 continue;
808             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
809                 // The starter and the combining mark (c) do combine.
810                 UChar32 composite=compositeAndFwd>>1;
811 
812                 // Replace the starter with the composite, remove the combining mark.
813                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
814                 if(starterIsSupplementary) {
815                     if(U_IS_SUPPLEMENTARY(composite)) {
816                         // both are supplementary
817                         starter[0]=U16_LEAD(composite);
818                         starter[1]=U16_TRAIL(composite);
819                     } else {
820                         *starter=(UChar)composite;
821                         // The composite is shorter than the starter,
822                         // move the intermediate characters forward one.
823                         starterIsSupplementary=FALSE;
824                         q=starter+1;
825                         r=q+1;
826                         while(r<pRemove) {
827                             *q++=*r++;
828                         }
829                         --pRemove;
830                     }
831                 } else if(U_IS_SUPPLEMENTARY(composite)) {
832                     // The composite is longer than the starter,
833                     // move the intermediate characters back one.
834                     starterIsSupplementary=TRUE;
835                     ++starter;  // temporarily increment for the loop boundary
836                     q=pRemove;
837                     r=++pRemove;
838                     while(starter<q) {
839                         *--r=*--q;
840                     }
841                     *starter=U16_TRAIL(composite);
842                     *--starter=U16_LEAD(composite);  // undo the temporary increment
843                 } else {
844                     // both are on the BMP
845                     *starter=(UChar)composite;
846                 }
847 
848                 /* remove the combining mark by moving the following text over it */
849                 if(pRemove<p) {
850                     q=pRemove;
851                     r=p;
852                     while(r<limit) {
853                         *q++=*r++;
854                     }
855                     limit=q;
856                     p=pRemove;
857                 }
858                 // Keep prevCC because we removed the combining mark.
859 
860                 if(p==limit) {
861                     break;
862                 }
863                 // Is the composite a starter that combines forward?
864                 if(compositeAndFwd&1) {
865                     compositionsList=
866                         getCompositionsListForComposite(getNorm16(composite));
867                 } else {
868                     compositionsList=NULL;
869                 }
870 
871                 // We combined; continue with looking for compositions.
872                 continue;
873             }
874         }
875 
876         // no combination this time
877         prevCC=cc;
878         if(p==limit) {
879             break;
880         }
881 
882         // If c did not combine, then check if it is a starter.
883         if(cc==0) {
884             // Found a new starter.
885             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
886                 // It may combine with something, prepare for it.
887                 if(U_IS_BMP(c)) {
888                     starterIsSupplementary=FALSE;
889                     starter=p-1;
890                 } else {
891                     starterIsSupplementary=TRUE;
892                     starter=p-2;
893                 }
894             }
895         } else if(onlyContiguous) {
896             // FCC: no discontiguous compositions; any intervening character blocks.
897             compositionsList=NULL;
898         }
899     }
900     buffer.setReorderingLimit(limit);
901 }
902 
903 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
904 // doCompose: normalize
905 // !doCompose: isNormalized (buffer must be empty and initialized)
906 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const907 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
908                          UBool onlyContiguous,
909                          UBool doCompose,
910                          ReorderingBuffer &buffer,
911                          UErrorCode &errorCode) const {
912     /*
913      * prevBoundary points to the last character before the current one
914      * that has a composition boundary before it with ccc==0 and quick check "yes".
915      * Keeping track of prevBoundary saves us looking for a composition boundary
916      * when we find a "no" or "maybe".
917      *
918      * When we back out from prevSrc back to prevBoundary,
919      * then we also remove those same characters (which had been simply copied
920      * or canonically-order-inserted) from the ReorderingBuffer.
921      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
922      * must correspond 1:1 to destination units at the end of the destination buffer.
923      */
924     const UChar *prevBoundary=src;
925     UChar32 minNoMaybeCP=minCompNoMaybeCP;
926     if(limit==NULL) {
927         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
928                                            doCompose ? &buffer : NULL,
929                                            errorCode);
930         if(U_FAILURE(errorCode)) {
931             return FALSE;
932         }
933         if(prevBoundary<src) {
934             // Set prevBoundary to the last character in the prefix.
935             prevBoundary=src-1;
936         }
937         limit=u_strchr(src, 0);
938     }
939 
940     const UChar *prevSrc;
941     UChar32 c=0;
942     uint16_t norm16=0;
943 
944     // only for isNormalized
945     uint8_t prevCC=0;
946 
947     for(;;) {
948         // count code units below the minimum or with irrelevant data for the quick check
949         for(prevSrc=src; src!=limit;) {
950             if( (c=*src)<minNoMaybeCP ||
951                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
952             ) {
953                 ++src;
954             } else if(!U16_IS_SURROGATE(c)) {
955                 break;
956             } else {
957                 UChar c2;
958                 if(U16_IS_SURROGATE_LEAD(c)) {
959                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
960                         c=U16_GET_SUPPLEMENTARY(c, c2);
961                     }
962                 } else /* trail surrogate */ {
963                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
964                         --src;
965                         c=U16_GET_SUPPLEMENTARY(c2, c);
966                     }
967                 }
968                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
969                     src+=U16_LENGTH(c);
970                 } else {
971                     break;
972                 }
973             }
974         }
975         // copy these code units all at once
976         if(src!=prevSrc) {
977             if(doCompose) {
978                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
979                     break;
980                 }
981             } else {
982                 prevCC=0;
983             }
984             if(src==limit) {
985                 break;
986             }
987             // Set prevBoundary to the last character in the quick check loop.
988             prevBoundary=src-1;
989             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
990                 U16_IS_LEAD(*(prevBoundary-1))
991             ) {
992                 --prevBoundary;
993             }
994             // The start of the current character (c).
995             prevSrc=src;
996         } else if(src==limit) {
997             break;
998         }
999 
1000         src+=U16_LENGTH(c);
1001         /*
1002          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1003          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1004          * or has ccc!=0.
1005          * Check for Jamo V/T, then for regular characters.
1006          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1007          */
1008         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1009             UChar prev=*(prevSrc-1);
1010             UBool needToDecompose=FALSE;
1011             if(c<Hangul::JAMO_T_BASE) {
1012                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1013                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1014                 if(prev<Hangul::JAMO_L_COUNT) {
1015                     if(!doCompose) {
1016                         return FALSE;
1017                     }
1018                     UChar syllable=(UChar)
1019                         (Hangul::HANGUL_BASE+
1020                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1021                          Hangul::JAMO_T_COUNT);
1022                     UChar t;
1023                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1024                         ++src;
1025                         syllable+=t;  // The next character was a Jamo T.
1026                         prevBoundary=src;
1027                         buffer.setLastChar(syllable);
1028                         continue;
1029                     }
1030                     // If we see L+V+x where x!=T then we drop to the slow path,
1031                     // decompose and recompose.
1032                     // This is to deal with NFKC finding normal L and V but a
1033                     // compatibility variant of a T. We need to either fully compose that
1034                     // combination here (which would complicate the code and may not work
1035                     // with strange custom data) or use the slow path -- or else our replacing
1036                     // two input characters (L+V) with one output character (LV syllable)
1037                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
1038                     // length as what we appended to the buffer since prevBoundary.
1039                     needToDecompose=TRUE;
1040                 }
1041             } else if(Hangul::isHangulWithoutJamoT(prev)) {
1042                 // c is a Jamo Trailing consonant,
1043                 // compose with previous Hangul LV that does not contain a Jamo T.
1044                 if(!doCompose) {
1045                     return FALSE;
1046                 }
1047                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1048                 prevBoundary=src;
1049                 continue;
1050             }
1051             if(!needToDecompose) {
1052                 // The Jamo V/T did not compose into a Hangul syllable.
1053                 if(doCompose) {
1054                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1055                         break;
1056                     }
1057                 } else {
1058                     prevCC=0;
1059                 }
1060                 continue;
1061             }
1062         }
1063         /*
1064          * Source buffer pointers:
1065          *
1066          *  all done      quick check   current char  not yet
1067          *                "yes" but     (c)           processed
1068          *                may combine
1069          *                forward
1070          * [-------------[-------------[-------------[-------------[
1071          * |             |             |             |             |
1072          * orig. src     prevBoundary  prevSrc       src           limit
1073          *
1074          *
1075          * Destination buffer pointers inside the ReorderingBuffer:
1076          *
1077          *  all done      might take    not filled yet
1078          *                characters for
1079          *                reordering
1080          * [-------------[-------------[-------------[
1081          * |             |             |             |
1082          * start         reorderStart  limit         |
1083          *                             +remainingCap.+
1084          */
1085         if(norm16>=MIN_YES_YES_WITH_CC) {
1086             uint8_t cc=(uint8_t)norm16;  // cc!=0
1087             if( onlyContiguous &&  // FCC
1088                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1089                 prevBoundary<prevSrc &&
1090                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1091                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1092                 // passed the quick check "yes && ccc==0" test.
1093                 // Check whether the last character was a "yesYes" or a "yesNo".
1094                 // If a "yesNo", then we get its trailing ccc from its
1095                 // mapping and check for canonical order.
1096                 // All other cases are ok.
1097                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1098             ) {
1099                 // Fails FCD test, need to decompose and contiguously recompose.
1100                 if(!doCompose) {
1101                     return FALSE;
1102                 }
1103             } else if(doCompose) {
1104                 if(!buffer.append(c, cc, errorCode)) {
1105                     break;
1106                 }
1107                 continue;
1108             } else if(prevCC<=cc) {
1109                 prevCC=cc;
1110                 continue;
1111             } else {
1112                 return FALSE;
1113             }
1114         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1115             return FALSE;
1116         }
1117 
1118         /*
1119          * Find appropriate boundaries around this character,
1120          * decompose the source text from between the boundaries,
1121          * and recompose it.
1122          *
1123          * We may need to remove the last few characters from the ReorderingBuffer
1124          * to account for source text that was copied or appended
1125          * but needs to take part in the recomposition.
1126          */
1127 
1128         /*
1129          * Find the last composition boundary in [prevBoundary..src[.
1130          * It is either the decomposition of the current character (at prevSrc),
1131          * or prevBoundary.
1132          */
1133         if(hasCompBoundaryBefore(c, norm16)) {
1134             prevBoundary=prevSrc;
1135         } else if(doCompose) {
1136             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1137         }
1138 
1139         // Find the next composition boundary in [src..limit[ -
1140         // modifies src to point to the next starter.
1141         src=(UChar *)findNextCompBoundary(src, limit);
1142 
1143         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1144         int32_t recomposeStartIndex=buffer.length();
1145         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1146             break;
1147         }
1148         recompose(buffer, recomposeStartIndex, onlyContiguous);
1149         if(!doCompose) {
1150             if(!buffer.equals(prevBoundary, src)) {
1151                 return FALSE;
1152             }
1153             buffer.remove();
1154             prevCC=0;
1155         }
1156 
1157         // Move to the next starter. We never need to look back before this point again.
1158         prevBoundary=src;
1159     }
1160     return TRUE;
1161 }
1162 
1163 // Very similar to compose(): Make the same changes in both places if relevant.
1164 // pQCResult==NULL: spanQuickCheckYes
1165 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1166 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1167 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1168                                    UBool onlyContiguous,
1169                                    UNormalizationCheckResult *pQCResult) const {
1170     /*
1171      * prevBoundary points to the last character before the current one
1172      * that has a composition boundary before it with ccc==0 and quick check "yes".
1173      */
1174     const UChar *prevBoundary=src;
1175     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1176     if(limit==NULL) {
1177         UErrorCode errorCode=U_ZERO_ERROR;
1178         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1179         if(prevBoundary<src) {
1180             // Set prevBoundary to the last character in the prefix.
1181             prevBoundary=src-1;
1182         }
1183         limit=u_strchr(src, 0);
1184     }
1185 
1186     const UChar *prevSrc;
1187     UChar32 c=0;
1188     uint16_t norm16=0;
1189     uint8_t prevCC=0;
1190 
1191     for(;;) {
1192         // count code units below the minimum or with irrelevant data for the quick check
1193         for(prevSrc=src;;) {
1194             if(src==limit) {
1195                 return src;
1196             }
1197             if( (c=*src)<minNoMaybeCP ||
1198                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1199             ) {
1200                 ++src;
1201             } else if(!U16_IS_SURROGATE(c)) {
1202                 break;
1203             } else {
1204                 UChar c2;
1205                 if(U16_IS_SURROGATE_LEAD(c)) {
1206                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1207                         c=U16_GET_SUPPLEMENTARY(c, c2);
1208                     }
1209                 } else /* trail surrogate */ {
1210                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1211                         --src;
1212                         c=U16_GET_SUPPLEMENTARY(c2, c);
1213                     }
1214                 }
1215                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1216                     src+=U16_LENGTH(c);
1217                 } else {
1218                     break;
1219                 }
1220             }
1221         }
1222         if(src!=prevSrc) {
1223             // Set prevBoundary to the last character in the quick check loop.
1224             prevBoundary=src-1;
1225             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1226                 U16_IS_LEAD(*(prevBoundary-1))
1227             ) {
1228                 --prevBoundary;
1229             }
1230             prevCC=0;
1231             // The start of the current character (c).
1232             prevSrc=src;
1233         }
1234 
1235         src+=U16_LENGTH(c);
1236         /*
1237          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1238          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1239          * or has ccc!=0.
1240          */
1241         if(isMaybeOrNonZeroCC(norm16)) {
1242             uint8_t cc=getCCFromYesOrMaybe(norm16);
1243             if( onlyContiguous &&  // FCC
1244                 cc!=0 &&
1245                 prevCC==0 &&
1246                 prevBoundary<prevSrc &&
1247                 // prevCC==0 && prevBoundary<prevSrc tell us that
1248                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1249                 // passed the quick check "yes && ccc==0" test.
1250                 // Check whether the last character was a "yesYes" or a "yesNo".
1251                 // If a "yesNo", then we get its trailing ccc from its
1252                 // mapping and check for canonical order.
1253                 // All other cases are ok.
1254                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1255             ) {
1256                 // Fails FCD test.
1257             } else if(prevCC<=cc || cc==0) {
1258                 prevCC=cc;
1259                 if(norm16<MIN_YES_YES_WITH_CC) {
1260                     if(pQCResult!=NULL) {
1261                         *pQCResult=UNORM_MAYBE;
1262                     } else {
1263                         return prevBoundary;
1264                     }
1265                 }
1266                 continue;
1267             }
1268         }
1269         if(pQCResult!=NULL) {
1270             *pQCResult=UNORM_NO;
1271         }
1272         return prevBoundary;
1273     }
1274 }
1275 
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1276 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1277                                        UBool doCompose,
1278                                        UBool onlyContiguous,
1279                                        UnicodeString &safeMiddle,
1280                                        ReorderingBuffer &buffer,
1281                                        UErrorCode &errorCode) const {
1282     if(!buffer.isEmpty()) {
1283         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1284         if(src!=firstStarterInSrc) {
1285             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1286                                                                     buffer.getLimit());
1287             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1288             UnicodeString middle(lastStarterInDest, destSuffixLength);
1289             buffer.removeSuffix(destSuffixLength);
1290             safeMiddle=middle;
1291             middle.append(src, (int32_t)(firstStarterInSrc-src));
1292             const UChar *middleStart=middle.getBuffer();
1293             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1294                     TRUE, buffer, errorCode);
1295             if(U_FAILURE(errorCode)) {
1296                 return;
1297             }
1298             src=firstStarterInSrc;
1299         }
1300     }
1301     if(doCompose) {
1302         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1303     } else {
1304         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1305             limit=u_strchr(src, 0);
1306         }
1307         buffer.appendZeroCC(src, limit, errorCode);
1308     }
1309 }
1310 
1311 /**
1312  * Does c have a composition boundary before it?
1313  * True if its decomposition begins with a character that has
1314  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1315  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1316  * (isCompYesAndZeroCC()) so we need not decompose.
1317  */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1318 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1319     for(;;) {
1320         if(isCompYesAndZeroCC(norm16)) {
1321             return TRUE;
1322         } else if(isMaybeOrNonZeroCC(norm16)) {
1323             return FALSE;
1324         } else if(isDecompNoAlgorithmic(norm16)) {
1325             c=mapAlgorithmic(c, norm16);
1326             norm16=getNorm16(c);
1327         } else {
1328             // c decomposes, get everything from the variable-length extra data
1329             const uint16_t *mapping=getMapping(norm16);
1330             uint16_t firstUnit=*mapping++;
1331             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1332                 return FALSE;
1333             }
1334             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
1335                 return FALSE;  // non-zero leadCC
1336             }
1337             int32_t i=0;
1338             UChar32 c;
1339             U16_NEXT_UNSAFE(mapping, i, c);
1340             return isCompYesAndZeroCC(getNorm16(c));
1341         }
1342     }
1343 }
1344 
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1345 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1346     for(;;) {
1347         uint16_t norm16=getNorm16(c);
1348         if(isInert(norm16)) {
1349             return TRUE;
1350         } else if(norm16<=minYesNo) {
1351             // Hangul LVT (==minYesNo) has a boundary after it.
1352             // Hangul LV and non-inert yesYes characters combine forward.
1353             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1354         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1355             return FALSE;
1356         } else if(isDecompNoAlgorithmic(norm16)) {
1357             c=mapAlgorithmic(c, norm16);
1358         } else {
1359             // c decomposes, get everything from the variable-length extra data.
1360             // If testInert, then c must be a yesNo character which has lccc=0,
1361             // otherwise it could be a noNo.
1362             const uint16_t *mapping=getMapping(norm16);
1363             uint16_t firstUnit=*mapping;
1364             // TRUE if
1365             //      c is not deleted, and
1366             //      it and its decomposition do not combine forward, and it has a starter, and
1367             //      if FCC then trailCC<=1
1368             return
1369                 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
1370                 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
1371                 (!onlyContiguous || firstUnit<=0x1ff);
1372         }
1373     }
1374 }
1375 
// Iterates backward from p and returns the start of the first code point found
// that has a composition boundary before it.
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator iter(normTrie, start, p);
    for(;;) {
        const uint16_t norm16=iter.previous16();
        if(hasCompBoundaryBefore(iter.codePoint, norm16)) {
            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
            // but that's probably not worth the extra cost.
            return iter.codePointStart;
        }
    }
}
1386 
// Iterates forward from p and returns the start of the first code point found
// that has a composition boundary before it.
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
    for(;;) {
        const uint16_t norm16=iter.next16();
        if(hasCompBoundaryBefore(iter.codePoint, norm16)) {
            return iter.codePointStart;
        }
    }
}
1395 
1396 class FCDTrieSingleton : public UTrie2Singleton {
1397 public:
FCDTrieSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1398     FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1399         UTrie2Singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1400     UTrie2 *getInstance(UErrorCode &errorCode) {
1401         return UTrie2Singleton::getInstance(createInstance, this, errorCode);
1402     }
1403     static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1404     UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1405         if(value!=0) {
1406             impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
1407         }
1408         return U_SUCCESS(errorCode);
1409     }
1410 
1411     Normalizer2Impl &impl;
1412     UTrie2 *newFCDTrie;
1413     UErrorCode &errorCode;
1414 };
1415 
1416 U_CDECL_BEGIN
1417 
1418 // Set the FCD value for a range of same-norm16 characters.
1419 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1420 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1421     return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
1422 }
1423 
1424 // Collect (OR together) the FCD values for a range of supplementary characters,
1425 // for their lead surrogate code unit.
1426 static UBool U_CALLCONV
enumRangeOrValue(const void * context,UChar32,UChar32,uint32_t value)1427 enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1428     *((uint32_t *)context)|=value;
1429     return TRUE;
1430 }
1431 
1432 U_CDECL_END
1433 
createInstance(const void * context,UErrorCode & errorCode)1434 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1435     FCDTrieSingleton *me=(FCDTrieSingleton *)context;
1436     me->newFCDTrie=utrie2_open(0, 0, &errorCode);
1437     if(U_SUCCESS(errorCode)) {
1438         utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
1439         for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1440             uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
1441             utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
1442             if(oredValue!=0) {
1443                 // Set a "bad" value for makeFCD() to break the quick check loop
1444                 // and look up the value for the supplementary code point.
1445                 // If there is any lccc, then set the worst-case lccc of 1.
1446                 // The ORed-together value's tccc is already the worst case.
1447                 if(oredValue>0xff) {
1448                     oredValue=0x100|(oredValue&0xff);
1449                 }
1450                 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
1451             }
1452         }
1453         utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1454         if(U_SUCCESS(errorCode)) {
1455             return me->newFCDTrie;
1456         }
1457     }
1458     utrie2_close(me->newFCDTrie);
1459     return NULL;
1460 }
1461 
setFCD16FromNorm16(UChar32 start,UChar32 end,uint16_t norm16,UTrie2 * newFCDTrie,UErrorCode & errorCode) const1462 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1463                                          UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
1464     // Only loops for 1:1 algorithmic mappings.
1465     for(;;) {
1466         if(norm16>=MIN_NORMAL_MAYBE_YES) {
1467             norm16&=0xff;
1468             norm16|=norm16<<8;
1469         } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
1470             // no decomposition or Hangul syllable, all zeros
1471             break;
1472         } else if(limitNoNo<=norm16) {
1473             int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
1474             if(start==end) {
1475                 start+=delta;
1476                 norm16=getNorm16(start);
1477             } else {
1478                 // the same delta leads from different original characters to different mappings
1479                 do {
1480                     UChar32 c=start+delta;
1481                     setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
1482                 } while(++start<=end);
1483                 break;
1484             }
1485         } else {
1486             // c decomposes, get everything from the variable-length extra data
1487             const uint16_t *mapping=getMapping(norm16);
1488             uint16_t firstUnit=*mapping;
1489             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1490                 // A character that is deleted (maps to an empty string) must
1491                 // get the worst-case lccc and tccc values because arbitrary
1492                 // characters on both sides will become adjacent.
1493                 norm16=0x1ff;
1494             } else {
1495                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1496                     norm16=mapping[1]&0xff00;  // lccc
1497                 } else {
1498                     norm16=0;
1499                 }
1500                 norm16|=firstUnit>>8;  // tccc
1501             }
1502         }
1503         utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
1504         break;
1505     }
1506 }
1507 
getFCDTrie(UErrorCode & errorCode) const1508 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
1509     // Logically const: Synchronized instantiation.
1510     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1511     return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
1512 }
1513 
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Walks the text from src to limit (limit==NULL: NUL-terminated) and compares
// each character's lead combining class (lccc, upper byte of its fcd16 value)
// with the trail combining class (tccc, lower byte) of the character before it.
//
// With a buffer, in-order text is appended as is, and each out-of-order
// stretch is decomposed and reordered via decomposeShort().
// Without a buffer, the function stops at the first out-of-order character.
//
// Returns:
// - buffer==NULL and an out-of-order character was found: prevBoundary,
//   the last FCD-safe boundary before that character.
// - otherwise: the position where processing stopped. This is limit unless
//   an append or decomposeShort() call reported failure.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // fcd16 of the previous character, or the bitwise complement (a negative value)
    // of a previous code unit below MIN_CCC_LCCC_CP whose lookup was deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        // NUL-terminated input: handle the leading code units below MIN_CCC_LCCC_CP first,
        // then determine the real limit.
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16FromSingleLead(*(src-1));
            if(prevFCD16>1) {
                // tccc>1: the boundary is before the previous character, not after it.
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UTrie2 *trie=fcdTrie();

    // Start of the run of lccc==0 code units, later the start of the current character.
    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the trie lookup; ~c is negative and marks the deferral.
                prevFCD16=~c;
                ++src;
            } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
                // Upper byte is zero: lccc==0.
                prevFCD16=fcd16;
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                // BMP character with lccc!=0: leave the fast loop.
                break;
            } else {
                // Surrogate code unit with a non-zero upper byte in its per-unit value:
                // assemble the supplementary code point if possible and look it up.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        // Back up to the lead surrogate that was already passed in this run.
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if((fcd16=getFCD16(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    // tccc>1: the boundary is at the start of the previous character.
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            // Continue after the decomposed piece with a clean state.
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
1660 
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1661 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1662                                        UBool doMakeFCD,
1663                                        UnicodeString &safeMiddle,
1664                                        ReorderingBuffer &buffer,
1665                                        UErrorCode &errorCode) const {
1666     if(!buffer.isEmpty()) {
1667         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1668         if(src!=firstBoundaryInSrc) {
1669             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1670                                                                     buffer.getLimit());
1671             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1672             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1673             buffer.removeSuffix(destSuffixLength);
1674             safeMiddle=middle;
1675             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1676             const UChar *middleStart=middle.getBuffer();
1677             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1678             if(U_FAILURE(errorCode)) {
1679                 return;
1680             }
1681             src=firstBoundaryInSrc;
1682         }
1683     }
1684     if(doMakeFCD) {
1685         makeFCD(src, limit, &buffer, errorCode);
1686     } else {
1687         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1688             limit=u_strchr(src, 0);
1689         }
1690         buffer.appendZeroCC(src, limit, errorCode);
1691     }
1692 }
1693 
// Iterates backward from p toward start over the FCD trie values and
// returns the start of the first code point reached whose value is <=0xff.
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator backward(fcdTrie(), start, p);
    // Keep stepping back while the value has a non-zero upper byte.
    while(backward.previous16()>0xff) {}
    return backward.codePointStart;
}
1702 
// Iterates forward from p toward limit over the FCD trie values and
// returns the start of the first code point reached whose value is <=0xff.
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator forward(fcdTrie(), p, limit);
    // Keep advancing while the value has a non-zero upper byte.
    while(forward.next16()>0xff) {}
    return forward.codePointStart;
}
1711 
1712 // CanonicalIterator data -------------------------------------------------- ***
1713 
CanonIterData(UErrorCode & errorCode)1714 CanonIterData::CanonIterData(UErrorCode &errorCode) :
1715         trie(utrie2_open(0, 0, &errorCode)),
1716         canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
1717 
~CanonIterData()1718 CanonIterData::~CanonIterData() {
1719     utrie2_close(trie);
1720 }
1721 
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)1722 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1723     uint32_t canonValue=utrie2_get32(trie, decompLead);
1724     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1725         // origin is the first character whose decomposition starts with
1726         // the character for which we are setting the value.
1727         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1728     } else {
1729         // origin is not the first character, or it is U+0000.
1730         UnicodeSet *set;
1731         if((canonValue&CANON_HAS_SET)==0) {
1732             set=new UnicodeSet;
1733             if(set==NULL) {
1734                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1735                 return;
1736             }
1737             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1738             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1739             utrie2_set32(trie, decompLead, canonValue, &errorCode);
1740             canonStartSets.addElement(set, errorCode);
1741             if(firstOrigin!=0) {
1742                 set->add(firstOrigin);
1743             }
1744         } else {
1745             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1746         }
1747         set->add(origin);
1748     }
1749 }
1750 
1751 class CanonIterDataSingleton {
1752 public:
CanonIterDataSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1753     CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1754         singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1755     CanonIterData *getInstance(UErrorCode &errorCode) {
1756         void *duplicate;
1757         CanonIterData *instance=
1758             (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1759         delete (CanonIterData *)duplicate;
1760         return instance;
1761     }
1762     static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1763     UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1764         if(value!=0) {
1765             impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1766         }
1767         return U_SUCCESS(errorCode);
1768     }
1769 
1770 private:
1771     SimpleSingleton &singleton;
1772     Normalizer2Impl &impl;
1773     CanonIterData *newData;
1774     UErrorCode &errorCode;
1775 };
1776 
1777 U_CDECL_BEGIN
1778 
1779 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1780 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1781 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1782     return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1783 }
1784 
1785 U_CDECL_END
1786 
createInstance(const void * context,UErrorCode & errorCode)1787 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1788     CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1789     me->newData=new CanonIterData(errorCode);
1790     if(me->newData==NULL) {
1791         errorCode=U_MEMORY_ALLOCATION_ERROR;
1792         return NULL;
1793     }
1794     if(U_SUCCESS(errorCode)) {
1795         utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1796         utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1797         if(U_SUCCESS(errorCode)) {
1798             return me->newData;
1799         }
1800     }
1801     delete me->newData;
1802     return NULL;
1803 }
1804 
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const1805 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1806                                                   CanonIterData &newData,
1807                                                   UErrorCode &errorCode) const {
1808     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1809         // Inert, or 2-way mapping (including Hangul syllable).
1810         // We do not write a canonStartSet for any yesNo character.
1811         // Composites from 2-way mappings are added at runtime from the
1812         // starter's compositions list, and the other characters in
1813         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1814         // "maybe" characters.
1815         return;
1816     }
1817     for(UChar32 c=start; c<=end; ++c) {
1818         uint32_t oldValue=utrie2_get32(newData.trie, c);
1819         uint32_t newValue=oldValue;
1820         if(norm16>=minMaybeYes) {
1821             // not a segment starter if it occurs in a decomposition or has cc!=0
1822             newValue|=CANON_NOT_SEGMENT_STARTER;
1823             if(norm16<MIN_NORMAL_MAYBE_YES) {
1824                 newValue|=CANON_HAS_COMPOSITIONS;
1825             }
1826         } else if(norm16<minYesNo) {
1827             newValue|=CANON_HAS_COMPOSITIONS;
1828         } else {
1829             // c has a one-way decomposition
1830             UChar32 c2=c;
1831             uint16_t norm16_2=norm16;
1832             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1833                 c2=mapAlgorithmic(c2, norm16_2);
1834                 norm16_2=getNorm16(c2);
1835             }
1836             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1837                 // c decomposes, get everything from the variable-length extra data
1838                 const uint16_t *mapping=getMapping(norm16_2);
1839                 uint16_t firstUnit=*mapping++;
1840                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1841                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1842                     if(c==c2 && (*mapping&0xff)!=0) {
1843                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
1844                     }
1845                     ++mapping;
1846                 }
1847                 // Skip empty mappings (no characters in the decomposition).
1848                 if(length!=0) {
1849                     // add c to first code point's start set
1850                     int32_t i=0;
1851                     U16_NEXT_UNSAFE(mapping, i, c2);
1852                     newData.addToStartSet(c, c2, errorCode);
1853                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1854                     // one-way mapping. A 2-way mapping is possible here after
1855                     // intermediate algorithmic mapping.
1856                     if(norm16_2>=minNoNo) {
1857                         while(i<length) {
1858                             U16_NEXT_UNSAFE(mapping, i, c2);
1859                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
1860                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1861                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1862                                              &errorCode);
1863                             }
1864                         }
1865                     }
1866                 }
1867             } else {
1868                 // c decomposed to c2 algorithmically; c has cc==0
1869                 newData.addToStartSet(c, c2, errorCode);
1870             }
1871         }
1872         if(newValue!=oldValue) {
1873             utrie2_set32(newData.trie, c, newValue, &errorCode);
1874         }
1875     }
1876 }
1877 
ensureCanonIterData(UErrorCode & errorCode) const1878 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1879     // Logically const: Synchronized instantiation.
1880     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1881     CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1882     return U_SUCCESS(errorCode);
1883 }
1884 
getCanonValue(UChar32 c) const1885 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1886     return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1887 }
1888 
getCanonStartSet(int32_t n) const1889 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1890     return *(const UnicodeSet *)(
1891         ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1892 }
1893 
isCanonSegmentStarter(UChar32 c) const1894 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1895     return getCanonValue(c)>=0;
1896 }
1897 
getCanonStartSet(UChar32 c,UnicodeSet & set) const1898 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1899     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1900     if(canonValue==0) {
1901         return FALSE;
1902     }
1903     set.clear();
1904     int32_t value=canonValue&CANON_VALUE_MASK;
1905     if((canonValue&CANON_HAS_SET)!=0) {
1906         set.addAll(getCanonStartSet(value));
1907     } else if(value!=0) {
1908         set.add(value);
1909     }
1910     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1911         uint16_t norm16=getNorm16(c);
1912         if(norm16==JAMO_L) {
1913             UChar32 syllable=
1914                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1915             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1916         } else {
1917             addComposites(getCompositionsList(norm16), set);
1918         }
1919     }
1920     return TRUE;
1921 }
1922 
1923 U_NAMESPACE_END
1924 
1925 // Normalizer2 data swapping ----------------------------------------------- ***
1926 
1927 U_NAMESPACE_USE
1928 
1929 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1930 unorm2_swap(const UDataSwapper *ds,
1931             const void *inData, int32_t length, void *outData,
1932             UErrorCode *pErrorCode) {
1933     const UDataInfo *pInfo;
1934     int32_t headerSize;
1935 
1936     const uint8_t *inBytes;
1937     uint8_t *outBytes;
1938 
1939     const int32_t *inIndexes;
1940     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1941 
1942     int32_t i, offset, nextOffset, size;
1943 
1944     /* udata_swapDataHeader checks the arguments */
1945     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1946     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1947         return 0;
1948     }
1949 
1950     /* check data format and format version */
1951     pInfo=(const UDataInfo *)((const char *)inData+4);
1952     if(!(
1953         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
1954         pInfo->dataFormat[1]==0x72 &&
1955         pInfo->dataFormat[2]==0x6d &&
1956         pInfo->dataFormat[3]==0x32 &&
1957         pInfo->formatVersion[0]==1
1958     )) {
1959         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1960                          pInfo->dataFormat[0], pInfo->dataFormat[1],
1961                          pInfo->dataFormat[2], pInfo->dataFormat[3],
1962                          pInfo->formatVersion[0]);
1963         *pErrorCode=U_UNSUPPORTED_ERROR;
1964         return 0;
1965     }
1966 
1967     inBytes=(const uint8_t *)inData+headerSize;
1968     outBytes=(uint8_t *)outData+headerSize;
1969 
1970     inIndexes=(const int32_t *)inBytes;
1971 
1972     if(length>=0) {
1973         length-=headerSize;
1974         if(length<(int32_t)sizeof(indexes)) {
1975             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1976                              length);
1977             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1978             return 0;
1979         }
1980     }
1981 
1982     /* read the first few indexes */
1983     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
1984         indexes[i]=udata_readInt32(ds, inIndexes[i]);
1985     }
1986 
1987     /* get the total length of the data */
1988     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1989 
1990     if(length>=0) {
1991         if(length<size) {
1992             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1993                              length);
1994             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1995             return 0;
1996         }
1997 
1998         /* copy the data for inaccessible bytes */
1999         if(inBytes!=outBytes) {
2000             uprv_memcpy(outBytes, inBytes, size);
2001         }
2002 
2003         offset=0;
2004 
2005         /* swap the int32_t indexes[] */
2006         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2007         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2008         offset=nextOffset;
2009 
2010         /* swap the UTrie2 */
2011         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2012         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2013         offset=nextOffset;
2014 
2015         /* swap the uint16_t extraData[] */
2016         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
2017         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2018         offset=nextOffset;
2019 
2020         U_ASSERT(offset==size);
2021     }
2022 
2023     return headerSize+size;
2024 }
2025 
2026 #endif  // !UCONFIG_NO_NORMALIZATION
2027