• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  normalizer2impl.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov22
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_NORMALIZATION
22 
23 #include "unicode/bytestream.h"
24 #include "unicode/edits.h"
25 #include "unicode/normalizer2.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/udata.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf8.h"
31 #include "bytesinkutil.h"
32 #include "cmemory.h"
33 #include "mutex.h"
34 #include "normalizer2impl.h"
35 #include "putilimp.h"
36 #include "uassert.h"
37 #include "uset_imp.h"
38 #include "utrie2.h"
39 #include "uvector.h"
40 
41 U_NAMESPACE_BEGIN
42 
43 namespace {
44 
45 /**
46  * UTF-8 lead byte for minNoMaybeCP.
47  * Can be lower than the actual lead byte for c.
48  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
49  */
leadByteForCP(UChar32 c)50 inline uint8_t leadByteForCP(UChar32 c) {
51     if (c <= 0x7f) {
52         return (uint8_t)c;
53     } else if (c <= 0x7ff) {
54         return (uint8_t)(0xc0+(c>>6));
55     } else {
56         // Should not occur because ccc(U+0300)!=0.
57         return 0xe0;
58     }
59 }
60 
61 /**
62  * Returns the code point from one single well-formed UTF-8 byte sequence
63  * between cpStart and cpLimit.
64  *
65  * UTrie2 UTF-8 macros do not assemble whole code points (for efficiency).
66  * When we do need the code point, we call this function.
67  * We should not need it for normalization-inert data (norm16==0).
68  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
69  */
codePointFromValidUTF8(const uint8_t * cpStart,const uint8_t * cpLimit)70 UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) {
71     // Similar to U8_NEXT_UNSAFE(s, i, c).
72     U_ASSERT(cpStart < cpLimit);
73     uint8_t c = *cpStart;
74     switch(cpLimit-cpStart) {
75     case 1:
76         return c;
77     case 2:
78         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);
79     case 3:
80         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
81         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));
82     case 4:
83         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);
84     default:
85         U_ASSERT(FALSE);  // Should not occur.
86         return U_SENTINEL;
87     }
88 }
89 
90 /**
91  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
92  * Otherwise returns a negative value.
93  */
previousHangulOrJamo(const uint8_t * start,const uint8_t * p)94 UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) {
95     if ((p - start) >= 3) {
96         p -= 3;
97         uint8_t l = *p;
98         uint8_t t1, t2;
99         if (0xe1 <= l && l <= 0xed &&
100                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&
101                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&
102                 (l < 0xed || t1 <= 0x1f)) {
103             return ((l & 0xf) << 12) | (t1 << 6) | t2;
104         }
105     }
106     return U_SENTINEL;
107 }
108 
/**
 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 * Otherwise returns a negative value.
 *
 * @param src   start of the UTF-8 text
 * @param limit end of the UTF-8 text
 * @return 1..0x1b for the byte sequences E1 86 A8..E1 87 82, otherwise -1
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    const uint8_t trail = src[2];
    switch (src[1]) {
    case 0x86:
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= trail && trail <= 0xbf) {
            return trail - 0xa7;
        }
        break;
    case 0x87:
        // Second lead+trail prefix: trail bytes 80..82 continue after the 0x40-wide first block.
        if (0x80 <= trail && trail <= 0x82) {
            return trail - (0xa7 - 0x40);
        }
        break;
    default:
        break;
    }
    return -1;
}
132 
133 void
appendCodePointDelta(const uint8_t * cpStart,const uint8_t * cpLimit,int32_t delta,ByteSink & sink,Edits * edits)134 appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,
135                      ByteSink &sink, Edits *edits) {
136     char buffer[U8_MAX_LENGTH];
137     int32_t length;
138     int32_t cpLength = (int32_t)(cpLimit - cpStart);
139     if (cpLength == 1) {
140         // The builder makes ASCII map to ASCII.
141         buffer[0] = (uint8_t)(*cpStart + delta);
142         length = 1;
143     } else {
144         int32_t trail = *(cpLimit-1) + delta;
145         if (0x80 <= trail && trail <= 0xbf) {
146             // The delta only changes the last trail byte.
147             --cpLimit;
148             length = 0;
149             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit);
150             buffer[length++] = (uint8_t)trail;
151         } else {
152             // Decode the code point, add the delta, re-encode.
153             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;
154             length = 0;
155             U8_APPEND_UNSAFE(buffer, length, c);
156         }
157     }
158     if (edits != nullptr) {
159         edits->addReplace(cpLength, length);
160     }
161     sink.Append(buffer, length);
162 }
163 
164 }  // namespace
165 
166 // ReorderingBuffer -------------------------------------------------------- ***
167 
ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest,UErrorCode & errorCode)168 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,
169                                    UErrorCode &errorCode) :
170         impl(ni), str(dest),
171         start(str.getBuffer(8)), reorderStart(start), limit(start),
172         remainingCapacity(str.getCapacity()), lastCC(0) {
173     if (start == nullptr && U_SUCCESS(errorCode)) {
174         // getBuffer() already did str.setToBogus()
175         errorCode = U_MEMORY_ALLOCATION_ERROR;
176     }
177 }
178 
init(int32_t destCapacity,UErrorCode & errorCode)179 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
180     int32_t length=str.length();
181     start=str.getBuffer(destCapacity);
182     if(start==NULL) {
183         // getBuffer() already did str.setToBogus()
184         errorCode=U_MEMORY_ALLOCATION_ERROR;
185         return FALSE;
186     }
187     limit=start+length;
188     remainingCapacity=str.getCapacity()-length;
189     reorderStart=start;
190     if(start==limit) {
191         lastCC=0;
192     } else {
193         setIterator();
194         lastCC=previousCC();
195         // Set reorderStart after the last code point with cc<=1 if there is one.
196         if(lastCC>1) {
197             while(previousCC()>1) {}
198         }
199         reorderStart=codePointLimit;
200     }
201     return TRUE;
202 }
203 
equals(const UChar * otherStart,const UChar * otherLimit) const204 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
205     int32_t length=(int32_t)(limit-start);
206     return
207         length==(int32_t)(otherLimit-otherStart) &&
208         0==u_memcmp(start, otherStart, length);
209 }
210 
equals(const uint8_t * otherStart,const uint8_t * otherLimit) const211 UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const {
212     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller
213     int32_t length = (int32_t)(limit - start);
214     int32_t otherLength = (int32_t)(otherLimit - otherStart);
215     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
216     if (otherLength < length || (otherLength / 3) > length) {
217         return FALSE;
218     }
219     // Compare valid strings from between normalization boundaries.
220     // (Invalid sequences are normalization-inert.)
221     for (int32_t i = 0, j = 0;;) {
222         if (i >= length) {
223             return j >= otherLength;
224         } else if (j >= otherLength) {
225             return FALSE;
226         }
227         // Not at the end of either string yet.
228         UChar32 c, other;
229         U16_NEXT_UNSAFE(start, i, c);
230         U8_NEXT_UNSAFE(otherStart, j, other);
231         if (c != other) {
232             return FALSE;
233         }
234     }
235 }
236 
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)237 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
238     if(remainingCapacity<2 && !resize(2, errorCode)) {
239         return FALSE;
240     }
241     if(lastCC<=cc || cc==0) {
242         limit[0]=U16_LEAD(c);
243         limit[1]=U16_TRAIL(c);
244         limit+=2;
245         lastCC=cc;
246         if(cc<=1) {
247             reorderStart=limit;
248         }
249     } else {
250         insert(c, cc);
251     }
252     remainingCapacity-=2;
253     return TRUE;
254 }
255 
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)256 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
257                                uint8_t leadCC, uint8_t trailCC,
258                                UErrorCode &errorCode) {
259     if(length==0) {
260         return TRUE;
261     }
262     if(remainingCapacity<length && !resize(length, errorCode)) {
263         return FALSE;
264     }
265     remainingCapacity-=length;
266     if(lastCC<=leadCC || leadCC==0) {
267         if(trailCC<=1) {
268             reorderStart=limit+length;
269         } else if(leadCC<=1) {
270             reorderStart=limit+1;  // Ok if not a code point boundary.
271         }
272         const UChar *sLimit=s+length;
273         do { *limit++=*s++; } while(s!=sLimit);
274         lastCC=trailCC;
275     } else {
276         int32_t i=0;
277         UChar32 c;
278         U16_NEXT(s, i, length, c);
279         insert(c, leadCC);  // insert first code point
280         while(i<length) {
281             U16_NEXT(s, i, length, c);
282             if(i<length) {
283                 // s must be in NFD, otherwise we need to use getCC().
284                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
285             } else {
286                 leadCC=trailCC;
287             }
288             append(c, leadCC, errorCode);
289         }
290     }
291     return TRUE;
292 }
293 
appendZeroCC(UChar32 c,UErrorCode & errorCode)294 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
295     int32_t cpLength=U16_LENGTH(c);
296     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
297         return FALSE;
298     }
299     remainingCapacity-=cpLength;
300     if(cpLength==1) {
301         *limit++=(UChar)c;
302     } else {
303         limit[0]=U16_LEAD(c);
304         limit[1]=U16_TRAIL(c);
305         limit+=2;
306     }
307     lastCC=0;
308     reorderStart=limit;
309     return TRUE;
310 }
311 
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)312 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
313     if(s==sLimit) {
314         return TRUE;
315     }
316     int32_t length=(int32_t)(sLimit-s);
317     if(remainingCapacity<length && !resize(length, errorCode)) {
318         return FALSE;
319     }
320     u_memcpy(limit, s, length);
321     limit+=length;
322     remainingCapacity-=length;
323     lastCC=0;
324     reorderStart=limit;
325     return TRUE;
326 }
327 
remove()328 void ReorderingBuffer::remove() {
329     reorderStart=limit=start;
330     remainingCapacity=str.getCapacity();
331     lastCC=0;
332 }
333 
removeSuffix(int32_t suffixLength)334 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
335     if(suffixLength<(limit-start)) {
336         limit-=suffixLength;
337         remainingCapacity+=suffixLength;
338     } else {
339         limit=start;
340         remainingCapacity=str.getCapacity();
341     }
342     lastCC=0;
343     reorderStart=limit;
344 }
345 
resize(int32_t appendLength,UErrorCode & errorCode)346 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
347     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
348     int32_t length=(int32_t)(limit-start);
349     str.releaseBuffer(length);
350     int32_t newCapacity=length+appendLength;
351     int32_t doubleCapacity=2*str.getCapacity();
352     if(newCapacity<doubleCapacity) {
353         newCapacity=doubleCapacity;
354     }
355     if(newCapacity<256) {
356         newCapacity=256;
357     }
358     start=str.getBuffer(newCapacity);
359     if(start==NULL) {
360         // getBuffer() already did str.setToBogus()
361         errorCode=U_MEMORY_ALLOCATION_ERROR;
362         return FALSE;
363     }
364     reorderStart=start+reorderStartIndex;
365     limit=start+length;
366     remainingCapacity=str.getCapacity()-length;
367     return TRUE;
368 }
369 
skipPrevious()370 void ReorderingBuffer::skipPrevious() {
371     codePointLimit=codePointStart;
372     UChar c=*--codePointStart;
373     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
374         --codePointStart;
375     }
376 }
377 
previousCC()378 uint8_t ReorderingBuffer::previousCC() {
379     codePointLimit=codePointStart;
380     if(reorderStart>=codePointStart) {
381         return 0;
382     }
383     UChar32 c=*--codePointStart;
384     UChar c2;
385     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
386         --codePointStart;
387         c=U16_GET_SUPPLEMENTARY(c2, c);
388     }
389     return impl.getCCFromYesOrMaybeCP(c);
390 }
391 
392 // Inserts c somewhere before the last character.
393 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)394 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
395     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
396     // insert c at codePointLimit, after the character with prevCC<=cc
397     UChar *q=limit;
398     UChar *r=limit+=U16_LENGTH(c);
399     do {
400         *--r=*--q;
401     } while(codePointLimit!=q);
402     writeCodePoint(q, c);
403     if(cc<=1) {
404         reorderStart=r;
405     }
406 }
407 
408 // Normalizer2Impl --------------------------------------------------------- ***
409 
410 struct CanonIterData : public UMemory {
411     CanonIterData(UErrorCode &errorCode);
412     ~CanonIterData();
413     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
414     UTrie2 *trie;
415     UVector canonStartSets;  // contains UnicodeSet *
416 };
417 
~Normalizer2Impl()418 Normalizer2Impl::~Normalizer2Impl() {
419     delete fCanonIterData;
420 }
421 
422 void
init(const int32_t * inIndexes,const UTrie2 * inTrie,const uint16_t * inExtraData,const uint8_t * inSmallFCD)423 Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
424                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
425     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
426     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
427     minLcccCP=inIndexes[IX_MIN_LCCC_CP];
428 
429     minYesNo=inIndexes[IX_MIN_YES_NO];
430     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
431     minNoNo=inIndexes[IX_MIN_NO_NO];
432     minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
433     minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
434     minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
435     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
436     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
437     U_ASSERT((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
438     centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
439 
440     normTrie=inTrie;
441 
442     maybeYesCompositions=inExtraData;
443     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
444 
445     smallFCD=inSmallFCD;
446 }
447 
448 class LcccContext {
449 public:
LcccContext(const Normalizer2Impl & ni,UnicodeSet & s)450     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
451 
handleRange(UChar32 start,UChar32 end,uint16_t norm16)452     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
453         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
454                 norm16 != Normalizer2Impl::JAMO_VT) {
455             set.add(start, end);
456         } else if (impl.minNoNoCompNoMaybeCC <= norm16 && norm16 < impl.limitNoNo) {
457             uint16_t fcd16=impl.getFCD16(start);
458             if(fcd16>0xff) { set.add(start, end); }
459         }
460     }
461 
462 private:
463     const Normalizer2Impl &impl;
464     UnicodeSet &set;
465 };
466 
467 namespace {
468 
469 struct PropertyStartsContext {
PropertyStartsContext__anona4c62b150211::PropertyStartsContext470     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
471             : impl(ni), sa(adder) {}
472 
473     const Normalizer2Impl &impl;
474     const USetAdder *sa;
475 };
476 
477 }  // namespace
478 
479 U_CDECL_BEGIN
480 
481 static UBool U_CALLCONV
enumLcccRange(const void * context,UChar32 start,UChar32 end,uint32_t value)482 enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
483     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
484     return TRUE;
485 }
486 
487 static UBool U_CALLCONV
enumNorm16PropertyStartsRange(const void * context,UChar32 start,UChar32 end,uint32_t value)488 enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
489     /* add the start code point to the USet */
490     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
491     const USetAdder *sa=ctx->sa;
492     sa->add(sa->set, start);
493     if (start != end && ctx->impl.isAlgorithmicNoNo((uint16_t)value) &&
494             (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
495         // Range of code points with same-norm16-value algorithmic decompositions.
496         // They might have different non-zero FCD16 values.
497         uint16_t prevFCD16=ctx->impl.getFCD16(start);
498         while(++start<=end) {
499             uint16_t fcd16=ctx->impl.getFCD16(start);
500             if(fcd16!=prevFCD16) {
501                 sa->add(sa->set, start);
502                 prevFCD16=fcd16;
503             }
504         }
505     }
506     return TRUE;
507 }
508 
509 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)510 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
511     /* add the start code point to the USet */
512     const USetAdder *sa=(const USetAdder *)context;
513     sa->add(sa->set, start);
514     return TRUE;
515 }
516 
517 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)518 segmentStarterMapper(const void * /*context*/, uint32_t value) {
519     return value&CANON_NOT_SEGMENT_STARTER;
520 }
521 
522 U_CDECL_END
523 
524 void
addLcccChars(UnicodeSet & set) const525 Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
526     LcccContext context(*this, set);
527     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
528 }
529 
530 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const531 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
532     /* add the start code point of each same-value range of each trie */
533     PropertyStartsContext context(*this, sa);
534     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
535 
536     /* add Hangul LV syllables and LV+1 because of skippables */
537     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
538         sa->add(sa->set, c);
539         sa->add(sa->set, c+1);
540     }
541     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
542 }
543 
544 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const545 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
546     /* add the start code point of each same-value range of the canonical iterator data trie */
547     if(ensureCanonIterData(errorCode)) {
548         // currently only used for the SEGMENT_STARTER property
549         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
550     }
551 }
552 
553 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const554 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
555                                                 UChar32 minNeedDataCP,
556                                                 ReorderingBuffer *buffer,
557                                                 UErrorCode &errorCode) const {
558     // Make some effort to support NUL-terminated strings reasonably.
559     // Take the part of the fast quick check loop that does not look up
560     // data and check the first part of the string.
561     // After this prefix, determine the string length to simplify the rest
562     // of the code.
563     const UChar *prevSrc=src;
564     UChar c;
565     while((c=*src++)<minNeedDataCP && c!=0) {}
566     // Back out the last character for full processing.
567     // Copy this prefix.
568     if(--src!=prevSrc) {
569         if(buffer!=NULL) {
570             buffer->appendZeroCC(prevSrc, src, errorCode);
571         }
572     }
573     return src;
574 }
575 
576 UnicodeString &
decompose(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const577 Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
578                            UErrorCode &errorCode) const {
579     if(U_FAILURE(errorCode)) {
580         dest.setToBogus();
581         return dest;
582     }
583     const UChar *sArray=src.getBuffer();
584     if(&dest==&src || sArray==NULL) {
585         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
586         dest.setToBogus();
587         return dest;
588     }
589     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
590     return dest;
591 }
592 
593 void
decompose(const UChar * src,const UChar * limit,UnicodeString & dest,int32_t destLengthEstimate,UErrorCode & errorCode) const594 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
595                            UnicodeString &dest,
596                            int32_t destLengthEstimate,
597                            UErrorCode &errorCode) const {
598     if(destLengthEstimate<0 && limit!=NULL) {
599         destLengthEstimate=(int32_t)(limit-src);
600     }
601     dest.remove();
602     ReorderingBuffer buffer(*this, dest);
603     if(buffer.init(destLengthEstimate, errorCode)) {
604         decompose(src, limit, &buffer, errorCode);
605     }
606 }
607 
608 // Dual functionality:
609 // buffer!=NULL: normalize
610 // buffer==NULL: isNormalized/spanQuickCheckYes
611 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const612 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
613                            ReorderingBuffer *buffer,
614                            UErrorCode &errorCode) const {
615     UChar32 minNoCP=minDecompNoCP;
616     if(limit==NULL) {
617         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
618         if(U_FAILURE(errorCode)) {
619             return src;
620         }
621         limit=u_strchr(src, 0);
622     }
623 
624     const UChar *prevSrc;
625     UChar32 c=0;
626     uint16_t norm16=0;
627 
628     // only for quick check
629     const UChar *prevBoundary=src;
630     uint8_t prevCC=0;
631 
632     for(;;) {
633         // count code units below the minimum or with irrelevant data for the quick check
634         for(prevSrc=src; src!=limit;) {
635             if( (c=*src)<minNoCP ||
636                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
637             ) {
638                 ++src;
639             } else if(!U16_IS_SURROGATE(c)) {
640                 break;
641             } else {
642                 UChar c2;
643                 if(U16_IS_SURROGATE_LEAD(c)) {
644                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
645                         c=U16_GET_SUPPLEMENTARY(c, c2);
646                     }
647                 } else /* trail surrogate */ {
648                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
649                         --src;
650                         c=U16_GET_SUPPLEMENTARY(c2, c);
651                     }
652                 }
653                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
654                     src+=U16_LENGTH(c);
655                 } else {
656                     break;
657                 }
658             }
659         }
660         // copy these code units all at once
661         if(src!=prevSrc) {
662             if(buffer!=NULL) {
663                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
664                     break;
665                 }
666             } else {
667                 prevCC=0;
668                 prevBoundary=src;
669             }
670         }
671         if(src==limit) {
672             break;
673         }
674 
675         // Check one above-minimum, relevant code point.
676         src+=U16_LENGTH(c);
677         if(buffer!=NULL) {
678             if(!decompose(c, norm16, *buffer, errorCode)) {
679                 break;
680             }
681         } else {
682             if(isDecompYes(norm16)) {
683                 uint8_t cc=getCCFromYesOrMaybe(norm16);
684                 if(prevCC<=cc || cc==0) {
685                     prevCC=cc;
686                     if(cc<=1) {
687                         prevBoundary=src;
688                     }
689                     continue;
690                 }
691             }
692             return prevBoundary;  // "no" or cc out of order
693         }
694     }
695     return src;
696 }
697 
698 // Decompose a short piece of text which is likely to contain characters that
699 // fail the quick check loop and/or where the quick check loop's overhead
700 // is unlikely to be amortized.
701 // Called by the compose() and makeFCD() implementations.
702 const UChar *
decomposeShort(const UChar * src,const UChar * limit,UBool stopAtCompBoundary,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const703 Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
704                                 UBool stopAtCompBoundary, UBool onlyContiguous,
705                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
706     if (U_FAILURE(errorCode)) {
707         return nullptr;
708     }
709     while(src<limit) {
710         if (stopAtCompBoundary && *src < minCompNoMaybeCP) {
711             return src;
712         }
713         const UChar *prevSrc = src;
714         UChar32 c;
715         uint16_t norm16;
716         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
717         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
718             return prevSrc;
719         }
720         if(!decompose(c, norm16, buffer, errorCode)) {
721             return nullptr;
722         }
723         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
724             return src;
725         }
726     }
727     return src;
728 }
729 
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const730 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
731                                  ReorderingBuffer &buffer,
732                                  UErrorCode &errorCode) const {
733     // get the decomposition and the lead and trail cc's
734     if (norm16 >= limitNoNo) {
735         if (isMaybeOrNonZeroCC(norm16)) {
736             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
737         }
738         // Maps to an isCompYesAndZeroCC.
739         c=mapAlgorithmic(c, norm16);
740         norm16=getNorm16(c);
741     }
742     if (norm16 < minYesNo) {
743         // c does not decompose
744         return buffer.append(c, 0, errorCode);
745     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
746         // Hangul syllable: decompose algorithmically
747         UChar jamos[3];
748         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
749     }
750     // c decomposes, get everything from the variable-length extra data
751     const uint16_t *mapping=getMapping(norm16);
752     uint16_t firstUnit=*mapping;
753     int32_t length=firstUnit&MAPPING_LENGTH_MASK;
754     uint8_t leadCC, trailCC;
755     trailCC=(uint8_t)(firstUnit>>8);
756     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
757         leadCC=(uint8_t)(*(mapping-1)>>8);
758     } else {
759         leadCC=0;
760     }
761     return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
762 }
763 
764 const uint8_t *
decomposeShort(const uint8_t * src,const uint8_t * limit,UBool stopAtCompBoundary,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const765 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
766                                 UBool stopAtCompBoundary, UBool onlyContiguous,
767                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
768     if (U_FAILURE(errorCode)) {
769         return nullptr;
770     }
771     while (src < limit) {
772         const uint8_t *prevSrc = src;
773         uint16_t norm16;
774         UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
775         // Get the decomposition and the lead and trail cc's.
776         UChar32 c = U_SENTINEL;
777         if (norm16 >= limitNoNo) {
778             if (isMaybeOrNonZeroCC(norm16)) {
779                 // No boundaries around this character.
780                 c = codePointFromValidUTF8(prevSrc, src);
781                 if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
782                     return nullptr;
783                 }
784                 continue;
785             }
786             // Maps to an isCompYesAndZeroCC.
787             if (stopAtCompBoundary) {
788                 return prevSrc;
789             }
790             c = codePointFromValidUTF8(prevSrc, src);
791             c = mapAlgorithmic(c, norm16);
792             norm16 = getNorm16(c);
793         } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
794             return prevSrc;
795         }
796         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
797         // We do not see invalid UTF-8 here because
798         // its norm16==INERT is normalization-inert,
799         // so it gets copied unchanged in the fast path,
800         // and we stop the slow path where invalid UTF-8 begins.
801         U_ASSERT(norm16 != INERT);
802         if (norm16 < minYesNo) {
803             if (c < 0) {
804                 c = codePointFromValidUTF8(prevSrc, src);
805             }
806             // does not decompose
807             if (!buffer.append(c, 0, errorCode)) {
808                 return nullptr;
809             }
810         } else if (isHangulLV(norm16) || isHangulLVT(norm16)) {
811             // Hangul syllable: decompose algorithmically
812             if (c < 0) {
813                 c = codePointFromValidUTF8(prevSrc, src);
814             }
815             char16_t jamos[3];
816             if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) {
817                 return nullptr;
818             }
819         } else {
820             // The character decomposes, get everything from the variable-length extra data.
821             const uint16_t *mapping = getMapping(norm16);
822             uint16_t firstUnit = *mapping;
823             int32_t length = firstUnit & MAPPING_LENGTH_MASK;
824             uint8_t trailCC = (uint8_t)(firstUnit >> 8);
825             uint8_t leadCC;
826             if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) {
827                 leadCC = (uint8_t)(*(mapping-1) >> 8);
828             } else {
829                 leadCC = 0;
830             }
831             if (!buffer.append((const char16_t *)mapping+1, length, leadCC, trailCC, errorCode)) {
832                 return nullptr;
833             }
834         }
835         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
836             return src;
837         }
838     }
839     return src;
840 }
841 
842 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const843 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
844     uint16_t norm16;
845     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
846         // c does not decompose
847         return nullptr;
848     }
849     const UChar *decomp = nullptr;
850     if(isDecompNoAlgorithmic(norm16)) {
851         // Maps to an isCompYesAndZeroCC.
852         c=mapAlgorithmic(c, norm16);
853         decomp=buffer;
854         length=0;
855         U16_APPEND_UNSAFE(buffer, length, c);
856         // The mapping might decompose further.
857         norm16 = getNorm16(c);
858     }
859     if (norm16 < minYesNo) {
860         return decomp;
861     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
862         // Hangul syllable: decompose algorithmically
863         length=Hangul::decompose(c, buffer);
864         return buffer;
865     }
866     // c decomposes, get everything from the variable-length extra data
867     const uint16_t *mapping=getMapping(norm16);
868     length=*mapping&MAPPING_LENGTH_MASK;
869     return (const UChar *)mapping+1;
870 }
871 
872 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
873 // so that a raw mapping fits that consists of one unit ("rm0")
874 // plus all but the first two code units of the normal mapping.
875 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
876 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const877 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
878     uint16_t norm16;
879     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
880         // c does not decompose
881         return NULL;
882     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
883         // Hangul syllable: decompose algorithmically
884         Hangul::getRawDecomposition(c, buffer);
885         length=2;
886         return buffer;
887     } else if(isDecompNoAlgorithmic(norm16)) {
888         c=mapAlgorithmic(c, norm16);
889         length=0;
890         U16_APPEND_UNSAFE(buffer, length, c);
891         return buffer;
892     }
893     // c decomposes, get everything from the variable-length extra data
894     const uint16_t *mapping=getMapping(norm16);
895     uint16_t firstUnit=*mapping;
896     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
897     if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
898         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
899         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
900         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
901         uint16_t rm0=*rawMapping;
902         if(rm0<=MAPPING_LENGTH_MASK) {
903             length=rm0;
904             return (const UChar *)rawMapping-rm0;
905         } else {
906             // Copy the normal mapping and replace its first two code units with rm0.
907             buffer[0]=(UChar)rm0;
908             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
909             length=mLength-1;
910             return buffer;
911         }
912     } else {
913         length=mLength;
914         return (const UChar *)mapping+1;
915     }
916 }
917 
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const918 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
919                                          UBool doDecompose,
920                                          UnicodeString &safeMiddle,
921                                          ReorderingBuffer &buffer,
922                                          UErrorCode &errorCode) const {
923     buffer.copyReorderableSuffixTo(safeMiddle);
924     if(doDecompose) {
925         decompose(src, limit, &buffer, errorCode);
926         return;
927     }
928     // Just merge the strings at the boundary.
929     ForwardUTrie2StringIterator iter(normTrie, src, limit);
930     uint8_t firstCC, prevCC, cc;
931     firstCC=prevCC=cc=getCC(iter.next16());
932     while(cc!=0) {
933         prevCC=cc;
934         cc=getCC(iter.next16());
935     };
936     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
937         limit=u_strchr(iter.codePointStart, 0);
938     }
939 
940     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
941         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
942     }
943 }
944 
hasDecompBoundaryBefore(UChar32 c) const945 UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const {
946     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
947         norm16HasDecompBoundaryBefore(getNorm16(c));
948 }
949 
norm16HasDecompBoundaryBefore(uint16_t norm16) const950 UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const {
951     if (norm16 < minNoNoCompNoMaybeCC) {
952         return TRUE;
953     }
954     if (norm16 >= limitNoNo) {
955         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
956     }
957     // c decomposes, get everything from the variable-length extra data
958     const uint16_t *mapping=getMapping(norm16);
959     uint16_t firstUnit=*mapping;
960     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
961     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
962 }
963 
hasDecompBoundaryAfter(UChar32 c) const964 UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const {
965     if (c < minDecompNoCP) {
966         return TRUE;
967     }
968     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
969         return TRUE;
970     }
971     return norm16HasDecompBoundaryAfter(getNorm16(c));
972 }
973 
norm16HasDecompBoundaryAfter(uint16_t norm16) const974 UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const {
975     if(norm16 <= minYesNo || isHangulLVT(norm16)) {
976         return TRUE;
977     }
978     if (norm16 >= limitNoNo) {
979         if (isMaybeOrNonZeroCC(norm16)) {
980             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
981         }
982         // Maps to an isCompYesAndZeroCC.
983         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
984     }
985     // c decomposes, get everything from the variable-length extra data
986     const uint16_t *mapping=getMapping(norm16);
987     uint16_t firstUnit=*mapping;
988     // decomp after-boundary: same as hasFCDBoundaryAfter(),
989     // fcd16<=1 || trailCC==0
990     if(firstUnit>0x1ff) {
991         return FALSE;  // trailCC>1
992     }
993     if(firstUnit<=0xff) {
994         return TRUE;  // trailCC==0
995     }
996     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
997     // TRUE if leadCC==0 (hasFCDBoundaryBefore())
998     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
999 }
1000 
1001 /*
1002  * Finds the recomposition result for
1003  * a forward-combining "lead" character,
1004  * specified with a pointer to its compositions list,
1005  * and a backward-combining "trail" character.
1006  *
1007  * If the lead and trail characters combine, then this function returns
1008  * the following "compositeAndFwd" value:
1009  * Bits 21..1  composite character
1010  * Bit      0  set if the composite is a forward-combining starter
1011  * otherwise it returns -1.
1012  *
1013  * The compositions list has (trail, compositeAndFwd) pair entries,
1014  * encoded as either pairs or triples of 16-bit units.
1015  * The last entry has the high bit of its first unit set.
1016  *
1017  * The list is sorted by ascending trail characters (there are no duplicates).
1018  * A linear search is used.
1019  *
1020  * See normalizer2impl.h for a more detailed description
1021  * of the compositions list format.
1022  */
combine(const uint16_t * list,UChar32 trail)1023 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
1024     uint16_t key1, firstUnit;
1025     if(trail<COMP_1_TRAIL_LIMIT) {
1026         // trail character is 0..33FF
1027         // result entry may have 2 or 3 units
1028         key1=(uint16_t)(trail<<1);
1029         while(key1>(firstUnit=*list)) {
1030             list+=2+(firstUnit&COMP_1_TRIPLE);
1031         }
1032         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1033             if(firstUnit&COMP_1_TRIPLE) {
1034                 return ((int32_t)list[1]<<16)|list[2];
1035             } else {
1036                 return list[1];
1037             }
1038         }
1039     } else {
1040         // trail character is 3400..10FFFF
1041         // result entry has 3 units
1042         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
1043                         (((trail>>COMP_1_TRAIL_SHIFT))&
1044                           ~COMP_1_TRIPLE));
1045         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
1046         uint16_t secondUnit;
1047         for(;;) {
1048             if(key1>(firstUnit=*list)) {
1049                 list+=2+(firstUnit&COMP_1_TRIPLE);
1050             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1051                 if(key2>(secondUnit=list[1])) {
1052                     if(firstUnit&COMP_1_LAST_TUPLE) {
1053                         break;
1054                     } else {
1055                         list+=3;
1056                     }
1057                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1058                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
1059                 } else {
1060                     break;
1061                 }
1062             } else {
1063                 break;
1064             }
1065         }
1066     }
1067     return -1;
1068 }
1069 
1070 /**
1071   * @param list some character's compositions list
1072   * @param set recursively receives the composites from these compositions
1073   */
addComposites(const uint16_t * list,UnicodeSet & set) const1074 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
1075     uint16_t firstUnit;
1076     int32_t compositeAndFwd;
1077     do {
1078         firstUnit=*list;
1079         if((firstUnit&COMP_1_TRIPLE)==0) {
1080             compositeAndFwd=list[1];
1081             list+=2;
1082         } else {
1083             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
1084             list+=3;
1085         }
1086         UChar32 composite=compositeAndFwd>>1;
1087         if((compositeAndFwd&1)!=0) {
1088             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1089         }
1090         set.add(composite);
1091     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1092 }
1093 
1094 /*
1095  * Recomposes the buffer text starting at recomposeStartIndex
1096  * (which is in NFD - decomposed and canonically ordered),
1097  * and truncates the buffer contents.
1098  *
1099  * Note that recomposition never lengthens the text:
1100  * Any character consists of either one or two code units;
1101  * a composition may contain at most one more code unit than the original starter,
1102  * while the combining mark that is removed has at least one code unit.
1103  */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const1104 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
1105                                 UBool onlyContiguous) const {
1106     UChar *p=buffer.getStart()+recomposeStartIndex;
1107     UChar *limit=buffer.getLimit();
1108     if(p==limit) {
1109         return;
1110     }
1111 
1112     UChar *starter, *pRemove, *q, *r;
1113     const uint16_t *compositionsList;
1114     UChar32 c, compositeAndFwd;
1115     uint16_t norm16;
1116     uint8_t cc, prevCC;
1117     UBool starterIsSupplementary;
1118 
1119     // Some of the following variables are not used until we have a forward-combining starter
1120     // and are only initialized now to avoid compiler warnings.
1121     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
1122     starter=NULL;
1123     starterIsSupplementary=FALSE;
1124     prevCC=0;
1125 
1126     for(;;) {
1127         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
1128         cc=getCCFromYesOrMaybe(norm16);
1129         if( // this character combines backward and
1130             isMaybe(norm16) &&
1131             // we have seen a starter that combines forward and
1132             compositionsList!=NULL &&
1133             // the backward-combining character is not blocked
1134             (prevCC<cc || prevCC==0)
1135         ) {
1136             if(isJamoVT(norm16)) {
1137                 // c is a Jamo V/T, see if we can compose it with the previous character.
1138                 if(c<Hangul::JAMO_T_BASE) {
1139                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1140                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
1141                     if(prev<Hangul::JAMO_L_COUNT) {
1142                         pRemove=p-1;
1143                         UChar syllable=(UChar)
1144                             (Hangul::HANGUL_BASE+
1145                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1146                              Hangul::JAMO_T_COUNT);
1147                         UChar t;
1148                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1149                             ++p;
1150                             syllable+=t;  // The next character was a Jamo T.
1151                         }
1152                         *starter=syllable;
1153                         // remove the Jamo V/T
1154                         q=pRemove;
1155                         r=p;
1156                         while(r<limit) {
1157                             *q++=*r++;
1158                         }
1159                         limit=q;
1160                         p=pRemove;
1161                     }
1162                 }
1163                 /*
1164                  * No "else" for Jamo T:
1165                  * Since the input is in NFD, there are no Hangul LV syllables that
1166                  * a Jamo T could combine with.
1167                  * All Jamo Ts are combined above when handling Jamo Vs.
1168                  */
1169                 if(p==limit) {
1170                     break;
1171                 }
1172                 compositionsList=NULL;
1173                 continue;
1174             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
1175                 // The starter and the combining mark (c) do combine.
1176                 UChar32 composite=compositeAndFwd>>1;
1177 
1178                 // Replace the starter with the composite, remove the combining mark.
1179                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
1180                 if(starterIsSupplementary) {
1181                     if(U_IS_SUPPLEMENTARY(composite)) {
1182                         // both are supplementary
1183                         starter[0]=U16_LEAD(composite);
1184                         starter[1]=U16_TRAIL(composite);
1185                     } else {
1186                         *starter=(UChar)composite;
1187                         // The composite is shorter than the starter,
1188                         // move the intermediate characters forward one.
1189                         starterIsSupplementary=FALSE;
1190                         q=starter+1;
1191                         r=q+1;
1192                         while(r<pRemove) {
1193                             *q++=*r++;
1194                         }
1195                         --pRemove;
1196                     }
1197                 } else if(U_IS_SUPPLEMENTARY(composite)) {
1198                     // The composite is longer than the starter,
1199                     // move the intermediate characters back one.
1200                     starterIsSupplementary=TRUE;
1201                     ++starter;  // temporarily increment for the loop boundary
1202                     q=pRemove;
1203                     r=++pRemove;
1204                     while(starter<q) {
1205                         *--r=*--q;
1206                     }
1207                     *starter=U16_TRAIL(composite);
1208                     *--starter=U16_LEAD(composite);  // undo the temporary increment
1209                 } else {
1210                     // both are on the BMP
1211                     *starter=(UChar)composite;
1212                 }
1213 
1214                 /* remove the combining mark by moving the following text over it */
1215                 if(pRemove<p) {
1216                     q=pRemove;
1217                     r=p;
1218                     while(r<limit) {
1219                         *q++=*r++;
1220                     }
1221                     limit=q;
1222                     p=pRemove;
1223                 }
1224                 // Keep prevCC because we removed the combining mark.
1225 
1226                 if(p==limit) {
1227                     break;
1228                 }
1229                 // Is the composite a starter that combines forward?
1230                 if(compositeAndFwd&1) {
1231                     compositionsList=
1232                         getCompositionsListForComposite(getNorm16(composite));
1233                 } else {
1234                     compositionsList=NULL;
1235                 }
1236 
1237                 // We combined; continue with looking for compositions.
1238                 continue;
1239             }
1240         }
1241 
1242         // no combination this time
1243         prevCC=cc;
1244         if(p==limit) {
1245             break;
1246         }
1247 
1248         // If c did not combine, then check if it is a starter.
1249         if(cc==0) {
1250             // Found a new starter.
1251             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1252                 // It may combine with something, prepare for it.
1253                 if(U_IS_BMP(c)) {
1254                     starterIsSupplementary=FALSE;
1255                     starter=p-1;
1256                 } else {
1257                     starterIsSupplementary=TRUE;
1258                     starter=p-2;
1259                 }
1260             }
1261         } else if(onlyContiguous) {
1262             // FCC: no discontiguous compositions; any intervening character blocks.
1263             compositionsList=NULL;
1264         }
1265     }
1266     buffer.setReorderingLimit(limit);
1267 }
1268 
1269 UChar32
composePair(UChar32 a,UChar32 b) const1270 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1271     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
1272     const uint16_t *list;
1273     if(isInert(norm16)) {
1274         return U_SENTINEL;
1275     } else if(norm16<minYesNoMappingsOnly) {
1276         // a combines forward.
1277         if(isJamoL(norm16)) {
1278             b-=Hangul::JAMO_V_BASE;
1279             if(0<=b && b<Hangul::JAMO_V_COUNT) {
1280                 return
1281                     (Hangul::HANGUL_BASE+
1282                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1283                      Hangul::JAMO_T_COUNT);
1284             } else {
1285                 return U_SENTINEL;
1286             }
1287         } else if(isHangulLV(norm16)) {
1288             b-=Hangul::JAMO_T_BASE;
1289             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1290                 return a+b;
1291             } else {
1292                 return U_SENTINEL;
1293             }
1294         } else {
1295             // 'a' has a compositions list in extraData
1296             list=getMapping(norm16);
1297             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1298                 list+=  // mapping pointer
1299                     1+  // +1 to skip the first unit with the mapping length
1300                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
1301             }
1302         }
1303     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1304         return U_SENTINEL;
1305     } else {
1306         list=getCompositionsListForMaybe(norm16);
1307     }
1308     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1309         return U_SENTINEL;
1310     }
1311 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1312     return combine(list, b)>>1;
1313 #else
1314     int32_t compositeAndFwd=combine(list, b);
1315     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1316 #endif
1317 }
1318 
1319 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1320 // doCompose: normalize
1321 // !doCompose: isNormalized (buffer must be empty and initialized)
1322 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const1323 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1324                          UBool onlyContiguous,
1325                          UBool doCompose,
1326                          ReorderingBuffer &buffer,
1327                          UErrorCode &errorCode) const {
1328     const UChar *prevBoundary=src;
1329     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1330     if(limit==NULL) {
1331         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1332                                            doCompose ? &buffer : NULL,
1333                                            errorCode);
1334         if(U_FAILURE(errorCode)) {
1335             return FALSE;
1336         }
1337         limit=u_strchr(src, 0);
1338         if (prevBoundary != src) {
1339             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1340                 prevBoundary = src;
1341             } else {
1342                 buffer.removeSuffix(1);
1343                 prevBoundary = --src;
1344             }
1345         }
1346     }
1347 
1348     for (;;) {
1349         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1350         // or with (compYes && ccc==0) properties.
1351         const UChar *prevSrc;
1352         UChar32 c = 0;
1353         uint16_t norm16 = 0;
1354         for (;;) {
1355             if (src == limit) {
1356                 if (prevBoundary != limit && doCompose) {
1357                     buffer.appendZeroCC(prevBoundary, limit, errorCode);
1358                 }
1359                 return TRUE;
1360             }
1361             if( (c=*src)<minNoMaybeCP ||
1362                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1363             ) {
1364                 ++src;
1365             } else {
1366                 prevSrc = src++;
1367                 if(!U16_IS_SURROGATE(c)) {
1368                     break;
1369                 } else {
1370                     UChar c2;
1371                     if(U16_IS_SURROGATE_LEAD(c)) {
1372                         if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1373                             ++src;
1374                             c=U16_GET_SUPPLEMENTARY(c, c2);
1375                         }
1376                     } else /* trail surrogate */ {
1377                         if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
1378                             --prevSrc;
1379                             c=U16_GET_SUPPLEMENTARY(c2, c);
1380                         }
1381                     }
1382                     if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1383                         break;
1384                     }
1385                 }
1386             }
1387         }
1388         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1389         // The current character is either a "noNo" (has a mapping)
1390         // or a "maybeYes" (combines backward)
1391         // or a "yesYes" with ccc!=0.
1392         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1393 
1394         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1395         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1396             if (!doCompose) {
1397                 return FALSE;
1398             }
1399             // Fast path for mapping a character that is immediately surrounded by boundaries.
1400             // In this case, we need not decompose around the current character.
1401             if (isDecompNoAlgorithmic(norm16)) {
1402                 // Maps to a single isCompYesAndZeroCC character
1403                 // which also implies hasCompBoundaryBefore.
1404                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1405                         hasCompBoundaryBefore(src, limit)) {
1406                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1407                         break;
1408                     }
1409                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) {
1410                         break;
1411                     }
1412                     prevBoundary = src;
1413                     continue;
1414                 }
1415             } else if (norm16 < minNoNoCompBoundaryBefore) {
1416                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1417                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1418                         hasCompBoundaryBefore(src, limit)) {
1419                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1420                         break;
1421                     }
1422                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));
1423                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1424                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) {
1425                         break;
1426                     }
1427                     prevBoundary = src;
1428                     continue;
1429                 }
1430             } else if (norm16 >= minNoNoEmpty) {
1431                 // The current character maps to nothing.
1432                 // Simply omit it from the output if there is a boundary before _or_ after it.
1433                 // The character itself implies no boundaries.
1434                 if (hasCompBoundaryBefore(src, limit) ||
1435                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1436                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1437                         break;
1438                     }
1439                     prevBoundary = src;
1440                     continue;
1441                 }
1442             }
1443             // Other "noNo" type, or need to examine more text around this character:
1444             // Fall through to the slow path.
1445         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
1446             UChar prev=*(prevSrc-1);
1447             if(c<Hangul::JAMO_T_BASE) {
1448                 // The current character is a Jamo Vowel,
1449                 // compose with previous Jamo L and following Jamo T.
1450                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);
1451                 if(l<Hangul::JAMO_L_COUNT) {
1452                     if (!doCompose) {
1453                         return FALSE;
1454                     }
1455                     int32_t t;
1456                     if (src != limit &&
1457                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&
1458                             t < Hangul::JAMO_T_COUNT) {
1459                         // The next character is a Jamo T.
1460                         ++src;
1461                     } else if (hasCompBoundaryBefore(src, limit)) {
1462                         // No Jamo T follows, not even via decomposition.
1463                         t = 0;
1464                     } else {
1465                         t = -1;
1466                     }
1467                     if (t >= 0) {
1468                         UChar32 syllable = Hangul::HANGUL_BASE +
1469                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *
1470                             Hangul::JAMO_T_COUNT + t;
1471                         --prevSrc;  // Replace the Jamo L as well.
1472                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1473                             break;
1474                         }
1475                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1476                             break;
1477                         }
1478                         prevBoundary = src;
1479                         continue;
1480                     }
1481                     // If we see L+V+x where x!=T then we drop to the slow path,
1482                     // decompose and recompose.
1483                     // This is to deal with NFKC finding normal L and V but a
1484                     // compatibility variant of a T.
1485                     // We need to either fully compose that combination here
1486                     // (which would complicate the code and may not work with strange custom data)
1487                     // or use the slow path.
1488                 }
1489             } else if (Hangul::isHangulLV(prev)) {
1490                 // The current character is a Jamo Trailing consonant,
1491                 // compose with previous Hangul LV that does not contain a Jamo T.
1492                 if (!doCompose) {
1493                     return FALSE;
1494                 }
1495                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;
1496                 --prevSrc;  // Replace the Hangul LV as well.
1497                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1498                     break;
1499                 }
1500                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) {
1501                     break;
1502                 }
1503                 prevBoundary = src;
1504                 continue;
1505             }
1506             // No matching context, or may need to decompose surrounding text first:
1507             // Fall through to the slow path.
1508         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1509             // One or more combining marks that do not combine-back:
1510             // Check for canonical order, copy unchanged if ok and
1511             // if followed by a character with a boundary-before.
1512             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1513             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1514                 // Fails FCD test, need to decompose and contiguously recompose.
1515                 if (!doCompose) {
1516                     return FALSE;
1517                 }
1518             } else {
1519                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1520                 // the previous character which passed the quick check "yes && ccc==0" test.
1521                 const UChar *nextSrc;
1522                 uint16_t n16;
1523                 for (;;) {
1524                     if (src == limit) {
1525                         if (doCompose) {
1526                             buffer.appendZeroCC(prevBoundary, limit, errorCode);
1527                         }
1528                         return TRUE;
1529                     }
1530                     uint8_t prevCC = cc;
1531                     nextSrc = src;
1532                     UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, n16);
1533                     if (n16 >= MIN_YES_YES_WITH_CC) {
1534                         cc = getCCFromNormalYesOrMaybe(n16);
1535                         if (prevCC > cc) {
1536                             if (!doCompose) {
1537                                 return FALSE;
1538                             }
1539                             break;
1540                         }
1541                     } else {
1542                         break;
1543                     }
1544                     src = nextSrc;
1545                 }
1546                 // src is after the last in-order combining mark.
1547                 // If there is a boundary here, then we continue with no change.
1548                 if (norm16HasCompBoundaryBefore(n16)) {
1549                     if (isCompYesAndZeroCC(n16)) {
1550                         src = nextSrc;
1551                     }
1552                     continue;
1553                 }
1554                 // Use the slow path. There is no boundary in [prevSrc, src[.
1555             }
1556         }
1557 
1558         // Slow path: Find the nearest boundaries around the current character,
1559         // decompose and recompose.
1560         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1561             const UChar *p = prevSrc;
1562             UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, norm16);
1563             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1564                 prevSrc = p;
1565             }
1566         }
1567         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) {
1568             break;
1569         }
1570         int32_t recomposeStartIndex=buffer.length();
1571         // We know there is not a boundary here.
1572         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1573                        buffer, errorCode);
1574         // Decompose until the next boundary.
1575         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1576                              buffer, errorCode);
1577         if (U_FAILURE(errorCode)) {
1578             break;
1579         }
1580         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1581             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1582             return TRUE;
1583         }
1584         recompose(buffer, recomposeStartIndex, onlyContiguous);
1585         if(!doCompose) {
1586             if(!buffer.equals(prevSrc, src)) {
1587                 return FALSE;
1588             }
1589             buffer.remove();
1590         }
1591         prevBoundary=src;
1592     }
1593     return TRUE;
1594 }
1595 
1596 // Very similar to compose(): Make the same changes in both places if relevant.
1597 // pQCResult==NULL: spanQuickCheckYes
1598 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1599 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1600 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1601                                    UBool onlyContiguous,
1602                                    UNormalizationCheckResult *pQCResult) const {
1603     const UChar *prevBoundary=src;
1604     UChar32 minNoMaybeCP=minCompNoMaybeCP;
1605     if(limit==NULL) {
1606         UErrorCode errorCode=U_ZERO_ERROR;
1607         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1608         limit=u_strchr(src, 0);
1609         if (prevBoundary != src) {
1610             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) {
1611                 prevBoundary = src;
1612             } else {
1613                 prevBoundary = --src;
1614             }
1615         }
1616     }
1617 
1618     for(;;) {
1619         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1620         // or with (compYes && ccc==0) properties.
1621         const UChar *prevSrc;
1622         UChar32 c = 0;
1623         uint16_t norm16 = 0;
1624         for (;;) {
1625             if(src==limit) {
1626                 return src;
1627             }
1628             if( (c=*src)<minNoMaybeCP ||
1629                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1630             ) {
1631                 ++src;
1632             } else {
1633                 prevSrc = src++;
1634                 if(!U16_IS_SURROGATE(c)) {
1635                     break;
1636                 } else {
1637                     UChar c2;
1638                     if(U16_IS_SURROGATE_LEAD(c)) {
1639                         if(src!=limit && U16_IS_TRAIL(c2=*src)) {
1640                             ++src;
1641                             c=U16_GET_SUPPLEMENTARY(c, c2);
1642                         }
1643                     } else /* trail surrogate */ {
1644                         if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
1645                             --prevSrc;
1646                             c=U16_GET_SUPPLEMENTARY(c2, c);
1647                         }
1648                     }
1649                     if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
1650                         break;
1651                     }
1652                 }
1653             }
1654         }
1655         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1656         // The current character is either a "noNo" (has a mapping)
1657         // or a "maybeYes" (combines backward)
1658         // or a "yesYes" with ccc!=0.
1659         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1660 
1661         uint16_t prevNorm16 = INERT;
1662         if (prevBoundary != prevSrc) {
1663             if (norm16HasCompBoundaryBefore(norm16)) {
1664                 prevBoundary = prevSrc;
1665             } else {
1666                 const UChar *p = prevSrc;
1667                 uint16_t n16;
1668                 UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, n16);
1669                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
1670                     prevBoundary = prevSrc;
1671                 } else {
1672                     prevBoundary = p;
1673                     prevNorm16 = n16;
1674                 }
1675             }
1676         }
1677 
1678         if(isMaybeOrNonZeroCC(norm16)) {
1679             uint8_t cc=getCCFromYesOrMaybe(norm16);
1680             if (onlyContiguous /* FCC */ && cc != 0 &&
1681                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
1682                 // The [prevBoundary..prevSrc[ character
1683                 // passed the quick check "yes && ccc==0" test
1684                 // but is out of canonical order with the current combining mark.
1685             } else {
1686                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1687                 // the previous character which passed the quick check "yes && ccc==0" test.
1688                 const UChar *nextSrc;
1689                 for (;;) {
1690                     if (norm16 < MIN_YES_YES_WITH_CC) {
1691                         if (pQCResult != nullptr) {
1692                             *pQCResult = UNORM_MAYBE;
1693                         } else {
1694                             return prevBoundary;
1695                         }
1696                     }
1697                     if (src == limit) {
1698                         return src;
1699                     }
1700                     uint8_t prevCC = cc;
1701                     nextSrc = src;
1702                     UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, norm16);
1703                     if (isMaybeOrNonZeroCC(norm16)) {
1704                         cc = getCCFromYesOrMaybe(norm16);
1705                         if (!(prevCC <= cc || cc == 0)) {
1706                             break;
1707                         }
1708                     } else {
1709                         break;
1710                     }
1711                     src = nextSrc;
1712                 }
1713                 // src is after the last in-order combining mark.
1714                 if (isCompYesAndZeroCC(norm16)) {
1715                     prevBoundary = src;
1716                     src = nextSrc;
1717                     continue;
1718                 }
1719             }
1720         }
1721         if(pQCResult!=NULL) {
1722             *pQCResult=UNORM_NO;
1723         }
1724         return prevBoundary;
1725     }
1726 }
1727 
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1728 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1729                                        UBool doCompose,
1730                                        UBool onlyContiguous,
1731                                        UnicodeString &safeMiddle,
1732                                        ReorderingBuffer &buffer,
1733                                        UErrorCode &errorCode) const {
1734     if(!buffer.isEmpty()) {
1735         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);
1736         if(src!=firstStarterInSrc) {
1737             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1738                                                                     buffer.getLimit(), onlyContiguous);
1739             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1740             UnicodeString middle(lastStarterInDest, destSuffixLength);
1741             buffer.removeSuffix(destSuffixLength);
1742             safeMiddle=middle;
1743             middle.append(src, (int32_t)(firstStarterInSrc-src));
1744             const UChar *middleStart=middle.getBuffer();
1745             compose(middleStart, middleStart+middle.length(), onlyContiguous,
1746                     TRUE, buffer, errorCode);
1747             if(U_FAILURE(errorCode)) {
1748                 return;
1749             }
1750             src=firstStarterInSrc;
1751         }
1752     }
1753     if(doCompose) {
1754         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1755     } else {
1756         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1757             limit=u_strchr(src, 0);
1758         }
1759         buffer.appendZeroCC(src, limit, errorCode);
1760     }
1761 }
1762 
1763 UBool
composeUTF8(uint32_t options,UBool onlyContiguous,const uint8_t * src,const uint8_t * limit,ByteSink * sink,Edits * edits,UErrorCode & errorCode) const1764 Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
1765                              const uint8_t *src, const uint8_t *limit,
1766                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
1767     U_ASSERT(limit != nullptr);
1768     UnicodeString s16;
1769     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);
1770     const uint8_t *prevBoundary = src;
1771 
1772     for (;;) {
1773         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1774         // or with (compYes && ccc==0) properties.
1775         const uint8_t *prevSrc;
1776         uint16_t norm16 = 0;
1777         for (;;) {
1778             if (src == limit) {
1779                 if (prevBoundary != limit && sink != nullptr) {
1780                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1781                                                   *sink, options, edits, errorCode);
1782                 }
1783                 return TRUE;
1784             }
1785             if (*src < minNoMaybeLead) {
1786                 ++src;
1787             } else {
1788                 prevSrc = src;
1789                 UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
1790                 if (!isCompYesAndZeroCC(norm16)) {
1791                     break;
1792                 }
1793             }
1794         }
1795         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1796         // The current character is either a "noNo" (has a mapping)
1797         // or a "maybeYes" (combines backward)
1798         // or a "yesYes" with ccc!=0.
1799         // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1800 
1801         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1802         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
1803             if (sink == nullptr) {
1804                 return FALSE;
1805             }
1806             // Fast path for mapping a character that is immediately surrounded by boundaries.
1807             // In this case, we need not decompose around the current character.
1808             if (isDecompNoAlgorithmic(norm16)) {
1809                 // Maps to a single isCompYesAndZeroCC character
1810                 // which also implies hasCompBoundaryBefore.
1811                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1812                         hasCompBoundaryBefore(src, limit)) {
1813                     if (prevBoundary != prevSrc &&
1814                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1815                                                            *sink, options, edits, errorCode)) {
1816                         break;
1817                     }
1818                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);
1819                     prevBoundary = src;
1820                     continue;
1821                 }
1822             } else if (norm16 < minNoNoCompBoundaryBefore) {
1823                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1824                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
1825                         hasCompBoundaryBefore(src, limit)) {
1826                     if (prevBoundary != prevSrc &&
1827                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1828                                                            *sink, options, edits, errorCode)) {
1829                         break;
1830                     }
1831                     const uint16_t *mapping = getMapping(norm16);
1832                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;
1833                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,
1834                                                     *sink, edits, errorCode)) {
1835                         break;
1836                     }
1837                     prevBoundary = src;
1838                     continue;
1839                 }
1840             } else if (norm16 >= minNoNoEmpty) {
1841                 // The current character maps to nothing.
1842                 // Simply omit it from the output if there is a boundary before _or_ after it.
1843                 // The character itself implies no boundaries.
1844                 if (hasCompBoundaryBefore(src, limit) ||
1845                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) {
1846                     if (prevBoundary != prevSrc &&
1847                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1848                                                            *sink, options, edits, errorCode)) {
1849                         break;
1850                     }
1851                     if (edits != nullptr) {
1852                         edits->addReplace((int32_t)(src - prevSrc), 0);
1853                     }
1854                     prevBoundary = src;
1855                     continue;
1856                 }
1857             }
1858             // Other "noNo" type, or need to examine more text around this character:
1859             // Fall through to the slow path.
1860         } else if (isJamoVT(norm16)) {
1861             // Jamo L: E1 84 80..92
1862             // Jamo V: E1 85 A1..B5
1863             // Jamo T: E1 86 A8..E1 87 82
1864             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);
1865             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);
1866             if (prevSrc[1] == 0x85) {
1867                 // The current character is a Jamo Vowel,
1868                 // compose with previous Jamo L and following Jamo T.
1869                 UChar32 l = prev - Hangul::JAMO_L_BASE;
1870                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) {
1871                     if (sink == nullptr) {
1872                         return FALSE;
1873                     }
1874                     int32_t t = getJamoTMinusBase(src, limit);
1875                     if (t >= 0) {
1876                         // The next character is a Jamo T.
1877                         src += 3;
1878                     } else if (hasCompBoundaryBefore(src, limit)) {
1879                         // No Jamo T follows, not even via decomposition.
1880                         t = 0;
1881                     }
1882                     if (t >= 0) {
1883                         UChar32 syllable = Hangul::HANGUL_BASE +
1884                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *
1885                             Hangul::JAMO_T_COUNT + t;
1886                         prevSrc -= 3;  // Replace the Jamo L as well.
1887                         if (prevBoundary != prevSrc &&
1888                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1889                                                                *sink, options, edits, errorCode)) {
1890                             break;
1891                         }
1892                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1893                         prevBoundary = src;
1894                         continue;
1895                     }
1896                     // If we see L+V+x where x!=T then we drop to the slow path,
1897                     // decompose and recompose.
1898                     // This is to deal with NFKC finding normal L and V but a
1899                     // compatibility variant of a T.
1900                     // We need to either fully compose that combination here
1901                     // (which would complicate the code and may not work with strange custom data)
1902                     // or use the slow path.
1903                 }
1904             } else if (Hangul::isHangulLV(prev)) {
1905                 // The current character is a Jamo Trailing consonant,
1906                 // compose with previous Hangul LV that does not contain a Jamo T.
1907                 if (sink == nullptr) {
1908                     return FALSE;
1909                 }
1910                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);
1911                 prevSrc -= 3;  // Replace the Hangul LV as well.
1912                 if (prevBoundary != prevSrc &&
1913                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
1914                                                        *sink, options, edits, errorCode)) {
1915                     break;
1916                 }
1917                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);
1918                 prevBoundary = src;
1919                 continue;
1920             }
1921             // No matching context, or may need to decompose surrounding text first:
1922             // Fall through to the slow path.
1923         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
1924             // One or more combining marks that do not combine-back:
1925             // Check for canonical order, copy unchanged if ok and
1926             // if followed by a character with a boundary-before.
1927             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
1928             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) {
1929                 // Fails FCD test, need to decompose and contiguously recompose.
1930                 if (sink == nullptr) {
1931                     return FALSE;
1932                 }
1933             } else {
1934                 // If !onlyContiguous (not FCC), then we ignore the tccc of
1935                 // the previous character which passed the quick check "yes && ccc==0" test.
1936                 const uint8_t *nextSrc;
1937                 uint16_t n16;
1938                 for (;;) {
1939                     if (src == limit) {
1940                         if (sink != nullptr) {
1941                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,
1942                                                           *sink, options, edits, errorCode);
1943                         }
1944                         return TRUE;
1945                     }
1946                     uint8_t prevCC = cc;
1947                     nextSrc = src;
1948                     UTRIE2_U8_NEXT16(normTrie, nextSrc, limit, n16);
1949                     if (n16 >= MIN_YES_YES_WITH_CC) {
1950                         cc = getCCFromNormalYesOrMaybe(n16);
1951                         if (prevCC > cc) {
1952                             if (sink == nullptr) {
1953                                 return FALSE;
1954                             }
1955                             break;
1956                         }
1957                     } else {
1958                         break;
1959                     }
1960                     src = nextSrc;
1961                 }
1962                 // src is after the last in-order combining mark.
1963                 // If there is a boundary here, then we continue with no change.
1964                 if (norm16HasCompBoundaryBefore(n16)) {
1965                     if (isCompYesAndZeroCC(n16)) {
1966                         src = nextSrc;
1967                     }
1968                     continue;
1969                 }
1970                 // Use the slow path. There is no boundary in [prevSrc, src[.
1971             }
1972         }
1973 
1974         // Slow path: Find the nearest boundaries around the current character,
1975         // decompose and recompose.
1976         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
1977             const uint8_t *p = prevSrc;
1978             UTRIE2_U8_PREV16(normTrie, prevBoundary, p, norm16);
1979             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1980                 prevSrc = p;
1981             }
1982         }
1983         ReorderingBuffer buffer(*this, s16, errorCode);
1984         if (U_FAILURE(errorCode)) {
1985             break;
1986         }
1987         // We know there is not a boundary here.
1988         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
1989                        buffer, errorCode);
1990         // Decompose until the next boundary.
1991         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
1992                              buffer, errorCode);
1993         if (U_FAILURE(errorCode)) {
1994             break;
1995         }
1996         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
1997             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1998             return TRUE;
1999         }
2000         recompose(buffer, 0, onlyContiguous);
2001         if (!buffer.equals(prevSrc, src)) {
2002             if (sink == nullptr) {
2003                 return FALSE;
2004             }
2005             if (prevBoundary != prevSrc &&
2006                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
2007                                                    *sink, options, edits, errorCode)) {
2008                 break;
2009             }
2010             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),
2011                                             *sink, edits, errorCode)) {
2012                 break;
2013             }
2014             prevBoundary = src;
2015         }
2016     }
2017     return TRUE;
2018 }
2019 
hasCompBoundaryBefore(const UChar * src,const UChar * limit) const2020 UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const {
2021     if (src == limit || *src < minCompNoMaybeCP) {
2022         return TRUE;
2023     }
2024     UChar32 c;
2025     uint16_t norm16;
2026     UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
2027     return norm16HasCompBoundaryBefore(norm16);
2028 }
2029 
hasCompBoundaryBefore(const uint8_t * src,const uint8_t * limit) const2030 UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const {
2031     if (src == limit) {
2032         return TRUE;
2033     }
2034     uint16_t norm16;
2035     UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
2036     return norm16HasCompBoundaryBefore(norm16);
2037 }
2038 
hasCompBoundaryAfter(const UChar * start,const UChar * p,UBool onlyContiguous) const2039 UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
2040                                             UBool onlyContiguous) const {
2041     if (start == p) {
2042         return TRUE;
2043     }
2044     UChar32 c;
2045     uint16_t norm16;
2046     UTRIE2_U16_PREV16(normTrie, start, p, c, norm16);
2047     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2048 }
2049 
hasCompBoundaryAfter(const uint8_t * start,const uint8_t * p,UBool onlyContiguous) const2050 UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
2051                                             UBool onlyContiguous) const {
2052     if (start == p) {
2053         return TRUE;
2054     }
2055     uint16_t norm16;
2056     UTRIE2_U8_PREV16(normTrie, start, p, norm16);
2057     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
2058 }
2059 
// Iterates backward from p toward start and returns the nearest composition boundary.
// For each code point, the boundary-after test is tried first
// (returning the position after the code point),
// then the boundary-before test (returning the position before it).
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
                                                       UBool onlyContiguous) const {
    BackwardUTrie2StringIterator iter(normTrie, start, p);
    for(;;) {
        uint16_t norm16=iter.previous16();
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return iter.codePointLimit;
        }
        if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
            return iter.codePointStart;
        }
    }
}
2073 
// Iterates forward from p toward limit and returns the nearest composition boundary.
// For each code point, the boundary-before test is tried first
// (returning the position before the code point),
// then the boundary-after test (returning the position after it).
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
                                                   UBool onlyContiguous) const {
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
    for(;;) {
        uint16_t norm16=iter.next16();
        if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
            return iter.codePointStart;
        }
        if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
            return iter.codePointLimit;
        }
    }
}
2087 
getPreviousTrailCC(const UChar * start,const UChar * p) const2088 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
2089     if (start == p) {
2090         return 0;
2091     }
2092     int32_t i = (int32_t)(p - start);
2093     UChar32 c;
2094     U16_PREV(start, 0, i, c);
2095     return (uint8_t)getFCD16(c);
2096 }
2097 
// UTF-8: Returns the low byte of the FCD value of the character that ends at p,
// or 0 if there is no text between start and p.
uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const {
    if (start == p) {
        return 0;
    }
    // U8_PREV works with an index relative to start.
    int32_t i = (int32_t)(p - start);
    UChar32 c;
    U8_PREV(start, 0, i, c);
    return (uint8_t)getFCD16(c);
}
2107 
2108 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2109 // still had getFCDTrie() which built and cached an FCD trie.
2110 // That provided faster access to FCD data than getFCD16FromNormData()
2111 // but required synchronization and consumed some 10kB of heap memory
2112 // in any process that uses FCD (e.g., via collation).
2113 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2114 // at least for ASCII & CJK.
2115 
2116 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const2117 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
2118     uint16_t norm16=getNorm16(c);
2119     if (norm16 >= limitNoNo) {
2120         if(norm16>=MIN_NORMAL_MAYBE_YES) {
2121             // combining mark
2122             norm16=getCCFromNormalYesOrMaybe(norm16);
2123             return norm16|(norm16<<8);
2124         } else if(norm16>=minMaybeYes) {
2125             return 0;
2126         } else {  // isDecompNoAlgorithmic(norm16)
2127             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;
2128             if (deltaTrailCC <= DELTA_TCCC_1) {
2129                 return deltaTrailCC >> OFFSET_SHIFT;
2130             }
2131             // Maps to an isCompYesAndZeroCC.
2132             c=mapAlgorithmic(c, norm16);
2133             norm16=getNorm16(c);
2134         }
2135     }
2136     if(norm16<=minYesNo || isHangulLVT(norm16)) {
2137         // no decomposition or Hangul syllable, all zeros
2138         return 0;
2139     }
2140     // c decomposes, get everything from the variable-length extra data
2141     const uint16_t *mapping=getMapping(norm16);
2142     uint16_t firstUnit=*mapping;
2143     norm16=firstUnit>>8;  // tccc
2144     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
2145         norm16|=*(mapping-1)&0xff00;  // lccc
2146     }
2147     return norm16;
2148 }
2149 
2150 // Dual functionality:
2151 // buffer!=NULL: normalize
2152 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Parameters:
//   src, limit  The UTF-16 input [src, limit[. If limit==NULL, the input is
//               NUL-terminated and the limit is determined with u_strchr().
//   buffer      Destination for the FCD form, or NULL for check-only mode.
//   errorCode   Set by the buffer operations and decomposeShort(); a failure
//               ends the processing loop.
// Returns:
//   In check-only mode, prevBoundary (the last FCD-safe boundary) as soon as
//   a character's lccc is lower than the previous character's tccc.
//   Otherwise src: the limit when all input was processed, or the position
//   where processing stopped after a failure.
2153 const UChar *
makeFCD(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const2154 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
2155                          ReorderingBuffer *buffer,
2156                          UErrorCode &errorCode) const {
2157     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2158     // Similar to the prevBoundary in the compose() implementation.
2159     const UChar *prevBoundary=src;
    // FCD value of the previous character; a negative value is the bitwise complement
    // of a code unit whose FCD lookup has been deferred (see the inner loop below).
2160     int32_t prevFCD16=0;
2161     if(limit==NULL) {
2162         src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);
2163         if(U_FAILURE(errorCode)) {
2164             return src;
2165         }
2166         if(prevBoundary<src) {
2167             prevBoundary=src;
2168             // We know that the previous character's lccc==0.
2169             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2170             prevFCD16=getFCD16(*(src-1));
2171             if(prevFCD16>1) {
2172                 --prevBoundary;
2173             }
2174         }
2175         limit=u_strchr(src, 0);
2176     }
2177 
2178     // Note: In this function we use buffer->appendZeroCC() because we track
2179     // the lead and trail combining classes here, rather than leaving it to
2180     // the ReorderingBuffer.
2181     // The exception is the call to decomposeShort() which uses the buffer
2182     // in the normal way.
2183 
2184     const UChar *prevSrc;
2185     UChar32 c=0;
2186     uint16_t fcd16=0;
2187 
2188     for(;;) {
2189         // count code units with lccc==0
2190         for(prevSrc=src; src!=limit;) {
2191             if((c=*src)<minLcccCP) {
                // Defer the FCD lookup: remember ~c (a negative value) and resolve it
                // after this loop only if the following character needs it.
2192                 prevFCD16=~c;
2193                 ++src;
2194             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
2195                 prevFCD16=0;
2196                 ++src;
2197             } else {
2198                 if(U16_IS_SURROGATE(c)) {
2199                     UChar c2;
2200                     if(U16_IS_SURROGATE_LEAD(c)) {
2201                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
2202                             c=U16_GET_SUPPLEMENTARY(c, c2);
2203                         }
2204                     } else /* trail surrogate */ {
2205                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            // Step back so that src points to the start of the
                            // supplementary code point.
2206                             --src;
2207                             c=U16_GET_SUPPLEMENTARY(c2, c);
2208                         }
2209                     }
2210                 }
                // fcd16<=0xff: the upper byte (lccc) is 0, so keep scanning.
2211                 if((fcd16=getFCD16FromNormData(c))<=0xff) {
2212                     prevFCD16=fcd16;
2213                     src+=U16_LENGTH(c);
2214                 } else {
2215                     break;
2216                 }
2217             }
2218         }
2219         // copy these code units all at once
2220         if(src!=prevSrc) {
2221             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
2222                 break;
2223             }
2224             if(src==limit) {
2225                 break;
2226             }
2227             prevBoundary=src;
2228             // We know that the previous character's lccc==0.
2229             if(prevFCD16<0) {
2230                 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2231                 UChar32 prev=~prevFCD16;
2232                 if(prev<minDecompNoCP) {
2233                     prevFCD16=0;
2234                 } else {
2235                     prevFCD16=getFCD16FromNormData(prev);
2236                     if(prevFCD16>1) {
2237                         --prevBoundary;
2238                     }
2239                 }
2240             } else {
2241                 const UChar *p=src-1;
2242                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
2243                     --p;
2244                     // Need to fetch the previous character's FCD value because
2245                     // prevFCD16 was just for the trail surrogate code point.
2246                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
2247                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2248                 }
2249                 if(prevFCD16>1) {
2250                     prevBoundary=p;
2251                 }
2252             }
2253             // The start of the current character (c).
2254             prevSrc=src;
2255         } else if(src==limit) {
2256             break;
2257         }
2258 
2259         src+=U16_LENGTH(c);
2260         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2261         // Check for proper order, and decompose locally if necessary.
2262         if((prevFCD16&0xff)<=(fcd16>>8)) {
2263             // proper order: prev tccc <= current lccc
2264             if((fcd16&0xff)<=1) {
2265                 prevBoundary=src;
2266             }
2267             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
2268                 break;
2269             }
2270             prevFCD16=fcd16;
2271             continue;
2272         } else if(buffer==NULL) {
2273             return prevBoundary;  // quick check "no"
2274         } else {
2275             /*
2276              * Back out the part of the source that we copied or appended
2277              * already but is now going to be decomposed.
2278              * prevSrc is set to after what was copied/appended.
2279              */
2280             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
2281             /*
2282              * Find the part of the source that needs to be decomposed,
2283              * up to the next safe boundary.
2284              */
2285             src=findNextFCDBoundary(src, limit);
2286             /*
2287              * The source text does not fulfill the conditions for FCD.
2288              * Decompose and reorder a limited piece of the text.
2289              */
2290             decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);
2291             if (U_FAILURE(errorCode)) {
2292                 break;
2293             }
            // Restart the tracking state after the locally decomposed piece.
2294             prevBoundary=src;
2295             prevFCD16=0;
2296         }
2297     }
2298     return src;
2299 }
2300 
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const2301 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
2302                                        UBool doMakeFCD,
2303                                        UnicodeString &safeMiddle,
2304                                        ReorderingBuffer &buffer,
2305                                        UErrorCode &errorCode) const {
2306     if(!buffer.isEmpty()) {
2307         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
2308         if(src!=firstBoundaryInSrc) {
2309             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
2310                                                                     buffer.getLimit());
2311             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
2312             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
2313             buffer.removeSuffix(destSuffixLength);
2314             safeMiddle=middle;
2315             middle.append(src, (int32_t)(firstBoundaryInSrc-src));
2316             const UChar *middleStart=middle.getBuffer();
2317             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
2318             if(U_FAILURE(errorCode)) {
2319                 return;
2320             }
2321             src=firstBoundaryInSrc;
2322         }
2323     }
2324     if(doMakeFCD) {
2325         makeFCD(src, limit, &buffer, errorCode);
2326     } else {
2327         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
2328             limit=u_strchr(src, 0);
2329         }
2330         buffer.appendZeroCC(src, limit, errorCode);
2331     }
2332 }
2333 
// Walks backward from p, one code point at a time and not below start,
// and returns the first position found that is a boundary:
// after a code point that is below minDecompNoCP or has a decomposition
// boundary after it, or before a code point that has a decomposition boundary
// before it. Returns the position reached (start) if there is none.
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    for(const UChar *afterCP=p; start<p; afterCP=p) {
        UChar32 cp;
        uint16_t n16;
        // Moves p back to the start of the preceding code point.
        UTRIE2_U16_PREV16(normTrie, start, p, cp, n16);
        if(cp<minDecompNoCP || norm16HasDecompBoundaryAfter(n16)) {
            return afterCP;
        }
        if(norm16HasDecompBoundaryBefore(n16)) {
            break;
        }
    }
    return p;
}
2349 
// Walks forward from p, one code point at a time and not beyond limit,
// and returns the first position found that is a boundary:
// before a code point that is below minLcccCP or has a decomposition boundary
// before it, or after a code point that has a decomposition boundary after it.
// Returns the position reached (limit) if there is none.
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    for(const UChar *cpStart=p; p<limit; cpStart=p) {
        UChar32 cp;
        uint16_t n16;
        // Moves p forward to the end of the current code point.
        UTRIE2_U16_NEXT16(normTrie, p, limit, cp, n16);
        if(cp<minLcccCP || norm16HasDecompBoundaryBefore(n16)) {
            return cpStart;
        }
        if(norm16HasDecompBoundaryAfter(n16)) {
            break;
        }
    }
    return p;
}
2365 
2366 // CanonicalIterator data -------------------------------------------------- ***
2367 
CanonIterData(UErrorCode & errorCode)2368 CanonIterData::CanonIterData(UErrorCode &errorCode) :
2369         trie(utrie2_open(0, 0, &errorCode)),
2370         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
2371 
~CanonIterData()2372 CanonIterData::~CanonIterData() {
2373     utrie2_close(trie);
2374 }
2375 
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)2376 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
2377     uint32_t canonValue=utrie2_get32(trie, decompLead);
2378     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2379         // origin is the first character whose decomposition starts with
2380         // the character for which we are setting the value.
2381         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
2382     } else {
2383         // origin is not the first character, or it is U+0000.
2384         UnicodeSet *set;
2385         if((canonValue&CANON_HAS_SET)==0) {
2386             set=new UnicodeSet;
2387             if(set==NULL) {
2388                 errorCode=U_MEMORY_ALLOCATION_ERROR;
2389                 return;
2390             }
2391             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
2392             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
2393             utrie2_set32(trie, decompLead, canonValue, &errorCode);
2394             canonStartSets.addElement(set, errorCode);
2395             if(firstOrigin!=0) {
2396                 set->add(firstOrigin);
2397             }
2398         } else {
2399             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
2400         }
2401         set->add(origin);
2402     }
2403 }
2404 
2405 // C++ class for friend access to private Normalizer2Impl members.
2406 class InitCanonIterData {
2407 public:
2408     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
2409     static void handleRange(Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode);
2410 };
2411 
2412 U_CDECL_BEGIN
2413 
2414 // UInitOnce instantiation function for CanonIterData
2415 static void U_CALLCONV
initCanonIterData(Normalizer2Impl * impl,UErrorCode & errorCode)2416 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
2417     InitCanonIterData::doInit(impl, errorCode);
2418 }
2419 
2420 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2421 //     context: the Normalizer2Impl
2422 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)2423 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
2424     UErrorCode errorCode = U_ZERO_ERROR;
2425     if (value != Normalizer2Impl::INERT) {
2426         Normalizer2Impl *impl = (Normalizer2Impl *)context;
2427         InitCanonIterData::handleRange(impl, start, end, (uint16_t)value, errorCode);
2428     }
2429     return U_SUCCESS(errorCode);
2430 }
2431 
2432 U_CDECL_END
2433 
doInit(Normalizer2Impl * impl,UErrorCode & errorCode)2434 void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
2435     U_ASSERT(impl->fCanonIterData == NULL);
2436     impl->fCanonIterData = new CanonIterData(errorCode);
2437     if (impl->fCanonIterData == NULL) {
2438         errorCode=U_MEMORY_ALLOCATION_ERROR;
2439     }
2440     if (U_SUCCESS(errorCode)) {
2441         utrie2_enum(impl->normTrie, NULL, enumCIDRangeHandler, impl);
2442         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
2443     }
2444     if (U_FAILURE(errorCode)) {
2445         delete impl->fCanonIterData;
2446         impl->fCanonIterData = NULL;
2447     }
2448 }
2449 
handleRange(Normalizer2Impl * impl,UChar32 start,UChar32 end,uint16_t value,UErrorCode & errorCode)2450 void InitCanonIterData::handleRange(
2451         Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode) {
2452     impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
2453 }
2454 
// Adds the CanonicalIterator data for the code points start..end (inclusive),
// which all share the same norm16 value, to newData.
// For each code point, flag bits (CANON_NOT_SEGMENT_STARTER, CANON_HAS_COMPOSITIONS)
// are ORed into its value in newData.trie; a code point with a one-way
// decomposition is also added to the start set of the first code point of
// its mapping via newData.addToStartSet().
// Nothing is written for inert values and for values in [minYesNo, minNoNo[.
// errorCode is passed to utrie2_set32() and addToStartSet(); it is not checked
// inside this function.
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,const uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const2455 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
2456                                                   CanonIterData &newData,
2457                                                   UErrorCode &errorCode) const {
2458     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
2459         // Inert, or 2-way mapping (including Hangul syllable).
2460         // We do not write a canonStartSet for any yesNo character.
2461         // Composites from 2-way mappings are added at runtime from the
2462         // starter's compositions list, and the other characters in
2463         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2464         // "maybe" characters.
2465         return;
2466     }
2467     for(UChar32 c=start; c<=end; ++c) {
        // Accumulate flag changes in newValue; the trie is written once,
        // at the end of the iteration, and only if something changed.
2468         uint32_t oldValue=utrie2_get32(newData.trie, c);
2469         uint32_t newValue=oldValue;
2470         if(isMaybeOrNonZeroCC(norm16)) {
2471             // not a segment starter if it occurs in a decomposition or has cc!=0
2472             newValue|=CANON_NOT_SEGMENT_STARTER;
2473             if(norm16<MIN_NORMAL_MAYBE_YES) {
2474                 newValue|=CANON_HAS_COMPOSITIONS;
2475             }
2476         } else if(norm16<minYesNo) {
2477             newValue|=CANON_HAS_COMPOSITIONS;
2478         } else {
2479             // c has a one-way decomposition
2480             UChar32 c2=c;
2481             // Do not modify the whole-range norm16 value.
2482             uint16_t norm16_2=norm16;
2483             if (isDecompNoAlgorithmic(norm16_2)) {
2484                 // Maps to an isCompYesAndZeroCC.
2485                 c2 = mapAlgorithmic(c2, norm16_2);
2486                 norm16_2 = getNorm16(c2);
2487                 // No compatibility mappings for the CanonicalIterator.
2488                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
2489             }
2490             if (norm16_2 > minYesNo) {
2491                 // c decomposes, get everything from the variable-length extra data
2492                 const uint16_t *mapping=getMapping(norm16_2);
2493                 uint16_t firstUnit=*mapping;
2494                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
2495                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // c==c2 only if no algorithmic mapping was applied above,
                    // so the ccc in this data is that of c itself.
2496                     if(c==c2 && (*(mapping-1)&0xff)!=0) {
2497                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
2498                     }
2499                 }
2500                 // Skip empty mappings (no characters in the decomposition).
2501                 if(length!=0) {
2502                     ++mapping;  // skip over the firstUnit
2503                     // add c to first code point's start set
2504                     int32_t i=0;
2505                     U16_NEXT_UNSAFE(mapping, i, c2);
2506                     newData.addToStartSet(c, c2, errorCode);
2507                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2508                     // one-way mapping. A 2-way mapping is possible here after
2509                     // intermediate algorithmic mapping.
2510                     if(norm16_2>=minNoNo) {
2511                         while(i<length) {
2512                             U16_NEXT_UNSAFE(mapping, i, c2);
2513                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
2514                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
2515                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
2516                                              &errorCode);
2517                             }
2518                         }
2519                     }
2520                 }
2521             } else {
2522                 // c decomposed to c2 algorithmically; c has cc==0
2523                 newData.addToStartSet(c, c2, errorCode);
2524             }
2525         }
2526         if(newValue!=oldValue) {
2527             utrie2_set32(newData.trie, c, newValue, &errorCode);
2528         }
2529     }
2530 }
2531 
ensureCanonIterData(UErrorCode & errorCode) const2532 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
2533     // Logically const: Synchronized instantiation.
2534     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
2535     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
2536     return U_SUCCESS(errorCode);
2537 }
2538 
getCanonValue(UChar32 c) const2539 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
2540     return (int32_t)utrie2_get32(fCanonIterData->trie, c);
2541 }
2542 
getCanonStartSet(int32_t n) const2543 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
2544     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
2545 }
2546 
isCanonSegmentStarter(UChar32 c) const2547 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
2548     return getCanonValue(c)>=0;
2549 }
2550 
getCanonStartSet(UChar32 c,UnicodeSet & set) const2551 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
2552     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
2553     if(canonValue==0) {
2554         return FALSE;
2555     }
2556     set.clear();
2557     int32_t value=canonValue&CANON_VALUE_MASK;
2558     if((canonValue&CANON_HAS_SET)!=0) {
2559         set.addAll(getCanonStartSet(value));
2560     } else if(value!=0) {
2561         set.add(value);
2562     }
2563     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
2564         uint16_t norm16=getNorm16(c);
2565         if(norm16==JAMO_L) {
2566             UChar32 syllable=
2567                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
2568             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
2569         } else {
2570             addComposites(getCompositionsList(norm16), set);
2571         }
2572     }
2573     return TRUE;
2574 }
2575 
2576 U_NAMESPACE_END
2577 
2578 // Normalizer2 data swapping ----------------------------------------------- ***
2579 
2580 U_NAMESPACE_USE
2581 
2582 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)2583 unorm2_swap(const UDataSwapper *ds,
2584             const void *inData, int32_t length, void *outData,
2585             UErrorCode *pErrorCode) {
2586     const UDataInfo *pInfo;
2587     int32_t headerSize;
2588 
2589     const uint8_t *inBytes;
2590     uint8_t *outBytes;
2591 
2592     const int32_t *inIndexes;
2593     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];
2594 
2595     int32_t i, offset, nextOffset, size;
2596 
2597     /* udata_swapDataHeader checks the arguments */
2598     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2599     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2600         return 0;
2601     }
2602 
2603     /* check data format and format version */
2604     pInfo=(const UDataInfo *)((const char *)inData+4);
2605     uint8_t formatVersion0=pInfo->formatVersion[0];
2606     if(!(
2607         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2608         pInfo->dataFormat[1]==0x72 &&
2609         pInfo->dataFormat[2]==0x6d &&
2610         pInfo->dataFormat[3]==0x32 &&
2611         (1<=formatVersion0 && formatVersion0<=3)
2612     )) {
2613         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2614                          pInfo->dataFormat[0], pInfo->dataFormat[1],
2615                          pInfo->dataFormat[2], pInfo->dataFormat[3],
2616                          pInfo->formatVersion[0]);
2617         *pErrorCode=U_UNSUPPORTED_ERROR;
2618         return 0;
2619     }
2620 
2621     inBytes=(const uint8_t *)inData+headerSize;
2622     outBytes=(uint8_t *)outData+headerSize;
2623 
2624     inIndexes=(const int32_t *)inBytes;
2625     int32_t minIndexesLength;
2626     if(formatVersion0==1) {
2627         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
2628     } else if(formatVersion0==2) {
2629         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
2630     } else {
2631         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
2632     }
2633 
2634     if(length>=0) {
2635         length-=headerSize;
2636         if(length<minIndexesLength*4) {
2637             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2638                              length);
2639             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2640             return 0;
2641         }
2642     }
2643 
2644     /* read the first few indexes */
2645     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) {
2646         indexes[i]=udata_readInt32(ds, inIndexes[i]);
2647     }
2648 
2649     /* get the total length of the data */
2650     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2651 
2652     if(length>=0) {
2653         if(length<size) {
2654             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2655                              length);
2656             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2657             return 0;
2658         }
2659 
2660         /* copy the data for inaccessible bytes */
2661         if(inBytes!=outBytes) {
2662             uprv_memcpy(outBytes, inBytes, size);
2663         }
2664 
2665         offset=0;
2666 
2667         /* swap the int32_t indexes[] */
2668         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2669         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2670         offset=nextOffset;
2671 
2672         /* swap the UTrie2 */
2673         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2674         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2675         offset=nextOffset;
2676 
2677         /* swap the uint16_t extraData[] */
2678         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2679         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2680         offset=nextOffset;
2681 
2682         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2683         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2684         offset=nextOffset;
2685 
2686         U_ASSERT(offset==size);
2687     }
2688 
2689     return headerSize+size;
2690 }
2691 
2692 #endif  // !UCONFIG_NO_NORMALIZATION
2693