1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2impl.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "mutex.h"
26 #include "normalizer2impl.h"
27 #include "uassert.h"
28 #include "uhash.h"
29 #include "uset_imp.h"
30 #include "utrie2.h"
31 #include "uvector.h"
32
33 U_NAMESPACE_BEGIN
34
35 // ReorderingBuffer -------------------------------------------------------- ***
36
init(int32_t destCapacity,UErrorCode & errorCode)37 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
38 int32_t length=str.length();
39 start=str.getBuffer(destCapacity);
40 if(start==NULL) {
41 // getBuffer() already did str.setToBogus()
42 errorCode=U_MEMORY_ALLOCATION_ERROR;
43 return FALSE;
44 }
45 limit=start+length;
46 remainingCapacity=str.getCapacity()-length;
47 reorderStart=start;
48 if(start==limit) {
49 lastCC=0;
50 } else {
51 setIterator();
52 lastCC=previousCC();
53 // Set reorderStart after the last code point with cc<=1 if there is one.
54 if(lastCC>1) {
55 while(previousCC()>1) {}
56 }
57 reorderStart=codePointLimit;
58 }
59 return TRUE;
60 }
61
equals(const UChar * otherStart,const UChar * otherLimit) const62 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
63 int32_t length=(int32_t)(limit-start);
64 return
65 length==(int32_t)(otherLimit-otherStart) &&
66 0==u_memcmp(start, otherStart, length);
67 }
68
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)69 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
70 if(remainingCapacity<2 && !resize(2, errorCode)) {
71 return FALSE;
72 }
73 if(lastCC<=cc || cc==0) {
74 limit[0]=U16_LEAD(c);
75 limit[1]=U16_TRAIL(c);
76 limit+=2;
77 lastCC=cc;
78 if(cc<=1) {
79 reorderStart=limit;
80 }
81 } else {
82 insert(c, cc);
83 }
84 remainingCapacity-=2;
85 return TRUE;
86 }
87
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)88 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
89 uint8_t leadCC, uint8_t trailCC,
90 UErrorCode &errorCode) {
91 if(length==0) {
92 return TRUE;
93 }
94 if(remainingCapacity<length && !resize(length, errorCode)) {
95 return FALSE;
96 }
97 remainingCapacity-=length;
98 if(lastCC<=leadCC || leadCC==0) {
99 if(trailCC<=1) {
100 reorderStart=limit+length;
101 } else if(leadCC<=1) {
102 reorderStart=limit+1; // Ok if not a code point boundary.
103 }
104 const UChar *sLimit=s+length;
105 do { *limit++=*s++; } while(s!=sLimit);
106 lastCC=trailCC;
107 } else {
108 int32_t i=0;
109 UChar32 c;
110 U16_NEXT(s, i, length, c);
111 insert(c, leadCC); // insert first code point
112 while(i<length) {
113 U16_NEXT(s, i, length, c);
114 if(i<length) {
115 // s must be in NFD, otherwise we need to use getCC().
116 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
117 } else {
118 leadCC=trailCC;
119 }
120 append(c, leadCC, errorCode);
121 }
122 }
123 return TRUE;
124 }
125
appendZeroCC(UChar32 c,UErrorCode & errorCode)126 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
127 int32_t cpLength=U16_LENGTH(c);
128 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
129 return FALSE;
130 }
131 remainingCapacity-=cpLength;
132 if(cpLength==1) {
133 *limit++=(UChar)c;
134 } else {
135 limit[0]=U16_LEAD(c);
136 limit[1]=U16_TRAIL(c);
137 limit+=2;
138 }
139 lastCC=0;
140 reorderStart=limit;
141 return TRUE;
142 }
143
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)144 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
145 if(s==sLimit) {
146 return TRUE;
147 }
148 int32_t length=(int32_t)(sLimit-s);
149 if(remainingCapacity<length && !resize(length, errorCode)) {
150 return FALSE;
151 }
152 u_memcpy(limit, s, length);
153 limit+=length;
154 remainingCapacity-=length;
155 lastCC=0;
156 reorderStart=limit;
157 return TRUE;
158 }
159
remove()160 void ReorderingBuffer::remove() {
161 reorderStart=limit=start;
162 remainingCapacity=str.getCapacity();
163 lastCC=0;
164 }
165
removeSuffix(int32_t suffixLength)166 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
167 if(suffixLength<(limit-start)) {
168 limit-=suffixLength;
169 remainingCapacity+=suffixLength;
170 } else {
171 limit=start;
172 remainingCapacity=str.getCapacity();
173 }
174 lastCC=0;
175 reorderStart=limit;
176 }
177
resize(int32_t appendLength,UErrorCode & errorCode)178 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
179 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
180 int32_t length=(int32_t)(limit-start);
181 str.releaseBuffer(length);
182 int32_t newCapacity=length+appendLength;
183 int32_t doubleCapacity=2*str.getCapacity();
184 if(newCapacity<doubleCapacity) {
185 newCapacity=doubleCapacity;
186 }
187 if(newCapacity<256) {
188 newCapacity=256;
189 }
190 start=str.getBuffer(newCapacity);
191 if(start==NULL) {
192 // getBuffer() already did str.setToBogus()
193 errorCode=U_MEMORY_ALLOCATION_ERROR;
194 return FALSE;
195 }
196 reorderStart=start+reorderStartIndex;
197 limit=start+length;
198 remainingCapacity=str.getCapacity()-length;
199 return TRUE;
200 }
201
skipPrevious()202 void ReorderingBuffer::skipPrevious() {
203 codePointLimit=codePointStart;
204 UChar c=*--codePointStart;
205 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
206 --codePointStart;
207 }
208 }
209
previousCC()210 uint8_t ReorderingBuffer::previousCC() {
211 codePointLimit=codePointStart;
212 if(reorderStart>=codePointStart) {
213 return 0;
214 }
215 UChar32 c=*--codePointStart;
216 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
217 return 0;
218 }
219
220 UChar c2;
221 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
222 --codePointStart;
223 c=U16_GET_SUPPLEMENTARY(c2, c);
224 }
225 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
226 }
227
228 // Inserts c somewhere before the last character.
229 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)230 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
231 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
232 // insert c at codePointLimit, after the character with prevCC<=cc
233 UChar *q=limit;
234 UChar *r=limit+=U16_LENGTH(c);
235 do {
236 *--r=*--q;
237 } while(codePointLimit!=q);
238 writeCodePoint(q, c);
239 if(cc<=1) {
240 reorderStart=r;
241 }
242 }
243
244 // Normalizer2Impl --------------------------------------------------------- ***
245
// Data used for canonical-iterator support.
// In this file, an instance is reached through
// Normalizer2Impl::canonIterDataSingleton.fInstance: the Normalizer2Impl
// destructor deletes it, and addCanonIterPropertyStarts() enumerates its trie.
// The constructor, destructor and addToStartSet() are only declared here.
246 struct CanonIterData : public UMemory {
247 CanonIterData(UErrorCode &errorCode);
248 ~CanonIterData();
249 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
// Trie whose values are masked with CANON_NOT_SEGMENT_STARTER when enumerated
// by addCanonIterPropertyStarts().
250 UTrie2 *trie;
251 UVector canonStartSets; // contains UnicodeSet *
252 };
253
~Normalizer2Impl()254 Normalizer2Impl::~Normalizer2Impl() {
255 udata_close(memory);
256 utrie2_close(normTrie);
257 UTrie2Singleton(fcdTrieSingleton).deleteInstance();
258 delete (CanonIterData *)canonIterDataSingleton.fInstance;
259 }
260
261 UBool U_CALLCONV
isAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)262 Normalizer2Impl::isAcceptable(void *context,
263 const char * /* type */, const char * /*name*/,
264 const UDataInfo *pInfo) {
265 if(
266 pInfo->size>=20 &&
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo->dataFormat[1]==0x72 &&
271 pInfo->dataFormat[2]==0x6d &&
272 pInfo->dataFormat[3]==0x32 &&
273 pInfo->formatVersion[0]==1
274 ) {
275 Normalizer2Impl *me=(Normalizer2Impl *)context;
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277 return TRUE;
278 } else {
279 return FALSE;
280 }
281 }
282
283 void
load(const char * packageName,const char * name,UErrorCode & errorCode)284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) {
286 return;
287 }
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289 if(U_FAILURE(errorCode)) {
290 return;
291 }
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293 const int32_t *inIndexes=(const int32_t *)inBytes;
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295 if(indexesLength<=IX_MIN_MAYBE_YES) {
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
297 return;
298 }
299
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302
303 minYesNo=inIndexes[IX_MIN_YES_NO];
304 minNoNo=inIndexes[IX_MIN_NO_NO];
305 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
306 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
307
308 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
309 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
310 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
311 inBytes+offset, nextOffset-offset, NULL,
312 &errorCode);
313 if(U_FAILURE(errorCode)) {
314 return;
315 }
316
317 offset=nextOffset;
318 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
319 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
320 }
321
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
323 UChar32 c;
324 if(cpStart==(cpLimit-1)) {
325 c=*cpStart;
326 } else {
327 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
328 }
329 uint16_t prevNorm16=getNorm16(c);
330 if(prevNorm16<=minYesNo) {
331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
332 } else {
333 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
334 }
335 }
336
337 U_CDECL_BEGIN
338
339 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)340 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
341 /* add the start code point to the USet */
342 const USetAdder *sa=(const USetAdder *)context;
343 sa->add(sa->set, start);
344 return TRUE;
345 }
346
347 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)348 segmentStarterMapper(const void * /*context*/, uint32_t value) {
349 return value&CANON_NOT_SEGMENT_STARTER;
350 }
351
352 U_CDECL_END
353
354 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const355 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
356 /* add the start code point of each same-value range of each trie */
357 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
358
359 /* add Hangul LV syllables and LV+1 because of skippables */
360 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
361 sa->add(sa->set, c);
362 sa->add(sa->set, c+1);
363 }
364 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
365 }
366
367 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
369 /* add the start code point of each same-value range of the canonical iterator data trie */
370 if(ensureCanonIterData(errorCode)) {
371 // currently only used for the SEGMENT_STARTER property
372 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
373 segmentStarterMapper, enumPropertyStartsRange, sa);
374 }
375 }
376
377 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
379 UChar32 minNeedDataCP,
380 ReorderingBuffer *buffer,
381 UErrorCode &errorCode) const {
382 // Make some effort to support NUL-terminated strings reasonably.
383 // Take the part of the fast quick check loop that does not look up
384 // data and check the first part of the string.
385 // After this prefix, determine the string length to simplify the rest
386 // of the code.
387 const UChar *prevSrc=src;
388 UChar c;
389 while((c=*src++)<minNeedDataCP && c!=0) {}
390 // Back out the last character for full processing.
391 // Copy this prefix.
392 if(--src!=prevSrc) {
393 if(buffer!=NULL) {
394 buffer->appendZeroCC(prevSrc, src, errorCode);
395 }
396 }
397 return src;
398 }
399
400 // Dual functionality:
401 // buffer!=NULL: normalize
402 // buffer==NULL: isNormalized/spanQuickCheckYes
//
// limit==NULL means that src is NUL-terminated; the end is then located with
// u_strchr() after the low prefix has been handled.
// Return value, as implemented below:
// - buffer!=NULL: the source position where processing stopped. This is limit
//   unless appending to the buffer failed (or errorCode was already a failure
//   after copying the low prefix), in which case it is an earlier position.
// - buffer==NULL: limit if all of the text passed the quick check; otherwise
//   prevBoundary, the last position recorded as a boundary (after a run of
//   skipped units, or after a character with cc<=1).
403 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const404 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
405 ReorderingBuffer *buffer,
406 UErrorCode &errorCode) const {
407 UChar32 minNoCP=minDecompNoCP;
408 if(limit==NULL) {
409 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
410 if(U_FAILURE(errorCode)) {
411 return src;
412 }
413 limit=u_strchr(src, 0);
414 }
415
416 const UChar *prevSrc;
417 UChar32 c=0;
418 uint16_t norm16=0;
419
420 // only for quick check
421 const UChar *prevBoundary=src;
422 uint8_t prevCC=0;
423
424 for(;;) {
425 // count code units below the minimum or with irrelevant data for the quick check
// When this inner loop exits with src!=limit, c and norm16 describe the code
// point at src that needs individual handling.
426 for(prevSrc=src; src!=limit;) {
427 if( (c=*src)<minNoCP ||
428 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
429 ) {
430 ++src;
431 } else if(!U16_IS_SURROGATE(c)) {
432 break;
433 } else {
// c is a surrogate code unit: try to assemble the supplementary code point
// and look up its data; an unpaired surrogate is looked up as is.
434 UChar c2;
435 if(U16_IS_SURROGATE_LEAD(c)) {
436 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
437 c=U16_GET_SUPPLEMENTARY(c, c2);
438 }
439 } else /* trail surrogate */ {
// Only step back to a lead surrogate that is part of the current run
// (prevSrc<src).
440 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
441 --src;
442 c=U16_GET_SUPPLEMENTARY(c2, c);
443 }
444 }
445 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
446 src+=U16_LENGTH(c);
447 } else {
448 break;
449 }
450 }
451 }
452 // copy these code units all at once
453 if(src!=prevSrc) {
454 if(buffer!=NULL) {
455 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
456 break;
457 }
458 } else {
// Quick check only: the skipped run ends at a boundary.
459 prevCC=0;
460 prevBoundary=src;
461 }
462 }
463 if(src==limit) {
464 break;
465 }
466
467 // Check one above-minimum, relevant code point.
468 src+=U16_LENGTH(c);
469 if(buffer!=NULL) {
470 if(!decompose(c, norm16, *buffer, errorCode)) {
471 break;
472 }
473 } else {
// Quick check only: c passes if it does not decompose and its combining
// class is 0 or not lower than the previous one.
474 if(isDecompYes(norm16)) {
475 uint8_t cc=getCCFromYesOrMaybe(norm16);
476 if(prevCC<=cc || cc==0) {
477 prevCC=cc;
478 if(cc<=1) {
479 prevBoundary=src;
480 }
481 continue;
482 }
483 }
484 return prevBoundary; // "no" or cc out of order
485 }
486 }
487 return src;
488 }
489
490 // Decompose a short piece of text which is likely to contain characters that
491 // fail the quick check loop and/or where the quick check loop's overhead
492 // is unlikely to be amortized.
493 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const494 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
495 ReorderingBuffer &buffer,
496 UErrorCode &errorCode) const {
497 while(src<limit) {
498 UChar32 c;
499 uint16_t norm16;
500 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
501 if(!decompose(c, norm16, buffer, errorCode)) {
502 return FALSE;
503 }
504 }
505 return TRUE;
506 }
507
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const508 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
509 ReorderingBuffer &buffer,
510 UErrorCode &errorCode) const {
511 // Only loops for 1:1 algorithmic mappings.
512 for(;;) {
513 // get the decomposition and the lead and trail cc's
514 if(isDecompYes(norm16)) {
515 // c does not decompose
516 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
517 } else if(isHangul(norm16)) {
518 // Hangul syllable: decompose algorithmically
519 UChar jamos[3];
520 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
521 } else if(isDecompNoAlgorithmic(norm16)) {
522 c=mapAlgorithmic(c, norm16);
523 norm16=getNorm16(c);
524 } else {
525 // c decomposes, get everything from the variable-length extra data
526 const uint16_t *mapping=getMapping(norm16);
527 uint16_t firstUnit=*mapping++;
528 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
529 uint8_t leadCC, trailCC;
530 trailCC=(uint8_t)(firstUnit>>8);
531 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
532 leadCC=(uint8_t)(*mapping++>>8);
533 } else {
534 leadCC=0;
535 }
536 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
537 }
538 }
539 }
540
541 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const542 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
543 const UChar *decomp=NULL;
544 uint16_t norm16;
545 for(;;) {
546 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
547 // c does not decompose
548 return decomp;
549 } else if(isHangul(norm16)) {
550 // Hangul syllable: decompose algorithmically
551 length=Hangul::decompose(c, buffer);
552 return buffer;
553 } else if(isDecompNoAlgorithmic(norm16)) {
554 c=mapAlgorithmic(c, norm16);
555 decomp=buffer;
556 length=0;
557 U16_APPEND_UNSAFE(buffer, length, c);
558 } else {
559 // c decomposes, get everything from the variable-length extra data
560 const uint16_t *mapping=getMapping(norm16);
561 uint16_t firstUnit=*mapping++;
562 length=firstUnit&MAPPING_LENGTH_MASK;
563 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
564 ++mapping;
565 }
566 return (const UChar *)mapping;
567 }
568 }
569 }
570
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const571 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
572 UBool doDecompose,
573 UnicodeString &safeMiddle,
574 ReorderingBuffer &buffer,
575 UErrorCode &errorCode) const {
576 buffer.copyReorderableSuffixTo(safeMiddle);
577 if(doDecompose) {
578 decompose(src, limit, &buffer, errorCode);
579 return;
580 }
581 // Just merge the strings at the boundary.
582 ForwardUTrie2StringIterator iter(normTrie, src, limit);
583 uint8_t firstCC, prevCC, cc;
584 firstCC=prevCC=cc=getCC(iter.next16());
585 while(cc!=0) {
586 prevCC=cc;
587 cc=getCC(iter.next16());
588 };
589 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
590 limit=u_strchr(iter.codePointStart, 0);
591 }
592 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
593 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
594 }
595
596 // Note: hasDecompBoundary() could be implemented as aliases to
597 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
598 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const599 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
600 for(;;) {
601 if(c<minDecompNoCP) {
602 return TRUE;
603 }
604 uint16_t norm16=getNorm16(c);
605 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
606 return TRUE;
607 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
608 return FALSE; // ccc!=0
609 } else if(isDecompNoAlgorithmic(norm16)) {
610 c=mapAlgorithmic(c, norm16);
611 } else {
612 // c decomposes, get everything from the variable-length extra data
613 const uint16_t *mapping=getMapping(norm16);
614 uint16_t firstUnit=*mapping++;
615 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
616 return FALSE;
617 }
618 if(!before) {
619 // decomp after-boundary: same as hasFCDBoundaryAfter(),
620 // fcd16<=1 || trailCC==0
621 if(firstUnit>0x1ff) {
622 return FALSE; // trailCC>1
623 }
624 if(firstUnit<=0xff) {
625 return TRUE; // trailCC==0
626 }
627 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
628 }
629 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
630 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
631 }
632 }
633 }
634
635 /*
636 * Finds the recomposition result for
637 * a forward-combining "lead" character,
638 * specified with a pointer to its compositions list,
639 * and a backward-combining "trail" character.
640 *
641 * If the lead and trail characters combine, then this function returns
642 * the following "compositeAndFwd" value:
643 * Bits 21..1 composite character
644 * Bit 0 set if the composite is a forward-combining starter
645 * otherwise it returns -1.
646 *
647 * The compositions list has (trail, compositeAndFwd) pair entries,
648 * encoded as either pairs or triples of 16-bit units.
649 * The last entry has the high bit of its first unit set.
650 *
651 * The list is sorted by ascending trail characters (there are no duplicates).
652 * A linear search is used.
653 *
654 * See normalizer2impl.h for a more detailed description
655 * of the compositions list format.
656 */
combine(const uint16_t * list,UChar32 trail)657 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
658 uint16_t key1, firstUnit;
659 if(trail<COMP_1_TRAIL_LIMIT) {
660 // trail character is 0..33FF
661 // result entry may have 2 or 3 units
662 key1=(uint16_t)(trail<<1);
663 while(key1>(firstUnit=*list)) {
664 list+=2+(firstUnit&COMP_1_TRIPLE);
665 }
666 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
667 if(firstUnit&COMP_1_TRIPLE) {
668 return ((int32_t)list[1]<<16)|list[2];
669 } else {
670 return list[1];
671 }
672 }
673 } else {
674 // trail character is 3400..10FFFF
675 // result entry has 3 units
676 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
677 (((trail>>COMP_1_TRAIL_SHIFT))&
678 ~COMP_1_TRIPLE));
679 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
680 uint16_t secondUnit;
681 for(;;) {
682 if(key1>(firstUnit=*list)) {
683 list+=2+(firstUnit&COMP_1_TRIPLE);
684 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
685 if(key2>(secondUnit=list[1])) {
686 if(firstUnit&COMP_1_LAST_TUPLE) {
687 break;
688 } else {
689 list+=3;
690 }
691 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
692 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
693 } else {
694 break;
695 }
696 } else {
697 break;
698 }
699 }
700 }
701 return -1;
702 }
703
704 /**
705 * @param list some character's compositions list
706 * @param set recursively receives the composites from these compositions
707 */
addComposites(const uint16_t * list,UnicodeSet & set) const708 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
709 uint16_t firstUnit;
710 int32_t compositeAndFwd;
711 do {
712 firstUnit=*list;
713 if((firstUnit&COMP_1_TRIPLE)==0) {
714 compositeAndFwd=list[1];
715 list+=2;
716 } else {
717 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
718 list+=3;
719 }
720 UChar32 composite=compositeAndFwd>>1;
721 if((compositeAndFwd&1)!=0) {
722 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
723 }
724 set.add(composite);
725 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
726 }
727
728 /*
729 * Recomposes the buffer text starting at recomposeStartIndex
730 * (which is in NFD - decomposed and canonically ordered),
731 * and truncates the buffer contents.
732 *
733 * Note that recomposition never lengthens the text:
734 * Any character consists of either one or two code units;
735 * a composition may contain at most one more code unit than the original starter,
736 * while the combining mark that is removed has at least one code unit.
737 */
// buffer: provides the text through getStart()/getLimit(); the text is
//     modified in place and the new end is stored with setReorderingLimit().
// recomposeStartIndex: code unit offset from buffer.getStart() where
//     recomposition begins; nothing happens if that is already the limit.
// onlyContiguous: if TRUE, a character with cc!=0 that does not combine
//     clears the current starter's compositions list, so later characters
//     cannot combine with that starter (see the "FCC" comment near the end).
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const738 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
739 UBool onlyContiguous) const {
740 UChar *p=buffer.getStart()+recomposeStartIndex;
741 UChar *limit=buffer.getLimit();
742 if(p==limit) {
743 return;
744 }
745
746 UChar *starter, *pRemove, *q, *r;
747 const uint16_t *compositionsList;
748 UChar32 c, compositeAndFwd;
749 uint16_t norm16;
750 uint8_t cc, prevCC;
751 UBool starterIsSupplementary;
752
753 // Some of the following variables are not used until we have a forward-combining starter
754 // and are only initialized now to avoid compiler warnings.
755 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
756 starter=NULL;
757 starterIsSupplementary=FALSE;
758 prevCC=0;
759
// Each iteration reads the next code point c and its norm16 at p, advancing p
// past it. limit shrinks whenever text is removed.
760 for(;;) {
761 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
762 cc=getCCFromYesOrMaybe(norm16);
763 if( // this character combines backward and
764 isMaybe(norm16) &&
765 // we have seen a starter that combines forward and
766 compositionsList!=NULL &&
767 // the backward-combining character is not blocked
768 (prevCC<cc || prevCC==0)
769 ) {
770 if(isJamoVT(norm16)) {
771 // c is a Jamo V/T, see if we can compose it with the previous character.
772 if(c<Hangul::JAMO_T_BASE) {
773 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// prev is unsigned, so a starter below JAMO_L_BASE wraps around and fails
// the range test below as well.
774 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
775 if(prev<Hangul::JAMO_L_COUNT) {
776 pRemove=p-1;
777 UChar syllable=(UChar)
778 (Hangul::HANGUL_BASE+
779 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
780 Hangul::JAMO_T_COUNT);
781 UChar t;
782 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
783 ++p;
784 syllable+=t; // The next character was a Jamo T.
785 }
786 *starter=syllable;
787 // remove the Jamo V/T
// [pRemove, p[ is the Jamo V plus the Jamo T if one was consumed;
// move the following text down over it.
788 q=pRemove;
789 r=p;
790 while(r<limit) {
791 *q++=*r++;
792 }
793 limit=q;
794 p=pRemove;
795 }
796 }
797 /*
798 * No "else" for Jamo T:
799 * Since the input is in NFD, there are no Hangul LV syllables that
800 * a Jamo T could combine with.
801 * All Jamo Ts are combined above when handling Jamo Vs.
802 */
803 if(p==limit) {
804 break;
805 }
// After a Jamo V/T, the current starter is not used for further compositions.
806 compositionsList=NULL;
807 continue;
808 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
809 // The starter and the combining mark (c) do combine.
810 UChar32 composite=compositeAndFwd>>1;
811
812 // Replace the starter with the composite, remove the combining mark.
813 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
814 if(starterIsSupplementary) {
815 if(U_IS_SUPPLEMENTARY(composite)) {
816 // both are supplementary
817 starter[0]=U16_LEAD(composite);
818 starter[1]=U16_TRAIL(composite);
819 } else {
820 *starter=(UChar)composite;
821 // The composite is shorter than the starter,
822 // move the intermediate characters forward one.
823 starterIsSupplementary=FALSE;
824 q=starter+1;
825 r=q+1;
826 while(r<pRemove) {
827 *q++=*r++;
828 }
// The gap to remove now starts one unit earlier.
829 --pRemove;
830 }
831 } else if(U_IS_SUPPLEMENTARY(composite)) {
832 // The composite is longer than the starter,
833 // move the intermediate characters back one.
834 starterIsSupplementary=TRUE;
835 ++starter; // temporarily increment for the loop boundary
836 q=pRemove;
837 r=++pRemove;
838 while(starter<q) {
839 *--r=*--q;
840 }
841 *starter=U16_TRAIL(composite);
842 *--starter=U16_LEAD(composite); // undo the temporary increment
843 } else {
844 // both are on the BMP
845 *starter=(UChar)composite;
846 }
847
848 /* remove the combining mark by moving the following text over it */
// pRemove==p is possible here: a BMP starter grew into a supplementary
// composite while a single-unit mark was removed, so nothing is left to close.
849 if(pRemove<p) {
850 q=pRemove;
851 r=p;
852 while(r<limit) {
853 *q++=*r++;
854 }
855 limit=q;
856 p=pRemove;
857 }
858 // Keep prevCC because we removed the combining mark.
859
860 if(p==limit) {
861 break;
862 }
863 // Is the composite a starter that combines forward?
864 if(compositeAndFwd&1) {
865 compositionsList=
866 getCompositionsListForComposite(getNorm16(composite));
867 } else {
868 compositionsList=NULL;
869 }
870
871 // We combined; continue with looking for compositions.
872 continue;
873 }
874 }
875
876 // no combination this time
877 prevCC=cc;
878 if(p==limit) {
879 break;
880 }
881
882 // If c did not combine, then check if it is a starter.
883 if(cc==0) {
884 // Found a new starter.
885 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
886 // It may combine with something, prepare for it.
// p is already past c, so the starter begins one or two units before p.
887 if(U_IS_BMP(c)) {
888 starterIsSupplementary=FALSE;
889 starter=p-1;
890 } else {
891 starterIsSupplementary=TRUE;
892 starter=p-2;
893 }
894 }
895 } else if(onlyContiguous) {
896 // FCC: no discontiguous compositions; any intervening character blocks.
897 compositionsList=NULL;
898 }
899 }
// Tell the buffer where the (possibly shortened) text now ends.
900 buffer.setReorderingLimit(limit);
901 }
902
903 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
904 // doCompose: normalize
905 // !doCompose: isNormalized (buffer must be empty and initialized)
906 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const907 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
908 UBool onlyContiguous,
909 UBool doCompose,
910 ReorderingBuffer &buffer,
911 UErrorCode &errorCode) const {
912 /*
913 * prevBoundary points to the last character before the current one
914 * that has a composition boundary before it with ccc==0 and quick check "yes".
915 * Keeping track of prevBoundary saves us looking for a composition boundary
916 * when we find a "no" or "maybe".
917 *
918 * When we back out from prevSrc back to prevBoundary,
919 * then we also remove those same characters (which had been simply copied
920 * or canonically-order-inserted) from the ReorderingBuffer.
921 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
922 * must correspond 1:1 to destination units at the end of the destination buffer.
923 */
924 const UChar *prevBoundary=src;
925 UChar32 minNoMaybeCP=minCompNoMaybeCP;
926 if(limit==NULL) {
927 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
928 doCompose ? &buffer : NULL,
929 errorCode);
930 if(U_FAILURE(errorCode)) {
931 return FALSE;
932 }
933 if(prevBoundary<src) {
934 // Set prevBoundary to the last character in the prefix.
935 prevBoundary=src-1;
936 }
937 limit=u_strchr(src, 0);
938 }
939
940 const UChar *prevSrc;
941 UChar32 c=0;
942 uint16_t norm16=0;
943
944 // only for isNormalized
945 uint8_t prevCC=0;
946
947 for(;;) {
948 // count code units below the minimum or with irrelevant data for the quick check
949 for(prevSrc=src; src!=limit;) {
950 if( (c=*src)<minNoMaybeCP ||
951 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
952 ) {
953 ++src;
954 } else if(!U16_IS_SURROGATE(c)) {
955 break;
956 } else {
957 UChar c2;
958 if(U16_IS_SURROGATE_LEAD(c)) {
959 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
960 c=U16_GET_SUPPLEMENTARY(c, c2);
961 }
962 } else /* trail surrogate */ {
963 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
964 --src;
965 c=U16_GET_SUPPLEMENTARY(c2, c);
966 }
967 }
968 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
969 src+=U16_LENGTH(c);
970 } else {
971 break;
972 }
973 }
974 }
975 // copy these code units all at once
976 if(src!=prevSrc) {
977 if(doCompose) {
978 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
979 break;
980 }
981 } else {
982 prevCC=0;
983 }
984 if(src==limit) {
985 break;
986 }
987 // Set prevBoundary to the last character in the quick check loop.
988 prevBoundary=src-1;
989 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
990 U16_IS_LEAD(*(prevBoundary-1))
991 ) {
992 --prevBoundary;
993 }
994 // The start of the current character (c).
995 prevSrc=src;
996 } else if(src==limit) {
997 break;
998 }
999
1000 src+=U16_LENGTH(c);
1001 /*
1002 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1003 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1004 * or has ccc!=0.
1005 * Check for Jamo V/T, then for regular characters.
1006 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1007 */
1008 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1009 UChar prev=*(prevSrc-1);
1010 UBool needToDecompose=FALSE;
1011 if(c<Hangul::JAMO_T_BASE) {
1012 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1013 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1014 if(prev<Hangul::JAMO_L_COUNT) {
1015 if(!doCompose) {
1016 return FALSE;
1017 }
1018 UChar syllable=(UChar)
1019 (Hangul::HANGUL_BASE+
1020 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1021 Hangul::JAMO_T_COUNT);
1022 UChar t;
1023 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1024 ++src;
1025 syllable+=t; // The next character was a Jamo T.
1026 prevBoundary=src;
1027 buffer.setLastChar(syllable);
1028 continue;
1029 }
1030 // If we see L+V+x where x!=T then we drop to the slow path,
1031 // decompose and recompose.
1032 // This is to deal with NFKC finding normal L and V but a
1033 // compatibility variant of a T. We need to either fully compose that
1034 // combination here (which would complicate the code and may not work
1035 // with strange custom data) or use the slow path -- or else our replacing
1036 // two input characters (L+V) with one output character (LV syllable)
1037 // would violate the invariant that [prevBoundary..prevSrc[ has the same
1038 // length as what we appended to the buffer since prevBoundary.
1039 needToDecompose=TRUE;
1040 }
1041 } else if(Hangul::isHangulWithoutJamoT(prev)) {
1042 // c is a Jamo Trailing consonant,
1043 // compose with previous Hangul LV that does not contain a Jamo T.
1044 if(!doCompose) {
1045 return FALSE;
1046 }
1047 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1048 prevBoundary=src;
1049 continue;
1050 }
1051 if(!needToDecompose) {
1052 // The Jamo V/T did not compose into a Hangul syllable.
1053 if(doCompose) {
1054 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1055 break;
1056 }
1057 } else {
1058 prevCC=0;
1059 }
1060 continue;
1061 }
1062 }
1063 /*
1064 * Source buffer pointers:
1065 *
1066 * all done quick check current char not yet
1067 * "yes" but (c) processed
1068 * may combine
1069 * forward
1070 * [-------------[-------------[-------------[-------------[
1071 * | | | | |
1072 * orig. src prevBoundary prevSrc src limit
1073 *
1074 *
1075 * Destination buffer pointers inside the ReorderingBuffer:
1076 *
1077 * all done might take not filled yet
1078 * characters for
1079 * reordering
1080 * [-------------[-------------[-------------[
1081 * | | | |
1082 * start reorderStart limit |
1083 * +remainingCap.+
1084 */
1085 if(norm16>=MIN_YES_YES_WITH_CC) {
1086 uint8_t cc=(uint8_t)norm16; // cc!=0
1087 if( onlyContiguous && // FCC
1088 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1089 prevBoundary<prevSrc &&
1090 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1091 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1092 // passed the quick check "yes && ccc==0" test.
1093 // Check whether the last character was a "yesYes" or a "yesNo".
1094 // If a "yesNo", then we get its trailing ccc from its
1095 // mapping and check for canonical order.
1096 // All other cases are ok.
1097 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1098 ) {
1099 // Fails FCD test, need to decompose and contiguously recompose.
1100 if(!doCompose) {
1101 return FALSE;
1102 }
1103 } else if(doCompose) {
1104 if(!buffer.append(c, cc, errorCode)) {
1105 break;
1106 }
1107 continue;
1108 } else if(prevCC<=cc) {
1109 prevCC=cc;
1110 continue;
1111 } else {
1112 return FALSE;
1113 }
1114 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1115 return FALSE;
1116 }
1117
1118 /*
1119 * Find appropriate boundaries around this character,
1120 * decompose the source text from between the boundaries,
1121 * and recompose it.
1122 *
1123 * We may need to remove the last few characters from the ReorderingBuffer
1124 * to account for source text that was copied or appended
1125 * but needs to take part in the recomposition.
1126 */
1127
1128 /*
1129 * Find the last composition boundary in [prevBoundary..src[.
1130 * It is either the decomposition of the current character (at prevSrc),
1131 * or prevBoundary.
1132 */
1133 if(hasCompBoundaryBefore(c, norm16)) {
1134 prevBoundary=prevSrc;
1135 } else if(doCompose) {
1136 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1137 }
1138
1139 // Find the next composition boundary in [src..limit[ -
1140 // modifies src to point to the next starter.
1141 src=(UChar *)findNextCompBoundary(src, limit);
1142
1143 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1144 int32_t recomposeStartIndex=buffer.length();
1145 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1146 break;
1147 }
1148 recompose(buffer, recomposeStartIndex, onlyContiguous);
1149 if(!doCompose) {
1150 if(!buffer.equals(prevBoundary, src)) {
1151 return FALSE;
1152 }
1153 buffer.remove();
1154 prevCC=0;
1155 }
1156
1157 // Move to the next starter. We never need to look back before this point again.
1158 prevBoundary=src;
1159 }
1160 return TRUE;
1161 }
1162
// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Read-only scan of [src..limit[; if limit==NULL the input is treated as
// NUL-terminated and limit is computed with u_strchr().
// Returns limit when the scan reaches the end of the input.
// Otherwise returns prevBoundary (see its description below):
// - in span mode (pQCResult==NULL) at the first character that is either
//   "maybe" or fails the check;
// - in quickCheck mode at the first character that fails the check, after
//   setting *pQCResult=UNORM_NO. A "maybe" character only sets
//   *pQCResult=UNORM_MAYBE and scanning continues.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // NUL-terminated input: skip the prefix of code units below minNoMaybeCP.
        // No buffer is passed (NULL), and the local errorCode is not inspected afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    // Combining class of the previous character, used for the canonical-order check below.
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                // Reached the end without finding a problem.
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: try to assemble the supplementary
                // code point from the adjacent unit and look up its norm16.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        // Back up so that src points to the start of the pair.
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                // The last character was a surrogate pair: point to its lead surrogate.
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        // Advance src past the current character (c).
        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
                // Falls through to the "no" result after this if-chain.
            } else if(prevCC<=cc || cc==0) {
                // Combining classes are in non-decreasing order, or c has cc==0.
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    // NOTE(review): presumably this identifies a "maybe" character
                    // (as opposed to a "yes" character with ccc!=0) -- confirm
                    // against the norm16 value layout.
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        // Definite failure: report "no" (quickCheck mode) and return the last boundary.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}
1275
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1276 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1277 UBool doCompose,
1278 UBool onlyContiguous,
1279 UnicodeString &safeMiddle,
1280 ReorderingBuffer &buffer,
1281 UErrorCode &errorCode) const {
1282 if(!buffer.isEmpty()) {
1283 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1284 if(src!=firstStarterInSrc) {
1285 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1286 buffer.getLimit());
1287 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1288 UnicodeString middle(lastStarterInDest, destSuffixLength);
1289 buffer.removeSuffix(destSuffixLength);
1290 safeMiddle=middle;
1291 middle.append(src, (int32_t)(firstStarterInSrc-src));
1292 const UChar *middleStart=middle.getBuffer();
1293 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1294 TRUE, buffer, errorCode);
1295 if(U_FAILURE(errorCode)) {
1296 return;
1297 }
1298 src=firstStarterInSrc;
1299 }
1300 }
1301 if(doCompose) {
1302 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1303 } else {
1304 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1305 limit=u_strchr(src, 0);
1306 }
1307 buffer.appendZeroCC(src, limit, errorCode);
1308 }
1309 }
1310
1311 /**
1312 * Does c have a composition boundary before it?
1313 * True if its decomposition begins with a character that has
1314 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1315 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1316 * (isCompYesAndZeroCC()) so we need not decompose.
1317 */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1318 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1319 for(;;) {
1320 if(isCompYesAndZeroCC(norm16)) {
1321 return TRUE;
1322 } else if(isMaybeOrNonZeroCC(norm16)) {
1323 return FALSE;
1324 } else if(isDecompNoAlgorithmic(norm16)) {
1325 c=mapAlgorithmic(c, norm16);
1326 norm16=getNorm16(c);
1327 } else {
1328 // c decomposes, get everything from the variable-length extra data
1329 const uint16_t *mapping=getMapping(norm16);
1330 uint16_t firstUnit=*mapping++;
1331 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1332 return FALSE;
1333 }
1334 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
1335 return FALSE; // non-zero leadCC
1336 }
1337 int32_t i=0;
1338 UChar32 c;
1339 U16_NEXT_UNSAFE(mapping, i, c);
1340 return isCompYesAndZeroCC(getNorm16(c));
1341 }
1342 }
1343 }
1344
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1345 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1346 for(;;) {
1347 uint16_t norm16=getNorm16(c);
1348 if(isInert(norm16)) {
1349 return TRUE;
1350 } else if(norm16<=minYesNo) {
1351 // Hangul LVT (==minYesNo) has a boundary after it.
1352 // Hangul LV and non-inert yesYes characters combine forward.
1353 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1354 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1355 return FALSE;
1356 } else if(isDecompNoAlgorithmic(norm16)) {
1357 c=mapAlgorithmic(c, norm16);
1358 } else {
1359 // c decomposes, get everything from the variable-length extra data.
1360 // If testInert, then c must be a yesNo character which has lccc=0,
1361 // otherwise it could be a noNo.
1362 const uint16_t *mapping=getMapping(norm16);
1363 uint16_t firstUnit=*mapping;
1364 // TRUE if
1365 // c is not deleted, and
1366 // it and its decomposition do not combine forward, and it has a starter, and
1367 // if FCC then trailCC<=1
1368 return
1369 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
1370 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
1371 (!onlyContiguous || firstUnit<=0x1ff);
1372 }
1373 }
1374 }
1375
/**
 * Iterates backward from p (not before start) over code points until one
 * satisfies hasCompBoundaryBefore(), and returns the start of that code point.
 */
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator cursor(normTrie, start, p);
    for(;;) {
        uint16_t norm16=cursor.previous16();
        if(hasCompBoundaryBefore(cursor.codePoint, norm16)) {
            // We could also test hasCompBoundaryAfter() and return cursor.codePointLimit,
            // but that's probably not worth the extra cost.
            return cursor.codePointStart;
        }
    }
}
1386
/**
 * Iterates forward from p toward limit over code points until one
 * satisfies hasCompBoundaryBefore(), and returns the start of that code point.
 */
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator cursor(normTrie, p, limit);
    for(;;) {
        uint16_t norm16=cursor.next16();
        if(hasCompBoundaryBefore(cursor.codePoint, norm16)) {
            return cursor.codePointStart;
        }
    }
}
1395
1396 class FCDTrieSingleton : public UTrie2Singleton {
1397 public:
FCDTrieSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1398 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1399 UTrie2Singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1400 UTrie2 *getInstance(UErrorCode &errorCode) {
1401 return UTrie2Singleton::getInstance(createInstance, this, errorCode);
1402 }
1403 static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1404 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1405 if(value!=0) {
1406 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
1407 }
1408 return U_SUCCESS(errorCode);
1409 }
1410
1411 Normalizer2Impl &impl;
1412 UTrie2 *newFCDTrie;
1413 UErrorCode &errorCode;
1414 };
1415
1416 U_CDECL_BEGIN
1417
1418 // Set the FCD value for a range of same-norm16 characters.
1419 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1420 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1421 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
1422 }
1423
1424 // Collect (OR together) the FCD values for a range of supplementary characters,
1425 // for their lead surrogate code unit.
1426 static UBool U_CALLCONV
enumRangeOrValue(const void * context,UChar32,UChar32,uint32_t value)1427 enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1428 *((uint32_t *)context)|=value;
1429 return TRUE;
1430 }
1431
1432 U_CDECL_END
1433
createInstance(const void * context,UErrorCode & errorCode)1434 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1435 FCDTrieSingleton *me=(FCDTrieSingleton *)context;
1436 me->newFCDTrie=utrie2_open(0, 0, &errorCode);
1437 if(U_SUCCESS(errorCode)) {
1438 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
1439 for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1440 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
1441 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
1442 if(oredValue!=0) {
1443 // Set a "bad" value for makeFCD() to break the quick check loop
1444 // and look up the value for the supplementary code point.
1445 // If there is any lccc, then set the worst-case lccc of 1.
1446 // The ORed-together value's tccc is already the worst case.
1447 if(oredValue>0xff) {
1448 oredValue=0x100|(oredValue&0xff);
1449 }
1450 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
1451 }
1452 }
1453 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1454 if(U_SUCCESS(errorCode)) {
1455 return me->newFCDTrie;
1456 }
1457 }
1458 utrie2_close(me->newFCDTrie);
1459 return NULL;
1460 }
1461
setFCD16FromNorm16(UChar32 start,UChar32 end,uint16_t norm16,UTrie2 * newFCDTrie,UErrorCode & errorCode) const1462 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1463 UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
1464 // Only loops for 1:1 algorithmic mappings.
1465 for(;;) {
1466 if(norm16>=MIN_NORMAL_MAYBE_YES) {
1467 norm16&=0xff;
1468 norm16|=norm16<<8;
1469 } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
1470 // no decomposition or Hangul syllable, all zeros
1471 break;
1472 } else if(limitNoNo<=norm16) {
1473 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
1474 if(start==end) {
1475 start+=delta;
1476 norm16=getNorm16(start);
1477 } else {
1478 // the same delta leads from different original characters to different mappings
1479 do {
1480 UChar32 c=start+delta;
1481 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
1482 } while(++start<=end);
1483 break;
1484 }
1485 } else {
1486 // c decomposes, get everything from the variable-length extra data
1487 const uint16_t *mapping=getMapping(norm16);
1488 uint16_t firstUnit=*mapping;
1489 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1490 // A character that is deleted (maps to an empty string) must
1491 // get the worst-case lccc and tccc values because arbitrary
1492 // characters on both sides will become adjacent.
1493 norm16=0x1ff;
1494 } else {
1495 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1496 norm16=mapping[1]&0xff00; // lccc
1497 } else {
1498 norm16=0;
1499 }
1500 norm16|=firstUnit>>8; // tccc
1501 }
1502 }
1503 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
1504 break;
1505 }
1506 }
1507
getFCDTrie(UErrorCode & errorCode) const1508 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
1509 // Logically const: Synchronized instantiation.
1510 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1511 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
1512 }
1513
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Processes [src..limit[; if limit==NULL the input is treated as
// NUL-terminated and limit is computed with u_strchr().
// With a buffer, text that is in proper order is appended as is, and text
// around an ordering violation is decomposed locally with decomposeShort().
// Without a buffer, the function returns prevBoundary at the first
// ordering violation.
// Otherwise it returns the final value of src, which is limit unless
// an append or decomposeShort() call failed.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character.
    // A negative value is the bitwise complement of a code unit below
    // MIN_CCC_LCCC_CP whose FCD lookup has been deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16FromSingleLead(*(src-1));
            if(prevFCD16>1) {
                // tccc>1: the boundary is before the previous character, not after it.
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UTrie2 *trie=fcdTrie();

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the FCD lookup: remember the code unit as a negative value.
                prevFCD16=~c;
                ++src;
            } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
                prevFCD16=fcd16;
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: try to assemble the supplementary
                // code point from the adjacent unit and look up its FCD value.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        // Back up so that src points to the start of the pair.
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if((fcd16=getFCD16(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    // tccc>1: the boundary is before the previous character.
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        // Advance src past the current character (c).
        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                // tccc<=1: there is a safe boundary after this character.
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            // Continue after the decomposed piece, with no pending trailing ccc.
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
1660
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1661 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1662 UBool doMakeFCD,
1663 UnicodeString &safeMiddle,
1664 ReorderingBuffer &buffer,
1665 UErrorCode &errorCode) const {
1666 if(!buffer.isEmpty()) {
1667 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1668 if(src!=firstBoundaryInSrc) {
1669 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1670 buffer.getLimit());
1671 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1672 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1673 buffer.removeSuffix(destSuffixLength);
1674 safeMiddle=middle;
1675 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1676 const UChar *middleStart=middle.getBuffer();
1677 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1678 if(U_FAILURE(errorCode)) {
1679 return;
1680 }
1681 src=firstBoundaryInSrc;
1682 }
1683 }
1684 if(doMakeFCD) {
1685 makeFCD(src, limit, &buffer, errorCode);
1686 } else {
1687 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1688 limit=u_strchr(src, 0);
1689 }
1690 buffer.appendZeroCC(src, limit, errorCode);
1691 }
1692 }
1693
/**
 * Iterates backward from p (not before start) over code points until one
 * has an FCD value <=0xff (lccc==0), and returns the start of that code point.
 */
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
    BackwardUTrie2StringIterator cursor(fcdTrie(), start, p);
    // Skip code points with a non-zero lead combining class.
    while(cursor.previous16()>0xff) {}
    return cursor.codePointStart;
}
1702
/**
 * Iterates forward from p toward limit over code points until one
 * has an FCD value <=0xff (lccc==0), and returns the start of that code point.
 */
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
    ForwardUTrie2StringIterator cursor(fcdTrie(), p, limit);
    // Skip code points with a non-zero lead combining class.
    while(cursor.next16()>0xff) {}
    return cursor.codePointStart;
}
1711
1712 // CanonicalIterator data -------------------------------------------------- ***
1713
// Initializes the members: trie with a new trie from utrie2_open(0, 0, ...),
// and the canonStartSets vector.
// Both initializers report failures through errorCode, which the caller
// must check before using the object.
// NOTE(review): uhash_deleteUObject is presumably installed as the vector's
// element deleter so that the vector owns its UnicodeSet elements --
// confirm against the UVector constructor.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
        canonStartSets(uhash_deleteUObject, NULL, errorCode) {}
1717
// Closes the trie that the constructor opened.
// The canonStartSets member is destroyed by its own destructor.
CanonIterData::~CanonIterData() {
    utrie2_close(trie);
}
1721
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)1722 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1723 uint32_t canonValue=utrie2_get32(trie, decompLead);
1724 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1725 // origin is the first character whose decomposition starts with
1726 // the character for which we are setting the value.
1727 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1728 } else {
1729 // origin is not the first character, or it is U+0000.
1730 UnicodeSet *set;
1731 if((canonValue&CANON_HAS_SET)==0) {
1732 set=new UnicodeSet;
1733 if(set==NULL) {
1734 errorCode=U_MEMORY_ALLOCATION_ERROR;
1735 return;
1736 }
1737 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1738 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1739 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1740 canonStartSets.addElement(set, errorCode);
1741 if(firstOrigin!=0) {
1742 set->add(firstOrigin);
1743 }
1744 } else {
1745 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1746 }
1747 set->add(origin);
1748 }
1749 }
1750
1751 class CanonIterDataSingleton {
1752 public:
CanonIterDataSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1753 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1754 singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1755 CanonIterData *getInstance(UErrorCode &errorCode) {
1756 void *duplicate;
1757 CanonIterData *instance=
1758 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1759 delete (CanonIterData *)duplicate;
1760 return instance;
1761 }
1762 static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1763 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1764 if(value!=0) {
1765 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1766 }
1767 return U_SUCCESS(errorCode);
1768 }
1769
1770 private:
1771 SimpleSingleton &singleton;
1772 Normalizer2Impl &impl;
1773 CanonIterData *newData;
1774 UErrorCode &errorCode;
1775 };
1776
1777 U_CDECL_BEGIN
1778
1779 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1780 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1781 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1782 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1783 }
1784
1785 U_CDECL_END
1786
createInstance(const void * context,UErrorCode & errorCode)1787 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1788 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1789 me->newData=new CanonIterData(errorCode);
1790 if(me->newData==NULL) {
1791 errorCode=U_MEMORY_ALLOCATION_ERROR;
1792 return NULL;
1793 }
1794 if(U_SUCCESS(errorCode)) {
1795 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1796 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1797 if(U_SUCCESS(errorCode)) {
1798 return me->newData;
1799 }
1800 }
1801 delete me->newData;
1802 return NULL;
1803 }
1804
/**
 * Updates newData for the code points [start..end], which all share the
 * given norm16 value.
 * For each code point this may set CANON_NOT_SEGMENT_STARTER and/or
 * CANON_HAS_COMPOSITIONS in its trie value, add the code point to the
 * start set of the first code point of its decomposition, and set
 * CANON_NOT_SEGMENT_STARTER on the remaining code points of a one-way mapping.
 *
 * @param start     first code point of the range
 * @param end       last code point of the range (inclusive)
 * @param norm16    the norm16 value shared by the range
 * @param newData   the data under construction
 * @param errorCode receives errors from the trie and start-set updates
 */
void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
                                                  CanonIterData &newData,
                                                  UErrorCode &errorCode) const {
    if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
        // Inert, or 2-way mapping (including Hangul syllable).
        // We do not write a canonStartSet for any yesNo character.
        // Composites from 2-way mappings are added at runtime from the
        // starter's compositions list, and the other characters in
        // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
        // "maybe" characters.
        return;
    }
    for(UChar32 c=start; c<=end; ++c) {
        // Accumulate flags in newValue; the trie is written at the end only if the value changed.
        uint32_t oldValue=utrie2_get32(newData.trie, c);
        uint32_t newValue=oldValue;
        if(norm16>=minMaybeYes) {
            // not a segment starter if it occurs in a decomposition or has cc!=0
            newValue|=CANON_NOT_SEGMENT_STARTER;
            if(norm16<MIN_NORMAL_MAYBE_YES) {
                newValue|=CANON_HAS_COMPOSITIONS;
            }
        } else if(norm16<minYesNo) {
            newValue|=CANON_HAS_COMPOSITIONS;
        } else {
            // c has a one-way decomposition
            UChar32 c2=c;
            uint16_t norm16_2=norm16;
            // Follow 1:1 algorithmic mappings to the final code point c2 and its norm16_2.
            while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
                c2=mapAlgorithmic(c2, norm16_2);
                norm16_2=getNorm16(c2);
            }
            if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                // c decomposes, get everything from the variable-length extra data
                const uint16_t *mapping=getMapping(norm16_2);
                uint16_t firstUnit=*mapping++;
                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // c==c2 means that no algorithmic mapping was followed,
                    // so the mapping data belongs to c itself.
                    if(c==c2 && (*mapping&0xff)!=0) {
                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                    }
                    // Skip the ccc/lccc word.
                    ++mapping;
                }
                // Skip empty mappings (no characters in the decomposition).
                if(length!=0) {
                    // add c to first code point's start set
                    int32_t i=0;
                    U16_NEXT_UNSAFE(mapping, i, c2);
                    newData.addToStartSet(c, c2, errorCode);
                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                    // one-way mapping. A 2-way mapping is possible here after
                    // intermediate algorithmic mapping.
                    if(norm16_2>=minNoNo) {
                        while(i<length) {
                            U16_NEXT_UNSAFE(mapping, i, c2);
                            uint32_t c2Value=utrie2_get32(newData.trie, c2);
                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
                                             &errorCode);
                            }
                        }
                    }
                }
            } else {
                // c decomposed to c2 algorithmically; c has cc==0
                newData.addToStartSet(c, c2, errorCode);
            }
        }
        if(newValue!=oldValue) {
            utrie2_set32(newData.trie, c, newValue, &errorCode);
        }
    }
}
1877
ensureCanonIterData(UErrorCode & errorCode) const1878 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1879 // Logically const: Synchronized instantiation.
1880 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1881 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1882 return U_SUCCESS(errorCode);
1883 }
1884
getCanonValue(UChar32 c) const1885 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1886 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1887 }
1888
getCanonStartSet(int32_t n) const1889 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1890 return *(const UnicodeSet *)(
1891 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1892 }
1893
isCanonSegmentStarter(UChar32 c) const1894 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1895 return getCanonValue(c)>=0;
1896 }
1897
getCanonStartSet(UChar32 c,UnicodeSet & set) const1898 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1899 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1900 if(canonValue==0) {
1901 return FALSE;
1902 }
1903 set.clear();
1904 int32_t value=canonValue&CANON_VALUE_MASK;
1905 if((canonValue&CANON_HAS_SET)!=0) {
1906 set.addAll(getCanonStartSet(value));
1907 } else if(value!=0) {
1908 set.add(value);
1909 }
1910 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1911 uint16_t norm16=getNorm16(c);
1912 if(norm16==JAMO_L) {
1913 UChar32 syllable=
1914 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1915 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1916 } else {
1917 addComposites(getCompositionsList(norm16), set);
1918 }
1919 }
1920 return TRUE;
1921 }
1922
1923 U_NAMESPACE_END
1924
1925 // Normalizer2 data swapping ----------------------------------------------- ***
1926
1927 U_NAMESPACE_USE
1928
1929 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1930 unorm2_swap(const UDataSwapper *ds,
1931 const void *inData, int32_t length, void *outData,
1932 UErrorCode *pErrorCode) {
1933 const UDataInfo *pInfo;
1934 int32_t headerSize;
1935
1936 const uint8_t *inBytes;
1937 uint8_t *outBytes;
1938
1939 const int32_t *inIndexes;
1940 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1941
1942 int32_t i, offset, nextOffset, size;
1943
1944 /* udata_swapDataHeader checks the arguments */
1945 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1946 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1947 return 0;
1948 }
1949
1950 /* check data format and format version */
1951 pInfo=(const UDataInfo *)((const char *)inData+4);
1952 if(!(
1953 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1954 pInfo->dataFormat[1]==0x72 &&
1955 pInfo->dataFormat[2]==0x6d &&
1956 pInfo->dataFormat[3]==0x32 &&
1957 pInfo->formatVersion[0]==1
1958 )) {
1959 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1960 pInfo->dataFormat[0], pInfo->dataFormat[1],
1961 pInfo->dataFormat[2], pInfo->dataFormat[3],
1962 pInfo->formatVersion[0]);
1963 *pErrorCode=U_UNSUPPORTED_ERROR;
1964 return 0;
1965 }
1966
1967 inBytes=(const uint8_t *)inData+headerSize;
1968 outBytes=(uint8_t *)outData+headerSize;
1969
1970 inIndexes=(const int32_t *)inBytes;
1971
1972 if(length>=0) {
1973 length-=headerSize;
1974 if(length<(int32_t)sizeof(indexes)) {
1975 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1976 length);
1977 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1978 return 0;
1979 }
1980 }
1981
1982 /* read the first few indexes */
1983 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
1984 indexes[i]=udata_readInt32(ds, inIndexes[i]);
1985 }
1986
1987 /* get the total length of the data */
1988 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1989
1990 if(length>=0) {
1991 if(length<size) {
1992 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1993 length);
1994 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1995 return 0;
1996 }
1997
1998 /* copy the data for inaccessible bytes */
1999 if(inBytes!=outBytes) {
2000 uprv_memcpy(outBytes, inBytes, size);
2001 }
2002
2003 offset=0;
2004
2005 /* swap the int32_t indexes[] */
2006 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2007 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2008 offset=nextOffset;
2009
2010 /* swap the UTrie2 */
2011 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2012 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2013 offset=nextOffset;
2014
2015 /* swap the uint16_t extraData[] */
2016 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
2017 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2018 offset=nextOffset;
2019
2020 U_ASSERT(offset==size);
2021 }
2022
2023 return headerSize+size;
2024 }
2025
2026 #endif // !UCONFIG_NO_NORMALIZATION
2027