1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2impl.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "cmemory.h"
25 #include "mutex.h"
26 #include "normalizer2impl.h"
27 #include "uassert.h"
28 #include "uset_imp.h"
29 #include "utrie2.h"
30
31 U_NAMESPACE_BEGIN
32
33 // ReorderingBuffer -------------------------------------------------------- ***
34
init(int32_t destCapacity,UErrorCode & errorCode)35 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
36 int32_t length=str.length();
37 start=str.getBuffer(destCapacity);
38 if(start==NULL) {
39 // getBuffer() already did str.setToBogus()
40 errorCode=U_MEMORY_ALLOCATION_ERROR;
41 return FALSE;
42 }
43 limit=start+length;
44 remainingCapacity=str.getCapacity()-length;
45 reorderStart=start;
46 if(start==limit) {
47 lastCC=0;
48 } else {
49 setIterator();
50 lastCC=previousCC();
51 // Set reorderStart after the last code point with cc<=1 if there is one.
52 if(lastCC>1) {
53 while(previousCC()>1) {}
54 }
55 reorderStart=codePointLimit;
56 }
57 return TRUE;
58 }
59
equals(const UChar * otherStart,const UChar * otherLimit) const60 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
61 int32_t length=(int32_t)(limit-start);
62 return
63 length==(int32_t)(otherLimit-otherStart) &&
64 0==u_memcmp(start, otherStart, length);
65 }
66
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)67 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
68 if(remainingCapacity<2 && !resize(2, errorCode)) {
69 return FALSE;
70 }
71 if(lastCC<=cc || cc==0) {
72 limit[0]=U16_LEAD(c);
73 limit[1]=U16_TRAIL(c);
74 limit+=2;
75 lastCC=cc;
76 if(cc<=1) {
77 reorderStart=limit;
78 }
79 } else {
80 insert(c, cc);
81 }
82 remainingCapacity-=2;
83 return TRUE;
84 }
85
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)86 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
87 uint8_t leadCC, uint8_t trailCC,
88 UErrorCode &errorCode) {
89 if(length==0) {
90 return TRUE;
91 }
92 if(remainingCapacity<length && !resize(length, errorCode)) {
93 return FALSE;
94 }
95 remainingCapacity-=length;
96 if(lastCC<=leadCC || leadCC==0) {
97 if(trailCC<=1) {
98 reorderStart=limit+length;
99 } else if(leadCC<=1) {
100 reorderStart=limit+1; // Ok if not a code point boundary.
101 }
102 const UChar *sLimit=s+length;
103 do { *limit++=*s++; } while(s!=sLimit);
104 lastCC=trailCC;
105 } else {
106 int32_t i=0;
107 UChar32 c;
108 U16_NEXT(s, i, length, c);
109 insert(c, leadCC); // insert first code point
110 while(i<length) {
111 U16_NEXT(s, i, length, c);
112 if(i<length) {
113 // s must be in NFD, otherwise we need to use getCC().
114 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
115 } else {
116 leadCC=trailCC;
117 }
118 append(c, leadCC, errorCode);
119 }
120 }
121 return TRUE;
122 }
123
appendZeroCC(UChar32 c,UErrorCode & errorCode)124 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
125 int32_t cpLength=U16_LENGTH(c);
126 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
127 return FALSE;
128 }
129 remainingCapacity-=cpLength;
130 if(cpLength==1) {
131 *limit++=(UChar)c;
132 } else {
133 limit[0]=U16_LEAD(c);
134 limit[1]=U16_TRAIL(c);
135 limit+=2;
136 }
137 lastCC=0;
138 reorderStart=limit;
139 return TRUE;
140 }
141
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)142 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
143 if(s==sLimit) {
144 return TRUE;
145 }
146 int32_t length=(int32_t)(sLimit-s);
147 if(remainingCapacity<length && !resize(length, errorCode)) {
148 return FALSE;
149 }
150 u_memcpy(limit, s, length);
151 limit+=length;
152 remainingCapacity-=length;
153 lastCC=0;
154 reorderStart=limit;
155 return TRUE;
156 }
157
remove()158 void ReorderingBuffer::remove() {
159 reorderStart=limit=start;
160 remainingCapacity=str.getCapacity();
161 lastCC=0;
162 }
163
removeSuffix(int32_t suffixLength)164 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
165 if(suffixLength<(limit-start)) {
166 limit-=suffixLength;
167 remainingCapacity+=suffixLength;
168 } else {
169 limit=start;
170 remainingCapacity=str.getCapacity();
171 }
172 lastCC=0;
173 reorderStart=limit;
174 }
175
resize(int32_t appendLength,UErrorCode & errorCode)176 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
177 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
178 int32_t length=(int32_t)(limit-start);
179 str.releaseBuffer(length);
180 int32_t newCapacity=length+appendLength;
181 int32_t doubleCapacity=2*str.getCapacity();
182 if(newCapacity<doubleCapacity) {
183 newCapacity=doubleCapacity;
184 }
185 if(newCapacity<256) {
186 newCapacity=256;
187 }
188 start=str.getBuffer(newCapacity);
189 if(start==NULL) {
190 // getBuffer() already did str.setToBogus()
191 errorCode=U_MEMORY_ALLOCATION_ERROR;
192 return FALSE;
193 }
194 reorderStart=start+reorderStartIndex;
195 limit=start+length;
196 remainingCapacity=str.getCapacity()-length;
197 return TRUE;
198 }
199
skipPrevious()200 void ReorderingBuffer::skipPrevious() {
201 codePointLimit=codePointStart;
202 UChar c=*--codePointStart;
203 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
204 --codePointStart;
205 }
206 }
207
previousCC()208 uint8_t ReorderingBuffer::previousCC() {
209 codePointLimit=codePointStart;
210 if(reorderStart>=codePointStart) {
211 return 0;
212 }
213 UChar32 c=*--codePointStart;
214 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
215 return 0;
216 }
217
218 UChar c2;
219 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
220 --codePointStart;
221 c=U16_GET_SUPPLEMENTARY(c2, c);
222 }
223 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
224 }
225
226 // Inserts c somewhere before the last character.
227 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)228 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
229 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
230 // insert c at codePointLimit, after the character with prevCC<=cc
231 UChar *q=limit;
232 UChar *r=limit+=U16_LENGTH(c);
233 do {
234 *--r=*--q;
235 } while(codePointLimit!=q);
236 writeCodePoint(q, c);
237 if(cc<=1) {
238 reorderStart=r;
239 }
240 }
241
242 // Normalizer2Impl --------------------------------------------------------- ***
243
~Normalizer2Impl()244 Normalizer2Impl::~Normalizer2Impl() {
245 udata_close(memory);
246 utrie2_close(normTrie);
247 UTrie2Singleton(fcdTrieSingleton).deleteInstance();
248 }
249
250 UBool U_CALLCONV
isAcceptable(void * context,const char * type,const char * name,const UDataInfo * pInfo)251 Normalizer2Impl::isAcceptable(void *context,
252 const char *type, const char *name,
253 const UDataInfo *pInfo) {
254 if(
255 pInfo->size>=20 &&
256 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
257 pInfo->charsetFamily==U_CHARSET_FAMILY &&
258 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
259 pInfo->dataFormat[1]==0x72 &&
260 pInfo->dataFormat[2]==0x6d &&
261 pInfo->dataFormat[3]==0x32 &&
262 pInfo->formatVersion[0]==1
263 ) {
264 Normalizer2Impl *me=(Normalizer2Impl *)context;
265 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
266 return TRUE;
267 } else {
268 return FALSE;
269 }
270 }
271
272 void
load(const char * packageName,const char * name,UErrorCode & errorCode)273 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
274 if(U_FAILURE(errorCode)) {
275 return;
276 }
277 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
278 if(U_FAILURE(errorCode)) {
279 return;
280 }
281 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
282 const int32_t *inIndexes=(const int32_t *)inBytes;
283 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
284 if(indexesLength<=IX_MIN_MAYBE_YES) {
285 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
286 return;
287 }
288
289 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
290 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
291
292 minYesNo=inIndexes[IX_MIN_YES_NO];
293 minNoNo=inIndexes[IX_MIN_NO_NO];
294 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
295 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
296
297 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
298 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
299 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
300 inBytes+offset, nextOffset-offset, NULL,
301 &errorCode);
302 if(U_FAILURE(errorCode)) {
303 return;
304 }
305
306 offset=nextOffset;
307 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
308 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
309 }
310
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const311 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
312 UChar32 c;
313 if(cpStart==(cpLimit-1)) {
314 c=*cpStart;
315 } else {
316 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
317 }
318 uint16_t prevNorm16=getNorm16(c);
319 if(prevNorm16<=minYesNo) {
320 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
321 } else {
322 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
323 }
324 }
325
326 U_CDECL_BEGIN
327
328 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)329 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
330 /* add the start code point to the USet */
331 const USetAdder *sa=(const USetAdder *)context;
332 sa->add(sa->set, start);
333 return TRUE;
334 }
335
336 U_CDECL_END
337
338 void
addPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const339 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
340 /* add the start code point of each same-value range of each trie */
341 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
342
343 /* add Hangul LV syllables and LV+1 because of skippables */
344 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
345 sa->add(sa->set, c);
346 sa->add(sa->set, c+1);
347 }
348 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
349 }
350
351 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const352 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
353 UChar32 minNeedDataCP,
354 ReorderingBuffer *buffer,
355 UErrorCode &errorCode) const {
356 // Make some effort to support NUL-terminated strings reasonably.
357 // Take the part of the fast quick check loop that does not look up
358 // data and check the first part of the string.
359 // After this prefix, determine the string length to simplify the rest
360 // of the code.
361 const UChar *prevSrc=src;
362 UChar c;
363 while((c=*src++)<minNeedDataCP && c!=0) {}
364 // Back out the last character for full processing.
365 // Copy this prefix.
366 if(--src!=prevSrc) {
367 if(buffer!=NULL) {
368 buffer->appendZeroCC(prevSrc, src, errorCode);
369 }
370 }
371 return src;
372 }
373
374 // Dual functionality:
375 // buffer!=NULL: normalize
376 // buffer==NULL: isNormalized/spanQuickCheckYes
377 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const378 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
379 ReorderingBuffer *buffer,
380 UErrorCode &errorCode) const {
381 UChar32 minNoCP=minDecompNoCP;
382 if(limit==NULL) {
383 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
384 if(U_FAILURE(errorCode)) {
385 return src;
386 }
387 limit=u_strchr(src, 0);
388 }
389
390 const UChar *prevSrc;
391 UChar32 c=0;
392 uint16_t norm16=0;
393
394 // only for quick check
395 const UChar *prevBoundary=src;
396 uint8_t prevCC=0;
397
398 for(;;) {
399 // count code units below the minimum or with irrelevant data for the quick check
400 for(prevSrc=src; src!=limit;) {
401 if( (c=*src)<minNoCP ||
402 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
403 ) {
404 ++src;
405 } else if(!U16_IS_SURROGATE(c)) {
406 break;
407 } else {
408 UChar c2;
409 if(U16_IS_SURROGATE_LEAD(c)) {
410 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
411 c=U16_GET_SUPPLEMENTARY(c, c2);
412 }
413 } else /* trail surrogate */ {
414 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
415 --src;
416 c=U16_GET_SUPPLEMENTARY(c2, c);
417 }
418 }
419 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
420 src+=U16_LENGTH(c);
421 } else {
422 break;
423 }
424 }
425 }
426 // copy these code units all at once
427 if(src!=prevSrc) {
428 if(buffer!=NULL) {
429 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
430 break;
431 }
432 } else {
433 prevCC=0;
434 prevBoundary=src;
435 }
436 }
437 if(src==limit) {
438 break;
439 }
440
441 // Check one above-minimum, relevant code point.
442 src+=U16_LENGTH(c);
443 if(buffer!=NULL) {
444 if(!decompose(c, norm16, *buffer, errorCode)) {
445 break;
446 }
447 } else {
448 if(isDecompYes(norm16)) {
449 uint8_t cc=getCCFromYesOrMaybe(norm16);
450 if(prevCC<=cc || cc==0) {
451 prevCC=cc;
452 if(cc<=1) {
453 prevBoundary=src;
454 }
455 continue;
456 }
457 }
458 return prevBoundary; // "no" or cc out of order
459 }
460 }
461 return src;
462 }
463
464 // Decompose a short piece of text which is likely to contain characters that
465 // fail the quick check loop and/or where the quick check loop's overhead
466 // is unlikely to be amortized.
467 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const468 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
469 ReorderingBuffer &buffer,
470 UErrorCode &errorCode) const {
471 while(src<limit) {
472 UChar32 c;
473 uint16_t norm16;
474 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
475 if(!decompose(c, norm16, buffer, errorCode)) {
476 return FALSE;
477 }
478 }
479 return TRUE;
480 }
481
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const482 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
483 ReorderingBuffer &buffer,
484 UErrorCode &errorCode) const {
485 // Only loops for 1:1 algorithmic mappings.
486 for(;;) {
487 // get the decomposition and the lead and trail cc's
488 if(isDecompYes(norm16)) {
489 // c does not decompose
490 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
491 } else if(isHangul(norm16)) {
492 // Hangul syllable: decompose algorithmically
493 UChar jamos[3];
494 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
495 } else if(isDecompNoAlgorithmic(norm16)) {
496 c=mapAlgorithmic(c, norm16);
497 norm16=getNorm16(c);
498 } else {
499 // c decomposes, get everything from the variable-length extra data
500 const uint16_t *mapping=getMapping(norm16);
501 uint16_t firstUnit=*mapping++;
502 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
503 uint8_t leadCC, trailCC;
504 trailCC=(uint8_t)(firstUnit>>8);
505 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
506 leadCC=(uint8_t)(*mapping++>>8);
507 } else {
508 leadCC=0;
509 }
510 return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
511 }
512 }
513 }
514
515 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const516 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
517 const UChar *decomp=NULL;
518 uint16_t norm16;
519 for(;;) {
520 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
521 // c does not decompose
522 return decomp;
523 } else if(isHangul(norm16)) {
524 // Hangul syllable: decompose algorithmically
525 length=Hangul::decompose(c, buffer);
526 return buffer;
527 } else if(isDecompNoAlgorithmic(norm16)) {
528 c=mapAlgorithmic(c, norm16);
529 decomp=buffer;
530 length=0;
531 U16_APPEND_UNSAFE(buffer, length, c);
532 } else {
533 // c decomposes, get everything from the variable-length extra data
534 const uint16_t *mapping=getMapping(norm16);
535 uint16_t firstUnit=*mapping++;
536 length=firstUnit&MAPPING_LENGTH_MASK;
537 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
538 ++mapping;
539 }
540 return (const UChar *)mapping;
541 }
542 }
543 }
544
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const545 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
546 UBool doDecompose,
547 ReorderingBuffer &buffer,
548 UErrorCode &errorCode) const {
549 if(doDecompose) {
550 decompose(src, limit, &buffer, errorCode);
551 return;
552 }
553 // Just merge the strings at the boundary.
554 ForwardUTrie2StringIterator iter(normTrie, src, limit);
555 uint8_t firstCC, prevCC, cc;
556 firstCC=prevCC=cc=getCC(iter.next16());
557 while(cc!=0) {
558 prevCC=cc;
559 cc=getCC(iter.next16());
560 };
561 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
562 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
563 }
564
565 // Note: hasDecompBoundary() could be implemented as aliases to
566 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
567 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const568 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
569 for(;;) {
570 if(c<minDecompNoCP) {
571 return TRUE;
572 }
573 uint16_t norm16=getNorm16(c);
574 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
575 return TRUE;
576 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
577 return FALSE; // ccc!=0
578 } else if(isDecompNoAlgorithmic(norm16)) {
579 c=mapAlgorithmic(c, norm16);
580 } else {
581 // c decomposes, get everything from the variable-length extra data
582 const uint16_t *mapping=getMapping(norm16);
583 uint16_t firstUnit=*mapping++;
584 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
585 return FALSE;
586 }
587 if(!before) {
588 // decomp after-boundary: same as hasFCDBoundaryAfter(),
589 // fcd16<=1 || trailCC==0
590 if(firstUnit>0x1ff) {
591 return FALSE; // trailCC>1
592 }
593 if(firstUnit<=0xff) {
594 return TRUE; // trailCC==0
595 }
596 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
597 }
598 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
599 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
600 }
601 }
602 }
603
604 /*
605 * Finds the recomposition result for
606 * a forward-combining "lead" character,
607 * specified with a pointer to its compositions list,
608 * and a backward-combining "trail" character.
609 *
610 * If the lead and trail characters combine, then this function returns
611 * the following "compositeAndFwd" value:
612 * Bits 21..1 composite character
613 * Bit 0 set if the composite is a forward-combining starter
614 * otherwise it returns -1.
615 *
616 * The compositions list has (trail, compositeAndFwd) pair entries,
617 * encoded as either pairs or triples of 16-bit units.
618 * The last entry has the high bit of its first unit set.
619 *
620 * The list is sorted by ascending trail characters (there are no duplicates).
621 * A linear search is used.
622 *
623 * See normalizer2impl.h for a more detailed description
624 * of the compositions list format.
625 */
combine(const uint16_t * list,UChar32 trail)626 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
627 uint16_t key1, firstUnit;
628 if(trail<COMP_1_TRAIL_LIMIT) {
629 // trail character is 0..33FF
630 // result entry may have 2 or 3 units
631 key1=(uint16_t)(trail<<1);
632 while(key1>(firstUnit=*list)) {
633 list+=2+(firstUnit&COMP_1_TRIPLE);
634 }
635 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
636 if(firstUnit&COMP_1_TRIPLE) {
637 return ((int32_t)list[1]<<16)|list[2];
638 } else {
639 return list[1];
640 }
641 }
642 } else {
643 // trail character is 3400..10FFFF
644 // result entry has 3 units
645 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
646 ((trail>>COMP_1_TRAIL_SHIFT))&
647 ~COMP_1_TRIPLE);
648 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
649 uint16_t secondUnit;
650 for(;;) {
651 if(key1>(firstUnit=*list)) {
652 list+=2+(firstUnit&COMP_1_TRIPLE);
653 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
654 if(key2>(secondUnit=list[1])) {
655 if(firstUnit&COMP_1_LAST_TUPLE) {
656 break;
657 } else {
658 list+=3;
659 }
660 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
661 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
662 } else {
663 break;
664 }
665 } else {
666 break;
667 }
668 }
669 }
670 return -1;
671 }
672
673 /*
674 * Recomposes the buffer text starting at recomposeStartIndex
675 * (which is in NFD - decomposed and canonically ordered),
676 * and truncates the buffer contents.
677 *
678 * Note that recomposition never lengthens the text:
679 * Any character consists of either one or two code units;
680 * a composition may contain at most one more code unit than the original starter,
681 * while the combining mark that is removed has at least one code unit.
682 */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const683 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
684 UBool onlyContiguous) const {
685 UChar *p=buffer.getStart()+recomposeStartIndex;
686 UChar *limit=buffer.getLimit();
687 if(p==limit) {
688 return;
689 }
690
691 UChar *starter, *pRemove, *q, *r;
692 const uint16_t *compositionsList;
693 UChar32 c, compositeAndFwd;
694 uint16_t norm16;
695 uint8_t cc, prevCC;
696 UBool starterIsSupplementary;
697
698 // Some of the following variables are not used until we have a forward-combining starter
699 // and are only initialized now to avoid compiler warnings.
700 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
701 starter=NULL;
702 starterIsSupplementary=FALSE;
703 prevCC=0;
704
705 for(;;) {
706 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
707 cc=getCCFromYesOrMaybe(norm16);
708 if( // this character combines backward and
709 isMaybe(norm16) &&
710 // we have seen a starter that combines forward and
711 compositionsList!=NULL &&
712 // the backward-combining character is not blocked
713 (prevCC<cc || prevCC==0)
714 ) {
715 if(isJamoVT(norm16)) {
716 // c is a Jamo V/T, see if we can compose it with the previous character.
717 if(c<Hangul::JAMO_T_BASE) {
718 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
719 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
720 if(prev<Hangul::JAMO_L_COUNT) {
721 pRemove=p-1;
722 UChar syllable=(UChar)
723 (Hangul::HANGUL_BASE+
724 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
725 Hangul::JAMO_T_COUNT);
726 UChar t;
727 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
728 ++p;
729 syllable+=t; // The next character was a Jamo T.
730 }
731 *starter=syllable;
732 // remove the Jamo V/T
733 q=pRemove;
734 r=p;
735 while(r<limit) {
736 *q++=*r++;
737 }
738 limit=q;
739 p=pRemove;
740 }
741 }
742 /*
743 * No "else" for Jamo T:
744 * Since the input is in NFD, there are no Hangul LV syllables that
745 * a Jamo T could combine with.
746 * All Jamo Ts are combined above when handling Jamo Vs.
747 */
748 if(p==limit) {
749 break;
750 }
751 compositionsList=NULL;
752 continue;
753 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
754 // The starter and the combining mark (c) do combine.
755 UChar32 composite=compositeAndFwd>>1;
756
757 // Replace the starter with the composite, remove the combining mark.
758 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
759 if(starterIsSupplementary) {
760 if(U_IS_SUPPLEMENTARY(composite)) {
761 // both are supplementary
762 starter[0]=U16_LEAD(composite);
763 starter[1]=U16_TRAIL(composite);
764 } else {
765 *starter=(UChar)composite;
766 // The composite is shorter than the starter,
767 // move the intermediate characters forward one.
768 starterIsSupplementary=FALSE;
769 q=starter+1;
770 r=q+1;
771 while(r<pRemove) {
772 *q++=*r++;
773 }
774 --pRemove;
775 }
776 } else if(U_IS_SUPPLEMENTARY(composite)) {
777 // The composite is longer than the starter,
778 // move the intermediate characters back one.
779 starterIsSupplementary=TRUE;
780 ++starter; // temporarily increment for the loop boundary
781 q=pRemove;
782 r=++pRemove;
783 while(starter<q) {
784 *--r=*--q;
785 }
786 *starter=U16_TRAIL(composite);
787 *--starter=U16_LEAD(composite); // undo the temporary increment
788 } else {
789 // both are on the BMP
790 *starter=(UChar)composite;
791 }
792
793 /* remove the combining mark by moving the following text over it */
794 if(pRemove<p) {
795 q=pRemove;
796 r=p;
797 while(r<limit) {
798 *q++=*r++;
799 }
800 limit=q;
801 p=pRemove;
802 }
803 // Keep prevCC because we removed the combining mark.
804
805 if(p==limit) {
806 break;
807 }
808 // Is the composite a starter that combines forward?
809 if(compositeAndFwd&1) {
810 compositionsList=
811 getCompositionsListForComposite(getNorm16(composite));
812 } else {
813 compositionsList=NULL;
814 }
815
816 // We combined; continue with looking for compositions.
817 continue;
818 }
819 }
820
821 // no combination this time
822 prevCC=cc;
823 if(p==limit) {
824 break;
825 }
826
827 // If c did not combine, then check if it is a starter.
828 if(cc==0) {
829 // Found a new starter.
830 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
831 // It may combine with something, prepare for it.
832 if(U_IS_BMP(c)) {
833 starterIsSupplementary=FALSE;
834 starter=p-1;
835 } else {
836 starterIsSupplementary=TRUE;
837 starter=p-2;
838 }
839 }
840 } else if(onlyContiguous) {
841 // FCC: no discontiguous compositions; any intervening character blocks.
842 compositionsList=NULL;
843 }
844 }
845 buffer.setReorderingLimit(limit);
846 }
847
848 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
849 // doCompose: normalize
850 // !doCompose: isNormalized (buffer must be empty and initialized)
851 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const852 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
853 UBool onlyContiguous,
854 UBool doCompose,
855 ReorderingBuffer &buffer,
856 UErrorCode &errorCode) const {
857 UChar32 minNoMaybeCP=minCompNoMaybeCP;
858 if(limit==NULL) {
859 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
860 doCompose ? &buffer : NULL,
861 errorCode);
862 if(U_FAILURE(errorCode)) {
863 return FALSE;
864 }
865 limit=u_strchr(src, 0);
866 }
867
868 /*
869 * prevBoundary points to the last character before the current one
870 * that has a composition boundary before it with ccc==0 and quick check "yes".
871 * Keeping track of prevBoundary saves us looking for a composition boundary
872 * when we find a "no" or "maybe".
873 *
874 * When we back out from prevSrc back to prevBoundary,
875 * then we also remove those same characters (which had been simply copied
876 * or canonically-order-inserted) from the ReorderingBuffer.
877 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
878 * must correspond 1:1 to destination units at the end of the destination buffer.
879 */
880 const UChar *prevBoundary=src;
881 const UChar *prevSrc;
882 UChar32 c=0;
883 uint16_t norm16=0;
884
885 // only for isNormalized
886 uint8_t prevCC=0;
887
888 for(;;) {
889 // count code units below the minimum or with irrelevant data for the quick check
890 for(prevSrc=src; src!=limit;) {
891 if( (c=*src)<minNoMaybeCP ||
892 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
893 ) {
894 ++src;
895 } else if(!U16_IS_SURROGATE(c)) {
896 break;
897 } else {
898 UChar c2;
899 if(U16_IS_SURROGATE_LEAD(c)) {
900 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
901 c=U16_GET_SUPPLEMENTARY(c, c2);
902 }
903 } else /* trail surrogate */ {
904 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
905 --src;
906 c=U16_GET_SUPPLEMENTARY(c2, c);
907 }
908 }
909 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
910 src+=U16_LENGTH(c);
911 } else {
912 break;
913 }
914 }
915 }
916 // copy these code units all at once
917 if(src!=prevSrc) {
918 if(doCompose) {
919 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
920 break;
921 }
922 } else {
923 prevCC=0;
924 }
925 if(src==limit) {
926 break;
927 }
928 // Set prevBoundary to the last character in the quick check loop.
929 prevBoundary=src-1;
930 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
931 U16_IS_LEAD(*(prevBoundary-1))
932 ) {
933 --prevBoundary;
934 }
935 // The start of the current character (c).
936 prevSrc=src;
937 } else if(src==limit) {
938 break;
939 }
940
941 src+=U16_LENGTH(c);
942 /*
943 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
944 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
945 * or has ccc!=0.
946 * Check for Jamo V/T, then for regular characters.
947 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
948 */
949 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
950 UChar prev=*(prevSrc-1);
951 UBool needToDecompose=FALSE;
952 if(c<Hangul::JAMO_T_BASE) {
953 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
954 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
955 if(prev<Hangul::JAMO_L_COUNT) {
956 if(!doCompose) {
957 return FALSE;
958 }
959 UChar syllable=(UChar)
960 (Hangul::HANGUL_BASE+
961 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
962 Hangul::JAMO_T_COUNT);
963 UChar t;
964 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
965 ++src;
966 syllable+=t; // The next character was a Jamo T.
967 prevBoundary=src;
968 buffer.setLastChar(syllable);
969 continue;
970 }
971 // If we see L+V+x where x!=T then we drop to the slow path,
972 // decompose and recompose.
973 // This is to deal with NFKC finding normal L and V but a
974 // compatibility variant of a T. We need to either fully compose that
975 // combination here (which would complicate the code and may not work
976 // with strange custom data) or use the slow path -- or else our replacing
977 // two input characters (L+V) with one output character (LV syllable)
978 // would violate the invariant that [prevBoundary..prevSrc[ has the same
979 // length as what we appended to the buffer since prevBoundary.
980 needToDecompose=TRUE;
981 }
982 } else if(Hangul::isHangulWithoutJamoT(prev)) {
983 // c is a Jamo Trailing consonant,
984 // compose with previous Hangul LV that does not contain a Jamo T.
985 if(!doCompose) {
986 return FALSE;
987 }
988 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
989 prevBoundary=src;
990 continue;
991 }
992 if(!needToDecompose) {
993 // The Jamo V/T did not compose into a Hangul syllable.
994 if(doCompose) {
995 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
996 break;
997 }
998 } else {
999 prevCC=0;
1000 }
1001 continue;
1002 }
1003 }
1004 /*
1005 * Source buffer pointers:
1006 *
1007 * all done quick check current char not yet
1008 * "yes" but (c) processed
1009 * may combine
1010 * forward
1011 * [-------------[-------------[-------------[-------------[
1012 * | | | | |
1013 * orig. src prevBoundary prevSrc src limit
1014 *
1015 *
1016 * Destination buffer pointers inside the ReorderingBuffer:
1017 *
1018 * all done might take not filled yet
1019 * characters for
1020 * reordering
1021 * [-------------[-------------[-------------[
1022 * | | | |
1023 * start reorderStart limit |
1024 * +remainingCap.+
1025 */
1026 if(norm16>=MIN_YES_YES_WITH_CC) {
1027 uint8_t cc=(uint8_t)norm16; // cc!=0
1028 if( onlyContiguous && // FCC
1029 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1030 prevBoundary<prevSrc &&
1031 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1032 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1033 // passed the quick check "yes && ccc==0" test.
1034 // Check whether the last character was a "yesYes" or a "yesNo".
1035 // If a "yesNo", then we get its trailing ccc from its
1036 // mapping and check for canonical order.
1037 // All other cases are ok.
1038 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1039 ) {
1040 // Fails FCD test, need to decompose and contiguously recompose.
1041 if(!doCompose) {
1042 return FALSE;
1043 }
1044 } else if(doCompose) {
1045 if(!buffer.append(c, cc, errorCode)) {
1046 break;
1047 }
1048 continue;
1049 } else if(prevCC<=cc) {
1050 prevCC=cc;
1051 continue;
1052 } else {
1053 return FALSE;
1054 }
1055 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1056 return FALSE;
1057 }
1058
1059 /*
1060 * Find appropriate boundaries around this character,
1061 * decompose the source text from between the boundaries,
1062 * and recompose it.
1063 *
1064 * We may need to remove the last few characters from the ReorderingBuffer
1065 * to account for source text that was copied or appended
1066 * but needs to take part in the recomposition.
1067 */
1068
1069 /*
1070 * Find the last composition boundary in [prevBoundary..src[.
1071 * It is either the decomposition of the current character (at prevSrc),
1072 * or prevBoundary.
1073 */
1074 if(hasCompBoundaryBefore(c, norm16)) {
1075 prevBoundary=prevSrc;
1076 } else if(doCompose) {
1077 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1078 }
1079
1080 // Find the next composition boundary in [src..limit[ -
1081 // modifies src to point to the next starter.
1082 src=(UChar *)findNextCompBoundary(src, limit);
1083
1084 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1085 int32_t recomposeStartIndex=buffer.length();
1086 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1087 break;
1088 }
1089 recompose(buffer, recomposeStartIndex, onlyContiguous);
1090 if(!doCompose) {
1091 if(!buffer.equals(prevBoundary, src)) {
1092 return FALSE;
1093 }
1094 buffer.remove();
1095 prevCC=0;
1096 }
1097
1098 // Move to the next starter. We never need to look back before this point again.
1099 prevBoundary=src;
1100 }
1101 return TRUE;
1102 }
1103
1104 // Very similar to compose(): Make the same changes in both places if relevant.
1105 // pQCResult==NULL: spanQuickCheckYes
1106 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1107 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1108 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1109 UBool onlyContiguous,
1110 UNormalizationCheckResult *pQCResult) const {
1111 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1112 if(limit==NULL) {
1113 UErrorCode errorCode=U_ZERO_ERROR;
1114 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1115 limit=u_strchr(src, 0);
1116 }
1117
1118 /*
1119 * prevBoundary points to the last character before the current one
1120 * that has a composition boundary before it with ccc==0 and quick check "yes".
1121 */
1122 const UChar *prevBoundary=src;
1123 const UChar *prevSrc;
1124 UChar32 c=0;
1125 uint16_t norm16=0;
1126 uint8_t prevCC=0;
1127
1128 for(;;) {
1129 // count code units below the minimum or with irrelevant data for the quick check
1130 for(prevSrc=src;;) {
1131 if(src==limit) {
1132 return src;
1133 }
1134 if( (c=*src)<minNoMaybeCP ||
1135 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1136 ) {
1137 ++src;
1138 } else if(!U16_IS_SURROGATE(c)) {
1139 break;
1140 } else {
1141 UChar c2;
1142 if(U16_IS_SURROGATE_LEAD(c)) {
1143 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1144 c=U16_GET_SUPPLEMENTARY(c, c2);
1145 }
1146 } else /* trail surrogate */ {
1147 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1148 --src;
1149 c=U16_GET_SUPPLEMENTARY(c2, c);
1150 }
1151 }
1152 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1153 src+=U16_LENGTH(c);
1154 } else {
1155 break;
1156 }
1157 }
1158 }
1159 if(src!=prevSrc) {
1160 // Set prevBoundary to the last character in the quick check loop.
1161 prevBoundary=src-1;
1162 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1163 U16_IS_LEAD(*(prevBoundary-1))
1164 ) {
1165 --prevBoundary;
1166 }
1167 prevCC=0;
1168 // The start of the current character (c).
1169 prevSrc=src;
1170 }
1171
1172 src+=U16_LENGTH(c);
1173 /*
1174 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1175 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1176 * or has ccc!=0.
1177 */
1178 if(isMaybeOrNonZeroCC(norm16)) {
1179 uint8_t cc=getCCFromYesOrMaybe(norm16);
1180 if( onlyContiguous && // FCC
1181 cc!=0 &&
1182 prevCC==0 &&
1183 prevBoundary<prevSrc &&
1184 // prevCC==0 && prevBoundary<prevSrc tell us that
1185 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1186 // passed the quick check "yes && ccc==0" test.
1187 // Check whether the last character was a "yesYes" or a "yesNo".
1188 // If a "yesNo", then we get its trailing ccc from its
1189 // mapping and check for canonical order.
1190 // All other cases are ok.
1191 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1192 ) {
1193 // Fails FCD test.
1194 } else if(prevCC<=cc || cc==0) {
1195 prevCC=cc;
1196 if(norm16<MIN_YES_YES_WITH_CC) {
1197 if(pQCResult!=NULL) {
1198 *pQCResult=UNORM_MAYBE;
1199 } else {
1200 return prevBoundary;
1201 }
1202 }
1203 continue;
1204 }
1205 }
1206 if(pQCResult!=NULL) {
1207 *pQCResult=UNORM_NO;
1208 }
1209 return prevBoundary;
1210 }
1211 }
1212
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,ReorderingBuffer & buffer,UErrorCode & errorCode) const1213 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1214 UBool doCompose,
1215 UBool onlyContiguous,
1216 ReorderingBuffer &buffer,
1217 UErrorCode &errorCode) const {
1218 if(!buffer.isEmpty()) {
1219 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1220 if(src!=firstStarterInSrc) {
1221 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1222 buffer.getLimit());
1223 UnicodeString middle(lastStarterInDest,
1224 (int32_t)(buffer.getLimit()-lastStarterInDest));
1225 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastStarterInDest));
1226 middle.append(src, (int32_t)(firstStarterInSrc-src));
1227 const UChar *middleStart=middle.getBuffer();
1228 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1229 TRUE, buffer, errorCode);
1230 if(U_FAILURE(errorCode)) {
1231 return;
1232 }
1233 src=firstStarterInSrc;
1234 }
1235 }
1236 if(doCompose) {
1237 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1238 } else {
1239 buffer.appendZeroCC(src, limit, errorCode);
1240 }
1241 }
1242
1243 /**
1244 * Does c have a composition boundary before it?
1245 * True if its decomposition begins with a character that has
1246 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1247 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1248 * (isCompYesAndZeroCC()) so we need not decompose.
1249 */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1250 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1251 for(;;) {
1252 if(isCompYesAndZeroCC(norm16)) {
1253 return TRUE;
1254 } else if(isMaybeOrNonZeroCC(norm16)) {
1255 return FALSE;
1256 } else if(isDecompNoAlgorithmic(norm16)) {
1257 c=mapAlgorithmic(c, norm16);
1258 norm16=getNorm16(c);
1259 } else {
1260 // c decomposes, get everything from the variable-length extra data
1261 const uint16_t *mapping=getMapping(norm16);
1262 uint16_t firstUnit=*mapping++;
1263 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1264 return FALSE;
1265 }
1266 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
1267 return FALSE; // non-zero leadCC
1268 }
1269 int32_t i=0;
1270 UChar32 c;
1271 U16_NEXT_UNSAFE(mapping, i, c);
1272 return isCompYesAndZeroCC(getNorm16(c));
1273 }
1274 }
1275 }
1276
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1277 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1278 for(;;) {
1279 uint16_t norm16=getNorm16(c);
1280 if(isInert(norm16)) {
1281 return TRUE;
1282 } else if(norm16<=minYesNo) {
1283 // Hangul LVT (==minYesNo) has a boundary after it.
1284 // Hangul LV and non-inert yesYes characters combine forward.
1285 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1286 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1287 return FALSE;
1288 } else if(isDecompNoAlgorithmic(norm16)) {
1289 c=mapAlgorithmic(c, norm16);
1290 } else {
1291 // c decomposes, get everything from the variable-length extra data.
1292 // If testInert, then c must be a yesNo character which has lccc=0,
1293 // otherwise it could be a noNo.
1294 const uint16_t *mapping=getMapping(norm16);
1295 uint16_t firstUnit=*mapping;
1296 // TRUE if
1297 // c is not deleted, and
1298 // it and its decomposition do not combine forward, and it has a starter, and
1299 // if FCC then trailCC<=1
1300 return
1301 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
1302 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
1303 (!onlyContiguous || firstUnit<=0x1ff);
1304 }
1305 }
1306 }
1307
findPreviousCompBoundary(const UChar * start,const UChar * p) const1308 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1309 BackwardUTrie2StringIterator iter(normTrie, start, p);
1310 uint16_t norm16;
1311 do {
1312 norm16=iter.previous16();
1313 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1314 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1315 // but that's probably not worth the extra cost.
1316 return iter.codePointStart;
1317 }
1318
findNextCompBoundary(const UChar * p,const UChar * limit) const1319 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1320 ForwardUTrie2StringIterator iter(normTrie, p, limit);
1321 uint16_t norm16;
1322 do {
1323 norm16=iter.next16();
1324 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1325 return iter.codePointStart;
1326 }
1327
1328 class FCDTrieSingleton : public UTrie2Singleton {
1329 public:
FCDTrieSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1330 FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1331 UTrie2Singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1332 UTrie2 *getInstance(UErrorCode &errorCode) {
1333 return UTrie2Singleton::getInstance(createInstance, this, errorCode);
1334 }
1335 static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1336 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1337 if(value!=0) {
1338 impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
1339 }
1340 return U_SUCCESS(errorCode);
1341 }
1342
1343 Normalizer2Impl &impl;
1344 UTrie2 *newFCDTrie;
1345 UErrorCode &errorCode;
1346 };
1347
1348 U_CDECL_BEGIN
1349
1350 // Set the FCD value for a range of same-norm16 characters.
1351 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1352 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1353 return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
1354 }
1355
1356 // Collect (OR together) the FCD values for a range of supplementary characters,
1357 // for their lead surrogate code unit.
1358 static UBool U_CALLCONV
enumRangeOrValue(const void * context,UChar32 start,UChar32 end,uint32_t value)1359 enumRangeOrValue(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1360 *((uint32_t *)context)|=value;
1361 return TRUE;
1362 }
1363
1364 U_CDECL_END
1365
createInstance(const void * context,UErrorCode & errorCode)1366 void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1367 FCDTrieSingleton *me=(FCDTrieSingleton *)context;
1368 me->newFCDTrie=utrie2_open(0, 0, &errorCode);
1369 if(U_SUCCESS(errorCode)) {
1370 utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
1371 for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1372 uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
1373 utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
1374 if(oredValue!=0) {
1375 // Set a "bad" value for makeFCD() to break the quick check loop
1376 // and look up the value for the supplementary code point.
1377 // If there is any lccc, then set the worst-case lccc of 1.
1378 // The ORed-together value's tccc is already the worst case.
1379 if(oredValue>0xff) {
1380 oredValue=0x100|(oredValue&0xff);
1381 }
1382 utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
1383 }
1384 }
1385 utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1386 if(U_SUCCESS(errorCode)) {
1387 return me->newFCDTrie;
1388 }
1389 }
1390 utrie2_close(me->newFCDTrie);
1391 return NULL;
1392 }
1393
setFCD16FromNorm16(UChar32 start,UChar32 end,uint16_t norm16,UTrie2 * newFCDTrie,UErrorCode & errorCode) const1394 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1395 UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
1396 // Only loops for 1:1 algorithmic mappings.
1397 for(;;) {
1398 if(norm16>=MIN_NORMAL_MAYBE_YES) {
1399 norm16&=0xff;
1400 norm16|=norm16<<8;
1401 } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
1402 // no decomposition or Hangul syllable, all zeros
1403 break;
1404 } else if(limitNoNo<=norm16) {
1405 int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
1406 if(start==end) {
1407 start+=delta;
1408 norm16=getNorm16(start);
1409 } else {
1410 // the same delta leads from different original characters to different mappings
1411 do {
1412 UChar32 c=start+delta;
1413 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
1414 } while(++start<=end);
1415 break;
1416 }
1417 } else {
1418 // c decomposes, get everything from the variable-length extra data
1419 const uint16_t *mapping=getMapping(norm16);
1420 uint16_t firstUnit=*mapping;
1421 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1422 // A character that is deleted (maps to an empty string) must
1423 // get the worst-case lccc and tccc values because arbitrary
1424 // characters on both sides will become adjacent.
1425 norm16=0x1ff;
1426 } else {
1427 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1428 norm16=mapping[1]&0xff00; // lccc
1429 } else {
1430 norm16=0;
1431 }
1432 norm16|=firstUnit>>8; // tccc
1433 }
1434 }
1435 utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
1436 break;
1437 }
1438 }
1439
getFCDTrie(UErrorCode & errorCode) const1440 const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
1441 // Logically const: Synchronized instantiation.
1442 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1443 return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
1444 }
1445
1446 // Dual functionality:
1447 // buffer!=NULL: normalize
1448 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1449 const UChar *
makeFCD(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const1450 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1451 ReorderingBuffer *buffer,
1452 UErrorCode &errorCode) const {
1453 if(limit==NULL) {
1454 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1455 if(U_FAILURE(errorCode)) {
1456 return src;
1457 }
1458 limit=u_strchr(src, 0);
1459 }
1460
1461 // Note: In this function we use buffer->appendZeroCC() because we track
1462 // the lead and trail combining classes here, rather than leaving it to
1463 // the ReorderingBuffer.
1464 // The exception is the call to decomposeShort() which uses the buffer
1465 // in the normal way.
1466
1467 const UTrie2 *trie=fcdTrie();
1468
1469 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1470 // Similar to the prevBoundary in the compose() implementation.
1471 const UChar *prevBoundary=src;
1472 const UChar *prevSrc;
1473 UChar32 c=0;
1474 int32_t prevFCD16=0;
1475 uint16_t fcd16=0;
1476
1477 for(;;) {
1478 // count code units with lccc==0
1479 for(prevSrc=src; src!=limit;) {
1480 if((c=*src)<MIN_CCC_LCCC_CP) {
1481 prevFCD16=~c;
1482 ++src;
1483 } else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
1484 prevFCD16=fcd16;
1485 ++src;
1486 } else if(!U16_IS_SURROGATE(c)) {
1487 break;
1488 } else {
1489 UChar c2;
1490 if(U16_IS_SURROGATE_LEAD(c)) {
1491 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1492 c=U16_GET_SUPPLEMENTARY(c, c2);
1493 }
1494 } else /* trail surrogate */ {
1495 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1496 --src;
1497 c=U16_GET_SUPPLEMENTARY(c2, c);
1498 }
1499 }
1500 if((fcd16=getFCD16(c))<=0xff) {
1501 prevFCD16=fcd16;
1502 src+=U16_LENGTH(c);
1503 } else {
1504 break;
1505 }
1506 }
1507 }
1508 // copy these code units all at once
1509 if(src!=prevSrc) {
1510 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1511 break;
1512 }
1513 if(src==limit) {
1514 break;
1515 }
1516 prevBoundary=src;
1517 // We know that the previous character's lccc==0.
1518 if(prevFCD16<0) {
1519 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1520 prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
1521 if(prevFCD16>1) {
1522 --prevBoundary;
1523 }
1524 } else {
1525 const UChar *p=src-1;
1526 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1527 --p;
1528 // Need to fetch the previous character's FCD value because
1529 // prevFCD16 was just for the trail surrogate code point.
1530 prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
1531 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1532 }
1533 if(prevFCD16>1) {
1534 prevBoundary=p;
1535 }
1536 }
1537 // The start of the current character (c).
1538 prevSrc=src;
1539 } else if(src==limit) {
1540 break;
1541 }
1542
1543 src+=U16_LENGTH(c);
1544 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1545 // Check for proper order, and decompose locally if necessary.
1546 if((prevFCD16&0xff)<=(fcd16>>8)) {
1547 // proper order: prev tccc <= current lccc
1548 if((fcd16&0xff)<=1) {
1549 prevBoundary=src;
1550 }
1551 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1552 break;
1553 }
1554 prevFCD16=fcd16;
1555 continue;
1556 } else if(buffer==NULL) {
1557 return prevBoundary; // quick check "no"
1558 } else {
1559 /*
1560 * Back out the part of the source that we copied or appended
1561 * already but is now going to be decomposed.
1562 * prevSrc is set to after what was copied/appended.
1563 */
1564 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1565 /*
1566 * Find the part of the source that needs to be decomposed,
1567 * up to the next safe boundary.
1568 */
1569 src=findNextFCDBoundary(src, limit);
1570 /*
1571 * The source text does not fulfill the conditions for FCD.
1572 * Decompose and reorder a limited piece of the text.
1573 */
1574 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1575 break;
1576 }
1577 prevBoundary=src;
1578 prevFCD16=0;
1579 }
1580 }
1581 return src;
1582 }
1583
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,ReorderingBuffer & buffer,UErrorCode & errorCode) const1584 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1585 UBool doMakeFCD,
1586 ReorderingBuffer &buffer,
1587 UErrorCode &errorCode) const {
1588 if(!buffer.isEmpty()) {
1589 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1590 if(src!=firstBoundaryInSrc) {
1591 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1592 buffer.getLimit());
1593 UnicodeString middle(lastBoundaryInDest,
1594 (int32_t)(buffer.getLimit()-lastBoundaryInDest));
1595 buffer.removeSuffix((int32_t)(buffer.getLimit()-lastBoundaryInDest));
1596 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1597 const UChar *middleStart=middle.getBuffer();
1598 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1599 if(U_FAILURE(errorCode)) {
1600 return;
1601 }
1602 src=firstBoundaryInSrc;
1603 }
1604 }
1605 if(doMakeFCD) {
1606 makeFCD(src, limit, &buffer, errorCode);
1607 } else {
1608 buffer.appendZeroCC(src, limit, errorCode);
1609 }
1610 }
1611
findPreviousFCDBoundary(const UChar * start,const UChar * p) const1612 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1613 BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
1614 uint16_t fcd16;
1615 do {
1616 fcd16=iter.previous16();
1617 } while(fcd16>0xff);
1618 return iter.codePointStart;
1619 }
1620
findNextFCDBoundary(const UChar * p,const UChar * limit) const1621 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1622 ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
1623 uint16_t fcd16;
1624 do {
1625 fcd16=iter.next16();
1626 } while(fcd16>0xff);
1627 return iter.codePointStart;
1628 }
1629
1630 U_NAMESPACE_END
1631
1632 // Normalizer2 data swapping ----------------------------------------------- ***
1633
1634 U_NAMESPACE_USE
1635
1636 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1637 unorm2_swap(const UDataSwapper *ds,
1638 const void *inData, int32_t length, void *outData,
1639 UErrorCode *pErrorCode) {
1640 const UDataInfo *pInfo;
1641 int32_t headerSize;
1642
1643 const uint8_t *inBytes;
1644 uint8_t *outBytes;
1645
1646 const int32_t *inIndexes;
1647 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1648
1649 int32_t i, offset, nextOffset, size;
1650
1651 /* udata_swapDataHeader checks the arguments */
1652 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1653 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1654 return 0;
1655 }
1656
1657 /* check data format and format version */
1658 pInfo=(const UDataInfo *)((const char *)inData+4);
1659 if(!(
1660 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1661 pInfo->dataFormat[1]==0x72 &&
1662 pInfo->dataFormat[2]==0x6d &&
1663 pInfo->dataFormat[3]==0x32 &&
1664 pInfo->formatVersion[0]==1
1665 )) {
1666 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1667 pInfo->dataFormat[0], pInfo->dataFormat[1],
1668 pInfo->dataFormat[2], pInfo->dataFormat[3],
1669 pInfo->formatVersion[0]);
1670 *pErrorCode=U_UNSUPPORTED_ERROR;
1671 return 0;
1672 }
1673
1674 inBytes=(const uint8_t *)inData+headerSize;
1675 outBytes=(uint8_t *)outData+headerSize;
1676
1677 inIndexes=(const int32_t *)inBytes;
1678
1679 if(length>=0) {
1680 length-=headerSize;
1681 if(length<sizeof(indexes)) {
1682 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1683 length);
1684 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1685 return 0;
1686 }
1687 }
1688
1689 /* read the first few indexes */
1690 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
1691 indexes[i]=udata_readInt32(ds, inIndexes[i]);
1692 }
1693
1694 /* get the total length of the data */
1695 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1696
1697 if(length>=0) {
1698 if(length<size) {
1699 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1700 length);
1701 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1702 return 0;
1703 }
1704
1705 /* copy the data for inaccessible bytes */
1706 if(inBytes!=outBytes) {
1707 uprv_memcpy(outBytes, inBytes, size);
1708 }
1709
1710 offset=0;
1711
1712 /* swap the int32_t indexes[] */
1713 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
1714 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
1715 offset=nextOffset;
1716
1717 /* swap the UTrie2 */
1718 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
1719 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
1720 offset=nextOffset;
1721
1722 /* swap the uint16_t extraData[] */
1723 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
1724 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
1725 offset=nextOffset;
1726
1727 U_ASSERT(offset==size);
1728 }
1729
1730 return headerSize+size;
1731 }
1732
1733 #endif // !UCONFIG_NO_NORMALIZATION
1734