1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2impl.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
24 #include "unicode/utf16.h"
25 #include "cmemory.h"
26 #include "mutex.h"
27 #include "normalizer2impl.h"
28 #include "putilimp.h"
29 #include "uassert.h"
30 #include "uset_imp.h"
31 #include "utrie2.h"
32 #include "uvector.h"
33
34 U_NAMESPACE_BEGIN
35
36 // ReorderingBuffer -------------------------------------------------------- ***
37
init(int32_t destCapacity,UErrorCode & errorCode)38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
39 int32_t length=str.length();
40 start=str.getBuffer(destCapacity);
41 if(start==NULL) {
42 // getBuffer() already did str.setToBogus()
43 errorCode=U_MEMORY_ALLOCATION_ERROR;
44 return FALSE;
45 }
46 limit=start+length;
47 remainingCapacity=str.getCapacity()-length;
48 reorderStart=start;
49 if(start==limit) {
50 lastCC=0;
51 } else {
52 setIterator();
53 lastCC=previousCC();
54 // Set reorderStart after the last code point with cc<=1 if there is one.
55 if(lastCC>1) {
56 while(previousCC()>1) {}
57 }
58 reorderStart=codePointLimit;
59 }
60 return TRUE;
61 }
62
equals(const UChar * otherStart,const UChar * otherLimit) const63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
64 int32_t length=(int32_t)(limit-start);
65 return
66 length==(int32_t)(otherLimit-otherStart) &&
67 0==u_memcmp(start, otherStart, length);
68 }
69
appendSupplementary(UChar32 c,uint8_t cc,UErrorCode & errorCode)70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
71 if(remainingCapacity<2 && !resize(2, errorCode)) {
72 return FALSE;
73 }
74 if(lastCC<=cc || cc==0) {
75 limit[0]=U16_LEAD(c);
76 limit[1]=U16_TRAIL(c);
77 limit+=2;
78 lastCC=cc;
79 if(cc<=1) {
80 reorderStart=limit;
81 }
82 } else {
83 insert(c, cc);
84 }
85 remainingCapacity-=2;
86 return TRUE;
87 }
88
append(const UChar * s,int32_t length,uint8_t leadCC,uint8_t trailCC,UErrorCode & errorCode)89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
90 uint8_t leadCC, uint8_t trailCC,
91 UErrorCode &errorCode) {
92 if(length==0) {
93 return TRUE;
94 }
95 if(remainingCapacity<length && !resize(length, errorCode)) {
96 return FALSE;
97 }
98 remainingCapacity-=length;
99 if(lastCC<=leadCC || leadCC==0) {
100 if(trailCC<=1) {
101 reorderStart=limit+length;
102 } else if(leadCC<=1) {
103 reorderStart=limit+1; // Ok if not a code point boundary.
104 }
105 const UChar *sLimit=s+length;
106 do { *limit++=*s++; } while(s!=sLimit);
107 lastCC=trailCC;
108 } else {
109 int32_t i=0;
110 UChar32 c;
111 U16_NEXT(s, i, length, c);
112 insert(c, leadCC); // insert first code point
113 while(i<length) {
114 U16_NEXT(s, i, length, c);
115 if(i<length) {
116 // s must be in NFD, otherwise we need to use getCC().
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
118 } else {
119 leadCC=trailCC;
120 }
121 append(c, leadCC, errorCode);
122 }
123 }
124 return TRUE;
125 }
126
appendZeroCC(UChar32 c,UErrorCode & errorCode)127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
128 int32_t cpLength=U16_LENGTH(c);
129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
130 return FALSE;
131 }
132 remainingCapacity-=cpLength;
133 if(cpLength==1) {
134 *limit++=(UChar)c;
135 } else {
136 limit[0]=U16_LEAD(c);
137 limit[1]=U16_TRAIL(c);
138 limit+=2;
139 }
140 lastCC=0;
141 reorderStart=limit;
142 return TRUE;
143 }
144
appendZeroCC(const UChar * s,const UChar * sLimit,UErrorCode & errorCode)145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
146 if(s==sLimit) {
147 return TRUE;
148 }
149 int32_t length=(int32_t)(sLimit-s);
150 if(remainingCapacity<length && !resize(length, errorCode)) {
151 return FALSE;
152 }
153 u_memcpy(limit, s, length);
154 limit+=length;
155 remainingCapacity-=length;
156 lastCC=0;
157 reorderStart=limit;
158 return TRUE;
159 }
160
remove()161 void ReorderingBuffer::remove() {
162 reorderStart=limit=start;
163 remainingCapacity=str.getCapacity();
164 lastCC=0;
165 }
166
removeSuffix(int32_t suffixLength)167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
168 if(suffixLength<(limit-start)) {
169 limit-=suffixLength;
170 remainingCapacity+=suffixLength;
171 } else {
172 limit=start;
173 remainingCapacity=str.getCapacity();
174 }
175 lastCC=0;
176 reorderStart=limit;
177 }
178
resize(int32_t appendLength,UErrorCode & errorCode)179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
180 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
181 int32_t length=(int32_t)(limit-start);
182 str.releaseBuffer(length);
183 int32_t newCapacity=length+appendLength;
184 int32_t doubleCapacity=2*str.getCapacity();
185 if(newCapacity<doubleCapacity) {
186 newCapacity=doubleCapacity;
187 }
188 if(newCapacity<256) {
189 newCapacity=256;
190 }
191 start=str.getBuffer(newCapacity);
192 if(start==NULL) {
193 // getBuffer() already did str.setToBogus()
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return FALSE;
196 }
197 reorderStart=start+reorderStartIndex;
198 limit=start+length;
199 remainingCapacity=str.getCapacity()-length;
200 return TRUE;
201 }
202
skipPrevious()203 void ReorderingBuffer::skipPrevious() {
204 codePointLimit=codePointStart;
205 UChar c=*--codePointStart;
206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
207 --codePointStart;
208 }
209 }
210
previousCC()211 uint8_t ReorderingBuffer::previousCC() {
212 codePointLimit=codePointStart;
213 if(reorderStart>=codePointStart) {
214 return 0;
215 }
216 UChar32 c=*--codePointStart;
217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
218 return 0;
219 }
220
221 UChar c2;
222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
223 --codePointStart;
224 c=U16_GET_SUPPLEMENTARY(c2, c);
225 }
226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
227 }
228
229 // Inserts c somewhere before the last character.
230 // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(UChar32 c,uint8_t cc)231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
232 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
233 // insert c at codePointLimit, after the character with prevCC<=cc
234 UChar *q=limit;
235 UChar *r=limit+=U16_LENGTH(c);
236 do {
237 *--r=*--q;
238 } while(codePointLimit!=q);
239 writeCodePoint(q, c);
240 if(cc<=1) {
241 reorderStart=r;
242 }
243 }
244
245 // Normalizer2Impl --------------------------------------------------------- ***
246
247 struct CanonIterData : public UMemory {
248 CanonIterData(UErrorCode &errorCode);
249 ~CanonIterData();
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
251 UTrie2 *trie;
252 UVector canonStartSets; // contains UnicodeSet *
253 };
254
~Normalizer2Impl()255 Normalizer2Impl::~Normalizer2Impl() {
256 udata_close(memory);
257 utrie2_close(normTrie);
258 delete (CanonIterData *)canonIterDataSingleton.fInstance;
259 }
260
261 UBool U_CALLCONV
isAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)262 Normalizer2Impl::isAcceptable(void *context,
263 const char * /* type */, const char * /*name*/,
264 const UDataInfo *pInfo) {
265 if(
266 pInfo->size>=20 &&
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo->dataFormat[1]==0x72 &&
271 pInfo->dataFormat[2]==0x6d &&
272 pInfo->dataFormat[3]==0x32 &&
273 pInfo->formatVersion[0]==2
274 ) {
275 Normalizer2Impl *me=(Normalizer2Impl *)context;
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277 return TRUE;
278 } else {
279 return FALSE;
280 }
281 }
282
283 void
load(const char * packageName,const char * name,UErrorCode & errorCode)284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) {
286 return;
287 }
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289 if(U_FAILURE(errorCode)) {
290 return;
291 }
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293 const int32_t *inIndexes=(const int32_t *)inBytes;
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295 if(indexesLength<=IX_MIN_MAYBE_YES) {
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
297 return;
298 }
299
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302
303 minYesNo=inIndexes[IX_MIN_YES_NO];
304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
305 minNoNo=inIndexes[IX_MIN_NO_NO];
306 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
308
309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
312 inBytes+offset, nextOffset-offset, NULL,
313 &errorCode);
314 if(U_FAILURE(errorCode)) {
315 return;
316 }
317
318 offset=nextOffset;
319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
320 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
322
323 // smallFCD: new in formatVersion 2
324 offset=nextOffset;
325 smallFCD=inBytes+offset;
326
327 // Build tccc180[].
328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
329 uint8_t bits=0;
330 for(UChar c=0; c<0x180; bits>>=1) {
331 if((c&0xff)==0) {
332 bits=smallFCD[c>>8]; // one byte per 0x100 code points
333 }
334 if(bits&1) {
335 for(int i=0; i<0x20; ++i, ++c) {
336 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
337 }
338 } else {
339 uprv_memset(tccc180+c, 0, 0x20);
340 c+=0x20;
341 }
342 }
343 }
344
getTrailCCFromCompYesAndZeroCC(const UChar * cpStart,const UChar * cpLimit) const345 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
346 UChar32 c;
347 if(cpStart==(cpLimit-1)) {
348 c=*cpStart;
349 } else {
350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
351 }
352 uint16_t prevNorm16=getNorm16(c);
353 if(prevNorm16<=minYesNo) {
354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
355 } else {
356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
357 }
358 }
359
360 U_CDECL_BEGIN
361
362 static UBool U_CALLCONV
enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)363 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
364 /* add the start code point to the USet */
365 const USetAdder *sa=(const USetAdder *)context;
366 sa->add(sa->set, start);
367 return TRUE;
368 }
369
370 static uint32_t U_CALLCONV
segmentStarterMapper(const void *,uint32_t value)371 segmentStarterMapper(const void * /*context*/, uint32_t value) {
372 return value&CANON_NOT_SEGMENT_STARTER;
373 }
374
375 U_CDECL_END
376
377 void
addPropertyStarts(const USetAdder * sa,UErrorCode &) const378 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
379 /* add the start code point of each same-value range of each trie */
380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
381
382 /* add Hangul LV syllables and LV+1 because of skippables */
383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
384 sa->add(sa->set, c);
385 sa->add(sa->set, c+1);
386 }
387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
388 }
389
390 void
addCanonIterPropertyStarts(const USetAdder * sa,UErrorCode & errorCode) const391 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
392 /* add the start code point of each same-value range of the canonical iterator data trie */
393 if(ensureCanonIterData(errorCode)) {
394 // currently only used for the SEGMENT_STARTER property
395 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
396 segmentStarterMapper, enumPropertyStartsRange, sa);
397 }
398 }
399
400 const UChar *
copyLowPrefixFromNulTerminated(const UChar * src,UChar32 minNeedDataCP,ReorderingBuffer * buffer,UErrorCode & errorCode) const401 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
402 UChar32 minNeedDataCP,
403 ReorderingBuffer *buffer,
404 UErrorCode &errorCode) const {
405 // Make some effort to support NUL-terminated strings reasonably.
406 // Take the part of the fast quick check loop that does not look up
407 // data and check the first part of the string.
408 // After this prefix, determine the string length to simplify the rest
409 // of the code.
410 const UChar *prevSrc=src;
411 UChar c;
412 while((c=*src++)<minNeedDataCP && c!=0) {}
413 // Back out the last character for full processing.
414 // Copy this prefix.
415 if(--src!=prevSrc) {
416 if(buffer!=NULL) {
417 buffer->appendZeroCC(prevSrc, src, errorCode);
418 }
419 }
420 return src;
421 }
422
423 // Dual functionality:
424 // buffer!=NULL: normalize
425 // buffer==NULL: isNormalized/spanQuickCheckYes
426 const UChar *
decompose(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const427 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
428 ReorderingBuffer *buffer,
429 UErrorCode &errorCode) const {
430 UChar32 minNoCP=minDecompNoCP;
431 if(limit==NULL) {
432 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
433 if(U_FAILURE(errorCode)) {
434 return src;
435 }
436 limit=u_strchr(src, 0);
437 }
438
439 const UChar *prevSrc;
440 UChar32 c=0;
441 uint16_t norm16=0;
442
443 // only for quick check
444 const UChar *prevBoundary=src;
445 uint8_t prevCC=0;
446
447 for(;;) {
448 // count code units below the minimum or with irrelevant data for the quick check
449 for(prevSrc=src; src!=limit;) {
450 if( (c=*src)<minNoCP ||
451 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
452 ) {
453 ++src;
454 } else if(!U16_IS_SURROGATE(c)) {
455 break;
456 } else {
457 UChar c2;
458 if(U16_IS_SURROGATE_LEAD(c)) {
459 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
460 c=U16_GET_SUPPLEMENTARY(c, c2);
461 }
462 } else /* trail surrogate */ {
463 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
464 --src;
465 c=U16_GET_SUPPLEMENTARY(c2, c);
466 }
467 }
468 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
469 src+=U16_LENGTH(c);
470 } else {
471 break;
472 }
473 }
474 }
475 // copy these code units all at once
476 if(src!=prevSrc) {
477 if(buffer!=NULL) {
478 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
479 break;
480 }
481 } else {
482 prevCC=0;
483 prevBoundary=src;
484 }
485 }
486 if(src==limit) {
487 break;
488 }
489
490 // Check one above-minimum, relevant code point.
491 src+=U16_LENGTH(c);
492 if(buffer!=NULL) {
493 if(!decompose(c, norm16, *buffer, errorCode)) {
494 break;
495 }
496 } else {
497 if(isDecompYes(norm16)) {
498 uint8_t cc=getCCFromYesOrMaybe(norm16);
499 if(prevCC<=cc || cc==0) {
500 prevCC=cc;
501 if(cc<=1) {
502 prevBoundary=src;
503 }
504 continue;
505 }
506 }
507 return prevBoundary; // "no" or cc out of order
508 }
509 }
510 return src;
511 }
512
513 // Decompose a short piece of text which is likely to contain characters that
514 // fail the quick check loop and/or where the quick check loop's overhead
515 // is unlikely to be amortized.
516 // Called by the compose() and makeFCD() implementations.
decomposeShort(const UChar * src,const UChar * limit,ReorderingBuffer & buffer,UErrorCode & errorCode) const517 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
518 ReorderingBuffer &buffer,
519 UErrorCode &errorCode) const {
520 while(src<limit) {
521 UChar32 c;
522 uint16_t norm16;
523 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
524 if(!decompose(c, norm16, buffer, errorCode)) {
525 return FALSE;
526 }
527 }
528 return TRUE;
529 }
530
decompose(UChar32 c,uint16_t norm16,ReorderingBuffer & buffer,UErrorCode & errorCode) const531 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
532 ReorderingBuffer &buffer,
533 UErrorCode &errorCode) const {
534 // Only loops for 1:1 algorithmic mappings.
535 for(;;) {
536 // get the decomposition and the lead and trail cc's
537 if(isDecompYes(norm16)) {
538 // c does not decompose
539 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
540 } else if(isHangul(norm16)) {
541 // Hangul syllable: decompose algorithmically
542 UChar jamos[3];
543 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
544 } else if(isDecompNoAlgorithmic(norm16)) {
545 c=mapAlgorithmic(c, norm16);
546 norm16=getNorm16(c);
547 } else {
548 // c decomposes, get everything from the variable-length extra data
549 const uint16_t *mapping=getMapping(norm16);
550 uint16_t firstUnit=*mapping;
551 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
552 uint8_t leadCC, trailCC;
553 trailCC=(uint8_t)(firstUnit>>8);
554 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
555 leadCC=(uint8_t)(*(mapping-1)>>8);
556 } else {
557 leadCC=0;
558 }
559 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
560 }
561 }
562 }
563
564 const UChar *
getDecomposition(UChar32 c,UChar buffer[4],int32_t & length) const565 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
566 const UChar *decomp=NULL;
567 uint16_t norm16;
568 for(;;) {
569 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
570 // c does not decompose
571 return decomp;
572 } else if(isHangul(norm16)) {
573 // Hangul syllable: decompose algorithmically
574 length=Hangul::decompose(c, buffer);
575 return buffer;
576 } else if(isDecompNoAlgorithmic(norm16)) {
577 c=mapAlgorithmic(c, norm16);
578 decomp=buffer;
579 length=0;
580 U16_APPEND_UNSAFE(buffer, length, c);
581 } else {
582 // c decomposes, get everything from the variable-length extra data
583 const uint16_t *mapping=getMapping(norm16);
584 length=*mapping&MAPPING_LENGTH_MASK;
585 return (const UChar *)mapping+1;
586 }
587 }
588 }
589
590 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
591 // so that a raw mapping fits that consists of one unit ("rm0")
592 // plus all but the first two code units of the normal mapping.
593 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
594 const UChar *
getRawDecomposition(UChar32 c,UChar buffer[30],int32_t & length) const595 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
596 // We do not loop in this method because an algorithmic mapping itself
597 // becomes a final result rather than having to be decomposed recursively.
598 uint16_t norm16;
599 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
600 // c does not decompose
601 return NULL;
602 } else if(isHangul(norm16)) {
603 // Hangul syllable: decompose algorithmically
604 Hangul::getRawDecomposition(c, buffer);
605 length=2;
606 return buffer;
607 } else if(isDecompNoAlgorithmic(norm16)) {
608 c=mapAlgorithmic(c, norm16);
609 length=0;
610 U16_APPEND_UNSAFE(buffer, length, c);
611 return buffer;
612 } else {
613 // c decomposes, get everything from the variable-length extra data
614 const uint16_t *mapping=getMapping(norm16);
615 uint16_t firstUnit=*mapping;
616 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
617 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
618 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
619 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
620 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
621 uint16_t rm0=*rawMapping;
622 if(rm0<=MAPPING_LENGTH_MASK) {
623 length=rm0;
624 return (const UChar *)rawMapping-rm0;
625 } else {
626 // Copy the normal mapping and replace its first two code units with rm0.
627 buffer[0]=(UChar)rm0;
628 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
629 length=mLength-1;
630 return buffer;
631 }
632 } else {
633 length=mLength;
634 return (const UChar *)mapping+1;
635 }
636 }
637 }
638
decomposeAndAppend(const UChar * src,const UChar * limit,UBool doDecompose,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const639 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
640 UBool doDecompose,
641 UnicodeString &safeMiddle,
642 ReorderingBuffer &buffer,
643 UErrorCode &errorCode) const {
644 buffer.copyReorderableSuffixTo(safeMiddle);
645 if(doDecompose) {
646 decompose(src, limit, &buffer, errorCode);
647 return;
648 }
649 // Just merge the strings at the boundary.
650 ForwardUTrie2StringIterator iter(normTrie, src, limit);
651 uint8_t firstCC, prevCC, cc;
652 firstCC=prevCC=cc=getCC(iter.next16());
653 while(cc!=0) {
654 prevCC=cc;
655 cc=getCC(iter.next16());
656 };
657 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
658 limit=u_strchr(iter.codePointStart, 0);
659 }
660
661 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
662 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
663 }
664 }
665
666 // Note: hasDecompBoundary() could be implemented as aliases to
667 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
668 // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(UChar32 c,UBool before) const669 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
670 for(;;) {
671 if(c<minDecompNoCP) {
672 return TRUE;
673 }
674 uint16_t norm16=getNorm16(c);
675 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
676 return TRUE;
677 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
678 return FALSE; // ccc!=0
679 } else if(isDecompNoAlgorithmic(norm16)) {
680 c=mapAlgorithmic(c, norm16);
681 } else {
682 // c decomposes, get everything from the variable-length extra data
683 const uint16_t *mapping=getMapping(norm16);
684 uint16_t firstUnit=*mapping;
685 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
686 return FALSE;
687 }
688 if(!before) {
689 // decomp after-boundary: same as hasFCDBoundaryAfter(),
690 // fcd16<=1 || trailCC==0
691 if(firstUnit>0x1ff) {
692 return FALSE; // trailCC>1
693 }
694 if(firstUnit<=0xff) {
695 return TRUE; // trailCC==0
696 }
697 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
698 }
699 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
700 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
701 }
702 }
703 }
704
705 /*
706 * Finds the recomposition result for
707 * a forward-combining "lead" character,
708 * specified with a pointer to its compositions list,
709 * and a backward-combining "trail" character.
710 *
711 * If the lead and trail characters combine, then this function returns
712 * the following "compositeAndFwd" value:
713 * Bits 21..1 composite character
714 * Bit 0 set if the composite is a forward-combining starter
715 * otherwise it returns -1.
716 *
717 * The compositions list has (trail, compositeAndFwd) pair entries,
718 * encoded as either pairs or triples of 16-bit units.
719 * The last entry has the high bit of its first unit set.
720 *
721 * The list is sorted by ascending trail characters (there are no duplicates).
722 * A linear search is used.
723 *
724 * See normalizer2impl.h for a more detailed description
725 * of the compositions list format.
726 */
combine(const uint16_t * list,UChar32 trail)727 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
728 uint16_t key1, firstUnit;
729 if(trail<COMP_1_TRAIL_LIMIT) {
730 // trail character is 0..33FF
731 // result entry may have 2 or 3 units
732 key1=(uint16_t)(trail<<1);
733 while(key1>(firstUnit=*list)) {
734 list+=2+(firstUnit&COMP_1_TRIPLE);
735 }
736 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
737 if(firstUnit&COMP_1_TRIPLE) {
738 return ((int32_t)list[1]<<16)|list[2];
739 } else {
740 return list[1];
741 }
742 }
743 } else {
744 // trail character is 3400..10FFFF
745 // result entry has 3 units
746 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
747 (((trail>>COMP_1_TRAIL_SHIFT))&
748 ~COMP_1_TRIPLE));
749 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
750 uint16_t secondUnit;
751 for(;;) {
752 if(key1>(firstUnit=*list)) {
753 list+=2+(firstUnit&COMP_1_TRIPLE);
754 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
755 if(key2>(secondUnit=list[1])) {
756 if(firstUnit&COMP_1_LAST_TUPLE) {
757 break;
758 } else {
759 list+=3;
760 }
761 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
762 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
763 } else {
764 break;
765 }
766 } else {
767 break;
768 }
769 }
770 }
771 return -1;
772 }
773
774 /**
775 * @param list some character's compositions list
776 * @param set recursively receives the composites from these compositions
777 */
addComposites(const uint16_t * list,UnicodeSet & set) const778 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
779 uint16_t firstUnit;
780 int32_t compositeAndFwd;
781 do {
782 firstUnit=*list;
783 if((firstUnit&COMP_1_TRIPLE)==0) {
784 compositeAndFwd=list[1];
785 list+=2;
786 } else {
787 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
788 list+=3;
789 }
790 UChar32 composite=compositeAndFwd>>1;
791 if((compositeAndFwd&1)!=0) {
792 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
793 }
794 set.add(composite);
795 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
796 }
797
798 /*
799 * Recomposes the buffer text starting at recomposeStartIndex
800 * (which is in NFD - decomposed and canonically ordered),
801 * and truncates the buffer contents.
802 *
803 * Note that recomposition never lengthens the text:
804 * Any character consists of either one or two code units;
805 * a composition may contain at most one more code unit than the original starter,
806 * while the combining mark that is removed has at least one code unit.
807 */
recompose(ReorderingBuffer & buffer,int32_t recomposeStartIndex,UBool onlyContiguous) const808 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
809 UBool onlyContiguous) const {
810 UChar *p=buffer.getStart()+recomposeStartIndex;
811 UChar *limit=buffer.getLimit();
812 if(p==limit) {
813 return;
814 }
815
816 UChar *starter, *pRemove, *q, *r;
817 const uint16_t *compositionsList;
818 UChar32 c, compositeAndFwd;
819 uint16_t norm16;
820 uint8_t cc, prevCC;
821 UBool starterIsSupplementary;
822
823 // Some of the following variables are not used until we have a forward-combining starter
824 // and are only initialized now to avoid compiler warnings.
825 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
826 starter=NULL;
827 starterIsSupplementary=FALSE;
828 prevCC=0;
829
830 for(;;) {
831 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
832 cc=getCCFromYesOrMaybe(norm16);
833 if( // this character combines backward and
834 isMaybe(norm16) &&
835 // we have seen a starter that combines forward and
836 compositionsList!=NULL &&
837 // the backward-combining character is not blocked
838 (prevCC<cc || prevCC==0)
839 ) {
840 if(isJamoVT(norm16)) {
841 // c is a Jamo V/T, see if we can compose it with the previous character.
842 if(c<Hangul::JAMO_T_BASE) {
843 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
844 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
845 if(prev<Hangul::JAMO_L_COUNT) {
846 pRemove=p-1;
847 UChar syllable=(UChar)
848 (Hangul::HANGUL_BASE+
849 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
850 Hangul::JAMO_T_COUNT);
851 UChar t;
852 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
853 ++p;
854 syllable+=t; // The next character was a Jamo T.
855 }
856 *starter=syllable;
857 // remove the Jamo V/T
858 q=pRemove;
859 r=p;
860 while(r<limit) {
861 *q++=*r++;
862 }
863 limit=q;
864 p=pRemove;
865 }
866 }
867 /*
868 * No "else" for Jamo T:
869 * Since the input is in NFD, there are no Hangul LV syllables that
870 * a Jamo T could combine with.
871 * All Jamo Ts are combined above when handling Jamo Vs.
872 */
873 if(p==limit) {
874 break;
875 }
876 compositionsList=NULL;
877 continue;
878 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
879 // The starter and the combining mark (c) do combine.
880 UChar32 composite=compositeAndFwd>>1;
881
882 // Replace the starter with the composite, remove the combining mark.
883 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
884 if(starterIsSupplementary) {
885 if(U_IS_SUPPLEMENTARY(composite)) {
886 // both are supplementary
887 starter[0]=U16_LEAD(composite);
888 starter[1]=U16_TRAIL(composite);
889 } else {
890 *starter=(UChar)composite;
891 // The composite is shorter than the starter,
892 // move the intermediate characters forward one.
893 starterIsSupplementary=FALSE;
894 q=starter+1;
895 r=q+1;
896 while(r<pRemove) {
897 *q++=*r++;
898 }
899 --pRemove;
900 }
901 } else if(U_IS_SUPPLEMENTARY(composite)) {
902 // The composite is longer than the starter,
903 // move the intermediate characters back one.
904 starterIsSupplementary=TRUE;
905 ++starter; // temporarily increment for the loop boundary
906 q=pRemove;
907 r=++pRemove;
908 while(starter<q) {
909 *--r=*--q;
910 }
911 *starter=U16_TRAIL(composite);
912 *--starter=U16_LEAD(composite); // undo the temporary increment
913 } else {
914 // both are on the BMP
915 *starter=(UChar)composite;
916 }
917
918 /* remove the combining mark by moving the following text over it */
919 if(pRemove<p) {
920 q=pRemove;
921 r=p;
922 while(r<limit) {
923 *q++=*r++;
924 }
925 limit=q;
926 p=pRemove;
927 }
928 // Keep prevCC because we removed the combining mark.
929
930 if(p==limit) {
931 break;
932 }
933 // Is the composite a starter that combines forward?
934 if(compositeAndFwd&1) {
935 compositionsList=
936 getCompositionsListForComposite(getNorm16(composite));
937 } else {
938 compositionsList=NULL;
939 }
940
941 // We combined; continue with looking for compositions.
942 continue;
943 }
944 }
945
946 // no combination this time
947 prevCC=cc;
948 if(p==limit) {
949 break;
950 }
951
952 // If c did not combine, then check if it is a starter.
953 if(cc==0) {
954 // Found a new starter.
955 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
956 // It may combine with something, prepare for it.
957 if(U_IS_BMP(c)) {
958 starterIsSupplementary=FALSE;
959 starter=p-1;
960 } else {
961 starterIsSupplementary=TRUE;
962 starter=p-2;
963 }
964 }
965 } else if(onlyContiguous) {
966 // FCC: no discontiguous compositions; any intervening character blocks.
967 compositionsList=NULL;
968 }
969 }
970 buffer.setReorderingLimit(limit);
971 }
972
973 UChar32
composePair(UChar32 a,UChar32 b) const974 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
975 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
976 const uint16_t *list;
977 if(isInert(norm16)) {
978 return U_SENTINEL;
979 } else if(norm16<minYesNoMappingsOnly) {
980 if(isJamoL(norm16)) {
981 b-=Hangul::JAMO_V_BASE;
982 if(0<=b && b<Hangul::JAMO_V_COUNT) {
983 return
984 (Hangul::HANGUL_BASE+
985 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
986 Hangul::JAMO_T_COUNT);
987 } else {
988 return U_SENTINEL;
989 }
990 } else if(isHangul(norm16)) {
991 b-=Hangul::JAMO_T_BASE;
992 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
993 return a+b;
994 } else {
995 return U_SENTINEL;
996 }
997 } else {
998 // 'a' has a compositions list in extraData
999 list=extraData+norm16;
1000 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
1001 list+= // mapping pointer
1002 1+ // +1 to skip the first unit with the mapping lenth
1003 (*list&MAPPING_LENGTH_MASK); // + mapping length
1004 }
1005 }
1006 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1007 return U_SENTINEL;
1008 } else {
1009 list=maybeYesCompositions+norm16-minMaybeYes;
1010 }
1011 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1012 return U_SENTINEL;
1013 }
1014 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1015 return combine(list, b)>>1;
1016 #else
1017 int32_t compositeAndFwd=combine(list, b);
1018 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1019 #endif
1020 }
1021
1022 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1023 // doCompose: normalize
1024 // !doCompose: isNormalized (buffer must be empty and initialized)
1025 UBool
compose(const UChar * src,const UChar * limit,UBool onlyContiguous,UBool doCompose,ReorderingBuffer & buffer,UErrorCode & errorCode) const1026 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1027 UBool onlyContiguous,
1028 UBool doCompose,
1029 ReorderingBuffer &buffer,
1030 UErrorCode &errorCode) const {
1031 /*
1032 * prevBoundary points to the last character before the current one
1033 * that has a composition boundary before it with ccc==0 and quick check "yes".
1034 * Keeping track of prevBoundary saves us looking for a composition boundary
1035 * when we find a "no" or "maybe".
1036 *
1037 * When we back out from prevSrc back to prevBoundary,
1038 * then we also remove those same characters (which had been simply copied
1039 * or canonically-order-inserted) from the ReorderingBuffer.
1040 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1041 * must correspond 1:1 to destination units at the end of the destination buffer.
1042 */
1043 const UChar *prevBoundary=src;
1044 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1045 if(limit==NULL) {
1046 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1047 doCompose ? &buffer : NULL,
1048 errorCode);
1049 if(U_FAILURE(errorCode)) {
1050 return FALSE;
1051 }
1052 if(prevBoundary<src) {
1053 // Set prevBoundary to the last character in the prefix.
1054 prevBoundary=src-1;
1055 }
1056 limit=u_strchr(src, 0);
1057 }
1058
1059 const UChar *prevSrc;
1060 UChar32 c=0;
1061 uint16_t norm16=0;
1062
1063 // only for isNormalized
1064 uint8_t prevCC=0;
1065
1066 for(;;) {
1067 // count code units below the minimum or with irrelevant data for the quick check
1068 for(prevSrc=src; src!=limit;) {
1069 if( (c=*src)<minNoMaybeCP ||
1070 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1071 ) {
1072 ++src;
1073 } else if(!U16_IS_SURROGATE(c)) {
1074 break;
1075 } else {
1076 UChar c2;
1077 if(U16_IS_SURROGATE_LEAD(c)) {
1078 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1079 c=U16_GET_SUPPLEMENTARY(c, c2);
1080 }
1081 } else /* trail surrogate */ {
1082 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1083 --src;
1084 c=U16_GET_SUPPLEMENTARY(c2, c);
1085 }
1086 }
1087 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1088 src+=U16_LENGTH(c);
1089 } else {
1090 break;
1091 }
1092 }
1093 }
1094 // copy these code units all at once
1095 if(src!=prevSrc) {
1096 if(doCompose) {
1097 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
1098 break;
1099 }
1100 } else {
1101 prevCC=0;
1102 }
1103 if(src==limit) {
1104 break;
1105 }
1106 // Set prevBoundary to the last character in the quick check loop.
1107 prevBoundary=src-1;
1108 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1109 U16_IS_LEAD(*(prevBoundary-1))
1110 ) {
1111 --prevBoundary;
1112 }
1113 // The start of the current character (c).
1114 prevSrc=src;
1115 } else if(src==limit) {
1116 break;
1117 }
1118
1119 src+=U16_LENGTH(c);
1120 /*
1121 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1122 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1123 * or has ccc!=0.
1124 * Check for Jamo V/T, then for regular characters.
1125 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1126 */
1127 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1128 UChar prev=*(prevSrc-1);
1129 UBool needToDecompose=FALSE;
1130 if(c<Hangul::JAMO_T_BASE) {
1131 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1132 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1133 if(prev<Hangul::JAMO_L_COUNT) {
1134 if(!doCompose) {
1135 return FALSE;
1136 }
1137 UChar syllable=(UChar)
1138 (Hangul::HANGUL_BASE+
1139 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1140 Hangul::JAMO_T_COUNT);
1141 UChar t;
1142 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1143 ++src;
1144 syllable+=t; // The next character was a Jamo T.
1145 prevBoundary=src;
1146 buffer.setLastChar(syllable);
1147 continue;
1148 }
1149 // If we see L+V+x where x!=T then we drop to the slow path,
1150 // decompose and recompose.
1151 // This is to deal with NFKC finding normal L and V but a
1152 // compatibility variant of a T. We need to either fully compose that
1153 // combination here (which would complicate the code and may not work
1154 // with strange custom data) or use the slow path -- or else our replacing
1155 // two input characters (L+V) with one output character (LV syllable)
1156 // would violate the invariant that [prevBoundary..prevSrc[ has the same
1157 // length as what we appended to the buffer since prevBoundary.
1158 needToDecompose=TRUE;
1159 }
1160 } else if(Hangul::isHangulWithoutJamoT(prev)) {
1161 // c is a Jamo Trailing consonant,
1162 // compose with previous Hangul LV that does not contain a Jamo T.
1163 if(!doCompose) {
1164 return FALSE;
1165 }
1166 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1167 prevBoundary=src;
1168 continue;
1169 }
1170 if(!needToDecompose) {
1171 // The Jamo V/T did not compose into a Hangul syllable.
1172 if(doCompose) {
1173 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1174 break;
1175 }
1176 } else {
1177 prevCC=0;
1178 }
1179 continue;
1180 }
1181 }
1182 /*
1183 * Source buffer pointers:
1184 *
1185 * all done quick check current char not yet
1186 * "yes" but (c) processed
1187 * may combine
1188 * forward
1189 * [-------------[-------------[-------------[-------------[
1190 * | | | | |
1191 * orig. src prevBoundary prevSrc src limit
1192 *
1193 *
1194 * Destination buffer pointers inside the ReorderingBuffer:
1195 *
1196 * all done might take not filled yet
1197 * characters for
1198 * reordering
1199 * [-------------[-------------[-------------[
1200 * | | | |
1201 * start reorderStart limit |
1202 * +remainingCap.+
1203 */
1204 if(norm16>=MIN_YES_YES_WITH_CC) {
1205 uint8_t cc=(uint8_t)norm16; // cc!=0
1206 if( onlyContiguous && // FCC
1207 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1208 prevBoundary<prevSrc &&
1209 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1210 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1211 // passed the quick check "yes && ccc==0" test.
1212 // Check whether the last character was a "yesYes" or a "yesNo".
1213 // If a "yesNo", then we get its trailing ccc from its
1214 // mapping and check for canonical order.
1215 // All other cases are ok.
1216 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1217 ) {
1218 // Fails FCD test, need to decompose and contiguously recompose.
1219 if(!doCompose) {
1220 return FALSE;
1221 }
1222 } else if(doCompose) {
1223 if(!buffer.append(c, cc, errorCode)) {
1224 break;
1225 }
1226 continue;
1227 } else if(prevCC<=cc) {
1228 prevCC=cc;
1229 continue;
1230 } else {
1231 return FALSE;
1232 }
1233 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1234 return FALSE;
1235 }
1236
1237 /*
1238 * Find appropriate boundaries around this character,
1239 * decompose the source text from between the boundaries,
1240 * and recompose it.
1241 *
1242 * We may need to remove the last few characters from the ReorderingBuffer
1243 * to account for source text that was copied or appended
1244 * but needs to take part in the recomposition.
1245 */
1246
1247 /*
1248 * Find the last composition boundary in [prevBoundary..src[.
1249 * It is either the decomposition of the current character (at prevSrc),
1250 * or prevBoundary.
1251 */
1252 if(hasCompBoundaryBefore(c, norm16)) {
1253 prevBoundary=prevSrc;
1254 } else if(doCompose) {
1255 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1256 }
1257
1258 // Find the next composition boundary in [src..limit[ -
1259 // modifies src to point to the next starter.
1260 src=(UChar *)findNextCompBoundary(src, limit);
1261
1262 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1263 int32_t recomposeStartIndex=buffer.length();
1264 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1265 break;
1266 }
1267 recompose(buffer, recomposeStartIndex, onlyContiguous);
1268 if(!doCompose) {
1269 if(!buffer.equals(prevBoundary, src)) {
1270 return FALSE;
1271 }
1272 buffer.remove();
1273 prevCC=0;
1274 }
1275
1276 // Move to the next starter. We never need to look back before this point again.
1277 prevBoundary=src;
1278 }
1279 return TRUE;
1280 }
1281
1282 // Very similar to compose(): Make the same changes in both places if relevant.
1283 // pQCResult==NULL: spanQuickCheckYes
1284 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1285 const UChar *
composeQuickCheck(const UChar * src,const UChar * limit,UBool onlyContiguous,UNormalizationCheckResult * pQCResult) const1286 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1287 UBool onlyContiguous,
1288 UNormalizationCheckResult *pQCResult) const {
1289 /*
1290 * prevBoundary points to the last character before the current one
1291 * that has a composition boundary before it with ccc==0 and quick check "yes".
1292 */
1293 const UChar *prevBoundary=src;
1294 UChar32 minNoMaybeCP=minCompNoMaybeCP;
1295 if(limit==NULL) {
1296 UErrorCode errorCode=U_ZERO_ERROR;
1297 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1298 if(prevBoundary<src) {
1299 // Set prevBoundary to the last character in the prefix.
1300 prevBoundary=src-1;
1301 }
1302 limit=u_strchr(src, 0);
1303 }
1304
1305 const UChar *prevSrc;
1306 UChar32 c=0;
1307 uint16_t norm16=0;
1308 uint8_t prevCC=0;
1309
1310 for(;;) {
1311 // count code units below the minimum or with irrelevant data for the quick check
1312 for(prevSrc=src;;) {
1313 if(src==limit) {
1314 return src;
1315 }
1316 if( (c=*src)<minNoMaybeCP ||
1317 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1318 ) {
1319 ++src;
1320 } else if(!U16_IS_SURROGATE(c)) {
1321 break;
1322 } else {
1323 UChar c2;
1324 if(U16_IS_SURROGATE_LEAD(c)) {
1325 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1326 c=U16_GET_SUPPLEMENTARY(c, c2);
1327 }
1328 } else /* trail surrogate */ {
1329 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1330 --src;
1331 c=U16_GET_SUPPLEMENTARY(c2, c);
1332 }
1333 }
1334 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1335 src+=U16_LENGTH(c);
1336 } else {
1337 break;
1338 }
1339 }
1340 }
1341 if(src!=prevSrc) {
1342 // Set prevBoundary to the last character in the quick check loop.
1343 prevBoundary=src-1;
1344 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1345 U16_IS_LEAD(*(prevBoundary-1))
1346 ) {
1347 --prevBoundary;
1348 }
1349 prevCC=0;
1350 // The start of the current character (c).
1351 prevSrc=src;
1352 }
1353
1354 src+=U16_LENGTH(c);
1355 /*
1356 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1357 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1358 * or has ccc!=0.
1359 */
1360 if(isMaybeOrNonZeroCC(norm16)) {
1361 uint8_t cc=getCCFromYesOrMaybe(norm16);
1362 if( onlyContiguous && // FCC
1363 cc!=0 &&
1364 prevCC==0 &&
1365 prevBoundary<prevSrc &&
1366 // prevCC==0 && prevBoundary<prevSrc tell us that
1367 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1368 // passed the quick check "yes && ccc==0" test.
1369 // Check whether the last character was a "yesYes" or a "yesNo".
1370 // If a "yesNo", then we get its trailing ccc from its
1371 // mapping and check for canonical order.
1372 // All other cases are ok.
1373 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1374 ) {
1375 // Fails FCD test.
1376 } else if(prevCC<=cc || cc==0) {
1377 prevCC=cc;
1378 if(norm16<MIN_YES_YES_WITH_CC) {
1379 if(pQCResult!=NULL) {
1380 *pQCResult=UNORM_MAYBE;
1381 } else {
1382 return prevBoundary;
1383 }
1384 }
1385 continue;
1386 }
1387 }
1388 if(pQCResult!=NULL) {
1389 *pQCResult=UNORM_NO;
1390 }
1391 return prevBoundary;
1392 }
1393 }
1394
composeAndAppend(const UChar * src,const UChar * limit,UBool doCompose,UBool onlyContiguous,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1395 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1396 UBool doCompose,
1397 UBool onlyContiguous,
1398 UnicodeString &safeMiddle,
1399 ReorderingBuffer &buffer,
1400 UErrorCode &errorCode) const {
1401 if(!buffer.isEmpty()) {
1402 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1403 if(src!=firstStarterInSrc) {
1404 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1405 buffer.getLimit());
1406 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1407 UnicodeString middle(lastStarterInDest, destSuffixLength);
1408 buffer.removeSuffix(destSuffixLength);
1409 safeMiddle=middle;
1410 middle.append(src, (int32_t)(firstStarterInSrc-src));
1411 const UChar *middleStart=middle.getBuffer();
1412 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1413 TRUE, buffer, errorCode);
1414 if(U_FAILURE(errorCode)) {
1415 return;
1416 }
1417 src=firstStarterInSrc;
1418 }
1419 }
1420 if(doCompose) {
1421 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1422 } else {
1423 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1424 limit=u_strchr(src, 0);
1425 }
1426 buffer.appendZeroCC(src, limit, errorCode);
1427 }
1428 }
1429
1430 /**
1431 * Does c have a composition boundary before it?
1432 * True if its decomposition begins with a character that has
1433 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1434 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1435 * (isCompYesAndZeroCC()) so we need not decompose.
1436 */
hasCompBoundaryBefore(UChar32 c,uint16_t norm16) const1437 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1438 for(;;) {
1439 if(isCompYesAndZeroCC(norm16)) {
1440 return TRUE;
1441 } else if(isMaybeOrNonZeroCC(norm16)) {
1442 return FALSE;
1443 } else if(isDecompNoAlgorithmic(norm16)) {
1444 c=mapAlgorithmic(c, norm16);
1445 norm16=getNorm16(c);
1446 } else {
1447 // c decomposes, get everything from the variable-length extra data
1448 const uint16_t *mapping=getMapping(norm16);
1449 uint16_t firstUnit=*mapping;
1450 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1451 return FALSE;
1452 }
1453 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1454 return FALSE; // non-zero leadCC
1455 }
1456 int32_t i=1; // skip over the firstUnit
1457 UChar32 c;
1458 U16_NEXT_UNSAFE(mapping, i, c);
1459 return isCompYesAndZeroCC(getNorm16(c));
1460 }
1461 }
1462 }
1463
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous,UBool testInert) const1464 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1465 for(;;) {
1466 uint16_t norm16=getNorm16(c);
1467 if(isInert(norm16)) {
1468 return TRUE;
1469 } else if(norm16<=minYesNo) {
1470 // Hangul: norm16==minYesNo
1471 // Hangul LVT has a boundary after it.
1472 // Hangul LV and non-inert yesYes characters combine forward.
1473 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1474 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1475 return FALSE;
1476 } else if(isDecompNoAlgorithmic(norm16)) {
1477 c=mapAlgorithmic(c, norm16);
1478 } else {
1479 // c decomposes, get everything from the variable-length extra data.
1480 // If testInert, then c must be a yesNo character which has lccc=0,
1481 // otherwise it could be a noNo.
1482 const uint16_t *mapping=getMapping(norm16);
1483 uint16_t firstUnit=*mapping;
1484 // TRUE if
1485 // not MAPPING_NO_COMP_BOUNDARY_AFTER
1486 // (which is set if
1487 // c is not deleted, and
1488 // it and its decomposition do not combine forward, and it has a starter)
1489 // and if FCC then trailCC<=1
1490 return
1491 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1492 (!onlyContiguous || firstUnit<=0x1ff);
1493 }
1494 }
1495 }
1496
findPreviousCompBoundary(const UChar * start,const UChar * p) const1497 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1498 BackwardUTrie2StringIterator iter(normTrie, start, p);
1499 uint16_t norm16;
1500 do {
1501 norm16=iter.previous16();
1502 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1503 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1504 // but that's probably not worth the extra cost.
1505 return iter.codePointStart;
1506 }
1507
findNextCompBoundary(const UChar * p,const UChar * limit) const1508 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1509 ForwardUTrie2StringIterator iter(normTrie, p, limit);
1510 uint16_t norm16;
1511 do {
1512 norm16=iter.next16();
1513 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1514 return iter.codePointStart;
1515 }
1516
1517 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
1518 // still had getFCDTrie() which built and cached an FCD trie.
1519 // That provided faster access to FCD data than getFCD16FromNormData()
1520 // but required synchronization and consumed some 10kB of heap memory
1521 // in any process that uses FCD (e.g., via collation).
1522 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
1523 // at least for Latin & CJK.
1524
1525 // Gets the FCD value from the regular normalization data.
getFCD16FromNormData(UChar32 c) const1526 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1527 // Only loops for 1:1 algorithmic mappings.
1528 for(;;) {
1529 uint16_t norm16=getNorm16(c);
1530 if(norm16<=minYesNo) {
1531 // no decomposition or Hangul syllable, all zeros
1532 return 0;
1533 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1534 // combining mark
1535 norm16&=0xff;
1536 return norm16|(norm16<<8);
1537 } else if(norm16>=minMaybeYes) {
1538 return 0;
1539 } else if(isDecompNoAlgorithmic(norm16)) {
1540 c=mapAlgorithmic(c, norm16);
1541 } else {
1542 // c decomposes, get everything from the variable-length extra data
1543 const uint16_t *mapping=getMapping(norm16);
1544 uint16_t firstUnit=*mapping;
1545 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1546 // A character that is deleted (maps to an empty string) must
1547 // get the worst-case lccc and tccc values because arbitrary
1548 // characters on both sides will become adjacent.
1549 return 0x1ff;
1550 } else {
1551 norm16=firstUnit>>8; // tccc
1552 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1553 norm16|=*(mapping-1)&0xff00; // lccc
1554 }
1555 return norm16;
1556 }
1557 }
1558 }
1559 }
1560
1561 // Dual functionality:
1562 // buffer!=NULL: normalize
1563 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1564 const UChar *
makeFCD(const UChar * src,const UChar * limit,ReorderingBuffer * buffer,UErrorCode & errorCode) const1565 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1566 ReorderingBuffer *buffer,
1567 UErrorCode &errorCode) const {
1568 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1569 // Similar to the prevBoundary in the compose() implementation.
1570 const UChar *prevBoundary=src;
1571 int32_t prevFCD16=0;
1572 if(limit==NULL) {
1573 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1574 if(U_FAILURE(errorCode)) {
1575 return src;
1576 }
1577 if(prevBoundary<src) {
1578 prevBoundary=src;
1579 // We know that the previous character's lccc==0.
1580 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1581 prevFCD16=getFCD16(*(src-1));
1582 if(prevFCD16>1) {
1583 --prevBoundary;
1584 }
1585 }
1586 limit=u_strchr(src, 0);
1587 }
1588
1589 // Note: In this function we use buffer->appendZeroCC() because we track
1590 // the lead and trail combining classes here, rather than leaving it to
1591 // the ReorderingBuffer.
1592 // The exception is the call to decomposeShort() which uses the buffer
1593 // in the normal way.
1594
1595 const UChar *prevSrc;
1596 UChar32 c=0;
1597 uint16_t fcd16=0;
1598
1599 for(;;) {
1600 // count code units with lccc==0
1601 for(prevSrc=src; src!=limit;) {
1602 if((c=*src)<MIN_CCC_LCCC_CP) {
1603 prevFCD16=~c;
1604 ++src;
1605 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1606 prevFCD16=0;
1607 ++src;
1608 } else {
1609 if(U16_IS_SURROGATE(c)) {
1610 UChar c2;
1611 if(U16_IS_SURROGATE_LEAD(c)) {
1612 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1613 c=U16_GET_SUPPLEMENTARY(c, c2);
1614 }
1615 } else /* trail surrogate */ {
1616 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1617 --src;
1618 c=U16_GET_SUPPLEMENTARY(c2, c);
1619 }
1620 }
1621 }
1622 if((fcd16=getFCD16FromNormData(c))<=0xff) {
1623 prevFCD16=fcd16;
1624 src+=U16_LENGTH(c);
1625 } else {
1626 break;
1627 }
1628 }
1629 }
1630 // copy these code units all at once
1631 if(src!=prevSrc) {
1632 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1633 break;
1634 }
1635 if(src==limit) {
1636 break;
1637 }
1638 prevBoundary=src;
1639 // We know that the previous character's lccc==0.
1640 if(prevFCD16<0) {
1641 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1642 UChar32 prev=~prevFCD16;
1643 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1644 if(prevFCD16>1) {
1645 --prevBoundary;
1646 }
1647 } else {
1648 const UChar *p=src-1;
1649 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1650 --p;
1651 // Need to fetch the previous character's FCD value because
1652 // prevFCD16 was just for the trail surrogate code point.
1653 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
1654 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1655 }
1656 if(prevFCD16>1) {
1657 prevBoundary=p;
1658 }
1659 }
1660 // The start of the current character (c).
1661 prevSrc=src;
1662 } else if(src==limit) {
1663 break;
1664 }
1665
1666 src+=U16_LENGTH(c);
1667 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1668 // Check for proper order, and decompose locally if necessary.
1669 if((prevFCD16&0xff)<=(fcd16>>8)) {
1670 // proper order: prev tccc <= current lccc
1671 if((fcd16&0xff)<=1) {
1672 prevBoundary=src;
1673 }
1674 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1675 break;
1676 }
1677 prevFCD16=fcd16;
1678 continue;
1679 } else if(buffer==NULL) {
1680 return prevBoundary; // quick check "no"
1681 } else {
1682 /*
1683 * Back out the part of the source that we copied or appended
1684 * already but is now going to be decomposed.
1685 * prevSrc is set to after what was copied/appended.
1686 */
1687 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1688 /*
1689 * Find the part of the source that needs to be decomposed,
1690 * up to the next safe boundary.
1691 */
1692 src=findNextFCDBoundary(src, limit);
1693 /*
1694 * The source text does not fulfill the conditions for FCD.
1695 * Decompose and reorder a limited piece of the text.
1696 */
1697 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1698 break;
1699 }
1700 prevBoundary=src;
1701 prevFCD16=0;
1702 }
1703 }
1704 return src;
1705 }
1706
makeFCDAndAppend(const UChar * src,const UChar * limit,UBool doMakeFCD,UnicodeString & safeMiddle,ReorderingBuffer & buffer,UErrorCode & errorCode) const1707 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1708 UBool doMakeFCD,
1709 UnicodeString &safeMiddle,
1710 ReorderingBuffer &buffer,
1711 UErrorCode &errorCode) const {
1712 if(!buffer.isEmpty()) {
1713 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1714 if(src!=firstBoundaryInSrc) {
1715 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1716 buffer.getLimit());
1717 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1718 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1719 buffer.removeSuffix(destSuffixLength);
1720 safeMiddle=middle;
1721 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1722 const UChar *middleStart=middle.getBuffer();
1723 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1724 if(U_FAILURE(errorCode)) {
1725 return;
1726 }
1727 src=firstBoundaryInSrc;
1728 }
1729 }
1730 if(doMakeFCD) {
1731 makeFCD(src, limit, &buffer, errorCode);
1732 } else {
1733 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1734 limit=u_strchr(src, 0);
1735 }
1736 buffer.appendZeroCC(src, limit, errorCode);
1737 }
1738 }
1739
findPreviousFCDBoundary(const UChar * start,const UChar * p) const1740 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1741 while(start<p && previousFCD16(start, p)>0xff) {}
1742 return p;
1743 }
1744
findNextFCDBoundary(const UChar * p,const UChar * limit) const1745 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1746 while(p<limit) {
1747 const UChar *codePointStart=p;
1748 if(nextFCD16(p, limit)<=0xff) {
1749 return codePointStart;
1750 }
1751 }
1752 return p;
1753 }
1754
1755 // CanonicalIterator data -------------------------------------------------- ***
1756
CanonIterData(UErrorCode & errorCode)1757 CanonIterData::CanonIterData(UErrorCode &errorCode) :
1758 trie(utrie2_open(0, 0, &errorCode)),
1759 canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1760
~CanonIterData()1761 CanonIterData::~CanonIterData() {
1762 utrie2_close(trie);
1763 }
1764
addToStartSet(UChar32 origin,UChar32 decompLead,UErrorCode & errorCode)1765 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1766 uint32_t canonValue=utrie2_get32(trie, decompLead);
1767 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1768 // origin is the first character whose decomposition starts with
1769 // the character for which we are setting the value.
1770 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1771 } else {
1772 // origin is not the first character, or it is U+0000.
1773 UnicodeSet *set;
1774 if((canonValue&CANON_HAS_SET)==0) {
1775 set=new UnicodeSet;
1776 if(set==NULL) {
1777 errorCode=U_MEMORY_ALLOCATION_ERROR;
1778 return;
1779 }
1780 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1781 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1782 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1783 canonStartSets.addElement(set, errorCode);
1784 if(firstOrigin!=0) {
1785 set->add(firstOrigin);
1786 }
1787 } else {
1788 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1789 }
1790 set->add(origin);
1791 }
1792 }
1793
1794 class CanonIterDataSingleton {
1795 public:
CanonIterDataSingleton(SimpleSingleton & s,Normalizer2Impl & ni,UErrorCode & ec)1796 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1797 singleton(s), impl(ni), errorCode(ec) {}
getInstance(UErrorCode & errorCode)1798 CanonIterData *getInstance(UErrorCode &errorCode) {
1799 void *duplicate;
1800 CanonIterData *instance=
1801 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1802 delete (CanonIterData *)duplicate;
1803 return instance;
1804 }
1805 static void *createInstance(const void *context, UErrorCode &errorCode);
rangeHandler(UChar32 start,UChar32 end,uint32_t value)1806 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1807 if(value!=0) {
1808 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1809 }
1810 return U_SUCCESS(errorCode);
1811 }
1812
1813 private:
1814 SimpleSingleton &singleton;
1815 Normalizer2Impl &impl;
1816 CanonIterData *newData;
1817 UErrorCode &errorCode;
1818 };
1819
1820 U_CDECL_BEGIN
1821
1822 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1823 static UBool U_CALLCONV
enumCIDRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)1824 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1825 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1826 }
1827
1828 U_CDECL_END
1829
createInstance(const void * context,UErrorCode & errorCode)1830 void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1831 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1832 me->newData=new CanonIterData(errorCode);
1833 if(me->newData==NULL) {
1834 errorCode=U_MEMORY_ALLOCATION_ERROR;
1835 return NULL;
1836 }
1837 if(U_SUCCESS(errorCode)) {
1838 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1839 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1840 if(U_SUCCESS(errorCode)) {
1841 return me->newData;
1842 }
1843 }
1844 delete me->newData;
1845 return NULL;
1846 }
1847
makeCanonIterDataFromNorm16(UChar32 start,UChar32 end,uint16_t norm16,CanonIterData & newData,UErrorCode & errorCode) const1848 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1849 CanonIterData &newData,
1850 UErrorCode &errorCode) const {
1851 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1852 // Inert, or 2-way mapping (including Hangul syllable).
1853 // We do not write a canonStartSet for any yesNo character.
1854 // Composites from 2-way mappings are added at runtime from the
1855 // starter's compositions list, and the other characters in
1856 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1857 // "maybe" characters.
1858 return;
1859 }
1860 for(UChar32 c=start; c<=end; ++c) {
1861 uint32_t oldValue=utrie2_get32(newData.trie, c);
1862 uint32_t newValue=oldValue;
1863 if(norm16>=minMaybeYes) {
1864 // not a segment starter if it occurs in a decomposition or has cc!=0
1865 newValue|=CANON_NOT_SEGMENT_STARTER;
1866 if(norm16<MIN_NORMAL_MAYBE_YES) {
1867 newValue|=CANON_HAS_COMPOSITIONS;
1868 }
1869 } else if(norm16<minYesNo) {
1870 newValue|=CANON_HAS_COMPOSITIONS;
1871 } else {
1872 // c has a one-way decomposition
1873 UChar32 c2=c;
1874 uint16_t norm16_2=norm16;
1875 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1876 c2=mapAlgorithmic(c2, norm16_2);
1877 norm16_2=getNorm16(c2);
1878 }
1879 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1880 // c decomposes, get everything from the variable-length extra data
1881 const uint16_t *mapping=getMapping(norm16_2);
1882 uint16_t firstUnit=*mapping;
1883 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1884 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1885 if(c==c2 && (*(mapping-1)&0xff)!=0) {
1886 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
1887 }
1888 }
1889 // Skip empty mappings (no characters in the decomposition).
1890 if(length!=0) {
1891 ++mapping; // skip over the firstUnit
1892 // add c to first code point's start set
1893 int32_t i=0;
1894 U16_NEXT_UNSAFE(mapping, i, c2);
1895 newData.addToStartSet(c, c2, errorCode);
1896 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1897 // one-way mapping. A 2-way mapping is possible here after
1898 // intermediate algorithmic mapping.
1899 if(norm16_2>=minNoNo) {
1900 while(i<length) {
1901 U16_NEXT_UNSAFE(mapping, i, c2);
1902 uint32_t c2Value=utrie2_get32(newData.trie, c2);
1903 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1904 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1905 &errorCode);
1906 }
1907 }
1908 }
1909 }
1910 } else {
1911 // c decomposed to c2 algorithmically; c has cc==0
1912 newData.addToStartSet(c, c2, errorCode);
1913 }
1914 }
1915 if(newValue!=oldValue) {
1916 utrie2_set32(newData.trie, c, newValue, &errorCode);
1917 }
1918 }
1919 }
1920
ensureCanonIterData(UErrorCode & errorCode) const1921 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1922 // Logically const: Synchronized instantiation.
1923 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1924 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1925 return U_SUCCESS(errorCode);
1926 }
1927
getCanonValue(UChar32 c) const1928 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1929 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1930 }
1931
getCanonStartSet(int32_t n) const1932 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1933 return *(const UnicodeSet *)(
1934 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1935 }
1936
isCanonSegmentStarter(UChar32 c) const1937 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1938 return getCanonValue(c)>=0;
1939 }
1940
getCanonStartSet(UChar32 c,UnicodeSet & set) const1941 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1942 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1943 if(canonValue==0) {
1944 return FALSE;
1945 }
1946 set.clear();
1947 int32_t value=canonValue&CANON_VALUE_MASK;
1948 if((canonValue&CANON_HAS_SET)!=0) {
1949 set.addAll(getCanonStartSet(value));
1950 } else if(value!=0) {
1951 set.add(value);
1952 }
1953 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1954 uint16_t norm16=getNorm16(c);
1955 if(norm16==JAMO_L) {
1956 UChar32 syllable=
1957 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1958 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1959 } else {
1960 addComposites(getCompositionsList(norm16), set);
1961 }
1962 }
1963 return TRUE;
1964 }
1965
1966 U_NAMESPACE_END
1967
1968 // Normalizer2 data swapping ----------------------------------------------- ***
1969
1970 U_NAMESPACE_USE
1971
1972 U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1973 unorm2_swap(const UDataSwapper *ds,
1974 const void *inData, int32_t length, void *outData,
1975 UErrorCode *pErrorCode) {
1976 const UDataInfo *pInfo;
1977 int32_t headerSize;
1978
1979 const uint8_t *inBytes;
1980 uint8_t *outBytes;
1981
1982 const int32_t *inIndexes;
1983 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1984
1985 int32_t i, offset, nextOffset, size;
1986
1987 /* udata_swapDataHeader checks the arguments */
1988 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1989 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1990 return 0;
1991 }
1992
1993 /* check data format and format version */
1994 pInfo=(const UDataInfo *)((const char *)inData+4);
1995 if(!(
1996 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1997 pInfo->dataFormat[1]==0x72 &&
1998 pInfo->dataFormat[2]==0x6d &&
1999 pInfo->dataFormat[3]==0x32 &&
2000 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
2001 )) {
2002 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2003 pInfo->dataFormat[0], pInfo->dataFormat[1],
2004 pInfo->dataFormat[2], pInfo->dataFormat[3],
2005 pInfo->formatVersion[0]);
2006 *pErrorCode=U_UNSUPPORTED_ERROR;
2007 return 0;
2008 }
2009
2010 inBytes=(const uint8_t *)inData+headerSize;
2011 outBytes=(uint8_t *)outData+headerSize;
2012
2013 inIndexes=(const int32_t *)inBytes;
2014
2015 if(length>=0) {
2016 length-=headerSize;
2017 if(length<(int32_t)sizeof(indexes)) {
2018 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2019 length);
2020 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2021 return 0;
2022 }
2023 }
2024
2025 /* read the first few indexes */
2026 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2027 indexes[i]=udata_readInt32(ds, inIndexes[i]);
2028 }
2029
2030 /* get the total length of the data */
2031 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2032
2033 if(length>=0) {
2034 if(length<size) {
2035 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2036 length);
2037 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2038 return 0;
2039 }
2040
2041 /* copy the data for inaccessible bytes */
2042 if(inBytes!=outBytes) {
2043 uprv_memcpy(outBytes, inBytes, size);
2044 }
2045
2046 offset=0;
2047
2048 /* swap the int32_t indexes[] */
2049 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2050 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2051 offset=nextOffset;
2052
2053 /* swap the UTrie2 */
2054 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2055 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2056 offset=nextOffset;
2057
2058 /* swap the uint16_t extraData[] */
2059 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2060 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2061 offset=nextOffset;
2062
2063 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2064 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2065 offset=nextOffset;
2066
2067 U_ASSERT(offset==size);
2068 }
2069
2070 return headerSize+size;
2071 }
2072
2073 #endif // !UCONFIG_NO_NORMALIZATION
2074