1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ustrcase.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
17 *
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/stringoptions.h"
28 #include "unicode/ustring.h"
29 #include "unicode/ucasemap.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf16.h"
33 #include "cmemory.h"
34 #include "ucase.h"
35 #include "ucasemap_imp.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38
39 U_NAMESPACE_BEGIN
40
41 namespace {
42
checkOverflowAndEditsError(int32_t destIndex,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)43 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
44 Edits *edits, UErrorCode &errorCode) {
45 if (U_SUCCESS(errorCode)) {
46 if (destIndex > destCapacity) {
47 errorCode = U_BUFFER_OVERFLOW_ERROR;
48 } else if (edits != NULL) {
49 edits->copyErrorTo(errorCode);
50 }
51 }
52 return destIndex;
53 }
54
55 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
56 inline int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s,int32_t cpLength,uint32_t options,icu::Edits * edits)57 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
58 int32_t result, const UChar *s,
59 int32_t cpLength, uint32_t options, icu::Edits *edits) {
60 UChar32 c;
61 int32_t length;
62
63 /* decode the result */
64 if(result<0) {
65 /* (not) original code point */
66 if(edits!=NULL) {
67 edits->addUnchanged(cpLength);
68 }
69 if(options & U_OMIT_UNCHANGED_TEXT) {
70 return destIndex;
71 }
72 c=~result;
73 if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
74 dest[destIndex++]=(UChar)c;
75 return destIndex;
76 }
77 length=cpLength;
78 } else {
79 if(result<=UCASE_MAX_STRING_LENGTH) {
80 c=U_SENTINEL;
81 length=result;
82 } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
83 dest[destIndex++]=(UChar)result;
84 if(edits!=NULL) {
85 edits->addReplace(cpLength, 1);
86 }
87 return destIndex;
88 } else {
89 c=result;
90 length=U16_LENGTH(c);
91 }
92 if(edits!=NULL) {
93 edits->addReplace(cpLength, length);
94 }
95 }
96 if(length>(INT32_MAX-destIndex)) {
97 return -1; // integer overflow
98 }
99
100 if(destIndex<destCapacity) {
101 /* append the result */
102 if(c>=0) {
103 /* code point */
104 UBool isError=FALSE;
105 U16_APPEND(dest, destIndex, destCapacity, c, isError);
106 if(isError) {
107 /* overflow, nothing written */
108 destIndex+=length;
109 }
110 } else {
111 /* string */
112 if((destIndex+length)<=destCapacity) {
113 while(length>0) {
114 dest[destIndex++]=*s++;
115 --length;
116 }
117 } else {
118 /* overflow */
119 destIndex+=length;
120 }
121 }
122 } else {
123 /* preflight */
124 destIndex+=length;
125 }
126 return destIndex;
127 }
128
129 inline int32_t
appendUChar(UChar * dest,int32_t destIndex,int32_t destCapacity,UChar c)130 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
131 if(destIndex<destCapacity) {
132 dest[destIndex]=c;
133 } else if(destIndex==INT32_MAX) {
134 return -1; // integer overflow
135 }
136 return destIndex+1;
137 }
138
139 int32_t
appendNonEmptyUnchanged(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length,uint32_t options,icu::Edits * edits)140 appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
141 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
142 if(edits!=NULL) {
143 edits->addUnchanged(length);
144 }
145 if(options & U_OMIT_UNCHANGED_TEXT) {
146 return destIndex;
147 }
148 if(length>(INT32_MAX-destIndex)) {
149 return -1; // integer overflow
150 }
151 if((destIndex+length)<=destCapacity) {
152 u_memcpy(dest+destIndex, s, length);
153 }
154 return destIndex + length;
155 }
156
157 inline int32_t
appendUnchanged(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length,uint32_t options,icu::Edits * edits)158 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
159 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
160 if (length <= 0) {
161 return destIndex;
162 }
163 return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
164 }
165
166 UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)167 utf16_caseContextIterator(void *context, int8_t dir) {
168 UCaseContext *csc=(UCaseContext *)context;
169 UChar32 c;
170
171 if(dir<0) {
172 /* reset for backward iteration */
173 csc->index=csc->cpStart;
174 csc->dir=dir;
175 } else if(dir>0) {
176 /* reset for forward iteration */
177 csc->index=csc->cpLimit;
178 csc->dir=dir;
179 } else {
180 /* continue current iteration direction */
181 dir=csc->dir;
182 }
183
184 if(dir<0) {
185 if(csc->start<csc->index) {
186 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
187 return c;
188 }
189 } else {
190 if(csc->index<csc->limit) {
191 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
192 return c;
193 }
194 }
195 return U_SENTINEL;
196 }
197
198 /**
199 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
200 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
201 */
toLower(int32_t caseLocale,uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::Edits * edits,UErrorCode & errorCode)202 int32_t toLower(int32_t caseLocale, uint32_t options,
203 UChar *dest, int32_t destCapacity,
204 const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
205 icu::Edits *edits, UErrorCode &errorCode) {
206 const int8_t *latinToLower;
207 if (caseLocale == UCASE_LOC_ROOT ||
208 (caseLocale >= 0 ?
209 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
210 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211 latinToLower = LatinCase::TO_LOWER_NORMAL;
212 } else {
213 latinToLower = LatinCase::TO_LOWER_TR_LT;
214 }
215 const UTrie2 *trie = ucase_getTrie();
216 int32_t destIndex = 0;
217 int32_t prev = srcStart;
218 int32_t srcIndex = srcStart;
219 for (;;) {
220 // fast path for simple cases
221 UChar lead = 0;
222 while (srcIndex < srcLimit) {
223 lead = src[srcIndex];
224 int32_t delta;
225 if (lead < LatinCase::LONG_S) {
226 int8_t d = latinToLower[lead];
227 if (d == LatinCase::EXC) { break; }
228 ++srcIndex;
229 if (d == 0) { continue; }
230 delta = d;
231 } else if (lead >= 0xd800) {
232 break; // surrogate or higher
233 } else {
234 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
235 if (UCASE_HAS_EXCEPTION(props)) { break; }
236 ++srcIndex;
237 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
238 continue;
239 }
240 }
241 lead += static_cast<UChar>(delta);
242 destIndex = appendUnchanged(dest, destIndex, destCapacity,
243 src + prev, srcIndex - 1 - prev, options, edits);
244 if (destIndex >= 0) {
245 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
246 if (edits != nullptr) {
247 edits->addReplace(1, 1);
248 }
249 }
250 if (destIndex < 0) {
251 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
252 return 0;
253 }
254 prev = srcIndex;
255 }
256 if (srcIndex >= srcLimit) {
257 break;
258 }
259 // slow path
260 int32_t cpStart = srcIndex++;
261 UChar trail;
262 UChar32 c;
263 if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
264 c = U16_GET_SUPPLEMENTARY(lead, trail);
265 ++srcIndex;
266 } else {
267 c = lead;
268 }
269 const UChar *s;
270 if (caseLocale >= 0) {
271 csc->cpStart = cpStart;
272 csc->cpLimit = srcIndex;
273 c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
274 } else {
275 c = ucase_toFullFolding(c, &s, options);
276 }
277 if (c >= 0) {
278 destIndex = appendUnchanged(dest, destIndex, destCapacity,
279 src + prev, cpStart - prev, options, edits);
280 if (destIndex >= 0) {
281 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
282 srcIndex - cpStart, options, edits);
283 }
284 if (destIndex < 0) {
285 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
286 return 0;
287 }
288 prev = srcIndex;
289 }
290 }
291 destIndex = appendUnchanged(dest, destIndex, destCapacity,
292 src + prev, srcIndex - prev, options, edits);
293 if (destIndex < 0) {
294 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
295 return 0;
296 }
297 return destIndex;
298 }
299
toUpper(int32_t caseLocale,uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)300 int32_t toUpper(int32_t caseLocale, uint32_t options,
301 UChar *dest, int32_t destCapacity,
302 const UChar *src, UCaseContext *csc, int32_t srcLength,
303 icu::Edits *edits, UErrorCode &errorCode) {
304 const int8_t *latinToUpper;
305 if (caseLocale == UCASE_LOC_TURKISH) {
306 latinToUpper = LatinCase::TO_UPPER_TR;
307 } else {
308 latinToUpper = LatinCase::TO_UPPER_NORMAL;
309 }
310 const UTrie2 *trie = ucase_getTrie();
311 int32_t destIndex = 0;
312 int32_t prev = 0;
313 int32_t srcIndex = 0;
314 for (;;) {
315 // fast path for simple cases
316 UChar lead = 0;
317 while (srcIndex < srcLength) {
318 lead = src[srcIndex];
319 int32_t delta;
320 if (lead < LatinCase::LONG_S) {
321 int8_t d = latinToUpper[lead];
322 if (d == LatinCase::EXC) { break; }
323 ++srcIndex;
324 if (d == 0) { continue; }
325 delta = d;
326 } else if (lead >= 0xd800) {
327 break; // surrogate or higher
328 } else {
329 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
330 if (UCASE_HAS_EXCEPTION(props)) { break; }
331 ++srcIndex;
332 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
333 continue;
334 }
335 }
336 lead += static_cast<UChar>(delta);
337 destIndex = appendUnchanged(dest, destIndex, destCapacity,
338 src + prev, srcIndex - 1 - prev, options, edits);
339 if (destIndex >= 0) {
340 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
341 if (edits != nullptr) {
342 edits->addReplace(1, 1);
343 }
344 }
345 if (destIndex < 0) {
346 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
347 return 0;
348 }
349 prev = srcIndex;
350 }
351 if (srcIndex >= srcLength) {
352 break;
353 }
354 // slow path
355 int32_t cpStart;
356 csc->cpStart = cpStart = srcIndex++;
357 UChar trail;
358 UChar32 c;
359 if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
360 c = U16_GET_SUPPLEMENTARY(lead, trail);
361 ++srcIndex;
362 } else {
363 c = lead;
364 }
365 csc->cpLimit = srcIndex;
366 const UChar *s;
367 c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
368 if (c >= 0) {
369 destIndex = appendUnchanged(dest, destIndex, destCapacity,
370 src + prev, cpStart - prev, options, edits);
371 if (destIndex >= 0) {
372 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
373 srcIndex - cpStart, options, edits);
374 }
375 if (destIndex < 0) {
376 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
377 return 0;
378 }
379 prev = srcIndex;
380 }
381 }
382 destIndex = appendUnchanged(dest, destIndex, destCapacity,
383 src + prev, srcIndex - prev, options, edits);
384 if (destIndex < 0) {
385 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
386 return 0;
387 }
388 return destIndex;
389 }
390
391 } // namespace
392
393 U_NAMESPACE_END
394
395 U_NAMESPACE_USE
396
397 #if !UCONFIG_NO_BREAK_ITERATION
398
399 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)400 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
401 UChar *dest, int32_t destCapacity,
402 const UChar *src, int32_t srcLength,
403 icu::Edits *edits,
404 UErrorCode &errorCode) {
405 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
406 return 0;
407 }
408
409 /* set up local variables */
410 UCaseContext csc=UCASECONTEXT_INITIALIZER;
411 csc.p=(void *)src;
412 csc.limit=srcLength;
413 int32_t destIndex=0;
414 int32_t prev=0;
415 UBool isFirstIndex=TRUE;
416
417 /* titlecasing loop */
418 while(prev<srcLength) {
419 /* find next index where to titlecase */
420 int32_t index;
421 if(isFirstIndex) {
422 isFirstIndex=FALSE;
423 index=iter->first();
424 } else {
425 index=iter->next();
426 }
427 if(index==UBRK_DONE || index>srcLength) {
428 index=srcLength;
429 }
430
431 /*
432 * Segment [prev..index[ into 3 parts:
433 * a) skipped characters (copy as-is) [prev..titleStart[
434 * b) first letter (titlecase) [titleStart..titleLimit[
435 * c) subsequent characters (lowercase) [titleLimit..index[
436 */
437 if(prev<index) {
438 // Find and copy skipped characters [prev..titleStart[
439 int32_t titleStart=prev;
440 int32_t titleLimit=prev;
441 UChar32 c;
442 U16_NEXT(src, titleLimit, index, c);
443 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
444 // Adjust the titlecasing index to the next cased character,
445 // or to the next letter/number/symbol/private use.
446 // Stop with titleStart<titleLimit<=index
447 // if there is a character to be titlecased,
448 // or else stop with titleStart==titleLimit==index.
449 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
450 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
451 titleStart=titleLimit;
452 if(titleLimit==index) {
453 break;
454 }
455 U16_NEXT(src, titleLimit, index, c);
456 }
457 if (prev < titleStart) {
458 destIndex=appendUnchanged(dest, destIndex, destCapacity,
459 src+prev, titleStart-prev, options, edits);
460 if(destIndex<0) {
461 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
462 return 0;
463 }
464 }
465 }
466
467 if(titleStart<titleLimit) {
468 /* titlecase c which is from [titleStart..titleLimit[ */
469 csc.cpStart=titleStart;
470 csc.cpLimit=titleLimit;
471 const UChar *s;
472 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
473 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
474 titleLimit-titleStart, options, edits);
475 if(destIndex<0) {
476 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
477 return 0;
478 }
479
480 /* Special case Dutch IJ titlecasing */
481 if (titleStart+1 < index &&
482 caseLocale == UCASE_LOC_DUTCH &&
483 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
484 if (src[titleStart+1] == 0x006A) {
485 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
486 if(destIndex<0) {
487 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
488 return 0;
489 }
490 if(edits!=NULL) {
491 edits->addReplace(1, 1);
492 }
493 titleLimit++;
494 } else if (src[titleStart+1] == 0x004A) {
495 // Keep the capital J from getting lowercased.
496 destIndex=appendUnchanged(dest, destIndex, destCapacity,
497 src+titleStart+1, 1, options, edits);
498 if(destIndex<0) {
499 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
500 return 0;
501 }
502 titleLimit++;
503 }
504 }
505
506 /* lowercase [titleLimit..index[ */
507 if(titleLimit<index) {
508 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
509 /* Normal operation: Lowercase the rest of the word. */
510 destIndex+=
511 toLower(
512 caseLocale, options,
513 dest+destIndex, destCapacity-destIndex,
514 src, &csc, titleLimit, index,
515 edits, errorCode);
516 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
517 errorCode=U_ZERO_ERROR;
518 }
519 if(U_FAILURE(errorCode)) {
520 return destIndex;
521 }
522 } else {
523 /* Optionally just copy the rest of the word unchanged. */
524 destIndex=appendUnchanged(dest, destIndex, destCapacity,
525 src+titleLimit, index-titleLimit, options, edits);
526 if(destIndex<0) {
527 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
528 return 0;
529 }
530 }
531 }
532 }
533 }
534
535 prev=index;
536 }
537
538 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
539 }
540
541 #endif // !UCONFIG_NO_BREAK_ITERATION
542
543 U_NAMESPACE_BEGIN
544 namespace GreekUpper {
545
546 // Data generated by prototype code, see
547 // http://site.icu-project.org/design/case/greek-upper
548 // TODO: Move this data into ucase.icu.
549 static const uint16_t data0370[] = {
550 // U+0370..03FF
551 0x0370,
552 0x0370,
553 0x0372,
554 0x0372,
555 0,
556 0,
557 0x0376,
558 0x0376,
559 0,
560 0,
561 0x037A,
562 0x03FD,
563 0x03FE,
564 0x03FF,
565 0,
566 0x037F,
567 0,
568 0,
569 0,
570 0,
571 0,
572 0,
573 0x0391 | HAS_VOWEL | HAS_ACCENT,
574 0,
575 0x0395 | HAS_VOWEL | HAS_ACCENT,
576 0x0397 | HAS_VOWEL | HAS_ACCENT,
577 0x0399 | HAS_VOWEL | HAS_ACCENT,
578 0,
579 0x039F | HAS_VOWEL | HAS_ACCENT,
580 0,
581 0x03A5 | HAS_VOWEL | HAS_ACCENT,
582 0x03A9 | HAS_VOWEL | HAS_ACCENT,
583 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
584 0x0391 | HAS_VOWEL,
585 0x0392,
586 0x0393,
587 0x0394,
588 0x0395 | HAS_VOWEL,
589 0x0396,
590 0x0397 | HAS_VOWEL,
591 0x0398,
592 0x0399 | HAS_VOWEL,
593 0x039A,
594 0x039B,
595 0x039C,
596 0x039D,
597 0x039E,
598 0x039F | HAS_VOWEL,
599 0x03A0,
600 0x03A1,
601 0,
602 0x03A3,
603 0x03A4,
604 0x03A5 | HAS_VOWEL,
605 0x03A6,
606 0x03A7,
607 0x03A8,
608 0x03A9 | HAS_VOWEL,
609 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
610 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
611 0x0391 | HAS_VOWEL | HAS_ACCENT,
612 0x0395 | HAS_VOWEL | HAS_ACCENT,
613 0x0397 | HAS_VOWEL | HAS_ACCENT,
614 0x0399 | HAS_VOWEL | HAS_ACCENT,
615 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
616 0x0391 | HAS_VOWEL,
617 0x0392,
618 0x0393,
619 0x0394,
620 0x0395 | HAS_VOWEL,
621 0x0396,
622 0x0397 | HAS_VOWEL,
623 0x0398,
624 0x0399 | HAS_VOWEL,
625 0x039A,
626 0x039B,
627 0x039C,
628 0x039D,
629 0x039E,
630 0x039F | HAS_VOWEL,
631 0x03A0,
632 0x03A1,
633 0x03A3,
634 0x03A3,
635 0x03A4,
636 0x03A5 | HAS_VOWEL,
637 0x03A6,
638 0x03A7,
639 0x03A8,
640 0x03A9 | HAS_VOWEL,
641 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
642 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
643 0x039F | HAS_VOWEL | HAS_ACCENT,
644 0x03A5 | HAS_VOWEL | HAS_ACCENT,
645 0x03A9 | HAS_VOWEL | HAS_ACCENT,
646 0x03CF,
647 0x0392,
648 0x0398,
649 0x03D2,
650 0x03D2 | HAS_ACCENT,
651 0x03D2 | HAS_DIALYTIKA,
652 0x03A6,
653 0x03A0,
654 0x03CF,
655 0x03D8,
656 0x03D8,
657 0x03DA,
658 0x03DA,
659 0x03DC,
660 0x03DC,
661 0x03DE,
662 0x03DE,
663 0x03E0,
664 0x03E0,
665 0,
666 0,
667 0,
668 0,
669 0,
670 0,
671 0,
672 0,
673 0,
674 0,
675 0,
676 0,
677 0,
678 0,
679 0x039A,
680 0x03A1,
681 0x03F9,
682 0x037F,
683 0x03F4,
684 0x0395 | HAS_VOWEL,
685 0,
686 0x03F7,
687 0x03F7,
688 0x03F9,
689 0x03FA,
690 0x03FA,
691 0x03FC,
692 0x03FD,
693 0x03FE,
694 0x03FF,
695 };
696
697 static const uint16_t data1F00[] = {
698 // U+1F00..1FFF
699 0x0391 | HAS_VOWEL,
700 0x0391 | HAS_VOWEL,
701 0x0391 | HAS_VOWEL | HAS_ACCENT,
702 0x0391 | HAS_VOWEL | HAS_ACCENT,
703 0x0391 | HAS_VOWEL | HAS_ACCENT,
704 0x0391 | HAS_VOWEL | HAS_ACCENT,
705 0x0391 | HAS_VOWEL | HAS_ACCENT,
706 0x0391 | HAS_VOWEL | HAS_ACCENT,
707 0x0391 | HAS_VOWEL,
708 0x0391 | HAS_VOWEL,
709 0x0391 | HAS_VOWEL | HAS_ACCENT,
710 0x0391 | HAS_VOWEL | HAS_ACCENT,
711 0x0391 | HAS_VOWEL | HAS_ACCENT,
712 0x0391 | HAS_VOWEL | HAS_ACCENT,
713 0x0391 | HAS_VOWEL | HAS_ACCENT,
714 0x0391 | HAS_VOWEL | HAS_ACCENT,
715 0x0395 | HAS_VOWEL,
716 0x0395 | HAS_VOWEL,
717 0x0395 | HAS_VOWEL | HAS_ACCENT,
718 0x0395 | HAS_VOWEL | HAS_ACCENT,
719 0x0395 | HAS_VOWEL | HAS_ACCENT,
720 0x0395 | HAS_VOWEL | HAS_ACCENT,
721 0,
722 0,
723 0x0395 | HAS_VOWEL,
724 0x0395 | HAS_VOWEL,
725 0x0395 | HAS_VOWEL | HAS_ACCENT,
726 0x0395 | HAS_VOWEL | HAS_ACCENT,
727 0x0395 | HAS_VOWEL | HAS_ACCENT,
728 0x0395 | HAS_VOWEL | HAS_ACCENT,
729 0,
730 0,
731 0x0397 | HAS_VOWEL,
732 0x0397 | HAS_VOWEL,
733 0x0397 | HAS_VOWEL | HAS_ACCENT,
734 0x0397 | HAS_VOWEL | HAS_ACCENT,
735 0x0397 | HAS_VOWEL | HAS_ACCENT,
736 0x0397 | HAS_VOWEL | HAS_ACCENT,
737 0x0397 | HAS_VOWEL | HAS_ACCENT,
738 0x0397 | HAS_VOWEL | HAS_ACCENT,
739 0x0397 | HAS_VOWEL,
740 0x0397 | HAS_VOWEL,
741 0x0397 | HAS_VOWEL | HAS_ACCENT,
742 0x0397 | HAS_VOWEL | HAS_ACCENT,
743 0x0397 | HAS_VOWEL | HAS_ACCENT,
744 0x0397 | HAS_VOWEL | HAS_ACCENT,
745 0x0397 | HAS_VOWEL | HAS_ACCENT,
746 0x0397 | HAS_VOWEL | HAS_ACCENT,
747 0x0399 | HAS_VOWEL,
748 0x0399 | HAS_VOWEL,
749 0x0399 | HAS_VOWEL | HAS_ACCENT,
750 0x0399 | HAS_VOWEL | HAS_ACCENT,
751 0x0399 | HAS_VOWEL | HAS_ACCENT,
752 0x0399 | HAS_VOWEL | HAS_ACCENT,
753 0x0399 | HAS_VOWEL | HAS_ACCENT,
754 0x0399 | HAS_VOWEL | HAS_ACCENT,
755 0x0399 | HAS_VOWEL,
756 0x0399 | HAS_VOWEL,
757 0x0399 | HAS_VOWEL | HAS_ACCENT,
758 0x0399 | HAS_VOWEL | HAS_ACCENT,
759 0x0399 | HAS_VOWEL | HAS_ACCENT,
760 0x0399 | HAS_VOWEL | HAS_ACCENT,
761 0x0399 | HAS_VOWEL | HAS_ACCENT,
762 0x0399 | HAS_VOWEL | HAS_ACCENT,
763 0x039F | HAS_VOWEL,
764 0x039F | HAS_VOWEL,
765 0x039F | HAS_VOWEL | HAS_ACCENT,
766 0x039F | HAS_VOWEL | HAS_ACCENT,
767 0x039F | HAS_VOWEL | HAS_ACCENT,
768 0x039F | HAS_VOWEL | HAS_ACCENT,
769 0,
770 0,
771 0x039F | HAS_VOWEL,
772 0x039F | HAS_VOWEL,
773 0x039F | HAS_VOWEL | HAS_ACCENT,
774 0x039F | HAS_VOWEL | HAS_ACCENT,
775 0x039F | HAS_VOWEL | HAS_ACCENT,
776 0x039F | HAS_VOWEL | HAS_ACCENT,
777 0,
778 0,
779 0x03A5 | HAS_VOWEL,
780 0x03A5 | HAS_VOWEL,
781 0x03A5 | HAS_VOWEL | HAS_ACCENT,
782 0x03A5 | HAS_VOWEL | HAS_ACCENT,
783 0x03A5 | HAS_VOWEL | HAS_ACCENT,
784 0x03A5 | HAS_VOWEL | HAS_ACCENT,
785 0x03A5 | HAS_VOWEL | HAS_ACCENT,
786 0x03A5 | HAS_VOWEL | HAS_ACCENT,
787 0,
788 0x03A5 | HAS_VOWEL,
789 0,
790 0x03A5 | HAS_VOWEL | HAS_ACCENT,
791 0,
792 0x03A5 | HAS_VOWEL | HAS_ACCENT,
793 0,
794 0x03A5 | HAS_VOWEL | HAS_ACCENT,
795 0x03A9 | HAS_VOWEL,
796 0x03A9 | HAS_VOWEL,
797 0x03A9 | HAS_VOWEL | HAS_ACCENT,
798 0x03A9 | HAS_VOWEL | HAS_ACCENT,
799 0x03A9 | HAS_VOWEL | HAS_ACCENT,
800 0x03A9 | HAS_VOWEL | HAS_ACCENT,
801 0x03A9 | HAS_VOWEL | HAS_ACCENT,
802 0x03A9 | HAS_VOWEL | HAS_ACCENT,
803 0x03A9 | HAS_VOWEL,
804 0x03A9 | HAS_VOWEL,
805 0x03A9 | HAS_VOWEL | HAS_ACCENT,
806 0x03A9 | HAS_VOWEL | HAS_ACCENT,
807 0x03A9 | HAS_VOWEL | HAS_ACCENT,
808 0x03A9 | HAS_VOWEL | HAS_ACCENT,
809 0x03A9 | HAS_VOWEL | HAS_ACCENT,
810 0x03A9 | HAS_VOWEL | HAS_ACCENT,
811 0x0391 | HAS_VOWEL | HAS_ACCENT,
812 0x0391 | HAS_VOWEL | HAS_ACCENT,
813 0x0395 | HAS_VOWEL | HAS_ACCENT,
814 0x0395 | HAS_VOWEL | HAS_ACCENT,
815 0x0397 | HAS_VOWEL | HAS_ACCENT,
816 0x0397 | HAS_VOWEL | HAS_ACCENT,
817 0x0399 | HAS_VOWEL | HAS_ACCENT,
818 0x0399 | HAS_VOWEL | HAS_ACCENT,
819 0x039F | HAS_VOWEL | HAS_ACCENT,
820 0x039F | HAS_VOWEL | HAS_ACCENT,
821 0x03A5 | HAS_VOWEL | HAS_ACCENT,
822 0x03A5 | HAS_VOWEL | HAS_ACCENT,
823 0x03A9 | HAS_VOWEL | HAS_ACCENT,
824 0x03A9 | HAS_VOWEL | HAS_ACCENT,
825 0,
826 0,
827 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
828 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
829 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
830 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
831 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
832 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
833 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
834 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
835 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
836 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
837 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
838 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
839 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
840 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
841 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
842 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
843 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
844 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
845 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
846 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
847 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
848 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
849 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
850 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
851 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
852 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
853 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
854 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
855 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
856 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
857 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
858 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
859 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
860 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
861 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
862 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
863 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
864 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
865 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
866 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
867 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
868 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
869 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
870 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
871 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
872 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
873 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
874 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
875 0x0391 | HAS_VOWEL,
876 0x0391 | HAS_VOWEL,
877 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
878 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
879 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
880 0,
881 0x0391 | HAS_VOWEL | HAS_ACCENT,
882 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
883 0x0391 | HAS_VOWEL,
884 0x0391 | HAS_VOWEL,
885 0x0391 | HAS_VOWEL | HAS_ACCENT,
886 0x0391 | HAS_VOWEL | HAS_ACCENT,
887 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
888 0,
889 0x0399 | HAS_VOWEL,
890 0,
891 0,
892 0,
893 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
894 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
895 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
896 0,
897 0x0397 | HAS_VOWEL | HAS_ACCENT,
898 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
899 0x0395 | HAS_VOWEL | HAS_ACCENT,
900 0x0395 | HAS_VOWEL | HAS_ACCENT,
901 0x0397 | HAS_VOWEL | HAS_ACCENT,
902 0x0397 | HAS_VOWEL | HAS_ACCENT,
903 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
904 0,
905 0,
906 0,
907 0x0399 | HAS_VOWEL,
908 0x0399 | HAS_VOWEL,
909 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
910 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
911 0,
912 0,
913 0x0399 | HAS_VOWEL | HAS_ACCENT,
914 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
915 0x0399 | HAS_VOWEL,
916 0x0399 | HAS_VOWEL,
917 0x0399 | HAS_VOWEL | HAS_ACCENT,
918 0x0399 | HAS_VOWEL | HAS_ACCENT,
919 0,
920 0,
921 0,
922 0,
923 0x03A5 | HAS_VOWEL,
924 0x03A5 | HAS_VOWEL,
925 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
926 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
927 0x03A1,
928 0x03A1,
929 0x03A5 | HAS_VOWEL | HAS_ACCENT,
930 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
931 0x03A5 | HAS_VOWEL,
932 0x03A5 | HAS_VOWEL,
933 0x03A5 | HAS_VOWEL | HAS_ACCENT,
934 0x03A5 | HAS_VOWEL | HAS_ACCENT,
935 0x03A1,
936 0,
937 0,
938 0,
939 0,
940 0,
941 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
942 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
943 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
944 0,
945 0x03A9 | HAS_VOWEL | HAS_ACCENT,
946 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
947 0x039F | HAS_VOWEL | HAS_ACCENT,
948 0x039F | HAS_VOWEL | HAS_ACCENT,
949 0x03A9 | HAS_VOWEL | HAS_ACCENT,
950 0x03A9 | HAS_VOWEL | HAS_ACCENT,
951 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
952 0,
953 0,
954 0,
955 };
956
957 // U+2126 Ohm sign
958 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
959
getLetterData(UChar32 c)960 uint32_t getLetterData(UChar32 c) {
961 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
962 return 0;
963 } else if (c <= 0x3ff) {
964 return data0370[c - 0x370];
965 } else if (c <= 0x1fff) {
966 return data1F00[c - 0x1f00];
967 } else if (c == 0x2126) {
968 return data2126;
969 } else {
970 return 0;
971 }
972 }
973
getDiacriticData(UChar32 c)974 uint32_t getDiacriticData(UChar32 c) {
975 switch (c) {
976 case 0x0300: // varia
977 case 0x0301: // tonos = oxia
978 case 0x0342: // perispomeni
979 case 0x0302: // circumflex can look like perispomeni
980 case 0x0303: // tilde can look like perispomeni
981 case 0x0311: // inverted breve can look like perispomeni
982 return HAS_ACCENT;
983 case 0x0308: // dialytika = diaeresis
984 return HAS_COMBINING_DIALYTIKA;
985 case 0x0344: // dialytika tonos
986 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
987 case 0x0345: // ypogegrammeni = iota subscript
988 return HAS_YPOGEGRAMMENI;
989 case 0x0304: // macron
990 case 0x0306: // breve
991 case 0x0313: // comma above
992 case 0x0314: // reversed comma above
993 case 0x0343: // koronis
994 return HAS_OTHER_GREEK_DIACRITIC;
995 default:
996 return 0;
997 }
998 }
999
isFollowedByCasedLetter(const UChar * s,int32_t i,int32_t length)1000 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
1001 while (i < length) {
1002 UChar32 c;
1003 U16_NEXT(s, i, length, c);
1004 int32_t type = ucase_getTypeOrIgnorable(c);
1005 if ((type & UCASE_IGNORABLE) != 0) {
1006 // Case-ignorable, continue with the loop.
1007 } else if (type != UCASE_NONE) {
1008 return TRUE; // Followed by cased letter.
1009 } else {
1010 return FALSE; // Uncased and not case-ignorable.
1011 }
1012 }
1013 return FALSE; // Not followed by cased letter.
1014 }
1015
1016 /**
1017 * Greek string uppercasing with a state machine.
1018 * Probably simpler than a stateless function that has to figure out complex context-before
1019 * for each character.
1020 * TODO: Try to re-consolidate one way or another with the non-Greek function.
1021 */
toUpper(uint32_t options,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,Edits * edits,UErrorCode & errorCode)1022 int32_t toUpper(uint32_t options,
1023 UChar *dest, int32_t destCapacity,
1024 const UChar *src, int32_t srcLength,
1025 Edits *edits,
1026 UErrorCode &errorCode) {
1027 int32_t destIndex=0;
1028 uint32_t state = 0;
1029 for (int32_t i = 0; i < srcLength;) {
1030 int32_t nextIndex = i;
1031 UChar32 c;
1032 U16_NEXT(src, nextIndex, srcLength, c);
1033 uint32_t nextState = 0;
1034 int32_t type = ucase_getTypeOrIgnorable(c);
1035 if ((type & UCASE_IGNORABLE) != 0) {
1036 // c is case-ignorable
1037 nextState |= (state & AFTER_CASED);
1038 } else if (type != UCASE_NONE) {
1039 // c is cased
1040 nextState |= AFTER_CASED;
1041 }
1042 uint32_t data = getLetterData(c);
1043 if (data > 0) {
1044 uint32_t upper = data & UPPER_MASK;
1045 // Add a dialytika to this iota or ypsilon vowel
1046 // if we removed a tonos from the previous vowel,
1047 // and that previous vowel did not also have (or gain) a dialytika.
1048 // Adding one only to the final vowel in a longer sequence
1049 // (which does not occur in normal writing) would require lookahead.
1050 // Set the same flag as for preserving an existing dialytika.
1051 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1052 (upper == 0x399 || upper == 0x3A5)) {
1053 data |= HAS_DIALYTIKA;
1054 }
1055 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
1056 if ((data & HAS_YPOGEGRAMMENI) != 0) {
1057 numYpogegrammeni = 1;
1058 }
1059 // Skip combining diacritics after this Greek letter.
1060 while (nextIndex < srcLength) {
1061 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
1062 if (diacriticData != 0) {
1063 data |= diacriticData;
1064 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1065 ++numYpogegrammeni;
1066 }
1067 ++nextIndex;
1068 } else {
1069 break; // not a Greek diacritic
1070 }
1071 }
1072 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1073 nextState |= AFTER_VOWEL_WITH_ACCENT;
1074 }
1075 // Map according to Greek rules.
1076 UBool addTonos = FALSE;
1077 if (upper == 0x397 &&
1078 (data & HAS_ACCENT) != 0 &&
1079 numYpogegrammeni == 0 &&
1080 (state & AFTER_CASED) == 0 &&
1081 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
1082 // Keep disjunctive "or" with (only) a tonos.
1083 // We use the same "word boundary" conditions as for the Final_Sigma test.
1084 if (i == nextIndex) {
1085 upper = 0x389; // Preserve the precomposed form.
1086 } else {
1087 addTonos = TRUE;
1088 }
1089 } else if ((data & HAS_DIALYTIKA) != 0) {
1090 // Preserve a vowel with dialytika in precomposed form if it exists.
1091 if (upper == 0x399) {
1092 upper = 0x3AA;
1093 data &= ~HAS_EITHER_DIALYTIKA;
1094 } else if (upper == 0x3A5) {
1095 upper = 0x3AB;
1096 data &= ~HAS_EITHER_DIALYTIKA;
1097 }
1098 }
1099
1100 UBool change;
1101 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
1102 change = TRUE; // common, simple usage
1103 } else {
1104 // Find out first whether we are changing the text.
1105 change = src[i] != upper || numYpogegrammeni > 0;
1106 int32_t i2 = i + 1;
1107 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1108 change |= i2 >= nextIndex || src[i2] != 0x308;
1109 ++i2;
1110 }
1111 if (addTonos) {
1112 change |= i2 >= nextIndex || src[i2] != 0x301;
1113 ++i2;
1114 }
1115 int32_t oldLength = nextIndex - i;
1116 int32_t newLength = (i2 - i) + numYpogegrammeni;
1117 change |= oldLength != newLength;
1118 if (change) {
1119 if (edits != NULL) {
1120 edits->addReplace(oldLength, newLength);
1121 }
1122 } else {
1123 if (edits != NULL) {
1124 edits->addUnchanged(oldLength);
1125 }
1126 // Write unchanged text?
1127 change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
1128 }
1129 }
1130
1131 if (change) {
1132 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
1133 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
1134 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
1135 }
1136 if (destIndex >= 0 && addTonos) {
1137 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
1138 }
1139 while (destIndex >= 0 && numYpogegrammeni > 0) {
1140 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
1141 --numYpogegrammeni;
1142 }
1143 if(destIndex<0) {
1144 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1145 return 0;
1146 }
1147 }
1148 } else {
1149 const UChar *s;
1150 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
1151 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1152 nextIndex - i, options, edits);
1153 if (destIndex < 0) {
1154 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1155 return 0;
1156 }
1157 }
1158 i = nextIndex;
1159 state = nextState;
1160 }
1161
1162 return destIndex;
1163 }
1164
1165 } // namespace GreekUpper
1166 U_NAMESPACE_END
1167
1168 /* functions available in the common library (for unistr_case.cpp) */
1169
1170 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1171 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1172 UChar *dest, int32_t destCapacity,
1173 const UChar *src, int32_t srcLength,
1174 icu::Edits *edits,
1175 UErrorCode &errorCode) {
1176 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1177 csc.p=(void *)src;
1178 csc.limit=srcLength;
1179 int32_t destIndex = toLower(
1180 caseLocale, options,
1181 dest, destCapacity,
1182 src, &csc, 0, srcLength,
1183 edits, errorCode);
1184 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1185 }
1186
1187 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1188 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1189 UChar *dest, int32_t destCapacity,
1190 const UChar *src, int32_t srcLength,
1191 icu::Edits *edits,
1192 UErrorCode &errorCode) {
1193 int32_t destIndex;
1194 if (caseLocale == UCASE_LOC_GREEK) {
1195 destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1196 src, srcLength, edits, errorCode);
1197 } else {
1198 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1199 csc.p=(void *)src;
1200 csc.limit=srcLength;
1201 destIndex = toUpper(
1202 caseLocale, options,
1203 dest, destCapacity,
1204 src, &csc, srcLength,
1205 edits, errorCode);
1206 }
1207 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1208 }
1209
1210 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1211 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1212 UChar *dest, int32_t destCapacity,
1213 const UChar *src, int32_t srcLength,
1214 icu::Edits *edits,
1215 UErrorCode &errorCode) {
1216 int32_t destIndex = toLower(
1217 -1, options,
1218 dest, destCapacity,
1219 src, nullptr, 0, srcLength,
1220 edits, errorCode);
1221 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1222 }
1223
1224 U_CFUNC int32_t
ustrcase_map(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)1225 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1226 UChar *dest, int32_t destCapacity,
1227 const UChar *src, int32_t srcLength,
1228 UStringCaseMapper *stringCaseMapper,
1229 icu::Edits *edits,
1230 UErrorCode &errorCode) {
1231 int32_t destLength;
1232
1233 /* check argument values */
1234 if(U_FAILURE(errorCode)) {
1235 return 0;
1236 }
1237 if( destCapacity<0 ||
1238 (dest==NULL && destCapacity>0) ||
1239 src==NULL ||
1240 srcLength<-1
1241 ) {
1242 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1243 return 0;
1244 }
1245
1246 /* get the string length */
1247 if(srcLength==-1) {
1248 srcLength=u_strlen(src);
1249 }
1250
1251 /* check for overlapping source and destination */
1252 if( dest!=NULL &&
1253 ((src>=dest && src<(dest+destCapacity)) ||
1254 (dest>=src && dest<(src+srcLength)))
1255 ) {
1256 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1257 return 0;
1258 }
1259
1260 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
1261 edits->reset();
1262 }
1263 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1264 dest, destCapacity, src, srcLength, edits, errorCode);
1265 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1266 }
1267
1268 U_CFUNC int32_t
ustrcase_mapWithOverlap(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode & errorCode)1269 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1270 UChar *dest, int32_t destCapacity,
1271 const UChar *src, int32_t srcLength,
1272 UStringCaseMapper *stringCaseMapper,
1273 UErrorCode &errorCode) {
1274 UChar buffer[300];
1275 UChar *temp;
1276
1277 int32_t destLength;
1278
1279 /* check argument values */
1280 if(U_FAILURE(errorCode)) {
1281 return 0;
1282 }
1283 if( destCapacity<0 ||
1284 (dest==NULL && destCapacity>0) ||
1285 src==NULL ||
1286 srcLength<-1
1287 ) {
1288 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1289 return 0;
1290 }
1291
1292 /* get the string length */
1293 if(srcLength==-1) {
1294 srcLength=u_strlen(src);
1295 }
1296
1297 /* check for overlapping source and destination */
1298 if( dest!=NULL &&
1299 ((src>=dest && src<(dest+destCapacity)) ||
1300 (dest>=src && dest<(src+srcLength)))
1301 ) {
1302 /* overlap: provide a temporary destination buffer and later copy the result */
1303 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1304 /* the stack buffer is large enough */
1305 temp=buffer;
1306 } else {
1307 /* allocate a buffer */
1308 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1309 if(temp==NULL) {
1310 errorCode=U_MEMORY_ALLOCATION_ERROR;
1311 return 0;
1312 }
1313 }
1314 } else {
1315 temp=dest;
1316 }
1317
1318 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1319 temp, destCapacity, src, srcLength, NULL, errorCode);
1320 if(temp!=dest) {
1321 /* copy the result string to the destination buffer */
1322 if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1323 u_memmove(dest, temp, destLength);
1324 }
1325 if(temp!=buffer) {
1326 uprv_free(temp);
1327 }
1328 }
1329
1330 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1331 }
1332
1333 /* public API functions */
1334
1335 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1336 u_strFoldCase(UChar *dest, int32_t destCapacity,
1337 const UChar *src, int32_t srcLength,
1338 uint32_t options,
1339 UErrorCode *pErrorCode) {
1340 return ustrcase_mapWithOverlap(
1341 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1342 dest, destCapacity,
1343 src, srcLength,
1344 ustrcase_internalFold, *pErrorCode);
1345 }
1346
1347 U_NAMESPACE_BEGIN
1348
fold(uint32_t options,const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1349 int32_t CaseMap::fold(
1350 uint32_t options,
1351 const UChar *src, int32_t srcLength,
1352 UChar *dest, int32_t destCapacity, Edits *edits,
1353 UErrorCode &errorCode) {
1354 return ustrcase_map(
1355 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1356 dest, destCapacity,
1357 src, srcLength,
1358 ustrcase_internalFold, edits, errorCode);
1359 }
1360
1361 U_NAMESPACE_END
1362
1363 /* case-insensitive string comparisons -------------------------------------- */
1364
1365 /*
1366 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1367 * canonical equivalence.
1368 * Keep the functions in sync, and see there for how this works.
1369 * The duplication is for modularization:
1370 * It makes caseless (but not canonical caseless) matches independent of
1371 * the normalization code.
1372 */
1373
1374 /* stack element for previous-level source/decomposition pointers */
1375 struct CmpEquivLevel {
1376 const UChar *start, *s, *limit;
1377 };
1378 typedef struct CmpEquivLevel CmpEquivLevel;
1379
1380 /**
1381 * Internal implementation code comparing string with case fold.
1382 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1383 *
1384 * @param s1 input string 1
1385 * @param length1 length of string 1, or -1 (NULL terminated)
1386 * @param s2 input string 2
1387 * @param length2 length of string 2, or -1 (NULL terminated)
1388 * @param options compare options
1389 * @param matchLen1 (output) length of partial prefix match in s1
1390 * @param matchLen2 (output) length of partial prefix match in s2
1391 * @param pErrorCode receives error status
1392 * @return The result of comparison
1393 */
_cmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1394 static int32_t _cmpFold(
1395 const UChar *s1, int32_t length1,
1396 const UChar *s2, int32_t length2,
1397 uint32_t options,
1398 int32_t *matchLen1, int32_t *matchLen2,
1399 UErrorCode *pErrorCode) {
1400 int32_t cmpRes = 0;
1401
1402 /* current-level start/limit - s1/s2 as current */
1403 const UChar *start1, *start2, *limit1, *limit2;
1404
1405 /* points to the original start address */
1406 const UChar *org1, *org2;
1407
1408 /* points to the end of match + 1 */
1409 const UChar *m1, *m2;
1410
1411 /* case folding variables */
1412 const UChar *p;
1413 int32_t length;
1414
1415 /* stacks of previous-level start/current/limit */
1416 CmpEquivLevel stack1[2], stack2[2];
1417
1418 /* case folding buffers, only use current-level start/limit */
1419 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1420
1421 /* track which is the current level per string */
1422 int32_t level1, level2;
1423
1424 /* current code units, and code points for lookups */
1425 UChar32 c1, c2, cp1, cp2;
1426
1427 /* no argument error checking because this itself is not an API */
1428
1429 /*
1430 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1431 * otherwise this function would have to behave exactly as uprv_strCompare()
1432 */
1433 if(U_FAILURE(*pErrorCode)) {
1434 return 0;
1435 }
1436
1437 /* initialize */
1438 if(matchLen1) {
1439 U_ASSERT(matchLen2 !=NULL);
1440 *matchLen1=0;
1441 *matchLen2=0;
1442 }
1443
1444 start1=m1=org1=s1;
1445 if(length1==-1) {
1446 limit1=NULL;
1447 } else {
1448 limit1=s1+length1;
1449 }
1450
1451 start2=m2=org2=s2;
1452 if(length2==-1) {
1453 limit2=NULL;
1454 } else {
1455 limit2=s2+length2;
1456 }
1457
1458 level1=level2=0;
1459 c1=c2=-1;
1460
1461 /* comparison loop */
1462 for(;;) {
1463 /*
1464 * here a code unit value of -1 means "get another code unit"
1465 * below it will mean "this source is finished"
1466 */
1467
1468 if(c1<0) {
1469 /* get next code unit from string 1, post-increment */
1470 for(;;) {
1471 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1472 if(level1==0) {
1473 c1=-1;
1474 break;
1475 }
1476 } else {
1477 ++s1;
1478 break;
1479 }
1480
1481 /* reached end of level buffer, pop one level */
1482 do {
1483 --level1;
1484 start1=stack1[level1].start; /*Not uninitialized*/
1485 } while(start1==NULL);
1486 s1=stack1[level1].s; /*Not uninitialized*/
1487 limit1=stack1[level1].limit; /*Not uninitialized*/
1488 }
1489 }
1490
1491 if(c2<0) {
1492 /* get next code unit from string 2, post-increment */
1493 for(;;) {
1494 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1495 if(level2==0) {
1496 c2=-1;
1497 break;
1498 }
1499 } else {
1500 ++s2;
1501 break;
1502 }
1503
1504 /* reached end of level buffer, pop one level */
1505 do {
1506 --level2;
1507 start2=stack2[level2].start; /*Not uninitialized*/
1508 } while(start2==NULL);
1509 s2=stack2[level2].s; /*Not uninitialized*/
1510 limit2=stack2[level2].limit; /*Not uninitialized*/
1511 }
1512 }
1513
1514 /*
1515 * compare c1 and c2
1516 * either variable c1, c2 is -1 only if the corresponding string is finished
1517 */
1518 if(c1==c2) {
1519 const UChar *next1, *next2;
1520
1521 if(c1<0) {
1522 cmpRes=0; /* c1==c2==-1 indicating end of strings */
1523 break;
1524 }
1525
1526 /*
1527 * Note: Move the match positions in both strings at the same time
1528 * only when corresponding code point(s) in the original strings
1529 * are fully consumed. For example, when comparing s1="Fust" and
1530 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1531 * the first code point in the case-folded data. But the second "s"
1532 * has no matching code point in s1, so this implementation returns
1533 * 2 as the prefix match length ("Fu").
1534 */
1535 next1=next2=NULL;
1536 if(level1==0) {
1537 next1=s1;
1538 } else if(s1==limit1) {
1539 /* Note: This implementation only use a single level of stack.
1540 * If this code needs to be changed to use multiple levels
1541 * of stacks, the code above should check if the current
1542 * code is at the end of all stacks.
1543 */
1544 U_ASSERT(level1==1);
1545
1546 /* is s1 at the end of the current stack? */
1547 next1=stack1[0].s;
1548 }
1549
1550 if (next1!=NULL) {
1551 if(level2==0) {
1552 next2=s2;
1553 } else if(s2==limit2) {
1554 U_ASSERT(level2==1);
1555
1556 /* is s2 at the end of the current stack? */
1557 next2=stack2[0].s;
1558 }
1559 if(next2!=NULL) {
1560 m1=next1;
1561 m2=next2;
1562 }
1563 }
1564 c1=c2=-1; /* make us fetch new code units */
1565 continue;
1566 } else if(c1<0) {
1567 cmpRes=-1; /* string 1 ends before string 2 */
1568 break;
1569 } else if(c2<0) {
1570 cmpRes=1; /* string 2 ends before string 1 */
1571 break;
1572 }
1573 /* c1!=c2 && c1>=0 && c2>=0 */
1574
1575 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1576 cp1=c1;
1577 if(U_IS_SURROGATE(c1)) {
1578 UChar c;
1579
1580 if(U_IS_SURROGATE_LEAD(c1)) {
1581 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1582 /* advance ++s1; only below if cp1 decomposes/case-folds */
1583 cp1=U16_GET_SUPPLEMENTARY(c1, c);
1584 }
1585 } else /* isTrail(c1) */ {
1586 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1587 cp1=U16_GET_SUPPLEMENTARY(c, c1);
1588 }
1589 }
1590 }
1591
1592 cp2=c2;
1593 if(U_IS_SURROGATE(c2)) {
1594 UChar c;
1595
1596 if(U_IS_SURROGATE_LEAD(c2)) {
1597 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1598 /* advance ++s2; only below if cp2 decomposes/case-folds */
1599 cp2=U16_GET_SUPPLEMENTARY(c2, c);
1600 }
1601 } else /* isTrail(c2) */ {
1602 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1603 cp2=U16_GET_SUPPLEMENTARY(c, c2);
1604 }
1605 }
1606 }
1607
1608 /*
1609 * go down one level for each string
1610 * continue with the main loop as soon as there is a real change
1611 */
1612
1613 if( level1==0 &&
1614 (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1615 ) {
1616 /* cp1 case-folds to the code point "length" or to p[length] */
1617 if(U_IS_SURROGATE(c1)) {
1618 if(U_IS_SURROGATE_LEAD(c1)) {
1619 /* advance beyond source surrogate pair if it case-folds */
1620 ++s1;
1621 } else /* isTrail(c1) */ {
1622 /*
1623 * we got a supplementary code point when hitting its trail surrogate,
1624 * therefore the lead surrogate must have been the same as in the other string;
1625 * compare this decomposition with the lead surrogate in the other string
1626 * remember that this simulates bulk text replacement:
1627 * the decomposition would replace the entire code point
1628 */
1629 --s2;
1630 --m2;
1631 c2=*(s2-1);
1632 }
1633 }
1634
1635 /* push current level pointers */
1636 stack1[0].start=start1;
1637 stack1[0].s=s1;
1638 stack1[0].limit=limit1;
1639 ++level1;
1640
1641 /* copy the folding result to fold1[] */
1642 if(length<=UCASE_MAX_STRING_LENGTH) {
1643 u_memcpy(fold1, p, length);
1644 } else {
1645 int32_t i=0;
1646 U16_APPEND_UNSAFE(fold1, i, length);
1647 length=i;
1648 }
1649
1650 /* set next level pointers to case folding */
1651 start1=s1=fold1;
1652 limit1=fold1+length;
1653
1654 /* get ready to read from decomposition, continue with loop */
1655 c1=-1;
1656 continue;
1657 }
1658
1659 if( level2==0 &&
1660 (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1661 ) {
1662 /* cp2 case-folds to the code point "length" or to p[length] */
1663 if(U_IS_SURROGATE(c2)) {
1664 if(U_IS_SURROGATE_LEAD(c2)) {
1665 /* advance beyond source surrogate pair if it case-folds */
1666 ++s2;
1667 } else /* isTrail(c2) */ {
1668 /*
1669 * we got a supplementary code point when hitting its trail surrogate,
1670 * therefore the lead surrogate must have been the same as in the other string;
1671 * compare this decomposition with the lead surrogate in the other string
1672 * remember that this simulates bulk text replacement:
1673 * the decomposition would replace the entire code point
1674 */
1675 --s1;
1676 --m2;
1677 c1=*(s1-1);
1678 }
1679 }
1680
1681 /* push current level pointers */
1682 stack2[0].start=start2;
1683 stack2[0].s=s2;
1684 stack2[0].limit=limit2;
1685 ++level2;
1686
1687 /* copy the folding result to fold2[] */
1688 if(length<=UCASE_MAX_STRING_LENGTH) {
1689 u_memcpy(fold2, p, length);
1690 } else {
1691 int32_t i=0;
1692 U16_APPEND_UNSAFE(fold2, i, length);
1693 length=i;
1694 }
1695
1696 /* set next level pointers to case folding */
1697 start2=s2=fold2;
1698 limit2=fold2+length;
1699
1700 /* get ready to read from decomposition, continue with loop */
1701 c2=-1;
1702 continue;
1703 }
1704
1705 /*
1706 * no decomposition/case folding, max level for both sides:
1707 * return difference result
1708 *
1709 * code point order comparison must not just return cp1-cp2
1710 * because when single surrogates are present then the surrogate pairs
1711 * that formed cp1 and cp2 may be from different string indexes
1712 *
1713 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1714 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1715 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1716 *
1717 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1718 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1719 * so we have slightly different pointer/start/limit comparisons here
1720 */
1721
1722 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1723 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1724 if(
1725 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1726 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1727 ) {
1728 /* part of a surrogate pair, leave >=d800 */
1729 } else {
1730 /* BMP code point - may be surrogate code point - make <d800 */
1731 c1-=0x2800;
1732 }
1733
1734 if(
1735 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1736 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1737 ) {
1738 /* part of a surrogate pair, leave >=d800 */
1739 } else {
1740 /* BMP code point - may be surrogate code point - make <d800 */
1741 c2-=0x2800;
1742 }
1743 }
1744
1745 cmpRes=c1-c2;
1746 break;
1747 }
1748
1749 if(matchLen1) {
1750 *matchLen1=static_cast<int32_t>(m1-org1);
1751 *matchLen2=static_cast<int32_t>(m2-org2);
1752 }
1753 return cmpRes;
1754 }
1755
1756 /* internal function */
1757 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1758 u_strcmpFold(const UChar *s1, int32_t length1,
1759 const UChar *s2, int32_t length2,
1760 uint32_t options,
1761 UErrorCode *pErrorCode) {
1762 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1763 }
1764
1765 /* public API functions */
1766
1767 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1768 u_strCaseCompare(const UChar *s1, int32_t length1,
1769 const UChar *s2, int32_t length2,
1770 uint32_t options,
1771 UErrorCode *pErrorCode) {
1772 /* argument checking */
1773 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1774 return 0;
1775 }
1776 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1777 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1778 return 0;
1779 }
1780 return u_strcmpFold(s1, length1, s2, length2,
1781 options|U_COMPARE_IGNORE_CASE,
1782 pErrorCode);
1783 }
1784
1785 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)1786 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1787 UErrorCode errorCode=U_ZERO_ERROR;
1788 return u_strcmpFold(s1, -1, s2, -1,
1789 options|U_COMPARE_IGNORE_CASE,
1790 &errorCode);
1791 }
1792
1793 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)1794 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1795 UErrorCode errorCode=U_ZERO_ERROR;
1796 return u_strcmpFold(s1, length, s2, length,
1797 options|U_COMPARE_IGNORE_CASE,
1798 &errorCode);
1799 }
1800
1801 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)1802 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1803 UErrorCode errorCode=U_ZERO_ERROR;
1804 return u_strcmpFold(s1, n, s2, n,
1805 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1806 &errorCode);
1807 }
1808
1809 /* internal API - detect length of shared prefix */
1810 U_CAPI void
u_caseInsensitivePrefixMatch(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1811 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1812 const UChar *s2, int32_t length2,
1813 uint32_t options,
1814 int32_t *matchLen1, int32_t *matchLen2,
1815 UErrorCode *pErrorCode) {
1816 _cmpFold(s1, length1, s2, length2, options,
1817 matchLen1, matchLen2, pErrorCode);
1818 }
1819