1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucasemap.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2005may06
16 * created by: Markus W. Scherer
17 *
18 * Case mapping service object and functions using it.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45
46 U_NAMESPACE_USE
47
48 /* UCaseMap service object -------------------------------------------------- */
49
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52 iter(NULL),
53 #endif
54 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55 ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60 delete iter;
61 #endif
62 }
63
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66 if(U_FAILURE(*pErrorCode)) {
67 return NULL;
68 }
69 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70 if(csm==NULL) {
71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72 return NULL;
73 } else if (U_FAILURE(*pErrorCode)) {
74 delete csm;
75 return NULL;
76 }
77 return csm;
78 }
79
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82 delete csm;
83 }
84
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87 return csm->locale;
88 }
89
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92 return csm->options;
93 }
94
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97 if(U_FAILURE(*pErrorCode)) {
98 return;
99 }
100 if (locale != NULL && *locale == 0) {
101 csm->locale[0] = 0;
102 csm->caseLocale = UCASE_LOC_ROOT;
103 return;
104 }
105
106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108 *pErrorCode=U_ZERO_ERROR;
109 /* we only really need the language code for case mappings */
110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111 }
112 if(length==sizeof(csm->locale)) {
113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114 }
115 if(U_SUCCESS(*pErrorCode)) {
116 csm->caseLocale=UCASE_LOC_UNKNOWN;
117 csm->caseLocale = ucase_getCaseLocale(csm->locale);
118 } else {
119 csm->locale[0]=0;
120 csm->caseLocale = UCASE_LOC_ROOT;
121 }
122 }
123
124 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
126 if(U_FAILURE(*pErrorCode)) {
127 return;
128 }
129 csm->options=options;
130 }
131
132 /* UTF-8 string case mappings ----------------------------------------------- */
133
134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
135
136 namespace {
137
138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
139 inline UBool
appendResult(int32_t cpLength,int32_t result,const UChar * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)140 appendResult(int32_t cpLength, int32_t result, const UChar *s,
141 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142 U_ASSERT(U_SUCCESS(errorCode));
143
144 /* decode the result */
145 if(result<0) {
146 /* (not) original code point */
147 if(edits!=NULL) {
148 edits->addUnchanged(cpLength);
149 }
150 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152 }
153 } else {
154 if(result<=UCASE_MAX_STRING_LENGTH) {
155 // string: "result" is the UTF-16 length
156 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157 } else {
158 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159 }
160 }
161 return TRUE;
162 }
163
164 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
167
168 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)169 utf8_caseContextIterator(void *context, int8_t dir) {
170 UCaseContext *csc=(UCaseContext *)context;
171 UChar32 c;
172
173 if(dir<0) {
174 /* reset for backward iteration */
175 csc->index=csc->cpStart;
176 csc->dir=dir;
177 } else if(dir>0) {
178 /* reset for forward iteration */
179 csc->index=csc->cpLimit;
180 csc->dir=dir;
181 } else {
182 /* continue current iteration direction */
183 dir=csc->dir;
184 }
185
186 if(dir<0) {
187 if(csc->start<csc->index) {
188 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
189 return c;
190 }
191 } else {
192 if(csc->index<csc->limit) {
193 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
194 return c;
195 }
196 }
197 return U_SENTINEL;
198 }
199
/**
 * Lowercases or case-folds the UTF-8 bytes src[srcStart..srcLimit[ into sink.
 *
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes the context described by csc into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * @param caseLocale selects lowercasing vs. folding and the Latin fast-path table
 * @param options    option bits; passed on to the append helpers and to ucase_toFullFolding()
 * @param src        UTF-8 text
 * @param csc        case context; only dereferenced when caseLocale >= 0
 * @param srcStart   index of the first byte to map
 * @param srcLimit   index after the last byte to map
 * @param sink       receives the output bytes
 * @param edits      optional (may be null) record of changed/unchanged spans
 * @param errorCode  in/out error code; the main loop stops as soon as it indicates a failure
 */
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Choose the per-code-point delta table used by the fast path for U+0000..U+017F.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    // [prev..srcIndex[ is the pending run of source bytes that has been scanned
    // but not yet written; it is flushed via appendUnchanged() before each change.
    int32_t prev = srcStart;
    int32_t srcIndex = srcStart;
    for (;;) {
        // fast path for simple cases
        // The inner loop ends with c < 0 (end of input or failure), or with
        // c >= 0 and cpStart set for a code point that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table entry is the delta to add, 0 for "no change",
                // or EXC for "needs the slow path".
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    // Decode the two-byte sequence by hand and use the same delta table.
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode one code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                // The bytes stay in the pending run and are copied as unchanged text.
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // c is the code point at [cpStart..srcIndex[ that the fast path could not handle.
        const UChar *s;
        if (caseLocale >= 0) {
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        if (c >= 0) {
            // A non-negative result is a change; a negative one leaves the
            // code point in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged text remains after the last change.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
314
/**
 * Uppercases the UTF-8 bytes src[0..srcLength[ into sink.
 *
 * @param caseLocale selects the Latin fast-path table (Turkish vs. normal) and is
 *                   passed to ucase_toFullUpper() on the slow path
 * @param options    option bits; passed on to the append helpers
 * @param src        UTF-8 text
 * @param csc        case context; cpStart/cpLimit are set here before each slow-path call
 * @param srcLength  number of bytes in src
 * @param sink       receives the output bytes
 * @param edits      optional (may be null) record of changed/unchanged spans
 * @param errorCode  in/out error code; the main loop stops as soon as it indicates a failure
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Choose the per-code-point delta table used by the fast path for U+0000..U+017F.
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    // [prev..srcIndex[ is the pending run of source bytes that has been scanned
    // but not yet written; it is flushed via appendUnchanged() before each change.
    int32_t prev = 0;
    int32_t srcIndex = 0;
    for (;;) {
        // fast path for simple cases
        // The inner loop ends with c < 0 (end of input or failure), or with
        // c >= 0 and cpStart set for a code point that needs the slow path.
        int32_t cpStart;
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table entry is the delta to add, 0 for "no change",
                // or EXC for "needs the slow path".
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    // Decode the two-byte sequence by hand and use the same delta table.
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: back up to the lead byte and decode one code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                // The bytes stay in the pending run and are copied as unchanged text.
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // c is the code point at [cpStart..srcIndex[ that the fast path could not handle.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const UChar *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        if (c >= 0) {
            // A non-negative result is a change; a negative one leaves the
            // code point in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged text remains after the last change.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
418
419 } // namespace
420
421 #if !UCONFIG_NO_BREAK_ITERATION
422
/**
 * Titlecases the UTF-8 text src[0..srcLength[ into sink.
 *
 * The text is cut into segments at the boundaries reported by iter. In each
 * segment, leading characters are optionally skipped, one code point is
 * titlecased, and the rest is lowercased or copied depending on the options.
 *
 * @param caseLocale case locale passed to the mapping functions; also enables the Dutch IJ special case
 * @param options    titlecasing option bits; validated by ustrcase_checkTitleAdjustmentOptions()
 * @param iter       break iterator supplying segment boundaries; only first()/next() are
 *                   called here, so it is presumably already set to this text by the caller
 *                   (TODO confirm)
 * @param src        UTF-8 text
 * @param srcLength  number of bytes in src
 * @param sink       receives the output bytes
 * @param edits      optional record of changed/unchanged spans
 * @param errorCode  in/out error code; the function returns early when a helper reports failure
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }

    /* set up local variables */
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    int32_t prev=0;
    UBool isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        int32_t index;
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            index=iter->first();
        } else {
            index=iter->next();
        }
        // Clamp: treat "no more boundaries" or an out-of-range boundary as end of text.
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Segment [prev..index[ into 3 parts:
         * a) skipped characters (copy as-is) [prev..titleStart[
         * b) first letter (titlecase)        [titleStart..titleLimit[
         * c) subsequent characters (lowercase) [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy skipped characters [prev..titleStart[ */
            int32_t titleStart=prev;
            int32_t titleLimit=prev;
            UChar32 c;
            U8_NEXT(src, titleLimit, index, c);
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
                // Adjust the titlecasing index to the next cased character,
                // or to the next letter/number/symbol/private use.
                // Stop with titleStart<titleLimit<=index
                // if there is a character to be titlecased,
                // or else stop with titleStart==titleLimit==index.
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart=titleLimit;
                    if(titleLimit==index) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, index, c);
                }
                if (prev < titleStart) {
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                if(c>=0) {
                    csc.cpStart=titleStart;
                    csc.cpLimit=titleLimit;
                    const UChar *s;
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }

                /* Special case Dutch IJ titlecasing */
                // After an initial 'I' or 'i', a following 'j' is written as 'J' and a
                // following 'J' is copied as-is; either way it is excluded from the
                // lowercasing of the rest of the segment by advancing titleLimit.
                if (titleStart+1 < index &&
                        caseLocale == UCASE_LOC_DUTCH &&
                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
                    if (src[titleStart+1] == 0x006A) {
                        ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
                        titleLimit++;
                    } else if (src[titleStart+1] == 0x004A) {
                        // Keep the capital J from getting lowercased.
                        if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                        titleLimit++;
                    }
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, index,
                                sink, edits, errorCode);
                        if(U_FAILURE(errorCode)) {
                            return;
                        }
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    }
                }
            }
        }

        // Continue with the next segment.
        prev=index;
    }
}
547
548 #endif
549
550 U_NAMESPACE_BEGIN
551 namespace GreekUpper {
552
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)553 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
554 while (i < length) {
555 UChar32 c;
556 U8_NEXT(s, i, length, c);
557 int32_t type = ucase_getTypeOrIgnorable(c);
558 if ((type & UCASE_IGNORABLE) != 0) {
559 // Case-ignorable, continue with the loop.
560 } else if (type != UCASE_NONE) {
561 return TRUE; // Followed by cased letter.
562 } else {
563 return FALSE; // Uncased and not case-ignorable.
564 }
565 }
566 return FALSE; // Not followed by cased letter.
567 }
568
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
//
// Uppercases the UTF-8 text src[0..srcLength[ with Greek-specific handling and
// writes the result to sink.
//
// For each code point for which getLetterData() returns non-zero data, the letter
// and the Greek combining diacritics that follow it are processed as one unit:
// the base letter is replaced by its uppercase form, a dialytika is kept or added
// where needed, and each ypogegrammeni becomes a trailing capital iota (U+0399).
// All other code points go through ucase_toFullUpper() with UCASE_LOC_GREEK;
// malformed UTF-8 is copied as unchanged text.
//
// options:   option bits; U_OMIT_UNCHANGED_TEXT suppresses output of unchanged units
// edits:     optional (may be null) record of changed/unchanged spans
// errorCode: passed to the append helpers; the function returns when one of them fails
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    // "state" carries the AFTER_* flags that describe what preceded the current code point.
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        // [i..nextIndex[ is the unit being mapped: first just c, later extended
        // over the Greek diacritics that follow it.
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex is a lookahead position; nextIndex only advances to it
            // once the decoded code point turned out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex was advanced past c by U8_NEXT above and is
                // never moved back, so i == nextIndex looks unreachable here and the
                // addTonos branch is always taken - confirm the intended condition
                // against the UTF-16 and Java versions.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            // "change" first means "the output differs from the source unit";
            // at the end of the else-branch it is reused to mean "write output for this unit".
            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = TRUE;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                // i2 tracks where the next expected output bytes would sit in the source.
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != NULL) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != NULL) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            if (change) {
                // Output order: uppercase letter, dialytika, tonos, then one capital iota
                // per ypogegrammeni.
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.Append(reinterpret_cast<const char*>(u8"\u0308"), 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.Append(reinterpret_cast<const char*>(u8"\u0301"), 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.Append(reinterpret_cast<const char*>(u8"\u0399"), 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek letter data: use the generic full uppercase mapping without context.
            const UChar *s;
            c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}
717
718 } // namespace GreekUpper
719 U_NAMESPACE_END
720
721 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)722 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
723 const uint8_t *src, int32_t srcLength,
724 icu::ByteSink &sink, icu::Edits *edits,
725 UErrorCode &errorCode) {
726 UCaseContext csc=UCASECONTEXT_INITIALIZER;
727 csc.p=(void *)src;
728 csc.limit=srcLength;
729 toLower(
730 caseLocale, options,
731 src, &csc, 0, srcLength,
732 sink, edits, errorCode);
733 }
734
735 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)736 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
737 const uint8_t *src, int32_t srcLength,
738 icu::ByteSink &sink, icu::Edits *edits,
739 UErrorCode &errorCode) {
740 if (caseLocale == UCASE_LOC_GREEK) {
741 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
742 } else {
743 UCaseContext csc=UCASECONTEXT_INITIALIZER;
744 csc.p=(void *)src;
745 csc.limit=srcLength;
746 toUpper(
747 caseLocale, options,
748 src, &csc, srcLength,
749 sink, edits, errorCode);
750 }
751 }
752
753 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)754 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
755 const uint8_t *src, int32_t srcLength,
756 icu::ByteSink &sink, icu::Edits *edits,
757 UErrorCode &errorCode) {
758 toLower(
759 -1, options,
760 src, nullptr, 0, srcLength,
761 sink, edits, errorCode);
762 }
763
764 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)765 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
766 const char *src, int32_t srcLength,
767 UTF8CaseMapper *stringCaseMapper,
768 icu::ByteSink &sink, icu::Edits *edits,
769 UErrorCode &errorCode) {
770 /* check argument values */
771 if (U_FAILURE(errorCode)) {
772 return;
773 }
774 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
775 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
776 return;
777 }
778
779 // Get the string length.
780 if (srcLength == -1) {
781 srcLength = (int32_t)uprv_strlen((const char *)src);
782 }
783
784 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
785 edits->reset();
786 }
787 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
788 (const uint8_t *)src, srcLength, sink, edits, errorCode);
789 sink.Flush();
790 if (U_SUCCESS(errorCode)) {
791 if (edits != nullptr) {
792 edits->copyErrorTo(errorCode);
793 }
794 }
795 }
796
797 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)798 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
799 char *dest, int32_t destCapacity,
800 const char *src, int32_t srcLength,
801 UTF8CaseMapper *stringCaseMapper,
802 icu::Edits *edits,
803 UErrorCode &errorCode) {
804 /* check argument values */
805 if(U_FAILURE(errorCode)) {
806 return 0;
807 }
808 if( destCapacity<0 ||
809 (dest==NULL && destCapacity>0) ||
810 (src==NULL && srcLength!=0) || srcLength<-1
811 ) {
812 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
813 return 0;
814 }
815
816 /* get the string length */
817 if(srcLength==-1) {
818 srcLength=(int32_t)uprv_strlen((const char *)src);
819 }
820
821 /* check for overlapping source and destination */
822 if( dest!=NULL &&
823 ((src>=dest && src<(dest+destCapacity)) ||
824 (dest>=src && dest<(src+srcLength)))
825 ) {
826 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
827 return 0;
828 }
829
830 CheckedArrayByteSink sink(dest, destCapacity);
831 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
832 edits->reset();
833 }
834 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
835 (const uint8_t *)src, srcLength, sink, edits, errorCode);
836 sink.Flush();
837 if (U_SUCCESS(errorCode)) {
838 if (sink.Overflowed()) {
839 errorCode = U_BUFFER_OVERFLOW_ERROR;
840 } else if (edits != nullptr) {
841 edits->copyErrorTo(errorCode);
842 }
843 }
844 return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
845 }
846
847 /* public API functions */
848
849 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)850 ucasemap_utf8ToLower(const UCaseMap *csm,
851 char *dest, int32_t destCapacity,
852 const char *src, int32_t srcLength,
853 UErrorCode *pErrorCode) {
854 return ucasemap_mapUTF8(
855 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
856 dest, destCapacity,
857 src, srcLength,
858 ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
859 }
860
861 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)862 ucasemap_utf8ToUpper(const UCaseMap *csm,
863 char *dest, int32_t destCapacity,
864 const char *src, int32_t srcLength,
865 UErrorCode *pErrorCode) {
866 return ucasemap_mapUTF8(
867 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
868 dest, destCapacity,
869 src, srcLength,
870 ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
871 }
872
873 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)874 ucasemap_utf8FoldCase(const UCaseMap *csm,
875 char *dest, int32_t destCapacity,
876 const char *src, int32_t srcLength,
877 UErrorCode *pErrorCode) {
878 return ucasemap_mapUTF8(
879 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
880 dest, destCapacity,
881 src, srcLength,
882 ucasemap_internalUTF8Fold, NULL, *pErrorCode);
883 }
884
885 U_NAMESPACE_BEGIN
886
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)887 void CaseMap::utf8ToLower(
888 const char *locale, uint32_t options,
889 StringPiece src, ByteSink &sink, Edits *edits,
890 UErrorCode &errorCode) {
891 ucasemap_mapUTF8(
892 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893 src.data(), src.length(),
894 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
895 }
896
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)897 void CaseMap::utf8ToUpper(
898 const char *locale, uint32_t options,
899 StringPiece src, ByteSink &sink, Edits *edits,
900 UErrorCode &errorCode) {
901 ucasemap_mapUTF8(
902 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
903 src.data(), src.length(),
904 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
905 }
906
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)907 void CaseMap::utf8Fold(
908 uint32_t options,
909 StringPiece src, ByteSink &sink, Edits *edits,
910 UErrorCode &errorCode) {
911 ucasemap_mapUTF8(
912 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
913 src.data(), src.length(),
914 ucasemap_internalUTF8Fold, sink, edits, errorCode);
915 }
916
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)917 int32_t CaseMap::utf8ToLower(
918 const char *locale, uint32_t options,
919 const char *src, int32_t srcLength,
920 char *dest, int32_t destCapacity, Edits *edits,
921 UErrorCode &errorCode) {
922 return ucasemap_mapUTF8(
923 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
924 dest, destCapacity,
925 src, srcLength,
926 ucasemap_internalUTF8ToLower, edits, errorCode);
927 }
928
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)929 int32_t CaseMap::utf8ToUpper(
930 const char *locale, uint32_t options,
931 const char *src, int32_t srcLength,
932 char *dest, int32_t destCapacity, Edits *edits,
933 UErrorCode &errorCode) {
934 return ucasemap_mapUTF8(
935 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
936 dest, destCapacity,
937 src, srcLength,
938 ucasemap_internalUTF8ToUpper, edits, errorCode);
939 }
940
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)941 int32_t CaseMap::utf8Fold(
942 uint32_t options,
943 const char *src, int32_t srcLength,
944 char *dest, int32_t destCapacity, Edits *edits,
945 UErrorCode &errorCode) {
946 return ucasemap_mapUTF8(
947 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
948 dest, destCapacity,
949 src, srcLength,
950 ucasemap_internalUTF8Fold, edits, errorCode);
951 }
952
953 U_NAMESPACE_END
954