1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucasemap.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2005may06
16 * created by: Markus W. Scherer
17 *
18 * Case mapping service object and functions using it.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44
45 U_NAMESPACE_USE
46
47 /* UCaseMap service object -------------------------------------------------- */
48
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)49 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
50 #if !UCONFIG_NO_BREAK_ITERATION
51 iter(nullptr),
52 #endif
53 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
54 ucasemap_setLocale(this, localeID, pErrorCode);
55 }
56
/**
 * Destroys the service object.
 * When break iteration is compiled in, the object deletes its iter member.
 */
UCaseMap::~UCaseMap() {
#if !UCONFIG_NO_BREAK_ITERATION
    delete iter;
#endif
}
62
63 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)64 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
65 if(U_FAILURE(*pErrorCode)) {
66 return nullptr;
67 }
68 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
69 if(csm==nullptr) {
70 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
71 return nullptr;
72 } else if (U_FAILURE(*pErrorCode)) {
73 delete csm;
74 return nullptr;
75 }
76 return csm;
77 }
78
/**
 * Deletes a UCaseMap.
 *
 * @param csm the object to delete; it is passed directly to delete
 */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    delete csm;
}
83
/**
 * Returns the locale ID stored in the service object.
 *
 * @param csm the service object; dereferenced without a null check
 * @return pointer to the object's own locale buffer (not a copy)
 */
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm) {
    return csm->locale;
}
88
/**
 * Returns the option bit set stored in the service object.
 *
 * @param csm the service object; dereferenced without a null check
 * @return the current options value
 */
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm) {
    return csm->options;
}
93
94 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)95 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
96 if(U_FAILURE(*pErrorCode)) {
97 return;
98 }
99 if (locale != nullptr && *locale == 0) {
100 csm->locale[0] = 0;
101 csm->caseLocale = UCASE_LOC_ROOT;
102 return;
103 }
104
105 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
106 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
107 *pErrorCode=U_ZERO_ERROR;
108 /* we only really need the language code for case mappings */
109 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
110 }
111 if(length==sizeof(csm->locale)) {
112 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
113 }
114 if(U_SUCCESS(*pErrorCode)) {
115 csm->caseLocale = ucase_getCaseLocale(csm->locale);
116 } else {
117 csm->locale[0]=0;
118 csm->caseLocale = UCASE_LOC_ROOT;
119 }
120 }
121
122 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)123 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
124 if(U_FAILURE(*pErrorCode)) {
125 return;
126 }
127 csm->options=options;
128 }
129
130 /* UTF-8 string case mappings ----------------------------------------------- */
131
132 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
133
134 namespace {
135
136 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
137 inline UBool
appendResult(int32_t cpLength,int32_t result,const char16_t * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)138 appendResult(int32_t cpLength, int32_t result, const char16_t *s,
139 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
140 U_ASSERT(U_SUCCESS(errorCode));
141
142 /* decode the result */
143 if(result<0) {
144 /* (not) original code point */
145 if(edits!=nullptr) {
146 edits->addUnchanged(cpLength);
147 }
148 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
149 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
150 }
151 } else {
152 if(result<=UCASE_MAX_STRING_LENGTH) {
153 // string: "result" is the UTF-16 length
154 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
155 } else {
156 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
157 }
158 }
159 return true;
160 }
161
162 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)163 inline uint8_t getTwoByteLead(UChar32 c) { return static_cast<uint8_t>((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)164 inline uint8_t getTwoByteTrail(UChar32 c) { return static_cast<uint8_t>((c & 0x3f) | 0x80); }
165
166 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)167 utf8_caseContextIterator(void *context, int8_t dir) {
168 UCaseContext* csc = static_cast<UCaseContext*>(context);
169 UChar32 c;
170
171 if(dir<0) {
172 /* reset for backward iteration */
173 csc->index=csc->cpStart;
174 csc->dir=dir;
175 } else if(dir>0) {
176 /* reset for forward iteration */
177 csc->index=csc->cpLimit;
178 csc->dir=dir;
179 } else {
180 /* continue current iteration direction */
181 dir=csc->dir;
182 }
183
184 if(dir<0) {
185 if(csc->start<csc->index) {
186 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
187 return c;
188 }
189 } else {
190 if(csc->index<csc->limit) {
191 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
192 return c;
193 }
194 }
195 return U_SENTINEL;
196 }
197
/**
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * Output is written to sink; if edits is not nullptr, the changes are recorded there too.
 * Unchanged input is not written immediately: the range [prev..current position[
 * is kept pending and flushed via ByteSinkUtil::appendUnchanged() right before
 * the next changed piece, and once more at the end.
 *
 * csc is dereferenced only when caseLocale >= 0 (for ucase_toFullLower()),
 * so callers that case-fold may pass nullptr.
 *
 * errorCode is tested at the top of every fast-path iteration; once it indicates
 * a failure the loops terminate.
 */
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Select the per-code-point delta table for U+0000..U+017F.
    // Turkish/Lithuanian lowercasing and non-default case folding use the TR_LT table.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = srcStart;      // start of the pending (not yet written) unchanged run
    int32_t srcIndex = srcStart;  // read position
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;  // set whenever the inner loop exits with c >= 0
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                // End of input or error: signal the outer loop to stop.
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table holds the delta to add, 0 for no change,
                // or EXC when the slow path must decide.
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = static_cast<char>(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Any other lead byte below 0xe3 falls through to the generic decoder below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (the bytes stay in the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            // The inner loop ended because of end of input or an error.
            break;
        }
        // slow path
        const char16_t *s;
        if (caseLocale >= 0) {
            // Tell the context iterator where the current code point sits.
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        if (c >= 0) {
            // Only a non-negative result produces output here;
            // otherwise the code point remains part of the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged input is still pending.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
312
/**
 * Uppercases the UTF-8 text src[0..srcLength[ according to caseLocale.
 *
 * Output is written to sink; if edits is not nullptr, the changes are recorded there too.
 * Unchanged input is kept pending in [prev..current position[ and flushed via
 * ByteSinkUtil::appendUnchanged() right before the next changed piece and at the end.
 *
 * csc is used for the context-sensitive ucase_toFullUpper() call on the slow path
 * and is dereferenced there unconditionally.
 *
 * errorCode is tested at the top of every fast-path iteration; once it indicates
 * a failure the loops terminate.
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Select the per-code-point delta table for U+0000..U+017F; only Turkish differs.
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = 0;      // start of the pending (not yet written) unchanged run
    int32_t srcIndex = 0;  // read position
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;  // set whenever the inner loop exits with c >= 0
        UChar32 c;
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                // End of input or error: signal the outer loop to stop.
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: the table holds the delta to add, 0 for no change,
                // or EXC when the slow path must decide.
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = static_cast<char>(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Any other lead byte below 0xe3 falls through to the generic decoder below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (the bytes stay in the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            // The inner loop ended because of end of input or an error.
            break;
        }
        // slow path
        // Tell the context iterator where the current code point sits.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const char16_t *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        if (c >= 0) {
            // Only a non-negative result produces output here;
            // otherwise the code point remains part of the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush whatever unchanged input is still pending.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
416
417 } // namespace
418
419 #if !UCONFIG_NO_BREAK_ITERATION
420
421 namespace {
422
// First byte of the UTF-8 encoding of U+0301 (combining acute accent).
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];

// Second byte of the UTF-8 encoding of U+0301 (combining acute accent).
constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
426
/**
 * Input: c is a letter I with or without acute accent.
 * start is the index in src after c, and is less than segmentLimit.
 * If a plain i/I is followed by a plain j/J,
 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
 * then we output accordingly.
 *
 * Nothing is written to sink unless the whole pattern matches;
 * every early exit returns start before any output is produced.
 *
 * @param src          the UTF-8 source text
 * @param c            the already titlecased first letter (u'I' or precomposed I with acute)
 * @param start        index in src right after c
 * @param segmentLimit end of the current titlecasing segment
 * @param sink         receives the remainder of the IJ sequence
 * @param options      option bits passed to ByteSinkUtil::appendUnchanged()
 * @param edits        optional change log, may be nullptr
 * @param errorCode    ICU error code passed to ByteSinkUtil::appendUnchanged()
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
 */
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    U_ASSERT(start < segmentLimit);

    int32_t index = start;
    bool withAcute = false;

    // If the conditions are met, then the following variables tell us what to output.
    int32_t unchanged1 = 0;  // UTF-8 bytes before the j, or the whole sequence (0..5)
    bool doTitleJ = false;  // true if the j needs to be titlecased
    int32_t unchanged2 = 0;  // UTF-8 bytes after the j (0 or 2)

    // next character after the first letter
    // (read as a single byte; only ASCII j/J or the first byte of U+0301 can match below)
    UChar32 c2;
    c2 = src[index++];

    // Is the first letter an i/I with accent?
    if (c == u'I') {
        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
            withAcute = true;
            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
            if (index == segmentLimit) { return start; }
            c2 = src[index++];
        }
    } else {  // Í
        withAcute = true;
    }

    // Is the next character a j/J?
    if (c2 == u'j') {
        doTitleJ = true;
    } else if (c2 == u'J') {
        // Already uppercase: the J is copied as part of the first unchanged run.
        ++unchanged1;
    } else {
        return start;
    }

    // A plain i/I must be followed by a plain j/J.
    // An i/I with acute must be followed by a j/J with acute.
    if (withAcute) {
        // Both bytes of the accent must lie inside the segment.
        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
            return start;
        }
        if (doTitleJ) {
            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
        } else {
            unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
        }
    }

    // There must not be another combining mark.
    if (index < segmentLimit) {
        int32_t cp;
        int32_t i = index;
        U8_NEXT(src, i, segmentLimit, cp);
        uint32_t typeMask = U_GET_GC_MASK(cp);
        if ((typeMask & U_GC_M_MASK) != 0) {
            return start;
        }
    }

    // Output the rest of the Dutch IJ.
    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
    start += unchanged1;
    if (doTitleJ) {
        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
        ++start;
    }
    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);

    U_ASSERT(start + unchanged2 == index);
    return index;
}
509
510 } // namespace
511
/**
 * Titlecases the UTF-8 text src[0..srcLength[.
 *
 * The text is processed segment by segment, where the segment boundaries come
 * from iter->first() and iter->next(). In each segment the first suitable
 * character is titlecased and the remainder is lowercased
 * (or copied as-is with U_TITLECASE_NO_LOWERCASE).
 *
 * @param caseLocale case locale constant; UCASE_LOC_DUTCH enables the IJ special case
 * @param options    option bits; validated by ustrcase_checkTitleAdjustmentOptions()
 * @param iter       break iterator providing segment boundaries; dereferenced
 *                   without a null check
 * @param src        the UTF-8 source text
 * @param srcLength  number of bytes in src
 * @param sink       receives the output bytes
 * @param edits      optional change log, may be nullptr
 * @param errorCode  in/out ICU error code; the function returns early once a
 *                   helper reports a failure
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }

    /* set up local variables */
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    int32_t prev=0;
    UBool isFirstIndex=true;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        int32_t index;
        if(isFirstIndex) {
            isFirstIndex=false;
            index=iter->first();
        } else {
            index=iter->next();
        }
        // Clamp the boundary so that the final segment ends exactly at srcLength.
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Segment [prev..index[ into 3 parts:
         * a) skipped characters (copy as-is) [prev..titleStart[
         * b) first letter (titlecase)        [titleStart..titleLimit[
         * c) subsequent characters (lowercase) [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy skipped characters [prev..titleStart[ */
            int32_t titleStart=prev;
            int32_t titleLimit=prev;
            UChar32 c;
            U8_NEXT(src, titleLimit, index, c);
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
                // Adjust the titlecasing index to the next cased character,
                // or to the next letter/number/symbol/private use.
                // Stop with titleStart<titleLimit<=index
                // if there is a character to be titlecased,
                // or else stop with titleStart==titleLimit==index.
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart=titleLimit;
                    if(titleLimit==index) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, index, c);
                }
                if (prev < titleStart) {
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                if(c>=0) {
                    csc.cpStart=titleStart;
                    csc.cpLimit=titleLimit;
                    const char16_t *s;
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }

                /* Special case Dutch IJ titlecasing */
                if (titleLimit < index &&
                    caseLocale == UCASE_LOC_DUTCH) {
                    // A negative value is inverted back before the letter test below;
                    // appendResult() likewise reads ~result as the original code point.
                    if (c < 0) {
                        c = ~c;
                    }

                    if (c == u'I' || c == u'Í') {
                        // Returns titleLimit unchanged if no IJ sequence follows.
                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
                    }
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, index,
                                sink, edits, errorCode);
                        if(U_FAILURE(errorCode)) {
                            return;
                        }
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    }
                }
            }
        }

        prev=index;
    }
}
631
632 #endif
633
634 U_NAMESPACE_BEGIN
635 namespace GreekUpper {
636
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)637 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
638 while (i < length) {
639 UChar32 c;
640 U8_NEXT(s, i, length, c);
641 int32_t type = ucase_getTypeOrIgnorable(c);
642 if ((type & UCASE_IGNORABLE) != 0) {
643 // Case-ignorable, continue with the loop.
644 } else if (type != UCASE_NONE) {
645 return true; // Followed by cased letter.
646 } else {
647 return false; // Uncased and not case-ignorable.
648 }
649 }
650 return false; // Not followed by cased letter.
651 }
652
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
/**
 * Uppercases the UTF-8 text src[0..srcLength[ with the Greek-specific rules coded below.
 *
 * The loop handles one code point per iteration together with the Greek
 * diacritics that directly follow it. "state" carries flags about the previous
 * character (AFTER_CASED, AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT,
 * AFTER_VOWEL_WITH_COMBINING_ACCENT) into the next iteration.
 * Code points without Greek letter data go through ucase_toFullUpper() with
 * UCASE_LOC_GREEK; ill-formed UTF-8 is passed through unchanged.
 *
 * @param options   option bits; U_OMIT_UNCHANGED_TEXT suppresses unchanged output
 * @param src       the UTF-8 source text
 * @param srcLength number of bytes in src
 * @param sink      receives the output bytes
 * @param edits     optional change log, may be nullptr
 * @param errorCode in/out ICU error code; the function returns early when a
 *                  helper reports a failure
 */
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 &&
                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
                    0 &&
                (upper == 0x399 || upper == 0x3A5)) {
                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
                                                                          : HAS_COMBINING_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Captured before the diacritic data is merged into "data" below.
            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks one code point ahead; nextIndex only advances
            // once that code point turned out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = false;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                if (hasPrecomposedAccent) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = true;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }

            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = true;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // This compares the bytes that would be written (letter, optional
                // dialytika, optional tonos) against the source bytes [i..nextIndex[.
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != static_cast<uint8_t>(u8"\u0308"[0]) ||
                            src[i2 + 1] != static_cast<uint8_t>(u8"\u0308"[1]);
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != static_cast<uint8_t>(u8"\u0301"[0]) ||
                            src[i2 + 1] != static_cast<uint8_t>(u8"\u0301"[1]);
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != nullptr) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != nullptr) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    // From here on "change" only means "write the output bytes".
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }

            if (change) {
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.AppendU8(u8"\u0301", 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.AppendU8(u8"\u0399", 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek letter data: use the generic full uppercase mapping without context.
            const char16_t *s;
            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}
806
807 } // namespace GreekUpper
808 U_NAMESPACE_END
809
810 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)811 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
812 const uint8_t *src, int32_t srcLength,
813 icu::ByteSink &sink, icu::Edits *edits,
814 UErrorCode &errorCode) {
815 UCaseContext csc=UCASECONTEXT_INITIALIZER;
816 csc.p=(void *)src;
817 csc.limit=srcLength;
818 toLower(
819 caseLocale, options,
820 src, &csc, 0, srcLength,
821 sink, edits, errorCode);
822 }
823
824 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)825 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
826 const uint8_t *src, int32_t srcLength,
827 icu::ByteSink &sink, icu::Edits *edits,
828 UErrorCode &errorCode) {
829 if (caseLocale == UCASE_LOC_GREEK) {
830 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
831 } else {
832 UCaseContext csc=UCASECONTEXT_INITIALIZER;
833 csc.p=(void *)src;
834 csc.limit=srcLength;
835 toUpper(
836 caseLocale, options,
837 src, &csc, srcLength,
838 sink, edits, errorCode);
839 }
840 }
841
842 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)843 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
844 const uint8_t *src, int32_t srcLength,
845 icu::ByteSink &sink, icu::Edits *edits,
846 UErrorCode &errorCode) {
847 toLower(
848 -1, options,
849 src, nullptr, 0, srcLength,
850 sink, edits, errorCode);
851 }
852
853 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)854 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
855 const char *src, int32_t srcLength,
856 UTF8CaseMapper *stringCaseMapper,
857 icu::ByteSink &sink, icu::Edits *edits,
858 UErrorCode &errorCode) {
859 /* check argument values */
860 if (U_FAILURE(errorCode)) {
861 return;
862 }
863 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
864 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
865 return;
866 }
867
868 // Get the string length.
869 if (srcLength == -1) {
870 srcLength = static_cast<int32_t>(uprv_strlen(src));
871 }
872
873 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
874 edits->reset();
875 }
876 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
877 reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, errorCode);
878 sink.Flush();
879 if (U_SUCCESS(errorCode)) {
880 if (edits != nullptr) {
881 edits->copyErrorTo(errorCode);
882 }
883 }
884 }
885
886 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)887 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
888 char *dest, int32_t destCapacity,
889 const char *src, int32_t srcLength,
890 UTF8CaseMapper *stringCaseMapper,
891 icu::Edits *edits,
892 UErrorCode &errorCode) {
893 /* check argument values */
894 if(U_FAILURE(errorCode)) {
895 return 0;
896 }
897 if( destCapacity<0 ||
898 (dest==nullptr && destCapacity>0) ||
899 (src==nullptr && srcLength!=0) || srcLength<-1
900 ) {
901 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
902 return 0;
903 }
904
905 /* get the string length */
906 if(srcLength==-1) {
907 srcLength = static_cast<int32_t>(uprv_strlen(src));
908 }
909
910 /* check for overlapping source and destination */
911 if( dest!=nullptr &&
912 ((src>=dest && src<(dest+destCapacity)) ||
913 (dest>=src && dest<(src+srcLength)))
914 ) {
915 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
916 return 0;
917 }
918
919 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
920 edits->reset();
921 }
922 int32_t reslen = ByteSinkUtil::viaByteSinkToTerminatedChars(
923 dest, destCapacity,
924 [&](ByteSink& sink, UErrorCode& status) {
925 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
926 reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, status);
927 },
928 errorCode);
929 if (U_SUCCESS(errorCode) && edits != nullptr) {
930 edits->copyErrorTo(errorCode);
931 }
932 return reslen;
933 }
934
935 /* public API functions */
936
937 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)938 ucasemap_utf8ToLower(const UCaseMap *csm,
939 char *dest, int32_t destCapacity,
940 const char *src, int32_t srcLength,
941 UErrorCode *pErrorCode) {
942 return ucasemap_mapUTF8(
943 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
944 dest, destCapacity,
945 src, srcLength,
946 ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
947 }
948
949 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)950 ucasemap_utf8ToUpper(const UCaseMap *csm,
951 char *dest, int32_t destCapacity,
952 const char *src, int32_t srcLength,
953 UErrorCode *pErrorCode) {
954 return ucasemap_mapUTF8(
955 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
956 dest, destCapacity,
957 src, srcLength,
958 ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
959 }
960
961 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)962 ucasemap_utf8FoldCase(const UCaseMap *csm,
963 char *dest, int32_t destCapacity,
964 const char *src, int32_t srcLength,
965 UErrorCode *pErrorCode) {
966 return ucasemap_mapUTF8(
967 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
968 dest, destCapacity,
969 src, srcLength,
970 ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
971 }
972
973 U_NAMESPACE_BEGIN
974
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)975 void CaseMap::utf8ToLower(
976 const char *locale, uint32_t options,
977 StringPiece src, ByteSink &sink, Edits *edits,
978 UErrorCode &errorCode) {
979 ucasemap_mapUTF8(
980 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
981 src.data(), src.length(),
982 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
983 }
984
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)985 void CaseMap::utf8ToUpper(
986 const char *locale, uint32_t options,
987 StringPiece src, ByteSink &sink, Edits *edits,
988 UErrorCode &errorCode) {
989 ucasemap_mapUTF8(
990 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
991 src.data(), src.length(),
992 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
993 }
994
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)995 void CaseMap::utf8Fold(
996 uint32_t options,
997 StringPiece src, ByteSink &sink, Edits *edits,
998 UErrorCode &errorCode) {
999 ucasemap_mapUTF8(
1000 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1001 src.data(), src.length(),
1002 ucasemap_internalUTF8Fold, sink, edits, errorCode);
1003 }
1004
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1005 int32_t CaseMap::utf8ToLower(
1006 const char *locale, uint32_t options,
1007 const char *src, int32_t srcLength,
1008 char *dest, int32_t destCapacity, Edits *edits,
1009 UErrorCode &errorCode) {
1010 return ucasemap_mapUTF8(
1011 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1012 dest, destCapacity,
1013 src, srcLength,
1014 ucasemap_internalUTF8ToLower, edits, errorCode);
1015 }
1016
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1017 int32_t CaseMap::utf8ToUpper(
1018 const char *locale, uint32_t options,
1019 const char *src, int32_t srcLength,
1020 char *dest, int32_t destCapacity, Edits *edits,
1021 UErrorCode &errorCode) {
1022 return ucasemap_mapUTF8(
1023 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1024 dest, destCapacity,
1025 src, srcLength,
1026 ucasemap_internalUTF8ToUpper, edits, errorCode);
1027 }
1028
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1029 int32_t CaseMap::utf8Fold(
1030 uint32_t options,
1031 const char *src, int32_t srcLength,
1032 char *dest, int32_t destCapacity, Edits *edits,
1033 UErrorCode &errorCode) {
1034 return ucasemap_mapUTF8(
1035 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1036 dest, destCapacity,
1037 src, srcLength,
1038 ucasemap_internalUTF8Fold, edits, errorCode);
1039 }
1040
1041 U_NAMESPACE_END
1042