1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucasemap.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2005may06
16 * created by: Markus W. Scherer
17 *
18 * Case mapping service object and functions using it.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45
46 U_NAMESPACE_USE
47
48 /* UCaseMap service object -------------------------------------------------- */
49
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52 iter(NULL),
53 #endif
54 caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55 ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60 delete iter;
61 #endif
62 }
63
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66 if(U_FAILURE(*pErrorCode)) {
67 return NULL;
68 }
69 UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70 if(csm==NULL) {
71 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72 return NULL;
73 } else if (U_FAILURE(*pErrorCode)) {
74 delete csm;
75 return NULL;
76 }
77 return csm;
78 }
79
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82 delete csm;
83 }
84
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87 return csm->locale;
88 }
89
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92 return csm->options;
93 }
94
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97 if(U_FAILURE(*pErrorCode)) {
98 return;
99 }
100 if (locale != NULL && *locale == 0) {
101 csm->locale[0] = 0;
102 csm->caseLocale = UCASE_LOC_ROOT;
103 return;
104 }
105
106 int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108 *pErrorCode=U_ZERO_ERROR;
109 /* we only really need the language code for case mappings */
110 length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111 }
112 if(length==sizeof(csm->locale)) {
113 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114 }
115 if(U_SUCCESS(*pErrorCode)) {
116 csm->caseLocale=UCASE_LOC_UNKNOWN;
117 csm->caseLocale = ucase_getCaseLocale(csm->locale);
118 } else {
119 csm->locale[0]=0;
120 csm->caseLocale = UCASE_LOC_ROOT;
121 }
122 }
123
124 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
126 if(U_FAILURE(*pErrorCode)) {
127 return;
128 }
129 csm->options=options;
130 }
131
132 /* UTF-8 string case mappings ----------------------------------------------- */
133
134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
135
136 namespace {
137
138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
139 inline UBool
appendResult(int32_t cpLength,int32_t result,const UChar * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)140 appendResult(int32_t cpLength, int32_t result, const UChar *s,
141 ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142 U_ASSERT(U_SUCCESS(errorCode));
143
144 /* decode the result */
145 if(result<0) {
146 /* (not) original code point */
147 if(edits!=NULL) {
148 edits->addUnchanged(cpLength);
149 }
150 if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151 ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152 }
153 } else {
154 if(result<=UCASE_MAX_STRING_LENGTH) {
155 // string: "result" is the UTF-16 length
156 return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157 } else {
158 ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159 }
160 }
161 return TRUE;
162 }
163
164 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
167
168 } // namespace
169
170 static UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)171 utf8_caseContextIterator(void *context, int8_t dir) {
172 UCaseContext *csc=(UCaseContext *)context;
173 UChar32 c;
174
175 if(dir<0) {
176 /* reset for backward iteration */
177 csc->index=csc->cpStart;
178 csc->dir=dir;
179 } else if(dir>0) {
180 /* reset for forward iteration */
181 csc->index=csc->cpLimit;
182 csc->dir=dir;
183 } else {
184 /* continue current iteration direction */
185 dir=csc->dir;
186 }
187
188 if(dir<0) {
189 if(csc->start<csc->index) {
190 U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
191 return c;
192 }
193 } else {
194 if(csc->index<csc->limit) {
195 U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
196 return c;
197 }
198 }
199 return U_SENTINEL;
200 }
201
202 /*
203 * Case-maps [srcStart..srcLimit[ but takes
204 * context [0..srcLength[ into account.
205 */
206 static void
_caseMap(int32_t caseLocale,uint32_t options,UCaseMapFull * map,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)207 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
208 const uint8_t *src, UCaseContext *csc,
209 int32_t srcStart, int32_t srcLimit,
210 icu::ByteSink &sink, icu::Edits *edits,
211 UErrorCode &errorCode) {
212 /* case mapping loop */
213 int32_t srcIndex=srcStart;
214 while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
215 int32_t cpStart;
216 csc->cpStart=cpStart=srcIndex;
217 UChar32 c;
218 U8_NEXT(src, srcIndex, srcLimit, c);
219 csc->cpLimit=srcIndex;
220 if(c<0) {
221 // Malformed UTF-8.
222 ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
223 sink, options, edits, errorCode);
224 } else {
225 const UChar *s;
226 c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
227 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
228 }
229 }
230 }
231
232 #if !UCONFIG_NO_BREAK_ITERATION
233
234 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)235 ucasemap_internalUTF8ToTitle(
236 int32_t caseLocale, uint32_t options, BreakIterator *iter,
237 const uint8_t *src, int32_t srcLength,
238 ByteSink &sink, icu::Edits *edits,
239 UErrorCode &errorCode) {
240 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
241 return;
242 }
243
244 /* set up local variables */
245 UCaseContext csc=UCASECONTEXT_INITIALIZER;
246 csc.p=(void *)src;
247 csc.limit=srcLength;
248 int32_t prev=0;
249 UBool isFirstIndex=TRUE;
250
251 /* titlecasing loop */
252 while(prev<srcLength) {
253 /* find next index where to titlecase */
254 int32_t index;
255 if(isFirstIndex) {
256 isFirstIndex=FALSE;
257 index=iter->first();
258 } else {
259 index=iter->next();
260 }
261 if(index==UBRK_DONE || index>srcLength) {
262 index=srcLength;
263 }
264
265 /*
266 * Segment [prev..index[ into 3 parts:
267 * a) skipped characters (copy as-is) [prev..titleStart[
268 * b) first letter (titlecase) [titleStart..titleLimit[
269 * c) subsequent characters (lowercase) [titleLimit..index[
270 */
271 if(prev<index) {
272 /* find and copy skipped characters [prev..titleStart[ */
273 int32_t titleStart=prev;
274 int32_t titleLimit=prev;
275 UChar32 c;
276 U8_NEXT(src, titleLimit, index, c);
277 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
278 // Adjust the titlecasing index to the next cased character,
279 // or to the next letter/number/symbol/private use.
280 // Stop with titleStart<titleLimit<=index
281 // if there is a character to be titlecased,
282 // or else stop with titleStart==titleLimit==index.
283 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
284 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
285 titleStart=titleLimit;
286 if(titleLimit==index) {
287 break;
288 }
289 U8_NEXT(src, titleLimit, index, c);
290 }
291 if (prev < titleStart) {
292 if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
293 sink, options, edits, errorCode)) {
294 return;
295 }
296 }
297 }
298
299 if(titleStart<titleLimit) {
300 /* titlecase c which is from [titleStart..titleLimit[ */
301 if(c>=0) {
302 csc.cpStart=titleStart;
303 csc.cpLimit=titleLimit;
304 const UChar *s;
305 c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
306 if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
307 return;
308 }
309 } else {
310 // Malformed UTF-8.
311 if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
312 sink, options, edits, errorCode)) {
313 return;
314 }
315 }
316
317 /* Special case Dutch IJ titlecasing */
318 if (titleStart+1 < index &&
319 caseLocale == UCASE_LOC_DUTCH &&
320 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
321 if (src[titleStart+1] == 0x006A) {
322 ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
323 titleLimit++;
324 } else if (src[titleStart+1] == 0x004A) {
325 // Keep the capital J from getting lowercased.
326 if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
327 sink, options, edits, errorCode)) {
328 return;
329 }
330 titleLimit++;
331 }
332 }
333
334 /* lowercase [titleLimit..index[ */
335 if(titleLimit<index) {
336 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
337 /* Normal operation: Lowercase the rest of the word. */
338 _caseMap(caseLocale, options, ucase_toFullLower,
339 src, &csc,
340 titleLimit, index,
341 sink, edits, errorCode);
342 if(U_FAILURE(errorCode)) {
343 return;
344 }
345 } else {
346 /* Optionally just copy the rest of the word unchanged. */
347 if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
348 sink, options, edits, errorCode)) {
349 return;
350 }
351 }
352 }
353 }
354 }
355
356 prev=index;
357 }
358 }
359
360 #endif
361
362 U_NAMESPACE_BEGIN
363 namespace GreekUpper {
364
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)365 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
366 while (i < length) {
367 UChar32 c;
368 U8_NEXT(s, i, length, c);
369 int32_t type = ucase_getTypeOrIgnorable(c);
370 if ((type & UCASE_IGNORABLE) != 0) {
371 // Case-ignorable, continue with the loop.
372 } else if (type != UCASE_NONE) {
373 return TRUE; // Followed by cased letter.
374 } else {
375 return FALSE; // Uncased and not case-ignorable.
376 }
377 }
378 return FALSE; // Not followed by cased letter.
379 }
380
381 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)382 void toUpper(uint32_t options,
383 const uint8_t *src, int32_t srcLength,
384 ByteSink &sink, Edits *edits,
385 UErrorCode &errorCode) {
386 uint32_t state = 0;
387 for (int32_t i = 0; i < srcLength;) {
388 int32_t nextIndex = i;
389 UChar32 c;
390 U8_NEXT(src, nextIndex, srcLength, c);
391 uint32_t nextState = 0;
392 int32_t type = ucase_getTypeOrIgnorable(c);
393 if ((type & UCASE_IGNORABLE) != 0) {
394 // c is case-ignorable
395 nextState |= (state & AFTER_CASED);
396 } else if (type != UCASE_NONE) {
397 // c is cased
398 nextState |= AFTER_CASED;
399 }
400 uint32_t data = getLetterData(c);
401 if (data > 0) {
402 uint32_t upper = data & UPPER_MASK;
403 // Add a dialytika to this iota or ypsilon vowel
404 // if we removed a tonos from the previous vowel,
405 // and that previous vowel did not also have (or gain) a dialytika.
406 // Adding one only to the final vowel in a longer sequence
407 // (which does not occur in normal writing) would require lookahead.
408 // Set the same flag as for preserving an existing dialytika.
409 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
410 (upper == 0x399 || upper == 0x3A5)) {
411 data |= HAS_DIALYTIKA;
412 }
413 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
414 if ((data & HAS_YPOGEGRAMMENI) != 0) {
415 numYpogegrammeni = 1;
416 }
417 // Skip combining diacritics after this Greek letter.
418 int32_t nextNextIndex = nextIndex;
419 while (nextIndex < srcLength) {
420 UChar32 c2;
421 U8_NEXT(src, nextNextIndex, srcLength, c2);
422 uint32_t diacriticData = getDiacriticData(c2);
423 if (diacriticData != 0) {
424 data |= diacriticData;
425 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
426 ++numYpogegrammeni;
427 }
428 nextIndex = nextNextIndex;
429 } else {
430 break; // not a Greek diacritic
431 }
432 }
433 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
434 nextState |= AFTER_VOWEL_WITH_ACCENT;
435 }
436 // Map according to Greek rules.
437 UBool addTonos = FALSE;
438 if (upper == 0x397 &&
439 (data & HAS_ACCENT) != 0 &&
440 numYpogegrammeni == 0 &&
441 (state & AFTER_CASED) == 0 &&
442 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
443 // Keep disjunctive "or" with (only) a tonos.
444 // We use the same "word boundary" conditions as for the Final_Sigma test.
445 if (i == nextIndex) {
446 upper = 0x389; // Preserve the precomposed form.
447 } else {
448 addTonos = TRUE;
449 }
450 } else if ((data & HAS_DIALYTIKA) != 0) {
451 // Preserve a vowel with dialytika in precomposed form if it exists.
452 if (upper == 0x399) {
453 upper = 0x3AA;
454 data &= ~HAS_EITHER_DIALYTIKA;
455 } else if (upper == 0x3A5) {
456 upper = 0x3AB;
457 data &= ~HAS_EITHER_DIALYTIKA;
458 }
459 }
460
461 UBool change;
462 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
463 change = TRUE; // common, simple usage
464 } else {
465 // Find out first whether we are changing the text.
466 U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block
467 change = (i + 2) > nextIndex ||
468 src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
469 numYpogegrammeni > 0;
470 int32_t i2 = i + 2;
471 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
472 change |= (i2 + 2) > nextIndex ||
473 src[i2] != (uint8_t)u8"\u0308"[0] ||
474 src[i2 + 1] != (uint8_t)u8"\u0308"[1];
475 i2 += 2;
476 }
477 if (addTonos) {
478 change |= (i2 + 2) > nextIndex ||
479 src[i2] != (uint8_t)u8"\u0301"[0] ||
480 src[i2 + 1] != (uint8_t)u8"\u0301"[1];
481 i2 += 2;
482 }
483 int32_t oldLength = nextIndex - i;
484 int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399
485 change |= oldLength != newLength;
486 if (change) {
487 if (edits != NULL) {
488 edits->addReplace(oldLength, newLength);
489 }
490 } else {
491 if (edits != NULL) {
492 edits->addUnchanged(oldLength);
493 }
494 // Write unchanged text?
495 change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
496 }
497 }
498
499 if (change) {
500 ByteSinkUtil::appendTwoBytes(upper, sink);
501 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
502 sink.Append(u8"\u0308", 2); // restore or add a dialytika
503 }
504 if (addTonos) {
505 sink.Append(u8"\u0301", 2);
506 }
507 while (numYpogegrammeni > 0) {
508 sink.Append(u8"\u0399", 2);
509 --numYpogegrammeni;
510 }
511 }
512 } else if(c>=0) {
513 const UChar *s;
514 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
515 if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
516 return;
517 }
518 } else {
519 // Malformed UTF-8.
520 if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
521 sink, options, edits, errorCode)) {
522 return;
523 }
524 }
525 i = nextIndex;
526 state = nextState;
527 }
528 }
529
530 } // namespace GreekUpper
531 U_NAMESPACE_END
532
533 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)534 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
535 const uint8_t *src, int32_t srcLength,
536 icu::ByteSink &sink, icu::Edits *edits,
537 UErrorCode &errorCode) {
538 UCaseContext csc=UCASECONTEXT_INITIALIZER;
539 csc.p=(void *)src;
540 csc.limit=srcLength;
541 _caseMap(
542 caseLocale, options, ucase_toFullLower,
543 src, &csc, 0, srcLength,
544 sink, edits, errorCode);
545 }
546
547 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)548 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
549 const uint8_t *src, int32_t srcLength,
550 icu::ByteSink &sink, icu::Edits *edits,
551 UErrorCode &errorCode) {
552 if (caseLocale == UCASE_LOC_GREEK) {
553 GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
554 } else {
555 UCaseContext csc=UCASECONTEXT_INITIALIZER;
556 csc.p=(void *)src;
557 csc.limit=srcLength;
558 _caseMap(
559 caseLocale, options, ucase_toFullUpper,
560 src, &csc, 0, srcLength,
561 sink, edits, errorCode);
562 }
563 }
564
565 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)566 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
567 const uint8_t *src, int32_t srcLength,
568 icu::ByteSink &sink, icu::Edits *edits,
569 UErrorCode &errorCode) {
570 /* case mapping loop */
571 int32_t srcIndex = 0;
572 while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
573 int32_t cpStart = srcIndex;
574 UChar32 c;
575 U8_NEXT(src, srcIndex, srcLength, c);
576 if(c<0) {
577 // Malformed UTF-8.
578 ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
579 sink, options, edits, errorCode);
580 } else {
581 const UChar *s;
582 c = ucase_toFullFolding(c, &s, options);
583 appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
584 }
585 }
586 }
587
588 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)589 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
590 const char *src, int32_t srcLength,
591 UTF8CaseMapper *stringCaseMapper,
592 icu::ByteSink &sink, icu::Edits *edits,
593 UErrorCode &errorCode) {
594 /* check argument values */
595 if (U_FAILURE(errorCode)) {
596 return;
597 }
598 if ((src == nullptr && srcLength != 0) || srcLength < -1) {
599 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
600 return;
601 }
602
603 // Get the string length.
604 if (srcLength == -1) {
605 srcLength = (int32_t)uprv_strlen((const char *)src);
606 }
607
608 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
609 edits->reset();
610 }
611 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
612 (const uint8_t *)src, srcLength, sink, edits, errorCode);
613 sink.Flush();
614 if (U_SUCCESS(errorCode)) {
615 if (edits != nullptr) {
616 edits->copyErrorTo(errorCode);
617 }
618 }
619 }
620
621 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)622 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
623 char *dest, int32_t destCapacity,
624 const char *src, int32_t srcLength,
625 UTF8CaseMapper *stringCaseMapper,
626 icu::Edits *edits,
627 UErrorCode &errorCode) {
628 /* check argument values */
629 if(U_FAILURE(errorCode)) {
630 return 0;
631 }
632 if( destCapacity<0 ||
633 (dest==NULL && destCapacity>0) ||
634 (src==NULL && srcLength!=0) || srcLength<-1
635 ) {
636 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
637 return 0;
638 }
639
640 /* get the string length */
641 if(srcLength==-1) {
642 srcLength=(int32_t)uprv_strlen((const char *)src);
643 }
644
645 /* check for overlapping source and destination */
646 if( dest!=NULL &&
647 ((src>=dest && src<(dest+destCapacity)) ||
648 (dest>=src && dest<(src+srcLength)))
649 ) {
650 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
651 return 0;
652 }
653
654 CheckedArrayByteSink sink(dest, destCapacity);
655 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
656 edits->reset();
657 }
658 stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
659 (const uint8_t *)src, srcLength, sink, edits, errorCode);
660 sink.Flush();
661 if (U_SUCCESS(errorCode)) {
662 if (sink.Overflowed()) {
663 errorCode = U_BUFFER_OVERFLOW_ERROR;
664 } else if (edits != nullptr) {
665 edits->copyErrorTo(errorCode);
666 }
667 }
668 return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
669 }
670
671 /* public API functions */
672
673 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)674 ucasemap_utf8ToLower(const UCaseMap *csm,
675 char *dest, int32_t destCapacity,
676 const char *src, int32_t srcLength,
677 UErrorCode *pErrorCode) {
678 return ucasemap_mapUTF8(
679 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
680 dest, destCapacity,
681 src, srcLength,
682 ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
683 }
684
685 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)686 ucasemap_utf8ToUpper(const UCaseMap *csm,
687 char *dest, int32_t destCapacity,
688 const char *src, int32_t srcLength,
689 UErrorCode *pErrorCode) {
690 return ucasemap_mapUTF8(
691 csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
692 dest, destCapacity,
693 src, srcLength,
694 ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
695 }
696
697 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)698 ucasemap_utf8FoldCase(const UCaseMap *csm,
699 char *dest, int32_t destCapacity,
700 const char *src, int32_t srcLength,
701 UErrorCode *pErrorCode) {
702 return ucasemap_mapUTF8(
703 UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
704 dest, destCapacity,
705 src, srcLength,
706 ucasemap_internalUTF8Fold, NULL, *pErrorCode);
707 }
708
709 U_NAMESPACE_BEGIN
710
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)711 void CaseMap::utf8ToLower(
712 const char *locale, uint32_t options,
713 StringPiece src, ByteSink &sink, Edits *edits,
714 UErrorCode &errorCode) {
715 ucasemap_mapUTF8(
716 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
717 src.data(), src.length(),
718 ucasemap_internalUTF8ToLower, sink, edits, errorCode);
719 }
720
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)721 void CaseMap::utf8ToUpper(
722 const char *locale, uint32_t options,
723 StringPiece src, ByteSink &sink, Edits *edits,
724 UErrorCode &errorCode) {
725 ucasemap_mapUTF8(
726 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
727 src.data(), src.length(),
728 ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
729 }
730
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)731 void CaseMap::utf8Fold(
732 uint32_t options,
733 StringPiece src, ByteSink &sink, Edits *edits,
734 UErrorCode &errorCode) {
735 ucasemap_mapUTF8(
736 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
737 src.data(), src.length(),
738 ucasemap_internalUTF8Fold, sink, edits, errorCode);
739 }
740
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)741 int32_t CaseMap::utf8ToLower(
742 const char *locale, uint32_t options,
743 const char *src, int32_t srcLength,
744 char *dest, int32_t destCapacity, Edits *edits,
745 UErrorCode &errorCode) {
746 return ucasemap_mapUTF8(
747 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
748 dest, destCapacity,
749 src, srcLength,
750 ucasemap_internalUTF8ToLower, edits, errorCode);
751 }
752
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)753 int32_t CaseMap::utf8ToUpper(
754 const char *locale, uint32_t options,
755 const char *src, int32_t srcLength,
756 char *dest, int32_t destCapacity, Edits *edits,
757 UErrorCode &errorCode) {
758 return ucasemap_mapUTF8(
759 ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
760 dest, destCapacity,
761 src, srcLength,
762 ucasemap_internalUTF8ToUpper, edits, errorCode);
763 }
764
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)765 int32_t CaseMap::utf8Fold(
766 uint32_t options,
767 const char *src, int32_t srcLength,
768 char *dest, int32_t destCapacity, Edits *edits,
769 UErrorCode &errorCode) {
770 return ucasemap_mapUTF8(
771 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
772 dest, destCapacity,
773 src, srcLength,
774 ucasemap_internalUTF8Fold, edits, errorCode);
775 }
776
777 U_NAMESPACE_END
778