1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ustrcase.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
17 *
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/ucasemap.h"
27 #include "unicode/ubrk.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "cmemory.h"
31 #include "ucase.h"
32 #include "ustr_imp.h"
33 #include "uassert.h"
34
35 U_NAMESPACE_USE
36
37 /* string casing ------------------------------------------------------------ */
38
39 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
40 static inline int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s)41 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
42 int32_t result, const UChar *s) {
43 UChar32 c;
44 int32_t length;
45
46 /* decode the result */
47 if(result<0) {
48 /* (not) original code point */
49 c=~result;
50 length=U16_LENGTH(c);
51 } else if(result<=UCASE_MAX_STRING_LENGTH) {
52 c=U_SENTINEL;
53 length=result;
54 } else {
55 c=result;
56 length=U16_LENGTH(c);
57 }
58 if(length>(INT32_MAX-destIndex)) {
59 return -1; // integer overflow
60 }
61
62 if(destIndex<destCapacity) {
63 /* append the result */
64 if(c>=0) {
65 /* code point */
66 UBool isError=FALSE;
67 U16_APPEND(dest, destIndex, destCapacity, c, isError);
68 if(isError) {
69 /* overflow, nothing written */
70 destIndex+=length;
71 }
72 } else {
73 /* string */
74 if((destIndex+length)<=destCapacity) {
75 while(length>0) {
76 dest[destIndex++]=*s++;
77 --length;
78 }
79 } else {
80 /* overflow */
81 destIndex+=length;
82 }
83 }
84 } else {
85 /* preflight */
86 destIndex+=length;
87 }
88 return destIndex;
89 }
90
91 static inline int32_t
appendUChar(UChar * dest,int32_t destIndex,int32_t destCapacity,UChar c)92 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
93 if(destIndex<destCapacity) {
94 dest[destIndex]=c;
95 } else if(destIndex==INT32_MAX) {
96 return -1; // integer overflow
97 }
98 return destIndex+1;
99 }
100
101 static inline int32_t
appendString(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length)102 appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
103 const UChar *s, int32_t length) {
104 if(length>0) {
105 if(length>(INT32_MAX-destIndex)) {
106 return -1; // integer overflow
107 }
108 if((destIndex+length)<=destCapacity) {
109 u_memcpy(dest+destIndex, s, length);
110 }
111 destIndex+=length;
112 }
113 return destIndex;
114 }
115
116 static UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)117 utf16_caseContextIterator(void *context, int8_t dir) {
118 UCaseContext *csc=(UCaseContext *)context;
119 UChar32 c;
120
121 if(dir<0) {
122 /* reset for backward iteration */
123 csc->index=csc->cpStart;
124 csc->dir=dir;
125 } else if(dir>0) {
126 /* reset for forward iteration */
127 csc->index=csc->cpLimit;
128 csc->dir=dir;
129 } else {
130 /* continue current iteration direction */
131 dir=csc->dir;
132 }
133
134 if(dir<0) {
135 if(csc->start<csc->index) {
136 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
137 return c;
138 }
139 } else {
140 if(csc->index<csc->limit) {
141 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
142 return c;
143 }
144 }
145 return U_SENTINEL;
146 }
147
148 /*
149 * Case-maps [srcStart..srcLimit[ but takes
150 * context [0..srcLength[ into account.
151 */
152 static int32_t
_caseMap(const UCaseMap * csm,UCaseMapFull * map,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,UErrorCode * pErrorCode)153 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
154 UChar *dest, int32_t destCapacity,
155 const UChar *src, UCaseContext *csc,
156 int32_t srcStart, int32_t srcLimit,
157 UErrorCode *pErrorCode) {
158 const UChar *s;
159 UChar32 c, c2 = 0;
160 int32_t srcIndex, destIndex;
161 int32_t locCache;
162
163 locCache=csm->locCache;
164
165 /* case mapping loop */
166 srcIndex=srcStart;
167 destIndex=0;
168 while(srcIndex<srcLimit) {
169 csc->cpStart=srcIndex;
170 U16_NEXT(src, srcIndex, srcLimit, c);
171 csc->cpLimit=srcIndex;
172 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
173 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
174 /* fast path version of appendResult() for BMP results */
175 dest[destIndex++]=(UChar)c2;
176 } else {
177 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
178 if(destIndex<0) {
179 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
180 return 0;
181 }
182 }
183 }
184
185 if(destIndex>destCapacity) {
186 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
187 }
188 return destIndex;
189 }
190
191 #if !UCONFIG_NO_BREAK_ITERATION
192
193 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)194 ustrcase_internalToTitle(const UCaseMap *csm,
195 UChar *dest, int32_t destCapacity,
196 const UChar *src, int32_t srcLength,
197 UErrorCode *pErrorCode) {
198 const UChar *s;
199 UChar32 c;
200 int32_t prev, titleStart, titleLimit, idx, destIndex;
201 UBool isFirstIndex;
202
203 if(U_FAILURE(*pErrorCode)) {
204 return 0;
205 }
206
207 // Use the C++ abstract base class to minimize dependencies.
208 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
209 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
210
211 /* set up local variables */
212 int32_t locCache=csm->locCache;
213 UCaseContext csc=UCASECONTEXT_INITIALIZER;
214 csc.p=(void *)src;
215 csc.limit=srcLength;
216 destIndex=0;
217 prev=0;
218 isFirstIndex=TRUE;
219
220 /* titlecasing loop */
221 while(prev<srcLength) {
222 /* find next index where to titlecase */
223 if(isFirstIndex) {
224 isFirstIndex=FALSE;
225 idx=bi->first();
226 } else {
227 idx=bi->next();
228 }
229 if(idx==UBRK_DONE || idx>srcLength) {
230 idx=srcLength;
231 }
232
233 /*
234 * Unicode 4 & 5 section 3.13 Default Case Operations:
235 *
236 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
237 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
238 * cased character F. If F exists, map F to default_title(F); then map each
239 * subsequent character C to default_lower(C).
240 *
241 * In this implementation, segment [prev..index[ into 3 parts:
242 * a) uncased characters (copy as-is) [prev..titleStart[
243 * b) first case letter (titlecase) [titleStart..titleLimit[
244 * c) subsequent characters (lowercase) [titleLimit..index[
245 */
246 if(prev<idx) {
247 /* find and copy uncased characters [prev..titleStart[ */
248 titleStart=titleLimit=prev;
249 U16_NEXT(src, titleLimit, idx, c);
250 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
251 /* Adjust the titlecasing index (titleStart) to the next cased character. */
252 for(;;) {
253 titleStart=titleLimit;
254 if(titleLimit==idx) {
255 /*
256 * only uncased characters in [prev..index[
257 * stop with titleStart==titleLimit==index
258 */
259 break;
260 }
261 U16_NEXT(src, titleLimit, idx, c);
262 if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
263 break; /* cased letter at [titleStart..titleLimit[ */
264 }
265 }
266 destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
267 if(destIndex<0) {
268 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
269 return 0;
270 }
271 }
272
273 if(titleStart<titleLimit) {
274 /* titlecase c which is from [titleStart..titleLimit[ */
275 csc.cpStart=titleStart;
276 csc.cpLimit=titleLimit;
277 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
278 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
279 if(destIndex<0) {
280 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
281 return 0;
282 }
283
284 /* Special case Dutch IJ titlecasing */
285 if (titleStart+1 < idx &&
286 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
287 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
288 (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
289 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
290 if(destIndex<0) {
291 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
292 return 0;
293 }
294 titleLimit++;
295 }
296
297 /* lowercase [titleLimit..index[ */
298 if(titleLimit<idx) {
299 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
300 /* Normal operation: Lowercase the rest of the word. */
301 destIndex+=
302 _caseMap(
303 csm, ucase_toFullLower,
304 dest+destIndex, destCapacity-destIndex,
305 src, &csc,
306 titleLimit, idx,
307 pErrorCode);
308 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
309 *pErrorCode=U_ZERO_ERROR;
310 }
311 if(U_FAILURE(*pErrorCode)) {
312 return destIndex;
313 }
314 } else {
315 /* Optionally just copy the rest of the word unchanged. */
316 destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
317 if(destIndex<0) {
318 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
319 return 0;
320 }
321 }
322 }
323 }
324 }
325
326 prev=idx;
327 }
328
329 if(destIndex>destCapacity) {
330 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
331 }
332 return destIndex;
333 }
334
335 #endif // !UCONFIG_NO_BREAK_ITERATION
336
337 U_NAMESPACE_BEGIN
338 namespace GreekUpper {
339
// Data generated by prototype code, see
// http://site.icu-project.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
//
// One entry per code point, indexed by (c - 0x370); see getLetterData().
// toUpper() reads each entry as an uppercase base letter (data & UPPER_MASK)
// combined with HAS_* flags. An entry of 0 means "no Greek-specific data",
// in which case toUpper() falls back to ucase_toFullUpper().
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370..037F
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380..038F
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390..039F
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0..03AF
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0..03BF
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0..03CF
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0..03DF
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0..03EF
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0..03FF
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
490
// One entry per code point, indexed by (c - 0x1f00); see getLetterData().
// Same entry format as data0370: uppercase base letter (data & UPPER_MASK)
// combined with HAS_* flags, or 0 for "no Greek-specific data".
static const uint16_t data1F00[] = {
    // U+1F00..1FFF
    // U+1F00..1F0F
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    // U+1F10..1F1F
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F20..1F2F
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    // U+1F30..1F3F
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+1F40..1F4F
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F50..1F5F
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    // U+1F60..1F6F
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+1F70..1F7F
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F80..1F8F
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1F90..1F9F
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FA0..1FAF
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FB0..1FBF
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0x0399 | HAS_VOWEL,
    0,
    // U+1FC0..1FCF
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
    // U+1FD0..1FDF
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0,
    0,
    // U+1FE0..1FEF
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A1,
    0x03A1,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A1,
    0,
    0,
    0,
    // U+1FF0..1FFF
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
};
750
// U+2126 Ohm sign
// Returned by getLetterData() for c == 0x2126; same entry format as the
// tables above (uppercase base letter 0x03A9 plus the HAS_VOWEL flag).
static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
753
getLetterData(UChar32 c)754 uint32_t getLetterData(UChar32 c) {
755 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
756 return 0;
757 } else if (c <= 0x3ff) {
758 return data0370[c - 0x370];
759 } else if (c <= 0x1fff) {
760 return data1F00[c - 0x1f00];
761 } else if (c == 0x2126) {
762 return data2126;
763 } else {
764 return 0;
765 }
766 }
767
getDiacriticData(UChar32 c)768 uint32_t getDiacriticData(UChar32 c) {
769 switch (c) {
770 case 0x0300: // varia
771 case 0x0301: // tonos = oxia
772 case 0x0342: // perispomeni
773 case 0x0302: // circumflex can look like perispomeni
774 case 0x0303: // tilde can look like perispomeni
775 case 0x0311: // inverted breve can look like perispomeni
776 return HAS_ACCENT;
777 case 0x0308: // dialytika = diaeresis
778 return HAS_COMBINING_DIALYTIKA;
779 case 0x0344: // dialytika tonos
780 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
781 case 0x0345: // ypogegrammeni = iota subscript
782 return HAS_YPOGEGRAMMENI;
783 case 0x0304: // macron
784 case 0x0306: // breve
785 case 0x0313: // comma above
786 case 0x0314: // reversed comma above
787 case 0x0343: // koronis
788 return HAS_OTHER_GREEK_DIACRITIC;
789 default:
790 return 0;
791 }
792 }
793
isFollowedByCasedLetter(const UCaseProps * csp,const UChar * s,int32_t i,int32_t length)794 UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, int32_t length) {
795 while (i < length) {
796 UChar32 c;
797 U16_NEXT(s, i, length, c);
798 int32_t type = ucase_getTypeOrIgnorable(csp, c);
799 if ((type & UCASE_IGNORABLE) != 0) {
800 // Case-ignorable, continue with the loop.
801 } else if (type != UCASE_NONE) {
802 return TRUE; // Followed by cased letter.
803 } else {
804 return FALSE; // Uncased and not case-ignorable.
805 }
806 }
807 return FALSE; // Not followed by cased letter.
808 }
809
/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 *
 * Each iteration handles one code point plus, for Greek letters, the run of
 * Greek combining diacritics that follows it. `state` carries the AFTER_CASED
 * and AFTER_VOWEL_WITH_ACCENT flags from the previous iteration.
 *
 * Returns the output length. Sets U_BUFFER_OVERFLOW_ERROR when that length
 * exceeds destCapacity, or sets U_INDEX_OUTOFBOUNDS_ERROR and returns 0 when
 * the length would exceed INT32_MAX.
 */
int32_t toUpper(const UCaseMap *csm,
                UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                UErrorCode *pErrorCode) {
    int32_t locCache = UCASE_LOC_GREEK;
    int32_t destIndex=0;
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U16_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            // Case-ignorable characters pass the "after cased" status through.
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            // Greek letter with table data: apply the Greek-specific rules.
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // Their flags are merged into data so the rules below see the
            // letter and its marks as one unit.
            while (nextIndex < srcLength) {
                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    ++nextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            // Remember an accented vowel without dialytika for the next letter.
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex starts at i and is advanced by U16_NEXT above
                // while i < srcLength, so this comparison appears never to be true and
                // the else branch (combining tonos) seems to be always taken - confirm
                // the intended condition before changing it, since that alters output.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }
            // Output order: base letter, dialytika, tonos, then one capital iota
            // per ypogegrammeni. A negative destIndex (overflow) stops further appends.
            destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
            if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
            }
            if (destIndex >= 0 && addTonos) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
            }
            while (destIndex >= 0 && numYpogegrammeni > 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
                --numYpogegrammeni;
            }
            if(destIndex<0) {
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        } else {
            // No Greek-specific data: use the regular full uppercase mapping
            // (called without a context iterator).
            const UChar *s;
            UChar32 c2 = 0;
            c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
            if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
                /* fast path version of appendResult() for BMP results */
                dest[destIndex++]=(UChar)c2;
            } else {
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                if(destIndex<0) {
                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                    return 0;
                }
            }
        }
        i = nextIndex;
        state = nextState;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
932
933 } // namespace GreekUpper
934 U_NAMESPACE_END
935
936 /* functions available in the common library (for unistr_case.cpp) */
937
938 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)939 ustrcase_internalToLower(const UCaseMap *csm,
940 UChar *dest, int32_t destCapacity,
941 const UChar *src, int32_t srcLength,
942 UErrorCode *pErrorCode) {
943 UCaseContext csc=UCASECONTEXT_INITIALIZER;
944 csc.p=(void *)src;
945 csc.limit=srcLength;
946 return _caseMap(
947 csm, ucase_toFullLower,
948 dest, destCapacity,
949 src, &csc, 0, srcLength,
950 pErrorCode);
951 }
952
953 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)954 ustrcase_internalToUpper(const UCaseMap *csm,
955 UChar *dest, int32_t destCapacity,
956 const UChar *src, int32_t srcLength,
957 UErrorCode *pErrorCode) {
958 int32_t locCache = csm->locCache;
959 if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
960 return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
961 }
962 UCaseContext csc=UCASECONTEXT_INITIALIZER;
963 csc.p=(void *)src;
964 csc.limit=srcLength;
965 return _caseMap(
966 csm, ucase_toFullUpper,
967 dest, destCapacity,
968 src, &csc, 0, srcLength,
969 pErrorCode);
970 }
971
972 static int32_t
ustr_foldCase(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)973 ustr_foldCase(const UCaseProps *csp,
974 UChar *dest, int32_t destCapacity,
975 const UChar *src, int32_t srcLength,
976 uint32_t options,
977 UErrorCode *pErrorCode) {
978 int32_t srcIndex, destIndex;
979
980 const UChar *s;
981 UChar32 c, c2 = 0;
982
983 /* case mapping loop */
984 srcIndex=destIndex=0;
985 while(srcIndex<srcLength) {
986 U16_NEXT(src, srcIndex, srcLength, c);
987 c=ucase_toFullFolding(csp, c, &s, options);
988 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
989 /* fast path version of appendResult() for BMP results */
990 dest[destIndex++]=(UChar)c2;
991 } else {
992 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
993 if(destIndex<0) {
994 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
995 return 0;
996 }
997 }
998 }
999
1000 if(destIndex>destCapacity) {
1001 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1002 }
1003 return destIndex;
1004 }
1005
1006 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1007 ustrcase_internalFold(const UCaseMap *csm,
1008 UChar *dest, int32_t destCapacity,
1009 const UChar *src, int32_t srcLength,
1010 UErrorCode *pErrorCode) {
1011 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
1012 }
1013
1014 U_CFUNC int32_t
ustrcase_map(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode * pErrorCode)1015 ustrcase_map(const UCaseMap *csm,
1016 UChar *dest, int32_t destCapacity,
1017 const UChar *src, int32_t srcLength,
1018 UStringCaseMapper *stringCaseMapper,
1019 UErrorCode *pErrorCode) {
1020 UChar buffer[300];
1021 UChar *temp;
1022
1023 int32_t destLength;
1024
1025 /* check argument values */
1026 if(U_FAILURE(*pErrorCode)) {
1027 return 0;
1028 }
1029 if( destCapacity<0 ||
1030 (dest==NULL && destCapacity>0) ||
1031 src==NULL ||
1032 srcLength<-1
1033 ) {
1034 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1035 return 0;
1036 }
1037
1038 /* get the string length */
1039 if(srcLength==-1) {
1040 srcLength=u_strlen(src);
1041 }
1042
1043 /* check for overlapping source and destination */
1044 if( dest!=NULL &&
1045 ((src>=dest && src<(dest+destCapacity)) ||
1046 (dest>=src && dest<(src+srcLength)))
1047 ) {
1048 /* overlap: provide a temporary destination buffer and later copy the result */
1049 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1050 /* the stack buffer is large enough */
1051 temp=buffer;
1052 } else {
1053 /* allocate a buffer */
1054 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1055 if(temp==NULL) {
1056 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1057 return 0;
1058 }
1059 }
1060 } else {
1061 temp=dest;
1062 }
1063
1064 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
1065 if(temp!=dest) {
1066 /* copy the result string to the destination buffer */
1067 if(destLength>0) {
1068 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
1069 if(copyLength>0) {
1070 u_memmove(dest, temp, copyLength);
1071 }
1072 }
1073 if(temp!=buffer) {
1074 uprv_free(temp);
1075 }
1076 }
1077
1078 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
1079 }
1080
1081 /* public API functions */
1082
1083 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1084 u_strFoldCase(UChar *dest, int32_t destCapacity,
1085 const UChar *src, int32_t srcLength,
1086 uint32_t options,
1087 UErrorCode *pErrorCode) {
1088 UCaseMap csm=UCASEMAP_INITIALIZER;
1089 csm.csp=ucase_getSingleton();
1090 csm.options=options;
1091 return ustrcase_map(
1092 &csm,
1093 dest, destCapacity,
1094 src, srcLength,
1095 ustrcase_internalFold, pErrorCode);
1096 }
1097
1098 /* case-insensitive string comparisons -------------------------------------- */
1099
1100 /*
1101 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1102 * canonical equivalence.
1103 * Keep the functions in sync, and see there for how this works.
1104 * The duplication is for modularization:
1105 * It makes caseless (but not canonical caseless) matches independent of
1106 * the normalization code.
1107 */
1108
1109 /* stack element for previous-level source/decomposition pointers */
1110 struct CmpEquivLevel {
1111 const UChar *start, *s, *limit;
1112 };
1113 typedef struct CmpEquivLevel CmpEquivLevel;
1114
/**
 * Internal implementation code comparing two strings with case folding.
 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
 *
 * The strings are compared code unit by code unit. When two code units differ,
 * the code point on either side is case-folded (ucase_toFullFolding()) into a
 * small local buffer, and the comparison continues from that buffer until it is
 * exhausted; the saved source pointers are then restored from a one-element stack.
 *
 * @param s1            input string 1
 * @param length1       length of string 1, or -1 (NUL-terminated)
 * @param s2            input string 2
 * @param length2       length of string 2, or -1 (NUL-terminated)
 * @param options       compare options; also passed to ucase_toFullFolding().
 *                      _STRNCMP_STYLE makes a 0 code unit end a string even when
 *                      a length was given; U_COMPARE_CODE_POINT_ORDER enables the
 *                      surrogate fix-up for the final difference.
 * @param matchLen1     (output) length of partial prefix match in s1; may be NULL,
 *                      in which case neither match length is written
 * @param matchLen2     (output) length of partial prefix match in s2; must not be
 *                      NULL if matchLen1 is not NULL
 * @param pErrorCode    receives error status; if it already indicates a failure,
 *                      the function returns 0 without comparing
 * @return The result of comparison: 0 if the strings are equal under case folding,
 *         -1 if string 1 ends first, 1 if string 2 ends first,
 *         otherwise the difference of the first differing code units
 *         (after the optional code point order fix-up)
 */
static int32_t _cmpFold(
            const UChar *s1, int32_t length1,
            const UChar *s2, int32_t length2,
            uint32_t options,
            int32_t *matchLen1, int32_t *matchLen2,
            UErrorCode *pErrorCode) {
    int32_t cmpRes = 0;

    const UCaseProps *csp;

    /* current-level start/limit - s1/s2 as current */
    const UChar *start1, *start2, *limit1, *limit2;

    /* points to the original start address */
    const UChar *org1, *org2;

    /* points to the end of match + 1 */
    const UChar *m1, *m2;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    csp=ucase_getSingleton();
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    if(matchLen1) {
        /* the two output lengths are always written as a pair */
        U_ASSERT(matchLen2 !=NULL);
        *matchLen1=0;
        *matchLen2=0;
    }

    start1=m1=org1=s1;
    if(length1==-1) {
        /* NUL-terminated: no limit pointer, the terminator is detected while reading */
        limit1=NULL;
    } else {
        limit1=s1+length1;
    }

    start2=m2=org2=s2;
    if(length2==-1) {
        /* NUL-terminated: no limit pointer, the terminator is detected while reading */
        limit2=NULL;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                /*
                 * the current level ends at its limit, or at a 0 code unit when the
                 * string is NUL-terminated or strncmp-style comparison was requested
                 */
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level1==0) {
                        /* end of the original string */
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;    /* set when this level was pushed */
                } while(start1==NULL);
                s1=stack1[level1].s;                /* set when this level was pushed */
                limit1=stack1[level1].limit;        /* set when this level was pushed */
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                /* same end-of-level test as for string 1 above */
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level2==0) {
                        /* end of the original string */
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;    /* set when this level was pushed */
                } while(start2==NULL);
                s2=stack2[level2].s;                /* set when this level was pushed */
                limit2=stack2[level2].limit;        /* set when this level was pushed */
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            const UChar *next1, *next2;

            if(c1<0) {
                cmpRes=0;   /* c1==c2==-1 indicating end of strings */
                break;
            }

            /*
             * Note: Move the match positions in both strings at the same time
             *       only when corresponding code point(s) in the original strings
             *       are fully consumed. For example, when comparing s1="Fust" and
             *       s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
             *       the first code point in the case-folded data. But the second "s"
             *       has no matching code point in s1, so this implementation returns
             *       2 as the prefix match length ("Fu").
             */
            next1=next2=NULL;
            if(level1==0) {
                next1=s1;
            } else if(s1==limit1) {
                /* Note: This implementation only uses a single level of stack.
                 *       If this code needs to be changed to use multiple levels
                 *       of stacks, the code above should check if the current
                 *       code is at the end of all stacks.
                 */
                U_ASSERT(level1==1);

                /* s1 is at the end of the folding buffer: the match ends where the original string resumes */
                next1=stack1[0].s;
            }

            if (next1!=NULL) {
                if(level2==0) {
                    next2=s2;
                } else if(s2==limit2) {
                    U_ASSERT(level2==1);

                    /* s2 is at the end of the folding buffer: the match ends where the original string resumes */
                    next2=stack2[0].s;
                }
                if(next2!=NULL) {
                    /* both sides are at a position in their original strings: record the match */
                    m1=next1;
                    m2=next2;
                }
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            cmpRes=-1;      /* string 1 ends before string 2 */
            break;
        } else if(c2<0) {
            cmpRes=1;       /* string 2 ends before string 1 */
            break;
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                /* s1 was already incremented past c1, so the preceding unit is at s1-2 */
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                /* s2 was already incremented past c2, so the preceding unit is at s2-2 */
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    --m2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                /* "length" is a string length: p points to the folded string */
                u_memcpy(fold1, p, length);
            } else {
                /* "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from the folding buffer, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     *
                     * NOTE(review): this branch rewinds s1 but decrements m2, exactly as the
                     * string-1 branch above does; by symmetry one might expect m1 here.
                     * Confirm whether this is intentional before changing it.
                     */
                    --s1;
                    --m2;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                /* "length" is a string length: p points to the folded string */
                u_memcpy(fold2, p, length);
            } else {
                /* "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from the folding buffer, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        cmpRes=c1-c2;
        break;
    }

    if(matchLen1) {
        /* report how many code units of each original string were matched */
        *matchLen1=m1-org1;
        *matchLen2=m2-org2;
    }
    return cmpRes;
}
1493
1494 /* internal function */
1495 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1496 u_strcmpFold(const UChar *s1, int32_t length1,
1497 const UChar *s2, int32_t length2,
1498 uint32_t options,
1499 UErrorCode *pErrorCode) {
1500 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1501 }
1502
1503 /* public API functions */
1504
1505 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1506 u_strCaseCompare(const UChar *s1, int32_t length1,
1507 const UChar *s2, int32_t length2,
1508 uint32_t options,
1509 UErrorCode *pErrorCode) {
1510 /* argument checking */
1511 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1512 return 0;
1513 }
1514 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1515 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1516 return 0;
1517 }
1518 return u_strcmpFold(s1, length1, s2, length2,
1519 options|U_COMPARE_IGNORE_CASE,
1520 pErrorCode);
1521 }
1522
1523 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)1524 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1525 UErrorCode errorCode=U_ZERO_ERROR;
1526 return u_strcmpFold(s1, -1, s2, -1,
1527 options|U_COMPARE_IGNORE_CASE,
1528 &errorCode);
1529 }
1530
1531 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)1532 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1533 UErrorCode errorCode=U_ZERO_ERROR;
1534 return u_strcmpFold(s1, length, s2, length,
1535 options|U_COMPARE_IGNORE_CASE,
1536 &errorCode);
1537 }
1538
1539 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)1540 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1541 UErrorCode errorCode=U_ZERO_ERROR;
1542 return u_strcmpFold(s1, n, s2, n,
1543 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1544 &errorCode);
1545 }
1546
1547 /* internal API - detect length of shared prefix */
1548 U_CAPI void
u_caseInsensitivePrefixMatch(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1549 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1550 const UChar *s2, int32_t length2,
1551 uint32_t options,
1552 int32_t *matchLen1, int32_t *matchLen2,
1553 UErrorCode *pErrorCode) {
1554 _cmpFold(s1, length1, s2, length2, options,
1555 matchLen1, matchLen2, pErrorCode);
1556 }
1557