• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2001-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ustrcase.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002feb20
16 *   created by: Markus W. Scherer
17 *
18 *   Implementation file for string casing C API functions.
19 *   Uses functions from uchar.c for basic functionality that requires access
20 *   to the Unicode Character Database (uprops.dat).
21 */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/ucasemap.h"
27 #include "unicode/ubrk.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "cmemory.h"
31 #include "ucase.h"
32 #include "ustr_imp.h"
33 #include "uassert.h"
34 
35 U_NAMESPACE_USE
36 
37 /* string casing ------------------------------------------------------------ */
38 
39 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
40 static inline int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s)41 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
42              int32_t result, const UChar *s) {
43     UChar32 c;
44     int32_t length;
45 
46     /* decode the result */
47     if(result<0) {
48         /* (not) original code point */
49         c=~result;
50         length=U16_LENGTH(c);
51     } else if(result<=UCASE_MAX_STRING_LENGTH) {
52         c=U_SENTINEL;
53         length=result;
54     } else {
55         c=result;
56         length=U16_LENGTH(c);
57     }
58     if(length>(INT32_MAX-destIndex)) {
59         return -1;  // integer overflow
60     }
61 
62     if(destIndex<destCapacity) {
63         /* append the result */
64         if(c>=0) {
65             /* code point */
66             UBool isError=FALSE;
67             U16_APPEND(dest, destIndex, destCapacity, c, isError);
68             if(isError) {
69                 /* overflow, nothing written */
70                 destIndex+=length;
71             }
72         } else {
73             /* string */
74             if((destIndex+length)<=destCapacity) {
75                 while(length>0) {
76                     dest[destIndex++]=*s++;
77                     --length;
78                 }
79             } else {
80                 /* overflow */
81                 destIndex+=length;
82             }
83         }
84     } else {
85         /* preflight */
86         destIndex+=length;
87     }
88     return destIndex;
89 }
90 
91 static inline int32_t
appendUChar(UChar * dest,int32_t destIndex,int32_t destCapacity,UChar c)92 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
93     if(destIndex<destCapacity) {
94         dest[destIndex]=c;
95     } else if(destIndex==INT32_MAX) {
96         return -1;  // integer overflow
97     }
98     return destIndex+1;
99 }
100 
101 static inline int32_t
appendString(UChar * dest,int32_t destIndex,int32_t destCapacity,const UChar * s,int32_t length)102 appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
103              const UChar *s, int32_t length) {
104     if(length>0) {
105         if(length>(INT32_MAX-destIndex)) {
106             return -1;  // integer overflow
107         }
108         if((destIndex+length)<=destCapacity) {
109             u_memcpy(dest+destIndex, s, length);
110         }
111         destIndex+=length;
112     }
113     return destIndex;
114 }
115 
116 static UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)117 utf16_caseContextIterator(void *context, int8_t dir) {
118     UCaseContext *csc=(UCaseContext *)context;
119     UChar32 c;
120 
121     if(dir<0) {
122         /* reset for backward iteration */
123         csc->index=csc->cpStart;
124         csc->dir=dir;
125     } else if(dir>0) {
126         /* reset for forward iteration */
127         csc->index=csc->cpLimit;
128         csc->dir=dir;
129     } else {
130         /* continue current iteration direction */
131         dir=csc->dir;
132     }
133 
134     if(dir<0) {
135         if(csc->start<csc->index) {
136             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
137             return c;
138         }
139     } else {
140         if(csc->index<csc->limit) {
141             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
142             return c;
143         }
144     }
145     return U_SENTINEL;
146 }
147 
148 /*
149  * Case-maps [srcStart..srcLimit[ but takes
150  * context [0..srcLength[ into account.
151  */
152 static int32_t
_caseMap(const UCaseMap * csm,UCaseMapFull * map,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,UErrorCode * pErrorCode)153 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
154          UChar *dest, int32_t destCapacity,
155          const UChar *src, UCaseContext *csc,
156          int32_t srcStart, int32_t srcLimit,
157          UErrorCode *pErrorCode) {
158     const UChar *s;
159     UChar32 c, c2 = 0;
160     int32_t srcIndex, destIndex;
161     int32_t locCache;
162 
163     locCache=csm->locCache;
164 
165     /* case mapping loop */
166     srcIndex=srcStart;
167     destIndex=0;
168     while(srcIndex<srcLimit) {
169         csc->cpStart=srcIndex;
170         U16_NEXT(src, srcIndex, srcLimit, c);
171         csc->cpLimit=srcIndex;
172         c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
173         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
174             /* fast path version of appendResult() for BMP results */
175             dest[destIndex++]=(UChar)c2;
176         } else {
177             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
178             if(destIndex<0) {
179                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
180                 return 0;
181             }
182         }
183     }
184 
185     if(destIndex>destCapacity) {
186         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
187     }
188     return destIndex;
189 }
190 
191 #if !UCONFIG_NO_BREAK_ITERATION
192 
193 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)194 ustrcase_internalToTitle(const UCaseMap *csm,
195                          UChar *dest, int32_t destCapacity,
196                          const UChar *src, int32_t srcLength,
197                          UErrorCode *pErrorCode) {
198     const UChar *s;
199     UChar32 c;
200     int32_t prev, titleStart, titleLimit, idx, destIndex;
201     UBool isFirstIndex;
202 
203     if(U_FAILURE(*pErrorCode)) {
204         return 0;
205     }
206 
207     // Use the C++ abstract base class to minimize dependencies.
208     // TODO: Change UCaseMap.iter to store a BreakIterator directly.
209     BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
210 
211     /* set up local variables */
212     int32_t locCache=csm->locCache;
213     UCaseContext csc=UCASECONTEXT_INITIALIZER;
214     csc.p=(void *)src;
215     csc.limit=srcLength;
216     destIndex=0;
217     prev=0;
218     isFirstIndex=TRUE;
219 
220     /* titlecasing loop */
221     while(prev<srcLength) {
222         /* find next index where to titlecase */
223         if(isFirstIndex) {
224             isFirstIndex=FALSE;
225             idx=bi->first();
226         } else {
227             idx=bi->next();
228         }
229         if(idx==UBRK_DONE || idx>srcLength) {
230             idx=srcLength;
231         }
232 
233         /*
234          * Unicode 4 & 5 section 3.13 Default Case Operations:
235          *
236          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
237          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
238          * cased character F. If F exists, map F to default_title(F); then map each
239          * subsequent character C to default_lower(C).
240          *
241          * In this implementation, segment [prev..index[ into 3 parts:
242          * a) uncased characters (copy as-is) [prev..titleStart[
243          * b) first case letter (titlecase)         [titleStart..titleLimit[
244          * c) subsequent characters (lowercase)                 [titleLimit..index[
245          */
246         if(prev<idx) {
247             /* find and copy uncased characters [prev..titleStart[ */
248             titleStart=titleLimit=prev;
249             U16_NEXT(src, titleLimit, idx, c);
250             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
251                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
252                 for(;;) {
253                     titleStart=titleLimit;
254                     if(titleLimit==idx) {
255                         /*
256                          * only uncased characters in [prev..index[
257                          * stop with titleStart==titleLimit==index
258                          */
259                         break;
260                     }
261                     U16_NEXT(src, titleLimit, idx, c);
262                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
263                         break; /* cased letter at [titleStart..titleLimit[ */
264                     }
265                 }
266                 destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
267                 if(destIndex<0) {
268                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
269                     return 0;
270                 }
271             }
272 
273             if(titleStart<titleLimit) {
274                 /* titlecase c which is from [titleStart..titleLimit[ */
275                 csc.cpStart=titleStart;
276                 csc.cpLimit=titleLimit;
277                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
278                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
279                 if(destIndex<0) {
280                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
281                     return 0;
282                 }
283 
284                 /* Special case Dutch IJ titlecasing */
285                 if (titleStart+1 < idx &&
286                         ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
287                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
288                         (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
289                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
290                     if(destIndex<0) {
291                         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
292                         return 0;
293                     }
294                     titleLimit++;
295                 }
296 
297                 /* lowercase [titleLimit..index[ */
298                 if(titleLimit<idx) {
299                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
300                         /* Normal operation: Lowercase the rest of the word. */
301                         destIndex+=
302                             _caseMap(
303                                 csm, ucase_toFullLower,
304                                 dest+destIndex, destCapacity-destIndex,
305                                 src, &csc,
306                                 titleLimit, idx,
307                                 pErrorCode);
308                         if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
309                             *pErrorCode=U_ZERO_ERROR;
310                         }
311                         if(U_FAILURE(*pErrorCode)) {
312                             return destIndex;
313                         }
314                     } else {
315                         /* Optionally just copy the rest of the word unchanged. */
316                         destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
317                         if(destIndex<0) {
318                             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
319                             return 0;
320                         }
321                     }
322                 }
323             }
324         }
325 
326         prev=idx;
327     }
328 
329     if(destIndex>destCapacity) {
330         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
331     }
332     return destIndex;
333 }
334 
335 #endif  // !UCONFIG_NO_BREAK_ITERATION
336 
337 U_NAMESPACE_BEGIN
338 namespace GreekUpper {
339 
// Data generated by prototype code, see
// http://site.icu-project.org/design/case/greek-upper
// TODO: Move this data into ucase.icu.
//
// One entry per code point U+0370..U+03FF; getLetterData() reads
// data0370[c - 0x370]. An entry is a code point value, optionally combined
// with HAS_* flag bits; toUpper() separates the code point with UPPER_MASK.
// getLetterData() also returns 0 for code points outside its tables, and
// toUpper() handles a 0 result with ucase_toFullUpper() instead.
// The "// U+03x0" markers below give the code point of the entry that follows.
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
490 
// One entry per code point U+1F00..U+1FFF; getLetterData() reads
// data1F00[c - 0x1f00]. An entry is a code point value, optionally combined
// with HAS_* flag bits; toUpper() separates the code point with UPPER_MASK.
// The "// U+1Fx0" markers below give the code point of the entry that follows.
static const uint16_t data1F00[] = {
    // U+1F00..1FFF
    // U+1F00
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    // U+1F10
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F20
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    // U+1F30
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+1F40
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F50
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    // U+1F60
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+1F70
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    // U+1F80
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1F90
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FA0
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    // U+1FB0
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0x0399 | HAS_VOWEL,
    0,
    // U+1FC0
    0,
    0,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
    // U+1FD0
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0,
    0,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0,
    0,
    0,
    // U+1FE0
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A1,
    0x03A1,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A1,
    0,
    0,
    0,
    // U+1FF0
    0,
    0,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    0,
    0,
    0,
};
750 
// U+2126 Ohm sign
// Single entry in the same format as the data0370/data1F00 tables;
// getLetterData() returns it for c == 0x2126.
static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
753 
getLetterData(UChar32 c)754 uint32_t getLetterData(UChar32 c) {
755     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
756         return 0;
757     } else if (c <= 0x3ff) {
758         return data0370[c - 0x370];
759     } else if (c <= 0x1fff) {
760         return data1F00[c - 0x1f00];
761     } else if (c == 0x2126) {
762         return data2126;
763     } else {
764         return 0;
765     }
766 }
767 
getDiacriticData(UChar32 c)768 uint32_t getDiacriticData(UChar32 c) {
769     switch (c) {
770     case 0x0300:  // varia
771     case 0x0301:  // tonos = oxia
772     case 0x0342:  // perispomeni
773     case 0x0302:  // circumflex can look like perispomeni
774     case 0x0303:  // tilde can look like perispomeni
775     case 0x0311:  // inverted breve can look like perispomeni
776         return HAS_ACCENT;
777     case 0x0308:  // dialytika = diaeresis
778         return HAS_COMBINING_DIALYTIKA;
779     case 0x0344:  // dialytika tonos
780         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
781     case 0x0345:  // ypogegrammeni = iota subscript
782         return HAS_YPOGEGRAMMENI;
783     case 0x0304:  // macron
784     case 0x0306:  // breve
785     case 0x0313:  // comma above
786     case 0x0314:  // reversed comma above
787     case 0x0343:  // koronis
788         return HAS_OTHER_GREEK_DIACRITIC;
789     default:
790         return 0;
791     }
792 }
793 
isFollowedByCasedLetter(const UCaseProps * csp,const UChar * s,int32_t i,int32_t length)794 UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, int32_t length) {
795     while (i < length) {
796         UChar32 c;
797         U16_NEXT(s, i, length, c);
798         int32_t type = ucase_getTypeOrIgnorable(csp, c);
799         if ((type & UCASE_IGNORABLE) != 0) {
800             // Case-ignorable, continue with the loop.
801         } else if (type != UCASE_NONE) {
802             return TRUE;  // Followed by cased letter.
803         } else {
804             return FALSE;  // Uncased and not case-ignorable.
805         }
806     }
807     return FALSE;  // Not followed by cased letter.
808 }
809 
810 /**
811  * Greek string uppercasing with a state machine.
812  * Probably simpler than a stateless function that has to figure out complex context-before
813  * for each character.
814  * TODO: Try to re-consolidate one way or another with the non-Greek function.
815  */
toUpper(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)816 int32_t toUpper(const UCaseMap *csm,
817                 UChar *dest, int32_t destCapacity,
818                 const UChar *src, int32_t srcLength,
819                 UErrorCode *pErrorCode) {
820     int32_t locCache = UCASE_LOC_GREEK;
821     int32_t destIndex=0;
822     uint32_t state = 0;
823     for (int32_t i = 0; i < srcLength;) {
824         int32_t nextIndex = i;
825         UChar32 c;
826         U16_NEXT(src, nextIndex, srcLength, c);
827         uint32_t nextState = 0;
828         int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
829         if ((type & UCASE_IGNORABLE) != 0) {
830             // c is case-ignorable
831             nextState |= (state & AFTER_CASED);
832         } else if (type != UCASE_NONE) {
833             // c is cased
834             nextState |= AFTER_CASED;
835         }
836         uint32_t data = getLetterData(c);
837         if (data > 0) {
838             uint32_t upper = data & UPPER_MASK;
839             // Add a dialytika to this iota or ypsilon vowel
840             // if we removed a tonos from the previous vowel,
841             // and that previous vowel did not also have (or gain) a dialytika.
842             // Adding one only to the final vowel in a longer sequence
843             // (which does not occur in normal writing) would require lookahead.
844             // Set the same flag as for preserving an existing dialytika.
845             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
846                     (upper == 0x399 || upper == 0x3A5)) {
847                 data |= HAS_DIALYTIKA;
848             }
849             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
850             if ((data & HAS_YPOGEGRAMMENI) != 0) {
851                 numYpogegrammeni = 1;
852             }
853             // Skip combining diacritics after this Greek letter.
854             while (nextIndex < srcLength) {
855                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
856                 if (diacriticData != 0) {
857                     data |= diacriticData;
858                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
859                         ++numYpogegrammeni;
860                     }
861                     ++nextIndex;
862                 } else {
863                     break;  // not a Greek diacritic
864                 }
865             }
866             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
867                 nextState |= AFTER_VOWEL_WITH_ACCENT;
868             }
869             // Map according to Greek rules.
870             UBool addTonos = FALSE;
871             if (upper == 0x397 &&
872                     (data & HAS_ACCENT) != 0 &&
873                     numYpogegrammeni == 0 &&
874                     (state & AFTER_CASED) == 0 &&
875                     !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
876                 // Keep disjunctive "or" with (only) a tonos.
877                 // We use the same "word boundary" conditions as for the Final_Sigma test.
878                 if (i == nextIndex) {
879                     upper = 0x389;  // Preserve the precomposed form.
880                 } else {
881                     addTonos = TRUE;
882                 }
883             } else if ((data & HAS_DIALYTIKA) != 0) {
884                 // Preserve a vowel with dialytika in precomposed form if it exists.
885                 if (upper == 0x399) {
886                     upper = 0x3AA;
887                     data &= ~HAS_EITHER_DIALYTIKA;
888                 } else if (upper == 0x3A5) {
889                     upper = 0x3AB;
890                     data &= ~HAS_EITHER_DIALYTIKA;
891                 }
892             }
893             destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
894             if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
895                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
896             }
897             if (destIndex >= 0 && addTonos) {
898                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
899             }
900             while (destIndex >= 0 && numYpogegrammeni > 0) {
901                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
902                 --numYpogegrammeni;
903             }
904             if(destIndex<0) {
905                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
906                 return 0;
907             }
908         } else {
909             const UChar *s;
910             UChar32 c2 = 0;
911             c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
912             if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
913                 /* fast path version of appendResult() for BMP results */
914                 dest[destIndex++]=(UChar)c2;
915             } else {
916                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
917                 if(destIndex<0) {
918                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
919                     return 0;
920                 }
921             }
922         }
923         i = nextIndex;
924         state = nextState;
925     }
926 
927     if(destIndex>destCapacity) {
928         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
929     }
930     return destIndex;
931 }
932 
933 }  // namespace GreekUpper
934 U_NAMESPACE_END
935 
936 /* functions available in the common library (for unistr_case.cpp) */
937 
938 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)939 ustrcase_internalToLower(const UCaseMap *csm,
940                          UChar *dest, int32_t destCapacity,
941                          const UChar *src, int32_t srcLength,
942                          UErrorCode *pErrorCode) {
943     UCaseContext csc=UCASECONTEXT_INITIALIZER;
944     csc.p=(void *)src;
945     csc.limit=srcLength;
946     return _caseMap(
947         csm, ucase_toFullLower,
948         dest, destCapacity,
949         src, &csc, 0, srcLength,
950         pErrorCode);
951 }
952 
953 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)954 ustrcase_internalToUpper(const UCaseMap *csm,
955                          UChar *dest, int32_t destCapacity,
956                          const UChar *src, int32_t srcLength,
957                          UErrorCode *pErrorCode) {
958     int32_t locCache = csm->locCache;
959     if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
960         return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
961     }
962     UCaseContext csc=UCASECONTEXT_INITIALIZER;
963     csc.p=(void *)src;
964     csc.limit=srcLength;
965     return _caseMap(
966         csm, ucase_toFullUpper,
967         dest, destCapacity,
968         src, &csc, 0, srcLength,
969         pErrorCode);
970 }
971 
972 static int32_t
ustr_foldCase(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)973 ustr_foldCase(const UCaseProps *csp,
974               UChar *dest, int32_t destCapacity,
975               const UChar *src, int32_t srcLength,
976               uint32_t options,
977               UErrorCode *pErrorCode) {
978     int32_t srcIndex, destIndex;
979 
980     const UChar *s;
981     UChar32 c, c2 = 0;
982 
983     /* case mapping loop */
984     srcIndex=destIndex=0;
985     while(srcIndex<srcLength) {
986         U16_NEXT(src, srcIndex, srcLength, c);
987         c=ucase_toFullFolding(csp, c, &s, options);
988         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
989             /* fast path version of appendResult() for BMP results */
990             dest[destIndex++]=(UChar)c2;
991         } else {
992             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
993             if(destIndex<0) {
994                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
995                 return 0;
996             }
997         }
998     }
999 
1000     if(destIndex>destCapacity) {
1001         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1002     }
1003     return destIndex;
1004 }
1005 
1006 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1007 ustrcase_internalFold(const UCaseMap *csm,
1008                       UChar *dest, int32_t destCapacity,
1009                       const UChar *src, int32_t srcLength,
1010                       UErrorCode *pErrorCode) {
1011     return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
1012 }
1013 
1014 U_CFUNC int32_t
ustrcase_map(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode * pErrorCode)1015 ustrcase_map(const UCaseMap *csm,
1016              UChar *dest, int32_t destCapacity,
1017              const UChar *src, int32_t srcLength,
1018              UStringCaseMapper *stringCaseMapper,
1019              UErrorCode *pErrorCode) {
1020     UChar buffer[300];
1021     UChar *temp;
1022 
1023     int32_t destLength;
1024 
1025     /* check argument values */
1026     if(U_FAILURE(*pErrorCode)) {
1027         return 0;
1028     }
1029     if( destCapacity<0 ||
1030         (dest==NULL && destCapacity>0) ||
1031         src==NULL ||
1032         srcLength<-1
1033     ) {
1034         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1035         return 0;
1036     }
1037 
1038     /* get the string length */
1039     if(srcLength==-1) {
1040         srcLength=u_strlen(src);
1041     }
1042 
1043     /* check for overlapping source and destination */
1044     if( dest!=NULL &&
1045         ((src>=dest && src<(dest+destCapacity)) ||
1046          (dest>=src && dest<(src+srcLength)))
1047     ) {
1048         /* overlap: provide a temporary destination buffer and later copy the result */
1049         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1050             /* the stack buffer is large enough */
1051             temp=buffer;
1052         } else {
1053             /* allocate a buffer */
1054             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1055             if(temp==NULL) {
1056                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1057                 return 0;
1058             }
1059         }
1060     } else {
1061         temp=dest;
1062     }
1063 
1064     destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
1065     if(temp!=dest) {
1066         /* copy the result string to the destination buffer */
1067         if(destLength>0) {
1068             int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
1069             if(copyLength>0) {
1070                 u_memmove(dest, temp, copyLength);
1071             }
1072         }
1073         if(temp!=buffer) {
1074             uprv_free(temp);
1075         }
1076     }
1077 
1078     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
1079 }
1080 
1081 /* public API functions */
1082 
1083 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1084 u_strFoldCase(UChar *dest, int32_t destCapacity,
1085               const UChar *src, int32_t srcLength,
1086               uint32_t options,
1087               UErrorCode *pErrorCode) {
1088     UCaseMap csm=UCASEMAP_INITIALIZER;
1089     csm.csp=ucase_getSingleton();
1090     csm.options=options;
1091     return ustrcase_map(
1092         &csm,
1093         dest, destCapacity,
1094         src, srcLength,
1095         ustrcase_internalFold, pErrorCode);
1096 }
1097 
1098 /* case-insensitive string comparisons -------------------------------------- */
1099 
1100 /*
1101  * This function is a copy of unorm_cmpEquivFold() minus the parts for
1102  * canonical equivalence.
1103  * Keep the functions in sync, and see there for how this works.
1104  * The duplication is for modularization:
1105  * It makes caseless (but not canonical caseless) matches independent of
1106  * the normalization code.
1107  */
1108 
1109 /* stack element for previous-level source/decomposition pointers */
1110 struct CmpEquivLevel {
1111     const UChar *start, *s, *limit;
1112 };
1113 typedef struct CmpEquivLevel CmpEquivLevel;
1114 
1115 /**
1116  * Internal implementation code comparing string with case fold.
1117  * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1118  *
1119  * @param s1            input string 1
1120  * @param length1       length of string 1, or -1 (NULL terminated)
1121  * @param s2            input string 2
1122  * @param length2       length of string 2, or -1 (NULL terminated)
1123  * @param options       compare options
1124  * @param matchLen1     (output) length of partial prefix match in s1
1125  * @param matchLen2     (output) length of partial prefix match in s2
1126  * @param pErrorCode    receives error status
1127  * @return The result of comparison
1128  */
_cmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1129 static int32_t _cmpFold(
1130             const UChar *s1, int32_t length1,
1131             const UChar *s2, int32_t length2,
1132             uint32_t options,
1133             int32_t *matchLen1, int32_t *matchLen2,
1134             UErrorCode *pErrorCode) {
1135     int32_t cmpRes = 0;
1136 
1137     const UCaseProps *csp;
1138 
1139     /* current-level start/limit - s1/s2 as current */
1140     const UChar *start1, *start2, *limit1, *limit2;
1141 
1142     /* points to the original start address */
1143     const UChar *org1, *org2;
1144 
1145     /* points to the end of match + 1 */
1146     const UChar *m1, *m2;
1147 
1148     /* case folding variables */
1149     const UChar *p;
1150     int32_t length;
1151 
1152     /* stacks of previous-level start/current/limit */
1153     CmpEquivLevel stack1[2], stack2[2];
1154 
1155     /* case folding buffers, only use current-level start/limit */
1156     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1157 
1158     /* track which is the current level per string */
1159     int32_t level1, level2;
1160 
1161     /* current code units, and code points for lookups */
1162     UChar32 c1, c2, cp1, cp2;
1163 
1164     /* no argument error checking because this itself is not an API */
1165 
1166     /*
1167      * assume that at least the option U_COMPARE_IGNORE_CASE is set
1168      * otherwise this function would have to behave exactly as uprv_strCompare()
1169      */
1170     csp=ucase_getSingleton();
1171     if(U_FAILURE(*pErrorCode)) {
1172         return 0;
1173     }
1174 
1175     /* initialize */
1176     if(matchLen1) {
1177         U_ASSERT(matchLen2 !=NULL);
1178         *matchLen1=0;
1179         *matchLen2=0;
1180     }
1181 
1182     start1=m1=org1=s1;
1183     if(length1==-1) {
1184         limit1=NULL;
1185     } else {
1186         limit1=s1+length1;
1187     }
1188 
1189     start2=m2=org2=s2;
1190     if(length2==-1) {
1191         limit2=NULL;
1192     } else {
1193         limit2=s2+length2;
1194     }
1195 
1196     level1=level2=0;
1197     c1=c2=-1;
1198 
1199     /* comparison loop */
1200     for(;;) {
1201         /*
1202          * here a code unit value of -1 means "get another code unit"
1203          * below it will mean "this source is finished"
1204          */
1205 
1206         if(c1<0) {
1207             /* get next code unit from string 1, post-increment */
1208             for(;;) {
1209                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1210                     if(level1==0) {
1211                         c1=-1;
1212                         break;
1213                     }
1214                 } else {
1215                     ++s1;
1216                     break;
1217                 }
1218 
1219                 /* reached end of level buffer, pop one level */
1220                 do {
1221                     --level1;
1222                     start1=stack1[level1].start;    /*Not uninitialized*/
1223                 } while(start1==NULL);
1224                 s1=stack1[level1].s;                /*Not uninitialized*/
1225                 limit1=stack1[level1].limit;        /*Not uninitialized*/
1226             }
1227         }
1228 
1229         if(c2<0) {
1230             /* get next code unit from string 2, post-increment */
1231             for(;;) {
1232                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1233                     if(level2==0) {
1234                         c2=-1;
1235                         break;
1236                     }
1237                 } else {
1238                     ++s2;
1239                     break;
1240                 }
1241 
1242                 /* reached end of level buffer, pop one level */
1243                 do {
1244                     --level2;
1245                     start2=stack2[level2].start;    /*Not uninitialized*/
1246                 } while(start2==NULL);
1247                 s2=stack2[level2].s;                /*Not uninitialized*/
1248                 limit2=stack2[level2].limit;        /*Not uninitialized*/
1249             }
1250         }
1251 
1252         /*
1253          * compare c1 and c2
1254          * either variable c1, c2 is -1 only if the corresponding string is finished
1255          */
1256         if(c1==c2) {
1257             const UChar *next1, *next2;
1258 
1259             if(c1<0) {
1260                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
1261                 break;
1262             }
1263 
1264             /*
1265              * Note: Move the match positions in both strings at the same time
1266              *      only when corresponding code point(s) in the original strings
1267              *      are fully consumed. For example, when comparing s1="Fust" and
1268              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1269              *      the first code point in the case-folded data. But the second "s"
1270              *      has no matching code point in s1, so this implementation returns
1271              *      2 as the prefix match length ("Fu").
1272              */
1273             next1=next2=NULL;
1274             if(level1==0) {
1275                 next1=s1;
1276             } else if(s1==limit1) {
1277                 /* Note: This implementation only use a single level of stack.
1278                  *      If this code needs to be changed to use multiple levels
1279                  *      of stacks, the code above should check if the current
1280                  *      code is at the end of all stacks.
1281                  */
1282                 U_ASSERT(level1==1);
1283 
1284                 /* is s1 at the end of the current stack? */
1285                 next1=stack1[0].s;
1286             }
1287 
1288             if (next1!=NULL) {
1289                 if(level2==0) {
1290                     next2=s2;
1291                 } else if(s2==limit2) {
1292                     U_ASSERT(level2==1);
1293 
1294                     /* is s2 at the end of the current stack? */
1295                     next2=stack2[0].s;
1296                 }
1297                 if(next2!=NULL) {
1298                     m1=next1;
1299                     m2=next2;
1300                 }
1301             }
1302             c1=c2=-1;       /* make us fetch new code units */
1303             continue;
1304         } else if(c1<0) {
1305             cmpRes=-1;      /* string 1 ends before string 2 */
1306             break;
1307         } else if(c2<0) {
1308             cmpRes=1;       /* string 2 ends before string 1 */
1309             break;
1310         }
1311         /* c1!=c2 && c1>=0 && c2>=0 */
1312 
1313         /* get complete code points for c1, c2 for lookups if either is a surrogate */
1314         cp1=c1;
1315         if(U_IS_SURROGATE(c1)) {
1316             UChar c;
1317 
1318             if(U_IS_SURROGATE_LEAD(c1)) {
1319                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1320                     /* advance ++s1; only below if cp1 decomposes/case-folds */
1321                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
1322                 }
1323             } else /* isTrail(c1) */ {
1324                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1325                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
1326                 }
1327             }
1328         }
1329 
1330         cp2=c2;
1331         if(U_IS_SURROGATE(c2)) {
1332             UChar c;
1333 
1334             if(U_IS_SURROGATE_LEAD(c2)) {
1335                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1336                     /* advance ++s2; only below if cp2 decomposes/case-folds */
1337                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
1338                 }
1339             } else /* isTrail(c2) */ {
1340                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1341                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
1342                 }
1343             }
1344         }
1345 
1346         /*
1347          * go down one level for each string
1348          * continue with the main loop as soon as there is a real change
1349          */
1350 
1351         if( level1==0 &&
1352             (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
1353         ) {
1354             /* cp1 case-folds to the code point "length" or to p[length] */
1355             if(U_IS_SURROGATE(c1)) {
1356                 if(U_IS_SURROGATE_LEAD(c1)) {
1357                     /* advance beyond source surrogate pair if it case-folds */
1358                     ++s1;
1359                 } else /* isTrail(c1) */ {
1360                     /*
1361                      * we got a supplementary code point when hitting its trail surrogate,
1362                      * therefore the lead surrogate must have been the same as in the other string;
1363                      * compare this decomposition with the lead surrogate in the other string
1364                      * remember that this simulates bulk text replacement:
1365                      * the decomposition would replace the entire code point
1366                      */
1367                     --s2;
1368                     --m2;
1369                     c2=*(s2-1);
1370                 }
1371             }
1372 
1373             /* push current level pointers */
1374             stack1[0].start=start1;
1375             stack1[0].s=s1;
1376             stack1[0].limit=limit1;
1377             ++level1;
1378 
1379             /* copy the folding result to fold1[] */
1380             if(length<=UCASE_MAX_STRING_LENGTH) {
1381                 u_memcpy(fold1, p, length);
1382             } else {
1383                 int32_t i=0;
1384                 U16_APPEND_UNSAFE(fold1, i, length);
1385                 length=i;
1386             }
1387 
1388             /* set next level pointers to case folding */
1389             start1=s1=fold1;
1390             limit1=fold1+length;
1391 
1392             /* get ready to read from decomposition, continue with loop */
1393             c1=-1;
1394             continue;
1395         }
1396 
1397         if( level2==0 &&
1398             (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
1399         ) {
1400             /* cp2 case-folds to the code point "length" or to p[length] */
1401             if(U_IS_SURROGATE(c2)) {
1402                 if(U_IS_SURROGATE_LEAD(c2)) {
1403                     /* advance beyond source surrogate pair if it case-folds */
1404                     ++s2;
1405                 } else /* isTrail(c2) */ {
1406                     /*
1407                      * we got a supplementary code point when hitting its trail surrogate,
1408                      * therefore the lead surrogate must have been the same as in the other string;
1409                      * compare this decomposition with the lead surrogate in the other string
1410                      * remember that this simulates bulk text replacement:
1411                      * the decomposition would replace the entire code point
1412                      */
1413                     --s1;
1414                     --m2;
1415                     c1=*(s1-1);
1416                 }
1417             }
1418 
1419             /* push current level pointers */
1420             stack2[0].start=start2;
1421             stack2[0].s=s2;
1422             stack2[0].limit=limit2;
1423             ++level2;
1424 
1425             /* copy the folding result to fold2[] */
1426             if(length<=UCASE_MAX_STRING_LENGTH) {
1427                 u_memcpy(fold2, p, length);
1428             } else {
1429                 int32_t i=0;
1430                 U16_APPEND_UNSAFE(fold2, i, length);
1431                 length=i;
1432             }
1433 
1434             /* set next level pointers to case folding */
1435             start2=s2=fold2;
1436             limit2=fold2+length;
1437 
1438             /* get ready to read from decomposition, continue with loop */
1439             c2=-1;
1440             continue;
1441         }
1442 
1443         /*
1444          * no decomposition/case folding, max level for both sides:
1445          * return difference result
1446          *
1447          * code point order comparison must not just return cp1-cp2
1448          * because when single surrogates are present then the surrogate pairs
1449          * that formed cp1 and cp2 may be from different string indexes
1450          *
1451          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1452          * c1=d800 cp1=10001 c2=dc00 cp2=10000
1453          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1454          *
1455          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1456          * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1457          * so we have slightly different pointer/start/limit comparisons here
1458          */
1459 
1460         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1461             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1462             if(
1463                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1464                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1465             ) {
1466                 /* part of a surrogate pair, leave >=d800 */
1467             } else {
1468                 /* BMP code point - may be surrogate code point - make <d800 */
1469                 c1-=0x2800;
1470             }
1471 
1472             if(
1473                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1474                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1475             ) {
1476                 /* part of a surrogate pair, leave >=d800 */
1477             } else {
1478                 /* BMP code point - may be surrogate code point - make <d800 */
1479                 c2-=0x2800;
1480             }
1481         }
1482 
1483         cmpRes=c1-c2;
1484         break;
1485     }
1486 
1487     if(matchLen1) {
1488         *matchLen1=m1-org1;
1489         *matchLen2=m2-org2;
1490     }
1491     return cmpRes;
1492 }
1493 
1494 /* internal function */
1495 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1496 u_strcmpFold(const UChar *s1, int32_t length1,
1497              const UChar *s2, int32_t length2,
1498              uint32_t options,
1499              UErrorCode *pErrorCode) {
1500     return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1501 }
1502 
1503 /* public API functions */
1504 
1505 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1506 u_strCaseCompare(const UChar *s1, int32_t length1,
1507                  const UChar *s2, int32_t length2,
1508                  uint32_t options,
1509                  UErrorCode *pErrorCode) {
1510     /* argument checking */
1511     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1512         return 0;
1513     }
1514     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1515         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1516         return 0;
1517     }
1518     return u_strcmpFold(s1, length1, s2, length2,
1519                         options|U_COMPARE_IGNORE_CASE,
1520                         pErrorCode);
1521 }
1522 
1523 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)1524 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1525     UErrorCode errorCode=U_ZERO_ERROR;
1526     return u_strcmpFold(s1, -1, s2, -1,
1527                         options|U_COMPARE_IGNORE_CASE,
1528                         &errorCode);
1529 }
1530 
1531 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)1532 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1533     UErrorCode errorCode=U_ZERO_ERROR;
1534     return u_strcmpFold(s1, length, s2, length,
1535                         options|U_COMPARE_IGNORE_CASE,
1536                         &errorCode);
1537 }
1538 
1539 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)1540 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1541     UErrorCode errorCode=U_ZERO_ERROR;
1542     return u_strcmpFold(s1, n, s2, n,
1543                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1544                         &errorCode);
1545 }
1546 
1547 /* internal API - detect length of shared prefix */
1548 U_CAPI void
u_caseInsensitivePrefixMatch(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1549 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1550                              const UChar *s2, int32_t length2,
1551                              uint32_t options,
1552                              int32_t *matchLen1, int32_t *matchLen2,
1553                              UErrorCode *pErrorCode) {
1554     _cmpFold(s1, length1, s2, length2, options,
1555         matchLen1, matchLen2, pErrorCode);
1556 }
1557