• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ustrcase.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002feb20
14 *   created by: Markus W. Scherer
15 *
16 *   Implementation file for string casing C API functions.
17 *   Uses functions from uchar.c for basic functionality that requires access
18 *   to the Unicode Character Database (uprops.dat).
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uloc.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h"
25 #include "unicode/ubrk.h"
26 #include "cmemory.h"
27 #include "ucase.h"
28 #include "unormimp.h"
29 #include "ustr_imp.h"
30 
31 /* string casing ------------------------------------------------------------ */
32 
33 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
34 static U_INLINE int32_t
appendResult(UChar * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const UChar * s)35 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
36              int32_t result, const UChar *s) {
37     UChar32 c;
38     int32_t length;
39 
40     /* decode the result */
41     if(result<0) {
42         /* (not) original code point */
43         c=~result;
44         length=-1;
45     } else if(result<=UCASE_MAX_STRING_LENGTH) {
46         c=U_SENTINEL;
47         length=result;
48     } else {
49         c=result;
50         length=-1;
51     }
52 
53     if(destIndex<destCapacity) {
54         /* append the result */
55         if(length<0) {
56             /* code point */
57             UBool isError=FALSE;
58             U16_APPEND(dest, destIndex, destCapacity, c, isError);
59             if(isError) {
60                 /* overflow, nothing written */
61                 destIndex+=U16_LENGTH(c);
62             }
63         } else {
64             /* string */
65             if((destIndex+length)<=destCapacity) {
66                 while(length>0) {
67                     dest[destIndex++]=*s++;
68                     --length;
69                 }
70             } else {
71                 /* overflow */
72                 destIndex+=length;
73             }
74         }
75     } else {
76         /* preflight */
77         if(length<0) {
78             destIndex+=U16_LENGTH(c);
79         } else {
80             destIndex+=length;
81         }
82     }
83     return destIndex;
84 }
85 
86 static UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)87 utf16_caseContextIterator(void *context, int8_t dir) {
88     UCaseContext *csc=(UCaseContext *)context;
89     UChar32 c;
90 
91     if(dir<0) {
92         /* reset for backward iteration */
93         csc->index=csc->cpStart;
94         csc->dir=dir;
95     } else if(dir>0) {
96         /* reset for forward iteration */
97         csc->index=csc->cpLimit;
98         csc->dir=dir;
99     } else {
100         /* continue current iteration direction */
101         dir=csc->dir;
102     }
103 
104     if(dir<0) {
105         if(csc->start<csc->index) {
106             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
107             return c;
108         }
109     } else {
110         if(csc->index<csc->limit) {
111             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
112             return c;
113         }
114     }
115     return U_SENTINEL;
116 }
117 
118 /*
119  * Case-maps [srcStart..srcLimit[ but takes
120  * context [0..srcLength[ into account.
121  */
122 static int32_t
_caseMap(const UCaseMap * csm,UCaseMapFull * map,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,UErrorCode * pErrorCode)123 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
124          UChar *dest, int32_t destCapacity,
125          const UChar *src, UCaseContext *csc,
126          int32_t srcStart, int32_t srcLimit,
127          UErrorCode *pErrorCode) {
128     const UChar *s;
129     UChar32 c, c2;
130     int32_t srcIndex, destIndex;
131     int32_t locCache;
132 
133     locCache=csm->locCache;
134 
135     /* case mapping loop */
136     srcIndex=srcStart;
137     destIndex=0;
138     while(srcIndex<srcLimit) {
139         csc->cpStart=srcIndex;
140         U16_NEXT(src, srcIndex, srcLimit, c);
141         csc->cpLimit=srcIndex;
142         c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
143         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
144             /* fast path version of appendResult() for BMP results */
145             dest[destIndex++]=(UChar)c2;
146         } else {
147             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
148         }
149     }
150 
151     if(destIndex>destCapacity) {
152         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
153     }
154     return destIndex;
155 }
156 
157 static void
setTempCaseMapLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)158 setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
159     /*
160      * We could call ucasemap_setLocale(), but here we really only care about
161      * the initial language subtag, we need not return the real string via
162      * ucasemap_getLocale(), and we don't care about only getting "x" from
163      * "x-some-thing" etc.
164      *
165      * We ignore locales with a longer-than-3 initial subtag.
166      *
167      * We also do not fill in the locCache because it is rarely used,
168      * and not worth setting unless we reuse it for many case mapping operations.
169      * (That's why UCaseMap was created.)
170      */
171     int i;
172     char c;
173 
174     /* the internal functions require locale!=NULL */
175     if(locale==NULL) {
176         locale=uloc_getDefault();
177     }
178     for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) {
179         csm->locale[i]=c;
180     }
181     if(i<=3) {
182         csm->locale[i]=0;  /* Up to 3 non-separator characters. */
183     } else {
184         csm->locale[0]=0;  /* Longer-than-3 initial subtag: Ignore. */
185     }
186 }
187 
188 /*
189  * Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
190  * Do this fast because it is called with every function call.
191  */
192 static U_INLINE void
setTempCaseMap(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)193 setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
194     if(csm->csp==NULL) {
195         csm->csp=ucase_getSingleton(pErrorCode);
196         if(U_FAILURE(*pErrorCode)) {
197             return;
198         }
199     }
200     if(locale!=NULL && locale[0]==0) {
201         csm->locale[0]=0;
202     } else {
203         setTempCaseMapLocale(csm, locale, pErrorCode);
204     }
205 }
206 
207 #if !UCONFIG_NO_BREAK_ITERATION
208 
209 /*
210  * Internal titlecasing function.
211  */
212 static int32_t
_toTitle(UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,UCaseContext * csc,int32_t srcLength,UErrorCode * pErrorCode)213 _toTitle(UCaseMap *csm,
214          UChar *dest, int32_t destCapacity,
215          const UChar *src, UCaseContext *csc,
216          int32_t srcLength,
217          UErrorCode *pErrorCode) {
218     const UChar *s;
219     UChar32 c;
220     int32_t prev, titleStart, titleLimit, index, destIndex, length;
221     UBool isFirstIndex;
222 
223     if(csm->iter!=NULL) {
224         ubrk_setText(csm->iter, src, srcLength, pErrorCode);
225     } else {
226         csm->iter=ubrk_open(UBRK_WORD, csm->locale,
227                             src, srcLength,
228                             pErrorCode);
229     }
230     if(U_FAILURE(*pErrorCode)) {
231         return 0;
232     }
233 
234     /* set up local variables */
235     destIndex=0;
236     prev=0;
237     isFirstIndex=TRUE;
238 
239     /* titlecasing loop */
240     while(prev<srcLength) {
241         /* find next index where to titlecase */
242         if(isFirstIndex) {
243             isFirstIndex=FALSE;
244             index=ubrk_first(csm->iter);
245         } else {
246             index=ubrk_next(csm->iter);
247         }
248         if(index==UBRK_DONE || index>srcLength) {
249             index=srcLength;
250         }
251 
252         /*
253          * Unicode 4 & 5 section 3.13 Default Case Operations:
254          *
255          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
256          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
257          * cased character F. If F exists, map F to default_title(F); then map each
258          * subsequent character C to default_lower(C).
259          *
260          * In this implementation, segment [prev..index[ into 3 parts:
261          * a) uncased characters (copy as-is) [prev..titleStart[
262          * b) first case letter (titlecase)         [titleStart..titleLimit[
263          * c) subsequent characters (lowercase)                 [titleLimit..index[
264          */
265         if(prev<index) {
266             /* find and copy uncased characters [prev..titleStart[ */
267             titleStart=titleLimit=prev;
268             U16_NEXT(src, titleLimit, index, c);
269             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
270                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
271                 for(;;) {
272                     titleStart=titleLimit;
273                     if(titleLimit==index) {
274                         /*
275                          * only uncased characters in [prev..index[
276                          * stop with titleStart==titleLimit==index
277                          */
278                         break;
279                     }
280                     U16_NEXT(src, titleLimit, index, c);
281                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
282                         break; /* cased letter at [titleStart..titleLimit[ */
283                     }
284                 }
285                 length=titleStart-prev;
286                 if(length>0) {
287                     if((destIndex+length)<=destCapacity) {
288                         uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
289                     }
290                     destIndex+=length;
291                 }
292             }
293 
294             if(titleStart<titleLimit) {
295                 /* titlecase c which is from [titleStart..titleLimit[ */
296                 csc->cpStart=titleStart;
297                 csc->cpLimit=titleLimit;
298                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
299                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
300 
301                 /* lowercase [titleLimit..index[ */
302                 if(titleLimit<index) {
303                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
304                         /* Normal operation: Lowercase the rest of the word. */
305                         destIndex+=
306                             _caseMap(
307                                 csm, ucase_toFullLower,
308                                 dest+destIndex, destCapacity-destIndex,
309                                 src, csc,
310                                 titleLimit, index,
311                                 pErrorCode);
312                     } else {
313                         /* Optionally just copy the rest of the word unchanged. */
314                         length=index-titleLimit;
315                         if((destIndex+length)<=destCapacity) {
316                             uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
317                         }
318                         destIndex+=length;
319                     }
320                 }
321             }
322         }
323 
324         prev=index;
325     }
326 
327     if(destIndex>destCapacity) {
328         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
329     }
330     return destIndex;
331 }
332 
333 #endif
334 
335 /* functions available in the common library (for unistr_case.cpp) */
336 
337 U_CFUNC int32_t
ustr_toLower(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,const char * locale,UErrorCode * pErrorCode)338 ustr_toLower(const UCaseProps *csp,
339              UChar *dest, int32_t destCapacity,
340              const UChar *src, int32_t srcLength,
341              const char *locale,
342              UErrorCode *pErrorCode) {
343     UCaseMap csm={ NULL };
344     UCaseContext csc={ NULL };
345 
346     csm.csp=csp;
347     setTempCaseMap(&csm, locale, pErrorCode);
348     csc.p=(void *)src;
349     csc.limit=srcLength;
350 
351     return _caseMap(&csm, ucase_toFullLower,
352                     dest, destCapacity,
353                     src, &csc, 0, srcLength,
354                     pErrorCode);
355 }
356 
357 U_CFUNC int32_t
ustr_toUpper(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,const char * locale,UErrorCode * pErrorCode)358 ustr_toUpper(const UCaseProps *csp,
359              UChar *dest, int32_t destCapacity,
360              const UChar *src, int32_t srcLength,
361              const char *locale,
362              UErrorCode *pErrorCode) {
363     UCaseMap csm={ NULL };
364     UCaseContext csc={ NULL };
365 
366     csm.csp=csp;
367     setTempCaseMap(&csm, locale, pErrorCode);
368     csc.p=(void *)src;
369     csc.limit=srcLength;
370 
371     return _caseMap(&csm, ucase_toFullUpper,
372                     dest, destCapacity,
373                     src, &csc, 0, srcLength,
374                     pErrorCode);
375 }
376 
377 #if !UCONFIG_NO_BREAK_ITERATION
378 
379 U_CFUNC int32_t
ustr_toTitle(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UBreakIterator * titleIter,const char * locale,uint32_t options,UErrorCode * pErrorCode)380 ustr_toTitle(const UCaseProps *csp,
381              UChar *dest, int32_t destCapacity,
382              const UChar *src, int32_t srcLength,
383              UBreakIterator *titleIter,
384              const char *locale, uint32_t options,
385              UErrorCode *pErrorCode) {
386     UCaseMap csm={ NULL };
387     UCaseContext csc={ NULL };
388     int32_t length;
389 
390     csm.csp=csp;
391     csm.iter=titleIter;
392     csm.options=options;
393     setTempCaseMap(&csm, locale, pErrorCode);
394     csc.p=(void *)src;
395     csc.limit=srcLength;
396 
397     length=_toTitle(&csm,
398                     dest, destCapacity,
399                     src, &csc, srcLength,
400                     pErrorCode);
401     if(titleIter==NULL && csm.iter!=NULL) {
402         ubrk_close(csm.iter);
403     }
404     return length;
405 }
406 
407 #endif
408 
409 U_CFUNC int32_t
ustr_foldCase(const UCaseProps * csp,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)410 ustr_foldCase(const UCaseProps *csp,
411               UChar *dest, int32_t destCapacity,
412               const UChar *src, int32_t srcLength,
413               uint32_t options,
414               UErrorCode *pErrorCode) {
415     int32_t srcIndex, destIndex;
416 
417     const UChar *s;
418     UChar32 c, c2;
419 
420     /* case mapping loop */
421     srcIndex=destIndex=0;
422     while(srcIndex<srcLength) {
423         U16_NEXT(src, srcIndex, srcLength, c);
424         c=ucase_toFullFolding(csp, c, &s, options);
425         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
426             /* fast path version of appendResult() for BMP results */
427             dest[destIndex++]=(UChar)c2;
428         } else {
429             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
430         }
431     }
432 
433     if(destIndex>destCapacity) {
434         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435     }
436     return destIndex;
437 }
438 
439 /*
440  * Implement argument checking and buffer handling
441  * for string case mapping as a common function.
442  */
443 
444 /* common internal function for public API functions */
445 
446 static int32_t
caseMap(const UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,int32_t toWhichCase,UErrorCode * pErrorCode)447 caseMap(const UCaseMap *csm,
448         UChar *dest, int32_t destCapacity,
449         const UChar *src, int32_t srcLength,
450         int32_t toWhichCase,
451         UErrorCode *pErrorCode) {
452     UChar buffer[300];
453     UChar *temp;
454 
455     int32_t destLength;
456 
457     /* check argument values */
458     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
459         return 0;
460     }
461     if( destCapacity<0 ||
462         (dest==NULL && destCapacity>0) ||
463         src==NULL ||
464         srcLength<-1
465     ) {
466         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
467         return 0;
468     }
469 
470     /* get the string length */
471     if(srcLength==-1) {
472         srcLength=u_strlen(src);
473     }
474 
475     /* check for overlapping source and destination */
476     if( dest!=NULL &&
477         ((src>=dest && src<(dest+destCapacity)) ||
478          (dest>=src && dest<(src+srcLength)))
479     ) {
480         /* overlap: provide a temporary destination buffer and later copy the result */
481         if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) {
482             /* the stack buffer is large enough */
483             temp=buffer;
484         } else {
485             /* allocate a buffer */
486             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
487             if(temp==NULL) {
488                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
489                 return 0;
490             }
491         }
492     } else {
493         temp=dest;
494     }
495 
496     destLength=0;
497 
498     if(toWhichCase==FOLD_CASE) {
499         destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength,
500                                  csm->options, pErrorCode);
501     } else {
502         UCaseContext csc={ NULL };
503 
504         csc.p=(void *)src;
505         csc.limit=srcLength;
506 
507         if(toWhichCase==TO_LOWER) {
508             destLength=_caseMap(csm, ucase_toFullLower,
509                                 temp, destCapacity,
510                                 src, &csc,
511                                 0, srcLength,
512                                 pErrorCode);
513         } else if(toWhichCase==TO_UPPER) {
514             destLength=_caseMap(csm, ucase_toFullUpper,
515                                 temp, destCapacity,
516                                 src, &csc,
517                                 0, srcLength,
518                                 pErrorCode);
519         } else /* if(toWhichCase==TO_TITLE) */ {
520 #if UCONFIG_NO_BREAK_ITERATION
521             *pErrorCode=U_UNSUPPORTED_ERROR;
522 #else
523             /* UCaseMap is actually non-const in toTitle() APIs. */
524             destLength=_toTitle((UCaseMap *)csm, temp, destCapacity,
525                                 src, &csc, srcLength,
526                                 pErrorCode);
527 #endif
528         }
529     }
530     if(temp!=dest) {
531         /* copy the result string to the destination buffer */
532         if(destLength>0) {
533             int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
534             if(copyLength>0) {
535                 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
536             }
537         }
538         if(temp!=buffer) {
539             uprv_free(temp);
540         }
541     }
542 
543     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
544 }
545 
546 /* public API functions */
547 
548 U_CAPI int32_t U_EXPORT2
u_strToLower(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,const char * locale,UErrorCode * pErrorCode)549 u_strToLower(UChar *dest, int32_t destCapacity,
550              const UChar *src, int32_t srcLength,
551              const char *locale,
552              UErrorCode *pErrorCode) {
553     UCaseMap csm={ NULL };
554     setTempCaseMap(&csm, locale, pErrorCode);
555     return caseMap(&csm,
556                    dest, destCapacity,
557                    src, srcLength,
558                    TO_LOWER, pErrorCode);
559 }
560 
561 U_CAPI int32_t U_EXPORT2
u_strToUpper(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,const char * locale,UErrorCode * pErrorCode)562 u_strToUpper(UChar *dest, int32_t destCapacity,
563              const UChar *src, int32_t srcLength,
564              const char *locale,
565              UErrorCode *pErrorCode) {
566     UCaseMap csm={ NULL };
567     setTempCaseMap(&csm, locale, pErrorCode);
568     return caseMap(&csm,
569                    dest, destCapacity,
570                    src, srcLength,
571                    TO_UPPER, pErrorCode);
572 }
573 
574 #if !UCONFIG_NO_BREAK_ITERATION
575 
576 U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UBreakIterator * titleIter,const char * locale,UErrorCode * pErrorCode)577 u_strToTitle(UChar *dest, int32_t destCapacity,
578              const UChar *src, int32_t srcLength,
579              UBreakIterator *titleIter,
580              const char *locale,
581              UErrorCode *pErrorCode) {
582     UCaseMap csm={ NULL };
583     int32_t length;
584 
585     csm.iter=titleIter;
586     setTempCaseMap(&csm, locale, pErrorCode);
587     length=caseMap(&csm,
588                    dest, destCapacity,
589                    src, srcLength,
590                    TO_TITLE, pErrorCode);
591     if(titleIter==NULL && csm.iter!=NULL) {
592         ubrk_close(csm.iter);
593     }
594     return length;
595 }
596 
597 U_CAPI int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap * csm,UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)598 ucasemap_toTitle(UCaseMap *csm,
599                  UChar *dest, int32_t destCapacity,
600                  const UChar *src, int32_t srcLength,
601                  UErrorCode *pErrorCode) {
602     return caseMap(csm,
603                    dest, destCapacity,
604                    src, srcLength,
605                    TO_TITLE, pErrorCode);
606 }
607 
608 #endif
609 
610 U_CAPI int32_t U_EXPORT2
u_strFoldCase(UChar * dest,int32_t destCapacity,const UChar * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)611 u_strFoldCase(UChar *dest, int32_t destCapacity,
612               const UChar *src, int32_t srcLength,
613               uint32_t options,
614               UErrorCode *pErrorCode) {
615     UCaseMap csm={ NULL };
616     csm.csp=ucase_getSingleton(pErrorCode);
617     csm.options=options;
618     return caseMap(&csm,
619                    dest, destCapacity,
620                    src, srcLength,
621                    FOLD_CASE, pErrorCode);
622 }
623 
624 /* case-insensitive string comparisons -------------------------------------- */
625 
626 /*
627  * This function is a copy of unorm_cmpEquivFold() minus the parts for
628  * canonical equivalence.
629  * Keep the functions in sync, and see there for how this works.
630  * The duplication is for modularization:
631  * It makes caseless (but not canonical caseless) matches independent of
632  * the normalization code.
633  */
634 
635 /* stack element for previous-level source/decomposition pointers */
636 struct CmpEquivLevel {
637     const UChar *start, *s, *limit;
638 };
639 typedef struct CmpEquivLevel CmpEquivLevel;
640 
641 /* internal function */
642 U_CFUNC int32_t
u_strcmpFold(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)643 u_strcmpFold(const UChar *s1, int32_t length1,
644              const UChar *s2, int32_t length2,
645              uint32_t options,
646              UErrorCode *pErrorCode) {
647     const UCaseProps *csp;
648 
649     /* current-level start/limit - s1/s2 as current */
650     const UChar *start1, *start2, *limit1, *limit2;
651 
652     /* case folding variables */
653     const UChar *p;
654     int32_t length;
655 
656     /* stacks of previous-level start/current/limit */
657     CmpEquivLevel stack1[2], stack2[2];
658 
659     /* case folding buffers, only use current-level start/limit */
660     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
661 
662     /* track which is the current level per string */
663     int32_t level1, level2;
664 
665     /* current code units, and code points for lookups */
666     UChar32 c1, c2, cp1, cp2;
667 
668     /* no argument error checking because this itself is not an API */
669 
670     /*
671      * assume that at least the option U_COMPARE_IGNORE_CASE is set
672      * otherwise this function would have to behave exactly as uprv_strCompare()
673      */
674     csp=ucase_getSingleton(pErrorCode);
675     if(U_FAILURE(*pErrorCode)) {
676         return 0;
677     }
678 
679     /* initialize */
680     start1=s1;
681     if(length1==-1) {
682         limit1=NULL;
683     } else {
684         limit1=s1+length1;
685     }
686 
687     start2=s2;
688     if(length2==-1) {
689         limit2=NULL;
690     } else {
691         limit2=s2+length2;
692     }
693 
694     level1=level2=0;
695     c1=c2=-1;
696 
697     /* comparison loop */
698     for(;;) {
699         /*
700          * here a code unit value of -1 means "get another code unit"
701          * below it will mean "this source is finished"
702          */
703 
704         if(c1<0) {
705             /* get next code unit from string 1, post-increment */
706             for(;;) {
707                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
708                     if(level1==0) {
709                         c1=-1;
710                         break;
711                     }
712                 } else {
713                     ++s1;
714                     break;
715                 }
716 
717                 /* reached end of level buffer, pop one level */
718                 do {
719                     --level1;
720                     start1=stack1[level1].start;
721                 } while(start1==NULL);
722                 s1=stack1[level1].s;
723                 limit1=stack1[level1].limit;
724             }
725         }
726 
727         if(c2<0) {
728             /* get next code unit from string 2, post-increment */
729             for(;;) {
730                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
731                     if(level2==0) {
732                         c2=-1;
733                         break;
734                     }
735                 } else {
736                     ++s2;
737                     break;
738                 }
739 
740                 /* reached end of level buffer, pop one level */
741                 do {
742                     --level2;
743                     start2=stack2[level2].start;
744                 } while(start2==NULL);
745                 s2=stack2[level2].s;
746                 limit2=stack2[level2].limit;
747             }
748         }
749 
750         /*
751          * compare c1 and c2
752          * either variable c1, c2 is -1 only if the corresponding string is finished
753          */
754         if(c1==c2) {
755             if(c1<0) {
756                 return 0;   /* c1==c2==-1 indicating end of strings */
757             }
758             c1=c2=-1;       /* make us fetch new code units */
759             continue;
760         } else if(c1<0) {
761             return -1;      /* string 1 ends before string 2 */
762         } else if(c2<0) {
763             return 1;       /* string 2 ends before string 1 */
764         }
765         /* c1!=c2 && c1>=0 && c2>=0 */
766 
767         /* get complete code points for c1, c2 for lookups if either is a surrogate */
768         cp1=c1;
769         if(U_IS_SURROGATE(c1)) {
770             UChar c;
771 
772             if(U_IS_SURROGATE_LEAD(c1)) {
773                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
774                     /* advance ++s1; only below if cp1 decomposes/case-folds */
775                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
776                 }
777             } else /* isTrail(c1) */ {
778                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
779                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
780                 }
781             }
782         }
783 
784         cp2=c2;
785         if(U_IS_SURROGATE(c2)) {
786             UChar c;
787 
788             if(U_IS_SURROGATE_LEAD(c2)) {
789                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
790                     /* advance ++s2; only below if cp2 decomposes/case-folds */
791                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
792                 }
793             } else /* isTrail(c2) */ {
794                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
795                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
796                 }
797             }
798         }
799 
800         /*
801          * go down one level for each string
802          * continue with the main loop as soon as there is a real change
803          */
804 
805         if( level1==0 &&
806             (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
807         ) {
808             /* cp1 case-folds to the code point "length" or to p[length] */
809             if(U_IS_SURROGATE(c1)) {
810                 if(U_IS_SURROGATE_LEAD(c1)) {
811                     /* advance beyond source surrogate pair if it case-folds */
812                     ++s1;
813                 } else /* isTrail(c1) */ {
814                     /*
815                      * we got a supplementary code point when hitting its trail surrogate,
816                      * therefore the lead surrogate must have been the same as in the other string;
817                      * compare this decomposition with the lead surrogate in the other string
818                      * remember that this simulates bulk text replacement:
819                      * the decomposition would replace the entire code point
820                      */
821                     --s2;
822                     c2=*(s2-1);
823                 }
824             }
825 
826             /* push current level pointers */
827             stack1[0].start=start1;
828             stack1[0].s=s1;
829             stack1[0].limit=limit1;
830             ++level1;
831 
832             /* copy the folding result to fold1[] */
833             if(length<=UCASE_MAX_STRING_LENGTH) {
834                 u_memcpy(fold1, p, length);
835             } else {
836                 int32_t i=0;
837                 U16_APPEND_UNSAFE(fold1, i, length);
838                 length=i;
839             }
840 
841             /* set next level pointers to case folding */
842             start1=s1=fold1;
843             limit1=fold1+length;
844 
845             /* get ready to read from decomposition, continue with loop */
846             c1=-1;
847             continue;
848         }
849 
850         if( level2==0 &&
851             (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
852         ) {
853             /* cp2 case-folds to the code point "length" or to p[length] */
854             if(U_IS_SURROGATE(c2)) {
855                 if(U_IS_SURROGATE_LEAD(c2)) {
856                     /* advance beyond source surrogate pair if it case-folds */
857                     ++s2;
858                 } else /* isTrail(c2) */ {
859                     /*
860                      * we got a supplementary code point when hitting its trail surrogate,
861                      * therefore the lead surrogate must have been the same as in the other string;
862                      * compare this decomposition with the lead surrogate in the other string
863                      * remember that this simulates bulk text replacement:
864                      * the decomposition would replace the entire code point
865                      */
866                     --s1;
867                     c1=*(s1-1);
868                 }
869             }
870 
871             /* push current level pointers */
872             stack2[0].start=start2;
873             stack2[0].s=s2;
874             stack2[0].limit=limit2;
875             ++level2;
876 
877             /* copy the folding result to fold2[] */
878             if(length<=UCASE_MAX_STRING_LENGTH) {
879                 u_memcpy(fold2, p, length);
880             } else {
881                 int32_t i=0;
882                 U16_APPEND_UNSAFE(fold2, i, length);
883                 length=i;
884             }
885 
886             /* set next level pointers to case folding */
887             start2=s2=fold2;
888             limit2=fold2+length;
889 
890             /* get ready to read from decomposition, continue with loop */
891             c2=-1;
892             continue;
893         }
894 
895         /*
896          * no decomposition/case folding, max level for both sides:
897          * return difference result
898          *
899          * code point order comparison must not just return cp1-cp2
900          * because when single surrogates are present then the surrogate pairs
901          * that formed cp1 and cp2 may be from different string indexes
902          *
903          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
904          * c1=d800 cp1=10001 c2=dc00 cp2=10000
905          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
906          *
907          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
908          * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
909          * so we have slightly different pointer/start/limit comparisons here
910          */
911 
912         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
913             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
914             if(
915                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
916                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
917             ) {
918                 /* part of a surrogate pair, leave >=d800 */
919             } else {
920                 /* BMP code point - may be surrogate code point - make <d800 */
921                 c1-=0x2800;
922             }
923 
924             if(
925                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
926                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
927             ) {
928                 /* part of a surrogate pair, leave >=d800 */
929             } else {
930                 /* BMP code point - may be surrogate code point - make <d800 */
931                 c2-=0x2800;
932             }
933         }
934 
935         return c1-c2;
936     }
937 }
938 
939 /* public API functions */
940 
941 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)942 u_strCaseCompare(const UChar *s1, int32_t length1,
943                  const UChar *s2, int32_t length2,
944                  uint32_t options,
945                  UErrorCode *pErrorCode) {
946     /* argument checking */
947     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
948         return 0;
949     }
950     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
951         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
952         return 0;
953     }
954     return u_strcmpFold(s1, length1, s2, length2,
955                         options|U_COMPARE_IGNORE_CASE,
956                         pErrorCode);
957 }
958 
959 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const UChar * s1,const UChar * s2,uint32_t options)960 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
961     UErrorCode errorCode=U_ZERO_ERROR;
962     return u_strcmpFold(s1, -1, s2, -1,
963                         options|U_COMPARE_IGNORE_CASE,
964                         &errorCode);
965 }
966 
967 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const UChar * s1,const UChar * s2,int32_t length,uint32_t options)968 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
969     UErrorCode errorCode=U_ZERO_ERROR;
970     return u_strcmpFold(s1, length, s2, length,
971                         options|U_COMPARE_IGNORE_CASE,
972                         &errorCode);
973 }
974 
975 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const UChar * s1,const UChar * s2,int32_t n,uint32_t options)976 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
977     UErrorCode errorCode=U_ZERO_ERROR;
978     return u_strcmpFold(s1, n, s2, n,
979                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
980                         &errorCode);
981 }
982