• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.c
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   9/10/2001    Ram    Creation.
15 ******************************************************************************
16 */
17 
18 /*******************************************************************************
19  *
20  * u_strTo* and u_strFrom* APIs
21  * WCS functions moved to ustr_wcs.c for better modularization
22  *
23  *******************************************************************************
24  */
25 
26 
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "ustr_imp.h"
32 
33 U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)34 u_strFromUTF32WithSub(UChar *dest,
35                int32_t destCapacity,
36                int32_t *pDestLength,
37                const UChar32 *src,
38                int32_t srcLength,
39                UChar32 subchar, int32_t *pNumSubstitutions,
40                UErrorCode *pErrorCode) {
41     const UChar32 *srcLimit;
42     UChar32 ch;
43     UChar *destLimit;
44     UChar *pDest;
45     int32_t reqLength;
46     int32_t numSubstitutions;
47 
48     /* args check */
49     if(U_FAILURE(*pErrorCode)){
50         return NULL;
51     }
52     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
53         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
54         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
55     ) {
56         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
57         return NULL;
58     }
59 
60     if(pNumSubstitutions != NULL) {
61         *pNumSubstitutions = 0;
62     }
63 
64     pDest = dest;
65     destLimit = dest + destCapacity;
66     reqLength = 0;
67     numSubstitutions = 0;
68 
69     if(srcLength < 0) {
70         /* simple loop for conversion of a NUL-terminated BMP string */
71         while((ch=*src) != 0 &&
72               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
73             ++src;
74             if(pDest < destLimit) {
75                 *pDest++ = (UChar)ch;
76             } else {
77                 ++reqLength;
78             }
79         }
80         srcLimit = src;
81         if(ch != 0) {
82             /* "complicated" case, find the end of the remaining string */
83             while(*++srcLimit != 0) {}
84         }
85     } else {
86         srcLimit = src + srcLength;
87     }
88 
89     /* convert with length */
90     while(src < srcLimit) {
91         ch = *src++;
92         do {
93             /* usually "loops" once; twice only for writing subchar */
94             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
95                 if(pDest < destLimit) {
96                     *pDest++ = (UChar)ch;
97                 } else {
98                     ++reqLength;
99                 }
100                 break;
101             } else if(0x10000 <= ch && ch <= 0x10ffff) {
102                 if((pDest + 2) <= destLimit) {
103                     *pDest++ = U16_LEAD(ch);
104                     *pDest++ = U16_TRAIL(ch);
105                 } else {
106                     reqLength += 2;
107                 }
108                 break;
109             } else if((ch = subchar) < 0) {
110                 /* surrogate code point, or not a Unicode code point at all */
111                 *pErrorCode = U_INVALID_CHAR_FOUND;
112                 return NULL;
113             } else {
114                 ++numSubstitutions;
115             }
116         } while(TRUE);
117     }
118 
119     reqLength += (int32_t)(pDest - dest);
120     if(pDestLength) {
121         *pDestLength = reqLength;
122     }
123     if(pNumSubstitutions != NULL) {
124         *pNumSubstitutions = numSubstitutions;
125     }
126 
127     /* Terminate the buffer */
128     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
129 
130     return dest;
131 }
132 
133 U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UErrorCode * pErrorCode)134 u_strFromUTF32(UChar *dest,
135                int32_t destCapacity,
136                int32_t *pDestLength,
137                const UChar32 *src,
138                int32_t srcLength,
139                UErrorCode *pErrorCode) {
140     return u_strFromUTF32WithSub(
141             dest, destCapacity, pDestLength,
142             src, srcLength,
143             U_SENTINEL, NULL,
144             pErrorCode);
145 }
146 
147 U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)148 u_strToUTF32WithSub(UChar32 *dest,
149              int32_t destCapacity,
150              int32_t *pDestLength,
151              const UChar *src,
152              int32_t srcLength,
153              UChar32 subchar, int32_t *pNumSubstitutions,
154              UErrorCode *pErrorCode) {
155     const UChar *srcLimit;
156     UChar32 ch;
157     UChar ch2;
158     UChar32 *destLimit;
159     UChar32 *pDest;
160     int32_t reqLength;
161     int32_t numSubstitutions;
162 
163     /* args check */
164     if(U_FAILURE(*pErrorCode)){
165         return NULL;
166     }
167     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
168         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
169         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
170     ) {
171         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
172         return NULL;
173     }
174 
175     if(pNumSubstitutions != NULL) {
176         *pNumSubstitutions = 0;
177     }
178 
179     pDest = dest;
180     destLimit = dest + destCapacity;
181     reqLength = 0;
182     numSubstitutions = 0;
183 
184     if(srcLength < 0) {
185         /* simple loop for conversion of a NUL-terminated BMP string */
186         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
187             ++src;
188             if(pDest < destLimit) {
189                 *pDest++ = ch;
190             } else {
191                 ++reqLength;
192             }
193         }
194         srcLimit = src;
195         if(ch != 0) {
196             /* "complicated" case, find the end of the remaining string */
197             while(*++srcLimit != 0) {}
198         }
199     } else {
200         srcLimit = src + srcLength;
201     }
202 
203     /* convert with length */
204     while(src < srcLimit) {
205         ch = *src++;
206         if(!U16_IS_SURROGATE(ch)) {
207             /* write or count ch below */
208         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
209             ++src;
210             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
211         } else if((ch = subchar) < 0) {
212             /* unpaired surrogate */
213             *pErrorCode = U_INVALID_CHAR_FOUND;
214             return NULL;
215         } else {
216             ++numSubstitutions;
217         }
218         if(pDest < destLimit) {
219             *pDest++ = ch;
220         } else {
221             ++reqLength;
222         }
223     }
224 
225     reqLength += (int32_t)(pDest - dest);
226     if(pDestLength) {
227         *pDestLength = reqLength;
228     }
229     if(pNumSubstitutions != NULL) {
230         *pNumSubstitutions = numSubstitutions;
231     }
232 
233     /* Terminate the buffer */
234     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
235 
236     return dest;
237 }
238 
239 U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)240 u_strToUTF32(UChar32 *dest,
241              int32_t destCapacity,
242              int32_t *pDestLength,
243              const UChar *src,
244              int32_t srcLength,
245              UErrorCode *pErrorCode) {
246     return u_strToUTF32WithSub(
247             dest, destCapacity, pDestLength,
248             src, srcLength,
249             U_SENTINEL, NULL,
250             pErrorCode);
251 }
252 
253 /*
 * Smallest code point accepted for a UTF-8 sequence, indexed by its number of
 * trail bytes (0..3). Read by both utf8_nextCharSafeBodyTerminated() and
 * utf8_nextCharSafeBodyPointer(), which return U_SENTINEL for a decoded value
 * below the entry for its trail byte count.
 */
254 static const UChar32
255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
256 
257 /*
258  * Version of utf8_nextCharSafeBody() with the following differences:
259  * - checks for NUL termination instead of length
260  * - works with pointers instead of indexes
261  * - always strict (strict==-1)
262  *
263  * *ps points to after the lead byte and will be moved to after the last trail byte.
264  * c is the lead byte.
265  * @return the code point, or U_SENTINEL
266  */
267 static UChar32
utf8_nextCharSafeBodyTerminated(const uint8_t ** ps,UChar32 c)268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
269     const uint8_t *s=*ps;
270     uint8_t trail, illegal=0;
271     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
272     UTF8_MASK_LEAD_BYTE((c), count);
273     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
274     switch(count) {
275     /* each branch falls through to the next one */
276     case 5:
277     case 4:
278         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
279         illegal=1;
280         break;
281     case 3:
282         trail=(uint8_t)(*s++ - 0x80);
283         c=(c<<6)|trail;
284         if(trail>0x3f || c>=0x110) {
285             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
286             illegal=1;
287             break;
288         }
289     case 2:
290         trail=(uint8_t)(*s++ - 0x80);
291         if(trail>0x3f) {
292             /* not a trail byte */
293             illegal=1;
294             break;
295         }
296         c=(c<<6)|trail;
297     case 1:
298         trail=(uint8_t)(*s++ - 0x80);
299         if(trail>0x3f) {
300             /* not a trail byte */
301             illegal=1;
302         }
303         c=(c<<6)|trail;
304         break;
305     case 0:
306         return U_SENTINEL;
307     /* no default branch to optimize switch()  - all values are covered */
308     }
309 
310     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
311     /* illegal is also set if count>=4 */
312     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
313         /* error handling */
314         /* don't go beyond this sequence */
315         s=*ps;
316         while(count>0 && UTF8_IS_TRAIL(*s)) {
317             ++s;
318             --count;
319         }
320         c=U_SENTINEL;
321     }
322     *ps=s;
323     return c;
324 }
325 
326 /*
327  * Version of utf8_nextCharSafeBody() with the following differences:
328  * - works with pointers instead of indexes
329  * - always strict (strict==-1)
330  *
331  * *ps points to after the lead byte and will be moved to after the last trail byte.
332  * c is the lead byte.
333  * @return the code point, or U_SENTINEL
334  */
335 static UChar32
utf8_nextCharSafeBodyPointer(const uint8_t ** ps,const uint8_t * limit,UChar32 c)336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
337     const uint8_t *s=*ps;
338     uint8_t trail, illegal=0;
339     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
340     if((limit-s)>=count) {
341         UTF8_MASK_LEAD_BYTE((c), count);
342         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
343         switch(count) {
344         /* each branch falls through to the next one */
345         case 5:
346         case 4:
347             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
348             illegal=1;
349             break;
350         case 3:
351             trail=*s++;
352             c=(c<<6)|(trail&0x3f);
353             if(c<0x110) {
354                 illegal|=(trail&0xc0)^0x80;
355             } else {
356                 /* code point>0x10ffff, outside Unicode */
357                 illegal=1;
358                 break;
359             }
360         case 2:
361             trail=*s++;
362             c=(c<<6)|(trail&0x3f);
363             illegal|=(trail&0xc0)^0x80;
364         case 1:
365             trail=*s++;
366             c=(c<<6)|(trail&0x3f);
367             illegal|=(trail&0xc0)^0x80;
368             break;
369         case 0:
370             return U_SENTINEL;
371         /* no default branch to optimize switch()  - all values are covered */
372         }
373     } else {
374         illegal=1; /* too few bytes left */
375     }
376 
377     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
378     /* illegal is also set if count>=4 */
379     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
380         /* error handling */
381         /* don't go beyond this sequence */
382         s=*ps;
383         while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
384             ++s;
385             --count;
386         }
387         c=U_SENTINEL;
388     }
389     *ps=s;
390     return c;
391 }
392 
/*
 * u_strFromUTF8WithSub: converts UTF-8 bytes in src to UTF-16 in dest,
 * optionally replacing illegal byte sequences with a substitution character.
 *
 * dest, destCapacity  Output buffer and its size in UChars. dest may be NULL
 *                     only if destCapacity is 0.
 * pDestLength         If not NULL, receives the number of UChars the complete
 *                     output needs, including units that did not fit into dest.
 * src, srcLength      Input bytes. srcLength is -1 for NUL-terminated input;
 *                     src may be NULL only if srcLength is 0.
 * subchar             Written in place of each illegal byte sequence. A
 *                     negative value disables substitution. Values above
 *                     0x10ffff and surrogates are rejected as arguments.
 * pNumSubstitutions   If not NULL, set to 0 on entry and to the number of
 *                     substitutions on successful return.
 * pErrorCode          In/out status. Set to U_ILLEGAL_ARGUMENT_ERROR for bad
 *                     arguments and to U_INVALID_CHAR_FOUND when an illegal
 *                     sequence is met while substitution is disabled.
 *                     Termination of the buffer is delegated to
 *                     u_terminateUChars(), which also receives pErrorCode.
 *
 * Returns dest, or NULL if pErrorCode is NULL or already indicates a failure,
 * if an argument is illegal, or if illegal input cannot be substituted.
 *
 * Structure: for each of the two input forms (NUL-terminated, with length)
 * there is a converting loop that writes to dest while there is room, followed
 * by a counting loop that only adds to reqLength for the rest of the input.
 * The with-length form additionally starts with a fast loop that hoists the
 * limit checks out of the per-character work.
 */
393 U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)394 u_strFromUTF8WithSub(UChar *dest,
395               int32_t destCapacity,
396               int32_t *pDestLength,
397               const char* src,
398               int32_t srcLength,
399               UChar32 subchar, int32_t *pNumSubstitutions,
400               UErrorCode *pErrorCode){
401     UChar *pDest = dest;
    /* NOTE(review): computed before destCapacity is validated below; it is not used until after the argument check */
402     UChar *pDestLimit = dest+destCapacity;
403     UChar32 ch;
    /* reqLength counts only the UChars that did not fit; the written ones are added at the end */
404     int32_t reqLength = 0;
405     const uint8_t* pSrc = (const uint8_t*) src;
406     uint8_t t1, t2; /* trail bytes */
407     int32_t numSubstitutions;
408 
409     /* args check */
410     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
411         return NULL;
412     }
413 
414     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
415         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
416         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
417     ) {
418         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
419         return NULL;
420     }
421 
422     if(pNumSubstitutions!=NULL) {
423         *pNumSubstitutions=0;
424     }
425     numSubstitutions=0;
426 
427     /*
428      * Inline processing of UTF-8 byte sequences:
429      *
430      * Byte sequences for the most common characters are handled inline in
431      * the conversion loops. In order to reduce the path lengths for those
432      * characters, the tests are arranged in a kind of binary search.
433      * ASCII (<=0x7f) is checked first, followed by the dividing point
434      * between 2- and 3-byte sequences (0xe0).
435      * The 3-byte branch is tested first to speed up CJK text.
436      * The compiler should combine the subtractions for the two tests for 0xe0.
437      * Each branch then tests for the other end of its range.
438      */
439 
440     if(srcLength < 0){
441         /*
442          * Transform a NUL-terminated string.
443          * The code explicitly checks for NULs only in the lead byte position.
444          * A NUL byte in the trail byte position fails the trail byte range check anyway.
445          */
        /* converting loop: runs while there is input and room for at least one UChar */
446         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
447             if(ch <= 0x7f){
448                 *pDest++=(UChar)ch;
449                 ++pSrc;
450             } else {
451                 if(ch > 0xe0) {
452                     if( /* handle U+1000..U+CFFF inline */
453                         ch <= 0xec &&
454                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
455                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
456                     ) {
457                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
458                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
459                         pSrc += 3;
460                         continue;
461                     }
462                 } else if(ch < 0xe0) {
463                     if( /* handle U+0080..U+07FF inline */
464                         ch >= 0xc2 &&
465                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
466                     ) {
467                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
468                         pSrc += 2;
469                         continue;
470                     }
471                 }
472 
473                 /* function call for "complicated" and error cases */
474                 ++pSrc; /* continue after the lead byte */
475                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
                /* illegal sequence: count a substitution and replace ch with subchar; a negative subchar means failure */
476                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
477                     *pErrorCode = U_INVALID_CHAR_FOUND;
478                     return NULL;
479                 } else if(ch<=0xFFFF) {
480                     *(pDest++)=(UChar)ch;
481                 } else {
482                     *(pDest++)=UTF16_LEAD(ch);
483                     if(pDest<pDestLimit) {
484                         *(pDest++)=UTF16_TRAIL(ch);
485                     } else {
                        /* the trail surrogate does not fit: count it and go on to the counting loop */
486                         reqLength++;
487                         break;
488                     }
489                 }
490             }
491         }
492 
493         /* Pre-flight the rest of the string. */
494         while((ch = *pSrc) != 0) {
495             if(ch <= 0x7f){
496                 ++reqLength;
497                 ++pSrc;
498             } else {
499                 if(ch > 0xe0) {
500                     if( /* handle U+1000..U+CFFF inline */
501                         ch <= 0xec &&
502                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
503                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
504                     ) {
505                         ++reqLength;
506                         pSrc += 3;
507                         continue;
508                     }
509                 } else if(ch < 0xe0) {
510                     if( /* handle U+0080..U+07FF inline */
511                         ch >= 0xc2 &&
512                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
513                     ) {
514                         ++reqLength;
515                         pSrc += 2;
516                         continue;
517                     }
518                 }
519 
520                 /* function call for "complicated" and error cases */
521                 ++pSrc; /* continue after the lead byte */
522                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
523                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
524                     *pErrorCode = U_INVALID_CHAR_FOUND;
525                     return NULL;
526                 }
527                 reqLength += U16_LENGTH(ch);
528             }
529         }
530     } else /* srcLength >= 0 */ {
531         const uint8_t *pSrcLimit = pSrc + srcLength;
532         int32_t count;
533 
534         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
535         for(;;) {
536             /*
537              * Each iteration of the inner loop progresses by at most 3 UTF-8
538              * bytes and one UChar, for most characters.
539              * For supplementary code points (4 & 2), which are rare,
540              * there is an additional adjustment.
541              */
542             count = (int32_t)(pDestLimit - pDest);
            /* srcLength is reused from here on as scratch: remaining source bytes divided by 3 */
543             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
544             if(count > srcLength) {
545                 count = srcLength; /* min(remaining dest, remaining src/3) */
546             }
547             if(count < 3) {
548                 /*
549                  * Too much overhead if we get near the end of the string,
550                  * continue with the next loop.
551                  */
552                 break;
553             }
554 
            /* each pass spends one unit of count; "continue" jumps to the --count test at the bottom */
555             do {
556                 ch = *pSrc;
557                 if(ch <= 0x7f){
558                     *pDest++=(UChar)ch;
559                     ++pSrc;
560                 } else {
561                     if(ch > 0xe0) {
562                         if( /* handle U+1000..U+CFFF inline */
563                             ch <= 0xec &&
564                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
565                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
566                         ) {
567                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
568                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
569                             pSrc += 3;
570                             continue;
571                         }
572                     } else if(ch < 0xe0) {
573                         if( /* handle U+0080..U+07FF inline */
574                             ch >= 0xc2 &&
575                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
576                         ) {
577                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
578                             pSrc += 2;
579                             continue;
580                         }
581                     }
582 
583                     if(ch >= 0xf0 || subchar > 0xffff) {
584                         /*
585                          * We may read up to six bytes and write up to two UChars,
586                          * which we didn't account for with computing count,
587                          * so we adjust it here.
588                          */
                        /* spend a second unit of count; if none is left, leave without consuming this byte so that count is recomputed */
589                         if(--count == 0) {
590                             break;
591                         }
592                     }
593 
594                     /* function call for "complicated" and error cases */
595                     ++pSrc; /* continue after the lead byte */
596                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
597                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
598                         *pErrorCode = U_INVALID_CHAR_FOUND;
599                         return NULL;
600                     }else if(ch<=0xFFFF){
601                         *(pDest++)=(UChar)ch;
602                     }else{
                        /* both surrogates are written without a limit check; the count adjustment above reserved the room */
603                         *(pDest++)=UTF16_LEAD(ch);
604                         *(pDest++)=UTF16_TRAIL(ch);
605                     }
606                 }
607             } while(--count > 0);
608         }
609 
        /* careful converting loop: checks both limits for every character and for every trail byte access */
610         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
611             ch = *pSrc;
612             if(ch <= 0x7f){
613                 *pDest++=(UChar)ch;
614                 ++pSrc;
615             } else {
616                 if(ch > 0xe0) {
617                     if( /* handle U+1000..U+CFFF inline */
618                         ch <= 0xec &&
619                         ((pSrcLimit - pSrc) >= 3) &&
620                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
621                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
622                     ) {
623                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
624                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
625                         pSrc += 3;
626                         continue;
627                     }
628                 } else if(ch < 0xe0) {
629                     if( /* handle U+0080..U+07FF inline */
630                         ch >= 0xc2 &&
631                         ((pSrcLimit - pSrc) >= 2) &&
632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
633                     ) {
634                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
635                         pSrc += 2;
636                         continue;
637                     }
638                 }
639 
640                 /* function call for "complicated" and error cases */
641                 ++pSrc; /* continue after the lead byte */
642                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
643                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
644                     *pErrorCode = U_INVALID_CHAR_FOUND;
645                     return NULL;
646                 }else if(ch<=0xFFFF){
647                     *(pDest++)=(UChar)ch;
648                 }else{
649                     *(pDest++)=UTF16_LEAD(ch);
650                     if(pDest<pDestLimit){
651                         *(pDest++)=UTF16_TRAIL(ch);
652                     }else{
                        /* the trail surrogate does not fit: count it and go on to the counting loop */
653                         reqLength++;
654                         break;
655                     }
656                 }
657             }
658         }
659         /* do not fill the dest buffer just count the UChars needed */
660         while(pSrc < pSrcLimit){
661             ch = *pSrc;
662             if(ch <= 0x7f){
663                 reqLength++;
664                 ++pSrc;
665             } else {
666                 if(ch > 0xe0) {
667                     if( /* handle U+1000..U+CFFF inline */
668                         ch <= 0xec &&
669                         ((pSrcLimit - pSrc) >= 3) &&
670                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
671                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
672                     ) {
673                         reqLength++;
674                         pSrc += 3;
675                         continue;
676                     }
677                 } else if(ch < 0xe0) {
678                     if( /* handle U+0080..U+07FF inline */
679                         ch >= 0xc2 &&
680                         ((pSrcLimit - pSrc) >= 2) &&
681                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
682                     ) {
683                         reqLength++;
684                         pSrc += 2;
685                         continue;
686                     }
687                 }
688 
689                 /* function call for "complicated" and error cases */
690                 ++pSrc; /* continue after the lead byte */
691                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
692                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
693                     *pErrorCode = U_INVALID_CHAR_FOUND;
694                     return NULL;
695                 }
696                 reqLength+=UTF_CHAR_LENGTH(ch);
697             }
698         }
699     }
700 
    /* total = UChars that did not fit + UChars actually written */
701     reqLength+=(int32_t)(pDest - dest);
702 
703     if(pNumSubstitutions!=NULL) {
704         *pNumSubstitutions=numSubstitutions;
705     }
706 
707     if(pDestLength){
708         *pDestLength = reqLength;
709     }
710 
711     /* Terminate the buffer */
712     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
713 
714     return dest;
715 }
716 
717 U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)718 u_strFromUTF8(UChar *dest,
719               int32_t destCapacity,
720               int32_t *pDestLength,
721               const char* src,
722               int32_t srcLength,
723               UErrorCode *pErrorCode){
724     return u_strFromUTF8WithSub(
725             dest, destCapacity, pDestLength,
726             src, srcLength,
727             U_SENTINEL, NULL,
728             pErrorCode);
729 }
730 
731 U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)732 u_strFromUTF8Lenient(UChar *dest,
733                      int32_t destCapacity,
734                      int32_t *pDestLength,
735                      const char *src,
736                      int32_t srcLength,
737                      UErrorCode *pErrorCode) {
738     UChar *pDest = dest;
739     UChar32 ch;
740     int32_t reqLength = 0;
741     uint8_t* pSrc = (uint8_t*) src;
742 
743     /* args check */
744     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
745         return NULL;
746     }
747 
748     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
749         (destCapacity<0) || (dest == NULL && destCapacity > 0)
750     ) {
751         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
752         return NULL;
753     }
754 
755     if(srcLength < 0) {
756         /* Transform a NUL-terminated string. */
757         UChar *pDestLimit = dest+destCapacity;
758         uint8_t t1, t2, t3; /* trail bytes */
759 
760         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
761             if(ch < 0xc0) {
762                 /*
763                  * ASCII, or a trail byte in lead position which is treated like
764                  * a single-byte sequence for better character boundary
765                  * resynchronization after illegal sequences.
766                  */
767                 *pDest++=(UChar)ch;
768                 ++pSrc;
769                 continue;
770             } else if(ch < 0xe0) { /* U+0080..U+07FF */
771                 if((t1 = pSrc[1]) != 0) {
772                     /* 0x3080 = (0xc0 << 6) + 0x80 */
773                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
774                     pSrc += 2;
775                     continue;
776                 }
777             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
778                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
779                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
780                     /* 0x2080 = (0x80 << 6) + 0x80 */
781                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
782                     pSrc += 3;
783                     continue;
784                 }
785             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
787                     pSrc += 4;
788                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
789                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
790                     *(pDest++) = U16_LEAD(ch);
791                     if(pDest < pDestLimit) {
792                         *(pDest++) = U16_TRAIL(ch);
793                     } else {
794                         reqLength = 1;
795                         break;
796                     }
797                     continue;
798                 }
799             }
800 
801             /* truncated character at the end */
802             *pDest++ = 0xfffd;
803             while(*++pSrc != 0) {}
804             break;
805         }
806 
807         /* Pre-flight the rest of the string. */
808         while((ch = *pSrc) != 0) {
809             if(ch < 0xc0) {
810                 /*
811                  * ASCII, or a trail byte in lead position which is treated like
812                  * a single-byte sequence for better character boundary
813                  * resynchronization after illegal sequences.
814                  */
815                 ++reqLength;
816                 ++pSrc;
817                 continue;
818             } else if(ch < 0xe0) { /* U+0080..U+07FF */
819                 if(pSrc[1] != 0) {
820                     ++reqLength;
821                     pSrc += 2;
822                     continue;
823                 }
824             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
825                 if(pSrc[1] != 0 && pSrc[2] != 0) {
826                     ++reqLength;
827                     pSrc += 3;
828                     continue;
829                 }
830             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
831                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
832                     reqLength += 2;
833                     pSrc += 4;
834                     continue;
835                 }
836             }
837 
838             /* truncated character at the end */
839             ++reqLength;
840             break;
841         }
842     } else /* srcLength >= 0 */ {
843         const uint8_t *pSrcLimit = pSrc + srcLength;
844 
845         /*
846          * This function requires that if srcLength is given, then it must be
847          * destCapacity >= srcLength so that we need not check for
848          * destination buffer overflow in the loop.
849          */
850         if(destCapacity < srcLength) {
851             if(pDestLength != NULL) {
852                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
853             }
854             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
855             return NULL;
856         }
857 
858         if((pSrcLimit - pSrc) >= 4) {
859             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
860 
861             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
862             do {
863                 ch = *pSrc++;
864                 if(ch < 0xc0) {
865                     /*
866                      * ASCII, or a trail byte in lead position which is treated like
867                      * a single-byte sequence for better character boundary
868                      * resynchronization after illegal sequences.
869                      */
870                     *pDest++=(UChar)ch;
871                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
872                     /* 0x3080 = (0xc0 << 6) + 0x80 */
873                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
874                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
875                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
876                     /* 0x2080 = (0x80 << 6) + 0x80 */
877                     ch = (ch << 12) + (*pSrc++ << 6);
878                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
879                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
880                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
881                     ch = (ch << 18) + (*pSrc++ << 12);
882                     ch += *pSrc++ << 6;
883                     ch += *pSrc++ - 0x3c82080;
884                     *(pDest++) = U16_LEAD(ch);
885                     *(pDest++) = U16_TRAIL(ch);
886                 }
887             } while(pSrc < pSrcLimit);
888 
889             pSrcLimit += 3; /* restore original pSrcLimit */
890         }
891 
892         while(pSrc < pSrcLimit) {
893             ch = *pSrc++;
894             if(ch < 0xc0) {
895                 /*
896                  * ASCII, or a trail byte in lead position which is treated like
897                  * a single-byte sequence for better character boundary
898                  * resynchronization after illegal sequences.
899                  */
900                 *pDest++=(UChar)ch;
901                 continue;
902             } else if(ch < 0xe0) { /* U+0080..U+07FF */
903                 if(pSrc < pSrcLimit) {
904                     /* 0x3080 = (0xc0 << 6) + 0x80 */
905                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
906                     continue;
907                 }
908             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
909                 if((pSrcLimit - pSrc) >= 2) {
910                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
911                     /* 0x2080 = (0x80 << 6) + 0x80 */
912                     ch = (ch << 12) + (*pSrc++ << 6);
913                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
914                     pSrc += 3;
915                     continue;
916                 }
917             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
918                 if((pSrcLimit - pSrc) >= 3) {
919                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
920                     ch = (ch << 18) + (*pSrc++ << 12);
921                     ch += *pSrc++ << 6;
922                     ch += *pSrc++ - 0x3c82080;
923                     *(pDest++) = U16_LEAD(ch);
924                     *(pDest++) = U16_TRAIL(ch);
925                     pSrc += 4;
926                     continue;
927                 }
928             }
929 
930             /* truncated character at the end */
931             *pDest++ = 0xfffd;
932             break;
933         }
934     }
935 
936     reqLength+=(int32_t)(pDest - dest);
937 
938     if(pDestLength){
939         *pDestLength = reqLength;
940     }
941 
942     /* Terminate the buffer */
943     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
944 
945     return dest;
946 }
947 
948 static U_INLINE uint8_t *
_appendUTF8(uint8_t * pDest,UChar32 c)949 _appendUTF8(uint8_t *pDest, UChar32 c) {
950     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
951     if((c)<=0x7f) {
952         *pDest++=(uint8_t)c;
953     } else if(c<=0x7ff) {
954         *pDest++=(uint8_t)((c>>6)|0xc0);
955         *pDest++=(uint8_t)((c&0x3f)|0x80);
956     } else if(c<=0xffff) {
957         *pDest++=(uint8_t)((c>>12)|0xe0);
958         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
959         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
960     } else /* if((uint32_t)(c)<=0x10ffff) */ {
961         *pDest++=(uint8_t)(((c)>>18)|0xf0);
962         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
963         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
964         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
965     }
966     return pDest;
967 }
968 
969 
970 U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)971 u_strToUTF8WithSub(char *dest,
972             int32_t destCapacity,
973             int32_t *pDestLength,
974             const UChar *pSrc,
975             int32_t srcLength,
976             UChar32 subchar, int32_t *pNumSubstitutions,
977             UErrorCode *pErrorCode){
978     int32_t reqLength=0;
979     uint32_t ch=0,ch2=0;
980     uint8_t *pDest = (uint8_t *)dest;
981     uint8_t *pDestLimit = pDest + destCapacity;
982     int32_t numSubstitutions;
983 
984     /* args check */
985     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
986         return NULL;
987     }
988 
989     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
990         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
991         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
992     ) {
993         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
994         return NULL;
995     }
996 
997     if(pNumSubstitutions!=NULL) {
998         *pNumSubstitutions=0;
999     }
1000     numSubstitutions=0;
1001 
1002     if(srcLength==-1) {
1003         while((ch=*pSrc)!=0) {
1004             ++pSrc;
1005             if(ch <= 0x7f) {
1006                 if(pDest<pDestLimit) {
1007                     *pDest++ = (uint8_t)ch;
1008                 } else {
1009                     reqLength = 1;
1010                     break;
1011                 }
1012             } else if(ch <= 0x7ff) {
1013                 if((pDestLimit - pDest) >= 2) {
1014                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1015                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1016                 } else {
1017                     reqLength = 2;
1018                     break;
1019                 }
1020             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1021                 if((pDestLimit - pDest) >= 3) {
1022                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1023                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1024                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1025                 } else {
1026                     reqLength = 3;
1027                     break;
1028                 }
1029             } else /* ch is a surrogate */ {
1030                 int32_t length;
1031 
1032                 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
1033                 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1034                     ++pSrc;
1035                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1036                 } else if(subchar>=0) {
1037                     ch=subchar;
1038                     ++numSubstitutions;
1039                 } else {
1040                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1041                     *pErrorCode = U_INVALID_CHAR_FOUND;
1042                     return NULL;
1043                 }
1044 
1045                 length = U8_LENGTH(ch);
1046                 if((pDestLimit - pDest) >= length) {
1047                     /* convert and append*/
1048                     pDest=_appendUTF8(pDest, ch);
1049                 } else {
1050                     reqLength = length;
1051                     break;
1052                 }
1053             }
1054         }
1055         while((ch=*pSrc++)!=0) {
1056             if(ch<=0x7f) {
1057                 ++reqLength;
1058             } else if(ch<=0x7ff) {
1059                 reqLength+=2;
1060             } else if(!UTF_IS_SURROGATE(ch)) {
1061                 reqLength+=3;
1062             } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1063                 ++pSrc;
1064                 reqLength+=4;
1065             } else if(subchar>=0) {
1066                 reqLength+=U8_LENGTH(subchar);
1067                 ++numSubstitutions;
1068             } else {
1069                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1070                 *pErrorCode = U_INVALID_CHAR_FOUND;
1071                 return NULL;
1072             }
1073         }
1074     } else {
1075         const UChar *pSrcLimit = pSrc+srcLength;
1076         int32_t count;
1077 
1078         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1079         for(;;) {
1080             /*
1081              * Each iteration of the inner loop progresses by at most 3 UTF-8
1082              * bytes and one UChar, for most characters.
1083              * For supplementary code points (4 & 2), which are rare,
1084              * there is an additional adjustment.
1085              */
1086             count = (int32_t)((pDestLimit - pDest) / 3);
1087             srcLength = (int32_t)(pSrcLimit - pSrc);
1088             if(count > srcLength) {
1089                 count = srcLength; /* min(remaining dest/3, remaining src) */
1090             }
1091             if(count < 3) {
1092                 /*
1093                  * Too much overhead if we get near the end of the string,
1094                  * continue with the next loop.
1095                  */
1096                 break;
1097             }
1098             do {
1099                 ch=*pSrc++;
1100                 if(ch <= 0x7f) {
1101                     *pDest++ = (uint8_t)ch;
1102                 } else if(ch <= 0x7ff) {
1103                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1104                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1105                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1106                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1107                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1108                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1109                 } else /* ch is a surrogate */ {
1110                     /*
1111                      * We will read two UChars and probably output four bytes,
1112                      * which we didn't account for with computing count,
1113                      * so we adjust it here.
1114                      */
1115                     if(--count == 0) {
1116                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1117                         break;  /* recompute count */
1118                     }
1119 
1120                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1121                         ++pSrc;
1122                         ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1123 
1124                         /* writing 4 bytes per 2 UChars is ok */
1125                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1126                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1127                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1128                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1129                     } else  {
1130                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1131                         if(subchar>=0) {
1132                             ch=subchar;
1133                             ++numSubstitutions;
1134                         } else {
1135                             *pErrorCode = U_INVALID_CHAR_FOUND;
1136                             return NULL;
1137                         }
1138 
1139                         /* convert and append*/
1140                         pDest=_appendUTF8(pDest, ch);
1141                     }
1142                 }
1143             } while(--count > 0);
1144         }
1145 
1146         while(pSrc<pSrcLimit) {
1147             ch=*pSrc++;
1148             if(ch <= 0x7f) {
1149                 if(pDest<pDestLimit) {
1150                     *pDest++ = (uint8_t)ch;
1151                 } else {
1152                     reqLength = 1;
1153                     break;
1154                 }
1155             } else if(ch <= 0x7ff) {
1156                 if((pDestLimit - pDest) >= 2) {
1157                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1158                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1159                 } else {
1160                     reqLength = 2;
1161                     break;
1162                 }
1163             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1164                 if((pDestLimit - pDest) >= 3) {
1165                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1166                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1167                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1168                 } else {
1169                     reqLength = 3;
1170                     break;
1171                 }
1172             } else /* ch is a surrogate */ {
1173                 int32_t length;
1174 
1175                 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1176                     ++pSrc;
1177                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1178                 } else if(subchar>=0) {
1179                     ch=subchar;
1180                     ++numSubstitutions;
1181                 } else {
1182                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1183                     *pErrorCode = U_INVALID_CHAR_FOUND;
1184                     return NULL;
1185                 }
1186 
1187                 length = U8_LENGTH(ch);
1188                 if((pDestLimit - pDest) >= length) {
1189                     /* convert and append*/
1190                     pDest=_appendUTF8(pDest, ch);
1191                 } else {
1192                     reqLength = length;
1193                     break;
1194                 }
1195             }
1196         }
1197         while(pSrc<pSrcLimit) {
1198             ch=*pSrc++;
1199             if(ch<=0x7f) {
1200                 ++reqLength;
1201             } else if(ch<=0x7ff) {
1202                 reqLength+=2;
1203             } else if(!UTF_IS_SURROGATE(ch)) {
1204                 reqLength+=3;
1205             } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1206                 ++pSrc;
1207                 reqLength+=4;
1208             } else if(subchar>=0) {
1209                 reqLength+=U8_LENGTH(subchar);
1210                 ++numSubstitutions;
1211             } else {
1212                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1213                 *pErrorCode = U_INVALID_CHAR_FOUND;
1214                 return NULL;
1215             }
1216         }
1217     }
1218 
1219     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1220 
1221     if(pNumSubstitutions!=NULL) {
1222         *pNumSubstitutions=numSubstitutions;
1223     }
1224 
1225     if(pDestLength){
1226         *pDestLength = reqLength;
1227     }
1228 
1229     /* Terminate the buffer */
1230     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1231     return dest;
1232 }
1233 
1234 U_CAPI char* U_EXPORT2
u_strToUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UErrorCode * pErrorCode)1235 u_strToUTF8(char *dest,
1236             int32_t destCapacity,
1237             int32_t *pDestLength,
1238             const UChar *pSrc,
1239             int32_t srcLength,
1240             UErrorCode *pErrorCode){
1241     return u_strToUTF8WithSub(
1242             dest, destCapacity, pDestLength,
1243             pSrc, srcLength,
1244             U_SENTINEL, NULL,
1245             pErrorCode);
1246 }
1247 
1248 U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)1249 u_strFromJavaModifiedUTF8WithSub(
1250         UChar *dest,
1251         int32_t destCapacity,
1252         int32_t *pDestLength,
1253         const char *src,
1254         int32_t srcLength,
1255         UChar32 subchar, int32_t *pNumSubstitutions,
1256         UErrorCode *pErrorCode) {
1257     UChar *pDest = dest;
1258     UChar *pDestLimit = dest+destCapacity;
1259     UChar32 ch;
1260     int32_t reqLength = 0;
1261     const uint8_t* pSrc = (const uint8_t*) src;
1262     const uint8_t *pSrcLimit;
1263     int32_t count;
1264     uint8_t t1, t2; /* trail bytes */
1265     int32_t numSubstitutions;
1266 
1267     /* args check */
1268     if(U_FAILURE(*pErrorCode)){
1269         return NULL;
1270     }
1271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1272         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1274     ) {
1275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1276         return NULL;
1277     }
1278 
1279     if(pNumSubstitutions!=NULL) {
1280         *pNumSubstitutions=0;
1281     }
1282     numSubstitutions=0;
1283 
1284     if(srcLength < 0) {
1285         /*
1286          * Transform a NUL-terminated ASCII string.
1287          * Handle non-ASCII strings with slower code.
1288          */
1289         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1290             *pDest++=(UChar)ch;
1291             ++pSrc;
1292         }
1293         if(ch == 0) {
1294             reqLength=(int32_t)(pDest - dest);
1295             if(pDestLength) {
1296                 *pDestLength = reqLength;
1297             }
1298 
1299             /* Terminate the buffer */
1300             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1301             return dest;
1302         }
1303         srcLength = uprv_strlen((const char *)pSrc);
1304     }
1305 
1306     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1307     pSrcLimit = pSrc + srcLength;
1308     for(;;) {
1309         count = (int32_t)(pDestLimit - pDest);
1310         srcLength = (int32_t)(pSrcLimit - pSrc);
1311         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1312             /* fast ASCII loop */
1313             const uint8_t *prevSrc = pSrc;
1314             int32_t delta;
1315             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1316                 *pDest++=(UChar)ch;
1317                 ++pSrc;
1318             }
1319             delta = (int32_t)(pSrc - prevSrc);
1320             count -= delta;
1321             srcLength -= delta;
1322         }
1323         /*
1324          * Each iteration of the inner loop progresses by at most 3 UTF-8
1325          * bytes and one UChar.
1326          */
1327         srcLength /= 3;
1328         if(count > srcLength) {
1329             count = srcLength; /* min(remaining dest, remaining src/3) */
1330         }
1331         if(count < 3) {
1332             /*
1333              * Too much overhead if we get near the end of the string,
1334              * continue with the next loop.
1335              */
1336             break;
1337         }
1338         do {
1339             ch = *pSrc;
1340             if(ch <= 0x7f){
1341                 *pDest++=(UChar)ch;
1342                 ++pSrc;
1343             } else {
1344                 if(ch >= 0xe0) {
1345                     if( /* handle U+0000..U+FFFF inline */
1346                         ch <= 0xef &&
1347                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1348                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1349                     ) {
1350                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1351                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1352                         pSrc += 3;
1353                         continue;
1354                     }
1355                 } else {
1356                     if( /* handle U+0000..U+07FF inline */
1357                         ch >= 0xc0 &&
1358                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1359                     ) {
1360                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1361                         pSrc += 2;
1362                         continue;
1363                     }
1364                 }
1365 
1366                 if(subchar < 0) {
1367                     *pErrorCode = U_INVALID_CHAR_FOUND;
1368                     return NULL;
1369                 } else if(subchar > 0xffff && --count == 0) {
1370                     /*
1371                      * We need to write two UChars, adjusted count for that,
1372                      * and ran out of space.
1373                      */
1374                     break;
1375                 } else {
1376                     /* function call for error cases */
1377                     ++pSrc; /* continue after the lead byte */
1378                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1379                     ++numSubstitutions;
1380                     if(subchar<=0xFFFF) {
1381                         *(pDest++)=(UChar)subchar;
1382                     } else {
1383                         *(pDest++)=U16_LEAD(subchar);
1384                         *(pDest++)=U16_TRAIL(subchar);
1385                     }
1386                 }
1387             }
1388         } while(--count > 0);
1389     }
1390 
1391     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1392         ch = *pSrc;
1393         if(ch <= 0x7f){
1394             *pDest++=(UChar)ch;
1395             ++pSrc;
1396         } else {
1397             if(ch >= 0xe0) {
1398                 if( /* handle U+0000..U+FFFF inline */
1399                     ch <= 0xef &&
1400                     ((pSrcLimit - pSrc) >= 3) &&
1401                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1402                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1403                 ) {
1404                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1405                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1406                     pSrc += 3;
1407                     continue;
1408                 }
1409             } else {
1410                 if( /* handle U+0000..U+07FF inline */
1411                     ch >= 0xc0 &&
1412                     ((pSrcLimit - pSrc) >= 2) &&
1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1414                 ) {
1415                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1416                     pSrc += 2;
1417                     continue;
1418                 }
1419             }
1420 
1421             if(subchar < 0) {
1422                 *pErrorCode = U_INVALID_CHAR_FOUND;
1423                 return NULL;
1424             } else {
1425                 /* function call for error cases */
1426                 ++pSrc; /* continue after the lead byte */
1427                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1428                 ++numSubstitutions;
1429                 if(subchar<=0xFFFF) {
1430                     *(pDest++)=(UChar)subchar;
1431                 } else {
1432                     *(pDest++)=U16_LEAD(subchar);
1433                     if(pDest<pDestLimit) {
1434                         *(pDest++)=U16_TRAIL(subchar);
1435                     } else {
1436                         reqLength++;
1437                         break;
1438                     }
1439                 }
1440             }
1441         }
1442     }
1443 
1444     /* do not fill the dest buffer just count the UChars needed */
1445     while(pSrc < pSrcLimit){
1446         ch = *pSrc;
1447         if(ch <= 0x7f) {
1448             reqLength++;
1449             ++pSrc;
1450         } else {
1451             if(ch >= 0xe0) {
1452                 if( /* handle U+0000..U+FFFF inline */
1453                     ch <= 0xef &&
1454                     ((pSrcLimit - pSrc) >= 3) &&
1455                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1456                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1457                 ) {
1458                     reqLength++;
1459                     pSrc += 3;
1460                     continue;
1461                 }
1462             } else {
1463                 if( /* handle U+0000..U+07FF inline */
1464                     ch >= 0xc0 &&
1465                     ((pSrcLimit - pSrc) >= 2) &&
1466                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1467                 ) {
1468                     reqLength++;
1469                     pSrc += 2;
1470                     continue;
1471                 }
1472             }
1473 
1474             if(subchar < 0) {
1475                 *pErrorCode = U_INVALID_CHAR_FOUND;
1476                 return NULL;
1477             } else {
1478                 /* function call for error cases */
1479                 ++pSrc; /* continue after the lead byte */
1480                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1481                 ++numSubstitutions;
1482                 reqLength+=U16_LENGTH(ch);
1483             }
1484         }
1485     }
1486 
1487     if(pNumSubstitutions!=NULL) {
1488         *pNumSubstitutions=numSubstitutions;
1489     }
1490 
1491     reqLength+=(int32_t)(pDest - dest);
1492     if(pDestLength) {
1493         *pDestLength = reqLength;
1494     }
1495 
1496     /* Terminate the buffer */
1497     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1498     return dest;
1499 }
1500 
1501 U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1502 u_strToJavaModifiedUTF8(
1503         char *dest,
1504         int32_t destCapacity,
1505         int32_t *pDestLength,
1506         const UChar *src,
1507         int32_t srcLength,
1508         UErrorCode *pErrorCode) {
1509     int32_t reqLength=0;
1510     uint32_t ch=0;
1511     uint8_t *pDest = (uint8_t *)dest;
1512     uint8_t *pDestLimit = pDest + destCapacity;
1513     const UChar *pSrcLimit;
1514     int32_t count;
1515 
1516     /* args check */
1517     if(U_FAILURE(*pErrorCode)){
1518         return NULL;
1519     }
1520     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1521         (dest==NULL && destCapacity!=0) || destCapacity<0
1522     ) {
1523         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1524         return NULL;
1525     }
1526 
1527     if(srcLength==-1) {
1528         /* Convert NUL-terminated ASCII, then find the string length. */
1529         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1530             *pDest++ = (uint8_t)ch;
1531             ++src;
1532         }
1533         if(ch == 0) {
1534             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1535             if(pDestLength) {
1536                 *pDestLength = reqLength;
1537             }
1538 
1539             /* Terminate the buffer */
1540             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1541             return dest;
1542         }
1543         srcLength = u_strlen(src);
1544     }
1545 
1546     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1547     pSrcLimit = src+srcLength;
1548     for(;;) {
1549         count = (int32_t)(pDestLimit - pDest);
1550         srcLength = (int32_t)(pSrcLimit - src);
1551         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1552             /* fast ASCII loop */
1553             const UChar *prevSrc = src;
1554             int32_t delta;
1555             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1556                 *pDest++=(uint8_t)ch;
1557                 ++src;
1558             }
1559             delta = (int32_t)(src - prevSrc);
1560             count -= delta;
1561             srcLength -= delta;
1562         }
1563         /*
1564          * Each iteration of the inner loop progresses by at most 3 UTF-8
1565          * bytes and one UChar.
1566          */
1567         count /= 3;
1568         if(count > srcLength) {
1569             count = srcLength; /* min(remaining dest/3, remaining src) */
1570         }
1571         if(count < 3) {
1572             /*
1573              * Too much overhead if we get near the end of the string,
1574              * continue with the next loop.
1575              */
1576             break;
1577         }
1578         do {
1579             ch=*src++;
1580             if(ch <= 0x7f && ch != 0) {
1581                 *pDest++ = (uint8_t)ch;
1582             } else if(ch <= 0x7ff) {
1583                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1584                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1585             } else {
1586                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1587                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1588                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1589             }
1590         } while(--count > 0);
1591     }
1592 
1593     while(src<pSrcLimit) {
1594         ch=*src++;
1595         if(ch <= 0x7f && ch != 0) {
1596             if(pDest<pDestLimit) {
1597                 *pDest++ = (uint8_t)ch;
1598             } else {
1599                 reqLength = 1;
1600                 break;
1601             }
1602         } else if(ch <= 0x7ff) {
1603             if((pDestLimit - pDest) >= 2) {
1604                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1605                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1606             } else {
1607                 reqLength = 2;
1608                 break;
1609             }
1610         } else {
1611             if((pDestLimit - pDest) >= 3) {
1612                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1613                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1614                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1615             } else {
1616                 reqLength = 3;
1617                 break;
1618             }
1619         }
1620     }
1621     while(src<pSrcLimit) {
1622         ch=*src++;
1623         if(ch <= 0x7f && ch != 0) {
1624             ++reqLength;
1625         } else if(ch<=0x7ff) {
1626             reqLength+=2;
1627         } else {
1628             reqLength+=3;
1629         }
1630     }
1631 
1632     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1633     if(pDestLength){
1634         *pDestLength = reqLength;
1635     }
1636 
1637     /* Terminate the buffer */
1638     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1639     return dest;
1640 }
1641