• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.cpp
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   9/10/2001    Ram    Creation.
15 ******************************************************************************
16 */
17 
18 /*******************************************************************************
19  *
20  * u_strTo* and u_strFrom* APIs
21  * WCS functions moved to ustr_wcs.c for better modularization
22  *
23  *******************************************************************************
24  */
25 
26 
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utf16.h"
32 #include "cstring.h"
33 #include "cmemory.h"
34 #include "ustr_imp.h"
35 #include "uassert.h"
36 
37 U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)38 u_strFromUTF32WithSub(UChar *dest,
39                int32_t destCapacity,
40                int32_t *pDestLength,
41                const UChar32 *src,
42                int32_t srcLength,
43                UChar32 subchar, int32_t *pNumSubstitutions,
44                UErrorCode *pErrorCode) {
45     const UChar32 *srcLimit;
46     UChar32 ch;
47     UChar *destLimit;
48     UChar *pDest;
49     int32_t reqLength;
50     int32_t numSubstitutions;
51 
52     /* args check */
53     if(U_FAILURE(*pErrorCode)){
54         return NULL;
55     }
56     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
57         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
58         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
59     ) {
60         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
61         return NULL;
62     }
63 
64     if(pNumSubstitutions != NULL) {
65         *pNumSubstitutions = 0;
66     }
67 
68     pDest = dest;
69     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
70     reqLength = 0;
71     numSubstitutions = 0;
72 
73     if(srcLength < 0) {
74         /* simple loop for conversion of a NUL-terminated BMP string */
75         while((ch=*src) != 0 &&
76               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
77             ++src;
78             if(pDest < destLimit) {
79                 *pDest++ = (UChar)ch;
80             } else {
81                 ++reqLength;
82             }
83         }
84         srcLimit = src;
85         if(ch != 0) {
86             /* "complicated" case, find the end of the remaining string */
87             while(*++srcLimit != 0) {}
88         }
89     } else {
90       srcLimit = (src!=NULL)?(src + srcLength):NULL;
91     }
92 
93     /* convert with length */
94     while(src < srcLimit) {
95         ch = *src++;
96         do {
97             /* usually "loops" once; twice only for writing subchar */
98             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
99                 if(pDest < destLimit) {
100                     *pDest++ = (UChar)ch;
101                 } else {
102                     ++reqLength;
103                 }
104                 break;
105             } else if(0x10000 <= ch && ch <= 0x10ffff) {
106                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
107                     *pDest++ = U16_LEAD(ch);
108                     *pDest++ = U16_TRAIL(ch);
109                 } else {
110                     reqLength += 2;
111                 }
112                 break;
113             } else if((ch = subchar) < 0) {
114                 /* surrogate code point, or not a Unicode code point at all */
115                 *pErrorCode = U_INVALID_CHAR_FOUND;
116                 return NULL;
117             } else {
118                 ++numSubstitutions;
119             }
120         } while(TRUE);
121     }
122 
123     reqLength += (int32_t)(pDest - dest);
124     if(pDestLength) {
125         *pDestLength = reqLength;
126     }
127     if(pNumSubstitutions != NULL) {
128         *pNumSubstitutions = numSubstitutions;
129     }
130 
131     /* Terminate the buffer */
132     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
133 
134     return dest;
135 }
136 
137 U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UErrorCode * pErrorCode)138 u_strFromUTF32(UChar *dest,
139                int32_t destCapacity,
140                int32_t *pDestLength,
141                const UChar32 *src,
142                int32_t srcLength,
143                UErrorCode *pErrorCode) {
144     return u_strFromUTF32WithSub(
145             dest, destCapacity, pDestLength,
146             src, srcLength,
147             U_SENTINEL, NULL,
148             pErrorCode);
149 }
150 
151 U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)152 u_strToUTF32WithSub(UChar32 *dest,
153              int32_t destCapacity,
154              int32_t *pDestLength,
155              const UChar *src,
156              int32_t srcLength,
157              UChar32 subchar, int32_t *pNumSubstitutions,
158              UErrorCode *pErrorCode) {
159     const UChar *srcLimit;
160     UChar32 ch;
161     UChar ch2;
162     UChar32 *destLimit;
163     UChar32 *pDest;
164     int32_t reqLength;
165     int32_t numSubstitutions;
166 
167     /* args check */
168     if(U_FAILURE(*pErrorCode)){
169         return NULL;
170     }
171     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
172         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
173         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
174     ) {
175         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
176         return NULL;
177     }
178 
179     if(pNumSubstitutions != NULL) {
180         *pNumSubstitutions = 0;
181     }
182 
183     pDest = dest;
184     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
185     reqLength = 0;
186     numSubstitutions = 0;
187 
188     if(srcLength < 0) {
189         /* simple loop for conversion of a NUL-terminated BMP string */
190         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
191             ++src;
192             if(pDest < destLimit) {
193                 *pDest++ = ch;
194             } else {
195                 ++reqLength;
196             }
197         }
198         srcLimit = src;
199         if(ch != 0) {
200             /* "complicated" case, find the end of the remaining string */
201             while(*++srcLimit != 0) {}
202         }
203     } else {
204         srcLimit = (src!=NULL)?(src + srcLength):NULL;
205     }
206 
207     /* convert with length */
208     while(src < srcLimit) {
209         ch = *src++;
210         if(!U16_IS_SURROGATE(ch)) {
211             /* write or count ch below */
212         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
213             ++src;
214             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
215         } else if((ch = subchar) < 0) {
216             /* unpaired surrogate */
217             *pErrorCode = U_INVALID_CHAR_FOUND;
218             return NULL;
219         } else {
220             ++numSubstitutions;
221         }
222         if(pDest < destLimit) {
223             *pDest++ = ch;
224         } else {
225             ++reqLength;
226         }
227     }
228 
229     reqLength += (int32_t)(pDest - dest);
230     if(pDestLength) {
231         *pDestLength = reqLength;
232     }
233     if(pNumSubstitutions != NULL) {
234         *pNumSubstitutions = numSubstitutions;
235     }
236 
237     /* Terminate the buffer */
238     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
239 
240     return dest;
241 }
242 
243 U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)244 u_strToUTF32(UChar32 *dest,
245              int32_t destCapacity,
246              int32_t *pDestLength,
247              const UChar *src,
248              int32_t srcLength,
249              UErrorCode *pErrorCode) {
250     return u_strToUTF32WithSub(
251             dest, destCapacity, pDestLength,
252             src, srcLength,
253             U_SENTINEL, NULL,
254             pErrorCode);
255 }
256 
257 /* for utf8_nextCharSafeBodyTerminated() */
258 static const UChar32
259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
260 
261 /*
262  * Version of utf8_nextCharSafeBody() with the following differences:
263  * - checks for NUL termination instead of length
264  * - works with pointers instead of indexes
265  * - always strict (strict==-1)
266  *
267  * *ps points to after the lead byte and will be moved to after the last trail byte.
268  * c is the lead byte.
269  * @return the code point, or U_SENTINEL
270  */
271 static UChar32
utf8_nextCharSafeBodyTerminated(const uint8_t ** ps,UChar32 c)272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
273     const uint8_t *s=*ps;
274     uint8_t trail, illegal=0;
275     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
276     U_ASSERT(count<6);
277     U8_MASK_LEAD_BYTE((c), count);
278     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
279     switch(count) {
280     /* each branch falls through to the next one */
281     case 5:
282     case 4:
283         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
284         illegal=1;
285         break;
286     case 3:
287         trail=(uint8_t)(*s++ - 0x80);
288         c=(c<<6)|trail;
289         if(trail>0x3f || c>=0x110) {
290             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
291             illegal=1;
292             break;
293         }
294     case 2: /*fall through*/
295         trail=(uint8_t)(*s++ - 0x80);
296         if(trail>0x3f) {
297             /* not a trail byte */
298             illegal=1;
299             break;
300         }
301         c=(c<<6)|trail;
302     case 1: /*fall through*/
303         trail=(uint8_t)(*s++ - 0x80);
304         if(trail>0x3f) {
305             /* not a trail byte */
306             illegal=1;
307         }
308         c=(c<<6)|trail;
309         break;
310     case 0:
311         return U_SENTINEL;
312     /* no default branch to optimize switch()  - all values are covered */
313     }
314 
315     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
316     /* illegal is also set if count>=4 */
317     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
318         /* error handling */
319         /* don't go beyond this sequence */
320         s=*ps;
321         while(count>0 && U8_IS_TRAIL(*s)) {
322             ++s;
323             --count;
324         }
325         c=U_SENTINEL;
326     }
327     *ps=s;
328     return c;
329 }
330 
331 /*
332  * Version of utf8_nextCharSafeBody() with the following differences:
333  * - works with pointers instead of indexes
334  * - always strict (strict==-1)
335  *
336  * *ps points to after the lead byte and will be moved to after the last trail byte.
337  * c is the lead byte.
338  * @return the code point, or U_SENTINEL
339  */
340 static UChar32
utf8_nextCharSafeBodyPointer(const uint8_t ** ps,const uint8_t * limit,UChar32 c)341 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
342     const uint8_t *s=*ps;
343     uint8_t trail, illegal=0;
344     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
345     if((limit-s)>=count) {
346         U8_MASK_LEAD_BYTE((c), count);
347         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
348         switch(count) {
349         /* each branch falls through to the next one */
350         case 5:
351         case 4:
352             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
353             illegal=1;
354             break;
355         case 3:
356             trail=*s++;
357             c=(c<<6)|(trail&0x3f);
358             if(c<0x110) {
359                 illegal|=(trail&0xc0)^0x80;
360             } else {
361                 /* code point>0x10ffff, outside Unicode */
362                 illegal=1;
363                 break;
364             }
365         case 2: /*fall through*/
366             trail=*s++;
367             c=(c<<6)|(trail&0x3f);
368             illegal|=(trail&0xc0)^0x80;
369         case 1: /*fall through*/
370             trail=*s++;
371             c=(c<<6)|(trail&0x3f);
372             illegal|=(trail&0xc0)^0x80;
373             break;
374         case 0:
375             return U_SENTINEL;
376         /* no default branch to optimize switch()  - all values are covered */
377         }
378     } else {
379         illegal=1; /* too few bytes left */
380     }
381 
382     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
383     /* illegal is also set if count>=4 */
384     U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0]));
385     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
386         /* error handling */
387         /* don't go beyond this sequence */
388         s=*ps;
389         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
390             ++s;
391             --count;
392         }
393         c=U_SENTINEL;
394     }
395     *ps=s;
396     return c;
397 }
398 
399 U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)400 u_strFromUTF8WithSub(UChar *dest,
401               int32_t destCapacity,
402               int32_t *pDestLength,
403               const char* src,
404               int32_t srcLength,
405               UChar32 subchar, int32_t *pNumSubstitutions,
406               UErrorCode *pErrorCode){
407     UChar *pDest = dest;
408     UChar *pDestLimit = dest+destCapacity;
409     UChar32 ch;
410     int32_t reqLength = 0;
411     const uint8_t* pSrc = (const uint8_t*) src;
412     uint8_t t1, t2; /* trail bytes */
413     int32_t numSubstitutions;
414 
415     /* args check */
416     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
417         return NULL;
418     }
419 
420     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
421         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
422         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
423     ) {
424         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
425         return NULL;
426     }
427 
428     if(pNumSubstitutions!=NULL) {
429         *pNumSubstitutions=0;
430     }
431     numSubstitutions=0;
432 
433     /*
434      * Inline processing of UTF-8 byte sequences:
435      *
436      * Byte sequences for the most common characters are handled inline in
437      * the conversion loops. In order to reduce the path lengths for those
438      * characters, the tests are arranged in a kind of binary search.
439      * ASCII (<=0x7f) is checked first, followed by the dividing point
440      * between 2- and 3-byte sequences (0xe0).
441      * The 3-byte branch is tested first to speed up CJK text.
442      * The compiler should combine the subtractions for the two tests for 0xe0.
443      * Each branch then tests for the other end of its range.
444      */
445 
446     if(srcLength < 0){
447         /*
448          * Transform a NUL-terminated string.
449          * The code explicitly checks for NULs only in the lead byte position.
450          * A NUL byte in the trail byte position fails the trail byte range check anyway.
451          */
452         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
453             if(ch <= 0x7f){
454                 *pDest++=(UChar)ch;
455                 ++pSrc;
456             } else {
457                 if(ch > 0xe0) {
458                     if( /* handle U+1000..U+CFFF inline */
459                         ch <= 0xec &&
460                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
461                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
462                     ) {
463                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
464                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
465                         pSrc += 3;
466                         continue;
467                     }
468                 } else if(ch < 0xe0) {
469                     if( /* handle U+0080..U+07FF inline */
470                         ch >= 0xc2 &&
471                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
472                     ) {
473                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
474                         pSrc += 2;
475                         continue;
476                     }
477                 }
478 
479                 /* function call for "complicated" and error cases */
480                 ++pSrc; /* continue after the lead byte */
481                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
482                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
483                     *pErrorCode = U_INVALID_CHAR_FOUND;
484                     return NULL;
485                 } else if(ch<=0xFFFF) {
486                     *(pDest++)=(UChar)ch;
487                 } else {
488                     *(pDest++)=U16_LEAD(ch);
489                     if(pDest<pDestLimit) {
490                         *(pDest++)=U16_TRAIL(ch);
491                     } else {
492                         reqLength++;
493                         break;
494                     }
495                 }
496             }
497         }
498 
499         /* Pre-flight the rest of the string. */
500         while((ch = *pSrc) != 0) {
501             if(ch <= 0x7f){
502                 ++reqLength;
503                 ++pSrc;
504             } else {
505                 if(ch > 0xe0) {
506                     if( /* handle U+1000..U+CFFF inline */
507                         ch <= 0xec &&
508                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
509                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
510                     ) {
511                         ++reqLength;
512                         pSrc += 3;
513                         continue;
514                     }
515                 } else if(ch < 0xe0) {
516                     if( /* handle U+0080..U+07FF inline */
517                         ch >= 0xc2 &&
518                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
519                     ) {
520                         ++reqLength;
521                         pSrc += 2;
522                         continue;
523                     }
524                 }
525 
526                 /* function call for "complicated" and error cases */
527                 ++pSrc; /* continue after the lead byte */
528                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
529                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
530                     *pErrorCode = U_INVALID_CHAR_FOUND;
531                     return NULL;
532                 }
533                 reqLength += U16_LENGTH(ch);
534             }
535         }
536     } else /* srcLength >= 0 */ {
537         const uint8_t *pSrcLimit = pSrc + srcLength;
538         int32_t count;
539 
540         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
541         for(;;) {
542             /*
543              * Each iteration of the inner loop progresses by at most 3 UTF-8
544              * bytes and one UChar, for most characters.
545              * For supplementary code points (4 & 2), which are rare,
546              * there is an additional adjustment.
547              */
548             count = (int32_t)(pDestLimit - pDest);
549             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
550             if(count > srcLength) {
551                 count = srcLength; /* min(remaining dest, remaining src/3) */
552             }
553             if(count < 3) {
554                 /*
555                  * Too much overhead if we get near the end of the string,
556                  * continue with the next loop.
557                  */
558                 break;
559             }
560 
561             do {
562                 ch = *pSrc;
563                 if(ch <= 0x7f){
564                     *pDest++=(UChar)ch;
565                     ++pSrc;
566                 } else {
567                     if(ch > 0xe0) {
568                         if( /* handle U+1000..U+CFFF inline */
569                             ch <= 0xec &&
570                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
571                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
572                         ) {
573                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
574                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
575                             pSrc += 3;
576                             continue;
577                         }
578                     } else if(ch < 0xe0) {
579                         if( /* handle U+0080..U+07FF inline */
580                             ch >= 0xc2 &&
581                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
582                         ) {
583                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
584                             pSrc += 2;
585                             continue;
586                         }
587                     }
588 
589                     if(ch >= 0xf0 || subchar > 0xffff) {
590                         /*
591                          * We may read up to six bytes and write up to two UChars,
592                          * which we didn't account for with computing count,
593                          * so we adjust it here.
594                          */
595                         if(--count == 0) {
596                             break;
597                         }
598                     }
599 
600                     /* function call for "complicated" and error cases */
601                     ++pSrc; /* continue after the lead byte */
602                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
603                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
604                         *pErrorCode = U_INVALID_CHAR_FOUND;
605                         return NULL;
606                     }else if(ch<=0xFFFF){
607                         *(pDest++)=(UChar)ch;
608                     }else{
609                         *(pDest++)=U16_LEAD(ch);
610                         *(pDest++)=U16_TRAIL(ch);
611                     }
612                 }
613             } while(--count > 0);
614         }
615 
616         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
617             ch = *pSrc;
618             if(ch <= 0x7f){
619                 *pDest++=(UChar)ch;
620                 ++pSrc;
621             } else {
622                 if(ch > 0xe0) {
623                     if( /* handle U+1000..U+CFFF inline */
624                         ch <= 0xec &&
625                         ((pSrcLimit - pSrc) >= 3) &&
626                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
627                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
628                     ) {
629                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
630                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
631                         pSrc += 3;
632                         continue;
633                     }
634                 } else if(ch < 0xe0) {
635                     if( /* handle U+0080..U+07FF inline */
636                         ch >= 0xc2 &&
637                         ((pSrcLimit - pSrc) >= 2) &&
638                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
639                     ) {
640                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
641                         pSrc += 2;
642                         continue;
643                     }
644                 }
645 
646                 /* function call for "complicated" and error cases */
647                 ++pSrc; /* continue after the lead byte */
648                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
649                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
650                     *pErrorCode = U_INVALID_CHAR_FOUND;
651                     return NULL;
652                 }else if(ch<=0xFFFF){
653                     *(pDest++)=(UChar)ch;
654                 }else{
655                     *(pDest++)=U16_LEAD(ch);
656                     if(pDest<pDestLimit){
657                         *(pDest++)=U16_TRAIL(ch);
658                     }else{
659                         reqLength++;
660                         break;
661                     }
662                 }
663             }
664         }
665         /* do not fill the dest buffer just count the UChars needed */
666         while(pSrc < pSrcLimit){
667             ch = *pSrc;
668             if(ch <= 0x7f){
669                 reqLength++;
670                 ++pSrc;
671             } else {
672                 if(ch > 0xe0) {
673                     if( /* handle U+1000..U+CFFF inline */
674                         ch <= 0xec &&
675                         ((pSrcLimit - pSrc) >= 3) &&
676                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
677                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
678                     ) {
679                         reqLength++;
680                         pSrc += 3;
681                         continue;
682                     }
683                 } else if(ch < 0xe0) {
684                     if( /* handle U+0080..U+07FF inline */
685                         ch >= 0xc2 &&
686                         ((pSrcLimit - pSrc) >= 2) &&
687                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
688                     ) {
689                         reqLength++;
690                         pSrc += 2;
691                         continue;
692                     }
693                 }
694 
695                 /* function call for "complicated" and error cases */
696                 ++pSrc; /* continue after the lead byte */
697                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
698                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
699                     *pErrorCode = U_INVALID_CHAR_FOUND;
700                     return NULL;
701                 }
702                 reqLength+=U16_LENGTH(ch);
703             }
704         }
705     }
706 
707     reqLength+=(int32_t)(pDest - dest);
708 
709     if(pNumSubstitutions!=NULL) {
710         *pNumSubstitutions=numSubstitutions;
711     }
712 
713     if(pDestLength){
714         *pDestLength = reqLength;
715     }
716 
717     /* Terminate the buffer */
718     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
719 
720     return dest;
721 }
722 
723 U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)724 u_strFromUTF8(UChar *dest,
725               int32_t destCapacity,
726               int32_t *pDestLength,
727               const char* src,
728               int32_t srcLength,
729               UErrorCode *pErrorCode){
730     return u_strFromUTF8WithSub(
731             dest, destCapacity, pDestLength,
732             src, srcLength,
733             U_SENTINEL, NULL,
734             pErrorCode);
735 }
736 
737 U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)738 u_strFromUTF8Lenient(UChar *dest,
739                      int32_t destCapacity,
740                      int32_t *pDestLength,
741                      const char *src,
742                      int32_t srcLength,
743                      UErrorCode *pErrorCode) {
744     UChar *pDest = dest;
745     UChar32 ch;
746     int32_t reqLength = 0;
747     uint8_t* pSrc = (uint8_t*) src;
748 
749     /* args check */
750     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
751         return NULL;
752     }
753 
754     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
755         (destCapacity<0) || (dest == NULL && destCapacity > 0)
756     ) {
757         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
758         return NULL;
759     }
760 
761     if(srcLength < 0) {
762         /* Transform a NUL-terminated string. */
763         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
764         uint8_t t1, t2, t3; /* trail bytes */
765 
766         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
767             if(ch < 0xc0) {
768                 /*
769                  * ASCII, or a trail byte in lead position which is treated like
770                  * a single-byte sequence for better character boundary
771                  * resynchronization after illegal sequences.
772                  */
773                 *pDest++=(UChar)ch;
774                 ++pSrc;
775                 continue;
776             } else if(ch < 0xe0) { /* U+0080..U+07FF */
777                 if((t1 = pSrc[1]) != 0) {
778                     /* 0x3080 = (0xc0 << 6) + 0x80 */
779                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
780                     pSrc += 2;
781                     continue;
782                 }
783             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
784                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
785                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
786                     /* 0x2080 = (0x80 << 6) + 0x80 */
787                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
788                     pSrc += 3;
789                     continue;
790                 }
791             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
792                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
793                     pSrc += 4;
794                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
795                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
796                     *(pDest++) = U16_LEAD(ch);
797                     if(pDest < pDestLimit) {
798                         *(pDest++) = U16_TRAIL(ch);
799                     } else {
800                         reqLength = 1;
801                         break;
802                     }
803                     continue;
804                 }
805             }
806 
807             /* truncated character at the end */
808             *pDest++ = 0xfffd;
809             while(*++pSrc != 0) {}
810             break;
811         }
812 
813         /* Pre-flight the rest of the string. */
814         while((ch = *pSrc) != 0) {
815             if(ch < 0xc0) {
816                 /*
817                  * ASCII, or a trail byte in lead position which is treated like
818                  * a single-byte sequence for better character boundary
819                  * resynchronization after illegal sequences.
820                  */
821                 ++reqLength;
822                 ++pSrc;
823                 continue;
824             } else if(ch < 0xe0) { /* U+0080..U+07FF */
825                 if(pSrc[1] != 0) {
826                     ++reqLength;
827                     pSrc += 2;
828                     continue;
829                 }
830             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
831                 if(pSrc[1] != 0 && pSrc[2] != 0) {
832                     ++reqLength;
833                     pSrc += 3;
834                     continue;
835                 }
836             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
837                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
838                     reqLength += 2;
839                     pSrc += 4;
840                     continue;
841                 }
842             }
843 
844             /* truncated character at the end */
845             ++reqLength;
846             break;
847         }
848     } else /* srcLength >= 0 */ {
849       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
850 
851         /*
852          * This function requires that if srcLength is given, then it must be
853          * destCapatity >= srcLength so that we need not check for
854          * destination buffer overflow in the loop.
855          */
856         if(destCapacity < srcLength) {
857             if(pDestLength != NULL) {
858                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
859             }
860             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
861             return NULL;
862         }
863 
864         if((pSrcLimit - pSrc) >= 4) {
865             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
866 
867             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
868             do {
869                 ch = *pSrc++;
870                 if(ch < 0xc0) {
871                     /*
872                      * ASCII, or a trail byte in lead position which is treated like
873                      * a single-byte sequence for better character boundary
874                      * resynchronization after illegal sequences.
875                      */
876                     *pDest++=(UChar)ch;
877                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
878                     /* 0x3080 = (0xc0 << 6) + 0x80 */
879                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
880                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
881                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
882                     /* 0x2080 = (0x80 << 6) + 0x80 */
883                     ch = (ch << 12) + (*pSrc++ << 6);
884                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
885                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
886                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
887                     ch = (ch << 18) + (*pSrc++ << 12);
888                     ch += *pSrc++ << 6;
889                     ch += *pSrc++ - 0x3c82080;
890                     *(pDest++) = U16_LEAD(ch);
891                     *(pDest++) = U16_TRAIL(ch);
892                 }
893             } while(pSrc < pSrcLimit);
894 
895             pSrcLimit += 3; /* restore original pSrcLimit */
896         }
897 
898         while(pSrc < pSrcLimit) {
899             ch = *pSrc++;
900             if(ch < 0xc0) {
901                 /*
902                  * ASCII, or a trail byte in lead position which is treated like
903                  * a single-byte sequence for better character boundary
904                  * resynchronization after illegal sequences.
905                  */
906                 *pDest++=(UChar)ch;
907                 continue;
908             } else if(ch < 0xe0) { /* U+0080..U+07FF */
909                 if(pSrc < pSrcLimit) {
910                     /* 0x3080 = (0xc0 << 6) + 0x80 */
911                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
912                     continue;
913                 }
914             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
915                 if((pSrcLimit - pSrc) >= 2) {
916                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
917                     /* 0x2080 = (0x80 << 6) + 0x80 */
918                     ch = (ch << 12) + (*pSrc++ << 6);
919                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
920                     pSrc += 3;
921                     continue;
922                 }
923             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
924                 if((pSrcLimit - pSrc) >= 3) {
925                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
926                     ch = (ch << 18) + (*pSrc++ << 12);
927                     ch += *pSrc++ << 6;
928                     ch += *pSrc++ - 0x3c82080;
929                     *(pDest++) = U16_LEAD(ch);
930                     *(pDest++) = U16_TRAIL(ch);
931                     pSrc += 4;
932                     continue;
933                 }
934             }
935 
936             /* truncated character at the end */
937             *pDest++ = 0xfffd;
938             break;
939         }
940     }
941 
942     reqLength+=(int32_t)(pDest - dest);
943 
944     if(pDestLength){
945         *pDestLength = reqLength;
946     }
947 
948     /* Terminate the buffer */
949     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
950 
951     return dest;
952 }
953 
954 static inline uint8_t *
_appendUTF8(uint8_t * pDest,UChar32 c)955 _appendUTF8(uint8_t *pDest, UChar32 c) {
956     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
957     if((c)<=0x7f) {
958         *pDest++=(uint8_t)c;
959     } else if(c<=0x7ff) {
960         *pDest++=(uint8_t)((c>>6)|0xc0);
961         *pDest++=(uint8_t)((c&0x3f)|0x80);
962     } else if(c<=0xffff) {
963         *pDest++=(uint8_t)((c>>12)|0xe0);
964         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
965         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
966     } else /* if((uint32_t)(c)<=0x10ffff) */ {
967         *pDest++=(uint8_t)(((c)>>18)|0xf0);
968         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
969         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
970         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
971     }
972     return pDest;
973 }
974 
975 
976 U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)977 u_strToUTF8WithSub(char *dest,
978             int32_t destCapacity,
979             int32_t *pDestLength,
980             const UChar *pSrc,
981             int32_t srcLength,
982             UChar32 subchar, int32_t *pNumSubstitutions,
983             UErrorCode *pErrorCode){
984     int32_t reqLength=0;
985     uint32_t ch=0,ch2=0;
986     uint8_t *pDest = (uint8_t *)dest;
987     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
988     int32_t numSubstitutions;
989 
990     /* args check */
991     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
992         return NULL;
993     }
994 
995     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
996         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
997         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
998     ) {
999         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1000         return NULL;
1001     }
1002 
1003     if(pNumSubstitutions!=NULL) {
1004         *pNumSubstitutions=0;
1005     }
1006     numSubstitutions=0;
1007 
1008     if(srcLength==-1) {
1009         while((ch=*pSrc)!=0) {
1010             ++pSrc;
1011             if(ch <= 0x7f) {
1012                 if(pDest<pDestLimit) {
1013                     *pDest++ = (uint8_t)ch;
1014                 } else {
1015                     reqLength = 1;
1016                     break;
1017                 }
1018             } else if(ch <= 0x7ff) {
1019                 if((pDestLimit - pDest) >= 2) {
1020                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1021                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1022                 } else {
1023                     reqLength = 2;
1024                     break;
1025                 }
1026             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1027                 if((pDestLimit - pDest) >= 3) {
1028                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1029                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1030                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1031                 } else {
1032                     reqLength = 3;
1033                     break;
1034                 }
1035             } else /* ch is a surrogate */ {
1036                 int32_t length;
1037 
1038                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1039                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1040                     ++pSrc;
1041                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1042                 } else if(subchar>=0) {
1043                     ch=subchar;
1044                     ++numSubstitutions;
1045                 } else {
1046                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1047                     *pErrorCode = U_INVALID_CHAR_FOUND;
1048                     return NULL;
1049                 }
1050 
1051                 length = U8_LENGTH(ch);
1052                 if((pDestLimit - pDest) >= length) {
1053                     /* convert and append*/
1054                     pDest=_appendUTF8(pDest, ch);
1055                 } else {
1056                     reqLength = length;
1057                     break;
1058                 }
1059             }
1060         }
1061         while((ch=*pSrc++)!=0) {
1062             if(ch<=0x7f) {
1063                 ++reqLength;
1064             } else if(ch<=0x7ff) {
1065                 reqLength+=2;
1066             } else if(!U16_IS_SURROGATE(ch)) {
1067                 reqLength+=3;
1068             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1069                 ++pSrc;
1070                 reqLength+=4;
1071             } else if(subchar>=0) {
1072                 reqLength+=U8_LENGTH(subchar);
1073                 ++numSubstitutions;
1074             } else {
1075                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1076                 *pErrorCode = U_INVALID_CHAR_FOUND;
1077                 return NULL;
1078             }
1079         }
1080     } else {
1081         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1082         int32_t count;
1083 
1084         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1085         for(;;) {
1086             /*
1087              * Each iteration of the inner loop progresses by at most 3 UTF-8
1088              * bytes and one UChar, for most characters.
1089              * For supplementary code points (4 & 2), which are rare,
1090              * there is an additional adjustment.
1091              */
1092             count = (int32_t)((pDestLimit - pDest) / 3);
1093             srcLength = (int32_t)(pSrcLimit - pSrc);
1094             if(count > srcLength) {
1095                 count = srcLength; /* min(remaining dest/3, remaining src) */
1096             }
1097             if(count < 3) {
1098                 /*
1099                  * Too much overhead if we get near the end of the string,
1100                  * continue with the next loop.
1101                  */
1102                 break;
1103             }
1104             do {
1105                 ch=*pSrc++;
1106                 if(ch <= 0x7f) {
1107                     *pDest++ = (uint8_t)ch;
1108                 } else if(ch <= 0x7ff) {
1109                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1110                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1111                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1112                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1113                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1114                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1115                 } else /* ch is a surrogate */ {
1116                     /*
1117                      * We will read two UChars and probably output four bytes,
1118                      * which we didn't account for with computing count,
1119                      * so we adjust it here.
1120                      */
1121                     if(--count == 0) {
1122                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1123                         break;  /* recompute count */
1124                     }
1125 
1126                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1127                         ++pSrc;
1128                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1129 
1130                         /* writing 4 bytes per 2 UChars is ok */
1131                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1132                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1133                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1134                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1135                     } else  {
1136                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1137                         if(subchar>=0) {
1138                             ch=subchar;
1139                             ++numSubstitutions;
1140                         } else {
1141                             *pErrorCode = U_INVALID_CHAR_FOUND;
1142                             return NULL;
1143                         }
1144 
1145                         /* convert and append*/
1146                         pDest=_appendUTF8(pDest, ch);
1147                     }
1148                 }
1149             } while(--count > 0);
1150         }
1151 
1152         while(pSrc<pSrcLimit) {
1153             ch=*pSrc++;
1154             if(ch <= 0x7f) {
1155                 if(pDest<pDestLimit) {
1156                     *pDest++ = (uint8_t)ch;
1157                 } else {
1158                     reqLength = 1;
1159                     break;
1160                 }
1161             } else if(ch <= 0x7ff) {
1162                 if((pDestLimit - pDest) >= 2) {
1163                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1164                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1165                 } else {
1166                     reqLength = 2;
1167                     break;
1168                 }
1169             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1170                 if((pDestLimit - pDest) >= 3) {
1171                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1172                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1173                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1174                 } else {
1175                     reqLength = 3;
1176                     break;
1177                 }
1178             } else /* ch is a surrogate */ {
1179                 int32_t length;
1180 
1181                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1182                     ++pSrc;
1183                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1184                 } else if(subchar>=0) {
1185                     ch=subchar;
1186                     ++numSubstitutions;
1187                 } else {
1188                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1189                     *pErrorCode = U_INVALID_CHAR_FOUND;
1190                     return NULL;
1191                 }
1192 
1193                 length = U8_LENGTH(ch);
1194                 if((pDestLimit - pDest) >= length) {
1195                     /* convert and append*/
1196                     pDest=_appendUTF8(pDest, ch);
1197                 } else {
1198                     reqLength = length;
1199                     break;
1200                 }
1201             }
1202         }
1203         while(pSrc<pSrcLimit) {
1204             ch=*pSrc++;
1205             if(ch<=0x7f) {
1206                 ++reqLength;
1207             } else if(ch<=0x7ff) {
1208                 reqLength+=2;
1209             } else if(!U16_IS_SURROGATE(ch)) {
1210                 reqLength+=3;
1211             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1212                 ++pSrc;
1213                 reqLength+=4;
1214             } else if(subchar>=0) {
1215                 reqLength+=U8_LENGTH(subchar);
1216                 ++numSubstitutions;
1217             } else {
1218                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1219                 *pErrorCode = U_INVALID_CHAR_FOUND;
1220                 return NULL;
1221             }
1222         }
1223     }
1224 
1225     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1226 
1227     if(pNumSubstitutions!=NULL) {
1228         *pNumSubstitutions=numSubstitutions;
1229     }
1230 
1231     if(pDestLength){
1232         *pDestLength = reqLength;
1233     }
1234 
1235     /* Terminate the buffer */
1236     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1237     return dest;
1238 }
1239 
1240 U_CAPI char* U_EXPORT2
u_strToUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UErrorCode * pErrorCode)1241 u_strToUTF8(char *dest,
1242             int32_t destCapacity,
1243             int32_t *pDestLength,
1244             const UChar *pSrc,
1245             int32_t srcLength,
1246             UErrorCode *pErrorCode){
1247     return u_strToUTF8WithSub(
1248             dest, destCapacity, pDestLength,
1249             pSrc, srcLength,
1250             U_SENTINEL, NULL,
1251             pErrorCode);
1252 }
1253 
1254 U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)1255 u_strFromJavaModifiedUTF8WithSub(
1256         UChar *dest,
1257         int32_t destCapacity,
1258         int32_t *pDestLength,
1259         const char *src,
1260         int32_t srcLength,
1261         UChar32 subchar, int32_t *pNumSubstitutions,
1262         UErrorCode *pErrorCode) {
1263     UChar *pDest = dest;
1264     UChar *pDestLimit = dest+destCapacity;
1265     UChar32 ch;
1266     int32_t reqLength = 0;
1267     const uint8_t* pSrc = (const uint8_t*) src;
1268     const uint8_t *pSrcLimit;
1269     int32_t count;
1270     uint8_t t1, t2; /* trail bytes */
1271     int32_t numSubstitutions;
1272 
1273     /* args check */
1274     if(U_FAILURE(*pErrorCode)){
1275         return NULL;
1276     }
1277     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1278         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1279         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1280     ) {
1281         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1282         return NULL;
1283     }
1284 
1285     if(pNumSubstitutions!=NULL) {
1286         *pNumSubstitutions=0;
1287     }
1288     numSubstitutions=0;
1289 
1290     if(srcLength < 0) {
1291         /*
1292          * Transform a NUL-terminated ASCII string.
1293          * Handle non-ASCII strings with slower code.
1294          */
1295         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1296             *pDest++=(UChar)ch;
1297             ++pSrc;
1298         }
1299         if(ch == 0) {
1300             reqLength=(int32_t)(pDest - dest);
1301             if(pDestLength) {
1302                 *pDestLength = reqLength;
1303             }
1304 
1305             /* Terminate the buffer */
1306             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1307             return dest;
1308         }
1309         srcLength = uprv_strlen((const char *)pSrc);
1310     }
1311 
1312     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1313     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1314     for(;;) {
1315         count = (int32_t)(pDestLimit - pDest);
1316         srcLength = (int32_t)(pSrcLimit - pSrc);
1317         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1318             /* fast ASCII loop */
1319             const uint8_t *prevSrc = pSrc;
1320             int32_t delta;
1321             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1322                 *pDest++=(UChar)ch;
1323                 ++pSrc;
1324             }
1325             delta = (int32_t)(pSrc - prevSrc);
1326             count -= delta;
1327             srcLength -= delta;
1328         }
1329         /*
1330          * Each iteration of the inner loop progresses by at most 3 UTF-8
1331          * bytes and one UChar.
1332          */
1333         srcLength /= 3;
1334         if(count > srcLength) {
1335             count = srcLength; /* min(remaining dest, remaining src/3) */
1336         }
1337         if(count < 3) {
1338             /*
1339              * Too much overhead if we get near the end of the string,
1340              * continue with the next loop.
1341              */
1342             break;
1343         }
1344         do {
1345             ch = *pSrc;
1346             if(ch <= 0x7f){
1347                 *pDest++=(UChar)ch;
1348                 ++pSrc;
1349             } else {
1350                 if(ch >= 0xe0) {
1351                     if( /* handle U+0000..U+FFFF inline */
1352                         ch <= 0xef &&
1353                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1354                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1355                     ) {
1356                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1357                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1358                         pSrc += 3;
1359                         continue;
1360                     }
1361                 } else {
1362                     if( /* handle U+0000..U+07FF inline */
1363                         ch >= 0xc0 &&
1364                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1365                     ) {
1366                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1367                         pSrc += 2;
1368                         continue;
1369                     }
1370                 }
1371 
1372                 if(subchar < 0) {
1373                     *pErrorCode = U_INVALID_CHAR_FOUND;
1374                     return NULL;
1375                 } else if(subchar > 0xffff && --count == 0) {
1376                     /*
1377                      * We need to write two UChars, adjusted count for that,
1378                      * and ran out of space.
1379                      */
1380                     break;
1381                 } else {
1382                     /* function call for error cases */
1383                     ++pSrc; /* continue after the lead byte */
1384                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1385                     ++numSubstitutions;
1386                     if(subchar<=0xFFFF) {
1387                         *(pDest++)=(UChar)subchar;
1388                     } else {
1389                         *(pDest++)=U16_LEAD(subchar);
1390                         *(pDest++)=U16_TRAIL(subchar);
1391                     }
1392                 }
1393             }
1394         } while(--count > 0);
1395     }
1396 
1397     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1398         ch = *pSrc;
1399         if(ch <= 0x7f){
1400             *pDest++=(UChar)ch;
1401             ++pSrc;
1402         } else {
1403             if(ch >= 0xe0) {
1404                 if( /* handle U+0000..U+FFFF inline */
1405                     ch <= 0xef &&
1406                     ((pSrcLimit - pSrc) >= 3) &&
1407                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1408                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1409                 ) {
1410                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1411                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1412                     pSrc += 3;
1413                     continue;
1414                 }
1415             } else {
1416                 if( /* handle U+0000..U+07FF inline */
1417                     ch >= 0xc0 &&
1418                     ((pSrcLimit - pSrc) >= 2) &&
1419                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1420                 ) {
1421                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1422                     pSrc += 2;
1423                     continue;
1424                 }
1425             }
1426 
1427             if(subchar < 0) {
1428                 *pErrorCode = U_INVALID_CHAR_FOUND;
1429                 return NULL;
1430             } else {
1431                 /* function call for error cases */
1432                 ++pSrc; /* continue after the lead byte */
1433                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1434                 ++numSubstitutions;
1435                 if(subchar<=0xFFFF) {
1436                     *(pDest++)=(UChar)subchar;
1437                 } else {
1438                     *(pDest++)=U16_LEAD(subchar);
1439                     if(pDest<pDestLimit) {
1440                         *(pDest++)=U16_TRAIL(subchar);
1441                     } else {
1442                         reqLength++;
1443                         break;
1444                     }
1445                 }
1446             }
1447         }
1448     }
1449 
1450     /* do not fill the dest buffer just count the UChars needed */
1451     while(pSrc < pSrcLimit){
1452         ch = *pSrc;
1453         if(ch <= 0x7f) {
1454             reqLength++;
1455             ++pSrc;
1456         } else {
1457             if(ch >= 0xe0) {
1458                 if( /* handle U+0000..U+FFFF inline */
1459                     ch <= 0xef &&
1460                     ((pSrcLimit - pSrc) >= 3) &&
1461                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1462                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1463                 ) {
1464                     reqLength++;
1465                     pSrc += 3;
1466                     continue;
1467                 }
1468             } else {
1469                 if( /* handle U+0000..U+07FF inline */
1470                     ch >= 0xc0 &&
1471                     ((pSrcLimit - pSrc) >= 2) &&
1472                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1473                 ) {
1474                     reqLength++;
1475                     pSrc += 2;
1476                     continue;
1477                 }
1478             }
1479 
1480             if(subchar < 0) {
1481                 *pErrorCode = U_INVALID_CHAR_FOUND;
1482                 return NULL;
1483             } else {
1484                 /* function call for error cases */
1485                 ++pSrc; /* continue after the lead byte */
1486                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1487                 ++numSubstitutions;
1488                 reqLength+=U16_LENGTH(ch);
1489             }
1490         }
1491     }
1492 
1493     if(pNumSubstitutions!=NULL) {
1494         *pNumSubstitutions=numSubstitutions;
1495     }
1496 
1497     reqLength+=(int32_t)(pDest - dest);
1498     if(pDestLength) {
1499         *pDestLength = reqLength;
1500     }
1501 
1502     /* Terminate the buffer */
1503     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1504     return dest;
1505 }
1506 
1507 U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1508 u_strToJavaModifiedUTF8(
1509         char *dest,
1510         int32_t destCapacity,
1511         int32_t *pDestLength,
1512         const UChar *src,
1513         int32_t srcLength,
1514         UErrorCode *pErrorCode) {
1515     int32_t reqLength=0;
1516     uint32_t ch=0;
1517     uint8_t *pDest = (uint8_t *)dest;
1518     uint8_t *pDestLimit = pDest + destCapacity;
1519     const UChar *pSrcLimit;
1520     int32_t count;
1521 
1522     /* args check */
1523     if(U_FAILURE(*pErrorCode)){
1524         return NULL;
1525     }
1526     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1527         (dest==NULL && destCapacity!=0) || destCapacity<0
1528     ) {
1529         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1530         return NULL;
1531     }
1532 
1533     if(srcLength==-1) {
1534         /* Convert NUL-terminated ASCII, then find the string length. */
1535         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1536             *pDest++ = (uint8_t)ch;
1537             ++src;
1538         }
1539         if(ch == 0) {
1540             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1541             if(pDestLength) {
1542                 *pDestLength = reqLength;
1543             }
1544 
1545             /* Terminate the buffer */
1546             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1547             return dest;
1548         }
1549         srcLength = u_strlen(src);
1550     }
1551 
1552     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1553     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1554     for(;;) {
1555         count = (int32_t)(pDestLimit - pDest);
1556         srcLength = (int32_t)(pSrcLimit - src);
1557         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1558             /* fast ASCII loop */
1559             const UChar *prevSrc = src;
1560             int32_t delta;
1561             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1562                 *pDest++=(uint8_t)ch;
1563                 ++src;
1564             }
1565             delta = (int32_t)(src - prevSrc);
1566             count -= delta;
1567             srcLength -= delta;
1568         }
1569         /*
1570          * Each iteration of the inner loop progresses by at most 3 UTF-8
1571          * bytes and one UChar.
1572          */
1573         count /= 3;
1574         if(count > srcLength) {
1575             count = srcLength; /* min(remaining dest/3, remaining src) */
1576         }
1577         if(count < 3) {
1578             /*
1579              * Too much overhead if we get near the end of the string,
1580              * continue with the next loop.
1581              */
1582             break;
1583         }
1584         do {
1585             ch=*src++;
1586             if(ch <= 0x7f && ch != 0) {
1587                 *pDest++ = (uint8_t)ch;
1588             } else if(ch <= 0x7ff) {
1589                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1590                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1591             } else {
1592                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1593                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1594                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1595             }
1596         } while(--count > 0);
1597     }
1598 
1599     while(src<pSrcLimit) {
1600         ch=*src++;
1601         if(ch <= 0x7f && ch != 0) {
1602             if(pDest<pDestLimit) {
1603                 *pDest++ = (uint8_t)ch;
1604             } else {
1605                 reqLength = 1;
1606                 break;
1607             }
1608         } else if(ch <= 0x7ff) {
1609             if((pDestLimit - pDest) >= 2) {
1610                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1611                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1612             } else {
1613                 reqLength = 2;
1614                 break;
1615             }
1616         } else {
1617             if((pDestLimit - pDest) >= 3) {
1618                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1619                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1620                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1621             } else {
1622                 reqLength = 3;
1623                 break;
1624             }
1625         }
1626     }
1627     while(src<pSrcLimit) {
1628         ch=*src++;
1629         if(ch <= 0x7f && ch != 0) {
1630             ++reqLength;
1631         } else if(ch<=0x7ff) {
1632             reqLength+=2;
1633         } else {
1634             reqLength+=3;
1635         }
1636     }
1637 
1638     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1639     if(pDestLength){
1640         *pDestLength = reqLength;
1641     }
1642 
1643     /* Terminate the buffer */
1644     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1645     return dest;
1646 }
1647