1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 *
9 * File ustrtrns.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 9/10/2001 Ram Creation.
15 ******************************************************************************
16 */
17
18 /*******************************************************************************
19 *
20 * u_strTo* and u_strFrom* APIs
21 * WCS functions moved to ustr_wcs.c for better modularization
22 *
23 *******************************************************************************
24 */
25
26
27 #include "unicode/putil.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "ustr_imp.h"
32
33 U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)34 u_strFromUTF32WithSub(UChar *dest,
35 int32_t destCapacity,
36 int32_t *pDestLength,
37 const UChar32 *src,
38 int32_t srcLength,
39 UChar32 subchar, int32_t *pNumSubstitutions,
40 UErrorCode *pErrorCode) {
41 const UChar32 *srcLimit;
42 UChar32 ch;
43 UChar *destLimit;
44 UChar *pDest;
45 int32_t reqLength;
46 int32_t numSubstitutions;
47
48 /* args check */
49 if(U_FAILURE(*pErrorCode)){
50 return NULL;
51 }
52 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
53 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
54 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
55 ) {
56 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
57 return NULL;
58 }
59
60 if(pNumSubstitutions != NULL) {
61 *pNumSubstitutions = 0;
62 }
63
64 pDest = dest;
65 destLimit = dest + destCapacity;
66 reqLength = 0;
67 numSubstitutions = 0;
68
69 if(srcLength < 0) {
70 /* simple loop for conversion of a NUL-terminated BMP string */
71 while((ch=*src) != 0 &&
72 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
73 ++src;
74 if(pDest < destLimit) {
75 *pDest++ = (UChar)ch;
76 } else {
77 ++reqLength;
78 }
79 }
80 srcLimit = src;
81 if(ch != 0) {
82 /* "complicated" case, find the end of the remaining string */
83 while(*++srcLimit != 0) {}
84 }
85 } else {
86 srcLimit = src + srcLength;
87 }
88
89 /* convert with length */
90 while(src < srcLimit) {
91 ch = *src++;
92 do {
93 /* usually "loops" once; twice only for writing subchar */
94 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
95 if(pDest < destLimit) {
96 *pDest++ = (UChar)ch;
97 } else {
98 ++reqLength;
99 }
100 break;
101 } else if(0x10000 <= ch && ch <= 0x10ffff) {
102 if((pDest + 2) <= destLimit) {
103 *pDest++ = U16_LEAD(ch);
104 *pDest++ = U16_TRAIL(ch);
105 } else {
106 reqLength += 2;
107 }
108 break;
109 } else if((ch = subchar) < 0) {
110 /* surrogate code point, or not a Unicode code point at all */
111 *pErrorCode = U_INVALID_CHAR_FOUND;
112 return NULL;
113 } else {
114 ++numSubstitutions;
115 }
116 } while(TRUE);
117 }
118
119 reqLength += (int32_t)(pDest - dest);
120 if(pDestLength) {
121 *pDestLength = reqLength;
122 }
123 if(pNumSubstitutions != NULL) {
124 *pNumSubstitutions = numSubstitutions;
125 }
126
127 /* Terminate the buffer */
128 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
129
130 return dest;
131 }
132
133 U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UErrorCode * pErrorCode)134 u_strFromUTF32(UChar *dest,
135 int32_t destCapacity,
136 int32_t *pDestLength,
137 const UChar32 *src,
138 int32_t srcLength,
139 UErrorCode *pErrorCode) {
140 return u_strFromUTF32WithSub(
141 dest, destCapacity, pDestLength,
142 src, srcLength,
143 U_SENTINEL, NULL,
144 pErrorCode);
145 }
146
147 U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)148 u_strToUTF32WithSub(UChar32 *dest,
149 int32_t destCapacity,
150 int32_t *pDestLength,
151 const UChar *src,
152 int32_t srcLength,
153 UChar32 subchar, int32_t *pNumSubstitutions,
154 UErrorCode *pErrorCode) {
155 const UChar *srcLimit;
156 UChar32 ch;
157 UChar ch2;
158 UChar32 *destLimit;
159 UChar32 *pDest;
160 int32_t reqLength;
161 int32_t numSubstitutions;
162
163 /* args check */
164 if(U_FAILURE(*pErrorCode)){
165 return NULL;
166 }
167 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
168 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
169 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
170 ) {
171 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
172 return NULL;
173 }
174
175 if(pNumSubstitutions != NULL) {
176 *pNumSubstitutions = 0;
177 }
178
179 pDest = dest;
180 destLimit = dest + destCapacity;
181 reqLength = 0;
182 numSubstitutions = 0;
183
184 if(srcLength < 0) {
185 /* simple loop for conversion of a NUL-terminated BMP string */
186 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
187 ++src;
188 if(pDest < destLimit) {
189 *pDest++ = ch;
190 } else {
191 ++reqLength;
192 }
193 }
194 srcLimit = src;
195 if(ch != 0) {
196 /* "complicated" case, find the end of the remaining string */
197 while(*++srcLimit != 0) {}
198 }
199 } else {
200 srcLimit = src + srcLength;
201 }
202
203 /* convert with length */
204 while(src < srcLimit) {
205 ch = *src++;
206 if(!U16_IS_SURROGATE(ch)) {
207 /* write or count ch below */
208 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
209 ++src;
210 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
211 } else if((ch = subchar) < 0) {
212 /* unpaired surrogate */
213 *pErrorCode = U_INVALID_CHAR_FOUND;
214 return NULL;
215 } else {
216 ++numSubstitutions;
217 }
218 if(pDest < destLimit) {
219 *pDest++ = ch;
220 } else {
221 ++reqLength;
222 }
223 }
224
225 reqLength += (int32_t)(pDest - dest);
226 if(pDestLength) {
227 *pDestLength = reqLength;
228 }
229 if(pNumSubstitutions != NULL) {
230 *pNumSubstitutions = numSubstitutions;
231 }
232
233 /* Terminate the buffer */
234 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
235
236 return dest;
237 }
238
239 U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)240 u_strToUTF32(UChar32 *dest,
241 int32_t destCapacity,
242 int32_t *pDestLength,
243 const UChar *src,
244 int32_t srcLength,
245 UErrorCode *pErrorCode) {
246 return u_strToUTF32WithSub(
247 dest, destCapacity, pDestLength,
248 src, srcLength,
249 U_SENTINEL, NULL,
250 pErrorCode);
251 }
252
253 /* for utf8_nextCharSafeBodyTerminated() */
254 static const UChar32
255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
256
257 /*
258 * Version of utf8_nextCharSafeBody() with the following differences:
259 * - checks for NUL termination instead of length
260 * - works with pointers instead of indexes
261 * - always strict (strict==-1)
262 *
263 * *ps points to after the lead byte and will be moved to after the last trail byte.
264 * c is the lead byte.
265 * @return the code point, or U_SENTINEL
266 */
267 static UChar32
utf8_nextCharSafeBodyTerminated(const uint8_t ** ps,UChar32 c)268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
269 const uint8_t *s=*ps;
270 uint8_t trail, illegal=0;
271 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
272 UTF8_MASK_LEAD_BYTE((c), count);
273 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
274 switch(count) {
275 /* each branch falls through to the next one */
276 case 5:
277 case 4:
278 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
279 illegal=1;
280 break;
281 case 3:
282 trail=(uint8_t)(*s++ - 0x80);
283 c=(c<<6)|trail;
284 if(trail>0x3f || c>=0x110) {
285 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
286 illegal=1;
287 break;
288 }
289 case 2:
290 trail=(uint8_t)(*s++ - 0x80);
291 if(trail>0x3f) {
292 /* not a trail byte */
293 illegal=1;
294 break;
295 }
296 c=(c<<6)|trail;
297 case 1:
298 trail=(uint8_t)(*s++ - 0x80);
299 if(trail>0x3f) {
300 /* not a trail byte */
301 illegal=1;
302 }
303 c=(c<<6)|trail;
304 break;
305 case 0:
306 return U_SENTINEL;
307 /* no default branch to optimize switch() - all values are covered */
308 }
309
310 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
311 /* illegal is also set if count>=4 */
312 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
313 /* error handling */
314 /* don't go beyond this sequence */
315 s=*ps;
316 while(count>0 && UTF8_IS_TRAIL(*s)) {
317 ++s;
318 --count;
319 }
320 c=U_SENTINEL;
321 }
322 *ps=s;
323 return c;
324 }
325
326 /*
327 * Version of utf8_nextCharSafeBody() with the following differences:
328 * - works with pointers instead of indexes
329 * - always strict (strict==-1)
330 *
331 * *ps points to after the lead byte and will be moved to after the last trail byte.
332 * c is the lead byte.
333 * @return the code point, or U_SENTINEL
334 */
335 static UChar32
utf8_nextCharSafeBodyPointer(const uint8_t ** ps,const uint8_t * limit,UChar32 c)336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
337 const uint8_t *s=*ps;
338 uint8_t trail, illegal=0;
339 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
340 if((limit-s)>=count) {
341 UTF8_MASK_LEAD_BYTE((c), count);
342 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
343 switch(count) {
344 /* each branch falls through to the next one */
345 case 5:
346 case 4:
347 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
348 illegal=1;
349 break;
350 case 3:
351 trail=*s++;
352 c=(c<<6)|(trail&0x3f);
353 if(c<0x110) {
354 illegal|=(trail&0xc0)^0x80;
355 } else {
356 /* code point>0x10ffff, outside Unicode */
357 illegal=1;
358 break;
359 }
360 case 2:
361 trail=*s++;
362 c=(c<<6)|(trail&0x3f);
363 illegal|=(trail&0xc0)^0x80;
364 case 1:
365 trail=*s++;
366 c=(c<<6)|(trail&0x3f);
367 illegal|=(trail&0xc0)^0x80;
368 break;
369 case 0:
370 return U_SENTINEL;
371 /* no default branch to optimize switch() - all values are covered */
372 }
373 } else {
374 illegal=1; /* too few bytes left */
375 }
376
377 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
378 /* illegal is also set if count>=4 */
379 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
380 /* error handling */
381 /* don't go beyond this sequence */
382 s=*ps;
383 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
384 ++s;
385 --count;
386 }
387 c=U_SENTINEL;
388 }
389 *ps=s;
390 return c;
391 }
392
393 U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)394 u_strFromUTF8WithSub(UChar *dest,
395 int32_t destCapacity,
396 int32_t *pDestLength,
397 const char* src,
398 int32_t srcLength,
399 UChar32 subchar, int32_t *pNumSubstitutions,
400 UErrorCode *pErrorCode){
401 UChar *pDest = dest;
402 UChar *pDestLimit = dest+destCapacity;
403 UChar32 ch;
404 int32_t reqLength = 0;
405 const uint8_t* pSrc = (const uint8_t*) src;
406 uint8_t t1, t2; /* trail bytes */
407 int32_t numSubstitutions;
408
409 /* args check */
410 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
411 return NULL;
412 }
413
414 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
415 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
416 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
417 ) {
418 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
419 return NULL;
420 }
421
422 if(pNumSubstitutions!=NULL) {
423 *pNumSubstitutions=0;
424 }
425 numSubstitutions=0;
426
427 /*
428 * Inline processing of UTF-8 byte sequences:
429 *
430 * Byte sequences for the most common characters are handled inline in
431 * the conversion loops. In order to reduce the path lengths for those
432 * characters, the tests are arranged in a kind of binary search.
433 * ASCII (<=0x7f) is checked first, followed by the dividing point
434 * between 2- and 3-byte sequences (0xe0).
435 * The 3-byte branch is tested first to speed up CJK text.
436 * The compiler should combine the subtractions for the two tests for 0xe0.
437 * Each branch then tests for the other end of its range.
438 */
439
440 if(srcLength < 0){
441 /*
442 * Transform a NUL-terminated string.
443 * The code explicitly checks for NULs only in the lead byte position.
444 * A NUL byte in the trail byte position fails the trail byte range check anyway.
445 */
446 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
447 if(ch <= 0x7f){
448 *pDest++=(UChar)ch;
449 ++pSrc;
450 } else {
451 if(ch > 0xe0) {
452 if( /* handle U+1000..U+CFFF inline */
453 ch <= 0xec &&
454 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
455 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
456 ) {
457 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
458 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
459 pSrc += 3;
460 continue;
461 }
462 } else if(ch < 0xe0) {
463 if( /* handle U+0080..U+07FF inline */
464 ch >= 0xc2 &&
465 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
466 ) {
467 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
468 pSrc += 2;
469 continue;
470 }
471 }
472
473 /* function call for "complicated" and error cases */
474 ++pSrc; /* continue after the lead byte */
475 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
476 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
477 *pErrorCode = U_INVALID_CHAR_FOUND;
478 return NULL;
479 } else if(ch<=0xFFFF) {
480 *(pDest++)=(UChar)ch;
481 } else {
482 *(pDest++)=UTF16_LEAD(ch);
483 if(pDest<pDestLimit) {
484 *(pDest++)=UTF16_TRAIL(ch);
485 } else {
486 reqLength++;
487 break;
488 }
489 }
490 }
491 }
492
493 /* Pre-flight the rest of the string. */
494 while((ch = *pSrc) != 0) {
495 if(ch <= 0x7f){
496 ++reqLength;
497 ++pSrc;
498 } else {
499 if(ch > 0xe0) {
500 if( /* handle U+1000..U+CFFF inline */
501 ch <= 0xec &&
502 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
503 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
504 ) {
505 ++reqLength;
506 pSrc += 3;
507 continue;
508 }
509 } else if(ch < 0xe0) {
510 if( /* handle U+0080..U+07FF inline */
511 ch >= 0xc2 &&
512 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
513 ) {
514 ++reqLength;
515 pSrc += 2;
516 continue;
517 }
518 }
519
520 /* function call for "complicated" and error cases */
521 ++pSrc; /* continue after the lead byte */
522 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
523 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
524 *pErrorCode = U_INVALID_CHAR_FOUND;
525 return NULL;
526 }
527 reqLength += U16_LENGTH(ch);
528 }
529 }
530 } else /* srcLength >= 0 */ {
531 const uint8_t *pSrcLimit = pSrc + srcLength;
532 int32_t count;
533
534 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
535 for(;;) {
536 /*
537 * Each iteration of the inner loop progresses by at most 3 UTF-8
538 * bytes and one UChar, for most characters.
539 * For supplementary code points (4 & 2), which are rare,
540 * there is an additional adjustment.
541 */
542 count = (int32_t)(pDestLimit - pDest);
543 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
544 if(count > srcLength) {
545 count = srcLength; /* min(remaining dest, remaining src/3) */
546 }
547 if(count < 3) {
548 /*
549 * Too much overhead if we get near the end of the string,
550 * continue with the next loop.
551 */
552 break;
553 }
554
555 do {
556 ch = *pSrc;
557 if(ch <= 0x7f){
558 *pDest++=(UChar)ch;
559 ++pSrc;
560 } else {
561 if(ch > 0xe0) {
562 if( /* handle U+1000..U+CFFF inline */
563 ch <= 0xec &&
564 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
565 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
566 ) {
567 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
568 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
569 pSrc += 3;
570 continue;
571 }
572 } else if(ch < 0xe0) {
573 if( /* handle U+0080..U+07FF inline */
574 ch >= 0xc2 &&
575 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
576 ) {
577 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
578 pSrc += 2;
579 continue;
580 }
581 }
582
583 if(ch >= 0xf0 || subchar > 0xffff) {
584 /*
585 * We may read up to six bytes and write up to two UChars,
586 * which we didn't account for with computing count,
587 * so we adjust it here.
588 */
589 if(--count == 0) {
590 break;
591 }
592 }
593
594 /* function call for "complicated" and error cases */
595 ++pSrc; /* continue after the lead byte */
596 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
597 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
598 *pErrorCode = U_INVALID_CHAR_FOUND;
599 return NULL;
600 }else if(ch<=0xFFFF){
601 *(pDest++)=(UChar)ch;
602 }else{
603 *(pDest++)=UTF16_LEAD(ch);
604 *(pDest++)=UTF16_TRAIL(ch);
605 }
606 }
607 } while(--count > 0);
608 }
609
610 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
611 ch = *pSrc;
612 if(ch <= 0x7f){
613 *pDest++=(UChar)ch;
614 ++pSrc;
615 } else {
616 if(ch > 0xe0) {
617 if( /* handle U+1000..U+CFFF inline */
618 ch <= 0xec &&
619 ((pSrcLimit - pSrc) >= 3) &&
620 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
621 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
622 ) {
623 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
624 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
625 pSrc += 3;
626 continue;
627 }
628 } else if(ch < 0xe0) {
629 if( /* handle U+0080..U+07FF inline */
630 ch >= 0xc2 &&
631 ((pSrcLimit - pSrc) >= 2) &&
632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
633 ) {
634 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
635 pSrc += 2;
636 continue;
637 }
638 }
639
640 /* function call for "complicated" and error cases */
641 ++pSrc; /* continue after the lead byte */
642 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
643 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
644 *pErrorCode = U_INVALID_CHAR_FOUND;
645 return NULL;
646 }else if(ch<=0xFFFF){
647 *(pDest++)=(UChar)ch;
648 }else{
649 *(pDest++)=UTF16_LEAD(ch);
650 if(pDest<pDestLimit){
651 *(pDest++)=UTF16_TRAIL(ch);
652 }else{
653 reqLength++;
654 break;
655 }
656 }
657 }
658 }
659 /* do not fill the dest buffer just count the UChars needed */
660 while(pSrc < pSrcLimit){
661 ch = *pSrc;
662 if(ch <= 0x7f){
663 reqLength++;
664 ++pSrc;
665 } else {
666 if(ch > 0xe0) {
667 if( /* handle U+1000..U+CFFF inline */
668 ch <= 0xec &&
669 ((pSrcLimit - pSrc) >= 3) &&
670 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
671 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
672 ) {
673 reqLength++;
674 pSrc += 3;
675 continue;
676 }
677 } else if(ch < 0xe0) {
678 if( /* handle U+0080..U+07FF inline */
679 ch >= 0xc2 &&
680 ((pSrcLimit - pSrc) >= 2) &&
681 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
682 ) {
683 reqLength++;
684 pSrc += 2;
685 continue;
686 }
687 }
688
689 /* function call for "complicated" and error cases */
690 ++pSrc; /* continue after the lead byte */
691 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
692 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
693 *pErrorCode = U_INVALID_CHAR_FOUND;
694 return NULL;
695 }
696 reqLength+=UTF_CHAR_LENGTH(ch);
697 }
698 }
699 }
700
701 reqLength+=(int32_t)(pDest - dest);
702
703 if(pNumSubstitutions!=NULL) {
704 *pNumSubstitutions=numSubstitutions;
705 }
706
707 if(pDestLength){
708 *pDestLength = reqLength;
709 }
710
711 /* Terminate the buffer */
712 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
713
714 return dest;
715 }
716
717 U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)718 u_strFromUTF8(UChar *dest,
719 int32_t destCapacity,
720 int32_t *pDestLength,
721 const char* src,
722 int32_t srcLength,
723 UErrorCode *pErrorCode){
724 return u_strFromUTF8WithSub(
725 dest, destCapacity, pDestLength,
726 src, srcLength,
727 U_SENTINEL, NULL,
728 pErrorCode);
729 }
730
731 U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)732 u_strFromUTF8Lenient(UChar *dest,
733 int32_t destCapacity,
734 int32_t *pDestLength,
735 const char *src,
736 int32_t srcLength,
737 UErrorCode *pErrorCode) {
738 UChar *pDest = dest;
739 UChar32 ch;
740 int32_t reqLength = 0;
741 uint8_t* pSrc = (uint8_t*) src;
742
743 /* args check */
744 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
745 return NULL;
746 }
747
748 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
749 (destCapacity<0) || (dest == NULL && destCapacity > 0)
750 ) {
751 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
752 return NULL;
753 }
754
755 if(srcLength < 0) {
756 /* Transform a NUL-terminated string. */
757 UChar *pDestLimit = dest+destCapacity;
758 uint8_t t1, t2, t3; /* trail bytes */
759
760 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
761 if(ch < 0xc0) {
762 /*
763 * ASCII, or a trail byte in lead position which is treated like
764 * a single-byte sequence for better character boundary
765 * resynchronization after illegal sequences.
766 */
767 *pDest++=(UChar)ch;
768 ++pSrc;
769 continue;
770 } else if(ch < 0xe0) { /* U+0080..U+07FF */
771 if((t1 = pSrc[1]) != 0) {
772 /* 0x3080 = (0xc0 << 6) + 0x80 */
773 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
774 pSrc += 2;
775 continue;
776 }
777 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
778 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
779 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
780 /* 0x2080 = (0x80 << 6) + 0x80 */
781 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
782 pSrc += 3;
783 continue;
784 }
785 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
787 pSrc += 4;
788 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
789 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
790 *(pDest++) = U16_LEAD(ch);
791 if(pDest < pDestLimit) {
792 *(pDest++) = U16_TRAIL(ch);
793 } else {
794 reqLength = 1;
795 break;
796 }
797 continue;
798 }
799 }
800
801 /* truncated character at the end */
802 *pDest++ = 0xfffd;
803 while(*++pSrc != 0) {}
804 break;
805 }
806
807 /* Pre-flight the rest of the string. */
808 while((ch = *pSrc) != 0) {
809 if(ch < 0xc0) {
810 /*
811 * ASCII, or a trail byte in lead position which is treated like
812 * a single-byte sequence for better character boundary
813 * resynchronization after illegal sequences.
814 */
815 ++reqLength;
816 ++pSrc;
817 continue;
818 } else if(ch < 0xe0) { /* U+0080..U+07FF */
819 if(pSrc[1] != 0) {
820 ++reqLength;
821 pSrc += 2;
822 continue;
823 }
824 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
825 if(pSrc[1] != 0 && pSrc[2] != 0) {
826 ++reqLength;
827 pSrc += 3;
828 continue;
829 }
830 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
831 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
832 reqLength += 2;
833 pSrc += 4;
834 continue;
835 }
836 }
837
838 /* truncated character at the end */
839 ++reqLength;
840 break;
841 }
842 } else /* srcLength >= 0 */ {
843 const uint8_t *pSrcLimit = pSrc + srcLength;
844
845 /*
846 * This function requires that if srcLength is given, then it must be
847 * destCapatity >= srcLength so that we need not check for
848 * destination buffer overflow in the loop.
849 */
850 if(destCapacity < srcLength) {
851 if(pDestLength != NULL) {
852 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
853 }
854 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
855 return NULL;
856 }
857
858 if((pSrcLimit - pSrc) >= 4) {
859 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
860
861 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
862 do {
863 ch = *pSrc++;
864 if(ch < 0xc0) {
865 /*
866 * ASCII, or a trail byte in lead position which is treated like
867 * a single-byte sequence for better character boundary
868 * resynchronization after illegal sequences.
869 */
870 *pDest++=(UChar)ch;
871 } else if(ch < 0xe0) { /* U+0080..U+07FF */
872 /* 0x3080 = (0xc0 << 6) + 0x80 */
873 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
874 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
875 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
876 /* 0x2080 = (0x80 << 6) + 0x80 */
877 ch = (ch << 12) + (*pSrc++ << 6);
878 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
879 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
880 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
881 ch = (ch << 18) + (*pSrc++ << 12);
882 ch += *pSrc++ << 6;
883 ch += *pSrc++ - 0x3c82080;
884 *(pDest++) = U16_LEAD(ch);
885 *(pDest++) = U16_TRAIL(ch);
886 }
887 } while(pSrc < pSrcLimit);
888
889 pSrcLimit += 3; /* restore original pSrcLimit */
890 }
891
892 while(pSrc < pSrcLimit) {
893 ch = *pSrc++;
894 if(ch < 0xc0) {
895 /*
896 * ASCII, or a trail byte in lead position which is treated like
897 * a single-byte sequence for better character boundary
898 * resynchronization after illegal sequences.
899 */
900 *pDest++=(UChar)ch;
901 continue;
902 } else if(ch < 0xe0) { /* U+0080..U+07FF */
903 if(pSrc < pSrcLimit) {
904 /* 0x3080 = (0xc0 << 6) + 0x80 */
905 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
906 continue;
907 }
908 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
909 if((pSrcLimit - pSrc) >= 2) {
910 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
911 /* 0x2080 = (0x80 << 6) + 0x80 */
912 ch = (ch << 12) + (*pSrc++ << 6);
913 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
914 pSrc += 3;
915 continue;
916 }
917 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
918 if((pSrcLimit - pSrc) >= 3) {
919 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
920 ch = (ch << 18) + (*pSrc++ << 12);
921 ch += *pSrc++ << 6;
922 ch += *pSrc++ - 0x3c82080;
923 *(pDest++) = U16_LEAD(ch);
924 *(pDest++) = U16_TRAIL(ch);
925 pSrc += 4;
926 continue;
927 }
928 }
929
930 /* truncated character at the end */
931 *pDest++ = 0xfffd;
932 break;
933 }
934 }
935
936 reqLength+=(int32_t)(pDest - dest);
937
938 if(pDestLength){
939 *pDestLength = reqLength;
940 }
941
942 /* Terminate the buffer */
943 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
944
945 return dest;
946 }
947
948 static U_INLINE uint8_t *
_appendUTF8(uint8_t * pDest,UChar32 c)949 _appendUTF8(uint8_t *pDest, UChar32 c) {
950 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
951 if((c)<=0x7f) {
952 *pDest++=(uint8_t)c;
953 } else if(c<=0x7ff) {
954 *pDest++=(uint8_t)((c>>6)|0xc0);
955 *pDest++=(uint8_t)((c&0x3f)|0x80);
956 } else if(c<=0xffff) {
957 *pDest++=(uint8_t)((c>>12)|0xe0);
958 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
959 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
960 } else /* if((uint32_t)(c)<=0x10ffff) */ {
961 *pDest++=(uint8_t)(((c)>>18)|0xf0);
962 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
963 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
964 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
965 }
966 return pDest;
967 }
968
969
970 U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)971 u_strToUTF8WithSub(char *dest,
972 int32_t destCapacity,
973 int32_t *pDestLength,
974 const UChar *pSrc,
975 int32_t srcLength,
976 UChar32 subchar, int32_t *pNumSubstitutions,
977 UErrorCode *pErrorCode){
978 int32_t reqLength=0;
979 uint32_t ch=0,ch2=0;
980 uint8_t *pDest = (uint8_t *)dest;
981 uint8_t *pDestLimit = pDest + destCapacity;
982 int32_t numSubstitutions;
983
984 /* args check */
985 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
986 return NULL;
987 }
988
989 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
990 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
991 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
992 ) {
993 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
994 return NULL;
995 }
996
997 if(pNumSubstitutions!=NULL) {
998 *pNumSubstitutions=0;
999 }
1000 numSubstitutions=0;
1001
1002 if(srcLength==-1) {
1003 while((ch=*pSrc)!=0) {
1004 ++pSrc;
1005 if(ch <= 0x7f) {
1006 if(pDest<pDestLimit) {
1007 *pDest++ = (uint8_t)ch;
1008 } else {
1009 reqLength = 1;
1010 break;
1011 }
1012 } else if(ch <= 0x7ff) {
1013 if((pDestLimit - pDest) >= 2) {
1014 *pDest++=(uint8_t)((ch>>6)|0xc0);
1015 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1016 } else {
1017 reqLength = 2;
1018 break;
1019 }
1020 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1021 if((pDestLimit - pDest) >= 3) {
1022 *pDest++=(uint8_t)((ch>>12)|0xe0);
1023 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1024 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1025 } else {
1026 reqLength = 3;
1027 break;
1028 }
1029 } else /* ch is a surrogate */ {
1030 int32_t length;
1031
1032 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
1033 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1034 ++pSrc;
1035 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1036 } else if(subchar>=0) {
1037 ch=subchar;
1038 ++numSubstitutions;
1039 } else {
1040 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1041 *pErrorCode = U_INVALID_CHAR_FOUND;
1042 return NULL;
1043 }
1044
1045 length = U8_LENGTH(ch);
1046 if((pDestLimit - pDest) >= length) {
1047 /* convert and append*/
1048 pDest=_appendUTF8(pDest, ch);
1049 } else {
1050 reqLength = length;
1051 break;
1052 }
1053 }
1054 }
1055 while((ch=*pSrc++)!=0) {
1056 if(ch<=0x7f) {
1057 ++reqLength;
1058 } else if(ch<=0x7ff) {
1059 reqLength+=2;
1060 } else if(!UTF_IS_SURROGATE(ch)) {
1061 reqLength+=3;
1062 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1063 ++pSrc;
1064 reqLength+=4;
1065 } else if(subchar>=0) {
1066 reqLength+=U8_LENGTH(subchar);
1067 ++numSubstitutions;
1068 } else {
1069 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1070 *pErrorCode = U_INVALID_CHAR_FOUND;
1071 return NULL;
1072 }
1073 }
1074 } else {
1075 const UChar *pSrcLimit = pSrc+srcLength;
1076 int32_t count;
1077
1078 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1079 for(;;) {
1080 /*
1081 * Each iteration of the inner loop progresses by at most 3 UTF-8
1082 * bytes and one UChar, for most characters.
1083 * For supplementary code points (4 & 2), which are rare,
1084 * there is an additional adjustment.
1085 */
1086 count = (int32_t)((pDestLimit - pDest) / 3);
1087 srcLength = (int32_t)(pSrcLimit - pSrc);
1088 if(count > srcLength) {
1089 count = srcLength; /* min(remaining dest/3, remaining src) */
1090 }
1091 if(count < 3) {
1092 /*
1093 * Too much overhead if we get near the end of the string,
1094 * continue with the next loop.
1095 */
1096 break;
1097 }
1098 do {
1099 ch=*pSrc++;
1100 if(ch <= 0x7f) {
1101 *pDest++ = (uint8_t)ch;
1102 } else if(ch <= 0x7ff) {
1103 *pDest++=(uint8_t)((ch>>6)|0xc0);
1104 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1105 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1106 *pDest++=(uint8_t)((ch>>12)|0xe0);
1107 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1108 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1109 } else /* ch is a surrogate */ {
1110 /*
1111 * We will read two UChars and probably output four bytes,
1112 * which we didn't account for with computing count,
1113 * so we adjust it here.
1114 */
1115 if(--count == 0) {
1116 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1117 break; /* recompute count */
1118 }
1119
1120 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
1121 ++pSrc;
1122 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1123
1124 /* writing 4 bytes per 2 UChars is ok */
1125 *pDest++=(uint8_t)((ch>>18)|0xf0);
1126 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1127 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1128 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1129 } else {
1130 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1131 if(subchar>=0) {
1132 ch=subchar;
1133 ++numSubstitutions;
1134 } else {
1135 *pErrorCode = U_INVALID_CHAR_FOUND;
1136 return NULL;
1137 }
1138
1139 /* convert and append*/
1140 pDest=_appendUTF8(pDest, ch);
1141 }
1142 }
1143 } while(--count > 0);
1144 }
1145
1146 while(pSrc<pSrcLimit) {
1147 ch=*pSrc++;
1148 if(ch <= 0x7f) {
1149 if(pDest<pDestLimit) {
1150 *pDest++ = (uint8_t)ch;
1151 } else {
1152 reqLength = 1;
1153 break;
1154 }
1155 } else if(ch <= 0x7ff) {
1156 if((pDestLimit - pDest) >= 2) {
1157 *pDest++=(uint8_t)((ch>>6)|0xc0);
1158 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1159 } else {
1160 reqLength = 2;
1161 break;
1162 }
1163 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1164 if((pDestLimit - pDest) >= 3) {
1165 *pDest++=(uint8_t)((ch>>12)|0xe0);
1166 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1167 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1168 } else {
1169 reqLength = 3;
1170 break;
1171 }
1172 } else /* ch is a surrogate */ {
1173 int32_t length;
1174
1175 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1176 ++pSrc;
1177 ch=UTF16_GET_PAIR_VALUE(ch, ch2);
1178 } else if(subchar>=0) {
1179 ch=subchar;
1180 ++numSubstitutions;
1181 } else {
1182 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1183 *pErrorCode = U_INVALID_CHAR_FOUND;
1184 return NULL;
1185 }
1186
1187 length = U8_LENGTH(ch);
1188 if((pDestLimit - pDest) >= length) {
1189 /* convert and append*/
1190 pDest=_appendUTF8(pDest, ch);
1191 } else {
1192 reqLength = length;
1193 break;
1194 }
1195 }
1196 }
1197 while(pSrc<pSrcLimit) {
1198 ch=*pSrc++;
1199 if(ch<=0x7f) {
1200 ++reqLength;
1201 } else if(ch<=0x7ff) {
1202 reqLength+=2;
1203 } else if(!UTF_IS_SURROGATE(ch)) {
1204 reqLength+=3;
1205 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
1206 ++pSrc;
1207 reqLength+=4;
1208 } else if(subchar>=0) {
1209 reqLength+=U8_LENGTH(subchar);
1210 ++numSubstitutions;
1211 } else {
1212 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1213 *pErrorCode = U_INVALID_CHAR_FOUND;
1214 return NULL;
1215 }
1216 }
1217 }
1218
1219 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1220
1221 if(pNumSubstitutions!=NULL) {
1222 *pNumSubstitutions=numSubstitutions;
1223 }
1224
1225 if(pDestLength){
1226 *pDestLength = reqLength;
1227 }
1228
1229 /* Terminate the buffer */
1230 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1231 return dest;
1232 }
1233
1234 U_CAPI char* U_EXPORT2
u_strToUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UErrorCode * pErrorCode)1235 u_strToUTF8(char *dest,
1236 int32_t destCapacity,
1237 int32_t *pDestLength,
1238 const UChar *pSrc,
1239 int32_t srcLength,
1240 UErrorCode *pErrorCode){
1241 return u_strToUTF8WithSub(
1242 dest, destCapacity, pDestLength,
1243 pSrc, srcLength,
1244 U_SENTINEL, NULL,
1245 pErrorCode);
1246 }
1247
1248 U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)1249 u_strFromJavaModifiedUTF8WithSub(
1250 UChar *dest,
1251 int32_t destCapacity,
1252 int32_t *pDestLength,
1253 const char *src,
1254 int32_t srcLength,
1255 UChar32 subchar, int32_t *pNumSubstitutions,
1256 UErrorCode *pErrorCode) {
1257 UChar *pDest = dest;
1258 UChar *pDestLimit = dest+destCapacity;
1259 UChar32 ch;
1260 int32_t reqLength = 0;
1261 const uint8_t* pSrc = (const uint8_t*) src;
1262 const uint8_t *pSrcLimit;
1263 int32_t count;
1264 uint8_t t1, t2; /* trail bytes */
1265 int32_t numSubstitutions;
1266
1267 /* args check */
1268 if(U_FAILURE(*pErrorCode)){
1269 return NULL;
1270 }
1271 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1272 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1273 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1274 ) {
1275 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1276 return NULL;
1277 }
1278
1279 if(pNumSubstitutions!=NULL) {
1280 *pNumSubstitutions=0;
1281 }
1282 numSubstitutions=0;
1283
1284 if(srcLength < 0) {
1285 /*
1286 * Transform a NUL-terminated ASCII string.
1287 * Handle non-ASCII strings with slower code.
1288 */
1289 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1290 *pDest++=(UChar)ch;
1291 ++pSrc;
1292 }
1293 if(ch == 0) {
1294 reqLength=(int32_t)(pDest - dest);
1295 if(pDestLength) {
1296 *pDestLength = reqLength;
1297 }
1298
1299 /* Terminate the buffer */
1300 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1301 return dest;
1302 }
1303 srcLength = uprv_strlen((const char *)pSrc);
1304 }
1305
1306 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1307 pSrcLimit = pSrc + srcLength;
1308 for(;;) {
1309 count = (int32_t)(pDestLimit - pDest);
1310 srcLength = (int32_t)(pSrcLimit - pSrc);
1311 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1312 /* fast ASCII loop */
1313 const uint8_t *prevSrc = pSrc;
1314 int32_t delta;
1315 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1316 *pDest++=(UChar)ch;
1317 ++pSrc;
1318 }
1319 delta = (int32_t)(pSrc - prevSrc);
1320 count -= delta;
1321 srcLength -= delta;
1322 }
1323 /*
1324 * Each iteration of the inner loop progresses by at most 3 UTF-8
1325 * bytes and one UChar.
1326 */
1327 srcLength /= 3;
1328 if(count > srcLength) {
1329 count = srcLength; /* min(remaining dest, remaining src/3) */
1330 }
1331 if(count < 3) {
1332 /*
1333 * Too much overhead if we get near the end of the string,
1334 * continue with the next loop.
1335 */
1336 break;
1337 }
1338 do {
1339 ch = *pSrc;
1340 if(ch <= 0x7f){
1341 *pDest++=(UChar)ch;
1342 ++pSrc;
1343 } else {
1344 if(ch >= 0xe0) {
1345 if( /* handle U+0000..U+FFFF inline */
1346 ch <= 0xef &&
1347 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1348 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1349 ) {
1350 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1351 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1352 pSrc += 3;
1353 continue;
1354 }
1355 } else {
1356 if( /* handle U+0000..U+07FF inline */
1357 ch >= 0xc0 &&
1358 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1359 ) {
1360 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1361 pSrc += 2;
1362 continue;
1363 }
1364 }
1365
1366 if(subchar < 0) {
1367 *pErrorCode = U_INVALID_CHAR_FOUND;
1368 return NULL;
1369 } else if(subchar > 0xffff && --count == 0) {
1370 /*
1371 * We need to write two UChars, adjusted count for that,
1372 * and ran out of space.
1373 */
1374 break;
1375 } else {
1376 /* function call for error cases */
1377 ++pSrc; /* continue after the lead byte */
1378 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1379 ++numSubstitutions;
1380 if(subchar<=0xFFFF) {
1381 *(pDest++)=(UChar)subchar;
1382 } else {
1383 *(pDest++)=U16_LEAD(subchar);
1384 *(pDest++)=U16_TRAIL(subchar);
1385 }
1386 }
1387 }
1388 } while(--count > 0);
1389 }
1390
1391 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1392 ch = *pSrc;
1393 if(ch <= 0x7f){
1394 *pDest++=(UChar)ch;
1395 ++pSrc;
1396 } else {
1397 if(ch >= 0xe0) {
1398 if( /* handle U+0000..U+FFFF inline */
1399 ch <= 0xef &&
1400 ((pSrcLimit - pSrc) >= 3) &&
1401 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1402 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1403 ) {
1404 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1405 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1406 pSrc += 3;
1407 continue;
1408 }
1409 } else {
1410 if( /* handle U+0000..U+07FF inline */
1411 ch >= 0xc0 &&
1412 ((pSrcLimit - pSrc) >= 2) &&
1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1414 ) {
1415 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1416 pSrc += 2;
1417 continue;
1418 }
1419 }
1420
1421 if(subchar < 0) {
1422 *pErrorCode = U_INVALID_CHAR_FOUND;
1423 return NULL;
1424 } else {
1425 /* function call for error cases */
1426 ++pSrc; /* continue after the lead byte */
1427 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1428 ++numSubstitutions;
1429 if(subchar<=0xFFFF) {
1430 *(pDest++)=(UChar)subchar;
1431 } else {
1432 *(pDest++)=U16_LEAD(subchar);
1433 if(pDest<pDestLimit) {
1434 *(pDest++)=U16_TRAIL(subchar);
1435 } else {
1436 reqLength++;
1437 break;
1438 }
1439 }
1440 }
1441 }
1442 }
1443
1444 /* do not fill the dest buffer just count the UChars needed */
1445 while(pSrc < pSrcLimit){
1446 ch = *pSrc;
1447 if(ch <= 0x7f) {
1448 reqLength++;
1449 ++pSrc;
1450 } else {
1451 if(ch >= 0xe0) {
1452 if( /* handle U+0000..U+FFFF inline */
1453 ch <= 0xef &&
1454 ((pSrcLimit - pSrc) >= 3) &&
1455 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1456 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1457 ) {
1458 reqLength++;
1459 pSrc += 3;
1460 continue;
1461 }
1462 } else {
1463 if( /* handle U+0000..U+07FF inline */
1464 ch >= 0xc0 &&
1465 ((pSrcLimit - pSrc) >= 2) &&
1466 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1467 ) {
1468 reqLength++;
1469 pSrc += 2;
1470 continue;
1471 }
1472 }
1473
1474 if(subchar < 0) {
1475 *pErrorCode = U_INVALID_CHAR_FOUND;
1476 return NULL;
1477 } else {
1478 /* function call for error cases */
1479 ++pSrc; /* continue after the lead byte */
1480 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1481 ++numSubstitutions;
1482 reqLength+=U16_LENGTH(ch);
1483 }
1484 }
1485 }
1486
1487 if(pNumSubstitutions!=NULL) {
1488 *pNumSubstitutions=numSubstitutions;
1489 }
1490
1491 reqLength+=(int32_t)(pDest - dest);
1492 if(pDestLength) {
1493 *pDestLength = reqLength;
1494 }
1495
1496 /* Terminate the buffer */
1497 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1498 return dest;
1499 }
1500
1501 U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1502 u_strToJavaModifiedUTF8(
1503 char *dest,
1504 int32_t destCapacity,
1505 int32_t *pDestLength,
1506 const UChar *src,
1507 int32_t srcLength,
1508 UErrorCode *pErrorCode) {
1509 int32_t reqLength=0;
1510 uint32_t ch=0;
1511 uint8_t *pDest = (uint8_t *)dest;
1512 uint8_t *pDestLimit = pDest + destCapacity;
1513 const UChar *pSrcLimit;
1514 int32_t count;
1515
1516 /* args check */
1517 if(U_FAILURE(*pErrorCode)){
1518 return NULL;
1519 }
1520 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1521 (dest==NULL && destCapacity!=0) || destCapacity<0
1522 ) {
1523 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1524 return NULL;
1525 }
1526
1527 if(srcLength==-1) {
1528 /* Convert NUL-terminated ASCII, then find the string length. */
1529 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1530 *pDest++ = (uint8_t)ch;
1531 ++src;
1532 }
1533 if(ch == 0) {
1534 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1535 if(pDestLength) {
1536 *pDestLength = reqLength;
1537 }
1538
1539 /* Terminate the buffer */
1540 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1541 return dest;
1542 }
1543 srcLength = u_strlen(src);
1544 }
1545
1546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1547 pSrcLimit = src+srcLength;
1548 for(;;) {
1549 count = (int32_t)(pDestLimit - pDest);
1550 srcLength = (int32_t)(pSrcLimit - src);
1551 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1552 /* fast ASCII loop */
1553 const UChar *prevSrc = src;
1554 int32_t delta;
1555 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1556 *pDest++=(uint8_t)ch;
1557 ++src;
1558 }
1559 delta = (int32_t)(src - prevSrc);
1560 count -= delta;
1561 srcLength -= delta;
1562 }
1563 /*
1564 * Each iteration of the inner loop progresses by at most 3 UTF-8
1565 * bytes and one UChar.
1566 */
1567 count /= 3;
1568 if(count > srcLength) {
1569 count = srcLength; /* min(remaining dest/3, remaining src) */
1570 }
1571 if(count < 3) {
1572 /*
1573 * Too much overhead if we get near the end of the string,
1574 * continue with the next loop.
1575 */
1576 break;
1577 }
1578 do {
1579 ch=*src++;
1580 if(ch <= 0x7f && ch != 0) {
1581 *pDest++ = (uint8_t)ch;
1582 } else if(ch <= 0x7ff) {
1583 *pDest++=(uint8_t)((ch>>6)|0xc0);
1584 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1585 } else {
1586 *pDest++=(uint8_t)((ch>>12)|0xe0);
1587 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1588 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1589 }
1590 } while(--count > 0);
1591 }
1592
1593 while(src<pSrcLimit) {
1594 ch=*src++;
1595 if(ch <= 0x7f && ch != 0) {
1596 if(pDest<pDestLimit) {
1597 *pDest++ = (uint8_t)ch;
1598 } else {
1599 reqLength = 1;
1600 break;
1601 }
1602 } else if(ch <= 0x7ff) {
1603 if((pDestLimit - pDest) >= 2) {
1604 *pDest++=(uint8_t)((ch>>6)|0xc0);
1605 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1606 } else {
1607 reqLength = 2;
1608 break;
1609 }
1610 } else {
1611 if((pDestLimit - pDest) >= 3) {
1612 *pDest++=(uint8_t)((ch>>12)|0xe0);
1613 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1614 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1615 } else {
1616 reqLength = 3;
1617 break;
1618 }
1619 }
1620 }
1621 while(src<pSrcLimit) {
1622 ch=*src++;
1623 if(ch <= 0x7f && ch != 0) {
1624 ++reqLength;
1625 } else if(ch<=0x7ff) {
1626 reqLength+=2;
1627 } else {
1628 reqLength+=3;
1629 }
1630 }
1631
1632 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1633 if(pDestLength){
1634 *pDestLength = reqLength;
1635 }
1636
1637 /* Terminate the buffer */
1638 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1639 return dest;
1640 }
1641