1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2001-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustrtrns.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 9/10/2001 Ram Creation.
17 ******************************************************************************
18 */
19
20 /*******************************************************************************
21 *
22 * u_strTo* and u_strFrom* APIs
23 * WCS functions moved to ustr_wcs.c for better modularization
24 *
25 *******************************************************************************
26 */
27
28
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
34 #include "cstring.h"
35 #include "cmemory.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38
39 U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)40 u_strFromUTF32WithSub(UChar *dest,
41 int32_t destCapacity,
42 int32_t *pDestLength,
43 const UChar32 *src,
44 int32_t srcLength,
45 UChar32 subchar, int32_t *pNumSubstitutions,
46 UErrorCode *pErrorCode) {
47 const UChar32 *srcLimit;
48 UChar32 ch;
49 UChar *destLimit;
50 UChar *pDest;
51 int32_t reqLength;
52 int32_t numSubstitutions;
53
54 /* args check */
55 if(U_FAILURE(*pErrorCode)){
56 return NULL;
57 }
58 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61 ) {
62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63 return NULL;
64 }
65
66 if(pNumSubstitutions != NULL) {
67 *pNumSubstitutions = 0;
68 }
69
70 pDest = dest;
71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72 reqLength = 0;
73 numSubstitutions = 0;
74
75 if(srcLength < 0) {
76 /* simple loop for conversion of a NUL-terminated BMP string */
77 while((ch=*src) != 0 &&
78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79 ++src;
80 if(pDest < destLimit) {
81 *pDest++ = (UChar)ch;
82 } else {
83 ++reqLength;
84 }
85 }
86 srcLimit = src;
87 if(ch != 0) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit != 0) {}
90 }
91 } else {
92 srcLimit = (src!=NULL)?(src + srcLength):NULL;
93 }
94
95 /* convert with length */
96 while(src < srcLimit) {
97 ch = *src++;
98 do {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101 if(pDest < destLimit) {
102 *pDest++ = (UChar)ch;
103 } else {
104 ++reqLength;
105 }
106 break;
107 } else if(0x10000 <= ch && ch <= 0x10ffff) {
108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109 *pDest++ = U16_LEAD(ch);
110 *pDest++ = U16_TRAIL(ch);
111 } else {
112 reqLength += 2;
113 }
114 break;
115 } else if((ch = subchar) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode = U_INVALID_CHAR_FOUND;
118 return NULL;
119 } else {
120 ++numSubstitutions;
121 }
122 } while(true);
123 }
124
125 reqLength += (int32_t)(pDest - dest);
126 if(pDestLength) {
127 *pDestLength = reqLength;
128 }
129 if(pNumSubstitutions != NULL) {
130 *pNumSubstitutions = numSubstitutions;
131 }
132
133 /* Terminate the buffer */
134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
136 return dest;
137 }
138
139 U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const UChar32 * src,int32_t srcLength,UErrorCode * pErrorCode)140 u_strFromUTF32(UChar *dest,
141 int32_t destCapacity,
142 int32_t *pDestLength,
143 const UChar32 *src,
144 int32_t srcLength,
145 UErrorCode *pErrorCode) {
146 return u_strFromUTF32WithSub(
147 dest, destCapacity, pDestLength,
148 src, srcLength,
149 U_SENTINEL, NULL,
150 pErrorCode);
151 }
152
153 U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)154 u_strToUTF32WithSub(UChar32 *dest,
155 int32_t destCapacity,
156 int32_t *pDestLength,
157 const UChar *src,
158 int32_t srcLength,
159 UChar32 subchar, int32_t *pNumSubstitutions,
160 UErrorCode *pErrorCode) {
161 const UChar *srcLimit;
162 UChar32 ch;
163 UChar ch2;
164 UChar32 *destLimit;
165 UChar32 *pDest;
166 int32_t reqLength;
167 int32_t numSubstitutions;
168
169 /* args check */
170 if(U_FAILURE(*pErrorCode)){
171 return NULL;
172 }
173 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176 ) {
177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178 return NULL;
179 }
180
181 if(pNumSubstitutions != NULL) {
182 *pNumSubstitutions = 0;
183 }
184
185 pDest = dest;
186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187 reqLength = 0;
188 numSubstitutions = 0;
189
190 if(srcLength < 0) {
191 /* simple loop for conversion of a NUL-terminated BMP string */
192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193 ++src;
194 if(pDest < destLimit) {
195 *pDest++ = ch;
196 } else {
197 ++reqLength;
198 }
199 }
200 srcLimit = src;
201 if(ch != 0) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit != 0) {}
204 }
205 } else {
206 srcLimit = (src!=NULL)?(src + srcLength):NULL;
207 }
208
209 /* convert with length */
210 while(src < srcLimit) {
211 ch = *src++;
212 if(!U16_IS_SURROGATE(ch)) {
213 /* write or count ch below */
214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215 ++src;
216 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217 } else if((ch = subchar) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode = U_INVALID_CHAR_FOUND;
220 return NULL;
221 } else {
222 ++numSubstitutions;
223 }
224 if(pDest < destLimit) {
225 *pDest++ = ch;
226 } else {
227 ++reqLength;
228 }
229 }
230
231 reqLength += (int32_t)(pDest - dest);
232 if(pDestLength) {
233 *pDestLength = reqLength;
234 }
235 if(pNumSubstitutions != NULL) {
236 *pNumSubstitutions = numSubstitutions;
237 }
238
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242 return dest;
243 }
244
245 U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)246 u_strToUTF32(UChar32 *dest,
247 int32_t destCapacity,
248 int32_t *pDestLength,
249 const UChar *src,
250 int32_t srcLength,
251 UErrorCode *pErrorCode) {
252 return u_strToUTF32WithSub(
253 dest, destCapacity, pDestLength,
254 src, srcLength,
255 U_SENTINEL, NULL,
256 pErrorCode);
257 }
258
/*
 * Convert a UTF-8 string to UTF-16, optionally replacing ill-formed sequences.
 *
 * dest / destCapacity   output buffer and its size in UChars; dest may be NULL
 *                       only when destCapacity is 0 (pure pre-flighting).
 * pDestLength           if non-NULL, receives the number of UChars the full
 *                       conversion needs (written part plus pre-flighted rest).
 * src / srcLength       input bytes; srcLength == -1 means NUL-terminated.
 * subchar               replacement used when utf8_nextCharSafeBody() reports
 *                       a negative result; if subchar itself is negative, the
 *                       function fails with U_INVALID_CHAR_FOUND instead.
 * pNumSubstitutions     if non-NULL, receives how many replacements were made.
 * pErrorCode            in/out error code; nothing is done if already a failure.
 *
 * Returns dest, or NULL on a pre-existing failure, illegal arguments
 * (U_ILLEGAL_ARGUMENT_ERROR) or unsubstitutable input (U_INVALID_CHAR_FOUND).
 *
 * Structure: each of the two branches (NUL-terminated / with length) first
 * converts while there is room in dest and then only counts ("pre-flights")
 * the remaining output length once dest is full.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UChar32 subchar, int32_t *pNumSubstitutions,
              UErrorCode *pErrorCode){
    /* args check */
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    UChar *pDest = dest;
    UChar *pDestLimit = dest+destCapacity;
    /* counts only the UChars that did NOT fit; the written part is added at the end */
    int32_t reqLength = 0;
    int32_t numSubstitutions=0;

    /*
     * Inline processing of UTF-8 byte sequences:
     *
     * Byte sequences for the most common characters are handled inline in
     * the conversion loops. In order to reduce the path lengths for those
     * characters, the tests are arranged in a kind of binary search.
     * ASCII (<=0x7f) is checked first, followed by the dividing point
     * between 2- and 3-byte sequences (0xe0).
     * The 3-byte branch is tested first to speed up CJK text.
     * The compiler should combine the subtractions for the two tests for 0xe0.
     * Each branch then tests for the other end of its range.
     */

    if(srcLength < 0){
        /*
         * Transform a NUL-terminated string.
         * The code explicitly checks for NULs only in the lead byte position.
         * A NUL byte in the trail byte position fails the trail byte range check anyway.
         */
        int32_t i;
        UChar32 c;
        /* Conversion loop: runs until the terminator or until dest is full. */
        for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
            // modified copy of U8_NEXT()
            ++i;
            if(U8_IS_SINGLE(c)) {
                *pDest++=(UChar)c;
            } else {
                uint8_t __t1, __t2;
                if( /* handle U+0800..U+FFFF inline */
                        (0xe0<=(c) && (c)<0xf0) &&
                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
                        (__t2=src[(i)+1]-0x80)<=0x3f) {
                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
                    i+=2;
                } else if( /* handle U+0080..U+07FF inline */
                        ((c)<0xe0 && (c)>=0xc2) &&
                        (__t1=src[i]-0x80)<=0x3f) {
                    *pDest++ = (((c)&0x1f)<<6)|__t1;
                    ++(i);
                } else {
                    /* function call for "complicated" and error cases */
                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
                    /* a negative result is replaced by subchar and counted; fail if subchar is negative too */
                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    } else if(c<=0xFFFF) {
                        *(pDest++)=(UChar)c;
                    } else {
                        *(pDest++)=U16_LEAD(c);
                        if(pDest<pDestLimit) {
                            *(pDest++)=U16_TRAIL(c);
                        } else {
                            /* lead surrogate was written, the trail did not fit: count it and stop converting */
                            reqLength++;
                            break;
                        }
                    }
                }
            }
        }

        /* Pre-flight the rest of the string. */
        while((c = (uint8_t)src[i]) != 0) {
            // modified copy of U8_NEXT()
            ++i;
            if(U8_IS_SINGLE(c)) {
                ++reqLength;
            } else {
                uint8_t __t1, __t2;
                if( /* handle U+0800..U+FFFF inline */
                        (0xe0<=(c) && (c)<0xf0) &&
                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
                        (__t2=src[(i)+1]-0x80)<=0x3f) {
                    ++reqLength;
                    i+=2;
                } else if( /* handle U+0080..U+07FF inline */
                        ((c)<0xe0 && (c)>=0xc2) &&
                        (__t1=src[i]-0x80)<=0x3f) {
                    ++reqLength;
                    ++(i);
                } else {
                    /* function call for "complicated" and error cases */
                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    }
                    /* one or two UChars, depending on the code point */
                    reqLength += U16_LENGTH(c);
                }
            }
        }
    } else /* srcLength >= 0 */ {
        /* Faster loop without ongoing checking for srcLength and pDestLimit. */
        int32_t i = 0;
        UChar32 c;
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            int32_t count = (int32_t)(pDestLimit - pDest);
            int32_t count2 = (srcLength - i) / 3;
            if(count > count2) {
                count = count2; /* min(remaining dest, remaining src/3) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }

            /* Inner loop: runs "count" iterations without per-write dest checks. */
            do {
                // modified copy of U8_NEXT()
                c = (uint8_t)src[i++];
                if(U8_IS_SINGLE(c)) {
                    *pDest++=(UChar)c;
                } else {
                    uint8_t __t1, __t2;
                    if( /* handle U+0800..U+FFFF inline */
                            (0xe0<=(c) && (c)<0xf0) &&
                            ((i)+1)<srcLength &&
                            U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
                            (__t2=src[(i)+1]-0x80)<=0x3f) {
                        *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
                        i+=2;
                    } else if( /* handle U+0080..U+07FF inline */
                            ((c)<0xe0 && (c)>=0xc2) &&
                            ((i)!=srcLength) &&
                            (__t1=src[i]-0x80)<=0x3f) {
                        *pDest++ = (((c)&0x1f)<<6)|__t1;
                        ++(i);
                    } else {
                        if(c >= 0xf0 || subchar > 0xffff) {
                            // We may read up to four bytes and write up to two UChars,
                            // which we didn't account for with computing count,
                            // so we adjust it here.
                            if(--count == 0) {
                                --i;  // back out byte c
                                break;
                            }
                        }

                        /* function call for "complicated" and error cases */
                        (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
                        if(c<0 && (++numSubstitutions, c = subchar) < 0) {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        } else if(c<=0xFFFF) {
                            *(pDest++)=(UChar)c;
                        } else {
                            /* room for both surrogates was reserved by the count adjustment above */
                            *(pDest++)=U16_LEAD(c);
                            *(pDest++)=U16_TRAIL(c);
                        }
                    }
                }
            } while(--count > 0);
        }

        /* Careful loop for the tail: checks both the source and the destination limit each time. */
        while(i < srcLength && (pDest < pDestLimit)) {
            // modified copy of U8_NEXT()
            c = (uint8_t)src[i++];
            if(U8_IS_SINGLE(c)) {
                *pDest++=(UChar)c;
            } else {
                uint8_t __t1, __t2;
                if( /* handle U+0800..U+FFFF inline */
                        (0xe0<=(c) && (c)<0xf0) &&
                        ((i)+1)<srcLength &&
                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
                        (__t2=src[(i)+1]-0x80)<=0x3f) {
                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
                    i+=2;
                } else if( /* handle U+0080..U+07FF inline */
                        ((c)<0xe0 && (c)>=0xc2) &&
                        ((i)!=srcLength) &&
                        (__t1=src[i]-0x80)<=0x3f) {
                    *pDest++ = (((c)&0x1f)<<6)|__t1;
                    ++(i);
                } else {
                    /* function call for "complicated" and error cases */
                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    } else if(c<=0xFFFF) {
                        *(pDest++)=(UChar)c;
                    } else {
                        *(pDest++)=U16_LEAD(c);
                        if(pDest<pDestLimit) {
                            *(pDest++)=U16_TRAIL(c);
                        } else {
                            /* lead surrogate was written, the trail did not fit: count it and stop converting */
                            reqLength++;
                            break;
                        }
                    }
                }
            }
        }

        /* Pre-flight the rest of the string. */
        while(i < srcLength) {
            // modified copy of U8_NEXT()
            c = (uint8_t)src[i++];
            if(U8_IS_SINGLE(c)) {
                ++reqLength;
            } else {
                uint8_t __t1, __t2;
                if( /* handle U+0800..U+FFFF inline */
                        (0xe0<=(c) && (c)<0xf0) &&
                        ((i)+1)<srcLength &&
                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
                        (__t2=src[(i)+1]-0x80)<=0x3f) {
                    ++reqLength;
                    i+=2;
                } else if( /* handle U+0080..U+07FF inline */
                        ((c)<0xe0 && (c)>=0xc2) &&
                        ((i)!=srcLength) &&
                        (__t1=src[i]-0x80)<=0x3f) {
                    ++reqLength;
                    ++(i);
                } else {
                    /* function call for "complicated" and error cases */
                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return NULL;
                    }
                    /* one or two UChars, depending on the code point */
                    reqLength += U16_LENGTH(c);
                }
            }
        }
    }

    /* total = UChars that did not fit + UChars actually written */
    reqLength+=(int32_t)(pDest - dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
538
539 U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)540 u_strFromUTF8(UChar *dest,
541 int32_t destCapacity,
542 int32_t *pDestLength,
543 const char* src,
544 int32_t srcLength,
545 UErrorCode *pErrorCode){
546 return u_strFromUTF8WithSub(
547 dest, destCapacity, pDestLength,
548 src, srcLength,
549 U_SENTINEL, NULL,
550 pErrorCode);
551 }
552
553 U_CAPI UChar * U_EXPORT2
u_strFromUTF8Lenient(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)554 u_strFromUTF8Lenient(UChar *dest,
555 int32_t destCapacity,
556 int32_t *pDestLength,
557 const char *src,
558 int32_t srcLength,
559 UErrorCode *pErrorCode) {
560 UChar *pDest = dest;
561 UChar32 ch;
562 int32_t reqLength = 0;
563 uint8_t* pSrc = (uint8_t*) src;
564
565 /* args check */
566 if(U_FAILURE(*pErrorCode)){
567 return NULL;
568 }
569
570 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
571 (destCapacity<0) || (dest == NULL && destCapacity > 0)
572 ) {
573 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
574 return NULL;
575 }
576
577 if(srcLength < 0) {
578 /* Transform a NUL-terminated string. */
579 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
580 uint8_t t1, t2, t3; /* trail bytes */
581
582 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
583 if(ch < 0xc0) {
584 /*
585 * ASCII, or a trail byte in lead position which is treated like
586 * a single-byte sequence for better character boundary
587 * resynchronization after illegal sequences.
588 */
589 *pDest++=(UChar)ch;
590 ++pSrc;
591 continue;
592 } else if(ch < 0xe0) { /* U+0080..U+07FF */
593 if((t1 = pSrc[1]) != 0) {
594 /* 0x3080 = (0xc0 << 6) + 0x80 */
595 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
596 pSrc += 2;
597 continue;
598 }
599 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
600 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
601 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
602 /* 0x2080 = (0x80 << 6) + 0x80 */
603 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
604 pSrc += 3;
605 continue;
606 }
607 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
608 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
609 pSrc += 4;
610 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
611 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
612 *(pDest++) = U16_LEAD(ch);
613 if(pDest < pDestLimit) {
614 *(pDest++) = U16_TRAIL(ch);
615 } else {
616 reqLength = 1;
617 break;
618 }
619 continue;
620 }
621 }
622
623 /* truncated character at the end */
624 *pDest++ = 0xfffd;
625 while(*++pSrc != 0) {}
626 break;
627 }
628
629 /* Pre-flight the rest of the string. */
630 while((ch = *pSrc) != 0) {
631 if(ch < 0xc0) {
632 /*
633 * ASCII, or a trail byte in lead position which is treated like
634 * a single-byte sequence for better character boundary
635 * resynchronization after illegal sequences.
636 */
637 ++reqLength;
638 ++pSrc;
639 continue;
640 } else if(ch < 0xe0) { /* U+0080..U+07FF */
641 if(pSrc[1] != 0) {
642 ++reqLength;
643 pSrc += 2;
644 continue;
645 }
646 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
647 if(pSrc[1] != 0 && pSrc[2] != 0) {
648 ++reqLength;
649 pSrc += 3;
650 continue;
651 }
652 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
653 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
654 reqLength += 2;
655 pSrc += 4;
656 continue;
657 }
658 }
659
660 /* truncated character at the end */
661 ++reqLength;
662 break;
663 }
664 } else /* srcLength >= 0 */ {
665 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
666
667 /*
668 * This function requires that if srcLength is given, then it must be
669 * destCapatity >= srcLength so that we need not check for
670 * destination buffer overflow in the loop.
671 */
672 if(destCapacity < srcLength) {
673 if(pDestLength != NULL) {
674 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
675 }
676 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
677 return NULL;
678 }
679
680 if((pSrcLimit - pSrc) >= 4) {
681 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
682
683 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
684 do {
685 ch = *pSrc++;
686 if(ch < 0xc0) {
687 /*
688 * ASCII, or a trail byte in lead position which is treated like
689 * a single-byte sequence for better character boundary
690 * resynchronization after illegal sequences.
691 */
692 *pDest++=(UChar)ch;
693 } else if(ch < 0xe0) { /* U+0080..U+07FF */
694 /* 0x3080 = (0xc0 << 6) + 0x80 */
695 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
696 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
697 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
698 /* 0x2080 = (0x80 << 6) + 0x80 */
699 ch = (ch << 12) + (*pSrc++ << 6);
700 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
701 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
702 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
703 ch = (ch << 18) + (*pSrc++ << 12);
704 ch += *pSrc++ << 6;
705 ch += *pSrc++ - 0x3c82080;
706 *(pDest++) = U16_LEAD(ch);
707 *(pDest++) = U16_TRAIL(ch);
708 }
709 } while(pSrc < pSrcLimit);
710
711 pSrcLimit += 3; /* restore original pSrcLimit */
712 }
713
714 while(pSrc < pSrcLimit) {
715 ch = *pSrc++;
716 if(ch < 0xc0) {
717 /*
718 * ASCII, or a trail byte in lead position which is treated like
719 * a single-byte sequence for better character boundary
720 * resynchronization after illegal sequences.
721 */
722 *pDest++=(UChar)ch;
723 continue;
724 } else if(ch < 0xe0) { /* U+0080..U+07FF */
725 if(pSrc < pSrcLimit) {
726 /* 0x3080 = (0xc0 << 6) + 0x80 */
727 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
728 continue;
729 }
730 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
731 if((pSrcLimit - pSrc) >= 2) {
732 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
733 /* 0x2080 = (0x80 << 6) + 0x80 */
734 ch = (ch << 12) + (*pSrc++ << 6);
735 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
736 pSrc += 3;
737 continue;
738 }
739 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
740 if((pSrcLimit - pSrc) >= 3) {
741 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
742 ch = (ch << 18) + (*pSrc++ << 12);
743 ch += *pSrc++ << 6;
744 ch += *pSrc++ - 0x3c82080;
745 *(pDest++) = U16_LEAD(ch);
746 *(pDest++) = U16_TRAIL(ch);
747 pSrc += 4;
748 continue;
749 }
750 }
751
752 /* truncated character at the end */
753 *pDest++ = 0xfffd;
754 break;
755 }
756 }
757
758 reqLength+=(int32_t)(pDest - dest);
759
760 if(pDestLength){
761 *pDestLength = reqLength;
762 }
763
764 /* Terminate the buffer */
765 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
766
767 return dest;
768 }
769
770 static inline uint8_t *
_appendUTF8(uint8_t * pDest,UChar32 c)771 _appendUTF8(uint8_t *pDest, UChar32 c) {
772 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
773 if((c)<=0x7f) {
774 *pDest++=(uint8_t)c;
775 } else if(c<=0x7ff) {
776 *pDest++=(uint8_t)((c>>6)|0xc0);
777 *pDest++=(uint8_t)((c&0x3f)|0x80);
778 } else if(c<=0xffff) {
779 *pDest++=(uint8_t)((c>>12)|0xe0);
780 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
781 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
782 } else /* if((uint32_t)(c)<=0x10ffff) */ {
783 *pDest++=(uint8_t)(((c)>>18)|0xf0);
784 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
785 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
786 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
787 }
788 return pDest;
789 }
790
791
/*
 * Convert a UTF-16 string to UTF-8, optionally replacing unpaired surrogates.
 *
 * dest / destCapacity   output buffer and its size in bytes; dest may be NULL
 *                       only when destCapacity is 0 (pure pre-flighting).
 * pDestLength           if non-NULL, receives the number of bytes the full
 *                       conversion needs (written part plus pre-flighted rest).
 * pSrc / srcLength      input UChars; srcLength == -1 means NUL-terminated.
 * subchar               replacement for unpaired surrogates; if negative, an
 *                       unpaired surrogate fails with U_INVALID_CHAR_FOUND.
 * pNumSubstitutions     if non-NULL, receives how many replacements were made.
 * pErrorCode            in/out error code; nothing is done if already a failure.
 *
 * Returns dest, or NULL on a pre-existing failure, illegal arguments
 * (U_ILLEGAL_ARGUMENT_ERROR) or an unsubstitutable unpaired surrogate.
 *
 * Structure: each of the two branches (NUL-terminated / with length) first
 * converts while the next character fits into dest; when a character does
 * not fit, its byte length is stored in reqLength and a second loop only
 * counts the bytes for the remaining input.
 */
U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    /* counts only the bytes that did NOT fit; the written part is added at the end */
    int32_t reqLength=0;
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    int32_t numSubstitutions;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        /* NUL-terminated input: convert until the terminator or until a character does not fit. */
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Pre-flight: only count the bytes needed for the rest of the string. */
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            /* note: the srcLength parameter is reused here as "remaining source length" */
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            /* Inner loop: runs "count" iterations without per-write limit checks. */
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                        ++pSrc;
                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /* convert and append*/
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        /* Careful loop for the tail: checks the destination space for every character. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /* unlike the NUL-terminated branch, the source limit must be checked before reading the trail */
                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Pre-flight: only count the bytes needed for the rest of the string. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    /* total = bytes that did not fit + bytes actually written */
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
1055
1056 U_CAPI char* U_EXPORT2
u_strToUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * pSrc,int32_t srcLength,UErrorCode * pErrorCode)1057 u_strToUTF8(char *dest,
1058 int32_t destCapacity,
1059 int32_t *pDestLength,
1060 const UChar *pSrc,
1061 int32_t srcLength,
1062 UErrorCode *pErrorCode){
1063 return u_strToUTF8WithSub(
1064 dest, destCapacity, pDestLength,
1065 pSrc, srcLength,
1066 U_SENTINEL, NULL,
1067 pErrorCode);
1068 }
1069
1070 U_CAPI UChar* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UChar32 subchar,int32_t * pNumSubstitutions,UErrorCode * pErrorCode)1071 u_strFromJavaModifiedUTF8WithSub(
1072 UChar *dest,
1073 int32_t destCapacity,
1074 int32_t *pDestLength,
1075 const char *src,
1076 int32_t srcLength,
1077 UChar32 subchar, int32_t *pNumSubstitutions,
1078 UErrorCode *pErrorCode) {
1079 /* args check */
1080 if(U_FAILURE(*pErrorCode)) {
1081 return NULL;
1082 }
1083 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1084 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1085 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1086 ) {
1087 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1088 return NULL;
1089 }
1090
1091 if(pNumSubstitutions!=NULL) {
1092 *pNumSubstitutions=0;
1093 }
1094 UChar *pDest = dest;
1095 UChar *pDestLimit = dest+destCapacity;
1096 int32_t reqLength = 0;
1097 int32_t numSubstitutions=0;
1098
1099 if(srcLength < 0) {
1100 /*
1101 * Transform a NUL-terminated ASCII string.
1102 * Handle non-ASCII strings with slower code.
1103 */
1104 UChar32 c;
1105 while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
1106 *pDest++=(UChar)c;
1107 ++src;
1108 }
1109 if(c == 0) {
1110 reqLength=(int32_t)(pDest - dest);
1111 if(pDestLength) {
1112 *pDestLength = reqLength;
1113 }
1114
1115 /* Terminate the buffer */
1116 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1117 return dest;
1118 }
1119 srcLength = static_cast<int32_t>(uprv_strlen(src));
1120 }
1121
1122 /* Faster loop without ongoing checking for srcLength and pDestLimit. */
1123 UChar32 ch;
1124 uint8_t t1, t2;
1125 int32_t i = 0;
1126 for(;;) {
1127 int32_t count = (int32_t)(pDestLimit - pDest);
1128 int32_t count2 = srcLength - i;
1129 if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
1130 /* fast ASCII loop */
1131 int32_t start = i;
1132 uint8_t b;
1133 while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
1134 *pDest++=b;
1135 ++i;
1136 }
1137 int32_t delta = i - start;
1138 count -= delta;
1139 count2 -= delta;
1140 }
1141 /*
1142 * Each iteration of the inner loop progresses by at most 3 UTF-8
1143 * bytes and one UChar.
1144 */
1145 if(subchar > 0xFFFF) {
1146 break;
1147 }
1148 count2 /= 3;
1149 if(count > count2) {
1150 count = count2; /* min(remaining dest, remaining src/3) */
1151 }
1152 if(count < 3) {
1153 /*
1154 * Too much overhead if we get near the end of the string,
1155 * continue with the next loop.
1156 */
1157 break;
1158 }
1159 do {
1160 ch = (uint8_t)src[i++];
1161 if(U8_IS_SINGLE(ch)) {
1162 *pDest++=(UChar)ch;
1163 } else {
1164 if(ch >= 0xe0) {
1165 if( /* handle U+0000..U+FFFF inline */
1166 ch <= 0xef &&
1167 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1168 (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1169 ) {
1170 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1171 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1172 i += 2;
1173 continue;
1174 }
1175 } else {
1176 if( /* handle U+0000..U+07FF inline */
1177 ch >= 0xc0 &&
1178 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1179 ) {
1180 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1181 ++i;
1182 continue;
1183 }
1184 }
1185
1186 if(subchar < 0) {
1187 *pErrorCode = U_INVALID_CHAR_FOUND;
1188 return NULL;
1189 } else if(subchar > 0xffff && --count == 0) {
1190 /*
1191 * We need to write two UChars, adjusted count for that,
1192 * and ran out of space.
1193 */
1194 --i; // back out byte ch
1195 break;
1196 } else {
1197 /* function call for error cases */
1198 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1199 ++numSubstitutions;
1200 *(pDest++)=(UChar)subchar;
1201 }
1202 }
1203 } while(--count > 0);
1204 }
1205
1206 while(i < srcLength && (pDest < pDestLimit)) {
1207 ch = (uint8_t)src[i++];
1208 if(U8_IS_SINGLE(ch)){
1209 *pDest++=(UChar)ch;
1210 } else {
1211 if(ch >= 0xe0) {
1212 if( /* handle U+0000..U+FFFF inline */
1213 ch <= 0xef &&
1214 (i+1) < srcLength &&
1215 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
1216 (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
1217 ) {
1218 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1219 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1220 i += 2;
1221 continue;
1222 }
1223 } else {
1224 if( /* handle U+0000..U+07FF inline */
1225 ch >= 0xc0 &&
1226 i < srcLength &&
1227 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
1228 ) {
1229 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1230 ++i;
1231 continue;
1232 }
1233 }
1234
1235 if(subchar < 0) {
1236 *pErrorCode = U_INVALID_CHAR_FOUND;
1237 return NULL;
1238 } else {
1239 /* function call for error cases */
1240 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1241 ++numSubstitutions;
1242 if(subchar<=0xFFFF) {
1243 *(pDest++)=(UChar)subchar;
1244 } else {
1245 *(pDest++)=U16_LEAD(subchar);
1246 if(pDest<pDestLimit) {
1247 *(pDest++)=U16_TRAIL(subchar);
1248 } else {
1249 reqLength++;
1250 break;
1251 }
1252 }
1253 }
1254 }
1255 }
1256
1257 /* Pre-flight the rest of the string. */
1258 while(i < srcLength) {
1259 ch = (uint8_t)src[i++];
1260 if(U8_IS_SINGLE(ch)) {
1261 reqLength++;
1262 } else {
1263 if(ch >= 0xe0) {
1264 if( /* handle U+0000..U+FFFF inline */
1265 ch <= 0xef &&
1266 (i+1) < srcLength &&
1267 (uint8_t)(src[i] - 0x80) <= 0x3f &&
1268 (uint8_t)(src[i+1] - 0x80) <= 0x3f
1269 ) {
1270 reqLength++;
1271 i += 2;
1272 continue;
1273 }
1274 } else {
1275 if( /* handle U+0000..U+07FF inline */
1276 ch >= 0xc0 &&
1277 i < srcLength &&
1278 (uint8_t)(src[i] - 0x80) <= 0x3f
1279 ) {
1280 reqLength++;
1281 ++i;
1282 continue;
1283 }
1284 }
1285
1286 if(subchar < 0) {
1287 *pErrorCode = U_INVALID_CHAR_FOUND;
1288 return NULL;
1289 } else {
1290 /* function call for error cases */
1291 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
1292 ++numSubstitutions;
1293 reqLength+=U16_LENGTH(ch);
1294 }
1295 }
1296 }
1297
1298 if(pNumSubstitutions!=NULL) {
1299 *pNumSubstitutions=numSubstitutions;
1300 }
1301
1302 reqLength+=(int32_t)(pDest - dest);
1303 if(pDestLength) {
1304 *pDestLength = reqLength;
1305 }
1306
1307 /* Terminate the buffer */
1308 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1309 return dest;
1310 }
1311
1312 U_CAPI char* U_EXPORT2
u_strToJavaModifiedUTF8(char * dest,int32_t destCapacity,int32_t * pDestLength,const UChar * src,int32_t srcLength,UErrorCode * pErrorCode)1313 u_strToJavaModifiedUTF8(
1314 char *dest,
1315 int32_t destCapacity,
1316 int32_t *pDestLength,
1317 const UChar *src,
1318 int32_t srcLength,
1319 UErrorCode *pErrorCode) {
1320 int32_t reqLength=0;
1321 uint32_t ch=0;
1322 uint8_t *pDest = (uint8_t *)dest;
1323 uint8_t *pDestLimit = pDest + destCapacity;
1324 const UChar *pSrcLimit;
1325 int32_t count;
1326
1327 /* args check */
1328 if(U_FAILURE(*pErrorCode)){
1329 return NULL;
1330 }
1331 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1332 (dest==NULL && destCapacity!=0) || destCapacity<0
1333 ) {
1334 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1335 return NULL;
1336 }
1337
1338 if(srcLength==-1) {
1339 /* Convert NUL-terminated ASCII, then find the string length. */
1340 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1341 *pDest++ = (uint8_t)ch;
1342 ++src;
1343 }
1344 if(ch == 0) {
1345 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1346 if(pDestLength) {
1347 *pDestLength = reqLength;
1348 }
1349
1350 /* Terminate the buffer */
1351 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1352 return dest;
1353 }
1354 srcLength = u_strlen(src);
1355 }
1356
1357 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1358 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1359 for(;;) {
1360 count = (int32_t)(pDestLimit - pDest);
1361 srcLength = (int32_t)(pSrcLimit - src);
1362 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1363 /* fast ASCII loop */
1364 const UChar *prevSrc = src;
1365 int32_t delta;
1366 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1367 *pDest++=(uint8_t)ch;
1368 ++src;
1369 }
1370 delta = (int32_t)(src - prevSrc);
1371 count -= delta;
1372 srcLength -= delta;
1373 }
1374 /*
1375 * Each iteration of the inner loop progresses by at most 3 UTF-8
1376 * bytes and one UChar.
1377 */
1378 count /= 3;
1379 if(count > srcLength) {
1380 count = srcLength; /* min(remaining dest/3, remaining src) */
1381 }
1382 if(count < 3) {
1383 /*
1384 * Too much overhead if we get near the end of the string,
1385 * continue with the next loop.
1386 */
1387 break;
1388 }
1389 do {
1390 ch=*src++;
1391 if(ch <= 0x7f && ch != 0) {
1392 *pDest++ = (uint8_t)ch;
1393 } else if(ch <= 0x7ff) {
1394 *pDest++=(uint8_t)((ch>>6)|0xc0);
1395 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1396 } else {
1397 *pDest++=(uint8_t)((ch>>12)|0xe0);
1398 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1399 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1400 }
1401 } while(--count > 0);
1402 }
1403
1404 while(src<pSrcLimit) {
1405 ch=*src++;
1406 if(ch <= 0x7f && ch != 0) {
1407 if(pDest<pDestLimit) {
1408 *pDest++ = (uint8_t)ch;
1409 } else {
1410 reqLength = 1;
1411 break;
1412 }
1413 } else if(ch <= 0x7ff) {
1414 if((pDestLimit - pDest) >= 2) {
1415 *pDest++=(uint8_t)((ch>>6)|0xc0);
1416 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1417 } else {
1418 reqLength = 2;
1419 break;
1420 }
1421 } else {
1422 if((pDestLimit - pDest) >= 3) {
1423 *pDest++=(uint8_t)((ch>>12)|0xe0);
1424 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1425 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1426 } else {
1427 reqLength = 3;
1428 break;
1429 }
1430 }
1431 }
1432 while(src<pSrcLimit) {
1433 ch=*src++;
1434 if(ch <= 0x7f && ch != 0) {
1435 ++reqLength;
1436 } else if(ch<=0x7ff) {
1437 reqLength+=2;
1438 } else {
1439 reqLength+=3;
1440 }
1441 }
1442
1443 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1444 if(pDestLength){
1445 *pDestLength = reqLength;
1446 }
1447
1448 /* Terminate the buffer */
1449 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1450 return dest;
1451 }
1452