1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u8.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
15 *
16 * Also, CESU-8 implementation, see UTR 26.
17 * The CESU-8 converter uses all the same functions as the
18 * UTF-8 converter, with a branch for converting supplementary code points.
19 */
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_CONVERSION
24
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32
33 /* Prototypes --------------------------------------------------------------- */
34
35 /* Keep these here to make finicky compilers happy */
36
37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
38 UErrorCode *err);
39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
40 UErrorCode *err);
41
42
43 /* UTF-8 -------------------------------------------------------------------- */
44
/*
 * Numeric constants shared by the UTF-8 conversion routines below.
 * Background: Unicode Standard 2.0, Transformation Formats, Appendix A-9.
 */

/* Largest value that fits in one 16-bit code unit. */
#define MAXIMUM_UCS2            0xFFFF
/* Largest code point accepted by the converters. */
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Surrogate arithmetic: a supplementary code point minus HALF_BASE is split
 * into a high part (value >> HALF_SHIFT) and a low part (value & HALF_MASK). */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* Boundaries of the high (lead) and low (trail) surrogate ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* Equals HALF_BASE - SURROGATE_LOW_START (0x10000 - 0xDC00). */
#define SURROGATE_LOW_BASE      0x2400
62
/*
 * offsetsFromUTF8[n] is subtracted from the value accumulated over an n-byte
 * sequence with "ch = (ch << 6) + nextByte". It cancels the marker bits of the
 * lead byte and of every trail byte, e.g. [2] == (0xC0 << 6) + 0x80.
 * The entries for 5 and 6 bytes are the same sums reduced modulo 2^32.
 * Index 0 is a placeholder.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0,
    0x00000000u,    /* 1 byte  */
    0x00003080u,    /* 2 bytes */
    0x000E2080u,    /* 3 bytes */
    0x03C82080u,    /* 4 bytes */
    0xFA082080u,    /* 5 bytes */
    0x82082080u     /* 6 bytes */
};
67
68 /* END OF UTF-8 Conversion DATA */
69
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 * Lengths 5 and 6 are listed so that such sequences are consumed as a unit;
 * the converters reject them later through utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: trail bytes, not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: invalid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80
/*
 * Since Unicode 3.0.1 an N-byte UTF-8 sequence is only legal when it encodes
 * a code point that is at least utf8_minChar32[N] (shortest-form rule).
 * Sequences of 5 or 6 bytes are never legal; their entries hold a value no
 * code point can reach, so the range test in the converters always fails.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* N = 0 (placeholder) */
    0,              /* N = 1 */
    0x80,           /* N = 2 */
    0x800,          /* N = 3 */
    0x10000,        /* N = 4 */
    0xffffffff,     /* N = 5: always rejected */
    0xffffffff      /* N = 6: always rejected */
};
89
hasCESU8Data(const UConverter * cnv)90 static UBool hasCESU8Data(const UConverter *cnv)
91 {
92 #if UCONFIG_NO_NON_HTML5_CONVERSION
93 return FALSE;
94 #else
95 return (UBool)(cnv->sharedData == &_CESU8Data);
96 #endif
97 }
98
/*
 * Streaming UTF-8 / CESU-8 to UTF-16 conversion, without offsets.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit; both pointers are updated on return.
 *
 * State kept in the converter between calls:
 *  - a sequence cut off by the end of the input is parked in
 *    toUnicodeStatus (value accumulated so far), mode (expected length),
 *    toULength (bytes seen) and toUBytes (the bytes themselves);
 *  - a trail surrogate that did not fit is placed in UCharErrorBuffer.
 *
 * Sets *err to U_ILLEGAL_CHAR_FOUND for a malformed sequence (its bytes are in
 * toUBytes, their count in toULength) and to U_BUFFER_OVERFLOW_ERROR when the
 * target is full while input remains.
 */
static void ucnv_toUnicode_UTF8(UConverterToUnicodeArgs *args,
                                UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    UChar *dst = args->target;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *seqBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t cp, trail = 0;
    int32_t have, need;     /* bytes consumed so far / bytes the lead byte calls for */

    /* Pick up a sequence that was interrupted by the end of the previous buffer. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        need = cnv->mode;
        have = cnv->toULength;
        cnv->toULength = 0;

        cp = cnv->toUnicodeStatus;
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        cp = *src++;
        if (cp < 0x80) {
            /* ASCII passes straight through */
            *dst++ = (UChar) cp;
            continue;
        }

        /* remember the first byte and look up how long the sequence should be */
        seqBytes[0] = (char) cp;
        need = bytesFromUTF8[cp];
        have = 1;

morebytes:
        while (have < need) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial result */
                cnv->toUnicodeStatus = cp;
                cnv->mode = need;
                cnv->toULength = (int8_t) have;
                goto donefornow;
            }
            seqBytes[have] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break;      /* leaves have < need, detected below */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++have;
        }

        /* strip the lead/trail marker bits that were accumulated along the way */
        cp -= offsetsFromUTF8[need];

        /*
         * A sequence is accepted only if it
         *  - has exactly as many trail bytes as its lead byte announced,
         *  - encodes a code point <= U+10FFFF,
         *  - is the shortest form (which also rules out 5- and 6-byte forms),
         *  - UTF-8: does not encode a surrogate (Unicode 3.2 and later);
         *    CESU-8: is at most 3 bytes long (surrogates are encoded directly).
         */
        if (have != need || cp > MAXIMUM_UTF || cp < utf8_minChar32[have] ||
            (isCESU8 ? have > 3 : U_IS_SURROGATE(cp))) {
            cnv->toULength = (int8_t) have;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (cp <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit */
            *dst++ = (UChar) cp;
        } else {
            /* supplementary code point: emit a surrogate pair */
            cp -= HALF_BASE;
            *dst++ = (UChar) ((cp >> HALF_SHIFT) + SURROGATE_HIGH_START);
            cp = (cp & HALF_MASK) + SURROGATE_LOW_START;
            if (dst < dstEnd) {
                *dst++ = (UChar) cp;
            } else {
                /* no room for the trail surrogate: hand it to the overflow buffer */
                cnv->UCharErrorBuffer[0] = (UChar) cp;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* target filled up before the input ran out */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
}
226
/*
 * Streaming UTF-8 / CESU-8 to UTF-16 conversion that also fills args->offsets.
 *
 * Same conversion and state handling as ucnv_toUnicode_UTF8(). In addition,
 * one int32_t is stored per UChar written to the target: the running source
 * index at which the byte sequence producing that UChar began. The index
 * starts at 0 on every call and advances by the length of each converted
 * sequence. args->offsets is advanced along with args->source and
 * args->target.
 *
 * Sets *err to U_ILLEGAL_CHAR_FOUND for a malformed sequence and to
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                              UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    UChar *dst = args->target;
    int32_t *offs = args->offsets;
    int32_t srcIndex = 0;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *seqBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t cp, trail = 0;
    int32_t have, need;     /* bytes consumed so far / bytes the lead byte calls for */

    /* Pick up a sequence that was interrupted by the end of the previous buffer. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        need = cnv->mode;
        have = cnv->toULength;
        cnv->toULength = 0;

        cp = cnv->toUnicodeStatus;
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        cp = *src++;
        if (cp < 0x80) {
            /* ASCII passes straight through */
            *dst++ = (UChar) cp;
            *offs++ = srcIndex++;
            continue;
        }

        seqBytes[0] = (char) cp;
        need = bytesFromUTF8[cp];
        have = 1;

morebytes:
        while (have < need) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial result */
                cnv->toUnicodeStatus = cp;
                cnv->mode = need;
                cnv->toULength = (int8_t) have;
                goto donefornow;
            }
            seqBytes[have] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break;      /* leaves have < need, detected below */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++have;
        }

        /* strip the lead/trail marker bits that were accumulated along the way */
        cp -= offsetsFromUTF8[need];

        /*
         * Acceptance rules: complete sequence, code point <= U+10FFFF,
         * shortest form (no 5-/6-byte forms), and
         *  - UTF-8: not a surrogate (Unicode 3.2 and later);
         *  - CESU-8: at most 3 bytes (surrogates are encoded directly).
         */
        if (have != need || cp > MAXIMUM_UTF || cp < utf8_minChar32[have] ||
            (isCESU8 ? have > 3 : U_IS_SURROGATE(cp))) {
            cnv->toULength = (int8_t) have;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (cp <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit */
            *dst++ = (UChar) cp;
            *offs++ = srcIndex;
        } else {
            /* supplementary code point: both surrogates share one offset */
            cp -= HALF_BASE;
            *dst++ = (UChar) ((cp >> HALF_SHIFT) + SURROGATE_HIGH_START);
            *offs++ = srcIndex;
            cp = (cp & HALF_MASK) + SURROGATE_LOW_START;
            if (dst < dstEnd) {
                *dst++ = (UChar) cp;
                *offs++ = srcIndex;
            } else {
                /*
                 * No room for the trail surrogate. There is no break here:
                 * the target is full, so the loop condition ends the loop.
                 */
                cnv->UCharErrorBuffer[0] = (UChar) cp;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        srcIndex += have;
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* target filled up before the input ran out */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
    args->offsets = offs;
}
356
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)357 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
358 UErrorCode * err)
359 {
360 UConverter *cnv = args->converter;
361 const UChar *mySource = args->source;
362 const UChar *sourceLimit = args->sourceLimit;
363 uint8_t *myTarget = (uint8_t *) args->target;
364 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
365 uint8_t *tempPtr;
366 UChar32 ch;
367 uint8_t tempBuf[4];
368 int32_t indexToWrite;
369 UBool isNotCESU8 = !hasCESU8Data(cnv);
370
371 if (cnv->fromUChar32 && myTarget < targetLimit)
372 {
373 ch = cnv->fromUChar32;
374 cnv->fromUChar32 = 0;
375 goto lowsurrogate;
376 }
377
378 while (mySource < sourceLimit && myTarget < targetLimit)
379 {
380 ch = *(mySource++);
381
382 if (ch < 0x80) /* Single byte */
383 {
384 *(myTarget++) = (uint8_t) ch;
385 }
386 else if (ch < 0x800) /* Double byte */
387 {
388 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
389 if (myTarget < targetLimit)
390 {
391 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
392 }
393 else
394 {
395 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
396 cnv->charErrorBufferLength = 1;
397 *err = U_BUFFER_OVERFLOW_ERROR;
398 }
399 }
400 else {
401 /* Check for surrogates */
402 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
403 lowsurrogate:
404 if (mySource < sourceLimit) {
405 /* test both code units */
406 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
407 /* convert and consume this supplementary code point */
408 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
409 ++mySource;
410 /* exit this condition tree */
411 }
412 else {
413 /* this is an unpaired trail or lead code unit */
414 /* callback(illegal) */
415 cnv->fromUChar32 = ch;
416 *err = U_ILLEGAL_CHAR_FOUND;
417 break;
418 }
419 }
420 else {
421 /* no more input */
422 cnv->fromUChar32 = ch;
423 break;
424 }
425 }
426
427 /* Do we write the buffer directly for speed,
428 or do we have to be careful about target buffer space? */
429 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
430
431 if (ch <= MAXIMUM_UCS2) {
432 indexToWrite = 2;
433 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
434 }
435 else {
436 indexToWrite = 3;
437 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
438 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
439 }
440 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
441 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
442
443 if (tempPtr == myTarget) {
444 /* There was enough space to write the codepoint directly. */
445 myTarget += (indexToWrite + 1);
446 }
447 else {
448 /* We might run out of room soon. Write it slowly. */
449 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
450 if (myTarget < targetLimit) {
451 *(myTarget++) = *tempPtr;
452 }
453 else {
454 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
455 *err = U_BUFFER_OVERFLOW_ERROR;
456 }
457 }
458 }
459 }
460 }
461
462 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
463 {
464 *err = U_BUFFER_OVERFLOW_ERROR;
465 }
466
467 args->target = (char *) myTarget;
468 args->source = mySource;
469 }
470
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)471 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
472 UErrorCode * err)
473 {
474 UConverter *cnv = args->converter;
475 const UChar *mySource = args->source;
476 int32_t *myOffsets = args->offsets;
477 const UChar *sourceLimit = args->sourceLimit;
478 uint8_t *myTarget = (uint8_t *) args->target;
479 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
480 uint8_t *tempPtr;
481 UChar32 ch;
482 int32_t offsetNum, nextSourceIndex;
483 int32_t indexToWrite;
484 uint8_t tempBuf[4];
485 UBool isNotCESU8 = !hasCESU8Data(cnv);
486
487 if (cnv->fromUChar32 && myTarget < targetLimit)
488 {
489 ch = cnv->fromUChar32;
490 cnv->fromUChar32 = 0;
491 offsetNum = -1;
492 nextSourceIndex = 0;
493 goto lowsurrogate;
494 } else {
495 offsetNum = 0;
496 }
497
498 while (mySource < sourceLimit && myTarget < targetLimit)
499 {
500 ch = *(mySource++);
501
502 if (ch < 0x80) /* Single byte */
503 {
504 *(myOffsets++) = offsetNum++;
505 *(myTarget++) = (char) ch;
506 }
507 else if (ch < 0x800) /* Double byte */
508 {
509 *(myOffsets++) = offsetNum;
510 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
511 if (myTarget < targetLimit)
512 {
513 *(myOffsets++) = offsetNum++;
514 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
515 }
516 else
517 {
518 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
519 cnv->charErrorBufferLength = 1;
520 *err = U_BUFFER_OVERFLOW_ERROR;
521 }
522 }
523 else
524 /* Check for surrogates */
525 {
526 nextSourceIndex = offsetNum + 1;
527
528 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
529 lowsurrogate:
530 if (mySource < sourceLimit) {
531 /* test both code units */
532 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
533 /* convert and consume this supplementary code point */
534 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
535 ++mySource;
536 ++nextSourceIndex;
537 /* exit this condition tree */
538 }
539 else {
540 /* this is an unpaired trail or lead code unit */
541 /* callback(illegal) */
542 cnv->fromUChar32 = ch;
543 *err = U_ILLEGAL_CHAR_FOUND;
544 break;
545 }
546 }
547 else {
548 /* no more input */
549 cnv->fromUChar32 = ch;
550 break;
551 }
552 }
553
554 /* Do we write the buffer directly for speed,
555 or do we have to be careful about target buffer space? */
556 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
557
558 if (ch <= MAXIMUM_UCS2) {
559 indexToWrite = 2;
560 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
561 }
562 else {
563 indexToWrite = 3;
564 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
565 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
566 }
567 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
568 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
569
570 if (tempPtr == myTarget) {
571 /* There was enough space to write the codepoint directly. */
572 myTarget += (indexToWrite + 1);
573 myOffsets[0] = offsetNum;
574 myOffsets[1] = offsetNum;
575 myOffsets[2] = offsetNum;
576 if (indexToWrite >= 3) {
577 myOffsets[3] = offsetNum;
578 }
579 myOffsets += (indexToWrite + 1);
580 }
581 else {
582 /* We might run out of room soon. Write it slowly. */
583 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
584 if (myTarget < targetLimit)
585 {
586 *(myOffsets++) = offsetNum;
587 *(myTarget++) = *tempPtr;
588 }
589 else
590 {
591 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
592 *err = U_BUFFER_OVERFLOW_ERROR;
593 }
594 }
595 }
596 offsetNum = nextSourceIndex;
597 }
598 }
599
600 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
601 {
602 *err = U_BUFFER_OVERFLOW_ERROR;
603 }
604
605 args->target = (char *) myTarget;
606 args->source = mySource;
607 args->offsets = myOffsets;
608 }
609
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)610 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
611 UErrorCode *err) {
612 UConverter *cnv;
613 const uint8_t *sourceInitial;
614 const uint8_t *source;
615 uint16_t extraBytesToWrite;
616 uint8_t myByte;
617 UChar32 ch;
618 int8_t i, isLegalSequence;
619
620 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
621
622 cnv = args->converter;
623 sourceInitial = source = (const uint8_t *)args->source;
624 if (source >= (const uint8_t *)args->sourceLimit)
625 {
626 /* no input */
627 *err = U_INDEX_OUTOFBOUNDS_ERROR;
628 return 0xffff;
629 }
630
631 myByte = (uint8_t)*(source++);
632 if (myByte < 0x80)
633 {
634 args->source = (const char *)source;
635 return (UChar32)myByte;
636 }
637
638 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
639 if (extraBytesToWrite == 0) {
640 cnv->toUBytes[0] = myByte;
641 cnv->toULength = 1;
642 *err = U_ILLEGAL_CHAR_FOUND;
643 args->source = (const char *)source;
644 return 0xffff;
645 }
646
647 /*The byte sequence is longer than the buffer area passed*/
648 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
649 {
650 /* check if all of the remaining bytes are trail bytes */
651 cnv->toUBytes[0] = myByte;
652 i = 1;
653 *err = U_TRUNCATED_CHAR_FOUND;
654 while(source < (const uint8_t *)args->sourceLimit) {
655 if(U8_IS_TRAIL(myByte = *source)) {
656 cnv->toUBytes[i++] = myByte;
657 ++source;
658 } else {
659 /* error even before we run out of input */
660 *err = U_ILLEGAL_CHAR_FOUND;
661 break;
662 }
663 }
664 cnv->toULength = i;
665 args->source = (const char *)source;
666 return 0xffff;
667 }
668
669 isLegalSequence = 1;
670 ch = myByte << 6;
671 switch(extraBytesToWrite)
672 {
673 /* note: code falls through cases! (sic)*/
674 case 6:
675 ch += (myByte = *source);
676 ch <<= 6;
677 if (!U8_IS_TRAIL(myByte))
678 {
679 isLegalSequence = 0;
680 break;
681 }
682 ++source;
683 case 5: /*fall through*/
684 ch += (myByte = *source);
685 ch <<= 6;
686 if (!U8_IS_TRAIL(myByte))
687 {
688 isLegalSequence = 0;
689 break;
690 }
691 ++source;
692 case 4: /*fall through*/
693 ch += (myByte = *source);
694 ch <<= 6;
695 if (!U8_IS_TRAIL(myByte))
696 {
697 isLegalSequence = 0;
698 break;
699 }
700 ++source;
701 case 3: /*fall through*/
702 ch += (myByte = *source);
703 ch <<= 6;
704 if (!U8_IS_TRAIL(myByte))
705 {
706 isLegalSequence = 0;
707 break;
708 }
709 ++source;
710 case 2: /*fall through*/
711 ch += (myByte = *source);
712 if (!U8_IS_TRAIL(myByte))
713 {
714 isLegalSequence = 0;
715 break;
716 }
717 ++source;
718 };
719 ch -= offsetsFromUTF8[extraBytesToWrite];
720 args->source = (const char *)source;
721
722 /*
723 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
724 * - use only trail bytes after a lead byte (checked above)
725 * - use the right number of trail bytes for a given lead byte
726 * - encode a code point <= U+10ffff
727 * - use the fewest possible number of bytes for their code points
728 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
729 *
730 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
731 * There are no irregular sequences any more.
732 */
733 if (isLegalSequence &&
734 (uint32_t)ch <= MAXIMUM_UTF &&
735 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
736 !U_IS_SURROGATE(ch)
737 ) {
738 return ch; /* return the code point */
739 }
740
741 for(i = 0; sourceInitial < source; ++i) {
742 cnv->toUBytes[i] = *sourceInitial++;
743 }
744 cnv->toULength = i;
745 *err = U_ILLEGAL_CHAR_FOUND;
746 return 0xffff;
747 }
748
749 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
750
751 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
752 static const UChar32
753 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
754
755 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
756 static const UChar32
757 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
758
759 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
760 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)761 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
762 UConverterToUnicodeArgs *pToUArgs,
763 UErrorCode *pErrorCode) {
764 UConverter *utf8;
765 const uint8_t *source, *sourceLimit;
766 uint8_t *target;
767 int32_t targetCapacity;
768 int32_t count;
769
770 int8_t oldToULength, toULength, toULimit;
771
772 UChar32 c;
773 uint8_t b, t1, t2;
774
775 /* set up the local pointers */
776 utf8=pToUArgs->converter;
777 source=(uint8_t *)pToUArgs->source;
778 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
779 target=(uint8_t *)pFromUArgs->target;
780 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
781
782 /* get the converter state from the UTF-8 UConverter */
783 c=(UChar32)utf8->toUnicodeStatus;
784 if(c!=0) {
785 toULength=oldToULength=utf8->toULength;
786 toULimit=(int8_t)utf8->mode;
787 } else {
788 toULength=oldToULength=toULimit=0;
789 }
790
791 count=(int32_t)(sourceLimit-source)+oldToULength;
792 if(count<toULimit) {
793 /*
794 * Not enough input to complete the partial character.
795 * Jump to moreBytes below - it will not output to target.
796 */
797 } else if(targetCapacity<toULimit) {
798 /*
799 * Not enough target capacity to output the partial character.
800 * Let the standard converter handle this.
801 */
802 *pErrorCode=U_USING_DEFAULT_WARNING;
803 return;
804 } else {
805 /*
806 * Use a single counter for source and target, counting the minimum of
807 * the source length and the target capacity.
808 * As a result, the source length is checked only once per multi-byte
809 * character instead of twice.
810 *
811 * Make sure that the last byte sequence is complete, or else
812 * stop just before it.
813 * (The longest legal byte sequence has 3 trail bytes.)
814 * Count oldToULength (number of source bytes from a previous buffer)
815 * into the source length but reduce the source index by toULimit
816 * while going back over trail bytes in order to not go back into
817 * the bytes that will be read for finishing a partial
818 * sequence from the previous buffer.
819 * Let the standard converter handle edge cases.
820 */
821 int32_t i;
822
823 if(count>targetCapacity) {
824 count=targetCapacity;
825 }
826
827 i=0;
828 while(i<3 && i<(count-toULimit)) {
829 b=source[count-oldToULength-i-1];
830 if(U8_IS_TRAIL(b)) {
831 ++i;
832 } else {
833 if(i<U8_COUNT_TRAIL_BYTES(b)) {
834 /* stop converting before the lead byte if there are not enough trail bytes for it */
835 count-=i+1;
836 }
837 break;
838 }
839 }
840 }
841
842 if(c!=0) {
843 utf8->toUnicodeStatus=0;
844 utf8->toULength=0;
845 goto moreBytes;
846 /* See note in ucnv_SBCSFromUTF8() about this goto. */
847 }
848
849 /* conversion loop */
850 while(count>0) {
851 b=*source++;
852 if((int8_t)b>=0) {
853 /* convert ASCII */
854 *target++=b;
855 --count;
856 continue;
857 } else {
858 if(b>0xe0) {
859 if( /* handle U+1000..U+D7FF inline */
860 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
861 (b==0xed && (t1 <= 0x9f))) &&
862 (t2=source[1]) >= 0x80 && t2 <= 0xbf
863 ) {
864 source+=2;
865 *target++=b;
866 *target++=t1;
867 *target++=t2;
868 count-=3;
869 continue;
870 }
871 } else if(b<0xe0) {
872 if( /* handle U+0080..U+07FF inline */
873 b>=0xc2 &&
874 (t1=*source) >= 0x80 && t1 <= 0xbf
875 ) {
876 ++source;
877 *target++=b;
878 *target++=t1;
879 count-=2;
880 continue;
881 }
882 } else if(b==0xe0) {
883 if( /* handle U+0800..U+0FFF inline */
884 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
885 (t2=source[1]) >= 0x80 && t2 <= 0xbf
886 ) {
887 source+=2;
888 *target++=b;
889 *target++=t1;
890 *target++=t2;
891 count-=3;
892 continue;
893 }
894 }
895
896 /* handle "complicated" and error cases, and continuing partial characters */
897 oldToULength=0;
898 toULength=1;
899 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
900 c=b;
901 moreBytes:
902 while(toULength<toULimit) {
903 if(source<sourceLimit) {
904 b=*source;
905 if(U8_IS_TRAIL(b)) {
906 ++source;
907 ++toULength;
908 c=(c<<6)+b;
909 } else {
910 break; /* sequence too short, stop with toULength<toULimit */
911 }
912 } else {
913 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
914 source-=(toULength-oldToULength);
915 while(oldToULength<toULength) {
916 utf8->toUBytes[oldToULength++]=*source++;
917 }
918 utf8->toUnicodeStatus=c;
919 utf8->toULength=toULength;
920 utf8->mode=toULimit;
921 pToUArgs->source=(char *)source;
922 pFromUArgs->target=(char *)target;
923 return;
924 }
925 }
926
927 if( toULength==toULimit && /* consumed all trail bytes */
928 (toULength==3 || toULength==2) && /* BMP */
929 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
930 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
931 ) {
932 /* legal byte sequence for BMP code point */
933 } else if(
934 toULength==toULimit && toULength==4 &&
935 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
936 ) {
937 /* legal byte sequence for supplementary code point */
938 } else {
939 /* error handling: illegal UTF-8 byte sequence */
940 source-=(toULength-oldToULength);
941 while(oldToULength<toULength) {
942 utf8->toUBytes[oldToULength++]=*source++;
943 }
944 utf8->toULength=toULength;
945 pToUArgs->source=(char *)source;
946 pFromUArgs->target=(char *)target;
947 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
948 return;
949 }
950
951 /* copy the legal byte sequence to the target */
952 {
953 int8_t i;
954
955 for(i=0; i<oldToULength; ++i) {
956 *target++=utf8->toUBytes[i];
957 }
958 source-=(toULength-oldToULength);
959 for(; i<toULength; ++i) {
960 *target++=*source++;
961 }
962 count-=toULength;
963 }
964 }
965 }
966
967 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
968 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
969 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
970 } else {
971 b=*source;
972 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
973 if(toULimit>(sourceLimit-source)) {
974 /* collect a truncated byte sequence */
975 toULength=0;
976 c=b;
977 for(;;) {
978 utf8->toUBytes[toULength++]=b;
979 if(++source==sourceLimit) {
980 /* partial byte sequence at end of source */
981 utf8->toUnicodeStatus=c;
982 utf8->toULength=toULength;
983 utf8->mode=toULimit;
984 break;
985 } else if(!U8_IS_TRAIL(b=*source)) {
986 /* lead byte in trail byte position */
987 utf8->toULength=toULength;
988 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
989 break;
990 }
991 c=(c<<6)+b;
992 }
993 } else {
994 /* partial-sequence target overflow: fall back to the pivoting implementation */
995 *pErrorCode=U_USING_DEFAULT_WARNING;
996 }
997 }
998 }
999
1000 /* write back the updated pointers */
1001 pToUArgs->source=(char *)source;
1002 pFromUArgs->target=(char *)target;
1003 }
1004
1005 /* UTF-8 converter data ----------------------------------------------------- */
1006
1007 static const UConverterImpl _UTF8Impl={
1008 UCNV_UTF8,
1009
1010 NULL,
1011 NULL,
1012
1013 NULL,
1014 NULL,
1015 NULL,
1016
1017 ucnv_toUnicode_UTF8,
1018 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1019 ucnv_fromUnicode_UTF8,
1020 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1021 ucnv_getNextUChar_UTF8,
1022
1023 NULL,
1024 NULL,
1025 NULL,
1026 NULL,
1027 ucnv_getNonSurrogateUnicodeSet,
1028
1029 ucnv_UTF8FromUTF8,
1030 ucnv_UTF8FromUTF8
1031 };
1032
1033 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1034 static const UConverterStaticData _UTF8StaticData={
1035 sizeof(UConverterStaticData),
1036 "UTF-8",
1037 1208, UCNV_IBM, UCNV_UTF8,
1038 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1039 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1040 0,
1041 0,
1042 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1043 };
1044
1045
1046 const UConverterSharedData _UTF8Data={
1047 sizeof(UConverterSharedData), ~((uint32_t) 0),
1048 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
1049 0
1050 };
1051
1052 /* CESU-8 converter data ---------------------------------------------------- */
1053
1054 static const UConverterImpl _CESU8Impl={
1055 UCNV_CESU8,
1056
1057 NULL,
1058 NULL,
1059
1060 NULL,
1061 NULL,
1062 NULL,
1063
1064 ucnv_toUnicode_UTF8,
1065 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1066 ucnv_fromUnicode_UTF8,
1067 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1068 NULL,
1069
1070 NULL,
1071 NULL,
1072 NULL,
1073 NULL,
1074 ucnv_getCompleteUnicodeSet
1075 };
1076
1077 static const UConverterStaticData _CESU8StaticData={
1078 sizeof(UConverterStaticData),
1079 "CESU-8",
1080 9400, /* CCSID for CESU-8 */
1081 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1082 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1083 0,
1084 0,
1085 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1086 };
1087
1088
1089 const UConverterSharedData _CESU8Data={
1090 sizeof(UConverterSharedData), ~((uint32_t) 0),
1091 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
1092 0
1093 };
1094
1095 #endif
1096