1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u8.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
15 *
16 * Also, CESU-8 implementation, see UTR 26.
17 * The CESU-8 converter uses all the same functions as the
18 * UTF-8 converter, with a branch for converting supplementary code points.
19 */
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_CONVERSION
24
25 #include "unicode/ucnv.h"
26 #include "ucnv_bld.h"
27 #include "ucnv_cnv.h"
28 #include "cmemory.h"
29
30 /* Prototypes --------------------------------------------------------------- */
31
32 /* Keep these here to make finicky compilers happy */
33
34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
35 UErrorCode *err);
36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
37 UErrorCode *err);
38
39
40 /* UTF-8 -------------------------------------------------------------------- */
41
42 /* UTF-8 Conversion DATA
43 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
44 */
45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/*
 * Code point limits and UTF-16 surrogate arithmetic constants used by the
 * converters in this file.
 */
#define MAXIMUM_UCS2            0xFFFF      /* largest value written as a single UChar */
#define MAXIMUM_UTF             0x10FFFF    /* largest code point accepted by the decoders */
#define MAXIMUM_UCS4            0x7FFFFFFF
#define HALF_SHIFT              10          /* payload bits per surrogate */
#define HALF_BASE               0x10000     /* subtracted before splitting into surrogates */
#define HALF_MASK               0x3FF       /* low HALF_SHIFT bits */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START, i.e. 0x10000 - 0xDC00 */
#define SURROGATE_LOW_BASE      9216
59
/*
 * Bias removed from the raw accumulator after a whole sequence has been
 * folded together with ch = (ch << 6) + trail; indexed by the sequence
 * length looked up in bytesFromUTF8[].
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000U,    /* length 0 (byte is not a lead byte) */
    0x00000000U,    /* length 1 */
    0x00003080U,    /* length 2 */
    0x000E2080U,    /* length 3 */
    0x03C82080U,    /* length 4 */
    0xFA082080U,    /* length 5 */
    0x82082080U     /* length 6 */
};
64
65 /* END OF UTF-8 Conversion DATA */
66
/*
 * Total sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single bytes */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: trail bytes, never a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte leads */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte leads */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: none */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
77
/*
 * Since Unicode 3.0.1 a UTF-8 sequence of N bytes must encode a code point
 * that is at least utf8_minChar32[N] (shortest-form rule).
 * Sequences longer than 4 bytes are illegal; their entries hold a value no
 * code point can reach, so the ">= minimum" test always rejects them.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* N=0: not a sequence */
    0,              /* N=1 */
    0x80,           /* N=2 */
    0x800,          /* N=3 */
    0x10000,        /* N=4 */
    0xffffffff,     /* N=5: never satisfiable */
    0xffffffff      /* N=6: never satisfiable */
};
86
/*
 * Decode UTF-8 (or CESU-8) bytes into UTF-16 code units; no offsets are
 * written.
 *
 * args - supplies the converter and the source/sourceLimit and
 *        target/targetLimit windows. args->source and args->target are
 *        updated to where decoding stopped.
 * err  - set to U_ILLEGAL_CHAR_FOUND for a malformed sequence (its byte
 *        count is left in cnv->toULength, the bytes in cnv->toUBytes), or to
 *        U_BUFFER_OVERFLOW_ERROR when the target fills up while input remains.
 *
 * A sequence cut short by sourceLimit is parked in the converter
 * (toUnicodeStatus = accumulator so far, mode = announced length,
 * toULength = bytes seen) and resumed at the start of the next call.
 */
static void ucnv_toUnicode_UTF8(UConverterToUnicodeArgs *args,
                                UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *pending = cnv->toUBytes;
    UBool cesu8 = (UBool)(cnv->sharedData == &_CESU8Data);
    uint32_t ch, trail = 0;
    int32_t seen, need;

    /* Pick up a sequence that was split across two calls. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        need = cnv->mode;               /* announced sequence length */
        seen = cnv->toULength;          /* bytes already folded into ch */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* accumulator from the previous call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        ch = *src++;
        if (ch < 0x80) {
            /* ASCII passes straight through */
            *dst++ = (UChar) ch;
            continue;
        }

        /* remember the lead byte and look up how long the sequence claims to be */
        pending[0] = (char) ch;
        need = bytesFromUTF8[ch];
        seen = 1;

morebytes:
        while (seen < need) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save state for the next call */
                cnv->toUnicodeStatus = ch;
                cnv->mode = need;
                cnv->toULength = (int8_t) seen;
                goto donefornow;
            }
            trail = *src;
            pending[seen] = (char) trail;
            if (!UTF8_IS_TRAIL(trail)) {
                break;  /* leaves seen < need, which is rejected below */
            }
            ch = (ch << 6) + trail;
            ++src;
            ++seen;
        }

        /* strip the lead/trail marker bits that were accumulated along the way */
        ch -= offsetsFromUTF8[need];

        /*
         * Reject anything that is not well-formed:
         * - too few trail bytes (seen != need)
         * - code point above U+10FFFF
         * - non-shortest form (below utf8_minChar32[]; this also rejects
         *   sequences of 5 or more bytes)
         * - UTF-8: an encoded surrogate code point
         * - CESU-8: any sequence longer than 3 bytes (surrogates are allowed)
         */
        if (seen != need || ch > MAXIMUM_UTF || ch < utf8_minChar32[seen] ||
            (cesu8 ? seen > 3 : UTF_IS_SURROGATE(ch))) {
            cnv->toULength = (int8_t) seen;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (ch <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit is enough */
            *dst++ = (UChar) ch;
            continue;
        }

        /* supplementary code point: emit a surrogate pair */
        ch -= HALF_BASE;
        *dst++ = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
        ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
        if (dst >= dstEnd) {
            /* no room for the trail surrogate: hand it to the overflow buffer */
            cnv->UCharErrorBuffer[0] = (UChar) ch;
            cnv->UCharErrorBufferLength = 1;
            *err = U_BUFFER_OVERFLOW_ERROR;
            break;
        }
        *dst++ = (UChar) ch;
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* input left over but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
}
214
/*
 * Decode UTF-8 (or CESU-8) bytes into UTF-16 code units and, for every code
 * unit written, record in args->offsets the running source index of the
 * sequence it came from.
 *
 * args - supplies the converter and the source, target and offsets windows;
 *        args->source, args->target and args->offsets are updated on return.
 * err  - set to U_ILLEGAL_CHAR_FOUND for a malformed sequence (byte count in
 *        cnv->toULength), or to U_BUFFER_OVERFLOW_ERROR when the target is
 *        full while input remains.
 *
 * A sequence cut short by sourceLimit is saved in the converter
 * (toUnicodeStatus, mode, toULength) and resumed on the next call.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                              UErrorCode *err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    int32_t *offs = args->offsets;
    int32_t srcIndex = 0;
    unsigned char *pending = cnv->toUBytes;
    UBool cesu8 = (UBool)(cnv->sharedData == &_CESU8Data);
    uint32_t ch, trail = 0;
    int32_t seen, need;

    /* Pick up a sequence that was split across two calls. */
    if (cnv->toUnicodeStatus && dst < dstEnd) {
        need = cnv->mode;               /* announced sequence length */
        seen = cnv->toULength;          /* bytes already folded into ch */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* accumulator from the previous call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        ch = *src++;
        if (ch < 0x80) {
            /* ASCII: one unit out, one offset out */
            *dst++ = (UChar) ch;
            *offs++ = srcIndex++;
            continue;
        }

        pending[0] = (char) ch;
        need = bytesFromUTF8[ch];
        seen = 1;

morebytes:
        while (seen < need) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save state for the next call */
                cnv->toUnicodeStatus = ch;
                cnv->mode = need;
                cnv->toULength = (int8_t) seen;
                goto donefornow;
            }
            trail = *src;
            pending[seen] = (char) trail;
            if (!UTF8_IS_TRAIL(trail)) {
                break;  /* leaves seen < need, which is rejected below */
            }
            ch = (ch << 6) + trail;
            ++src;
            ++seen;
        }

        /* strip the lead/trail marker bits that were accumulated along the way */
        ch -= offsetsFromUTF8[need];

        /*
         * Reject anything that is not well-formed:
         * - too few trail bytes (seen != need)
         * - code point above U+10FFFF
         * - non-shortest form (below utf8_minChar32[]; this also rejects
         *   sequences of 5 or more bytes)
         * - UTF-8: an encoded surrogate code point
         * - CESU-8: any sequence longer than 3 bytes (surrogates are allowed)
         */
        if (seen != need || ch > MAXIMUM_UTF || ch < utf8_minChar32[seen] ||
            (cesu8 ? seen > 3 : UTF_IS_SURROGATE(ch))) {
            cnv->toULength = (int8_t) seen;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (ch <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit is enough */
            *dst++ = (UChar) ch;
            *offs++ = srcIndex;
        } else {
            /* supplementary code point: both surrogates share one offset */
            ch -= HALF_BASE;
            *dst++ = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
            *offs++ = srcIndex;
            ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
            if (dst >= dstEnd) {
                /* no room for the trail surrogate: hand it to the overflow buffer;
                   the loop condition ends the conversion right after this */
                cnv->UCharErrorBuffer[0] = (UChar) ch;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            } else {
                *dst++ = (UChar) ch;
                *offs++ = srcIndex;
            }
        }
        srcIndex += seen;
    }

donefornow:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* input left over but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
    args->offsets = offs;
}
344
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
346 UErrorCode * err)
347 {
348 UConverter *cnv = args->converter;
349 const UChar *mySource = args->source;
350 const UChar *sourceLimit = args->sourceLimit;
351 uint8_t *myTarget = (uint8_t *) args->target;
352 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
353 uint8_t *tempPtr;
354 UChar32 ch;
355 uint8_t tempBuf[4];
356 int32_t indexToWrite;
357 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
358
359 if (cnv->fromUChar32 && myTarget < targetLimit)
360 {
361 ch = cnv->fromUChar32;
362 cnv->fromUChar32 = 0;
363 goto lowsurrogate;
364 }
365
366 while (mySource < sourceLimit && myTarget < targetLimit)
367 {
368 ch = *(mySource++);
369
370 if (ch < 0x80) /* Single byte */
371 {
372 *(myTarget++) = (uint8_t) ch;
373 }
374 else if (ch < 0x800) /* Double byte */
375 {
376 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
377 if (myTarget < targetLimit)
378 {
379 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
380 }
381 else
382 {
383 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
384 cnv->charErrorBufferLength = 1;
385 *err = U_BUFFER_OVERFLOW_ERROR;
386 }
387 }
388 else {
389 /* Check for surrogates */
390 if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
391 lowsurrogate:
392 if (mySource < sourceLimit) {
393 /* test both code units */
394 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
395 /* convert and consume this supplementary code point */
396 ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
397 ++mySource;
398 /* exit this condition tree */
399 }
400 else {
401 /* this is an unpaired trail or lead code unit */
402 /* callback(illegal) */
403 cnv->fromUChar32 = ch;
404 *err = U_ILLEGAL_CHAR_FOUND;
405 break;
406 }
407 }
408 else {
409 /* no more input */
410 cnv->fromUChar32 = ch;
411 break;
412 }
413 }
414
415 /* Do we write the buffer directly for speed,
416 or do we have to be careful about target buffer space? */
417 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
418
419 if (ch <= MAXIMUM_UCS2) {
420 indexToWrite = 2;
421 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
422 }
423 else {
424 indexToWrite = 3;
425 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
426 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
427 }
428 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
429 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
430
431 if (tempPtr == myTarget) {
432 /* There was enough space to write the codepoint directly. */
433 myTarget += (indexToWrite + 1);
434 }
435 else {
436 /* We might run out of room soon. Write it slowly. */
437 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
438 if (myTarget < targetLimit) {
439 *(myTarget++) = *tempPtr;
440 }
441 else {
442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
443 *err = U_BUFFER_OVERFLOW_ERROR;
444 }
445 }
446 }
447 }
448 }
449
450 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
451 {
452 *err = U_BUFFER_OVERFLOW_ERROR;
453 }
454
455 args->target = (char *) myTarget;
456 args->source = mySource;
457 }
458
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
460 UErrorCode * err)
461 {
462 UConverter *cnv = args->converter;
463 const UChar *mySource = args->source;
464 int32_t *myOffsets = args->offsets;
465 const UChar *sourceLimit = args->sourceLimit;
466 uint8_t *myTarget = (uint8_t *) args->target;
467 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
468 uint8_t *tempPtr;
469 UChar32 ch;
470 int32_t offsetNum, nextSourceIndex;
471 int32_t indexToWrite;
472 uint8_t tempBuf[4];
473 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
474
475 if (cnv->fromUChar32 && myTarget < targetLimit)
476 {
477 ch = cnv->fromUChar32;
478 cnv->fromUChar32 = 0;
479 offsetNum = -1;
480 nextSourceIndex = 0;
481 goto lowsurrogate;
482 } else {
483 offsetNum = 0;
484 }
485
486 while (mySource < sourceLimit && myTarget < targetLimit)
487 {
488 ch = *(mySource++);
489
490 if (ch < 0x80) /* Single byte */
491 {
492 *(myOffsets++) = offsetNum++;
493 *(myTarget++) = (char) ch;
494 }
495 else if (ch < 0x800) /* Double byte */
496 {
497 *(myOffsets++) = offsetNum;
498 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
499 if (myTarget < targetLimit)
500 {
501 *(myOffsets++) = offsetNum++;
502 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
503 }
504 else
505 {
506 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
507 cnv->charErrorBufferLength = 1;
508 *err = U_BUFFER_OVERFLOW_ERROR;
509 }
510 }
511 else
512 /* Check for surrogates */
513 {
514 nextSourceIndex = offsetNum + 1;
515
516 if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
517 lowsurrogate:
518 if (mySource < sourceLimit) {
519 /* test both code units */
520 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
521 /* convert and consume this supplementary code point */
522 ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
523 ++mySource;
524 ++nextSourceIndex;
525 /* exit this condition tree */
526 }
527 else {
528 /* this is an unpaired trail or lead code unit */
529 /* callback(illegal) */
530 cnv->fromUChar32 = ch;
531 *err = U_ILLEGAL_CHAR_FOUND;
532 break;
533 }
534 }
535 else {
536 /* no more input */
537 cnv->fromUChar32 = ch;
538 break;
539 }
540 }
541
542 /* Do we write the buffer directly for speed,
543 or do we have to be careful about target buffer space? */
544 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
545
546 if (ch <= MAXIMUM_UCS2) {
547 indexToWrite = 2;
548 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
549 }
550 else {
551 indexToWrite = 3;
552 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
553 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
554 }
555 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
556 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
557
558 if (tempPtr == myTarget) {
559 /* There was enough space to write the codepoint directly. */
560 myTarget += (indexToWrite + 1);
561 myOffsets[0] = offsetNum;
562 myOffsets[1] = offsetNum;
563 myOffsets[2] = offsetNum;
564 if (indexToWrite >= 3) {
565 myOffsets[3] = offsetNum;
566 }
567 myOffsets += (indexToWrite + 1);
568 }
569 else {
570 /* We might run out of room soon. Write it slowly. */
571 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
572 if (myTarget < targetLimit)
573 {
574 *(myOffsets++) = offsetNum;
575 *(myTarget++) = *tempPtr;
576 }
577 else
578 {
579 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
580 *err = U_BUFFER_OVERFLOW_ERROR;
581 }
582 }
583 }
584 offsetNum = nextSourceIndex;
585 }
586 }
587
588 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
589 {
590 *err = U_BUFFER_OVERFLOW_ERROR;
591 }
592
593 args->target = (char *) myTarget;
594 args->source = mySource;
595 args->offsets = myOffsets;
596 }
597
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
599 UErrorCode *err) {
600 UConverter *cnv;
601 const uint8_t *sourceInitial;
602 const uint8_t *source;
603 uint16_t extraBytesToWrite;
604 uint8_t myByte;
605 UChar32 ch;
606 int8_t i, isLegalSequence;
607
608 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
609
610 cnv = args->converter;
611 sourceInitial = source = (const uint8_t *)args->source;
612 if (source >= (const uint8_t *)args->sourceLimit)
613 {
614 /* no input */
615 *err = U_INDEX_OUTOFBOUNDS_ERROR;
616 return 0xffff;
617 }
618
619 myByte = (uint8_t)*(source++);
620 if (myByte < 0x80)
621 {
622 args->source = (const char *)source;
623 return (UChar32)myByte;
624 }
625
626 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
627 if (extraBytesToWrite == 0) {
628 cnv->toUBytes[0] = myByte;
629 cnv->toULength = 1;
630 *err = U_ILLEGAL_CHAR_FOUND;
631 args->source = (const char *)source;
632 return 0xffff;
633 }
634
635 /*The byte sequence is longer than the buffer area passed*/
636 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
637 {
638 /* check if all of the remaining bytes are trail bytes */
639 cnv->toUBytes[0] = myByte;
640 i = 1;
641 *err = U_TRUNCATED_CHAR_FOUND;
642 while(source < (const uint8_t *)args->sourceLimit) {
643 if(U8_IS_TRAIL(myByte = *source)) {
644 cnv->toUBytes[i++] = myByte;
645 ++source;
646 } else {
647 /* error even before we run out of input */
648 *err = U_ILLEGAL_CHAR_FOUND;
649 break;
650 }
651 }
652 cnv->toULength = i;
653 args->source = (const char *)source;
654 return 0xffff;
655 }
656
657 isLegalSequence = 1;
658 ch = myByte << 6;
659 switch(extraBytesToWrite)
660 {
661 /* note: code falls through cases! (sic)*/
662 case 6:
663 ch += (myByte = *source);
664 ch <<= 6;
665 if (!UTF8_IS_TRAIL(myByte))
666 {
667 isLegalSequence = 0;
668 break;
669 }
670 ++source;
671 case 5:
672 ch += (myByte = *source);
673 ch <<= 6;
674 if (!UTF8_IS_TRAIL(myByte))
675 {
676 isLegalSequence = 0;
677 break;
678 }
679 ++source;
680 case 4:
681 ch += (myByte = *source);
682 ch <<= 6;
683 if (!UTF8_IS_TRAIL(myByte))
684 {
685 isLegalSequence = 0;
686 break;
687 }
688 ++source;
689 case 3:
690 ch += (myByte = *source);
691 ch <<= 6;
692 if (!UTF8_IS_TRAIL(myByte))
693 {
694 isLegalSequence = 0;
695 break;
696 }
697 ++source;
698 case 2:
699 ch += (myByte = *source);
700 if (!UTF8_IS_TRAIL(myByte))
701 {
702 isLegalSequence = 0;
703 break;
704 }
705 ++source;
706 };
707 ch -= offsetsFromUTF8[extraBytesToWrite];
708 args->source = (const char *)source;
709
710 /*
711 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
712 * - use only trail bytes after a lead byte (checked above)
713 * - use the right number of trail bytes for a given lead byte
714 * - encode a code point <= U+10ffff
715 * - use the fewest possible number of bytes for their code points
716 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
717 *
718 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
719 * There are no irregular sequences any more.
720 */
721 if (isLegalSequence &&
722 (uint32_t)ch <= MAXIMUM_UTF &&
723 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
724 !U_IS_SURROGATE(ch)
725 ) {
726 return ch; /* return the code point */
727 }
728
729 for(i = 0; sourceInitial < source; ++i) {
730 cnv->toUBytes[i] = *sourceInitial++;
731 }
732 cnv->toULength = i;
733 *err = U_ILLEGAL_CHAR_FOUND;
734 return 0xffff;
735 }
736
737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
738
739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
740 static const UChar32
741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
742
743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
744 static const UChar32
745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
746
747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
748 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
750 UConverterToUnicodeArgs *pToUArgs,
751 UErrorCode *pErrorCode) {
752 UConverter *utf8, *cnv;
753 const uint8_t *source, *sourceLimit;
754 uint8_t *target;
755 int32_t targetCapacity;
756 int32_t count;
757
758 int8_t oldToULength, toULength, toULimit;
759
760 UChar32 c;
761 uint8_t b, t1, t2;
762
763 /* set up the local pointers */
764 utf8=pToUArgs->converter;
765 cnv=pFromUArgs->converter;
766 source=(uint8_t *)pToUArgs->source;
767 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
768 target=(uint8_t *)pFromUArgs->target;
769 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
770
771 /* get the converter state from the UTF-8 UConverter */
772 c=(UChar32)utf8->toUnicodeStatus;
773 if(c!=0) {
774 toULength=oldToULength=utf8->toULength;
775 toULimit=(int8_t)utf8->mode;
776 } else {
777 toULength=oldToULength=toULimit=0;
778 }
779
780 count=(int32_t)(sourceLimit-source)+oldToULength;
781 if(count<toULimit) {
782 /*
783 * Not enough input to complete the partial character.
784 * Jump to moreBytes below - it will not output to target.
785 */
786 } else if(targetCapacity<toULimit) {
787 /*
788 * Not enough target capacity to output the partial character.
789 * Let the standard converter handle this.
790 */
791 *pErrorCode=U_USING_DEFAULT_WARNING;
792 return;
793 } else {
794 /*
795 * Use a single counter for source and target, counting the minimum of
796 * the source length and the target capacity.
797 * As a result, the source length is checked only once per multi-byte
798 * character instead of twice.
799 *
800 * Make sure that the last byte sequence is complete, or else
801 * stop just before it.
802 * (The longest legal byte sequence has 3 trail bytes.)
803 * Count oldToULength (number of source bytes from a previous buffer)
804 * into the source length but reduce the source index by toULimit
805 * while going back over trail bytes in order to not go back into
806 * the bytes that will be read for finishing a partial
807 * sequence from the previous buffer.
808 * Let the standard converter handle edge cases.
809 */
810 int32_t i;
811
812 if(count>targetCapacity) {
813 count=targetCapacity;
814 }
815
816 i=0;
817 while(i<3 && i<(count-toULimit)) {
818 b=source[count-oldToULength-i-1];
819 if(U8_IS_TRAIL(b)) {
820 ++i;
821 } else {
822 if(i<utf8_countTrailBytes[b]) {
823 /* stop converting before the lead byte if there are not enough trail bytes for it */
824 count-=i+1;
825 }
826 break;
827 }
828 }
829 }
830
831 if(c!=0) {
832 utf8->toUnicodeStatus=0;
833 utf8->toULength=0;
834 goto moreBytes;
835 /* See note in ucnv_SBCSFromUTF8() about this goto. */
836 }
837
838 /* conversion loop */
839 while(count>0) {
840 b=*source++;
841 if((int8_t)b>=0) {
842 /* convert ASCII */
843 *target++=b;
844 --count;
845 continue;
846 } else {
847 if(b>0xe0) {
848 if( /* handle U+1000..U+D7FF inline */
849 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
850 (b==0xed && (t1 <= 0x9f))) &&
851 (t2=source[1]) >= 0x80 && t2 <= 0xbf
852 ) {
853 source+=2;
854 *target++=b;
855 *target++=t1;
856 *target++=t2;
857 count-=3;
858 continue;
859 }
860 } else if(b<0xe0) {
861 if( /* handle U+0080..U+07FF inline */
862 b>=0xc2 &&
863 (t1=*source) >= 0x80 && t1 <= 0xbf
864 ) {
865 ++source;
866 *target++=b;
867 *target++=t1;
868 count-=2;
869 continue;
870 }
871 } else if(b==0xe0) {
872 if( /* handle U+0800..U+0FFF inline */
873 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
874 (t2=source[1]) >= 0x80 && t2 <= 0xbf
875 ) {
876 source+=2;
877 *target++=b;
878 *target++=t1;
879 *target++=t2;
880 count-=3;
881 continue;
882 }
883 }
884
885 /* handle "complicated" and error cases, and continuing partial characters */
886 oldToULength=0;
887 toULength=1;
888 toULimit=utf8_countTrailBytes[b]+1;
889 c=b;
890 moreBytes:
891 while(toULength<toULimit) {
892 if(source<sourceLimit) {
893 b=*source;
894 if(U8_IS_TRAIL(b)) {
895 ++source;
896 ++toULength;
897 c=(c<<6)+b;
898 } else {
899 break; /* sequence too short, stop with toULength<toULimit */
900 }
901 } else {
902 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
903 source-=(toULength-oldToULength);
904 while(oldToULength<toULength) {
905 utf8->toUBytes[oldToULength++]=*source++;
906 }
907 utf8->toUnicodeStatus=c;
908 utf8->toULength=toULength;
909 utf8->mode=toULimit;
910 pToUArgs->source=(char *)source;
911 pFromUArgs->target=(char *)target;
912 return;
913 }
914 }
915
916 if( toULength==toULimit && /* consumed all trail bytes */
917 (toULength==3 || toULength==2) && /* BMP */
918 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
919 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
920 ) {
921 /* legal byte sequence for BMP code point */
922 } else if(
923 toULength==toULimit && toULength==4 &&
924 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
925 ) {
926 /* legal byte sequence for supplementary code point */
927 } else {
928 /* error handling: illegal UTF-8 byte sequence */
929 source-=(toULength-oldToULength);
930 while(oldToULength<toULength) {
931 utf8->toUBytes[oldToULength++]=*source++;
932 }
933 utf8->toULength=toULength;
934 pToUArgs->source=(char *)source;
935 pFromUArgs->target=(char *)target;
936 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
937 return;
938 }
939
940 /* copy the legal byte sequence to the target */
941 {
942 int8_t i;
943
944 for(i=0; i<oldToULength; ++i) {
945 *target++=utf8->toUBytes[i];
946 }
947 source-=(toULength-oldToULength);
948 for(; i<toULength; ++i) {
949 *target++=*source++;
950 }
951 count-=toULength;
952 }
953 }
954 }
955
956 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
957 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
958 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
959 } else {
960 b=*source;
961 toULimit=utf8_countTrailBytes[b]+1;
962 if(toULimit>(sourceLimit-source)) {
963 /* collect a truncated byte sequence */
964 toULength=0;
965 c=b;
966 for(;;) {
967 utf8->toUBytes[toULength++]=b;
968 if(++source==sourceLimit) {
969 /* partial byte sequence at end of source */
970 utf8->toUnicodeStatus=c;
971 utf8->toULength=toULength;
972 utf8->mode=toULimit;
973 break;
974 } else if(!U8_IS_TRAIL(b=*source)) {
975 /* lead byte in trail byte position */
976 utf8->toULength=toULength;
977 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
978 break;
979 }
980 c=(c<<6)+b;
981 }
982 } else {
983 /* partial-sequence target overflow: fall back to the pivoting implementation */
984 *pErrorCode=U_USING_DEFAULT_WARNING;
985 }
986 }
987 }
988
989 /* write back the updated pointers */
990 pToUArgs->source=(char *)source;
991 pFromUArgs->target=(char *)target;
992 }
993
994 /* UTF-8 converter data ----------------------------------------------------- */
995
/*
 * Implementation table for the UTF-8 converter.
 * The entries are positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is not part of this file. NULL leaves a
 * slot without a converter-specific function.
 */
static const UConverterImpl _UTF8Impl={
    UCNV_UTF8,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* the conversion functions defined in this file */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_getNextUChar_UTF8,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    /* the UTF-8-to-UTF-8 validate-and-copy routine fills the last two slots */
    ucnv_UTF8FromUTF8,
    ucnv_UTF8FromUTF8
};
1021
/*
 * Static description of the UTF-8 converter.
 * CCSID 1208 refers to UTF-8 for any version of Unicode.
 * NOTE(review): field meanings come from UConverterStaticData, which is not
 * visible here. { 0xef, 0xbf, 0xbd } is the UTF-8 form of U+FFFD and the
 * following 3 is presumably its length - confirm against the struct.
 */
static const UConverterStaticData _UTF8StaticData={
    sizeof(UConverterStaticData),
    "UTF-8",
    1208, UCNV_IBM, UCNV_UTF8,
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1033
1034
/*
 * Shared data record for the UTF-8 converter; it ties together the static
 * description (_UTF8StaticData) and the implementation table (_UTF8Impl).
 * NOTE(review): the remaining fields are positional values whose meaning is
 * defined by UConverterSharedData, which is not visible here.
 */
const UConverterSharedData _UTF8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
    0
};
1040
1041 /* CESU-8 converter data ---------------------------------------------------- */
1042
/*
 * Implementation table for the CESU-8 converter.
 * It shares the four buffer conversion functions with UTF-8; those functions
 * switch to CESU-8 behavior by comparing cnv->sharedData with &_CESU8Data.
 * Unlike the UTF-8 table it has NULL where UTF-8 lists
 * ucnv_getNextUChar_UTF8 (that function is UTF-8 only), uses
 * ucnv_getCompleteUnicodeSet, and has no UTF-8-to-UTF-8 entries.
 */
static const UConverterImpl _CESU8Impl={
    UCNV_CESU8,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* the conversion functions shared with the UTF-8 converter */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
1065
/*
 * Static description of the CESU-8 converter.
 * NOTE(review): field meanings come from UConverterStaticData, which is not
 * visible here. { 0xef, 0xbf, 0xbd } is the UTF-8 form of U+FFFD and the
 * following 3 is presumably its length - confirm against the struct.
 */
static const UConverterStaticData _CESU8StaticData={
    sizeof(UConverterStaticData),
    "CESU-8",
    9400, /* CCSID for CESU-8 */
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1076
1077
/*
 * Shared data record for the CESU-8 converter; it ties together
 * _CESU8StaticData and _CESU8Impl. The conversion functions in this file
 * compare cnv->sharedData against the address of this object to detect
 * CESU-8 mode.
 * NOTE(review): the remaining fields are positional values whose meaning is
 * defined by UConverterSharedData, which is not visible here.
 */
const UConverterSharedData _CESU8Data={
    sizeof(UConverterSharedData), ~((uint32_t) 0),
    NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
    0
};
1083
1084 #endif
1085