• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv_u8.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2002jul01
12 *   created by: Markus W. Scherer
13 *
14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
15 *
16 *   Also, CESU-8 implementation, see UTR 26.
17 *   The CESU-8 converter uses all the same functions as the
18 *   UTF-8 converter, with a branch for converting supplementary code points.
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_CONVERSION
24 
25 #include "unicode/ucnv.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utf16.h"
29 #include "ucnv_bld.h"
30 #include "ucnv_cnv.h"
31 #include "cmemory.h"
32 
33 /* Prototypes --------------------------------------------------------------- */
34 
35 /* Keep these here to make finicky compilers happy */
36 
37 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
38                                            UErrorCode *err);
39 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
40                                                         UErrorCode *err);
41 
42 
43 /* UTF-8 -------------------------------------------------------------------- */
44 
45 /* UTF-8 Conversion DATA
46  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
47  */
48 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Upper bounds used by the range checks in the converters below. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/*
 * Constants for splitting a supplementary code point into a surrogate pair:
 * subtract HALF_BASE, then the upper bits (>> HALF_SHIFT) select the lead
 * surrogate and the lower bits (& HALF_MASK) select the trail surrogate.
 */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* Boundaries of the lead (high) and trail (low) surrogate ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START, i.e. 0x10000 - 0xDC00 */
#define SURROGATE_LOW_BASE      9216
62 
/*
 * Indexed by the sequence length looked up in bytesFromUTF8[].
 * The decoders accumulate a sequence as ((lead << 6) + trail) << 6 + ... and
 * then subtract the entry for that length to remove the accumulated lead and
 * trail marker bits. Lengths 0 and 1 subtract nothing.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000U,    /* length 0 */
    0x00000000U,    /* length 1 */
    0x00003080U,    /* length 2 */
    0x000E2080U,    /* length 3 */
    0x03C82080U,    /* length 4 */
    0xFA082080U,    /* length 5 */
    0x82082080U     /* length 6 */
};
67 
68 /* END OF UTF-8 Conversion DATA */
69 
/*
 * Total sequence length implied by each possible first byte.
 * 0 marks bytes that cannot start a sequence in this table:
 * 0x80..0xBF, 0xFE and 0xFF.
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: not a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: none */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
80 
/*
 * Starting with Unicode 3.0.1:
 * a UTF-8 byte sequence of length N _must_ encode a code point at or above
 * utf8_minChar32[N] (shortest-form requirement).
 * Sequences of more than 4 bytes are illegal in UTF-8; their entries hold an
 * impossible minimum so that the same comparison always rejects them.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* length 0 */
    0,              /* length 1 */
    0x80,           /* length 2 */
    0x800,          /* length 3 */
    0x10000,        /* length 4 */
    0xffffffff,     /* length 5: never satisfiable */
    0xffffffff      /* length 6: never satisfiable */
};
89 
/*
 * Decode UTF-8 (or CESU-8) bytes from [args->source, args->sourceLimit) into
 * UTF-16 code units at args->target, without producing offsets.
 *
 * State carried in the converter:
 * - An input that ends inside a multi-byte sequence stores the partial
 *   accumulation in toUnicodeStatus, the expected length in mode and the
 *   number of bytes consumed in toULength; the next call resumes from there.
 * - An ill-formed sequence sets *err to U_ILLEGAL_CHAR_FOUND with toULength
 *   set to the number of bytes consumed (the bytes themselves are in toUBytes).
 * - If only the lead surrogate of a pair fits, the trail surrogate goes to
 *   UCharErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * On return args->source and args->target are advanced past what was consumed
 * and written.
 */
static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *converter = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *pending = converter->toUBytes;
    UBool cesu8 = (UBool)(converter->sharedData == &_CESU8Data);
    uint32_t cp, trail = 0;
    int32_t seen, needed;

    /* Resume a sequence that was cut off at the end of the previous input. */
    if (converter->toUnicodeStatus && dst < dstEnd) {
        needed = converter->mode;           /* total bytes in the sequence */
        seen = converter->toULength;        /* bytes already consumed */
        converter->toULength = 0;

        cp = converter->toUnicodeStatus;    /* accumulation so far */
        converter->toUnicodeStatus = 0;
        goto collectTrailBytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        cp = *(src++);
        if (cp < 0x80) {
            /* ASCII maps straight through */
            *(dst++) = (UChar) cp;
            continue;
        }

        /* remember the first byte and look up the sequence length */
        pending[0] = (char) cp;
        needed = bytesFromUTF8[cp];
        seen = 1;

collectTrailBytes:
        while (seen < needed) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial state */
                converter->toUnicodeStatus = cp;
                converter->mode = needed;
                converter->toULength = (int8_t) seen;
                goto finish;
            }
            pending[seen] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break; /* leaves seen < needed, rejected below */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++seen;
        }

        /* strip the accumulated lead/trail marker bits */
        cp -= offsetsFromUTF8[needed];

        /*
         * A sequence is accepted only if it
         * - has all of its trail bytes (seen == needed),
         * - encodes a code point <= U+10ffff,
         * - is the shortest form (also rejects 5/6-byte forms via utf8_minChar32[]),
         * - and, for UTF-8, is not a surrogate; for CESU-8, is at most 3 bytes
         *   long (surrogates are encoded directly, supplementary code points are not).
         */
        if (!(seen == needed && cp <= MAXIMUM_UTF && cp >= utf8_minChar32[seen] &&
              (cesu8 ? seen <= 3 : !U_IS_SURROGATE(cp)))) {
            converter->toULength = (int8_t) seen;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (cp <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit */
            *(dst++) = (UChar) cp;
            continue;
        }

        /* supplementary code point: emit a surrogate pair */
        cp -= HALF_BASE;
        *(dst++) = (UChar) ((cp >> HALF_SHIFT) + SURROGATE_HIGH_START);
        cp = (cp & HALF_MASK) + SURROGATE_LOW_START;
        if (dst >= dstEnd) {
            /* no room for the trail surrogate: park it in the overflow buffer */
            converter->UCharErrorBuffer[0] = (UChar) cp;
            converter->UCharErrorBufferLength = 1;
            *err = U_BUFFER_OVERFLOW_ERROR;
            break;
        }
        *(dst++) = (UChar) cp;
    }

finish:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
}
217 
/*
 * Same decoding as ucnv_toUnicode_UTF8(), but additionally writes one entry
 * to args->offsets for every UTF-16 code unit written to args->target. Each
 * entry is the running byte index (starting at 0 for this call) of the
 * sequence the code unit came from; both halves of a surrogate pair get the
 * same index.
 *
 * Partial-sequence state, ill-formed input and target overflow are reported
 * through the converter fields and *err exactly as in the non-offsets variant.
 *
 * NOTE(review): when a sequence is resumed from a previous call, the index
 * advances by the full sequence length, including bytes consumed in the
 * earlier call - confirm whether callers compensate for that.
 */
static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *converter = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    int32_t *offsetOut = args->offsets;
    int32_t srcIndex = 0;
    unsigned char *pending = converter->toUBytes;
    UBool cesu8 = (UBool)(converter->sharedData == &_CESU8Data);
    uint32_t cp, trail = 0;
    int32_t seen, needed;

    /* Resume a sequence that was cut off at the end of the previous input. */
    if (converter->toUnicodeStatus && dst < dstEnd) {
        needed = converter->mode;           /* total bytes in the sequence */
        seen = converter->toULength;        /* bytes already consumed */
        converter->toULength = 0;

        cp = converter->toUnicodeStatus;    /* accumulation so far */
        converter->toUnicodeStatus = 0;
        goto collectTrailBytes;
    }

    while (src < srcEnd && dst < dstEnd) {
        cp = *(src++);
        if (cp < 0x80) {
            /* ASCII maps straight through */
            *(dst++) = (UChar) cp;
            *(offsetOut++) = srcIndex++;
            continue;
        }

        pending[0] = (char) cp;
        needed = bytesFromUTF8[cp];
        seen = 1;

collectTrailBytes:
        while (seen < needed) {
            if (src >= srcEnd) {
                /* input exhausted mid-sequence: save the partial state */
                converter->toUnicodeStatus = cp;
                converter->mode = needed;
                converter->toULength = (int8_t) seen;
                goto finish;
            }
            pending[seen] = (char) (trail = *src);
            if (!U8_IS_TRAIL(trail)) {
                break; /* leaves seen < needed, rejected below */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++seen;
        }

        /* strip the accumulated lead/trail marker bits */
        cp -= offsetsFromUTF8[needed];

        /*
         * Accept only complete, shortest-form sequences for code points
         * <= U+10ffff; UTF-8 additionally rejects surrogates, CESU-8
         * additionally rejects anything longer than 3 bytes.
         */
        if (!(seen == needed && cp <= MAXIMUM_UTF && cp >= utf8_minChar32[seen] &&
              (cesu8 ? seen <= 3 : !U_IS_SURROGATE(cp)))) {
            converter->toULength = (int8_t) seen;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        if (cp <= MAXIMUM_UCS2) {
            /* one UTF-16 code unit */
            *(dst++) = (UChar) cp;
            *(offsetOut++) = srcIndex;
        } else {
            /* supplementary code point: emit a surrogate pair */
            cp -= HALF_BASE;
            *(dst++) = (UChar) ((cp >> HALF_SHIFT) + SURROGATE_HIGH_START);
            *(offsetOut++) = srcIndex;
            cp = (cp & HALF_MASK) + SURROGATE_LOW_START;
            if (dst < dstEnd) {
                *(dst++) = (UChar) cp;
                *(offsetOut++) = srcIndex;
            } else {
                /*
                 * No room for the trail surrogate: park it in the overflow
                 * buffer. The loop condition ends the loop because the
                 * target is now full.
                 */
                converter->UCharErrorBuffer[0] = (UChar) cp;
                converter->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        srcIndex += seen;
    }

finish:
    if (src < srcEnd && dst >= dstEnd && U_SUCCESS(*err)) {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
    args->offsets = offsetOut;
}
347 
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,UErrorCode * err)348 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
349                                     UErrorCode * err)
350 {
351     UConverter *cnv = args->converter;
352     const UChar *mySource = args->source;
353     const UChar *sourceLimit = args->sourceLimit;
354     uint8_t *myTarget = (uint8_t *) args->target;
355     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
356     uint8_t *tempPtr;
357     UChar32 ch;
358     uint8_t tempBuf[4];
359     int32_t indexToWrite;
360     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
361 
362     if (cnv->fromUChar32 && myTarget < targetLimit)
363     {
364         ch = cnv->fromUChar32;
365         cnv->fromUChar32 = 0;
366         goto lowsurrogate;
367     }
368 
369     while (mySource < sourceLimit && myTarget < targetLimit)
370     {
371         ch = *(mySource++);
372 
373         if (ch < 0x80)        /* Single byte */
374         {
375             *(myTarget++) = (uint8_t) ch;
376         }
377         else if (ch < 0x800)  /* Double byte */
378         {
379             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
380             if (myTarget < targetLimit)
381             {
382                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
383             }
384             else
385             {
386                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
387                 cnv->charErrorBufferLength = 1;
388                 *err = U_BUFFER_OVERFLOW_ERROR;
389             }
390         }
391         else {
392             /* Check for surrogates */
393             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
394 lowsurrogate:
395                 if (mySource < sourceLimit) {
396                     /* test both code units */
397                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
398                         /* convert and consume this supplementary code point */
399                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
400                         ++mySource;
401                         /* exit this condition tree */
402                     }
403                     else {
404                         /* this is an unpaired trail or lead code unit */
405                         /* callback(illegal) */
406                         cnv->fromUChar32 = ch;
407                         *err = U_ILLEGAL_CHAR_FOUND;
408                         break;
409                     }
410                 }
411                 else {
412                     /* no more input */
413                     cnv->fromUChar32 = ch;
414                     break;
415                 }
416             }
417 
418             /* Do we write the buffer directly for speed,
419             or do we have to be careful about target buffer space? */
420             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
421 
422             if (ch <= MAXIMUM_UCS2) {
423                 indexToWrite = 2;
424                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
425             }
426             else {
427                 indexToWrite = 3;
428                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
429                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
430             }
431             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
432             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
433 
434             if (tempPtr == myTarget) {
435                 /* There was enough space to write the codepoint directly. */
436                 myTarget += (indexToWrite + 1);
437             }
438             else {
439                 /* We might run out of room soon. Write it slowly. */
440                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
441                     if (myTarget < targetLimit) {
442                         *(myTarget++) = *tempPtr;
443                     }
444                     else {
445                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
446                         *err = U_BUFFER_OVERFLOW_ERROR;
447                     }
448                 }
449             }
450         }
451     }
452 
453     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
454     {
455         *err = U_BUFFER_OVERFLOW_ERROR;
456     }
457 
458     args->target = (char *) myTarget;
459     args->source = mySource;
460 }
461 
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)462 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
463                                                   UErrorCode * err)
464 {
465     UConverter *cnv = args->converter;
466     const UChar *mySource = args->source;
467     int32_t *myOffsets = args->offsets;
468     const UChar *sourceLimit = args->sourceLimit;
469     uint8_t *myTarget = (uint8_t *) args->target;
470     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
471     uint8_t *tempPtr;
472     UChar32 ch;
473     int32_t offsetNum, nextSourceIndex;
474     int32_t indexToWrite;
475     uint8_t tempBuf[4];
476     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
477 
478     if (cnv->fromUChar32 && myTarget < targetLimit)
479     {
480         ch = cnv->fromUChar32;
481         cnv->fromUChar32 = 0;
482         offsetNum = -1;
483         nextSourceIndex = 0;
484         goto lowsurrogate;
485     } else {
486         offsetNum = 0;
487     }
488 
489     while (mySource < sourceLimit && myTarget < targetLimit)
490     {
491         ch = *(mySource++);
492 
493         if (ch < 0x80)        /* Single byte */
494         {
495             *(myOffsets++) = offsetNum++;
496             *(myTarget++) = (char) ch;
497         }
498         else if (ch < 0x800)  /* Double byte */
499         {
500             *(myOffsets++) = offsetNum;
501             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
502             if (myTarget < targetLimit)
503             {
504                 *(myOffsets++) = offsetNum++;
505                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
506             }
507             else
508             {
509                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
510                 cnv->charErrorBufferLength = 1;
511                 *err = U_BUFFER_OVERFLOW_ERROR;
512             }
513         }
514         else
515         /* Check for surrogates */
516         {
517             nextSourceIndex = offsetNum + 1;
518 
519             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
520 lowsurrogate:
521                 if (mySource < sourceLimit) {
522                     /* test both code units */
523                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
524                         /* convert and consume this supplementary code point */
525                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
526                         ++mySource;
527                         ++nextSourceIndex;
528                         /* exit this condition tree */
529                     }
530                     else {
531                         /* this is an unpaired trail or lead code unit */
532                         /* callback(illegal) */
533                         cnv->fromUChar32 = ch;
534                         *err = U_ILLEGAL_CHAR_FOUND;
535                         break;
536                     }
537                 }
538                 else {
539                     /* no more input */
540                     cnv->fromUChar32 = ch;
541                     break;
542                 }
543             }
544 
545             /* Do we write the buffer directly for speed,
546             or do we have to be careful about target buffer space? */
547             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
548 
549             if (ch <= MAXIMUM_UCS2) {
550                 indexToWrite = 2;
551                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
552             }
553             else {
554                 indexToWrite = 3;
555                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
556                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
557             }
558             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
559             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
560 
561             if (tempPtr == myTarget) {
562                 /* There was enough space to write the codepoint directly. */
563                 myTarget += (indexToWrite + 1);
564                 myOffsets[0] = offsetNum;
565                 myOffsets[1] = offsetNum;
566                 myOffsets[2] = offsetNum;
567                 if (indexToWrite >= 3) {
568                     myOffsets[3] = offsetNum;
569                 }
570                 myOffsets += (indexToWrite + 1);
571             }
572             else {
573                 /* We might run out of room soon. Write it slowly. */
574                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
575                     if (myTarget < targetLimit)
576                     {
577                         *(myOffsets++) = offsetNum;
578                         *(myTarget++) = *tempPtr;
579                     }
580                     else
581                     {
582                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
583                         *err = U_BUFFER_OVERFLOW_ERROR;
584                     }
585                 }
586             }
587             offsetNum = nextSourceIndex;
588         }
589     }
590 
591     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
592     {
593         *err = U_BUFFER_OVERFLOW_ERROR;
594     }
595 
596     args->target = (char *) myTarget;
597     args->source = mySource;
598     args->offsets = myOffsets;
599 }
600 
ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs * args,UErrorCode * err)601 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
602                                                UErrorCode *err) {
603     UConverter *cnv;
604     const uint8_t *sourceInitial;
605     const uint8_t *source;
606     uint16_t extraBytesToWrite;
607     uint8_t myByte;
608     UChar32 ch;
609     int8_t i, isLegalSequence;
610 
611     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
612 
613     cnv = args->converter;
614     sourceInitial = source = (const uint8_t *)args->source;
615     if (source >= (const uint8_t *)args->sourceLimit)
616     {
617         /* no input */
618         *err = U_INDEX_OUTOFBOUNDS_ERROR;
619         return 0xffff;
620     }
621 
622     myByte = (uint8_t)*(source++);
623     if (myByte < 0x80)
624     {
625         args->source = (const char *)source;
626         return (UChar32)myByte;
627     }
628 
629     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
630     if (extraBytesToWrite == 0) {
631         cnv->toUBytes[0] = myByte;
632         cnv->toULength = 1;
633         *err = U_ILLEGAL_CHAR_FOUND;
634         args->source = (const char *)source;
635         return 0xffff;
636     }
637 
638     /*The byte sequence is longer than the buffer area passed*/
639     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
640     {
641         /* check if all of the remaining bytes are trail bytes */
642         cnv->toUBytes[0] = myByte;
643         i = 1;
644         *err = U_TRUNCATED_CHAR_FOUND;
645         while(source < (const uint8_t *)args->sourceLimit) {
646             if(U8_IS_TRAIL(myByte = *source)) {
647                 cnv->toUBytes[i++] = myByte;
648                 ++source;
649             } else {
650                 /* error even before we run out of input */
651                 *err = U_ILLEGAL_CHAR_FOUND;
652                 break;
653             }
654         }
655         cnv->toULength = i;
656         args->source = (const char *)source;
657         return 0xffff;
658     }
659 
660     isLegalSequence = 1;
661     ch = myByte << 6;
662     switch(extraBytesToWrite)
663     {
664       /* note: code falls through cases! (sic)*/
665     case 6:
666         ch += (myByte = *source);
667         ch <<= 6;
668         if (!U8_IS_TRAIL(myByte))
669         {
670             isLegalSequence = 0;
671             break;
672         }
673         ++source;
674     case 5: /*fall through*/
675         ch += (myByte = *source);
676         ch <<= 6;
677         if (!U8_IS_TRAIL(myByte))
678         {
679             isLegalSequence = 0;
680             break;
681         }
682         ++source;
683     case 4: /*fall through*/
684         ch += (myByte = *source);
685         ch <<= 6;
686         if (!U8_IS_TRAIL(myByte))
687         {
688             isLegalSequence = 0;
689             break;
690         }
691         ++source;
692     case 3: /*fall through*/
693         ch += (myByte = *source);
694         ch <<= 6;
695         if (!U8_IS_TRAIL(myByte))
696         {
697             isLegalSequence = 0;
698             break;
699         }
700         ++source;
701     case 2: /*fall through*/
702         ch += (myByte = *source);
703         if (!U8_IS_TRAIL(myByte))
704         {
705             isLegalSequence = 0;
706             break;
707         }
708         ++source;
709     };
710     ch -= offsetsFromUTF8[extraBytesToWrite];
711     args->source = (const char *)source;
712 
713     /*
714      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
715      * - use only trail bytes after a lead byte (checked above)
716      * - use the right number of trail bytes for a given lead byte
717      * - encode a code point <= U+10ffff
718      * - use the fewest possible number of bytes for their code points
719      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
720      *
721      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
722      * There are no irregular sequences any more.
723      */
724     if (isLegalSequence &&
725         (uint32_t)ch <= MAXIMUM_UTF &&
726         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
727         !U_IS_SURROGATE(ch)
728     ) {
729         return ch; /* return the code point */
730     }
731 
732     for(i = 0; sourceInitial < source; ++i) {
733         cnv->toUBytes[i] = *sourceInitial++;
734     }
735     cnv->toULength = i;
736     *err = U_ILLEGAL_CHAR_FOUND;
737     return 0xffff;
738 }
739 
740 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
741 
742 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
743 static const UChar32
744 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
745 
746 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
747 static const UChar32
748 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
749 
750 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
751 static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs * pFromUArgs,UConverterToUnicodeArgs * pToUArgs,UErrorCode * pErrorCode)752 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
753                   UConverterToUnicodeArgs *pToUArgs,
754                   UErrorCode *pErrorCode) {
755     UConverter *utf8;
756     const uint8_t *source, *sourceLimit;
757     uint8_t *target;
758     int32_t targetCapacity;
759     int32_t count;
760 
761     int8_t oldToULength, toULength, toULimit;
762 
763     UChar32 c;
764     uint8_t b, t1, t2;
765 
766     /* set up the local pointers */
767     utf8=pToUArgs->converter;
768     source=(uint8_t *)pToUArgs->source;
769     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
770     target=(uint8_t *)pFromUArgs->target;
771     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
772 
773     /* get the converter state from the UTF-8 UConverter */
774     c=(UChar32)utf8->toUnicodeStatus;
775     if(c!=0) {
776         toULength=oldToULength=utf8->toULength;
777         toULimit=(int8_t)utf8->mode;
778     } else {
779         toULength=oldToULength=toULimit=0;
780     }
781 
782     count=(int32_t)(sourceLimit-source)+oldToULength;
783     if(count<toULimit) {
784         /*
785          * Not enough input to complete the partial character.
786          * Jump to moreBytes below - it will not output to target.
787          */
788     } else if(targetCapacity<toULimit) {
789         /*
790          * Not enough target capacity to output the partial character.
791          * Let the standard converter handle this.
792          */
793         *pErrorCode=U_USING_DEFAULT_WARNING;
794         return;
795     } else {
796         /*
797          * Use a single counter for source and target, counting the minimum of
798          * the source length and the target capacity.
799          * As a result, the source length is checked only once per multi-byte
800          * character instead of twice.
801          *
802          * Make sure that the last byte sequence is complete, or else
803          * stop just before it.
804          * (The longest legal byte sequence has 3 trail bytes.)
805          * Count oldToULength (number of source bytes from a previous buffer)
806          * into the source length but reduce the source index by toULimit
807          * while going back over trail bytes in order to not go back into
808          * the bytes that will be read for finishing a partial
809          * sequence from the previous buffer.
810          * Let the standard converter handle edge cases.
811          */
812         int32_t i;
813 
814         if(count>targetCapacity) {
815             count=targetCapacity;
816         }
817 
818         i=0;
819         while(i<3 && i<(count-toULimit)) {
820             b=source[count-oldToULength-i-1];
821             if(U8_IS_TRAIL(b)) {
822                 ++i;
823             } else {
824                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
825                     /* stop converting before the lead byte if there are not enough trail bytes for it */
826                     count-=i+1;
827                 }
828                 break;
829             }
830         }
831     }
832 
833     if(c!=0) {
834         utf8->toUnicodeStatus=0;
835         utf8->toULength=0;
836         goto moreBytes;
837         /* See note in ucnv_SBCSFromUTF8() about this goto. */
838     }
839 
840     /* conversion loop */
841     while(count>0) {
842         b=*source++;
843         if((int8_t)b>=0) {
844             /* convert ASCII */
845             *target++=b;
846             --count;
847             continue;
848         } else {
849             if(b>0xe0) {
850                 if( /* handle U+1000..U+D7FF inline */
851                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
852                                                (b==0xed && (t1 <= 0x9f))) &&
853                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
854                 ) {
855                     source+=2;
856                     *target++=b;
857                     *target++=t1;
858                     *target++=t2;
859                     count-=3;
860                     continue;
861                 }
862             } else if(b<0xe0) {
863                 if( /* handle U+0080..U+07FF inline */
864                     b>=0xc2 &&
865                     (t1=*source) >= 0x80 && t1 <= 0xbf
866                 ) {
867                     ++source;
868                     *target++=b;
869                     *target++=t1;
870                     count-=2;
871                     continue;
872                 }
873             } else if(b==0xe0) {
874                 if( /* handle U+0800..U+0FFF inline */
875                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
876                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
877                 ) {
878                     source+=2;
879                     *target++=b;
880                     *target++=t1;
881                     *target++=t2;
882                     count-=3;
883                     continue;
884                 }
885             }
886 
887             /* handle "complicated" and error cases, and continuing partial characters */
888             oldToULength=0;
889             toULength=1;
890             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
891             c=b;
892 moreBytes:
893             while(toULength<toULimit) {
894                 if(source<sourceLimit) {
895                     b=*source;
896                     if(U8_IS_TRAIL(b)) {
897                         ++source;
898                         ++toULength;
899                         c=(c<<6)+b;
900                     } else {
901                         break; /* sequence too short, stop with toULength<toULimit */
902                     }
903                 } else {
904                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
905                     source-=(toULength-oldToULength);
906                     while(oldToULength<toULength) {
907                         utf8->toUBytes[oldToULength++]=*source++;
908                     }
909                     utf8->toUnicodeStatus=c;
910                     utf8->toULength=toULength;
911                     utf8->mode=toULimit;
912                     pToUArgs->source=(char *)source;
913                     pFromUArgs->target=(char *)target;
914                     return;
915                 }
916             }
917 
918             if( toULength==toULimit &&      /* consumed all trail bytes */
919                 (toULength==3 || toULength==2) &&             /* BMP */
920                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
921                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
922             ) {
923                 /* legal byte sequence for BMP code point */
924             } else if(
925                 toULength==toULimit && toULength==4 &&
926                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
927             ) {
928                 /* legal byte sequence for supplementary code point */
929             } else {
930                 /* error handling: illegal UTF-8 byte sequence */
931                 source-=(toULength-oldToULength);
932                 while(oldToULength<toULength) {
933                     utf8->toUBytes[oldToULength++]=*source++;
934                 }
935                 utf8->toULength=toULength;
936                 pToUArgs->source=(char *)source;
937                 pFromUArgs->target=(char *)target;
938                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
939                 return;
940             }
941 
942             /* copy the legal byte sequence to the target */
943             {
944                 int8_t i;
945 
946                 for(i=0; i<oldToULength; ++i) {
947                     *target++=utf8->toUBytes[i];
948                 }
949                 source-=(toULength-oldToULength);
950                 for(; i<toULength; ++i) {
951                     *target++=*source++;
952                 }
953                 count-=toULength;
954             }
955         }
956     }
957 
958     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
959         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
960             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
961         } else {
962             b=*source;
963             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
964             if(toULimit>(sourceLimit-source)) {
965                 /* collect a truncated byte sequence */
966                 toULength=0;
967                 c=b;
968                 for(;;) {
969                     utf8->toUBytes[toULength++]=b;
970                     if(++source==sourceLimit) {
971                         /* partial byte sequence at end of source */
972                         utf8->toUnicodeStatus=c;
973                         utf8->toULength=toULength;
974                         utf8->mode=toULimit;
975                         break;
976                     } else if(!U8_IS_TRAIL(b=*source)) {
977                         /* lead byte in trail byte position */
978                         utf8->toULength=toULength;
979                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
980                         break;
981                     }
982                     c=(c<<6)+b;
983                 }
984             } else {
985                 /* partial-sequence target overflow: fall back to the pivoting implementation */
986                 *pErrorCode=U_USING_DEFAULT_WARNING;
987             }
988         }
989     }
990 
991     /* write back the updated pointers */
992     pToUArgs->source=(char *)source;
993     pFromUArgs->target=(char *)target;
994 }
995 
996 /* UTF-8 converter data ----------------------------------------------------- */
997 
/*
 * Implementation table for the UTF-8 converter.
 *
 * The entries are positional: which member each one initializes is decided
 * solely by the UConverterImpl declaration, which is not part of this file
 * (NOTE(review): presumably ucnv_cnv.h -- confirm there before adding,
 * removing or reordering entries).  The per-entry notes below are inferred
 * from the names of the functions stored in each position.
 */
998 static const UConverterImpl _UTF8Impl={
999     UCNV_UTF8, /* same type constant that _UTF8StaticData carries */
1000 
1001     NULL, /* NULL here and below: no function is supplied for this slot */
1002     NULL,
1003 
1004     NULL,
1005     NULL,
1006     NULL,
1007 
1008     ucnv_toUnicode_UTF8, /* to-Unicode conversion */
1009     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* to-Unicode conversion, offsets variant */
1010     ucnv_fromUnicode_UTF8, /* from-Unicode conversion (prototyped near the top of this file) */
1011     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* from-Unicode conversion, offsets variant */
1012     ucnv_getNextUChar_UTF8, /* UTF-8 specific; _CESU8Impl leaves this slot NULL */
1013 
1014     NULL,
1015     NULL,
1016     NULL,
1017     NULL,
1018     ucnv_getNonSurrogateUnicodeSet, /* _CESU8Impl uses ucnv_getCompleteUnicodeSet in this slot */
1019 
1020     ucnv_UTF8FromUTF8, /* the same function fills both of the last two slots; */
1021     ucnv_UTF8FromUTF8  /* _CESU8Impl's initializer ends before them */
1022 };
1023 
/*
 * Static description of the UTF-8 converter, referenced by _UTF8Data.
 *
 * The initializer is positional; the meaning of each value is fixed by the
 * UConverterStaticData declaration, which is not part of this file
 * (NOTE(review): presumably ucnv_bld.h -- confirm).  Notes that go beyond
 * what the literals themselves show are marked as assumptions.
 */
1024 /* CCSID 1208 refers to UTF-8 for any version of Unicode. */
1025 static const UConverterStaticData _UTF8StaticData={
1026     sizeof(UConverterStaticData), /* the structure records its own size first */
1027     "UTF-8", /* converter name string */
1028     1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208; UCNV_IBM presumably identifies who assigned it -- confirm; type matches _UTF8Impl */
1029     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1030     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 matches its byte count (presumably substitution bytes + length -- confirm) */
1031     0,
1032     0,
1033     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1034 };
1035 
1036 
/*
 * Shared-data record for the UTF-8 converter: it ties the static description
 * (_UTF8StaticData) to the implementation table (_UTF8Impl).
 *
 * Unlike the two tables it points to, this object is not declared static, so
 * it has external linkage and can be referenced from other translation units.
 * The initializer is positional; member meanings come from the
 * UConverterSharedData declaration, which is not part of this file.
 */
1037 const UConverterSharedData _UTF8Data={
1038     sizeof(UConverterSharedData), ~((uint32_t) 0), /* own size; then all 32 bits set -- NOTE(review): looks like a counter pinned at its maximum, confirm */
1039     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, /* links the static data and the function table defined above */
1040     0
1041 };
1042 
1043 /* CESU-8 converter data ---------------------------------------------------- */
1044 
/*
 * Implementation table for the CESU-8 converter.
 *
 * As the file header states, CESU-8 uses the same conversion functions as
 * UTF-8; those functions branch internally for supplementary code points.
 * Compared with _UTF8Impl this table differs in three places, each marked
 * below.
 *
 * The entries are positional: which member each one initializes is decided
 * solely by the UConverterImpl declaration, which is not part of this file
 * (NOTE(review): presumably ucnv_cnv.h -- confirm before editing).
 */
1045 static const UConverterImpl _CESU8Impl={
1046     UCNV_CESU8, /* same type constant that _CESU8StaticData carries */
1047 
1048     NULL, /* NULL here and below: no function is supplied for this slot */
1049     NULL,
1050 
1051     NULL,
1052     NULL,
1053     NULL,
1054 
1055     ucnv_toUnicode_UTF8, /* shared with the UTF-8 converter */
1056     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* shared with the UTF-8 converter */
1057     ucnv_fromUnicode_UTF8, /* shared with the UTF-8 converter */
1058     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* shared with the UTF-8 converter */
1059     NULL, /* difference 1: _UTF8Impl installs ucnv_getNextUChar_UTF8 here */
1060 
1061     NULL,
1062     NULL,
1063     NULL,
1064     NULL,
1065     ucnv_getCompleteUnicodeSet /* difference 2: _UTF8Impl uses ucnv_getNonSurrogateUnicodeSet. Difference 3: the initializer ends here, so the two trailing slots that _UTF8Impl fills with ucnv_UTF8FromUTF8 are zero-initialized (NULL) */
1066 };
1067 
/*
 * Static description of the CESU-8 converter, referenced by _CESU8Data.
 *
 * Apart from the name, the CCSID, the platform constant and the converter
 * type, the values are the same as in _UTF8StaticData.  The initializer is
 * positional; the meaning of each value is fixed by the UConverterStaticData
 * declaration, which is not part of this file (NOTE(review): presumably
 * ucnv_bld.h -- confirm).
 */
1068 static const UConverterStaticData _CESU8StaticData={
1069     sizeof(UConverterStaticData), /* the structure records its own size first */
1070     "CESU-8", /* converter name string */
1071     9400, /* CCSID for CESU-8 */
1072     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* UCNV_UNKNOWN where UTF-8 has UCNV_IBM; type matches _CESU8Impl; 1, 3 as in _UTF8StaticData */
1073     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* EF BF BD is U+FFFD in UTF-8; the 3 matches its byte count (presumably substitution bytes + length -- confirm) */
1074     0,
1075     0,
1076     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1077 };
1078 
1079 
/*
 * Shared-data record for the CESU-8 converter: it ties the static description
 * (_CESU8StaticData) to the implementation table (_CESU8Impl).
 *
 * Unlike the two tables it points to, this object is not declared static, so
 * it has external linkage and can be referenced from other translation units.
 * The initializer is positional; member meanings come from the
 * UConverterSharedData declaration, which is not part of this file.
 */
1080 const UConverterSharedData _CESU8Data={
1081     sizeof(UConverterSharedData), ~((uint32_t) 0), /* own size; then all 32 bits set -- NOTE(review): looks like a counter pinned at its maximum, confirm */
1082     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, /* links the static data and the function table defined above */
1083     0
1084 };
1085 
1086 #endif
1087