• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv_u16.c
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jul01
14 *   created by: Markus W. Scherer
15 *
16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_CONVERSION
22 
23 #include "unicode/ucnv.h"
24 #include "unicode/uversion.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 
/*
 * Value of UConverter.fromUnicodeStatus that tells the fromUnicode
 * functions in this file to emit a byte order mark before any other output.
 * The functions reset fromUnicodeStatus to 0 once the BOM has been written.
 */
29 enum {
30     UCNV_NEED_TO_WRITE_BOM=1
31 };
32 
33 U_CDECL_BEGIN
34 /*
35  * The UTF-16 toUnicode implementation is also used for the Java-specific
36  * "with BOM" variants of UTF-16BE and UTF-16LE.
37  */
38 static void  U_CALLCONV
39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
40                            UErrorCode *pErrorCode);
41 
42 /* UTF-16BE ----------------------------------------------------------------- */
43 
/*
 * _UTF16PEFromUnicodeWithOffsets is an alias for the fromUnicode function
 * that matches the platform byte order: the big-endian function when
 * U_IS_BIG_ENDIAN is nonzero, the little-endian function otherwise.
 */
44 #if U_IS_BIG_ENDIAN
45 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
46 #else
47 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
48 #endif
49 
50 
/*
 * Convert UTF-16 code units from pArgs->source into big-endian bytes
 * (high byte first, then low byte) in pArgs->target.
 *
 * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FE FF are
 *   written first via ucnv_fromUWriteBytes() and the status is cleared.
 * - A lead surrogate that ends the input is stored in cnv->fromUChar32 and
 *   is paired with the first unit of the next call.
 * - Surrogate pairs are only written whole; output that does not fit into
 *   the target goes through ucnv_fromUWriteBytes().
 * - If pArgs->offsets is not NULL, one offset is written per output byte
 *   (-1 for the bytes of a pair whose lead came from a previous buffer).
 *
 * Sets *pErrorCode to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 * pArgs->source/target/offsets are written back when the end of the
 * function is reached; the two early returns do not write them back.
 */
51 static void  U_CALLCONV
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
53                                UErrorCode *pErrorCode) {
54     UConverter *cnv;
55     const UChar *source;
56     char *target;
57     int32_t *offsets;
58 
59     uint32_t targetCapacity, length, sourceIndex;
60     UChar c, trail;
61     char overflow[4];
62 
63     source=pArgs->source;
    /* length is unsigned, so the length<=0 test below is effectively length==0 */
64     length=(int32_t)(pArgs->sourceLimit-source);
65     if(length<=0) {
66         /* no input, nothing to do */
67         return;
68     }
69 
70     cnv=pArgs->converter;
71 
72     /* write the BOM if necessary */
73     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
74         static const char bom[]={ (char)0xfe, (char)0xff };
75         ucnv_fromUWriteBytes(cnv,
76                              bom, 2,
77                              &pArgs->target, pArgs->targetLimit,
78                              &pArgs->offsets, -1,
79                              pErrorCode);
80         cnv->fromUnicodeStatus=0;
81     }
82 
83     target=pArgs->target;
84     if(target >= pArgs->targetLimit) {
85         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86         return;
87     }
88 
89     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
90     offsets=pArgs->offsets;
91     sourceIndex=0;
92 
93     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
94 
95     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
96         /* the last buffer ended with a lead surrogate, output the surrogate pair */
97         ++source;
98         --length;
99         target[0]=(uint8_t)(c>>8);
100         target[1]=(uint8_t)c;
101         target[2]=(uint8_t)(trail>>8);
102         target[3]=(uint8_t)trail;
103         target+=4;
104         targetCapacity-=4;
105         if(offsets!=NULL) {
106             *offsets++=-1;
107             *offsets++=-1;
108             *offsets++=-1;
109             *offsets++=-1;
110         }
111         sourceIndex=1;
112         cnv->fromUChar32=c=0;
113     }
114 
115     if(c==0) {
116         /* copy an even number of bytes for complete UChars */
117         uint32_t count=2*length;
118         if(count>targetCapacity) {
            /* round the byte capacity down to a multiple of 2 */
119             count=targetCapacity&~1;
120         }
121         /* count is even */
122         targetCapacity-=count;
123         count>>=1;
124         length-=count;
125 
126         if(offsets==NULL) {
127             while(count>0) {
128                 c=*source++;
129                 if(U16_IS_SINGLE(c)) {
130                     target[0]=(uint8_t)(c>>8);
131                     target[1]=(uint8_t)c;
132                     target+=2;
133                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
134                     ++source;
135                     --count;
136                     target[0]=(uint8_t)(c>>8);
137                     target[1]=(uint8_t)c;
138                     target[2]=(uint8_t)(trail>>8);
139                     target[3]=(uint8_t)trail;
140                     target+=4;
141                 } else {
142                     break;
143                 }
144                 --count;
145             }
146         } else {
147             while(count>0) {
148                 c=*source++;
149                 if(U16_IS_SINGLE(c)) {
150                     target[0]=(uint8_t)(c>>8);
151                     target[1]=(uint8_t)c;
152                     target+=2;
153                     *offsets++=sourceIndex;
154                     *offsets++=sourceIndex++;
155                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
156                     ++source;
157                     --count;
158                     target[0]=(uint8_t)(c>>8);
159                     target[1]=(uint8_t)c;
160                     target[2]=(uint8_t)(trail>>8);
161                     target[3]=(uint8_t)trail;
162                     target+=4;
163                     *offsets++=sourceIndex;
164                     *offsets++=sourceIndex;
165                     *offsets++=sourceIndex;
166                     *offsets++=sourceIndex;
167                     sourceIndex+=2;
168                 } else {
169                     break;
170                 }
171                 --count;
172             }
173         }
174 
175         if(count==0) {
176             /* done with the loop for complete UChars */
177             if(length>0 && targetCapacity>0) {
178                 /*
179                  * there is more input and some target capacity -
180                  * it must be targetCapacity==1 because otherwise
181                  * the above would have copied more;
182                  * prepare for overflow output
183                  */
184                 if(U16_IS_SINGLE(c=*source++)) {
185                     overflow[0]=(char)(c>>8);
186                     overflow[1]=(char)c;
187                     length=2; /* 2 bytes to output */
188                     c=0;
189                 /* } else { keep c for surrogate handling, length will be set there */
190                 }
191             } else {
192                 length=0;
193                 c=0;
194             }
195         } else {
196             /* keep c for surrogate handling, length will be set there */
            /* the loop stopped early: give back the bytes reserved for the count units not written */
197             targetCapacity+=2*count;
198         }
199     } else {
200         length=0; /* from here on, length counts the bytes in overflow[] */
201     }
202 
203     if(c!=0) {
204         /*
205          * c is a surrogate, and
206          * - source or target too short
207          * - or the surrogate is unmatched
208          */
209         length=0;
210         if(U16_IS_SURROGATE_LEAD(c)) {
211             if(source<pArgs->sourceLimit) {
212                 if(U16_IS_TRAIL(trail=*source)) {
213                     /* output the surrogate pair, will overflow (see conditions comment above) */
214                     ++source;
215                     overflow[0]=(char)(c>>8);
216                     overflow[1]=(char)c;
217                     overflow[2]=(char)(trail>>8);
218                     overflow[3]=(char)trail;
219                     length=4; /* 4 bytes to output */
220                     c=0;
221                 } else {
222                     /* unmatched lead surrogate */
223                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
224                 }
225             } else {
226                 /* see if the trail surrogate is in the next buffer */
227             }
228         } else {
229             /* unmatched trail surrogate */
230             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
231         }
        /* remember the pending surrogate (or 0 once a pair was queued in overflow[]) */
232         cnv->fromUChar32=c;
233     }
234 
235     if(length>0) {
236         /* output length bytes with overflow (length>targetCapacity>0) */
237         ucnv_fromUWriteBytes(cnv,
238                              overflow, length,
239                              (char **)&target, pArgs->targetLimit,
240                              &offsets, sourceIndex,
241                              pErrorCode);
242         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
243     }
244 
245     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
246         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
247     }
248 
249     /* write back the updated pointers */
250     pArgs->source=source;
251     pArgs->target=(char *)target;
252     pArgs->offsets=offsets;
253 }
254 
255 static void  U_CALLCONV
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
257                              UErrorCode *pErrorCode) {
258     UConverter *cnv;
259     const uint8_t *source;
260     UChar *target;
261     int32_t *offsets;
262 
263     uint32_t targetCapacity, length, count, sourceIndex;
264     UChar c, trail;
265 
266     if(pArgs->converter->mode<8) {
267         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
268         return;
269     }
270 
271     cnv=pArgs->converter;
272     source=(const uint8_t *)pArgs->source;
273     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
274     if(length<=0 && cnv->toUnicodeStatus==0) {
275         /* no input, nothing to do */
276         return;
277     }
278 
279     target=pArgs->target;
280     if(target >= pArgs->targetLimit) {
281         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
282         return;
283     }
284 
285     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
286     offsets=pArgs->offsets;
287     sourceIndex=0;
288     c=0;
289 
290     /* complete a partial UChar or pair from the last call */
291     if(cnv->toUnicodeStatus!=0) {
292         /*
293          * special case: single byte from a previous buffer,
294          * where the byte turned out not to belong to a trail surrogate
295          * and the preceding, unmatched lead surrogate was put into toUBytes[]
296          * for error handling
297          */
298         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
299         cnv->toULength=1;
300         cnv->toUnicodeStatus=0;
301     }
302     if((count=cnv->toULength)!=0) {
303         uint8_t *p=cnv->toUBytes;
304         do {
305             p[count++]=*source++;
306             ++sourceIndex;
307             --length;
308             if(count==2) {
309                 c=((UChar)p[0]<<8)|p[1];
310                 if(U16_IS_SINGLE(c)) {
311                     /* output the BMP code point */
312                     *target++=c;
313                     if(offsets!=NULL) {
314                         *offsets++=-1;
315                     }
316                     --targetCapacity;
317                     count=0;
318                     c=0;
319                     break;
320                 } else if(U16_IS_SURROGATE_LEAD(c)) {
321                     /* continue collecting bytes for the trail surrogate */
322                     c=0; /* avoid unnecessary surrogate handling below */
323                 } else {
324                     /* fall through to error handling for an unmatched trail surrogate */
325                     break;
326                 }
327             } else if(count==4) {
328                 c=((UChar)p[0]<<8)|p[1];
329                 trail=((UChar)p[2]<<8)|p[3];
330                 if(U16_IS_TRAIL(trail)) {
331                     /* output the surrogate pair */
332                     *target++=c;
333                     if(targetCapacity>=2) {
334                         *target++=trail;
335                         if(offsets!=NULL) {
336                             *offsets++=-1;
337                             *offsets++=-1;
338                         }
339                         targetCapacity-=2;
340                     } else /* targetCapacity==1 */ {
341                         targetCapacity=0;
342                         cnv->UCharErrorBuffer[0]=trail;
343                         cnv->UCharErrorBufferLength=1;
344                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
345                     }
346                     count=0;
347                     c=0;
348                     break;
349                 } else {
350                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
351                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
352 
353                     /* back out reading the code unit after it */
354                     if(((const uint8_t *)pArgs->source-source)>=2) {
355                         source-=2;
356                     } else {
357                         /*
358                          * if the trail unit's first byte was in a previous buffer, then
359                          * we need to put it into a special place because toUBytes[] will be
360                          * used for the lead unit's bytes
361                          */
362                         cnv->toUnicodeStatus=0x100|p[2];
363                         --source;
364                     }
365                     cnv->toULength=2;
366 
367                     /* write back the updated pointers */
368                     pArgs->source=(const char *)source;
369                     pArgs->target=target;
370                     pArgs->offsets=offsets;
371                     return;
372                 }
373             }
374         } while(length>0);
375         cnv->toULength=(int8_t)count;
376     }
377 
378     /* copy an even number of bytes for complete UChars */
379     count=2*targetCapacity;
380     if(count>length) {
381         count=length&~1;
382     }
383     if(c==0 && count>0) {
384         length-=count;
385         count>>=1;
386         targetCapacity-=count;
387         if(offsets==NULL) {
388             do {
389                 c=((UChar)source[0]<<8)|source[1];
390                 source+=2;
391                 if(U16_IS_SINGLE(c)) {
392                     *target++=c;
393                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
394                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
395                 ) {
396                     source+=2;
397                     --count;
398                     *target++=c;
399                     *target++=trail;
400                 } else {
401                     break;
402                 }
403             } while(--count>0);
404         } else {
405             do {
406                 c=((UChar)source[0]<<8)|source[1];
407                 source+=2;
408                 if(U16_IS_SINGLE(c)) {
409                     *target++=c;
410                     *offsets++=sourceIndex;
411                     sourceIndex+=2;
412                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
413                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
414                 ) {
415                     source+=2;
416                     --count;
417                     *target++=c;
418                     *target++=trail;
419                     *offsets++=sourceIndex;
420                     *offsets++=sourceIndex;
421                     sourceIndex+=4;
422                 } else {
423                     break;
424                 }
425             } while(--count>0);
426         }
427 
428         if(count==0) {
429             /* done with the loop for complete UChars */
430             c=0;
431         } else {
432             /* keep c for surrogate handling, trail will be set there */
433             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
434             targetCapacity+=count;
435         }
436     }
437 
438     if(c!=0) {
439         /*
440          * c is a surrogate, and
441          * - source or target too short
442          * - or the surrogate is unmatched
443          */
444         cnv->toUBytes[0]=(uint8_t)(c>>8);
445         cnv->toUBytes[1]=(uint8_t)c;
446         cnv->toULength=2;
447 
448         if(U16_IS_SURROGATE_LEAD(c)) {
449             if(length>=2) {
450                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
451                     /* output the surrogate pair, will overflow (see conditions comment above) */
452                     source+=2;
453                     length-=2;
454                     *target++=c;
455                     if(offsets!=NULL) {
456                         *offsets++=sourceIndex;
457                     }
458                     cnv->UCharErrorBuffer[0]=trail;
459                     cnv->UCharErrorBufferLength=1;
460                     cnv->toULength=0;
461                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
462                 } else {
463                     /* unmatched lead surrogate */
464                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
465                 }
466             } else {
467                 /* see if the trail surrogate is in the next buffer */
468             }
469         } else {
470             /* unmatched trail surrogate */
471             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
472         }
473     }
474 
475     if(U_SUCCESS(*pErrorCode)) {
476         /* check for a remaining source byte */
477         if(length>0) {
478             if(targetCapacity==0) {
479                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
480             } else {
481                 /* it must be length==1 because otherwise the above would have copied more */
482                 cnv->toUBytes[cnv->toULength++]=*source++;
483             }
484         }
485     }
486 
487     /* write back the updated pointers */
488     pArgs->source=(const char *)source;
489     pArgs->target=target;
490     pArgs->offsets=offsets;
491 }
492 
493 static UChar32  U_CALLCONV
_UTF16BEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
495     const uint8_t *s, *sourceLimit;
496     UChar32 c;
497 
498     if(pArgs->converter->mode<8) {
499         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
500     }
501 
502     s=(const uint8_t *)pArgs->source;
503     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
504 
505     if(s>=sourceLimit) {
506         /* no input */
507         *err=U_INDEX_OUTOFBOUNDS_ERROR;
508         return 0xffff;
509     }
510 
511     if(s+2>sourceLimit) {
512         /* only one byte: truncated UChar */
513         pArgs->converter->toUBytes[0]=*s++;
514         pArgs->converter->toULength=1;
515         pArgs->source=(const char *)s;
516         *err = U_TRUNCATED_CHAR_FOUND;
517         return 0xffff;
518     }
519 
520     /* get one UChar */
521     c=((UChar32)*s<<8)|s[1];
522     s+=2;
523 
524     /* check for a surrogate pair */
525     if(U_IS_SURROGATE(c)) {
526         if(U16_IS_SURROGATE_LEAD(c)) {
527             if(s+2<=sourceLimit) {
528                 UChar trail;
529 
530                 /* get a second UChar and see if it is a trail surrogate */
531                 trail=((UChar)*s<<8)|s[1];
532                 if(U16_IS_TRAIL(trail)) {
533                     c=U16_GET_SUPPLEMENTARY(c, trail);
534                     s+=2;
535                 } else {
536                     /* unmatched lead surrogate */
537                     c=-2;
538                 }
539             } else {
540                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
541                 uint8_t *bytes=pArgs->converter->toUBytes;
542                 s-=2;
543                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
544                 do {
545                     *bytes++=*s++;
546                 } while(s<sourceLimit);
547 
548                 c=0xffff;
549                 *err=U_TRUNCATED_CHAR_FOUND;
550             }
551         } else {
552             /* unmatched trail surrogate */
553             c=-2;
554         }
555 
556         if(c<0) {
557             /* write the unmatched surrogate */
558             uint8_t *bytes=pArgs->converter->toUBytes;
559             pArgs->converter->toULength=2;
560             *bytes=*(s-2);
561             bytes[1]=*(s-1);
562 
563             c=0xffff;
564             *err=U_ILLEGAL_CHAR_FOUND;
565         }
566     }
567 
568     pArgs->source=(const char *)s;
569     return c;
570 }
571 
572 static void  U_CALLCONV
_UTF16BEReset(UConverter * cnv,UConverterResetChoice choice)573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
574     if(choice<=UCNV_RESET_TO_UNICODE) {
575         /* reset toUnicode state */
576         if(UCNV_GET_VERSION(cnv)==0) {
577             cnv->mode=8; /* no BOM handling */
578         } else {
579             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
580         }
581     }
582     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
583         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
584         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
585     }
586 }
587 
588 static void  U_CALLCONV
_UTF16BEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)589 _UTF16BEOpen(UConverter *cnv,
590              UConverterLoadArgs *pArgs,
591              UErrorCode *pErrorCode) {
592     (void)pArgs;
593     if(UCNV_GET_VERSION(cnv)<=1) {
594         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
595     } else {
596         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
597     }
598 }
599 
600 static const char *  U_CALLCONV
_UTF16BEGetName(const UConverter * cnv)601 _UTF16BEGetName(const UConverter *cnv) {
602     if(UCNV_GET_VERSION(cnv)==0) {
603         return "UTF-16BE";
604     } else {
605         return "UTF-16BE,version=1";
606     }
607 }
608 U_CDECL_END
609 
/*
 * Function table for the UTF-16BE converter.
 * Entries are positional; their meaning is defined by the UConverterImpl
 * declaration, which is not part of this file (presumably in ucnv_cnv.h -
 * confirm). NULL entries leave the corresponding slot unimplemented here.
 * The toUnicode and fromUnicode functions are each installed in two
 * consecutive slots, presumably the variants without and with offsets.
 */
610 static const UConverterImpl _UTF16BEImpl={
611     UCNV_UTF16_BigEndian,
612 
613     NULL,
614     NULL,
615 
616     _UTF16BEOpen,
617     NULL,
618     _UTF16BEReset,
619 
620     _UTF16BEToUnicodeWithOffsets,
621     _UTF16BEToUnicodeWithOffsets,
622     _UTF16BEFromUnicodeWithOffsets,
623     _UTF16BEFromUnicodeWithOffsets,
624     _UTF16BEGetNextUChar,
625 
626     NULL,
627     _UTF16BEGetName,
628     NULL,
629     NULL,
630     ucnv_getNonSurrogateUnicodeSet,
631 
632     NULL,
633     NULL
634 };
635 
/*
 * Static description of the UTF-16BE converter (name "UTF-16BE").
 * Fields are positional per the UConverterStaticData declaration, which is
 * not part of this file. NOTE(review): presumably 1200 is the code page
 * number, "2, 2" the minimum/maximum bytes per UChar, and { 0xff, 0xfd }
 * with length 2 the substitution character bytes - confirm against the
 * struct declaration.
 */
636 static const UConverterStaticData _UTF16BEStaticData={
637     sizeof(UConverterStaticData),
638     "UTF-16BE",
639     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
640     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
641     0,
642     0,
643     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
644 };
645 
646 
/*
 * Shared data object for the UTF-16BE converter; it has external linkage and
 * ties the static description to the function table defined above.
 */
647 const UConverterSharedData _UTF16BEData=
648         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
649 
650 /* UTF-16LE ----------------------------------------------------------------- */
651 U_CDECL_BEGIN
/*
 * Convert UTF-16 code units from pArgs->source into little-endian bytes
 * (low byte first, then high byte) in pArgs->target.
 *
 * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FF FE are
 *   written first via ucnv_fromUWriteBytes() and the status is cleared.
 * - A lead surrogate that ends the input is stored in cnv->fromUChar32 and
 *   is paired with the first unit of the next call.
 * - Surrogate pairs are only written whole; output that does not fit into
 *   the target goes through ucnv_fromUWriteBytes().
 * - If pArgs->offsets is not NULL, one offset is written per output byte
 *   (-1 for the bytes of a pair whose lead came from a previous buffer).
 *
 * Sets *pErrorCode to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate and to
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 * pArgs->source/target/offsets are written back when the end of the
 * function is reached; the two early returns do not write them back.
 */
652 static void  U_CALLCONV
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
654                                UErrorCode *pErrorCode) {
655     UConverter *cnv;
656     const UChar *source;
657     char *target;
658     int32_t *offsets;
659 
660     uint32_t targetCapacity, length, sourceIndex;
661     UChar c, trail;
662     char overflow[4];
663 
664     source=pArgs->source;
    /* length is unsigned, so the length<=0 test below is effectively length==0 */
665     length=(int32_t)(pArgs->sourceLimit-source);
666     if(length<=0) {
667         /* no input, nothing to do */
668         return;
669     }
670 
671     cnv=pArgs->converter;
672 
673     /* write the BOM if necessary */
674     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
675         static const char bom[]={ (char)0xff, (char)0xfe };
676         ucnv_fromUWriteBytes(cnv,
677                              bom, 2,
678                              &pArgs->target, pArgs->targetLimit,
679                              &pArgs->offsets, -1,
680                              pErrorCode);
681         cnv->fromUnicodeStatus=0;
682     }
683 
684     target=pArgs->target;
685     if(target >= pArgs->targetLimit) {
686         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687         return;
688     }
689 
690     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
691     offsets=pArgs->offsets;
692     sourceIndex=0;
693 
694     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
695 
696     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
697         /* the last buffer ended with a lead surrogate, output the surrogate pair */
698         ++source;
699         --length;
700         target[0]=(uint8_t)c;
701         target[1]=(uint8_t)(c>>8);
702         target[2]=(uint8_t)trail;
703         target[3]=(uint8_t)(trail>>8);
704         target+=4;
705         targetCapacity-=4;
706         if(offsets!=NULL) {
707             *offsets++=-1;
708             *offsets++=-1;
709             *offsets++=-1;
710             *offsets++=-1;
711         }
712         sourceIndex=1;
713         cnv->fromUChar32=c=0;
714     }
715 
716     if(c==0) {
717         /* copy an even number of bytes for complete UChars */
718         uint32_t count=2*length;
719         if(count>targetCapacity) {
            /* round the byte capacity down to a multiple of 2 */
720             count=targetCapacity&~1;
721         }
722         /* count is even */
723         targetCapacity-=count;
724         count>>=1;
725         length-=count;
726 
727         if(offsets==NULL) {
728             while(count>0) {
729                 c=*source++;
730                 if(U16_IS_SINGLE(c)) {
731                     target[0]=(uint8_t)c;
732                     target[1]=(uint8_t)(c>>8);
733                     target+=2;
734                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
735                     ++source;
736                     --count;
737                     target[0]=(uint8_t)c;
738                     target[1]=(uint8_t)(c>>8);
739                     target[2]=(uint8_t)trail;
740                     target[3]=(uint8_t)(trail>>8);
741                     target+=4;
742                 } else {
743                     break;
744                 }
745                 --count;
746             }
747         } else {
748             while(count>0) {
749                 c=*source++;
750                 if(U16_IS_SINGLE(c)) {
751                     target[0]=(uint8_t)c;
752                     target[1]=(uint8_t)(c>>8);
753                     target+=2;
754                     *offsets++=sourceIndex;
755                     *offsets++=sourceIndex++;
756                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
757                     ++source;
758                     --count;
759                     target[0]=(uint8_t)c;
760                     target[1]=(uint8_t)(c>>8);
761                     target[2]=(uint8_t)trail;
762                     target[3]=(uint8_t)(trail>>8);
763                     target+=4;
764                     *offsets++=sourceIndex;
765                     *offsets++=sourceIndex;
766                     *offsets++=sourceIndex;
767                     *offsets++=sourceIndex;
768                     sourceIndex+=2;
769                 } else {
770                     break;
771                 }
772                 --count;
773             }
774         }
775 
776         if(count==0) {
777             /* done with the loop for complete UChars */
778             if(length>0 && targetCapacity>0) {
779                 /*
780                  * there is more input and some target capacity -
781                  * it must be targetCapacity==1 because otherwise
782                  * the above would have copied more;
783                  * prepare for overflow output
784                  */
785                 if(U16_IS_SINGLE(c=*source++)) {
786                     overflow[0]=(char)c;
787                     overflow[1]=(char)(c>>8);
788                     length=2; /* 2 bytes to output */
789                     c=0;
790                 /* } else { keep c for surrogate handling, length will be set there */
791                 }
792             } else {
793                 length=0;
794                 c=0;
795             }
796         } else {
797             /* keep c for surrogate handling, length will be set there */
            /* the loop stopped early: give back the bytes reserved for the count units not written */
798             targetCapacity+=2*count;
799         }
800     } else {
801         length=0; /* from here on, length counts the bytes in overflow[] */
802     }
803 
804     if(c!=0) {
805         /*
806          * c is a surrogate, and
807          * - source or target too short
808          * - or the surrogate is unmatched
809          */
810         length=0;
811         if(U16_IS_SURROGATE_LEAD(c)) {
812             if(source<pArgs->sourceLimit) {
813                 if(U16_IS_TRAIL(trail=*source)) {
814                     /* output the surrogate pair, will overflow (see conditions comment above) */
815                     ++source;
816                     overflow[0]=(char)c;
817                     overflow[1]=(char)(c>>8);
818                     overflow[2]=(char)trail;
819                     overflow[3]=(char)(trail>>8);
820                     length=4; /* 4 bytes to output */
821                     c=0;
822                 } else {
823                     /* unmatched lead surrogate */
824                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
825                 }
826             } else {
827                 /* see if the trail surrogate is in the next buffer */
828             }
829         } else {
830             /* unmatched trail surrogate */
831             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
832         }
        /* remember the pending surrogate (or 0 once a pair was queued in overflow[]) */
833         cnv->fromUChar32=c;
834     }
835 
836     if(length>0) {
837         /* output length bytes with overflow (length>targetCapacity>0) */
838         ucnv_fromUWriteBytes(cnv,
839                              overflow, length,
840                              &target, pArgs->targetLimit,
841                              &offsets, sourceIndex,
842                              pErrorCode);
843         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
844     }
845 
846     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
847         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
848     }
849 
850     /* write back the updated pointers */
851     pArgs->source=source;
852     pArgs->target=target;
853     pArgs->offsets=offsets;
854 }
855 
856 static void  U_CALLCONV
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
858                              UErrorCode *pErrorCode) {
859     UConverter *cnv;
860     const uint8_t *source;
861     UChar *target;
862     int32_t *offsets;
863 
864     uint32_t targetCapacity, length, count, sourceIndex;
865     UChar c, trail;
866 
867     if(pArgs->converter->mode<8) {
868         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
869         return;
870     }
871 
872     cnv=pArgs->converter;
873     source=(const uint8_t *)pArgs->source;
874     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
875     if(length<=0 && cnv->toUnicodeStatus==0) {
876         /* no input, nothing to do */
877         return;
878     }
879 
880     target=pArgs->target;
881     if(target >= pArgs->targetLimit) {
882         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
883         return;
884     }
885 
886     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
887     offsets=pArgs->offsets;
888     sourceIndex=0;
889     c=0;
890 
891     /* complete a partial UChar or pair from the last call */
892     if(cnv->toUnicodeStatus!=0) {
893         /*
894          * special case: single byte from a previous buffer,
895          * where the byte turned out not to belong to a trail surrogate
896          * and the preceding, unmatched lead surrogate was put into toUBytes[]
897          * for error handling
898          */
899         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
900         cnv->toULength=1;
901         cnv->toUnicodeStatus=0;
902     }
903     if((count=cnv->toULength)!=0) {
904         uint8_t *p=cnv->toUBytes;
905         do {
906             p[count++]=*source++;
907             ++sourceIndex;
908             --length;
909             if(count==2) {
910                 c=((UChar)p[1]<<8)|p[0];
911                 if(U16_IS_SINGLE(c)) {
912                     /* output the BMP code point */
913                     *target++=c;
914                     if(offsets!=NULL) {
915                         *offsets++=-1;
916                     }
917                     --targetCapacity;
918                     count=0;
919                     c=0;
920                     break;
921                 } else if(U16_IS_SURROGATE_LEAD(c)) {
922                     /* continue collecting bytes for the trail surrogate */
923                     c=0; /* avoid unnecessary surrogate handling below */
924                 } else {
925                     /* fall through to error handling for an unmatched trail surrogate */
926                     break;
927                 }
928             } else if(count==4) {
929                 c=((UChar)p[1]<<8)|p[0];
930                 trail=((UChar)p[3]<<8)|p[2];
931                 if(U16_IS_TRAIL(trail)) {
932                     /* output the surrogate pair */
933                     *target++=c;
934                     if(targetCapacity>=2) {
935                         *target++=trail;
936                         if(offsets!=NULL) {
937                             *offsets++=-1;
938                             *offsets++=-1;
939                         }
940                         targetCapacity-=2;
941                     } else /* targetCapacity==1 */ {
942                         targetCapacity=0;
943                         cnv->UCharErrorBuffer[0]=trail;
944                         cnv->UCharErrorBufferLength=1;
945                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
946                     }
947                     count=0;
948                     c=0;
949                     break;
950                 } else {
951                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
952                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
953 
954                     /* back out reading the code unit after it */
955                     if(((const uint8_t *)pArgs->source-source)>=2) {
956                         source-=2;
957                     } else {
958                         /*
959                          * if the trail unit's first byte was in a previous buffer, then
960                          * we need to put it into a special place because toUBytes[] will be
961                          * used for the lead unit's bytes
962                          */
963                         cnv->toUnicodeStatus=0x100|p[2];
964                         --source;
965                     }
966                     cnv->toULength=2;
967 
968                     /* write back the updated pointers */
969                     pArgs->source=(const char *)source;
970                     pArgs->target=target;
971                     pArgs->offsets=offsets;
972                     return;
973                 }
974             }
975         } while(length>0);
976         cnv->toULength=(int8_t)count;
977     }
978 
979     /* copy an even number of bytes for complete UChars */
980     count=2*targetCapacity;
981     if(count>length) {
982         count=length&~1;
983     }
984     if(c==0 && count>0) {
985         length-=count;
986         count>>=1;
987         targetCapacity-=count;
988         if(offsets==NULL) {
989             do {
990                 c=((UChar)source[1]<<8)|source[0];
991                 source+=2;
992                 if(U16_IS_SINGLE(c)) {
993                     *target++=c;
994                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
995                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
996                 ) {
997                     source+=2;
998                     --count;
999                     *target++=c;
1000                     *target++=trail;
1001                 } else {
1002                     break;
1003                 }
1004             } while(--count>0);
1005         } else {
1006             do {
1007                 c=((UChar)source[1]<<8)|source[0];
1008                 source+=2;
1009                 if(U16_IS_SINGLE(c)) {
1010                     *target++=c;
1011                     *offsets++=sourceIndex;
1012                     sourceIndex+=2;
1013                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1014                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1015                 ) {
1016                     source+=2;
1017                     --count;
1018                     *target++=c;
1019                     *target++=trail;
1020                     *offsets++=sourceIndex;
1021                     *offsets++=sourceIndex;
1022                     sourceIndex+=4;
1023                 } else {
1024                     break;
1025                 }
1026             } while(--count>0);
1027         }
1028 
1029         if(count==0) {
1030             /* done with the loop for complete UChars */
1031             c=0;
1032         } else {
1033             /* keep c for surrogate handling, trail will be set there */
1034             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1035             targetCapacity+=count;
1036         }
1037     }
1038 
1039     if(c!=0) {
1040         /*
1041          * c is a surrogate, and
1042          * - source or target too short
1043          * - or the surrogate is unmatched
1044          */
1045         cnv->toUBytes[0]=(uint8_t)c;
1046         cnv->toUBytes[1]=(uint8_t)(c>>8);
1047         cnv->toULength=2;
1048 
1049         if(U16_IS_SURROGATE_LEAD(c)) {
1050             if(length>=2) {
1051                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1052                     /* output the surrogate pair, will overflow (see conditions comment above) */
1053                     source+=2;
1054                     length-=2;
1055                     *target++=c;
1056                     if(offsets!=NULL) {
1057                         *offsets++=sourceIndex;
1058                     }
1059                     cnv->UCharErrorBuffer[0]=trail;
1060                     cnv->UCharErrorBufferLength=1;
1061                     cnv->toULength=0;
1062                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063                 } else {
1064                     /* unmatched lead surrogate */
1065                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066                 }
1067             } else {
1068                 /* see if the trail surrogate is in the next buffer */
1069             }
1070         } else {
1071             /* unmatched trail surrogate */
1072             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1073         }
1074     }
1075 
1076     if(U_SUCCESS(*pErrorCode)) {
1077         /* check for a remaining source byte */
1078         if(length>0) {
1079             if(targetCapacity==0) {
1080                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081             } else {
1082                 /* it must be length==1 because otherwise the above would have copied more */
1083                 cnv->toUBytes[cnv->toULength++]=*source++;
1084             }
1085         }
1086     }
1087 
1088     /* write back the updated pointers */
1089     pArgs->source=(const char *)source;
1090     pArgs->target=target;
1091     pArgs->offsets=offsets;
1092 }
1093 
1094 static UChar32  U_CALLCONV
_UTF16LEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1096     const uint8_t *s, *sourceLimit;
1097     UChar32 c;
1098 
1099     if(pArgs->converter->mode<8) {
1100         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1101     }
1102 
1103     s=(const uint8_t *)pArgs->source;
1104     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1105 
1106     if(s>=sourceLimit) {
1107         /* no input */
1108         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1109         return 0xffff;
1110     }
1111 
1112     if(s+2>sourceLimit) {
1113         /* only one byte: truncated UChar */
1114         pArgs->converter->toUBytes[0]=*s++;
1115         pArgs->converter->toULength=1;
1116         pArgs->source=(const char *)s;
1117         *err = U_TRUNCATED_CHAR_FOUND;
1118         return 0xffff;
1119     }
1120 
1121     /* get one UChar */
1122     c=((UChar32)s[1]<<8)|*s;
1123     s+=2;
1124 
1125     /* check for a surrogate pair */
1126     if(U_IS_SURROGATE(c)) {
1127         if(U16_IS_SURROGATE_LEAD(c)) {
1128             if(s+2<=sourceLimit) {
1129                 UChar trail;
1130 
1131                 /* get a second UChar and see if it is a trail surrogate */
1132                 trail=((UChar)s[1]<<8)|*s;
1133                 if(U16_IS_TRAIL(trail)) {
1134                     c=U16_GET_SUPPLEMENTARY(c, trail);
1135                     s+=2;
1136                 } else {
1137                     /* unmatched lead surrogate */
1138                     c=-2;
1139                 }
1140             } else {
1141                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1142                 uint8_t *bytes=pArgs->converter->toUBytes;
1143                 s-=2;
1144                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1145                 do {
1146                     *bytes++=*s++;
1147                 } while(s<sourceLimit);
1148 
1149                 c=0xffff;
1150                 *err=U_TRUNCATED_CHAR_FOUND;
1151             }
1152         } else {
1153             /* unmatched trail surrogate */
1154             c=-2;
1155         }
1156 
1157         if(c<0) {
1158             /* write the unmatched surrogate */
1159             uint8_t *bytes=pArgs->converter->toUBytes;
1160             pArgs->converter->toULength=2;
1161             *bytes=*(s-2);
1162             bytes[1]=*(s-1);
1163 
1164             c=0xffff;
1165             *err=U_ILLEGAL_CHAR_FOUND;
1166         }
1167     }
1168 
1169     pArgs->source=(const char *)s;
1170     return c;
1171 }
1172 
1173 static void  U_CALLCONV
_UTF16LEReset(UConverter * cnv,UConverterResetChoice choice)1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175     if(choice<=UCNV_RESET_TO_UNICODE) {
1176         /* reset toUnicode state */
1177         if(UCNV_GET_VERSION(cnv)==0) {
1178             cnv->mode=8; /* no BOM handling */
1179         } else {
1180             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1181         }
1182     }
1183     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1184         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186     }
1187 }
1188 
1189 static void  U_CALLCONV
_UTF16LEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1190 _UTF16LEOpen(UConverter *cnv,
1191              UConverterLoadArgs *pArgs,
1192              UErrorCode *pErrorCode) {
1193     (void)pArgs;
1194     if(UCNV_GET_VERSION(cnv)<=1) {
1195         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196     } else {
1197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198     }
1199 }
1200 
1201 static const char *  U_CALLCONV
_UTF16LEGetName(const UConverter * cnv)1202 _UTF16LEGetName(const UConverter *cnv) {
1203     if(UCNV_GET_VERSION(cnv)==0) {
1204         return "UTF-16LE";
1205     } else {
1206         return "UTF-16LE,version=1";
1207     }
1208 }
1209 U_CDECL_END
1210 
1211 static const UConverterImpl _UTF16LEImpl={
1212     UCNV_UTF16_LittleEndian,
1213 
1214     NULL,
1215     NULL,
1216 
1217     _UTF16LEOpen,
1218     NULL,
1219     _UTF16LEReset,
1220 
1221     _UTF16LEToUnicodeWithOffsets,
1222     _UTF16LEToUnicodeWithOffsets,
1223     _UTF16LEFromUnicodeWithOffsets,
1224     _UTF16LEFromUnicodeWithOffsets,
1225     _UTF16LEGetNextUChar,
1226 
1227     NULL,
1228     _UTF16LEGetName,
1229     NULL,
1230     NULL,
1231     ucnv_getNonSurrogateUnicodeSet,
1232 
1233     NULL,
1234     NULL
1235 };
1236 
1237 
1238 static const UConverterStaticData _UTF16LEStaticData={
1239     sizeof(UConverterStaticData),
1240     "UTF-16LE",
1241     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1242     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1243     0,
1244     0,
1245     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1246 };
1247 
1248 
1249 const UConverterSharedData _UTF16LEData=
1250         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1251 
1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1253 
1254 /*
1255  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1256  * accordingly.
1257  * This is a simpler version of the UTF-32 converter, with
1258  * fewer states for shorter BOMs.
1259  *
1260  * State values:
1261  * 0    initial state
1262  * 1    saw first byte
1263  * 2..5 -
1264  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1265  * 8    UTF-16BE mode
1266  * 9    UTF-16LE mode
1267  *
1268  * During detection: state==number of initial bytes seen so far.
1269  *
1270  * On output, emit U+FEFF as the first code point.
1271  *
1272  * Variants:
1273  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1274  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1275  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1276  */
1277 U_CDECL_BEGIN
1278 static void  U_CALLCONV
_UTF16Reset(UConverter * cnv,UConverterResetChoice choice)1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280     if(choice<=UCNV_RESET_TO_UNICODE) {
1281         /* reset toUnicode: state=0 */
1282         cnv->mode=0;
1283     }
1284     if(choice!=UCNV_RESET_TO_UNICODE) {
1285         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1286         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1287     }
1288 }
1289 U_CDECL_END
1290 extern const UConverterSharedData _UTF16v2Data;
1291 U_CDECL_BEGIN
1292 static void U_CALLCONV
_UTF16Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1293 _UTF16Open(UConverter *cnv,
1294            UConverterLoadArgs *pArgs,
1295            UErrorCode *pErrorCode) {
1296     if(UCNV_GET_VERSION(cnv)<=2) {
1297         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1298             /*
1299              * Switch implementation, and switch the staticData that's different
1300              * and was copied into the UConverter.
1301              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303              */
1304             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306         }
1307         _UTF16Reset(cnv, UCNV_RESET_BOTH);
1308     } else {
1309         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310     }
1311 }
1312 
1313 static const char *  U_CALLCONV
_UTF16GetName(const UConverter * cnv)1314 _UTF16GetName(const UConverter *cnv) {
1315     if(UCNV_GET_VERSION(cnv)==0) {
1316         return "UTF-16";
1317     } else if(UCNV_GET_VERSION(cnv)==1) {
1318         return "UTF-16,version=1";
1319     } else {
1320         return "UTF-16,version=2";
1321     }
1322 }
1323 U_CDECL_END
1324 extern const UConverterSharedData _UTF16Data;
1325 
IS_UTF16BE(const UConverter * cnv)1326 static inline bool IS_UTF16BE(const UConverter *cnv) {
1327     return ((cnv)->sharedData == &_UTF16BEData);
1328 }
1329 
IS_UTF16LE(const UConverter * cnv)1330 static inline bool IS_UTF16LE(const UConverter *cnv) {
1331     return ((cnv)->sharedData == &_UTF16LEData);
1332 }
1333 
IS_UTF16(const UConverter * cnv)1334 static inline bool IS_UTF16(const UConverter *cnv) {
1335     return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
1336 }
1337 
1338 U_CDECL_BEGIN
1339 static void U_CALLCONV
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1340 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1341                            UErrorCode *pErrorCode) {
1342     UConverter *cnv=pArgs->converter;
1343     const char *source=pArgs->source;
1344     const char *sourceLimit=pArgs->sourceLimit;
1345     int32_t *offsets=pArgs->offsets;
1346 
1347     int32_t state, offsetDelta;
1348     uint8_t b;
1349 
1350     state=cnv->mode;
1351 
1352     /*
1353      * If we detect a BOM in this buffer, then we must add the BOM size to the
1354      * offsets because the actual converter function will not see and count the BOM.
1355      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1356      */
1357     offsetDelta=0;
1358 
1359     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1360         switch(state) {
1361         case 0:
1362             cnv->toUBytes[0]=(uint8_t)*source++;
1363             cnv->toULength=1;
1364             state=1;
1365             break;
1366         case 1:
1367             /*
1368              * Only inside this switch case can the state variable
1369              * temporarily take two additional values:
1370              * 6: BOM error, continue with BE
1371              * 7: BOM error, continue with LE
1372              */
1373             b=*source;
1374             if(cnv->toUBytes[0]==0xfe && b==0xff) {
1375                 if(IS_UTF16LE(cnv)) {
1376                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1377                 } else {
1378                     state=8; /* detect UTF-16BE */
1379                 }
1380             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1381                 if(IS_UTF16BE(cnv)) {
1382                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1383                 } else {
1384                     state=9; /* detect UTF-16LE */
1385                 }
1386             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1387                 state=6; /* illegal missing BOM for Java "Unicode" */
1388             }
1389             if(state>=8) {
1390                 /* BOM detected, consume it */
1391                 ++source;
1392                 cnv->toULength=0;
1393                 offsetDelta=(int32_t)(source-pArgs->source);
1394             } else if(state<6) {
1395                 /* ok: no BOM, and not a reverse BOM */
1396                 if(source!=pArgs->source) {
1397                     /* reset the source for a correct first offset */
1398                     source=pArgs->source;
1399                     cnv->toULength=0;
1400                 }
1401                 if(IS_UTF16LE(cnv)) {
1402                     /* Make Java "UnicodeLittle" default to LE. */
1403                     state=9;
1404                 } else {
1405                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1406                     state=8;
1407                 }
1408             } else {
1409                 /*
1410                  * error: missing BOM, or reverse BOM
1411                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1414                  */
1415                 /* report the non-BOM or reverse BOM as an illegal sequence */
1416                 cnv->toUBytes[1]=b;
1417                 cnv->toULength=2;
1418                 pArgs->source=source+1;
1419                 /* continue with conversion if the callback resets the error */
1420                 /*
1421                  * Make Java "Unicode" default to BE like standard UTF-16.
1422                  * Make Java "UnicodeBig" and "UnicodeLittle" default
1423                  * to their normal endiannesses.
1424                  */
1425                 cnv->mode=state+2;
1426                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1427                 return;
1428             }
1429             /* convert the rest of the stream */
1430             cnv->mode=state;
1431             continue;
1432         case 8:
1433             /* call UTF-16BE */
1434             pArgs->source=source;
1435             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1436             source=pArgs->source;
1437             break;
1438         case 9:
1439             /* call UTF-16LE */
1440             pArgs->source=source;
1441             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1442             source=pArgs->source;
1443             break;
1444         default:
1445             break; /* does not occur */
1446         }
1447     }
1448 
1449     /* add BOM size to offsets - see comment at offsetDelta declaration */
1450     if(offsets!=NULL && offsetDelta!=0) {
1451         int32_t *offsetsLimit=pArgs->offsets;
1452         while(offsets<offsetsLimit) {
1453             *offsets++ += offsetDelta;
1454         }
1455     }
1456 
1457     pArgs->source=source;
1458 
1459     if(source==sourceLimit && pArgs->flush) {
1460         /* handle truncated input */
1461         switch(state) {
1462         case 0:
1463             break; /* no input at all, nothing to do */
1464         case 8:
1465             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1466             break;
1467         case 9:
1468             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1469             break;
1470         default:
1471             /* 0<state<8: framework will report truncation, nothing to do here */
1472             break;
1473         }
1474     }
1475 
1476     cnv->mode=state;
1477 }
1478 
1479 static UChar32 U_CALLCONV
_UTF16GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1481                    UErrorCode *pErrorCode) {
1482     switch(pArgs->converter->mode) {
1483     case 8:
1484         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1485     case 9:
1486         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1487     default:
1488         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1489     }
1490 }
1491 U_CDECL_END
1492 
1493 static const UConverterImpl _UTF16Impl = {
1494     UCNV_UTF16,
1495 
1496     NULL,
1497     NULL,
1498 
1499     _UTF16Open,
1500     NULL,
1501     _UTF16Reset,
1502 
1503     _UTF16ToUnicodeWithOffsets,
1504     _UTF16ToUnicodeWithOffsets,
1505     _UTF16PEFromUnicodeWithOffsets,
1506     _UTF16PEFromUnicodeWithOffsets,
1507     _UTF16GetNextUChar,
1508 
1509     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1510     _UTF16GetName,
1511     NULL,
1512     NULL,
1513     ucnv_getNonSurrogateUnicodeSet,
1514 
1515     NULL,
1516     NULL
1517 };
1518 
1519 static const UConverterStaticData _UTF16StaticData = {
1520     sizeof(UConverterStaticData),
1521     "UTF-16",
1522     1204, /* CCSID for BOM sensitive UTF-16 */
1523     UCNV_IBM, UCNV_UTF16, 2, 2,
1524 #if U_IS_BIG_ENDIAN
1525     { 0xff, 0xfd, 0, 0 }, 2,
1526 #else
1527     { 0xfd, 0xff, 0, 0 }, 2,
1528 #endif
1529     FALSE, FALSE,
1530     0,
1531     0,
1532     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1533 };
1534 
1535 const UConverterSharedData _UTF16Data =
1536         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1537 
1538 static const UConverterImpl _UTF16v2Impl = {
1539     UCNV_UTF16,
1540 
1541     NULL,
1542     NULL,
1543 
1544     _UTF16Open,
1545     NULL,
1546     _UTF16Reset,
1547 
1548     _UTF16ToUnicodeWithOffsets,
1549     _UTF16ToUnicodeWithOffsets,
1550     _UTF16BEFromUnicodeWithOffsets,
1551     _UTF16BEFromUnicodeWithOffsets,
1552     _UTF16GetNextUChar,
1553 
1554     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1555     _UTF16GetName,
1556     NULL,
1557     NULL,
1558     ucnv_getNonSurrogateUnicodeSet,
1559 
1560     NULL,
1561     NULL
1562 };
1563 
1564 static const UConverterStaticData _UTF16v2StaticData = {
1565     sizeof(UConverterStaticData),
1566     "UTF-16,version=2",
1567     1204, /* CCSID for BOM sensitive UTF-16 */
1568     UCNV_IBM, UCNV_UTF16, 2, 2,
1569     { 0xff, 0xfd, 0, 0 }, 2,
1570     FALSE, FALSE,
1571     0,
1572     0,
1573     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1574 };
1575 
1576 const UConverterSharedData _UTF16v2Data =
1577         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1578 
1579 #endif
1580