• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
**********************************************************************
*   Copyright (C) 2002-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u16.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
*/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_CONVERSION
20 
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25 
/* Value of UConverter.fromUnicodeStatus that the from-Unicode writers in this file test for. */
enum {
    /* a byte order mark still has to be written before any other output */
    UCNV_NEED_TO_WRITE_BOM=1
};
29 
/* UTF-16BE ----------------------------------------------------------------- */
31 
/*
 * _UTF16PEFromUnicodeWithOffsets is an alias for the from-Unicode writer
 * that matches the platform byte order: the big-endian writer when
 * U_IS_BIG_ENDIAN is non-zero, the little-endian writer otherwise.
 */
#if U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
#endif
37 
38 
39 static void
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)40 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
41                                UErrorCode *pErrorCode) {
42     UConverter *cnv;
43     const UChar *source;
44     char *target;
45     int32_t *offsets;
46 
47     uint32_t targetCapacity, length, sourceIndex;
48     UChar c, trail;
49     char overflow[4];
50 
51     source=pArgs->source;
52     length=(int32_t)(pArgs->sourceLimit-source);
53     if(length<=0) {
54         /* no input, nothing to do */
55         return;
56     }
57 
58     cnv=pArgs->converter;
59 
60     /* write the BOM if necessary */
61     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
62         static const char bom[]={ (char)0xfe, (char)0xff };
63         ucnv_fromUWriteBytes(cnv,
64                              bom, 2,
65                              &pArgs->target, pArgs->targetLimit,
66                              &pArgs->offsets, -1,
67                              pErrorCode);
68         cnv->fromUnicodeStatus=0;
69     }
70 
71     target=pArgs->target;
72     if(target >= pArgs->targetLimit) {
73         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
74         return;
75     }
76 
77     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
78     offsets=pArgs->offsets;
79     sourceIndex=0;
80 
81     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
82 
83     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
84         /* the last buffer ended with a lead surrogate, output the surrogate pair */
85         ++source;
86         --length;
87         target[0]=(uint8_t)(c>>8);
88         target[1]=(uint8_t)c;
89         target[2]=(uint8_t)(trail>>8);
90         target[3]=(uint8_t)trail;
91         target+=4;
92         targetCapacity-=4;
93         if(offsets!=NULL) {
94             *offsets++=-1;
95             *offsets++=-1;
96             *offsets++=-1;
97             *offsets++=-1;
98         }
99         sourceIndex=1;
100         cnv->fromUChar32=c=0;
101     }
102 
103     if(c==0) {
104         /* copy an even number of bytes for complete UChars */
105         uint32_t count=2*length;
106         if(count>targetCapacity) {
107             count=targetCapacity&~1;
108         }
109         /* count is even */
110         targetCapacity-=count;
111         count>>=1;
112         length-=count;
113 
114         if(offsets==NULL) {
115             while(count>0) {
116                 c=*source++;
117                 if(U16_IS_SINGLE(c)) {
118                     target[0]=(uint8_t)(c>>8);
119                     target[1]=(uint8_t)c;
120                     target+=2;
121                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
122                     ++source;
123                     --count;
124                     target[0]=(uint8_t)(c>>8);
125                     target[1]=(uint8_t)c;
126                     target[2]=(uint8_t)(trail>>8);
127                     target[3]=(uint8_t)trail;
128                     target+=4;
129                 } else {
130                     break;
131                 }
132                 --count;
133             }
134         } else {
135             while(count>0) {
136                 c=*source++;
137                 if(U16_IS_SINGLE(c)) {
138                     target[0]=(uint8_t)(c>>8);
139                     target[1]=(uint8_t)c;
140                     target+=2;
141                     *offsets++=sourceIndex;
142                     *offsets++=sourceIndex++;
143                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
144                     ++source;
145                     --count;
146                     target[0]=(uint8_t)(c>>8);
147                     target[1]=(uint8_t)c;
148                     target[2]=(uint8_t)(trail>>8);
149                     target[3]=(uint8_t)trail;
150                     target+=4;
151                     *offsets++=sourceIndex;
152                     *offsets++=sourceIndex;
153                     *offsets++=sourceIndex;
154                     *offsets++=sourceIndex;
155                     sourceIndex+=2;
156                 } else {
157                     break;
158                 }
159                 --count;
160             }
161         }
162 
163         if(count==0) {
164             /* done with the loop for complete UChars */
165             if(length>0 && targetCapacity>0) {
166                 /*
167                  * there is more input and some target capacity -
168                  * it must be targetCapacity==1 because otherwise
169                  * the above would have copied more;
170                  * prepare for overflow output
171                  */
172                 if(U16_IS_SINGLE(c=*source++)) {
173                     overflow[0]=(char)(c>>8);
174                     overflow[1]=(char)c;
175                     length=2; /* 2 bytes to output */
176                     c=0;
177                 /* } else { keep c for surrogate handling, length will be set there */
178                 }
179             } else {
180                 length=0;
181                 c=0;
182             }
183         } else {
184             /* keep c for surrogate handling, length will be set there */
185             targetCapacity+=2*count;
186         }
187     } else {
188         length=0; /* from here on, length counts the bytes in overflow[] */
189     }
190 
191     if(c!=0) {
192         /*
193          * c is a surrogate, and
194          * - source or target too short
195          * - or the surrogate is unmatched
196          */
197         length=0;
198         if(U16_IS_SURROGATE_LEAD(c)) {
199             if(source<pArgs->sourceLimit) {
200                 if(U16_IS_TRAIL(trail=*source)) {
201                     /* output the surrogate pair, will overflow (see conditions comment above) */
202                     ++source;
203                     overflow[0]=(char)(c>>8);
204                     overflow[1]=(char)c;
205                     overflow[2]=(char)(trail>>8);
206                     overflow[3]=(char)trail;
207                     length=4; /* 4 bytes to output */
208                     c=0;
209                 } else {
210                     /* unmatched lead surrogate */
211                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
212                 }
213             } else {
214                 /* see if the trail surrogate is in the next buffer */
215             }
216         } else {
217             /* unmatched trail surrogate */
218             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
219         }
220         cnv->fromUChar32=c;
221     }
222 
223     if(length>0) {
224         /* output length bytes with overflow (length>targetCapacity>0) */
225         ucnv_fromUWriteBytes(cnv,
226                              overflow, length,
227                              (char **)&target, pArgs->targetLimit,
228                              &offsets, sourceIndex,
229                              pErrorCode);
230         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
231     }
232 
233     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
234         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
235     }
236 
237     /* write back the updated pointers */
238     pArgs->source=source;
239     pArgs->target=(char *)target;
240     pArgs->offsets=offsets;
241 }
242 
243 static void
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)244 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
245                              UErrorCode *pErrorCode) {
246     UConverter *cnv;
247     const uint8_t *source;
248     UChar *target;
249     int32_t *offsets;
250 
251     uint32_t targetCapacity, length, count, sourceIndex;
252     UChar c, trail;
253 
254     cnv=pArgs->converter;
255     source=(const uint8_t *)pArgs->source;
256     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
257     if(length<=0 && cnv->toUnicodeStatus==0) {
258         /* no input, nothing to do */
259         return;
260     }
261 
262     target=pArgs->target;
263     if(target >= pArgs->targetLimit) {
264         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265         return;
266     }
267 
268     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
269     offsets=pArgs->offsets;
270     sourceIndex=0;
271     c=0;
272 
273     /* complete a partial UChar or pair from the last call */
274     if(cnv->toUnicodeStatus!=0) {
275         /*
276          * special case: single byte from a previous buffer,
277          * where the byte turned out not to belong to a trail surrogate
278          * and the preceding, unmatched lead surrogate was put into toUBytes[]
279          * for error handling
280          */
281         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
282         cnv->toULength=1;
283         cnv->toUnicodeStatus=0;
284     }
285     if((count=cnv->toULength)!=0) {
286         uint8_t *p=cnv->toUBytes;
287         do {
288             p[count++]=*source++;
289             ++sourceIndex;
290             --length;
291             if(count==2) {
292                 c=((UChar)p[0]<<8)|p[1];
293                 if(U16_IS_SINGLE(c)) {
294                     /* output the BMP code point */
295                     *target++=c;
296                     if(offsets!=NULL) {
297                         *offsets++=-1;
298                     }
299                     --targetCapacity;
300                     count=0;
301                     c=0;
302                     break;
303                 } else if(U16_IS_SURROGATE_LEAD(c)) {
304                     /* continue collecting bytes for the trail surrogate */
305                     c=0; /* avoid unnecessary surrogate handling below */
306                 } else {
307                     /* fall through to error handling for an unmatched trail surrogate */
308                     break;
309                 }
310             } else if(count==4) {
311                 c=((UChar)p[0]<<8)|p[1];
312                 trail=((UChar)p[2]<<8)|p[3];
313                 if(U16_IS_TRAIL(trail)) {
314                     /* output the surrogate pair */
315                     *target++=c;
316                     if(targetCapacity>=2) {
317                         *target++=trail;
318                         if(offsets!=NULL) {
319                             *offsets++=-1;
320                             *offsets++=-1;
321                         }
322                         targetCapacity-=2;
323                     } else /* targetCapacity==1 */ {
324                         targetCapacity=0;
325                         cnv->UCharErrorBuffer[0]=trail;
326                         cnv->UCharErrorBufferLength=1;
327                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
328                     }
329                     count=0;
330                     c=0;
331                     break;
332                 } else {
333                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
334                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
335 
336                     /* back out reading the code unit after it */
337                     if(((const uint8_t *)pArgs->source-source)>=2) {
338                         source-=2;
339                     } else {
340                         /*
341                          * if the trail unit's first byte was in a previous buffer, then
342                          * we need to put it into a special place because toUBytes[] will be
343                          * used for the lead unit's bytes
344                          */
345                         cnv->toUnicodeStatus=0x100|p[2];
346                         --source;
347                     }
348                     cnv->toULength=2;
349 
350                     /* write back the updated pointers */
351                     pArgs->source=(const char *)source;
352                     pArgs->target=target;
353                     pArgs->offsets=offsets;
354                     return;
355                 }
356             }
357         } while(length>0);
358         cnv->toULength=(int8_t)count;
359     }
360 
361     /* copy an even number of bytes for complete UChars */
362     count=2*targetCapacity;
363     if(count>length) {
364         count=length&~1;
365     }
366     if(c==0 && count>0) {
367         length-=count;
368         count>>=1;
369         targetCapacity-=count;
370         if(offsets==NULL) {
371             do {
372                 c=((UChar)source[0]<<8)|source[1];
373                 source+=2;
374                 if(U16_IS_SINGLE(c)) {
375                     *target++=c;
376                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
377                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
378                 ) {
379                     source+=2;
380                     --count;
381                     *target++=c;
382                     *target++=trail;
383                 } else {
384                     break;
385                 }
386             } while(--count>0);
387         } else {
388             do {
389                 c=((UChar)source[0]<<8)|source[1];
390                 source+=2;
391                 if(U16_IS_SINGLE(c)) {
392                     *target++=c;
393                     *offsets++=sourceIndex;
394                     sourceIndex+=2;
395                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
396                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
397                 ) {
398                     source+=2;
399                     --count;
400                     *target++=c;
401                     *target++=trail;
402                     *offsets++=sourceIndex;
403                     *offsets++=sourceIndex;
404                     sourceIndex+=4;
405                 } else {
406                     break;
407                 }
408             } while(--count>0);
409         }
410 
411         if(count==0) {
412             /* done with the loop for complete UChars */
413             c=0;
414         } else {
415             /* keep c for surrogate handling, trail will be set there */
416             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
417             targetCapacity+=count;
418         }
419     }
420 
421     if(c!=0) {
422         /*
423          * c is a surrogate, and
424          * - source or target too short
425          * - or the surrogate is unmatched
426          */
427         cnv->toUBytes[0]=(uint8_t)(c>>8);
428         cnv->toUBytes[1]=(uint8_t)c;
429         cnv->toULength=2;
430 
431         if(U16_IS_SURROGATE_LEAD(c)) {
432             if(length>=2) {
433                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
434                     /* output the surrogate pair, will overflow (see conditions comment above) */
435                     source+=2;
436                     length-=2;
437                     *target++=c;
438                     if(offsets!=NULL) {
439                         *offsets++=sourceIndex;
440                     }
441                     cnv->UCharErrorBuffer[0]=trail;
442                     cnv->UCharErrorBufferLength=1;
443                     cnv->toULength=0;
444                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
445                 } else {
446                     /* unmatched lead surrogate */
447                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
448                 }
449             } else {
450                 /* see if the trail surrogate is in the next buffer */
451             }
452         } else {
453             /* unmatched trail surrogate */
454             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
455         }
456     }
457 
458     if(U_SUCCESS(*pErrorCode)) {
459         /* check for a remaining source byte */
460         if(length>0) {
461             if(targetCapacity==0) {
462                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
463             } else {
464                 /* it must be length==1 because otherwise the above would have copied more */
465                 cnv->toUBytes[cnv->toULength++]=*source++;
466             }
467         }
468     }
469 
470     /* write back the updated pointers */
471     pArgs->source=(const char *)source;
472     pArgs->target=target;
473     pArgs->offsets=offsets;
474 }
475 
476 static UChar32
_UTF16BEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)477 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
478     const uint8_t *s, *sourceLimit;
479     UChar32 c;
480 
481     s=(const uint8_t *)pArgs->source;
482     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
483 
484     if(s>=sourceLimit) {
485         /* no input */
486         *err=U_INDEX_OUTOFBOUNDS_ERROR;
487         return 0xffff;
488     }
489 
490     if(s+2>sourceLimit) {
491         /* only one byte: truncated UChar */
492         pArgs->converter->toUBytes[0]=*s++;
493         pArgs->converter->toULength=1;
494         pArgs->source=(const char *)s;
495         *err = U_TRUNCATED_CHAR_FOUND;
496         return 0xffff;
497     }
498 
499     /* get one UChar */
500     c=((UChar32)*s<<8)|s[1];
501     s+=2;
502 
503     /* check for a surrogate pair */
504     if(U_IS_SURROGATE(c)) {
505         if(U16_IS_SURROGATE_LEAD(c)) {
506             if(s+2<=sourceLimit) {
507                 UChar trail;
508 
509                 /* get a second UChar and see if it is a trail surrogate */
510                 trail=((UChar)*s<<8)|s[1];
511                 if(U16_IS_TRAIL(trail)) {
512                     c=U16_GET_SUPPLEMENTARY(c, trail);
513                     s+=2;
514                 } else {
515                     /* unmatched lead surrogate */
516                     c=-2;
517                 }
518             } else {
519                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
520                 uint8_t *bytes=pArgs->converter->toUBytes;
521                 s-=2;
522                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
523                 do {
524                     *bytes++=*s++;
525                 } while(s<sourceLimit);
526 
527                 c=0xffff;
528                 *err=U_TRUNCATED_CHAR_FOUND;
529             }
530         } else {
531             /* unmatched trail surrogate */
532             c=-2;
533         }
534 
535         if(c<0) {
536             /* write the unmatched surrogate */
537             uint8_t *bytes=pArgs->converter->toUBytes;
538             pArgs->converter->toULength=2;
539             *bytes=*(s-2);
540             bytes[1]=*(s-1);
541 
542             c=0xffff;
543             *err=U_ILLEGAL_CHAR_FOUND;
544         }
545     }
546 
547     pArgs->source=(const char *)s;
548     return c;
549 }
550 
551 static const UConverterImpl _UTF16BEImpl={
552     UCNV_UTF16_BigEndian,
553 
554     NULL,
555     NULL,
556 
557     NULL,
558     NULL,
559     NULL,
560 
561     _UTF16BEToUnicodeWithOffsets,
562     _UTF16BEToUnicodeWithOffsets,
563     _UTF16BEFromUnicodeWithOffsets,
564     _UTF16BEFromUnicodeWithOffsets,
565     _UTF16BEGetNextUChar,
566 
567     NULL,
568     NULL,
569     NULL,
570     NULL,
571     ucnv_getNonSurrogateUnicodeSet
572 };
573 
574 static const UConverterStaticData _UTF16BEStaticData={
575     sizeof(UConverterStaticData),
576     "UTF-16BE",
577     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
578     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
579     0,
580     0,
581     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
582 };
583 
584 
585 const UConverterSharedData _UTF16BEData={
586     sizeof(UConverterSharedData), ~((uint32_t) 0),
587     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
588     0
589 };
590 
/* UTF-16LE ----------------------------------------------------------------- */
592 
593 static void
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)594 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
595                                UErrorCode *pErrorCode) {
596     UConverter *cnv;
597     const UChar *source;
598     char *target;
599     int32_t *offsets;
600 
601     uint32_t targetCapacity, length, sourceIndex;
602     UChar c, trail;
603     char overflow[4];
604 
605     source=pArgs->source;
606     length=(int32_t)(pArgs->sourceLimit-source);
607     if(length<=0) {
608         /* no input, nothing to do */
609         return;
610     }
611 
612     cnv=pArgs->converter;
613 
614     /* write the BOM if necessary */
615     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
616         static const char bom[]={ (char)0xff, (char)0xfe };
617         ucnv_fromUWriteBytes(cnv,
618                              bom, 2,
619                              &pArgs->target, pArgs->targetLimit,
620                              &pArgs->offsets, -1,
621                              pErrorCode);
622         cnv->fromUnicodeStatus=0;
623     }
624 
625     target=pArgs->target;
626     if(target >= pArgs->targetLimit) {
627         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
628         return;
629     }
630 
631     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
632     offsets=pArgs->offsets;
633     sourceIndex=0;
634 
635     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
636 
637     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
638         /* the last buffer ended with a lead surrogate, output the surrogate pair */
639         ++source;
640         --length;
641         target[0]=(uint8_t)c;
642         target[1]=(uint8_t)(c>>8);
643         target[2]=(uint8_t)trail;
644         target[3]=(uint8_t)(trail>>8);
645         target+=4;
646         targetCapacity-=4;
647         if(offsets!=NULL) {
648             *offsets++=-1;
649             *offsets++=-1;
650             *offsets++=-1;
651             *offsets++=-1;
652         }
653         sourceIndex=1;
654         cnv->fromUChar32=c=0;
655     }
656 
657     if(c==0) {
658         /* copy an even number of bytes for complete UChars */
659         uint32_t count=2*length;
660         if(count>targetCapacity) {
661             count=targetCapacity&~1;
662         }
663         /* count is even */
664         targetCapacity-=count;
665         count>>=1;
666         length-=count;
667 
668         if(offsets==NULL) {
669             while(count>0) {
670                 c=*source++;
671                 if(U16_IS_SINGLE(c)) {
672                     target[0]=(uint8_t)c;
673                     target[1]=(uint8_t)(c>>8);
674                     target+=2;
675                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
676                     ++source;
677                     --count;
678                     target[0]=(uint8_t)c;
679                     target[1]=(uint8_t)(c>>8);
680                     target[2]=(uint8_t)trail;
681                     target[3]=(uint8_t)(trail>>8);
682                     target+=4;
683                 } else {
684                     break;
685                 }
686                 --count;
687             }
688         } else {
689             while(count>0) {
690                 c=*source++;
691                 if(U16_IS_SINGLE(c)) {
692                     target[0]=(uint8_t)c;
693                     target[1]=(uint8_t)(c>>8);
694                     target+=2;
695                     *offsets++=sourceIndex;
696                     *offsets++=sourceIndex++;
697                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
698                     ++source;
699                     --count;
700                     target[0]=(uint8_t)c;
701                     target[1]=(uint8_t)(c>>8);
702                     target[2]=(uint8_t)trail;
703                     target[3]=(uint8_t)(trail>>8);
704                     target+=4;
705                     *offsets++=sourceIndex;
706                     *offsets++=sourceIndex;
707                     *offsets++=sourceIndex;
708                     *offsets++=sourceIndex;
709                     sourceIndex+=2;
710                 } else {
711                     break;
712                 }
713                 --count;
714             }
715         }
716 
717         if(count==0) {
718             /* done with the loop for complete UChars */
719             if(length>0 && targetCapacity>0) {
720                 /*
721                  * there is more input and some target capacity -
722                  * it must be targetCapacity==1 because otherwise
723                  * the above would have copied more;
724                  * prepare for overflow output
725                  */
726                 if(U16_IS_SINGLE(c=*source++)) {
727                     overflow[0]=(char)c;
728                     overflow[1]=(char)(c>>8);
729                     length=2; /* 2 bytes to output */
730                     c=0;
731                 /* } else { keep c for surrogate handling, length will be set there */
732                 }
733             } else {
734                 length=0;
735                 c=0;
736             }
737         } else {
738             /* keep c for surrogate handling, length will be set there */
739             targetCapacity+=2*count;
740         }
741     } else {
742         length=0; /* from here on, length counts the bytes in overflow[] */
743     }
744 
745     if(c!=0) {
746         /*
747          * c is a surrogate, and
748          * - source or target too short
749          * - or the surrogate is unmatched
750          */
751         length=0;
752         if(U16_IS_SURROGATE_LEAD(c)) {
753             if(source<pArgs->sourceLimit) {
754                 if(U16_IS_TRAIL(trail=*source)) {
755                     /* output the surrogate pair, will overflow (see conditions comment above) */
756                     ++source;
757                     overflow[0]=(char)c;
758                     overflow[1]=(char)(c>>8);
759                     overflow[2]=(char)trail;
760                     overflow[3]=(char)(trail>>8);
761                     length=4; /* 4 bytes to output */
762                     c=0;
763                 } else {
764                     /* unmatched lead surrogate */
765                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
766                 }
767             } else {
768                 /* see if the trail surrogate is in the next buffer */
769             }
770         } else {
771             /* unmatched trail surrogate */
772             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
773         }
774         cnv->fromUChar32=c;
775     }
776 
777     if(length>0) {
778         /* output length bytes with overflow (length>targetCapacity>0) */
779         ucnv_fromUWriteBytes(cnv,
780                              overflow, length,
781                              &target, pArgs->targetLimit,
782                              &offsets, sourceIndex,
783                              pErrorCode);
784         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
785     }
786 
787     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
788         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
789     }
790 
791     /* write back the updated pointers */
792     pArgs->source=source;
793     pArgs->target=target;
794     pArgs->offsets=offsets;
795 }
796 
797 static void
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)798 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
799                              UErrorCode *pErrorCode) {
800     UConverter *cnv;
801     const uint8_t *source;
802     UChar *target;
803     int32_t *offsets;
804 
805     uint32_t targetCapacity, length, count, sourceIndex;
806     UChar c, trail;
807 
808     cnv=pArgs->converter;
809     source=(const uint8_t *)pArgs->source;
810     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
811     if(length<=0 && cnv->toUnicodeStatus==0) {
812         /* no input, nothing to do */
813         return;
814     }
815 
816     target=pArgs->target;
817     if(target >= pArgs->targetLimit) {
818         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
819         return;
820     }
821 
822     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
823     offsets=pArgs->offsets;
824     sourceIndex=0;
825     c=0;
826 
827     /* complete a partial UChar or pair from the last call */
828     if(cnv->toUnicodeStatus!=0) {
829         /*
830          * special case: single byte from a previous buffer,
831          * where the byte turned out not to belong to a trail surrogate
832          * and the preceding, unmatched lead surrogate was put into toUBytes[]
833          * for error handling
834          */
835         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
836         cnv->toULength=1;
837         cnv->toUnicodeStatus=0;
838     }
839     if((count=cnv->toULength)!=0) {
840         uint8_t *p=cnv->toUBytes;
841         do {
842             p[count++]=*source++;
843             ++sourceIndex;
844             --length;
845             if(count==2) {
846                 c=((UChar)p[1]<<8)|p[0];
847                 if(U16_IS_SINGLE(c)) {
848                     /* output the BMP code point */
849                     *target++=c;
850                     if(offsets!=NULL) {
851                         *offsets++=-1;
852                     }
853                     --targetCapacity;
854                     count=0;
855                     c=0;
856                     break;
857                 } else if(U16_IS_SURROGATE_LEAD(c)) {
858                     /* continue collecting bytes for the trail surrogate */
859                     c=0; /* avoid unnecessary surrogate handling below */
860                 } else {
861                     /* fall through to error handling for an unmatched trail surrogate */
862                     break;
863                 }
864             } else if(count==4) {
865                 c=((UChar)p[1]<<8)|p[0];
866                 trail=((UChar)p[3]<<8)|p[2];
867                 if(U16_IS_TRAIL(trail)) {
868                     /* output the surrogate pair */
869                     *target++=c;
870                     if(targetCapacity>=2) {
871                         *target++=trail;
872                         if(offsets!=NULL) {
873                             *offsets++=-1;
874                             *offsets++=-1;
875                         }
876                         targetCapacity-=2;
877                     } else /* targetCapacity==1 */ {
878                         targetCapacity=0;
879                         cnv->UCharErrorBuffer[0]=trail;
880                         cnv->UCharErrorBufferLength=1;
881                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
882                     }
883                     count=0;
884                     c=0;
885                     break;
886                 } else {
887                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
888                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
889 
890                     /* back out reading the code unit after it */
891                     if(((const uint8_t *)pArgs->source-source)>=2) {
892                         source-=2;
893                     } else {
894                         /*
895                          * if the trail unit's first byte was in a previous buffer, then
896                          * we need to put it into a special place because toUBytes[] will be
897                          * used for the lead unit's bytes
898                          */
899                         cnv->toUnicodeStatus=0x100|p[2];
900                         --source;
901                     }
902                     cnv->toULength=2;
903 
904                     /* write back the updated pointers */
905                     pArgs->source=(const char *)source;
906                     pArgs->target=target;
907                     pArgs->offsets=offsets;
908                     return;
909                 }
910             }
911         } while(length>0);
912         cnv->toULength=(int8_t)count;
913     }
914 
915     /* copy an even number of bytes for complete UChars */
916     count=2*targetCapacity;
917     if(count>length) {
918         count=length&~1;
919     }
920     if(c==0 && count>0) {
921         length-=count;
922         count>>=1;
923         targetCapacity-=count;
924         if(offsets==NULL) {
925             do {
926                 c=((UChar)source[1]<<8)|source[0];
927                 source+=2;
928                 if(U16_IS_SINGLE(c)) {
929                     *target++=c;
930                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
931                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
932                 ) {
933                     source+=2;
934                     --count;
935                     *target++=c;
936                     *target++=trail;
937                 } else {
938                     break;
939                 }
940             } while(--count>0);
941         } else {
942             do {
943                 c=((UChar)source[1]<<8)|source[0];
944                 source+=2;
945                 if(U16_IS_SINGLE(c)) {
946                     *target++=c;
947                     *offsets++=sourceIndex;
948                     sourceIndex+=2;
949                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
950                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
951                 ) {
952                     source+=2;
953                     --count;
954                     *target++=c;
955                     *target++=trail;
956                     *offsets++=sourceIndex;
957                     *offsets++=sourceIndex;
958                     sourceIndex+=4;
959                 } else {
960                     break;
961                 }
962             } while(--count>0);
963         }
964 
965         if(count==0) {
966             /* done with the loop for complete UChars */
967             c=0;
968         } else {
969             /* keep c for surrogate handling, trail will be set there */
970             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
971             targetCapacity+=count;
972         }
973     }
974 
975     if(c!=0) {
976         /*
977          * c is a surrogate, and
978          * - source or target too short
979          * - or the surrogate is unmatched
980          */
981         cnv->toUBytes[0]=(uint8_t)c;
982         cnv->toUBytes[1]=(uint8_t)(c>>8);
983         cnv->toULength=2;
984 
985         if(U16_IS_SURROGATE_LEAD(c)) {
986             if(length>=2) {
987                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
988                     /* output the surrogate pair, will overflow (see conditions comment above) */
989                     source+=2;
990                     length-=2;
991                     *target++=c;
992                     if(offsets!=NULL) {
993                         *offsets++=sourceIndex;
994                     }
995                     cnv->UCharErrorBuffer[0]=trail;
996                     cnv->UCharErrorBufferLength=1;
997                     cnv->toULength=0;
998                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
999                 } else {
1000                     /* unmatched lead surrogate */
1001                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1002                 }
1003             } else {
1004                 /* see if the trail surrogate is in the next buffer */
1005             }
1006         } else {
1007             /* unmatched trail surrogate */
1008             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1009         }
1010     }
1011 
1012     if(U_SUCCESS(*pErrorCode)) {
1013         /* check for a remaining source byte */
1014         if(length>0) {
1015             if(targetCapacity==0) {
1016                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1017             } else {
1018                 /* it must be length==1 because otherwise the above would have copied more */
1019                 cnv->toUBytes[cnv->toULength++]=*source++;
1020             }
1021         }
1022     }
1023 
1024     /* write back the updated pointers */
1025     pArgs->source=(const char *)source;
1026     pArgs->target=target;
1027     pArgs->offsets=offsets;
1028 }
1029 
1030 static UChar32
_UTF16LEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)1031 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1032     const uint8_t *s, *sourceLimit;
1033     UChar32 c;
1034 
1035     s=(const uint8_t *)pArgs->source;
1036     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1037 
1038     if(s>=sourceLimit) {
1039         /* no input */
1040         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1041         return 0xffff;
1042     }
1043 
1044     if(s+2>sourceLimit) {
1045         /* only one byte: truncated UChar */
1046         pArgs->converter->toUBytes[0]=*s++;
1047         pArgs->converter->toULength=1;
1048         pArgs->source=(const char *)s;
1049         *err = U_TRUNCATED_CHAR_FOUND;
1050         return 0xffff;
1051     }
1052 
1053     /* get one UChar */
1054     c=((UChar32)s[1]<<8)|*s;
1055     s+=2;
1056 
1057     /* check for a surrogate pair */
1058     if(U_IS_SURROGATE(c)) {
1059         if(U16_IS_SURROGATE_LEAD(c)) {
1060             if(s+2<=sourceLimit) {
1061                 UChar trail;
1062 
1063                 /* get a second UChar and see if it is a trail surrogate */
1064                 trail=((UChar)s[1]<<8)|*s;
1065                 if(U16_IS_TRAIL(trail)) {
1066                     c=U16_GET_SUPPLEMENTARY(c, trail);
1067                     s+=2;
1068                 } else {
1069                     /* unmatched lead surrogate */
1070                     c=-2;
1071                 }
1072             } else {
1073                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1074                 uint8_t *bytes=pArgs->converter->toUBytes;
1075                 s-=2;
1076                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1077                 do {
1078                     *bytes++=*s++;
1079                 } while(s<sourceLimit);
1080 
1081                 c=0xffff;
1082                 *err=U_TRUNCATED_CHAR_FOUND;
1083             }
1084         } else {
1085             /* unmatched trail surrogate */
1086             c=-2;
1087         }
1088 
1089         if(c<0) {
1090             /* write the unmatched surrogate */
1091             uint8_t *bytes=pArgs->converter->toUBytes;
1092             pArgs->converter->toULength=2;
1093             *bytes=*(s-2);
1094             bytes[1]=*(s-1);
1095 
1096             c=0xffff;
1097             *err=U_ILLEGAL_CHAR_FOUND;
1098         }
1099     }
1100 
1101     pArgs->source=(const char *)s;
1102     return c;
1103 }
1104 
1105 static const UConverterImpl _UTF16LEImpl={
1106     UCNV_UTF16_LittleEndian,
1107 
1108     NULL,
1109     NULL,
1110 
1111     NULL,
1112     NULL,
1113     NULL,
1114 
1115     _UTF16LEToUnicodeWithOffsets,
1116     _UTF16LEToUnicodeWithOffsets,
1117     _UTF16LEFromUnicodeWithOffsets,
1118     _UTF16LEFromUnicodeWithOffsets,
1119     _UTF16LEGetNextUChar,
1120 
1121     NULL,
1122     NULL,
1123     NULL,
1124     NULL,
1125     ucnv_getNonSurrogateUnicodeSet
1126 };
1127 
1128 
1129 static const UConverterStaticData _UTF16LEStaticData={
1130     sizeof(UConverterStaticData),
1131     "UTF-16LE",
1132     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1133     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1134     0,
1135     0,
1136     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1137 };
1138 
1139 
1140 const UConverterSharedData _UTF16LEData={
1141     sizeof(UConverterSharedData), ~((uint32_t) 0),
1142     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
1143     0
1144 };
1145 
1146 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1147 
/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the BOM detection in the UTF-32 converter
 * (which is not part of this file), with fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw FE
 * 2..4 -
 * 5    saw FF
 * 6..7 -
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */
1167 
1168 static void
_UTF16Reset(UConverter * cnv,UConverterResetChoice choice)1169 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1170     if(choice<=UCNV_RESET_TO_UNICODE) {
1171         /* reset toUnicode: state=0 */
1172         cnv->mode=0;
1173     }
1174     if(choice!=UCNV_RESET_TO_UNICODE) {
1175         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1176         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1177     }
1178 }
1179 
1180 static void
_UTF16Open(UConverter * cnv,const char * name,const char * locale,uint32_t options,UErrorCode * pErrorCode)1181 _UTF16Open(UConverter *cnv,
1182            const char *name,
1183            const char *locale,
1184            uint32_t options,
1185            UErrorCode *pErrorCode) {
1186     _UTF16Reset(cnv, UCNV_RESET_BOTH);
1187 }
1188 
/*
 * The two possible UTF-16 byte order marks, each padded to 4 bytes so that
 * (state&4) selects the BOM and (state&3) indexes the byte within it.
 */
static const char utf16BOM[8]={
    (char)0xfe, (char)0xff, 0, 0,   /* offset 0: FE FF (UTF-16BE) */
    (char)0xff, (char)0xfe, 0, 0    /* offset 4: FF FE (UTF-16LE) */
};
1190 
1191 static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1192 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1193                            UErrorCode *pErrorCode) {
1194     UConverter *cnv=pArgs->converter;
1195     const char *source=pArgs->source;
1196     const char *sourceLimit=pArgs->sourceLimit;
1197     int32_t *offsets=pArgs->offsets;
1198 
1199     int32_t state, offsetDelta;
1200     char b;
1201 
1202     state=cnv->mode;
1203 
1204     /*
1205      * If we detect a BOM in this buffer, then we must add the BOM size to the
1206      * offsets because the actual converter function will not see and count the BOM.
1207      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1208      */
1209     offsetDelta=0;
1210 
1211     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1212         switch(state) {
1213         case 0:
1214             b=*source;
1215             if(b==(char)0xfe) {
1216                 state=1; /* could be FE FF */
1217             } else if(b==(char)0xff) {
1218                 state=5; /* could be FF FE */
1219             } else {
1220                 state=8; /* default to UTF-16BE */
1221                 continue;
1222             }
1223             ++source;
1224             break;
1225         case 1:
1226         case 5:
1227             if(*source==utf16BOM[state]) {
1228                 ++source;
1229                 if(state==1) {
1230                     state=8; /* detect UTF-16BE */
1231                     offsetDelta=(int32_t)(source-pArgs->source);
1232                 } else if(state==5) {
1233                     state=9; /* detect UTF-16LE */
1234                     offsetDelta=(int32_t)(source-pArgs->source);
1235                 }
1236             } else {
1237                 /* switch to UTF-16BE and pass the previous bytes */
1238                 if(source!=pArgs->source) {
1239                     /* just reset the source */
1240                     source=pArgs->source;
1241                 } else {
1242                     UBool oldFlush=pArgs->flush;
1243 
1244                     /* the first byte is from a previous buffer, replay it first */
1245                     pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1246                     pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */
1247                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1248 
1249                     _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1250 
1251                     /* restore real pointers; pArgs->source will be set in case 8/9 */
1252                     pArgs->sourceLimit=sourceLimit;
1253                     pArgs->flush=oldFlush;
1254                 }
1255                 state=8;
1256                 continue;
1257             }
1258             break;
1259         case 8:
1260             /* call UTF-16BE */
1261             pArgs->source=source;
1262             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1263             source=pArgs->source;
1264             break;
1265         case 9:
1266             /* call UTF-16LE */
1267             pArgs->source=source;
1268             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1269             source=pArgs->source;
1270             break;
1271         default:
1272             break; /* does not occur */
1273         }
1274     }
1275 
1276     /* add BOM size to offsets - see comment at offsetDelta declaration */
1277     if(offsets!=NULL && offsetDelta!=0) {
1278         int32_t *offsetsLimit=pArgs->offsets;
1279         while(offsets<offsetsLimit) {
1280             *offsets++ += offsetDelta;
1281         }
1282     }
1283 
1284     pArgs->source=source;
1285 
1286     if(source==sourceLimit && pArgs->flush) {
1287         /* handle truncated input */
1288         switch(state) {
1289         case 0:
1290             break; /* no input at all, nothing to do */
1291         case 8:
1292             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1293             break;
1294         case 9:
1295             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1296             break;
1297         default:
1298             /* handle 0<state<8: call UTF-16BE with too-short input */
1299             pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1300             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1301 
1302             /* no offsets: not enough for output */
1303             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1304             pArgs->source=source;
1305             pArgs->sourceLimit=sourceLimit;
1306             state=8;
1307             break;
1308         }
1309     }
1310 
1311     cnv->mode=state;
1312 }
1313 
1314 static UChar32
_UTF16GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1315 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1316                    UErrorCode *pErrorCode) {
1317     switch(pArgs->converter->mode) {
1318     case 8:
1319         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1320     case 9:
1321         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1322     default:
1323         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1324     }
1325 }
1326 
1327 static const UConverterImpl _UTF16Impl = {
1328     UCNV_UTF16,
1329 
1330     NULL,
1331     NULL,
1332 
1333     _UTF16Open,
1334     NULL,
1335     _UTF16Reset,
1336 
1337     _UTF16ToUnicodeWithOffsets,
1338     _UTF16ToUnicodeWithOffsets,
1339     _UTF16PEFromUnicodeWithOffsets,
1340     _UTF16PEFromUnicodeWithOffsets,
1341     _UTF16GetNextUChar,
1342 
1343     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1344     NULL,
1345     NULL,
1346     NULL,
1347     ucnv_getNonSurrogateUnicodeSet
1348 };
1349 
1350 static const UConverterStaticData _UTF16StaticData = {
1351     sizeof(UConverterStaticData),
1352     "UTF-16",
1353     1204, /* CCSID for BOM sensitive UTF-16 */
1354     UCNV_IBM, UCNV_UTF16, 2, 2,
1355 #if U_IS_BIG_ENDIAN
1356     { 0xff, 0xfd, 0, 0 }, 2,
1357 #else
1358     { 0xfd, 0xff, 0, 0 }, 2,
1359 #endif
1360     FALSE, FALSE,
1361     0,
1362     0,
1363     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1364 };
1365 
1366 const UConverterSharedData _UTF16Data = {
1367     sizeof(UConverterSharedData), ~((uint32_t) 0),
1368     NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
1369     0
1370 };
1371 
1372 #endif
1373