1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u16.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25
/*
 * Marker value kept in UConverter.fromUnicodeStatus while the byte order
 * mark still has to be emitted ahead of the first converted output.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
29
30 /*
31 * The UTF-16 toUnicode implementation is also used for the Java-specific
32 * "with BOM" variants of UTF-16BE and UTF-16LE.
33 */
34 static void
35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
36 UErrorCode *pErrorCode);
37
38 /* UTF-16BE ----------------------------------------------------------------- */
39
/*
 * "PE" = platform endianness: alias the fromUnicode function whose byte
 * order matches the machine this file is compiled for.
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
45
46
47 static void
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
49 UErrorCode *pErrorCode) {
50 UConverter *cnv;
51 const UChar *source;
52 char *target;
53 int32_t *offsets;
54
55 uint32_t targetCapacity, length, sourceIndex;
56 UChar c, trail;
57 char overflow[4];
58
59 source=pArgs->source;
60 length=(int32_t)(pArgs->sourceLimit-source);
61 if(length<=0) {
62 /* no input, nothing to do */
63 return;
64 }
65
66 cnv=pArgs->converter;
67
68 /* write the BOM if necessary */
69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
70 static const char bom[]={ (char)0xfe, (char)0xff };
71 ucnv_fromUWriteBytes(cnv,
72 bom, 2,
73 &pArgs->target, pArgs->targetLimit,
74 &pArgs->offsets, -1,
75 pErrorCode);
76 cnv->fromUnicodeStatus=0;
77 }
78
79 target=pArgs->target;
80 if(target >= pArgs->targetLimit) {
81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
82 return;
83 }
84
85 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
86 offsets=pArgs->offsets;
87 sourceIndex=0;
88
89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
90
91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
92 /* the last buffer ended with a lead surrogate, output the surrogate pair */
93 ++source;
94 --length;
95 target[0]=(uint8_t)(c>>8);
96 target[1]=(uint8_t)c;
97 target[2]=(uint8_t)(trail>>8);
98 target[3]=(uint8_t)trail;
99 target+=4;
100 targetCapacity-=4;
101 if(offsets!=NULL) {
102 *offsets++=-1;
103 *offsets++=-1;
104 *offsets++=-1;
105 *offsets++=-1;
106 }
107 sourceIndex=1;
108 cnv->fromUChar32=c=0;
109 }
110
111 if(c==0) {
112 /* copy an even number of bytes for complete UChars */
113 uint32_t count=2*length;
114 if(count>targetCapacity) {
115 count=targetCapacity&~1;
116 }
117 /* count is even */
118 targetCapacity-=count;
119 count>>=1;
120 length-=count;
121
122 if(offsets==NULL) {
123 while(count>0) {
124 c=*source++;
125 if(U16_IS_SINGLE(c)) {
126 target[0]=(uint8_t)(c>>8);
127 target[1]=(uint8_t)c;
128 target+=2;
129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
130 ++source;
131 --count;
132 target[0]=(uint8_t)(c>>8);
133 target[1]=(uint8_t)c;
134 target[2]=(uint8_t)(trail>>8);
135 target[3]=(uint8_t)trail;
136 target+=4;
137 } else {
138 break;
139 }
140 --count;
141 }
142 } else {
143 while(count>0) {
144 c=*source++;
145 if(U16_IS_SINGLE(c)) {
146 target[0]=(uint8_t)(c>>8);
147 target[1]=(uint8_t)c;
148 target+=2;
149 *offsets++=sourceIndex;
150 *offsets++=sourceIndex++;
151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
152 ++source;
153 --count;
154 target[0]=(uint8_t)(c>>8);
155 target[1]=(uint8_t)c;
156 target[2]=(uint8_t)(trail>>8);
157 target[3]=(uint8_t)trail;
158 target+=4;
159 *offsets++=sourceIndex;
160 *offsets++=sourceIndex;
161 *offsets++=sourceIndex;
162 *offsets++=sourceIndex;
163 sourceIndex+=2;
164 } else {
165 break;
166 }
167 --count;
168 }
169 }
170
171 if(count==0) {
172 /* done with the loop for complete UChars */
173 if(length>0 && targetCapacity>0) {
174 /*
175 * there is more input and some target capacity -
176 * it must be targetCapacity==1 because otherwise
177 * the above would have copied more;
178 * prepare for overflow output
179 */
180 if(U16_IS_SINGLE(c=*source++)) {
181 overflow[0]=(char)(c>>8);
182 overflow[1]=(char)c;
183 length=2; /* 2 bytes to output */
184 c=0;
185 /* } else { keep c for surrogate handling, length will be set there */
186 }
187 } else {
188 length=0;
189 c=0;
190 }
191 } else {
192 /* keep c for surrogate handling, length will be set there */
193 targetCapacity+=2*count;
194 }
195 } else {
196 length=0; /* from here on, length counts the bytes in overflow[] */
197 }
198
199 if(c!=0) {
200 /*
201 * c is a surrogate, and
202 * - source or target too short
203 * - or the surrogate is unmatched
204 */
205 length=0;
206 if(U16_IS_SURROGATE_LEAD(c)) {
207 if(source<pArgs->sourceLimit) {
208 if(U16_IS_TRAIL(trail=*source)) {
209 /* output the surrogate pair, will overflow (see conditions comment above) */
210 ++source;
211 overflow[0]=(char)(c>>8);
212 overflow[1]=(char)c;
213 overflow[2]=(char)(trail>>8);
214 overflow[3]=(char)trail;
215 length=4; /* 4 bytes to output */
216 c=0;
217 } else {
218 /* unmatched lead surrogate */
219 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
220 }
221 } else {
222 /* see if the trail surrogate is in the next buffer */
223 }
224 } else {
225 /* unmatched trail surrogate */
226 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
227 }
228 cnv->fromUChar32=c;
229 }
230
231 if(length>0) {
232 /* output length bytes with overflow (length>targetCapacity>0) */
233 ucnv_fromUWriteBytes(cnv,
234 overflow, length,
235 (char **)&target, pArgs->targetLimit,
236 &offsets, sourceIndex,
237 pErrorCode);
238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
239 }
240
241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
243 }
244
245 /* write back the updated pointers */
246 pArgs->source=source;
247 pArgs->target=(char *)target;
248 pArgs->offsets=offsets;
249 }
250
251 static void
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
253 UErrorCode *pErrorCode) {
254 UConverter *cnv;
255 const uint8_t *source;
256 UChar *target;
257 int32_t *offsets;
258
259 uint32_t targetCapacity, length, count, sourceIndex;
260 UChar c, trail;
261
262 if(pArgs->converter->mode<8) {
263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
264 return;
265 }
266
267 cnv=pArgs->converter;
268 source=(const uint8_t *)pArgs->source;
269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
270 if(length<=0 && cnv->toUnicodeStatus==0) {
271 /* no input, nothing to do */
272 return;
273 }
274
275 target=pArgs->target;
276 if(target >= pArgs->targetLimit) {
277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
278 return;
279 }
280
281 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
282 offsets=pArgs->offsets;
283 sourceIndex=0;
284 c=0;
285
286 /* complete a partial UChar or pair from the last call */
287 if(cnv->toUnicodeStatus!=0) {
288 /*
289 * special case: single byte from a previous buffer,
290 * where the byte turned out not to belong to a trail surrogate
291 * and the preceding, unmatched lead surrogate was put into toUBytes[]
292 * for error handling
293 */
294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
295 cnv->toULength=1;
296 cnv->toUnicodeStatus=0;
297 }
298 if((count=cnv->toULength)!=0) {
299 uint8_t *p=cnv->toUBytes;
300 do {
301 p[count++]=*source++;
302 ++sourceIndex;
303 --length;
304 if(count==2) {
305 c=((UChar)p[0]<<8)|p[1];
306 if(U16_IS_SINGLE(c)) {
307 /* output the BMP code point */
308 *target++=c;
309 if(offsets!=NULL) {
310 *offsets++=-1;
311 }
312 --targetCapacity;
313 count=0;
314 c=0;
315 break;
316 } else if(U16_IS_SURROGATE_LEAD(c)) {
317 /* continue collecting bytes for the trail surrogate */
318 c=0; /* avoid unnecessary surrogate handling below */
319 } else {
320 /* fall through to error handling for an unmatched trail surrogate */
321 break;
322 }
323 } else if(count==4) {
324 c=((UChar)p[0]<<8)|p[1];
325 trail=((UChar)p[2]<<8)|p[3];
326 if(U16_IS_TRAIL(trail)) {
327 /* output the surrogate pair */
328 *target++=c;
329 if(targetCapacity>=2) {
330 *target++=trail;
331 if(offsets!=NULL) {
332 *offsets++=-1;
333 *offsets++=-1;
334 }
335 targetCapacity-=2;
336 } else /* targetCapacity==1 */ {
337 targetCapacity=0;
338 cnv->UCharErrorBuffer[0]=trail;
339 cnv->UCharErrorBufferLength=1;
340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
341 }
342 count=0;
343 c=0;
344 break;
345 } else {
346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
347 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
348
349 /* back out reading the code unit after it */
350 if(((const uint8_t *)pArgs->source-source)>=2) {
351 source-=2;
352 } else {
353 /*
354 * if the trail unit's first byte was in a previous buffer, then
355 * we need to put it into a special place because toUBytes[] will be
356 * used for the lead unit's bytes
357 */
358 cnv->toUnicodeStatus=0x100|p[2];
359 --source;
360 }
361 cnv->toULength=2;
362
363 /* write back the updated pointers */
364 pArgs->source=(const char *)source;
365 pArgs->target=target;
366 pArgs->offsets=offsets;
367 return;
368 }
369 }
370 } while(length>0);
371 cnv->toULength=(int8_t)count;
372 }
373
374 /* copy an even number of bytes for complete UChars */
375 count=2*targetCapacity;
376 if(count>length) {
377 count=length&~1;
378 }
379 if(c==0 && count>0) {
380 length-=count;
381 count>>=1;
382 targetCapacity-=count;
383 if(offsets==NULL) {
384 do {
385 c=((UChar)source[0]<<8)|source[1];
386 source+=2;
387 if(U16_IS_SINGLE(c)) {
388 *target++=c;
389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
391 ) {
392 source+=2;
393 --count;
394 *target++=c;
395 *target++=trail;
396 } else {
397 break;
398 }
399 } while(--count>0);
400 } else {
401 do {
402 c=((UChar)source[0]<<8)|source[1];
403 source+=2;
404 if(U16_IS_SINGLE(c)) {
405 *target++=c;
406 *offsets++=sourceIndex;
407 sourceIndex+=2;
408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
410 ) {
411 source+=2;
412 --count;
413 *target++=c;
414 *target++=trail;
415 *offsets++=sourceIndex;
416 *offsets++=sourceIndex;
417 sourceIndex+=4;
418 } else {
419 break;
420 }
421 } while(--count>0);
422 }
423
424 if(count==0) {
425 /* done with the loop for complete UChars */
426 c=0;
427 } else {
428 /* keep c for surrogate handling, trail will be set there */
429 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
430 targetCapacity+=count;
431 }
432 }
433
434 if(c!=0) {
435 /*
436 * c is a surrogate, and
437 * - source or target too short
438 * - or the surrogate is unmatched
439 */
440 cnv->toUBytes[0]=(uint8_t)(c>>8);
441 cnv->toUBytes[1]=(uint8_t)c;
442 cnv->toULength=2;
443
444 if(U16_IS_SURROGATE_LEAD(c)) {
445 if(length>=2) {
446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
447 /* output the surrogate pair, will overflow (see conditions comment above) */
448 source+=2;
449 length-=2;
450 *target++=c;
451 if(offsets!=NULL) {
452 *offsets++=sourceIndex;
453 }
454 cnv->UCharErrorBuffer[0]=trail;
455 cnv->UCharErrorBufferLength=1;
456 cnv->toULength=0;
457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
458 } else {
459 /* unmatched lead surrogate */
460 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
461 }
462 } else {
463 /* see if the trail surrogate is in the next buffer */
464 }
465 } else {
466 /* unmatched trail surrogate */
467 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
468 }
469 }
470
471 if(U_SUCCESS(*pErrorCode)) {
472 /* check for a remaining source byte */
473 if(length>0) {
474 if(targetCapacity==0) {
475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
476 } else {
477 /* it must be length==1 because otherwise the above would have copied more */
478 cnv->toUBytes[cnv->toULength++]=*source++;
479 }
480 }
481 }
482
483 /* write back the updated pointers */
484 pArgs->source=(const char *)source;
485 pArgs->target=target;
486 pArgs->offsets=offsets;
487 }
488
489 static UChar32
_UTF16BEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
491 const uint8_t *s, *sourceLimit;
492 UChar32 c;
493
494 if(pArgs->converter->mode<8) {
495 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
496 }
497
498 s=(const uint8_t *)pArgs->source;
499 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
500
501 if(s>=sourceLimit) {
502 /* no input */
503 *err=U_INDEX_OUTOFBOUNDS_ERROR;
504 return 0xffff;
505 }
506
507 if(s+2>sourceLimit) {
508 /* only one byte: truncated UChar */
509 pArgs->converter->toUBytes[0]=*s++;
510 pArgs->converter->toULength=1;
511 pArgs->source=(const char *)s;
512 *err = U_TRUNCATED_CHAR_FOUND;
513 return 0xffff;
514 }
515
516 /* get one UChar */
517 c=((UChar32)*s<<8)|s[1];
518 s+=2;
519
520 /* check for a surrogate pair */
521 if(U_IS_SURROGATE(c)) {
522 if(U16_IS_SURROGATE_LEAD(c)) {
523 if(s+2<=sourceLimit) {
524 UChar trail;
525
526 /* get a second UChar and see if it is a trail surrogate */
527 trail=((UChar)*s<<8)|s[1];
528 if(U16_IS_TRAIL(trail)) {
529 c=U16_GET_SUPPLEMENTARY(c, trail);
530 s+=2;
531 } else {
532 /* unmatched lead surrogate */
533 c=-2;
534 }
535 } else {
536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
537 uint8_t *bytes=pArgs->converter->toUBytes;
538 s-=2;
539 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
540 do {
541 *bytes++=*s++;
542 } while(s<sourceLimit);
543
544 c=0xffff;
545 *err=U_TRUNCATED_CHAR_FOUND;
546 }
547 } else {
548 /* unmatched trail surrogate */
549 c=-2;
550 }
551
552 if(c<0) {
553 /* write the unmatched surrogate */
554 uint8_t *bytes=pArgs->converter->toUBytes;
555 pArgs->converter->toULength=2;
556 *bytes=*(s-2);
557 bytes[1]=*(s-1);
558
559 c=0xffff;
560 *err=U_ILLEGAL_CHAR_FOUND;
561 }
562 }
563
564 pArgs->source=(const char *)s;
565 return c;
566 }
567
568 static void
_UTF16BEReset(UConverter * cnv,UConverterResetChoice choice)569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
570 if(choice<=UCNV_RESET_TO_UNICODE) {
571 /* reset toUnicode state */
572 if(UCNV_GET_VERSION(cnv)==0) {
573 cnv->mode=8; /* no BOM handling */
574 } else {
575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
576 }
577 }
578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
581 }
582 }
583
584 static void
_UTF16BEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)585 _UTF16BEOpen(UConverter *cnv,
586 UConverterLoadArgs *pArgs,
587 UErrorCode *pErrorCode) {
588 if(UCNV_GET_VERSION(cnv)<=1) {
589 _UTF16BEReset(cnv, UCNV_RESET_BOTH);
590 } else {
591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
592 }
593 }
594
595 static const char *
_UTF16BEGetName(const UConverter * cnv)596 _UTF16BEGetName(const UConverter *cnv) {
597 if(UCNV_GET_VERSION(cnv)==0) {
598 return "UTF-16BE";
599 } else {
600 return "UTF-16BE,version=1";
601 }
602 }
603
604 static const UConverterImpl _UTF16BEImpl={
605 UCNV_UTF16_BigEndian,
606
607 NULL,
608 NULL,
609
610 _UTF16BEOpen,
611 NULL,
612 _UTF16BEReset,
613
614 _UTF16BEToUnicodeWithOffsets,
615 _UTF16BEToUnicodeWithOffsets,
616 _UTF16BEFromUnicodeWithOffsets,
617 _UTF16BEFromUnicodeWithOffsets,
618 _UTF16BEGetNextUChar,
619
620 NULL,
621 _UTF16BEGetName,
622 NULL,
623 NULL,
624 ucnv_getNonSurrogateUnicodeSet
625 };
626
627 static const UConverterStaticData _UTF16BEStaticData={
628 sizeof(UConverterStaticData),
629 "UTF-16BE",
630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
632 0,
633 0,
634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
635 };
636
637
638 const UConverterSharedData _UTF16BEData={
639 sizeof(UConverterSharedData), ~((uint32_t) 0),
640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
641 0
642 };
643
644 /* UTF-16LE ----------------------------------------------------------------- */
645
646 static void
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
648 UErrorCode *pErrorCode) {
649 UConverter *cnv;
650 const UChar *source;
651 char *target;
652 int32_t *offsets;
653
654 uint32_t targetCapacity, length, sourceIndex;
655 UChar c, trail;
656 char overflow[4];
657
658 source=pArgs->source;
659 length=(int32_t)(pArgs->sourceLimit-source);
660 if(length<=0) {
661 /* no input, nothing to do */
662 return;
663 }
664
665 cnv=pArgs->converter;
666
667 /* write the BOM if necessary */
668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
669 static const char bom[]={ (char)0xff, (char)0xfe };
670 ucnv_fromUWriteBytes(cnv,
671 bom, 2,
672 &pArgs->target, pArgs->targetLimit,
673 &pArgs->offsets, -1,
674 pErrorCode);
675 cnv->fromUnicodeStatus=0;
676 }
677
678 target=pArgs->target;
679 if(target >= pArgs->targetLimit) {
680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
681 return;
682 }
683
684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
685 offsets=pArgs->offsets;
686 sourceIndex=0;
687
688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
689
690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
691 /* the last buffer ended with a lead surrogate, output the surrogate pair */
692 ++source;
693 --length;
694 target[0]=(uint8_t)c;
695 target[1]=(uint8_t)(c>>8);
696 target[2]=(uint8_t)trail;
697 target[3]=(uint8_t)(trail>>8);
698 target+=4;
699 targetCapacity-=4;
700 if(offsets!=NULL) {
701 *offsets++=-1;
702 *offsets++=-1;
703 *offsets++=-1;
704 *offsets++=-1;
705 }
706 sourceIndex=1;
707 cnv->fromUChar32=c=0;
708 }
709
710 if(c==0) {
711 /* copy an even number of bytes for complete UChars */
712 uint32_t count=2*length;
713 if(count>targetCapacity) {
714 count=targetCapacity&~1;
715 }
716 /* count is even */
717 targetCapacity-=count;
718 count>>=1;
719 length-=count;
720
721 if(offsets==NULL) {
722 while(count>0) {
723 c=*source++;
724 if(U16_IS_SINGLE(c)) {
725 target[0]=(uint8_t)c;
726 target[1]=(uint8_t)(c>>8);
727 target+=2;
728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
729 ++source;
730 --count;
731 target[0]=(uint8_t)c;
732 target[1]=(uint8_t)(c>>8);
733 target[2]=(uint8_t)trail;
734 target[3]=(uint8_t)(trail>>8);
735 target+=4;
736 } else {
737 break;
738 }
739 --count;
740 }
741 } else {
742 while(count>0) {
743 c=*source++;
744 if(U16_IS_SINGLE(c)) {
745 target[0]=(uint8_t)c;
746 target[1]=(uint8_t)(c>>8);
747 target+=2;
748 *offsets++=sourceIndex;
749 *offsets++=sourceIndex++;
750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
751 ++source;
752 --count;
753 target[0]=(uint8_t)c;
754 target[1]=(uint8_t)(c>>8);
755 target[2]=(uint8_t)trail;
756 target[3]=(uint8_t)(trail>>8);
757 target+=4;
758 *offsets++=sourceIndex;
759 *offsets++=sourceIndex;
760 *offsets++=sourceIndex;
761 *offsets++=sourceIndex;
762 sourceIndex+=2;
763 } else {
764 break;
765 }
766 --count;
767 }
768 }
769
770 if(count==0) {
771 /* done with the loop for complete UChars */
772 if(length>0 && targetCapacity>0) {
773 /*
774 * there is more input and some target capacity -
775 * it must be targetCapacity==1 because otherwise
776 * the above would have copied more;
777 * prepare for overflow output
778 */
779 if(U16_IS_SINGLE(c=*source++)) {
780 overflow[0]=(char)c;
781 overflow[1]=(char)(c>>8);
782 length=2; /* 2 bytes to output */
783 c=0;
784 /* } else { keep c for surrogate handling, length will be set there */
785 }
786 } else {
787 length=0;
788 c=0;
789 }
790 } else {
791 /* keep c for surrogate handling, length will be set there */
792 targetCapacity+=2*count;
793 }
794 } else {
795 length=0; /* from here on, length counts the bytes in overflow[] */
796 }
797
798 if(c!=0) {
799 /*
800 * c is a surrogate, and
801 * - source or target too short
802 * - or the surrogate is unmatched
803 */
804 length=0;
805 if(U16_IS_SURROGATE_LEAD(c)) {
806 if(source<pArgs->sourceLimit) {
807 if(U16_IS_TRAIL(trail=*source)) {
808 /* output the surrogate pair, will overflow (see conditions comment above) */
809 ++source;
810 overflow[0]=(char)c;
811 overflow[1]=(char)(c>>8);
812 overflow[2]=(char)trail;
813 overflow[3]=(char)(trail>>8);
814 length=4; /* 4 bytes to output */
815 c=0;
816 } else {
817 /* unmatched lead surrogate */
818 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
819 }
820 } else {
821 /* see if the trail surrogate is in the next buffer */
822 }
823 } else {
824 /* unmatched trail surrogate */
825 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
826 }
827 cnv->fromUChar32=c;
828 }
829
830 if(length>0) {
831 /* output length bytes with overflow (length>targetCapacity>0) */
832 ucnv_fromUWriteBytes(cnv,
833 overflow, length,
834 &target, pArgs->targetLimit,
835 &offsets, sourceIndex,
836 pErrorCode);
837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
838 }
839
840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
842 }
843
844 /* write back the updated pointers */
845 pArgs->source=source;
846 pArgs->target=target;
847 pArgs->offsets=offsets;
848 }
849
850 static void
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
852 UErrorCode *pErrorCode) {
853 UConverter *cnv;
854 const uint8_t *source;
855 UChar *target;
856 int32_t *offsets;
857
858 uint32_t targetCapacity, length, count, sourceIndex;
859 UChar c, trail;
860
861 if(pArgs->converter->mode<8) {
862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
863 return;
864 }
865
866 cnv=pArgs->converter;
867 source=(const uint8_t *)pArgs->source;
868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
869 if(length<=0 && cnv->toUnicodeStatus==0) {
870 /* no input, nothing to do */
871 return;
872 }
873
874 target=pArgs->target;
875 if(target >= pArgs->targetLimit) {
876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
877 return;
878 }
879
880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
881 offsets=pArgs->offsets;
882 sourceIndex=0;
883 c=0;
884
885 /* complete a partial UChar or pair from the last call */
886 if(cnv->toUnicodeStatus!=0) {
887 /*
888 * special case: single byte from a previous buffer,
889 * where the byte turned out not to belong to a trail surrogate
890 * and the preceding, unmatched lead surrogate was put into toUBytes[]
891 * for error handling
892 */
893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
894 cnv->toULength=1;
895 cnv->toUnicodeStatus=0;
896 }
897 if((count=cnv->toULength)!=0) {
898 uint8_t *p=cnv->toUBytes;
899 do {
900 p[count++]=*source++;
901 ++sourceIndex;
902 --length;
903 if(count==2) {
904 c=((UChar)p[1]<<8)|p[0];
905 if(U16_IS_SINGLE(c)) {
906 /* output the BMP code point */
907 *target++=c;
908 if(offsets!=NULL) {
909 *offsets++=-1;
910 }
911 --targetCapacity;
912 count=0;
913 c=0;
914 break;
915 } else if(U16_IS_SURROGATE_LEAD(c)) {
916 /* continue collecting bytes for the trail surrogate */
917 c=0; /* avoid unnecessary surrogate handling below */
918 } else {
919 /* fall through to error handling for an unmatched trail surrogate */
920 break;
921 }
922 } else if(count==4) {
923 c=((UChar)p[1]<<8)|p[0];
924 trail=((UChar)p[3]<<8)|p[2];
925 if(U16_IS_TRAIL(trail)) {
926 /* output the surrogate pair */
927 *target++=c;
928 if(targetCapacity>=2) {
929 *target++=trail;
930 if(offsets!=NULL) {
931 *offsets++=-1;
932 *offsets++=-1;
933 }
934 targetCapacity-=2;
935 } else /* targetCapacity==1 */ {
936 targetCapacity=0;
937 cnv->UCharErrorBuffer[0]=trail;
938 cnv->UCharErrorBufferLength=1;
939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
940 }
941 count=0;
942 c=0;
943 break;
944 } else {
945 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
946 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
947
948 /* back out reading the code unit after it */
949 if(((const uint8_t *)pArgs->source-source)>=2) {
950 source-=2;
951 } else {
952 /*
953 * if the trail unit's first byte was in a previous buffer, then
954 * we need to put it into a special place because toUBytes[] will be
955 * used for the lead unit's bytes
956 */
957 cnv->toUnicodeStatus=0x100|p[2];
958 --source;
959 }
960 cnv->toULength=2;
961
962 /* write back the updated pointers */
963 pArgs->source=(const char *)source;
964 pArgs->target=target;
965 pArgs->offsets=offsets;
966 return;
967 }
968 }
969 } while(length>0);
970 cnv->toULength=(int8_t)count;
971 }
972
973 /* copy an even number of bytes for complete UChars */
974 count=2*targetCapacity;
975 if(count>length) {
976 count=length&~1;
977 }
978 if(c==0 && count>0) {
979 length-=count;
980 count>>=1;
981 targetCapacity-=count;
982 if(offsets==NULL) {
983 do {
984 c=((UChar)source[1]<<8)|source[0];
985 source+=2;
986 if(U16_IS_SINGLE(c)) {
987 *target++=c;
988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
990 ) {
991 source+=2;
992 --count;
993 *target++=c;
994 *target++=trail;
995 } else {
996 break;
997 }
998 } while(--count>0);
999 } else {
1000 do {
1001 c=((UChar)source[1]<<8)|source[0];
1002 source+=2;
1003 if(U16_IS_SINGLE(c)) {
1004 *target++=c;
1005 *offsets++=sourceIndex;
1006 sourceIndex+=2;
1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1009 ) {
1010 source+=2;
1011 --count;
1012 *target++=c;
1013 *target++=trail;
1014 *offsets++=sourceIndex;
1015 *offsets++=sourceIndex;
1016 sourceIndex+=4;
1017 } else {
1018 break;
1019 }
1020 } while(--count>0);
1021 }
1022
1023 if(count==0) {
1024 /* done with the loop for complete UChars */
1025 c=0;
1026 } else {
1027 /* keep c for surrogate handling, trail will be set there */
1028 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1029 targetCapacity+=count;
1030 }
1031 }
1032
1033 if(c!=0) {
1034 /*
1035 * c is a surrogate, and
1036 * - source or target too short
1037 * - or the surrogate is unmatched
1038 */
1039 cnv->toUBytes[0]=(uint8_t)c;
1040 cnv->toUBytes[1]=(uint8_t)(c>>8);
1041 cnv->toULength=2;
1042
1043 if(U16_IS_SURROGATE_LEAD(c)) {
1044 if(length>=2) {
1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1046 /* output the surrogate pair, will overflow (see conditions comment above) */
1047 source+=2;
1048 length-=2;
1049 *target++=c;
1050 if(offsets!=NULL) {
1051 *offsets++=sourceIndex;
1052 }
1053 cnv->UCharErrorBuffer[0]=trail;
1054 cnv->UCharErrorBufferLength=1;
1055 cnv->toULength=0;
1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1057 } else {
1058 /* unmatched lead surrogate */
1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1060 }
1061 } else {
1062 /* see if the trail surrogate is in the next buffer */
1063 }
1064 } else {
1065 /* unmatched trail surrogate */
1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1067 }
1068 }
1069
1070 if(U_SUCCESS(*pErrorCode)) {
1071 /* check for a remaining source byte */
1072 if(length>0) {
1073 if(targetCapacity==0) {
1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1075 } else {
1076 /* it must be length==1 because otherwise the above would have copied more */
1077 cnv->toUBytes[cnv->toULength++]=*source++;
1078 }
1079 }
1080 }
1081
1082 /* write back the updated pointers */
1083 pArgs->source=(const char *)source;
1084 pArgs->target=target;
1085 pArgs->offsets=offsets;
1086 }
1087
1088 static UChar32
_UTF16LEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1090 const uint8_t *s, *sourceLimit;
1091 UChar32 c;
1092
1093 if(pArgs->converter->mode<8) {
1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1095 }
1096
1097 s=(const uint8_t *)pArgs->source;
1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1099
1100 if(s>=sourceLimit) {
1101 /* no input */
1102 *err=U_INDEX_OUTOFBOUNDS_ERROR;
1103 return 0xffff;
1104 }
1105
1106 if(s+2>sourceLimit) {
1107 /* only one byte: truncated UChar */
1108 pArgs->converter->toUBytes[0]=*s++;
1109 pArgs->converter->toULength=1;
1110 pArgs->source=(const char *)s;
1111 *err = U_TRUNCATED_CHAR_FOUND;
1112 return 0xffff;
1113 }
1114
1115 /* get one UChar */
1116 c=((UChar32)s[1]<<8)|*s;
1117 s+=2;
1118
1119 /* check for a surrogate pair */
1120 if(U_IS_SURROGATE(c)) {
1121 if(U16_IS_SURROGATE_LEAD(c)) {
1122 if(s+2<=sourceLimit) {
1123 UChar trail;
1124
1125 /* get a second UChar and see if it is a trail surrogate */
1126 trail=((UChar)s[1]<<8)|*s;
1127 if(U16_IS_TRAIL(trail)) {
1128 c=U16_GET_SUPPLEMENTARY(c, trail);
1129 s+=2;
1130 } else {
1131 /* unmatched lead surrogate */
1132 c=-2;
1133 }
1134 } else {
1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1136 uint8_t *bytes=pArgs->converter->toUBytes;
1137 s-=2;
1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1139 do {
1140 *bytes++=*s++;
1141 } while(s<sourceLimit);
1142
1143 c=0xffff;
1144 *err=U_TRUNCATED_CHAR_FOUND;
1145 }
1146 } else {
1147 /* unmatched trail surrogate */
1148 c=-2;
1149 }
1150
1151 if(c<0) {
1152 /* write the unmatched surrogate */
1153 uint8_t *bytes=pArgs->converter->toUBytes;
1154 pArgs->converter->toULength=2;
1155 *bytes=*(s-2);
1156 bytes[1]=*(s-1);
1157
1158 c=0xffff;
1159 *err=U_ILLEGAL_CHAR_FOUND;
1160 }
1161 }
1162
1163 pArgs->source=(const char *)s;
1164 return c;
1165 }
1166
1167 static void
_UTF16LEReset(UConverter * cnv,UConverterResetChoice choice)1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1169 if(choice<=UCNV_RESET_TO_UNICODE) {
1170 /* reset toUnicode state */
1171 if(UCNV_GET_VERSION(cnv)==0) {
1172 cnv->mode=8; /* no BOM handling */
1173 } else {
1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1175 }
1176 }
1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1180 }
1181 }
1182
1183 static void
_UTF16LEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1184 _UTF16LEOpen(UConverter *cnv,
1185 UConverterLoadArgs *pArgs,
1186 UErrorCode *pErrorCode) {
1187 if(UCNV_GET_VERSION(cnv)<=1) {
1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1189 } else {
1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1191 }
1192 }
1193
1194 static const char *
_UTF16LEGetName(const UConverter * cnv)1195 _UTF16LEGetName(const UConverter *cnv) {
1196 if(UCNV_GET_VERSION(cnv)==0) {
1197 return "UTF-16LE";
1198 } else {
1199 return "UTF-16LE,version=1";
1200 }
1201 }
1202
1203 static const UConverterImpl _UTF16LEImpl={
1204 UCNV_UTF16_LittleEndian,
1205
1206 NULL,
1207 NULL,
1208
1209 _UTF16LEOpen,
1210 NULL,
1211 _UTF16LEReset,
1212
1213 _UTF16LEToUnicodeWithOffsets,
1214 _UTF16LEToUnicodeWithOffsets,
1215 _UTF16LEFromUnicodeWithOffsets,
1216 _UTF16LEFromUnicodeWithOffsets,
1217 _UTF16LEGetNextUChar,
1218
1219 NULL,
1220 _UTF16LEGetName,
1221 NULL,
1222 NULL,
1223 ucnv_getNonSurrogateUnicodeSet
1224 };
1225
1226
1227 static const UConverterStaticData _UTF16LEStaticData={
1228 sizeof(UConverterStaticData),
1229 "UTF-16LE",
1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1232 0,
1233 0,
1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1235 };
1236
1237
1238 const UConverterSharedData _UTF16LEData={
1239 sizeof(UConverterSharedData), ~((uint32_t) 0),
1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
1241 0
1242 };
1243
1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1245
1246 /*
1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1248 * accordingly.
1249 * This is a simpler version of the UTF-32 converter, with
1250 * fewer states for shorter BOMs.
1251 *
1252 * State values:
1253 * 0 initial state
1254 * 1 saw first byte
1255 * 2..5 -
1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1257 * 8 UTF-16BE mode
1258 * 9 UTF-16LE mode
1259 *
1260 * During detection: state==number of initial bytes seen so far.
1261 *
1262 * On output, emit U+FEFF as the first code point.
1263 *
1264 * Variants:
1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1268 */
1269
1270 static void
_UTF16Reset(UConverter * cnv,UConverterResetChoice choice)1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1272 if(choice<=UCNV_RESET_TO_UNICODE) {
1273 /* reset toUnicode: state=0 */
1274 cnv->mode=0;
1275 }
1276 if(choice!=UCNV_RESET_TO_UNICODE) {
1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1279 }
1280 }
1281
1282 static const UConverterSharedData _UTF16v2Data;
1283
1284 static void
_UTF16Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1285 _UTF16Open(UConverter *cnv,
1286 UConverterLoadArgs *pArgs,
1287 UErrorCode *pErrorCode) {
1288 if(UCNV_GET_VERSION(cnv)<=2) {
1289 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1290 /*
1291 * Switch implementation, and switch the staticData that's different
1292 * and was copied into the UConverter.
1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1295 */
1296 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1297 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1298 }
1299 _UTF16Reset(cnv, UCNV_RESET_BOTH);
1300 } else {
1301 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1302 }
1303 }
1304
1305 static const char *
_UTF16GetName(const UConverter * cnv)1306 _UTF16GetName(const UConverter *cnv) {
1307 if(UCNV_GET_VERSION(cnv)==0) {
1308 return "UTF-16";
1309 } else if(UCNV_GET_VERSION(cnv)==1) {
1310 return "UTF-16,version=1";
1311 } else {
1312 return "UTF-16,version=2";
1313 }
1314 }
1315
1316 const UConverterSharedData _UTF16Data;
1317
1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
1321
1322 static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1324 UErrorCode *pErrorCode) {
1325 UConverter *cnv=pArgs->converter;
1326 const char *source=pArgs->source;
1327 const char *sourceLimit=pArgs->sourceLimit;
1328 int32_t *offsets=pArgs->offsets;
1329
1330 int32_t state, offsetDelta;
1331 uint8_t b;
1332
1333 state=cnv->mode;
1334
1335 /*
1336 * If we detect a BOM in this buffer, then we must add the BOM size to the
1337 * offsets because the actual converter function will not see and count the BOM.
1338 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1339 */
1340 offsetDelta=0;
1341
1342 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1343 switch(state) {
1344 case 0:
1345 cnv->toUBytes[0]=(uint8_t)*source++;
1346 cnv->toULength=1;
1347 state=1;
1348 break;
1349 case 1:
1350 /*
1351 * Only inside this switch case can the state variable
1352 * temporarily take two additional values:
1353 * 6: BOM error, continue with BE
1354 * 7: BOM error, continue with LE
1355 */
1356 b=*source;
1357 if(cnv->toUBytes[0]==0xfe && b==0xff) {
1358 if(IS_UTF16LE(cnv)) {
1359 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1360 } else {
1361 state=8; /* detect UTF-16BE */
1362 }
1363 } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1364 if(IS_UTF16BE(cnv)) {
1365 state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1366 } else {
1367 state=9; /* detect UTF-16LE */
1368 }
1369 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1370 state=6; /* illegal missing BOM for Java "Unicode" */
1371 }
1372 if(state>=8) {
1373 /* BOM detected, consume it */
1374 ++source;
1375 cnv->toULength=0;
1376 offsetDelta=(int32_t)(source-pArgs->source);
1377 } else if(state<6) {
1378 /* ok: no BOM, and not a reverse BOM */
1379 if(source!=pArgs->source) {
1380 /* reset the source for a correct first offset */
1381 source=pArgs->source;
1382 cnv->toULength=0;
1383 }
1384 if(IS_UTF16LE(cnv)) {
1385 /* Make Java "UnicodeLittle" default to LE. */
1386 state=9;
1387 } else {
1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1389 state=8;
1390 }
1391 } else {
1392 /*
1393 * error: missing BOM, or reverse BOM
1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1397 */
1398 /* report the non-BOM or reverse BOM as an illegal sequence */
1399 cnv->toUBytes[1]=b;
1400 cnv->toULength=2;
1401 pArgs->source=source+1;
1402 /* continue with conversion if the callback resets the error */
1403 /*
1404 * Make Java "Unicode" default to BE like standard UTF-16.
1405 * Make Java "UnicodeBig" and "UnicodeLittle" default
1406 * to their normal endiannesses.
1407 */
1408 cnv->mode=state+2;
1409 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1410 return;
1411 }
1412 /* convert the rest of the stream */
1413 cnv->mode=state;
1414 continue;
1415 case 8:
1416 /* call UTF-16BE */
1417 pArgs->source=source;
1418 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1419 source=pArgs->source;
1420 break;
1421 case 9:
1422 /* call UTF-16LE */
1423 pArgs->source=source;
1424 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1425 source=pArgs->source;
1426 break;
1427 default:
1428 break; /* does not occur */
1429 }
1430 }
1431
1432 /* add BOM size to offsets - see comment at offsetDelta declaration */
1433 if(offsets!=NULL && offsetDelta!=0) {
1434 int32_t *offsetsLimit=pArgs->offsets;
1435 while(offsets<offsetsLimit) {
1436 *offsets++ += offsetDelta;
1437 }
1438 }
1439
1440 pArgs->source=source;
1441
1442 if(source==sourceLimit && pArgs->flush) {
1443 /* handle truncated input */
1444 switch(state) {
1445 case 0:
1446 break; /* no input at all, nothing to do */
1447 case 8:
1448 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1449 break;
1450 case 9:
1451 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1452 break;
1453 default:
1454 /* 0<state<8: framework will report truncation, nothing to do here */
1455 break;
1456 }
1457 }
1458
1459 cnv->mode=state;
1460 }
1461
1462 static UChar32
_UTF16GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1464 UErrorCode *pErrorCode) {
1465 switch(pArgs->converter->mode) {
1466 case 8:
1467 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1468 case 9:
1469 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1470 default:
1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1472 }
1473 }
1474
1475 static const UConverterImpl _UTF16Impl = {
1476 UCNV_UTF16,
1477
1478 NULL,
1479 NULL,
1480
1481 _UTF16Open,
1482 NULL,
1483 _UTF16Reset,
1484
1485 _UTF16ToUnicodeWithOffsets,
1486 _UTF16ToUnicodeWithOffsets,
1487 _UTF16PEFromUnicodeWithOffsets,
1488 _UTF16PEFromUnicodeWithOffsets,
1489 _UTF16GetNextUChar,
1490
1491 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1492 _UTF16GetName,
1493 NULL,
1494 NULL,
1495 ucnv_getNonSurrogateUnicodeSet
1496 };
1497
1498 static const UConverterStaticData _UTF16StaticData = {
1499 sizeof(UConverterStaticData),
1500 "UTF-16",
1501 1204, /* CCSID for BOM sensitive UTF-16 */
1502 UCNV_IBM, UCNV_UTF16, 2, 2,
1503 #if U_IS_BIG_ENDIAN
1504 { 0xff, 0xfd, 0, 0 }, 2,
1505 #else
1506 { 0xfd, 0xff, 0, 0 }, 2,
1507 #endif
1508 FALSE, FALSE,
1509 0,
1510 0,
1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1512 };
1513
1514 const UConverterSharedData _UTF16Data = {
1515 sizeof(UConverterSharedData), ~((uint32_t) 0),
1516 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
1517 0
1518 };
1519
1520 static const UConverterImpl _UTF16v2Impl = {
1521 UCNV_UTF16,
1522
1523 NULL,
1524 NULL,
1525
1526 _UTF16Open,
1527 NULL,
1528 _UTF16Reset,
1529
1530 _UTF16ToUnicodeWithOffsets,
1531 _UTF16ToUnicodeWithOffsets,
1532 _UTF16BEFromUnicodeWithOffsets,
1533 _UTF16BEFromUnicodeWithOffsets,
1534 _UTF16GetNextUChar,
1535
1536 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1537 _UTF16GetName,
1538 NULL,
1539 NULL,
1540 ucnv_getNonSurrogateUnicodeSet
1541 };
1542
1543 static const UConverterStaticData _UTF16v2StaticData = {
1544 sizeof(UConverterStaticData),
1545 "UTF-16,version=2",
1546 1204, /* CCSID for BOM sensitive UTF-16 */
1547 UCNV_IBM, UCNV_UTF16, 2, 2,
1548 { 0xff, 0xfd, 0, 0 }, 2,
1549 FALSE, FALSE,
1550 0,
1551 0,
1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1553 };
1554
1555 static const UConverterSharedData _UTF16v2Data = {
1556 sizeof(UConverterSharedData), ~((uint32_t) 0),
1557 NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
1558 0
1559 };
1560
1561 #endif
1562