1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u16.c
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
15 *
16 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION
22
23 #include "unicode/ucnv.h"
24 #include "unicode/uversion.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28
/* Value kept in UConverter.fromUnicodeStatus by the fromUnicode functions in this file. */
enum {
    /* the byte order mark has not been written yet and must precede all other output */
    UCNV_NEED_TO_WRITE_BOM = 1
};
32
33 U_CDECL_BEGIN
34 /*
35 * The UTF-16 toUnicode implementation is also used for the Java-specific
36 * "with BOM" variants of UTF-16BE and UTF-16LE.
37 */
38 static void U_CALLCONV
39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
40 UErrorCode *pErrorCode);
41
42 /* UTF-16BE ----------------------------------------------------------------- */
43
/*
 * "PE" = platform endianness: alias the fromUnicode function whose byte order
 * matches the byte order of the machine this file is compiled for.
 */
#if U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#endif
49
50
51 static void U_CALLCONV
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
53 UErrorCode *pErrorCode) {
54 UConverter *cnv;
55 const UChar *source;
56 char *target;
57 int32_t *offsets;
58
59 uint32_t targetCapacity, length, sourceIndex;
60 UChar c, trail;
61 char overflow[4];
62
63 source=pArgs->source;
64 length=(int32_t)(pArgs->sourceLimit-source);
65 if(length<=0) {
66 /* no input, nothing to do */
67 return;
68 }
69
70 cnv=pArgs->converter;
71
72 /* write the BOM if necessary */
73 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
74 static const char bom[]={ (char)0xfeu, (char)0xffu };
75 ucnv_fromUWriteBytes(cnv,
76 bom, 2,
77 &pArgs->target, pArgs->targetLimit,
78 &pArgs->offsets, -1,
79 pErrorCode);
80 cnv->fromUnicodeStatus=0;
81 }
82
83 target=pArgs->target;
84 if(target >= pArgs->targetLimit) {
85 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86 return;
87 }
88
89 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
90 offsets=pArgs->offsets;
91 sourceIndex=0;
92
93 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
94
95 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
96 /* the last buffer ended with a lead surrogate, output the surrogate pair */
97 ++source;
98 --length;
99 target[0]=(uint8_t)(c>>8);
100 target[1]=(uint8_t)c;
101 target[2]=(uint8_t)(trail>>8);
102 target[3]=(uint8_t)trail;
103 target+=4;
104 targetCapacity-=4;
105 if(offsets!=NULL) {
106 *offsets++=-1;
107 *offsets++=-1;
108 *offsets++=-1;
109 *offsets++=-1;
110 }
111 sourceIndex=1;
112 cnv->fromUChar32=c=0;
113 }
114
115 if(c==0) {
116 /* copy an even number of bytes for complete UChars */
117 uint32_t count=2*length;
118 if(count>targetCapacity) {
119 count=targetCapacity&~1;
120 }
121 /* count is even */
122 targetCapacity-=count;
123 count>>=1;
124 length-=count;
125
126 if(offsets==NULL) {
127 while(count>0) {
128 c=*source++;
129 if(U16_IS_SINGLE(c)) {
130 target[0]=(uint8_t)(c>>8);
131 target[1]=(uint8_t)c;
132 target+=2;
133 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
134 ++source;
135 --count;
136 target[0]=(uint8_t)(c>>8);
137 target[1]=(uint8_t)c;
138 target[2]=(uint8_t)(trail>>8);
139 target[3]=(uint8_t)trail;
140 target+=4;
141 } else {
142 break;
143 }
144 --count;
145 }
146 } else {
147 while(count>0) {
148 c=*source++;
149 if(U16_IS_SINGLE(c)) {
150 target[0]=(uint8_t)(c>>8);
151 target[1]=(uint8_t)c;
152 target+=2;
153 *offsets++=sourceIndex;
154 *offsets++=sourceIndex++;
155 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
156 ++source;
157 --count;
158 target[0]=(uint8_t)(c>>8);
159 target[1]=(uint8_t)c;
160 target[2]=(uint8_t)(trail>>8);
161 target[3]=(uint8_t)trail;
162 target+=4;
163 *offsets++=sourceIndex;
164 *offsets++=sourceIndex;
165 *offsets++=sourceIndex;
166 *offsets++=sourceIndex;
167 sourceIndex+=2;
168 } else {
169 break;
170 }
171 --count;
172 }
173 }
174
175 if(count==0) {
176 /* done with the loop for complete UChars */
177 if(length>0 && targetCapacity>0) {
178 /*
179 * there is more input and some target capacity -
180 * it must be targetCapacity==1 because otherwise
181 * the above would have copied more;
182 * prepare for overflow output
183 */
184 if(U16_IS_SINGLE(c=*source++)) {
185 overflow[0]=(char)(c>>8);
186 overflow[1]=(char)c;
187 length=2; /* 2 bytes to output */
188 c=0;
189 /* } else { keep c for surrogate handling, length will be set there */
190 }
191 } else {
192 length=0;
193 c=0;
194 }
195 } else {
196 /* keep c for surrogate handling, length will be set there */
197 targetCapacity+=2*count;
198 }
199 } else {
200 length=0; /* from here on, length counts the bytes in overflow[] */
201 }
202
203 if(c!=0) {
204 /*
205 * c is a surrogate, and
206 * - source or target too short
207 * - or the surrogate is unmatched
208 */
209 length=0;
210 if(U16_IS_SURROGATE_LEAD(c)) {
211 if(source<pArgs->sourceLimit) {
212 if(U16_IS_TRAIL(trail=*source)) {
213 /* output the surrogate pair, will overflow (see conditions comment above) */
214 ++source;
215 overflow[0]=(char)(c>>8);
216 overflow[1]=(char)c;
217 overflow[2]=(char)(trail>>8);
218 overflow[3]=(char)trail;
219 length=4; /* 4 bytes to output */
220 c=0;
221 } else {
222 /* unmatched lead surrogate */
223 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
224 }
225 } else {
226 /* see if the trail surrogate is in the next buffer */
227 }
228 } else {
229 /* unmatched trail surrogate */
230 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
231 }
232 cnv->fromUChar32=c;
233 }
234
235 if(length>0) {
236 /* output length bytes with overflow (length>targetCapacity>0) */
237 ucnv_fromUWriteBytes(cnv,
238 overflow, length,
239 (char **)&target, pArgs->targetLimit,
240 &offsets, sourceIndex,
241 pErrorCode);
242 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
243 }
244
245 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
246 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
247 }
248
249 /* write back the updated pointers */
250 pArgs->source=source;
251 pArgs->target=(char *)target;
252 pArgs->offsets=offsets;
253 }
254
255 static void U_CALLCONV
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
257 UErrorCode *pErrorCode) {
258 UConverter *cnv;
259 const uint8_t *source;
260 UChar *target;
261 int32_t *offsets;
262
263 uint32_t targetCapacity, length, count, sourceIndex;
264 UChar c, trail;
265
266 if(pArgs->converter->mode<8) {
267 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
268 return;
269 }
270
271 cnv=pArgs->converter;
272 source=(const uint8_t *)pArgs->source;
273 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
274 if(length<=0 && cnv->toUnicodeStatus==0) {
275 /* no input, nothing to do */
276 return;
277 }
278
279 target=pArgs->target;
280 if(target >= pArgs->targetLimit) {
281 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
282 return;
283 }
284
285 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
286 offsets=pArgs->offsets;
287 sourceIndex=0;
288 c=0;
289
290 /* complete a partial UChar or pair from the last call */
291 if(cnv->toUnicodeStatus!=0) {
292 /*
293 * special case: single byte from a previous buffer,
294 * where the byte turned out not to belong to a trail surrogate
295 * and the preceding, unmatched lead surrogate was put into toUBytes[]
296 * for error handling
297 */
298 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
299 cnv->toULength=1;
300 cnv->toUnicodeStatus=0;
301 }
302 if((count=cnv->toULength)!=0) {
303 uint8_t *p=cnv->toUBytes;
304 do {
305 p[count++]=*source++;
306 ++sourceIndex;
307 --length;
308 if(count==2) {
309 c=((UChar)p[0]<<8)|p[1];
310 if(U16_IS_SINGLE(c)) {
311 /* output the BMP code point */
312 *target++=c;
313 if(offsets!=NULL) {
314 *offsets++=-1;
315 }
316 --targetCapacity;
317 count=0;
318 c=0;
319 break;
320 } else if(U16_IS_SURROGATE_LEAD(c)) {
321 /* continue collecting bytes for the trail surrogate */
322 c=0; /* avoid unnecessary surrogate handling below */
323 } else {
324 /* fall through to error handling for an unmatched trail surrogate */
325 break;
326 }
327 } else if(count==4) {
328 c=((UChar)p[0]<<8)|p[1];
329 trail=((UChar)p[2]<<8)|p[3];
330 if(U16_IS_TRAIL(trail)) {
331 /* output the surrogate pair */
332 *target++=c;
333 if(targetCapacity>=2) {
334 *target++=trail;
335 if(offsets!=NULL) {
336 *offsets++=-1;
337 *offsets++=-1;
338 }
339 targetCapacity-=2;
340 } else /* targetCapacity==1 */ {
341 targetCapacity=0;
342 cnv->UCharErrorBuffer[0]=trail;
343 cnv->UCharErrorBufferLength=1;
344 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
345 }
346 count=0;
347 c=0;
348 break;
349 } else {
350 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
351 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
352
353 /* back out reading the code unit after it */
354 if(((const uint8_t *)pArgs->source-source)>=2) {
355 source-=2;
356 } else {
357 /*
358 * if the trail unit's first byte was in a previous buffer, then
359 * we need to put it into a special place because toUBytes[] will be
360 * used for the lead unit's bytes
361 */
362 cnv->toUnicodeStatus=0x100|p[2];
363 --source;
364 }
365 cnv->toULength=2;
366
367 /* write back the updated pointers */
368 pArgs->source=(const char *)source;
369 pArgs->target=target;
370 pArgs->offsets=offsets;
371 return;
372 }
373 }
374 } while(length>0);
375 cnv->toULength=(int8_t)count;
376 }
377
378 /* copy an even number of bytes for complete UChars */
379 count=2*targetCapacity;
380 if(count>length) {
381 count=length&~1;
382 }
383 if(c==0 && count>0) {
384 length-=count;
385 count>>=1;
386 targetCapacity-=count;
387 if(offsets==NULL) {
388 do {
389 c=((UChar)source[0]<<8)|source[1];
390 source+=2;
391 if(U16_IS_SINGLE(c)) {
392 *target++=c;
393 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
394 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
395 ) {
396 source+=2;
397 --count;
398 *target++=c;
399 *target++=trail;
400 } else {
401 break;
402 }
403 } while(--count>0);
404 } else {
405 do {
406 c=((UChar)source[0]<<8)|source[1];
407 source+=2;
408 if(U16_IS_SINGLE(c)) {
409 *target++=c;
410 *offsets++=sourceIndex;
411 sourceIndex+=2;
412 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
413 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
414 ) {
415 source+=2;
416 --count;
417 *target++=c;
418 *target++=trail;
419 *offsets++=sourceIndex;
420 *offsets++=sourceIndex;
421 sourceIndex+=4;
422 } else {
423 break;
424 }
425 } while(--count>0);
426 }
427
428 if(count==0) {
429 /* done with the loop for complete UChars */
430 c=0;
431 } else {
432 /* keep c for surrogate handling, trail will be set there */
433 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
434 targetCapacity+=count;
435 }
436 }
437
438 if(c!=0) {
439 /*
440 * c is a surrogate, and
441 * - source or target too short
442 * - or the surrogate is unmatched
443 */
444 cnv->toUBytes[0]=(uint8_t)(c>>8);
445 cnv->toUBytes[1]=(uint8_t)c;
446 cnv->toULength=2;
447
448 if(U16_IS_SURROGATE_LEAD(c)) {
449 if(length>=2) {
450 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
451 /* output the surrogate pair, will overflow (see conditions comment above) */
452 source+=2;
453 length-=2;
454 *target++=c;
455 if(offsets!=NULL) {
456 *offsets++=sourceIndex;
457 }
458 cnv->UCharErrorBuffer[0]=trail;
459 cnv->UCharErrorBufferLength=1;
460 cnv->toULength=0;
461 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
462 } else {
463 /* unmatched lead surrogate */
464 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
465 }
466 } else {
467 /* see if the trail surrogate is in the next buffer */
468 }
469 } else {
470 /* unmatched trail surrogate */
471 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
472 }
473 }
474
475 if(U_SUCCESS(*pErrorCode)) {
476 /* check for a remaining source byte */
477 if(length>0) {
478 if(targetCapacity==0) {
479 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
480 } else {
481 /* it must be length==1 because otherwise the above would have copied more */
482 cnv->toUBytes[cnv->toULength++]=*source++;
483 }
484 }
485 }
486
487 /* write back the updated pointers */
488 pArgs->source=(const char *)source;
489 pArgs->target=target;
490 pArgs->offsets=offsets;
491 }
492
493 static UChar32 U_CALLCONV
_UTF16BEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
495 const uint8_t *s, *sourceLimit;
496 UChar32 c;
497
498 if(pArgs->converter->mode<8) {
499 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
500 }
501
502 s=(const uint8_t *)pArgs->source;
503 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
504
505 if(s>=sourceLimit) {
506 /* no input */
507 *err=U_INDEX_OUTOFBOUNDS_ERROR;
508 return 0xffff;
509 }
510
511 if(s+2>sourceLimit) {
512 /* only one byte: truncated UChar */
513 pArgs->converter->toUBytes[0]=*s++;
514 pArgs->converter->toULength=1;
515 pArgs->source=(const char *)s;
516 *err = U_TRUNCATED_CHAR_FOUND;
517 return 0xffff;
518 }
519
520 /* get one UChar */
521 c=((UChar32)*s<<8)|s[1];
522 s+=2;
523
524 /* check for a surrogate pair */
525 if(U_IS_SURROGATE(c)) {
526 if(U16_IS_SURROGATE_LEAD(c)) {
527 if(s+2<=sourceLimit) {
528 UChar trail;
529
530 /* get a second UChar and see if it is a trail surrogate */
531 trail=((UChar)*s<<8)|s[1];
532 if(U16_IS_TRAIL(trail)) {
533 c=U16_GET_SUPPLEMENTARY(c, trail);
534 s+=2;
535 } else {
536 /* unmatched lead surrogate */
537 c=-2;
538 }
539 } else {
540 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
541 uint8_t *bytes=pArgs->converter->toUBytes;
542 s-=2;
543 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
544 do {
545 *bytes++=*s++;
546 } while(s<sourceLimit);
547
548 c=0xffff;
549 *err=U_TRUNCATED_CHAR_FOUND;
550 }
551 } else {
552 /* unmatched trail surrogate */
553 c=-2;
554 }
555
556 if(c<0) {
557 /* write the unmatched surrogate */
558 uint8_t *bytes=pArgs->converter->toUBytes;
559 pArgs->converter->toULength=2;
560 *bytes=*(s-2);
561 bytes[1]=*(s-1);
562
563 c=0xffff;
564 *err=U_ILLEGAL_CHAR_FOUND;
565 }
566 }
567
568 pArgs->source=(const char *)s;
569 return c;
570 }
571
572 static void U_CALLCONV
_UTF16BEReset(UConverter * cnv,UConverterResetChoice choice)573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
574 if(choice<=UCNV_RESET_TO_UNICODE) {
575 /* reset toUnicode state */
576 if(UCNV_GET_VERSION(cnv)==0) {
577 cnv->mode=8; /* no BOM handling */
578 } else {
579 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
580 }
581 }
582 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
583 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
584 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
585 }
586 }
587
588 static void U_CALLCONV
_UTF16BEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)589 _UTF16BEOpen(UConverter *cnv,
590 UConverterLoadArgs *pArgs,
591 UErrorCode *pErrorCode) {
592 (void)pArgs;
593 if(UCNV_GET_VERSION(cnv)<=1) {
594 _UTF16BEReset(cnv, UCNV_RESET_BOTH);
595 } else {
596 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
597 }
598 }
599
600 static const char * U_CALLCONV
_UTF16BEGetName(const UConverter * cnv)601 _UTF16BEGetName(const UConverter *cnv) {
602 if(UCNV_GET_VERSION(cnv)==0) {
603 return "UTF-16BE";
604 } else {
605 return "UTF-16BE,version=1";
606 }
607 }
608 U_CDECL_END
609
610 static const UConverterImpl _UTF16BEImpl={
611 UCNV_UTF16_BigEndian,
612
613 NULL,
614 NULL,
615
616 _UTF16BEOpen,
617 NULL,
618 _UTF16BEReset,
619
620 _UTF16BEToUnicodeWithOffsets,
621 _UTF16BEToUnicodeWithOffsets,
622 _UTF16BEFromUnicodeWithOffsets,
623 _UTF16BEFromUnicodeWithOffsets,
624 _UTF16BEGetNextUChar,
625
626 NULL,
627 _UTF16BEGetName,
628 NULL,
629 NULL,
630 ucnv_getNonSurrogateUnicodeSet,
631
632 NULL,
633 NULL
634 };
635
636 static const UConverterStaticData _UTF16BEStaticData={
637 sizeof(UConverterStaticData),
638 "UTF-16BE",
639 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
640 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
641 0,
642 0,
643 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
644 };
645
646
647 const UConverterSharedData _UTF16BEData=
648 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
649
650 /* UTF-16LE ----------------------------------------------------------------- */
651 U_CDECL_BEGIN
652 static void U_CALLCONV
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
654 UErrorCode *pErrorCode) {
655 UConverter *cnv;
656 const UChar *source;
657 char *target;
658 int32_t *offsets;
659
660 uint32_t targetCapacity, length, sourceIndex;
661 UChar c, trail;
662 char overflow[4];
663
664 source=pArgs->source;
665 length=(int32_t)(pArgs->sourceLimit-source);
666 if(length<=0) {
667 /* no input, nothing to do */
668 return;
669 }
670
671 cnv=pArgs->converter;
672
673 /* write the BOM if necessary */
674 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
675 static const char bom[]={ (char)0xffu, (char)0xfeu };
676 ucnv_fromUWriteBytes(cnv,
677 bom, 2,
678 &pArgs->target, pArgs->targetLimit,
679 &pArgs->offsets, -1,
680 pErrorCode);
681 cnv->fromUnicodeStatus=0;
682 }
683
684 target=pArgs->target;
685 if(target >= pArgs->targetLimit) {
686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687 return;
688 }
689
690 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
691 offsets=pArgs->offsets;
692 sourceIndex=0;
693
694 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
695
696 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
697 /* the last buffer ended with a lead surrogate, output the surrogate pair */
698 ++source;
699 --length;
700 target[0]=(uint8_t)c;
701 target[1]=(uint8_t)(c>>8);
702 target[2]=(uint8_t)trail;
703 target[3]=(uint8_t)(trail>>8);
704 target+=4;
705 targetCapacity-=4;
706 if(offsets!=NULL) {
707 *offsets++=-1;
708 *offsets++=-1;
709 *offsets++=-1;
710 *offsets++=-1;
711 }
712 sourceIndex=1;
713 cnv->fromUChar32=c=0;
714 }
715
716 if(c==0) {
717 /* copy an even number of bytes for complete UChars */
718 uint32_t count=2*length;
719 if(count>targetCapacity) {
720 count=targetCapacity&~1;
721 }
722 /* count is even */
723 targetCapacity-=count;
724 count>>=1;
725 length-=count;
726
727 if(offsets==NULL) {
728 while(count>0) {
729 c=*source++;
730 if(U16_IS_SINGLE(c)) {
731 target[0]=(uint8_t)c;
732 target[1]=(uint8_t)(c>>8);
733 target+=2;
734 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
735 ++source;
736 --count;
737 target[0]=(uint8_t)c;
738 target[1]=(uint8_t)(c>>8);
739 target[2]=(uint8_t)trail;
740 target[3]=(uint8_t)(trail>>8);
741 target+=4;
742 } else {
743 break;
744 }
745 --count;
746 }
747 } else {
748 while(count>0) {
749 c=*source++;
750 if(U16_IS_SINGLE(c)) {
751 target[0]=(uint8_t)c;
752 target[1]=(uint8_t)(c>>8);
753 target+=2;
754 *offsets++=sourceIndex;
755 *offsets++=sourceIndex++;
756 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
757 ++source;
758 --count;
759 target[0]=(uint8_t)c;
760 target[1]=(uint8_t)(c>>8);
761 target[2]=(uint8_t)trail;
762 target[3]=(uint8_t)(trail>>8);
763 target+=4;
764 *offsets++=sourceIndex;
765 *offsets++=sourceIndex;
766 *offsets++=sourceIndex;
767 *offsets++=sourceIndex;
768 sourceIndex+=2;
769 } else {
770 break;
771 }
772 --count;
773 }
774 }
775
776 if(count==0) {
777 /* done with the loop for complete UChars */
778 if(length>0 && targetCapacity>0) {
779 /*
780 * there is more input and some target capacity -
781 * it must be targetCapacity==1 because otherwise
782 * the above would have copied more;
783 * prepare for overflow output
784 */
785 if(U16_IS_SINGLE(c=*source++)) {
786 overflow[0]=(char)c;
787 overflow[1]=(char)(c>>8);
788 length=2; /* 2 bytes to output */
789 c=0;
790 /* } else { keep c for surrogate handling, length will be set there */
791 }
792 } else {
793 length=0;
794 c=0;
795 }
796 } else {
797 /* keep c for surrogate handling, length will be set there */
798 targetCapacity+=2*count;
799 }
800 } else {
801 length=0; /* from here on, length counts the bytes in overflow[] */
802 }
803
804 if(c!=0) {
805 /*
806 * c is a surrogate, and
807 * - source or target too short
808 * - or the surrogate is unmatched
809 */
810 length=0;
811 if(U16_IS_SURROGATE_LEAD(c)) {
812 if(source<pArgs->sourceLimit) {
813 if(U16_IS_TRAIL(trail=*source)) {
814 /* output the surrogate pair, will overflow (see conditions comment above) */
815 ++source;
816 overflow[0]=(char)c;
817 overflow[1]=(char)(c>>8);
818 overflow[2]=(char)trail;
819 overflow[3]=(char)(trail>>8);
820 length=4; /* 4 bytes to output */
821 c=0;
822 } else {
823 /* unmatched lead surrogate */
824 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
825 }
826 } else {
827 /* see if the trail surrogate is in the next buffer */
828 }
829 } else {
830 /* unmatched trail surrogate */
831 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
832 }
833 cnv->fromUChar32=c;
834 }
835
836 if(length>0) {
837 /* output length bytes with overflow (length>targetCapacity>0) */
838 ucnv_fromUWriteBytes(cnv,
839 overflow, length,
840 &target, pArgs->targetLimit,
841 &offsets, sourceIndex,
842 pErrorCode);
843 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
844 }
845
846 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
847 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
848 }
849
850 /* write back the updated pointers */
851 pArgs->source=source;
852 pArgs->target=target;
853 pArgs->offsets=offsets;
854 }
855
856 static void U_CALLCONV
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
858 UErrorCode *pErrorCode) {
859 UConverter *cnv;
860 const uint8_t *source;
861 UChar *target;
862 int32_t *offsets;
863
864 uint32_t targetCapacity, length, count, sourceIndex;
865 UChar c, trail;
866
867 if(pArgs->converter->mode<8) {
868 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
869 return;
870 }
871
872 cnv=pArgs->converter;
873 source=(const uint8_t *)pArgs->source;
874 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
875 if(length<=0 && cnv->toUnicodeStatus==0) {
876 /* no input, nothing to do */
877 return;
878 }
879
880 target=pArgs->target;
881 if(target >= pArgs->targetLimit) {
882 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
883 return;
884 }
885
886 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
887 offsets=pArgs->offsets;
888 sourceIndex=0;
889 c=0;
890
891 /* complete a partial UChar or pair from the last call */
892 if(cnv->toUnicodeStatus!=0) {
893 /*
894 * special case: single byte from a previous buffer,
895 * where the byte turned out not to belong to a trail surrogate
896 * and the preceding, unmatched lead surrogate was put into toUBytes[]
897 * for error handling
898 */
899 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
900 cnv->toULength=1;
901 cnv->toUnicodeStatus=0;
902 }
903 if((count=cnv->toULength)!=0) {
904 uint8_t *p=cnv->toUBytes;
905 do {
906 p[count++]=*source++;
907 ++sourceIndex;
908 --length;
909 if(count==2) {
910 c=((UChar)p[1]<<8)|p[0];
911 if(U16_IS_SINGLE(c)) {
912 /* output the BMP code point */
913 *target++=c;
914 if(offsets!=NULL) {
915 *offsets++=-1;
916 }
917 --targetCapacity;
918 count=0;
919 c=0;
920 break;
921 } else if(U16_IS_SURROGATE_LEAD(c)) {
922 /* continue collecting bytes for the trail surrogate */
923 c=0; /* avoid unnecessary surrogate handling below */
924 } else {
925 /* fall through to error handling for an unmatched trail surrogate */
926 break;
927 }
928 } else if(count==4) {
929 c=((UChar)p[1]<<8)|p[0];
930 trail=((UChar)p[3]<<8)|p[2];
931 if(U16_IS_TRAIL(trail)) {
932 /* output the surrogate pair */
933 *target++=c;
934 if(targetCapacity>=2) {
935 *target++=trail;
936 if(offsets!=NULL) {
937 *offsets++=-1;
938 *offsets++=-1;
939 }
940 targetCapacity-=2;
941 } else /* targetCapacity==1 */ {
942 targetCapacity=0;
943 cnv->UCharErrorBuffer[0]=trail;
944 cnv->UCharErrorBufferLength=1;
945 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
946 }
947 count=0;
948 c=0;
949 break;
950 } else {
951 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
952 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
953
954 /* back out reading the code unit after it */
955 if(((const uint8_t *)pArgs->source-source)>=2) {
956 source-=2;
957 } else {
958 /*
959 * if the trail unit's first byte was in a previous buffer, then
960 * we need to put it into a special place because toUBytes[] will be
961 * used for the lead unit's bytes
962 */
963 cnv->toUnicodeStatus=0x100|p[2];
964 --source;
965 }
966 cnv->toULength=2;
967
968 /* write back the updated pointers */
969 pArgs->source=(const char *)source;
970 pArgs->target=target;
971 pArgs->offsets=offsets;
972 return;
973 }
974 }
975 } while(length>0);
976 cnv->toULength=(int8_t)count;
977 }
978
979 /* copy an even number of bytes for complete UChars */
980 count=2*targetCapacity;
981 if(count>length) {
982 count=length&~1;
983 }
984 if(c==0 && count>0) {
985 length-=count;
986 count>>=1;
987 targetCapacity-=count;
988 if(offsets==NULL) {
989 do {
990 c=((UChar)source[1]<<8)|source[0];
991 source+=2;
992 if(U16_IS_SINGLE(c)) {
993 *target++=c;
994 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
995 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
996 ) {
997 source+=2;
998 --count;
999 *target++=c;
1000 *target++=trail;
1001 } else {
1002 break;
1003 }
1004 } while(--count>0);
1005 } else {
1006 do {
1007 c=((UChar)source[1]<<8)|source[0];
1008 source+=2;
1009 if(U16_IS_SINGLE(c)) {
1010 *target++=c;
1011 *offsets++=sourceIndex;
1012 sourceIndex+=2;
1013 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1014 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1015 ) {
1016 source+=2;
1017 --count;
1018 *target++=c;
1019 *target++=trail;
1020 *offsets++=sourceIndex;
1021 *offsets++=sourceIndex;
1022 sourceIndex+=4;
1023 } else {
1024 break;
1025 }
1026 } while(--count>0);
1027 }
1028
1029 if(count==0) {
1030 /* done with the loop for complete UChars */
1031 c=0;
1032 } else {
1033 /* keep c for surrogate handling, trail will be set there */
1034 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1035 targetCapacity+=count;
1036 }
1037 }
1038
1039 if(c!=0) {
1040 /*
1041 * c is a surrogate, and
1042 * - source or target too short
1043 * - or the surrogate is unmatched
1044 */
1045 cnv->toUBytes[0]=(uint8_t)c;
1046 cnv->toUBytes[1]=(uint8_t)(c>>8);
1047 cnv->toULength=2;
1048
1049 if(U16_IS_SURROGATE_LEAD(c)) {
1050 if(length>=2) {
1051 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1052 /* output the surrogate pair, will overflow (see conditions comment above) */
1053 source+=2;
1054 length-=2;
1055 *target++=c;
1056 if(offsets!=NULL) {
1057 *offsets++=sourceIndex;
1058 }
1059 cnv->UCharErrorBuffer[0]=trail;
1060 cnv->UCharErrorBufferLength=1;
1061 cnv->toULength=0;
1062 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063 } else {
1064 /* unmatched lead surrogate */
1065 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066 }
1067 } else {
1068 /* see if the trail surrogate is in the next buffer */
1069 }
1070 } else {
1071 /* unmatched trail surrogate */
1072 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1073 }
1074 }
1075
1076 if(U_SUCCESS(*pErrorCode)) {
1077 /* check for a remaining source byte */
1078 if(length>0) {
1079 if(targetCapacity==0) {
1080 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081 } else {
1082 /* it must be length==1 because otherwise the above would have copied more */
1083 cnv->toUBytes[cnv->toULength++]=*source++;
1084 }
1085 }
1086 }
1087
1088 /* write back the updated pointers */
1089 pArgs->source=(const char *)source;
1090 pArgs->target=target;
1091 pArgs->offsets=offsets;
1092 }
1093
1094 static UChar32 U_CALLCONV
_UTF16LEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1096 const uint8_t *s, *sourceLimit;
1097 UChar32 c;
1098
1099 if(pArgs->converter->mode<8) {
1100 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1101 }
1102
1103 s=(const uint8_t *)pArgs->source;
1104 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1105
1106 if(s>=sourceLimit) {
1107 /* no input */
1108 *err=U_INDEX_OUTOFBOUNDS_ERROR;
1109 return 0xffff;
1110 }
1111
1112 if(s+2>sourceLimit) {
1113 /* only one byte: truncated UChar */
1114 pArgs->converter->toUBytes[0]=*s++;
1115 pArgs->converter->toULength=1;
1116 pArgs->source=(const char *)s;
1117 *err = U_TRUNCATED_CHAR_FOUND;
1118 return 0xffff;
1119 }
1120
1121 /* get one UChar */
1122 c=((UChar32)s[1]<<8)|*s;
1123 s+=2;
1124
1125 /* check for a surrogate pair */
1126 if(U_IS_SURROGATE(c)) {
1127 if(U16_IS_SURROGATE_LEAD(c)) {
1128 if(s+2<=sourceLimit) {
1129 UChar trail;
1130
1131 /* get a second UChar and see if it is a trail surrogate */
1132 trail=((UChar)s[1]<<8)|*s;
1133 if(U16_IS_TRAIL(trail)) {
1134 c=U16_GET_SUPPLEMENTARY(c, trail);
1135 s+=2;
1136 } else {
1137 /* unmatched lead surrogate */
1138 c=-2;
1139 }
1140 } else {
1141 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1142 uint8_t *bytes=pArgs->converter->toUBytes;
1143 s-=2;
1144 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1145 do {
1146 *bytes++=*s++;
1147 } while(s<sourceLimit);
1148
1149 c=0xffff;
1150 *err=U_TRUNCATED_CHAR_FOUND;
1151 }
1152 } else {
1153 /* unmatched trail surrogate */
1154 c=-2;
1155 }
1156
1157 if(c<0) {
1158 /* write the unmatched surrogate */
1159 uint8_t *bytes=pArgs->converter->toUBytes;
1160 pArgs->converter->toULength=2;
1161 *bytes=*(s-2);
1162 bytes[1]=*(s-1);
1163
1164 c=0xffff;
1165 *err=U_ILLEGAL_CHAR_FOUND;
1166 }
1167 }
1168
1169 pArgs->source=(const char *)s;
1170 return c;
1171 }
1172
1173 static void U_CALLCONV
_UTF16LEReset(UConverter * cnv,UConverterResetChoice choice)1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175 if(choice<=UCNV_RESET_TO_UNICODE) {
1176 /* reset toUnicode state */
1177 if(UCNV_GET_VERSION(cnv)==0) {
1178 cnv->mode=8; /* no BOM handling */
1179 } else {
1180 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1181 }
1182 }
1183 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1184 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186 }
1187 }
1188
1189 static void U_CALLCONV
_UTF16LEOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1190 _UTF16LEOpen(UConverter *cnv,
1191 UConverterLoadArgs *pArgs,
1192 UErrorCode *pErrorCode) {
1193 (void)pArgs;
1194 if(UCNV_GET_VERSION(cnv)<=1) {
1195 _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196 } else {
1197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198 }
1199 }
1200
1201 static const char * U_CALLCONV
_UTF16LEGetName(const UConverter * cnv)1202 _UTF16LEGetName(const UConverter *cnv) {
1203 if(UCNV_GET_VERSION(cnv)==0) {
1204 return "UTF-16LE";
1205 } else {
1206 return "UTF-16LE,version=1";
1207 }
1208 }
1209 U_CDECL_END
1210
/*
 * Implementation table for the UTF-16LE converter.
 *
 * The initializer is positional; the meaning of each slot is fixed by the
 * UConverterImpl declaration, which is not part of this file section.
 * NULL slots leave the corresponding callback unset.
 * NOTE(review): each WithOffsets function appears twice in a row, presumably
 * filling both the plain and the with-offsets entry point of its direction --
 * confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _UTF16LEImpl={
    UCNV_UTF16_LittleEndian,

    NULL,
    NULL,

    /* open / reset callbacks defined above */
    _UTF16LEOpen,
    NULL,
    _UTF16LEReset,

    /* conversion functions (both directions) and single-code-point reader */
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEGetNextUChar,

    NULL,
    _UTF16LEGetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1236
1237
/*
 * Static description of the UTF-16LE converter (positional initializer).
 *
 * Holds the structure size, the converter name "UTF-16LE" and the
 * UCNV_UTF16_LittleEndian type tag.
 * NOTE(review): 1202 is presumably the CCSID and the "2, 2" pair the
 * minimum/maximum bytes per character -- confirm against the
 * UConverterStaticData declaration.
 * NOTE(review): { 0xfd, 0xff, 0, 0 } with length 2 looks like the
 * substitution character U+FFFD in little-endian byte order -- confirm.
 */
static const UConverterStaticData _UTF16LEStaticData={
    sizeof(UConverterStaticData),
    "UTF-16LE",
    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
    { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1247
1248
/*
 * Shared-data object for UTF-16LE (not static, so visible to other files).
 * It pairs the static description with the implementation table; IS_UTF16LE()
 * compares a converter's sharedData pointer against this object's address.
 */
const UConverterSharedData _UTF16LEData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1251
1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1253
1254 /*
1255 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1256 * accordingly.
1257 * This is a simpler version of the UTF-32 converter, with
1258 * fewer states for shorter BOMs.
1259 *
1260 * State values:
1261 * 0 initial state
1262 * 1 saw first byte
 * 2..5 (unused)
1264 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1265 * 8 UTF-16BE mode
1266 * 9 UTF-16LE mode
1267 *
1268 * During detection: state==number of initial bytes seen so far.
1269 *
1270 * On output, emit U+FEFF as the first code point.
1271 *
1272 * Variants:
1273 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1274 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1275 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1276 */
1277 U_CDECL_BEGIN
1278 static void U_CALLCONV
_UTF16Reset(UConverter * cnv,UConverterResetChoice choice)1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280 if(choice<=UCNV_RESET_TO_UNICODE) {
1281 /* reset toUnicode: state=0 */
1282 cnv->mode=0;
1283 }
1284 if(choice!=UCNV_RESET_TO_UNICODE) {
1285 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1286 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1287 }
1288 }
1289 U_CDECL_END
1290 extern const UConverterSharedData _UTF16v2Data;
1291 U_CDECL_BEGIN
1292 static void U_CALLCONV
_UTF16Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)1293 _UTF16Open(UConverter *cnv,
1294 UConverterLoadArgs *pArgs,
1295 UErrorCode *pErrorCode) {
1296 if(UCNV_GET_VERSION(cnv)<=2) {
1297 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1298 /*
1299 * Switch implementation, and switch the staticData that's different
1300 * and was copied into the UConverter.
1301 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303 */
1304 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306 }
1307 _UTF16Reset(cnv, UCNV_RESET_BOTH);
1308 } else {
1309 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310 }
1311 }
1312
1313 static const char * U_CALLCONV
_UTF16GetName(const UConverter * cnv)1314 _UTF16GetName(const UConverter *cnv) {
1315 if(UCNV_GET_VERSION(cnv)==0) {
1316 return "UTF-16";
1317 } else if(UCNV_GET_VERSION(cnv)==1) {
1318 return "UTF-16,version=1";
1319 } else {
1320 return "UTF-16,version=2";
1321 }
1322 }
1323 U_CDECL_END
1324 extern const UConverterSharedData _UTF16Data;
1325
IS_UTF16BE(const UConverter * cnv)1326 static inline bool IS_UTF16BE(const UConverter *cnv) {
1327 return ((cnv)->sharedData == &_UTF16BEData);
1328 }
1329
IS_UTF16LE(const UConverter * cnv)1330 static inline bool IS_UTF16LE(const UConverter *cnv) {
1331 return ((cnv)->sharedData == &_UTF16LEData);
1332 }
1333
IS_UTF16(const UConverter * cnv)1334 static inline bool IS_UTF16(const UConverter *cnv) {
1335 return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
1336 }
1337
1338 U_CDECL_BEGIN
/*
 * toUnicode function with BOM detection.
 *
 * Inspects the first two bytes of the stream for a byte order mark
 * (FE FF or FF FE), settles on state 8 (UTF-16BE) or state 9 (UTF-16LE) and
 * from then on delegates to _UTF16BEToUnicodeWithOffsets() or
 * _UTF16LEToUnicodeWithOffsets(). The state is kept in cnv->mode so that
 * detection can span several calls (for example when the first buffer holds
 * only one byte).
 *
 * Besides the BOM-detecting UTF-16 converter, the function also checks for
 * converters backed by _UTF16BEData/_UTF16LEData (the Java-specific
 * "with BOM" variants), for which a reverse BOM is an error.
 *
 * pArgs       conversion arguments; source (and, via the delegates,
 *             presumably target/offsets) are advanced.
 * pErrorCode  set to U_ILLEGAL_ESCAPE_SEQUENCE for a reverse BOM, or for a
 *             missing BOM with UTF-16,version=1; the two offending bytes are
 *             left in cnv->toUBytes. Other errors may come from the delegates.
 */
static void U_CALLCONV
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    /* remember where the offsets start so they can be patched after the delegate ran */
    int32_t *offsets=pArgs->offsets;

    int32_t state, offsetDelta;
    uint8_t b;

    /* resume from the state left by the previous call or by a reset */
    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* stash the first byte; the decision is made once the second byte is available */
            cnv->toUBytes[0]=(uint8_t)*source++;
            cnv->toULength=1;
            state=1;
            break;
        case 1:
            /*
             * Only inside this switch case can the state variable
             * temporarily take two additional values:
             * 6: BOM error, continue with BE
             * 7: BOM error, continue with LE
             */
            /* peek at the second byte; it is consumed only if it completes a BOM or an error */
            b=*source;
            if(cnv->toUBytes[0]==0xfe && b==0xff) {
                if(IS_UTF16LE(cnv)) {
                    state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
                } else {
                    state=8; /* detect UTF-16BE */
                }
            } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
                if(IS_UTF16BE(cnv)) {
                    state=6; /* illegal reverse BOM for Java "UnicodeBig" */
                } else {
                    state=9; /* detect UTF-16LE */
                }
            } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
                state=6; /* illegal missing BOM for Java "Unicode" */
            }
            /* state is now 8/9 (BOM found), 6/7 (error) or still 1 (no BOM, which is fine) */
            if(state>=8) {
                /* BOM detected, consume it */
                ++source;
                cnv->toULength=0;
                /* 2 if both BOM bytes are in this buffer, 1 if the first one came in an earlier call */
                offsetDelta=(int32_t)(source-pArgs->source);
            } else if(state<6) {
                /* ok: no BOM, and not a reverse BOM */
                if(source!=pArgs->source) {
                    /* reset the source for a correct first offset */
                    /* the first byte was taken from this buffer: un-consume it so the delegate reads it as regular input */
                    source=pArgs->source;
                    cnv->toULength=0;
                }
                /*
                 * Otherwise the first byte arrived in an earlier call and stays in toUBytes/toULength.
                 * NOTE(review): presumably the delegate picks it up from there -- confirm.
                 */
                if(IS_UTF16LE(cnv)) {
                    /* Make Java "UnicodeLittle" default to LE. */
                    state=9;
                } else {
                    /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
                    state=8;
                }
            } else {
                /*
                 * error: missing BOM, or reverse BOM
                 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
                 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
                 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
                 */
                /* report the non-BOM or reverse BOM as an illegal sequence */
                cnv->toUBytes[1]=b;
                cnv->toULength=2;
                /* the second byte counts as consumed */
                pArgs->source=source+1;
                /* continue with conversion if the callback resets the error */
                /*
                 * Make Java "Unicode" default to BE like standard UTF-16.
                 * Make Java "UnicodeBig" and "UnicodeLittle" default
                 * to their normal endiannesses.
                 */
                /* 6 -> 8 (BE), 7 -> 9 (LE) */
                cnv->mode=state+2;
                *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
                return;
            }
            /* convert the rest of the stream */
            cnv->mode=state;
            continue;
        case 8:
            /* call UTF-16BE */
            pArgs->source=source;
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-16LE */
            pArgs->source=source;
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /*
         * Patch every offset between the start saved at entry and the current pArgs->offsets.
         * NOTE(review): relies on the delegates advancing pArgs->offsets -- confirm.
         */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        /* all input is consumed and the caller flushes: let the chosen delegate run once more */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        case 9:
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        default:
            /* 0<state<8: framework will report truncation, nothing to do here */
            break;
        }
    }

    /* persist the detection state for the next call */
    cnv->mode=state;
}
1478
1479 static UChar32 U_CALLCONV
_UTF16GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1481 UErrorCode *pErrorCode) {
1482 switch(pArgs->converter->mode) {
1483 case 8:
1484 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1485 case 9:
1486 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1487 default:
1488 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1489 }
1490 }
1491 U_CDECL_END
1492
/*
 * Implementation table for the BOM-detecting UTF-16 converter
 * (positional initializer; slot meanings are fixed by the UConverterImpl
 * declaration, which is not part of this file section).
 *
 * toUnicode uses the BOM-detecting function above. fromUnicode uses
 * _UTF16PEFromUnicodeWithOffsets, which the preprocessor maps to the BE or LE
 * writer depending on U_IS_BIG_ENDIAN.
 */
static const UConverterImpl _UTF16Impl = {
    UCNV_UTF16,

    NULL,
    NULL,

    /* open / reset callbacks defined above */
    _UTF16Open,
    NULL,
    _UTF16Reset,

    /* conversion functions (both directions) and single-code-point reader */
    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1518
/*
 * Static description of the BOM-detecting UTF-16 converter
 * (positional initializer).
 *
 * The two-byte sequence selected by U_IS_BIG_ENDIAN is FF FD on big-endian
 * builds and FD FF on little-endian builds.
 * NOTE(review): this looks like the substitution character U+FFFD in
 * platform byte order, matching the platform-endian fromUnicode function --
 * confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF16StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
#if U_IS_BIG_ENDIAN
    { 0xff, 0xfd, 0, 0 }, 2,
#else
    { 0xfd, 0xff, 0, 0 }, 2,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1534
/*
 * Shared-data object for the BOM-detecting UTF-16 converter (not static, so
 * visible to other files). IS_UTF16() compares a converter's sharedData
 * pointer against this object's address.
 */
const UConverterSharedData _UTF16Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1537
/*
 * Implementation table for UTF-16,version=2 (positional initializer).
 *
 * Identical to _UTF16Impl except for the fromUnicode functions: this variant
 * always uses the big-endian writer instead of the platform-endian one.
 * _UTF16Open() switches a converter to this table when version 2 is requested.
 */
static const UConverterImpl _UTF16v2Impl = {
    UCNV_UTF16,

    NULL,
    NULL,

    /* open / reset callbacks shared with the regular UTF-16 converter */
    _UTF16Open,
    NULL,
    _UTF16Reset,

    /* toUnicode with BOM detection; fromUnicode always big-endian */
    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1563
/*
 * Static description of UTF-16,version=2 (positional initializer).
 *
 * Unlike _UTF16StaticData, the two-byte sequence is always FF FD, whatever
 * the platform endianness. _UTF16Open() copies this subChar into the
 * converter when version 2 is selected.
 * NOTE(review): FF FD looks like U+FFFD in big-endian byte order, in line
 * with this variant's big-endian output -- confirm.
 */
static const UConverterStaticData _UTF16v2StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16,version=2",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
    { 0xff, 0xfd, 0, 0 }, 2,
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1575
/*
 * Shared-data object for UTF-16,version=2 (not static, so visible to other
 * files). _UTF16Open() installs it into the converter, and IS_UTF16() treats
 * it like _UTF16Data.
 */
const UConverterSharedData _UTF16v2Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1578
1579 #endif
1580