/*
**********************************************************************
*   Copyright (C) 2002-2006, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u16.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
*/
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "cmemory.h"
25
/*
 * Value kept in UConverter.fromUnicodeStatus by the converters in this file:
 * UCNV_NEED_TO_WRITE_BOM means the byte order mark still has to be written
 * before the first converted output.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
29
/* UTF-16BE ----------------------------------------------------------------- */
31
/*
 * "PE" = platform endianness: alias the fromUnicode function whose byte order
 * matches the build platform. It is used by the BOM-detecting "UTF-16"
 * converter's function table.
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
37
38
39 static void
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)40 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
41 UErrorCode *pErrorCode) {
42 UConverter *cnv;
43 const UChar *source;
44 char *target;
45 int32_t *offsets;
46
47 uint32_t targetCapacity, length, sourceIndex;
48 UChar c, trail;
49 char overflow[4];
50
51 source=pArgs->source;
52 length=(int32_t)(pArgs->sourceLimit-source);
53 if(length<=0) {
54 /* no input, nothing to do */
55 return;
56 }
57
58 cnv=pArgs->converter;
59
60 /* write the BOM if necessary */
61 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
62 static const char bom[]={ (char)0xfe, (char)0xff };
63 ucnv_fromUWriteBytes(cnv,
64 bom, 2,
65 &pArgs->target, pArgs->targetLimit,
66 &pArgs->offsets, -1,
67 pErrorCode);
68 cnv->fromUnicodeStatus=0;
69 }
70
71 target=pArgs->target;
72 if(target >= pArgs->targetLimit) {
73 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
74 return;
75 }
76
77 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
78 offsets=pArgs->offsets;
79 sourceIndex=0;
80
81 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
82
83 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
84 /* the last buffer ended with a lead surrogate, output the surrogate pair */
85 ++source;
86 --length;
87 target[0]=(uint8_t)(c>>8);
88 target[1]=(uint8_t)c;
89 target[2]=(uint8_t)(trail>>8);
90 target[3]=(uint8_t)trail;
91 target+=4;
92 targetCapacity-=4;
93 if(offsets!=NULL) {
94 *offsets++=-1;
95 *offsets++=-1;
96 *offsets++=-1;
97 *offsets++=-1;
98 }
99 sourceIndex=1;
100 cnv->fromUChar32=c=0;
101 }
102
103 if(c==0) {
104 /* copy an even number of bytes for complete UChars */
105 uint32_t count=2*length;
106 if(count>targetCapacity) {
107 count=targetCapacity&~1;
108 }
109 /* count is even */
110 targetCapacity-=count;
111 count>>=1;
112 length-=count;
113
114 if(offsets==NULL) {
115 while(count>0) {
116 c=*source++;
117 if(U16_IS_SINGLE(c)) {
118 target[0]=(uint8_t)(c>>8);
119 target[1]=(uint8_t)c;
120 target+=2;
121 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
122 ++source;
123 --count;
124 target[0]=(uint8_t)(c>>8);
125 target[1]=(uint8_t)c;
126 target[2]=(uint8_t)(trail>>8);
127 target[3]=(uint8_t)trail;
128 target+=4;
129 } else {
130 break;
131 }
132 --count;
133 }
134 } else {
135 while(count>0) {
136 c=*source++;
137 if(U16_IS_SINGLE(c)) {
138 target[0]=(uint8_t)(c>>8);
139 target[1]=(uint8_t)c;
140 target+=2;
141 *offsets++=sourceIndex;
142 *offsets++=sourceIndex++;
143 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
144 ++source;
145 --count;
146 target[0]=(uint8_t)(c>>8);
147 target[1]=(uint8_t)c;
148 target[2]=(uint8_t)(trail>>8);
149 target[3]=(uint8_t)trail;
150 target+=4;
151 *offsets++=sourceIndex;
152 *offsets++=sourceIndex;
153 *offsets++=sourceIndex;
154 *offsets++=sourceIndex;
155 sourceIndex+=2;
156 } else {
157 break;
158 }
159 --count;
160 }
161 }
162
163 if(count==0) {
164 /* done with the loop for complete UChars */
165 if(length>0 && targetCapacity>0) {
166 /*
167 * there is more input and some target capacity -
168 * it must be targetCapacity==1 because otherwise
169 * the above would have copied more;
170 * prepare for overflow output
171 */
172 if(U16_IS_SINGLE(c=*source++)) {
173 overflow[0]=(char)(c>>8);
174 overflow[1]=(char)c;
175 length=2; /* 2 bytes to output */
176 c=0;
177 /* } else { keep c for surrogate handling, length will be set there */
178 }
179 } else {
180 length=0;
181 c=0;
182 }
183 } else {
184 /* keep c for surrogate handling, length will be set there */
185 targetCapacity+=2*count;
186 }
187 } else {
188 length=0; /* from here on, length counts the bytes in overflow[] */
189 }
190
191 if(c!=0) {
192 /*
193 * c is a surrogate, and
194 * - source or target too short
195 * - or the surrogate is unmatched
196 */
197 length=0;
198 if(U16_IS_SURROGATE_LEAD(c)) {
199 if(source<pArgs->sourceLimit) {
200 if(U16_IS_TRAIL(trail=*source)) {
201 /* output the surrogate pair, will overflow (see conditions comment above) */
202 ++source;
203 overflow[0]=(char)(c>>8);
204 overflow[1]=(char)c;
205 overflow[2]=(char)(trail>>8);
206 overflow[3]=(char)trail;
207 length=4; /* 4 bytes to output */
208 c=0;
209 } else {
210 /* unmatched lead surrogate */
211 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
212 }
213 } else {
214 /* see if the trail surrogate is in the next buffer */
215 }
216 } else {
217 /* unmatched trail surrogate */
218 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
219 }
220 cnv->fromUChar32=c;
221 }
222
223 if(length>0) {
224 /* output length bytes with overflow (length>targetCapacity>0) */
225 ucnv_fromUWriteBytes(cnv,
226 overflow, length,
227 (char **)&target, pArgs->targetLimit,
228 &offsets, sourceIndex,
229 pErrorCode);
230 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
231 }
232
233 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
234 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
235 }
236
237 /* write back the updated pointers */
238 pArgs->source=source;
239 pArgs->target=(char *)target;
240 pArgs->offsets=offsets;
241 }
242
243 static void
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)244 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
245 UErrorCode *pErrorCode) {
246 UConverter *cnv;
247 const uint8_t *source;
248 UChar *target;
249 int32_t *offsets;
250
251 uint32_t targetCapacity, length, count, sourceIndex;
252 UChar c, trail;
253
254 cnv=pArgs->converter;
255 source=(const uint8_t *)pArgs->source;
256 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
257 if(length<=0 && cnv->toUnicodeStatus==0) {
258 /* no input, nothing to do */
259 return;
260 }
261
262 target=pArgs->target;
263 if(target >= pArgs->targetLimit) {
264 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265 return;
266 }
267
268 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
269 offsets=pArgs->offsets;
270 sourceIndex=0;
271 c=0;
272
273 /* complete a partial UChar or pair from the last call */
274 if(cnv->toUnicodeStatus!=0) {
275 /*
276 * special case: single byte from a previous buffer,
277 * where the byte turned out not to belong to a trail surrogate
278 * and the preceding, unmatched lead surrogate was put into toUBytes[]
279 * for error handling
280 */
281 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
282 cnv->toULength=1;
283 cnv->toUnicodeStatus=0;
284 }
285 if((count=cnv->toULength)!=0) {
286 uint8_t *p=cnv->toUBytes;
287 do {
288 p[count++]=*source++;
289 ++sourceIndex;
290 --length;
291 if(count==2) {
292 c=((UChar)p[0]<<8)|p[1];
293 if(U16_IS_SINGLE(c)) {
294 /* output the BMP code point */
295 *target++=c;
296 if(offsets!=NULL) {
297 *offsets++=-1;
298 }
299 --targetCapacity;
300 count=0;
301 c=0;
302 break;
303 } else if(U16_IS_SURROGATE_LEAD(c)) {
304 /* continue collecting bytes for the trail surrogate */
305 c=0; /* avoid unnecessary surrogate handling below */
306 } else {
307 /* fall through to error handling for an unmatched trail surrogate */
308 break;
309 }
310 } else if(count==4) {
311 c=((UChar)p[0]<<8)|p[1];
312 trail=((UChar)p[2]<<8)|p[3];
313 if(U16_IS_TRAIL(trail)) {
314 /* output the surrogate pair */
315 *target++=c;
316 if(targetCapacity>=2) {
317 *target++=trail;
318 if(offsets!=NULL) {
319 *offsets++=-1;
320 *offsets++=-1;
321 }
322 targetCapacity-=2;
323 } else /* targetCapacity==1 */ {
324 targetCapacity=0;
325 cnv->UCharErrorBuffer[0]=trail;
326 cnv->UCharErrorBufferLength=1;
327 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
328 }
329 count=0;
330 c=0;
331 break;
332 } else {
333 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
334 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
335
336 /* back out reading the code unit after it */
337 if(((const uint8_t *)pArgs->source-source)>=2) {
338 source-=2;
339 } else {
340 /*
341 * if the trail unit's first byte was in a previous buffer, then
342 * we need to put it into a special place because toUBytes[] will be
343 * used for the lead unit's bytes
344 */
345 cnv->toUnicodeStatus=0x100|p[2];
346 --source;
347 }
348 cnv->toULength=2;
349
350 /* write back the updated pointers */
351 pArgs->source=(const char *)source;
352 pArgs->target=target;
353 pArgs->offsets=offsets;
354 return;
355 }
356 }
357 } while(length>0);
358 cnv->toULength=(int8_t)count;
359 }
360
361 /* copy an even number of bytes for complete UChars */
362 count=2*targetCapacity;
363 if(count>length) {
364 count=length&~1;
365 }
366 if(c==0 && count>0) {
367 length-=count;
368 count>>=1;
369 targetCapacity-=count;
370 if(offsets==NULL) {
371 do {
372 c=((UChar)source[0]<<8)|source[1];
373 source+=2;
374 if(U16_IS_SINGLE(c)) {
375 *target++=c;
376 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
377 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
378 ) {
379 source+=2;
380 --count;
381 *target++=c;
382 *target++=trail;
383 } else {
384 break;
385 }
386 } while(--count>0);
387 } else {
388 do {
389 c=((UChar)source[0]<<8)|source[1];
390 source+=2;
391 if(U16_IS_SINGLE(c)) {
392 *target++=c;
393 *offsets++=sourceIndex;
394 sourceIndex+=2;
395 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
396 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
397 ) {
398 source+=2;
399 --count;
400 *target++=c;
401 *target++=trail;
402 *offsets++=sourceIndex;
403 *offsets++=sourceIndex;
404 sourceIndex+=4;
405 } else {
406 break;
407 }
408 } while(--count>0);
409 }
410
411 if(count==0) {
412 /* done with the loop for complete UChars */
413 c=0;
414 } else {
415 /* keep c for surrogate handling, trail will be set there */
416 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
417 targetCapacity+=count;
418 }
419 }
420
421 if(c!=0) {
422 /*
423 * c is a surrogate, and
424 * - source or target too short
425 * - or the surrogate is unmatched
426 */
427 cnv->toUBytes[0]=(uint8_t)(c>>8);
428 cnv->toUBytes[1]=(uint8_t)c;
429 cnv->toULength=2;
430
431 if(U16_IS_SURROGATE_LEAD(c)) {
432 if(length>=2) {
433 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
434 /* output the surrogate pair, will overflow (see conditions comment above) */
435 source+=2;
436 length-=2;
437 *target++=c;
438 if(offsets!=NULL) {
439 *offsets++=sourceIndex;
440 }
441 cnv->UCharErrorBuffer[0]=trail;
442 cnv->UCharErrorBufferLength=1;
443 cnv->toULength=0;
444 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
445 } else {
446 /* unmatched lead surrogate */
447 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
448 }
449 } else {
450 /* see if the trail surrogate is in the next buffer */
451 }
452 } else {
453 /* unmatched trail surrogate */
454 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
455 }
456 }
457
458 if(U_SUCCESS(*pErrorCode)) {
459 /* check for a remaining source byte */
460 if(length>0) {
461 if(targetCapacity==0) {
462 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
463 } else {
464 /* it must be length==1 because otherwise the above would have copied more */
465 cnv->toUBytes[cnv->toULength++]=*source++;
466 }
467 }
468 }
469
470 /* write back the updated pointers */
471 pArgs->source=(const char *)source;
472 pArgs->target=target;
473 pArgs->offsets=offsets;
474 }
475
476 static UChar32
_UTF16BEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)477 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
478 const uint8_t *s, *sourceLimit;
479 UChar32 c;
480
481 s=(const uint8_t *)pArgs->source;
482 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
483
484 if(s>=sourceLimit) {
485 /* no input */
486 *err=U_INDEX_OUTOFBOUNDS_ERROR;
487 return 0xffff;
488 }
489
490 if(s+2>sourceLimit) {
491 /* only one byte: truncated UChar */
492 pArgs->converter->toUBytes[0]=*s++;
493 pArgs->converter->toULength=1;
494 pArgs->source=(const char *)s;
495 *err = U_TRUNCATED_CHAR_FOUND;
496 return 0xffff;
497 }
498
499 /* get one UChar */
500 c=((UChar32)*s<<8)|s[1];
501 s+=2;
502
503 /* check for a surrogate pair */
504 if(U_IS_SURROGATE(c)) {
505 if(U16_IS_SURROGATE_LEAD(c)) {
506 if(s+2<=sourceLimit) {
507 UChar trail;
508
509 /* get a second UChar and see if it is a trail surrogate */
510 trail=((UChar)*s<<8)|s[1];
511 if(U16_IS_TRAIL(trail)) {
512 c=U16_GET_SUPPLEMENTARY(c, trail);
513 s+=2;
514 } else {
515 /* unmatched lead surrogate */
516 c=-2;
517 }
518 } else {
519 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
520 uint8_t *bytes=pArgs->converter->toUBytes;
521 s-=2;
522 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
523 do {
524 *bytes++=*s++;
525 } while(s<sourceLimit);
526
527 c=0xffff;
528 *err=U_TRUNCATED_CHAR_FOUND;
529 }
530 } else {
531 /* unmatched trail surrogate */
532 c=-2;
533 }
534
535 if(c<0) {
536 /* write the unmatched surrogate */
537 uint8_t *bytes=pArgs->converter->toUBytes;
538 pArgs->converter->toULength=2;
539 *bytes=*(s-2);
540 bytes[1]=*(s-1);
541
542 c=0xffff;
543 *err=U_ILLEGAL_CHAR_FOUND;
544 }
545 }
546
547 pArgs->source=(const char *)s;
548 return c;
549 }
550
551 static const UConverterImpl _UTF16BEImpl={
552 UCNV_UTF16_BigEndian,
553
554 NULL,
555 NULL,
556
557 NULL,
558 NULL,
559 NULL,
560
561 _UTF16BEToUnicodeWithOffsets,
562 _UTF16BEToUnicodeWithOffsets,
563 _UTF16BEFromUnicodeWithOffsets,
564 _UTF16BEFromUnicodeWithOffsets,
565 _UTF16BEGetNextUChar,
566
567 NULL,
568 NULL,
569 NULL,
570 NULL,
571 ucnv_getNonSurrogateUnicodeSet
572 };
573
574 static const UConverterStaticData _UTF16BEStaticData={
575 sizeof(UConverterStaticData),
576 "UTF-16BE",
577 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
578 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
579 0,
580 0,
581 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
582 };
583
584
585 const UConverterSharedData _UTF16BEData={
586 sizeof(UConverterSharedData), ~((uint32_t) 0),
587 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
588 0
589 };
590
/* UTF-16LE ----------------------------------------------------------------- */
592
593 static void
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)594 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
595 UErrorCode *pErrorCode) {
596 UConverter *cnv;
597 const UChar *source;
598 char *target;
599 int32_t *offsets;
600
601 uint32_t targetCapacity, length, sourceIndex;
602 UChar c, trail;
603 char overflow[4];
604
605 source=pArgs->source;
606 length=(int32_t)(pArgs->sourceLimit-source);
607 if(length<=0) {
608 /* no input, nothing to do */
609 return;
610 }
611
612 cnv=pArgs->converter;
613
614 /* write the BOM if necessary */
615 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
616 static const char bom[]={ (char)0xff, (char)0xfe };
617 ucnv_fromUWriteBytes(cnv,
618 bom, 2,
619 &pArgs->target, pArgs->targetLimit,
620 &pArgs->offsets, -1,
621 pErrorCode);
622 cnv->fromUnicodeStatus=0;
623 }
624
625 target=pArgs->target;
626 if(target >= pArgs->targetLimit) {
627 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
628 return;
629 }
630
631 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
632 offsets=pArgs->offsets;
633 sourceIndex=0;
634
635 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
636
637 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
638 /* the last buffer ended with a lead surrogate, output the surrogate pair */
639 ++source;
640 --length;
641 target[0]=(uint8_t)c;
642 target[1]=(uint8_t)(c>>8);
643 target[2]=(uint8_t)trail;
644 target[3]=(uint8_t)(trail>>8);
645 target+=4;
646 targetCapacity-=4;
647 if(offsets!=NULL) {
648 *offsets++=-1;
649 *offsets++=-1;
650 *offsets++=-1;
651 *offsets++=-1;
652 }
653 sourceIndex=1;
654 cnv->fromUChar32=c=0;
655 }
656
657 if(c==0) {
658 /* copy an even number of bytes for complete UChars */
659 uint32_t count=2*length;
660 if(count>targetCapacity) {
661 count=targetCapacity&~1;
662 }
663 /* count is even */
664 targetCapacity-=count;
665 count>>=1;
666 length-=count;
667
668 if(offsets==NULL) {
669 while(count>0) {
670 c=*source++;
671 if(U16_IS_SINGLE(c)) {
672 target[0]=(uint8_t)c;
673 target[1]=(uint8_t)(c>>8);
674 target+=2;
675 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
676 ++source;
677 --count;
678 target[0]=(uint8_t)c;
679 target[1]=(uint8_t)(c>>8);
680 target[2]=(uint8_t)trail;
681 target[3]=(uint8_t)(trail>>8);
682 target+=4;
683 } else {
684 break;
685 }
686 --count;
687 }
688 } else {
689 while(count>0) {
690 c=*source++;
691 if(U16_IS_SINGLE(c)) {
692 target[0]=(uint8_t)c;
693 target[1]=(uint8_t)(c>>8);
694 target+=2;
695 *offsets++=sourceIndex;
696 *offsets++=sourceIndex++;
697 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
698 ++source;
699 --count;
700 target[0]=(uint8_t)c;
701 target[1]=(uint8_t)(c>>8);
702 target[2]=(uint8_t)trail;
703 target[3]=(uint8_t)(trail>>8);
704 target+=4;
705 *offsets++=sourceIndex;
706 *offsets++=sourceIndex;
707 *offsets++=sourceIndex;
708 *offsets++=sourceIndex;
709 sourceIndex+=2;
710 } else {
711 break;
712 }
713 --count;
714 }
715 }
716
717 if(count==0) {
718 /* done with the loop for complete UChars */
719 if(length>0 && targetCapacity>0) {
720 /*
721 * there is more input and some target capacity -
722 * it must be targetCapacity==1 because otherwise
723 * the above would have copied more;
724 * prepare for overflow output
725 */
726 if(U16_IS_SINGLE(c=*source++)) {
727 overflow[0]=(char)c;
728 overflow[1]=(char)(c>>8);
729 length=2; /* 2 bytes to output */
730 c=0;
731 /* } else { keep c for surrogate handling, length will be set there */
732 }
733 } else {
734 length=0;
735 c=0;
736 }
737 } else {
738 /* keep c for surrogate handling, length will be set there */
739 targetCapacity+=2*count;
740 }
741 } else {
742 length=0; /* from here on, length counts the bytes in overflow[] */
743 }
744
745 if(c!=0) {
746 /*
747 * c is a surrogate, and
748 * - source or target too short
749 * - or the surrogate is unmatched
750 */
751 length=0;
752 if(U16_IS_SURROGATE_LEAD(c)) {
753 if(source<pArgs->sourceLimit) {
754 if(U16_IS_TRAIL(trail=*source)) {
755 /* output the surrogate pair, will overflow (see conditions comment above) */
756 ++source;
757 overflow[0]=(char)c;
758 overflow[1]=(char)(c>>8);
759 overflow[2]=(char)trail;
760 overflow[3]=(char)(trail>>8);
761 length=4; /* 4 bytes to output */
762 c=0;
763 } else {
764 /* unmatched lead surrogate */
765 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
766 }
767 } else {
768 /* see if the trail surrogate is in the next buffer */
769 }
770 } else {
771 /* unmatched trail surrogate */
772 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
773 }
774 cnv->fromUChar32=c;
775 }
776
777 if(length>0) {
778 /* output length bytes with overflow (length>targetCapacity>0) */
779 ucnv_fromUWriteBytes(cnv,
780 overflow, length,
781 &target, pArgs->targetLimit,
782 &offsets, sourceIndex,
783 pErrorCode);
784 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
785 }
786
787 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
788 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
789 }
790
791 /* write back the updated pointers */
792 pArgs->source=source;
793 pArgs->target=target;
794 pArgs->offsets=offsets;
795 }
796
797 static void
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)798 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
799 UErrorCode *pErrorCode) {
800 UConverter *cnv;
801 const uint8_t *source;
802 UChar *target;
803 int32_t *offsets;
804
805 uint32_t targetCapacity, length, count, sourceIndex;
806 UChar c, trail;
807
808 cnv=pArgs->converter;
809 source=(const uint8_t *)pArgs->source;
810 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
811 if(length<=0 && cnv->toUnicodeStatus==0) {
812 /* no input, nothing to do */
813 return;
814 }
815
816 target=pArgs->target;
817 if(target >= pArgs->targetLimit) {
818 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
819 return;
820 }
821
822 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
823 offsets=pArgs->offsets;
824 sourceIndex=0;
825 c=0;
826
827 /* complete a partial UChar or pair from the last call */
828 if(cnv->toUnicodeStatus!=0) {
829 /*
830 * special case: single byte from a previous buffer,
831 * where the byte turned out not to belong to a trail surrogate
832 * and the preceding, unmatched lead surrogate was put into toUBytes[]
833 * for error handling
834 */
835 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
836 cnv->toULength=1;
837 cnv->toUnicodeStatus=0;
838 }
839 if((count=cnv->toULength)!=0) {
840 uint8_t *p=cnv->toUBytes;
841 do {
842 p[count++]=*source++;
843 ++sourceIndex;
844 --length;
845 if(count==2) {
846 c=((UChar)p[1]<<8)|p[0];
847 if(U16_IS_SINGLE(c)) {
848 /* output the BMP code point */
849 *target++=c;
850 if(offsets!=NULL) {
851 *offsets++=-1;
852 }
853 --targetCapacity;
854 count=0;
855 c=0;
856 break;
857 } else if(U16_IS_SURROGATE_LEAD(c)) {
858 /* continue collecting bytes for the trail surrogate */
859 c=0; /* avoid unnecessary surrogate handling below */
860 } else {
861 /* fall through to error handling for an unmatched trail surrogate */
862 break;
863 }
864 } else if(count==4) {
865 c=((UChar)p[1]<<8)|p[0];
866 trail=((UChar)p[3]<<8)|p[2];
867 if(U16_IS_TRAIL(trail)) {
868 /* output the surrogate pair */
869 *target++=c;
870 if(targetCapacity>=2) {
871 *target++=trail;
872 if(offsets!=NULL) {
873 *offsets++=-1;
874 *offsets++=-1;
875 }
876 targetCapacity-=2;
877 } else /* targetCapacity==1 */ {
878 targetCapacity=0;
879 cnv->UCharErrorBuffer[0]=trail;
880 cnv->UCharErrorBufferLength=1;
881 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
882 }
883 count=0;
884 c=0;
885 break;
886 } else {
887 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
888 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
889
890 /* back out reading the code unit after it */
891 if(((const uint8_t *)pArgs->source-source)>=2) {
892 source-=2;
893 } else {
894 /*
895 * if the trail unit's first byte was in a previous buffer, then
896 * we need to put it into a special place because toUBytes[] will be
897 * used for the lead unit's bytes
898 */
899 cnv->toUnicodeStatus=0x100|p[2];
900 --source;
901 }
902 cnv->toULength=2;
903
904 /* write back the updated pointers */
905 pArgs->source=(const char *)source;
906 pArgs->target=target;
907 pArgs->offsets=offsets;
908 return;
909 }
910 }
911 } while(length>0);
912 cnv->toULength=(int8_t)count;
913 }
914
915 /* copy an even number of bytes for complete UChars */
916 count=2*targetCapacity;
917 if(count>length) {
918 count=length&~1;
919 }
920 if(c==0 && count>0) {
921 length-=count;
922 count>>=1;
923 targetCapacity-=count;
924 if(offsets==NULL) {
925 do {
926 c=((UChar)source[1]<<8)|source[0];
927 source+=2;
928 if(U16_IS_SINGLE(c)) {
929 *target++=c;
930 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
931 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
932 ) {
933 source+=2;
934 --count;
935 *target++=c;
936 *target++=trail;
937 } else {
938 break;
939 }
940 } while(--count>0);
941 } else {
942 do {
943 c=((UChar)source[1]<<8)|source[0];
944 source+=2;
945 if(U16_IS_SINGLE(c)) {
946 *target++=c;
947 *offsets++=sourceIndex;
948 sourceIndex+=2;
949 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
950 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
951 ) {
952 source+=2;
953 --count;
954 *target++=c;
955 *target++=trail;
956 *offsets++=sourceIndex;
957 *offsets++=sourceIndex;
958 sourceIndex+=4;
959 } else {
960 break;
961 }
962 } while(--count>0);
963 }
964
965 if(count==0) {
966 /* done with the loop for complete UChars */
967 c=0;
968 } else {
969 /* keep c for surrogate handling, trail will be set there */
970 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
971 targetCapacity+=count;
972 }
973 }
974
975 if(c!=0) {
976 /*
977 * c is a surrogate, and
978 * - source or target too short
979 * - or the surrogate is unmatched
980 */
981 cnv->toUBytes[0]=(uint8_t)c;
982 cnv->toUBytes[1]=(uint8_t)(c>>8);
983 cnv->toULength=2;
984
985 if(U16_IS_SURROGATE_LEAD(c)) {
986 if(length>=2) {
987 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
988 /* output the surrogate pair, will overflow (see conditions comment above) */
989 source+=2;
990 length-=2;
991 *target++=c;
992 if(offsets!=NULL) {
993 *offsets++=sourceIndex;
994 }
995 cnv->UCharErrorBuffer[0]=trail;
996 cnv->UCharErrorBufferLength=1;
997 cnv->toULength=0;
998 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
999 } else {
1000 /* unmatched lead surrogate */
1001 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1002 }
1003 } else {
1004 /* see if the trail surrogate is in the next buffer */
1005 }
1006 } else {
1007 /* unmatched trail surrogate */
1008 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1009 }
1010 }
1011
1012 if(U_SUCCESS(*pErrorCode)) {
1013 /* check for a remaining source byte */
1014 if(length>0) {
1015 if(targetCapacity==0) {
1016 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1017 } else {
1018 /* it must be length==1 because otherwise the above would have copied more */
1019 cnv->toUBytes[cnv->toULength++]=*source++;
1020 }
1021 }
1022 }
1023
1024 /* write back the updated pointers */
1025 pArgs->source=(const char *)source;
1026 pArgs->target=target;
1027 pArgs->offsets=offsets;
1028 }
1029
1030 static UChar32
_UTF16LEGetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * err)1031 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1032 const uint8_t *s, *sourceLimit;
1033 UChar32 c;
1034
1035 s=(const uint8_t *)pArgs->source;
1036 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1037
1038 if(s>=sourceLimit) {
1039 /* no input */
1040 *err=U_INDEX_OUTOFBOUNDS_ERROR;
1041 return 0xffff;
1042 }
1043
1044 if(s+2>sourceLimit) {
1045 /* only one byte: truncated UChar */
1046 pArgs->converter->toUBytes[0]=*s++;
1047 pArgs->converter->toULength=1;
1048 pArgs->source=(const char *)s;
1049 *err = U_TRUNCATED_CHAR_FOUND;
1050 return 0xffff;
1051 }
1052
1053 /* get one UChar */
1054 c=((UChar32)s[1]<<8)|*s;
1055 s+=2;
1056
1057 /* check for a surrogate pair */
1058 if(U_IS_SURROGATE(c)) {
1059 if(U16_IS_SURROGATE_LEAD(c)) {
1060 if(s+2<=sourceLimit) {
1061 UChar trail;
1062
1063 /* get a second UChar and see if it is a trail surrogate */
1064 trail=((UChar)s[1]<<8)|*s;
1065 if(U16_IS_TRAIL(trail)) {
1066 c=U16_GET_SUPPLEMENTARY(c, trail);
1067 s+=2;
1068 } else {
1069 /* unmatched lead surrogate */
1070 c=-2;
1071 }
1072 } else {
1073 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1074 uint8_t *bytes=pArgs->converter->toUBytes;
1075 s-=2;
1076 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1077 do {
1078 *bytes++=*s++;
1079 } while(s<sourceLimit);
1080
1081 c=0xffff;
1082 *err=U_TRUNCATED_CHAR_FOUND;
1083 }
1084 } else {
1085 /* unmatched trail surrogate */
1086 c=-2;
1087 }
1088
1089 if(c<0) {
1090 /* write the unmatched surrogate */
1091 uint8_t *bytes=pArgs->converter->toUBytes;
1092 pArgs->converter->toULength=2;
1093 *bytes=*(s-2);
1094 bytes[1]=*(s-1);
1095
1096 c=0xffff;
1097 *err=U_ILLEGAL_CHAR_FOUND;
1098 }
1099 }
1100
1101 pArgs->source=(const char *)s;
1102 return c;
1103 }
1104
1105 static const UConverterImpl _UTF16LEImpl={
1106 UCNV_UTF16_LittleEndian,
1107
1108 NULL,
1109 NULL,
1110
1111 NULL,
1112 NULL,
1113 NULL,
1114
1115 _UTF16LEToUnicodeWithOffsets,
1116 _UTF16LEToUnicodeWithOffsets,
1117 _UTF16LEFromUnicodeWithOffsets,
1118 _UTF16LEFromUnicodeWithOffsets,
1119 _UTF16LEGetNextUChar,
1120
1121 NULL,
1122 NULL,
1123 NULL,
1124 NULL,
1125 ucnv_getNonSurrogateUnicodeSet
1126 };
1127
1128
1129 static const UConverterStaticData _UTF16LEStaticData={
1130 sizeof(UConverterStaticData),
1131 "UTF-16LE",
1132 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1133 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1134 0,
1135 0,
1136 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1137 };
1138
1139
1140 const UConverterSharedData _UTF16LEData={
1141 sizeof(UConverterSharedData), ~((uint32_t) 0),
1142 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
1143 0
1144 };
1145
/* UTF-16 (Detect BOM) ------------------------------------------------------ */
1147
/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the UTF-32 converter, with
 * fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw FE
 * 2..4 -
 * 5    saw FF
 * 6..7 -
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state&3==number of matching bytes so far.
 *
 * On output, emit U+FEFF as the first code point.
 */
1167
1168 static void
_UTF16Reset(UConverter * cnv,UConverterResetChoice choice)1169 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1170 if(choice<=UCNV_RESET_TO_UNICODE) {
1171 /* reset toUnicode: state=0 */
1172 cnv->mode=0;
1173 }
1174 if(choice!=UCNV_RESET_TO_UNICODE) {
1175 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1176 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1177 }
1178 }
1179
1180 static void
_UTF16Open(UConverter * cnv,const char * name,const char * locale,uint32_t options,UErrorCode * pErrorCode)1181 _UTF16Open(UConverter *cnv,
1182 const char *name,
1183 const char *locale,
1184 uint32_t options,
1185 UErrorCode *pErrorCode) {
1186 _UTF16Reset(cnv, UCNV_RESET_BOTH);
1187 }
1188
/*
 * The two UTF-16 byte order marks, each padded to four bytes so that a
 * detection state can index the table directly: state 1 compares against
 * utf16BOM[1], state 5 against utf16BOM[5], and state&4 is the start of the
 * BOM that is being matched.
 */
static const char utf16BOM[8]={
    (char)0xfe, (char)0xff, 0, 0,       /* [0..3] FE FF - big-endian BOM */
    (char)0xff, (char)0xfe, 0, 0        /* [4..7] FF FE - little-endian BOM */
};
1190
1191 static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1192 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1193 UErrorCode *pErrorCode) {
1194 UConverter *cnv=pArgs->converter;
1195 const char *source=pArgs->source;
1196 const char *sourceLimit=pArgs->sourceLimit;
1197 int32_t *offsets=pArgs->offsets;
1198
1199 int32_t state, offsetDelta;
1200 char b;
1201
1202 state=cnv->mode;
1203
1204 /*
1205 * If we detect a BOM in this buffer, then we must add the BOM size to the
1206 * offsets because the actual converter function will not see and count the BOM.
1207 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1208 */
1209 offsetDelta=0;
1210
1211 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1212 switch(state) {
1213 case 0:
1214 b=*source;
1215 if(b==(char)0xfe) {
1216 state=1; /* could be FE FF */
1217 } else if(b==(char)0xff) {
1218 state=5; /* could be FF FE */
1219 } else {
1220 state=8; /* default to UTF-16BE */
1221 continue;
1222 }
1223 ++source;
1224 break;
1225 case 1:
1226 case 5:
1227 if(*source==utf16BOM[state]) {
1228 ++source;
1229 if(state==1) {
1230 state=8; /* detect UTF-16BE */
1231 offsetDelta=(int32_t)(source-pArgs->source);
1232 } else if(state==5) {
1233 state=9; /* detect UTF-16LE */
1234 offsetDelta=(int32_t)(source-pArgs->source);
1235 }
1236 } else {
1237 /* switch to UTF-16BE and pass the previous bytes */
1238 if(source!=pArgs->source) {
1239 /* just reset the source */
1240 source=pArgs->source;
1241 } else {
1242 UBool oldFlush=pArgs->flush;
1243
1244 /* the first byte is from a previous buffer, replay it first */
1245 pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1246 pArgs->sourceLimit=pArgs->source+1; /* replay previous byte */
1247 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1248
1249 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1250
1251 /* restore real pointers; pArgs->source will be set in case 8/9 */
1252 pArgs->sourceLimit=sourceLimit;
1253 pArgs->flush=oldFlush;
1254 }
1255 state=8;
1256 continue;
1257 }
1258 break;
1259 case 8:
1260 /* call UTF-16BE */
1261 pArgs->source=source;
1262 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1263 source=pArgs->source;
1264 break;
1265 case 9:
1266 /* call UTF-16LE */
1267 pArgs->source=source;
1268 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1269 source=pArgs->source;
1270 break;
1271 default:
1272 break; /* does not occur */
1273 }
1274 }
1275
1276 /* add BOM size to offsets - see comment at offsetDelta declaration */
1277 if(offsets!=NULL && offsetDelta!=0) {
1278 int32_t *offsetsLimit=pArgs->offsets;
1279 while(offsets<offsetsLimit) {
1280 *offsets++ += offsetDelta;
1281 }
1282 }
1283
1284 pArgs->source=source;
1285
1286 if(source==sourceLimit && pArgs->flush) {
1287 /* handle truncated input */
1288 switch(state) {
1289 case 0:
1290 break; /* no input at all, nothing to do */
1291 case 8:
1292 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1293 break;
1294 case 9:
1295 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1296 break;
1297 default:
1298 /* handle 0<state<8: call UTF-16BE with too-short input */
1299 pArgs->source=utf16BOM+(state&4); /* select the correct BOM */
1300 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1301
1302 /* no offsets: not enough for output */
1303 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1304 pArgs->source=source;
1305 pArgs->sourceLimit=sourceLimit;
1306 state=8;
1307 break;
1308 }
1309 }
1310
1311 cnv->mode=state;
1312 }
1313
1314 static UChar32
_UTF16GetNextUChar(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1315 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1316 UErrorCode *pErrorCode) {
1317 switch(pArgs->converter->mode) {
1318 case 8:
1319 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1320 case 9:
1321 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1322 default:
1323 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1324 }
1325 }
1326
1327 static const UConverterImpl _UTF16Impl = {
1328 UCNV_UTF16,
1329
1330 NULL,
1331 NULL,
1332
1333 _UTF16Open,
1334 NULL,
1335 _UTF16Reset,
1336
1337 _UTF16ToUnicodeWithOffsets,
1338 _UTF16ToUnicodeWithOffsets,
1339 _UTF16PEFromUnicodeWithOffsets,
1340 _UTF16PEFromUnicodeWithOffsets,
1341 _UTF16GetNextUChar,
1342
1343 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1344 NULL,
1345 NULL,
1346 NULL,
1347 ucnv_getNonSurrogateUnicodeSet
1348 };
1349
1350 static const UConverterStaticData _UTF16StaticData = {
1351 sizeof(UConverterStaticData),
1352 "UTF-16",
1353 1204, /* CCSID for BOM sensitive UTF-16 */
1354 UCNV_IBM, UCNV_UTF16, 2, 2,
1355 #if U_IS_BIG_ENDIAN
1356 { 0xff, 0xfd, 0, 0 }, 2,
1357 #else
1358 { 0xfd, 0xff, 0, 0 }, 2,
1359 #endif
1360 FALSE, FALSE,
1361 0,
1362 0,
1363 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1364 };
1365
1366 const UConverterSharedData _UTF16Data = {
1367 sizeof(UConverterSharedData), ~((uint32_t) 0),
1368 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
1369 0
1370 };
1371
1372 #endif
1373