1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2002-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvbocu.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002mar27
14 * created by: Markus W. Scherer
15 *
16 * This is an implementation of the Binary Ordered Compression for Unicode,
17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_CONVERSION
23
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "putilimp.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "uassert.h"
31
32 /* BOCU-1 constants and macros ---------------------------------------------- */
33
34 /*
35 * BOCU-1 encodes the code points of a Unicode string as
36 * a sequence of byte-encoded differences (slope detection),
37 * preserving lexical order.
38 *
39 * Optimize the difference-taking for runs of Unicode text within
40 * small scripts:
41 *
42 * Most small scripts are allocated within aligned 128-blocks of Unicode
43 * code points. Lexical order is preserved if the "previous code point" state
44 * is always moved into the middle of such a block.
45 *
46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47 * areas into the middle of those areas.
48 *
49 * C0 control codes and space are encoded with their US-ASCII bytes.
50 * "prev" is reset for C0 controls but not for space.
51 */
52
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV 0x40

/* bounding byte values for differences */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
#define BOCU1_MAX_TRAIL 0xff
#define BOCU1_RESET 0xff

/* number of lead bytes (0xfe-0x21+1==222) */
#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values;
 * trail values at or above BOCU1_TRAIL_CONTROLS_COUNT map to byte values
 * by adding BOCU1_TRAIL_BYTE_OFFSET (0x21-20==13)
 */
#define BOCU1_TRAIL_CONTROLS_COUNT 20
#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20==243) */
#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE 64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2 43
#define BOCU1_LEAD_3 3
#define BOCU1_LEAD_4 1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3 \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values for positive differences: 0xd0, 0xfb, 0xfe. */
#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
/* ==BOCU1_MAX_LEAD */

/*
 * The lead byte start values for negative differences: 0x50, 0x25, 0x22.
 * Lead bytes below each start value belong to the next longer sequence.
 */
#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
/* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form
 * (see the return value description of packDiff()):
 * for 1..3-byte sequences the top byte holds the length,
 * for 4-byte sequences the top byte is the lead byte itself (>0x03).
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
119 /*
120 * 12 commonly used C0 control codes (and space) are only used to encode
121 * themselves directly,
122 * which makes BOCU-1 MIME-usable and reasonably safe for
123 * ASCII-oriented software.
124 *
125 * These controls are
126 * 0 NUL
127 *
128 * 7 BEL
129 * 8 BS
130 *
131 * 9 TAB
132 * a LF
133 * b VT
134 * c FF
135 * d CR
136 *
137 * e SO
138 * f SI
139 *
140 * 1a SUB
141 * 1b ESC
142 *
143 * The other 20 C0 controls are also encoded directly (to preserve order)
144 * but are also used as trail bytes in difference encoding
145 * (for better compression).
146 */
/*
 * Map a trail value t from the difference calculation to its external byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
 * all others are offset by BOCU1_TRAIL_BYTE_OFFSET.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
148
/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 *
 * The table is indexed by the byte value and has BOCU1_MIN (0x21) entries;
 * the -1 entries are exactly the directly encoded controls and space
 * (0, 7..0xf, 0x1a, 0x1b, 0x20).
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20   */
    -1
};
172
/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 *
 * This is the inverse of the non-negative entries of bocu1ByteToTrail[];
 * it is read through the BOCU1_TRAIL_TO_BYTE() macro.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};
189
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by
 * a semicolon, behaves like a single statement (for example as the
 * unbraced body of an if with an else branch).
 * The arguments are evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
211
/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte (BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1)? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte; only valid if DIFF_IS_SINGLE(diff). */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/**
 * Is a diff value encodable in two bytes (BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2)?
 * Also true for single-byte diffs, so test DIFF_IS_SINGLE() first.
 */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
222
223 /* BOCU-1 implementation functions ------------------------------------------ */
224
/*
 * "prev" for small scripts: offset BOCU1_ASCII_PREV (0x40) into the aligned
 * 128-code-point block that contains c.
 */
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
226
227 /**
228 * Compute the next "previous" value for differencing
229 * from the current code point.
230 *
231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232 * @return "previous code point" state value
233 */
234 static inline int32_t
bocu1Prev(int32_t c)235 bocu1Prev(int32_t c) {
236 /* compute new prev */
237 if(/* 0x3040<=c && */ c<=0x309f) {
238 /* Hiragana is not 128-aligned */
239 return 0x3070;
240 } else if(0x4e00<=c && c<=0x9fa5) {
241 /* CJK Unihan */
242 return 0x4e00-BOCU1_REACH_NEG_2;
243 } else if(0xac00<=c /* && c<=0xd7a3 */) {
244 /* Korean Hangul */
245 return (0xd7a3+0xac00)/2;
246 } else {
247 /* mostly small scripts */
248 return BOCU1_SIMPLE_PREV(c);
249 }
250 }
251
/**
 * Fast version of bocu1Prev() for most scripts:
 * code points outside 0x3040..0xd7a3 use BOCU1_SIMPLE_PREV() directly,
 * all others are passed to bocu1Prev().
 * The argument is evaluated more than once.
 */
#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
254
255 /*
256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257 * The UConverter fields are used as follows:
258 *
259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
260 *
261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
263 */
264
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
266
267 /**
268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269 * and return a packed integer with them.
270 *
271 * The encoding favors small absolute differences with short encodings
272 * to compress runs of same-script characters.
273 *
274 * Optimized version with unrolled loops and fewer floating-point operations
275 * than the standard packDiff().
276 *
277 * @param diff difference value -0x10ffff..0x10ffff
278 * @return
279 * 0x010000zz for 1-byte sequence zz
280 * 0x0200yyzz for 2-byte sequence yy zz
281 * 0x03xxyyzz for 3-byte sequence xx yy zz
282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283 */
284 static int32_t
packDiff(int32_t diff)285 packDiff(int32_t diff) {
286 int32_t result, m;
287
288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289 if(diff>=BOCU1_REACH_NEG_1) {
290 /* mostly positive differences, and single-byte negative ones */
291 #if 0 /* single-byte case handled in macros, see below */
292 if(diff<=BOCU1_REACH_POS_1) {
293 /* single byte */
294 return 0x01000000|(BOCU1_MIDDLE+diff);
295 } else
296 #endif
297 if(diff<=BOCU1_REACH_POS_2) {
298 /* two bytes */
299 diff-=BOCU1_REACH_POS_1+1;
300 result=0x02000000;
301
302 m=diff%BOCU1_TRAIL_COUNT;
303 diff/=BOCU1_TRAIL_COUNT;
304 result|=BOCU1_TRAIL_TO_BYTE(m);
305
306 result|=(BOCU1_START_POS_2+diff)<<8;
307 } else if(diff<=BOCU1_REACH_POS_3) {
308 /* three bytes */
309 diff-=BOCU1_REACH_POS_2+1;
310 result=0x03000000;
311
312 m=diff%BOCU1_TRAIL_COUNT;
313 diff/=BOCU1_TRAIL_COUNT;
314 result|=BOCU1_TRAIL_TO_BYTE(m);
315
316 m=diff%BOCU1_TRAIL_COUNT;
317 diff/=BOCU1_TRAIL_COUNT;
318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319
320 result|=(BOCU1_START_POS_3+diff)<<16;
321 } else {
322 /* four bytes */
323 diff-=BOCU1_REACH_POS_3+1;
324
325 m=diff%BOCU1_TRAIL_COUNT;
326 diff/=BOCU1_TRAIL_COUNT;
327 result=BOCU1_TRAIL_TO_BYTE(m);
328
329 m=diff%BOCU1_TRAIL_COUNT;
330 diff/=BOCU1_TRAIL_COUNT;
331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
332
333 /*
334 * We know that / and % would deliver quotient 0 and rest=diff.
335 * Avoid division and modulo for performance.
336 */
337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338
339 result|=((uint32_t)BOCU1_START_POS_4)<<24;
340 }
341 } else {
342 /* two- to four-byte negative differences */
343 if(diff>=BOCU1_REACH_NEG_2) {
344 /* two bytes */
345 diff-=BOCU1_REACH_NEG_1;
346 result=0x02000000;
347
348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
349 result|=BOCU1_TRAIL_TO_BYTE(m);
350
351 result|=(BOCU1_START_NEG_2+diff)<<8;
352 } else if(diff>=BOCU1_REACH_NEG_3) {
353 /* three bytes */
354 diff-=BOCU1_REACH_NEG_2;
355 result=0x03000000;
356
357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
358 result|=BOCU1_TRAIL_TO_BYTE(m);
359
360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362
363 result|=(BOCU1_START_NEG_3+diff)<<16;
364 } else {
365 /* four bytes */
366 diff-=BOCU1_REACH_NEG_3;
367
368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
369 result=BOCU1_TRAIL_TO_BYTE(m);
370
371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
373
374 /*
375 * We know that NEGDIVMOD would deliver
376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377 * Avoid division and modulo for performance.
378 */
379 m=diff+BOCU1_TRAIL_COUNT;
380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381
382 result|=BOCU1_MIN<<24;
383 }
384 }
385 return result;
386 }
387
388
/*
 * Convert UTF-16 code units from pArgs->source to BOCU-1 bytes in
 * pArgs->target, and write one entry to pArgs->offsets for every byte
 * written to the target: the source index at which the encoded
 * character started.
 *
 * Converter state:
 * - cnv->fromUChar32 holds a lead surrogate whose trail was not yet seen
 *   at the end of the previous buffer (0 if none).
 * - cnv->fromUnicodeStatus holds "prev" (0 is read as BOCU1_ASCII_PREV).
 *
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full;
 * bytes of a multi-byte sequence that do not fit are stored in
 * cnv->charErrorBuffer.
 *
 * offsets is dereferenced without a NULL check.
 */
static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

U_ALIGN_CODE(16)

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    /* a pending lead surrogate from the previous buffer is continued first */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    /* each iteration consumes one code unit and writes one byte */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                /* multi-byte difference: leave it to the regular loop */
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                    /* otherwise c stays the unpaired lead surrogate and is encoded as is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                /* same computation as the two-byte branches of packDiff() */
                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                    case 3: /*fall through*/
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2: /*fall through*/
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2: /*fall through*/
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1: /*fall through*/
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2: /*fall through*/
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    case 1: /*fall through*/
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    /* c<0 only if the input ended after a lead surrogate (see getTrail) */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}
635
/*
 * Convert UTF-16 code units from pArgs->source to BOCU-1 bytes in
 * pArgs->target, without recording offsets.
 *
 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
 *
 * Converter state:
 * - cnv->fromUChar32 holds a lead surrogate whose trail was not yet seen
 *   at the end of the previous buffer (0 if none).
 * - cnv->fromUnicodeStatus holds "prev" (0 is read as BOCU1_ASCII_PREV).
 *
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full;
 * bytes of a multi-byte sequence that do not fit are stored in
 * cnv->charErrorBuffer.
 */
static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    int32_t prev, c, diff;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* conversion loop */
    /* a pending lead surrogate from the previous buffer is continued first */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    /* each iteration consumes one code unit and writes one byte */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
            } else {
                /* multi-byte difference: leave it to the regular loop */
                break;
            }
        }
        ++source;
        --targetCapacity;
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                --targetCapacity;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                    /* otherwise c stays the unpaired lead surrogate and is encoded as is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                --targetCapacity;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                /* same computation as the two-byte branches of packDiff() */
                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                targetCapacity-=2;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    /*
                     * A 2-byte sequence cannot reach this switch: with
                     * targetCapacity>=2 it was written by the branch above.
                     */
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                    case 3: /*fall through*/
                        *target++=(uint8_t)(diff>>16);
                    /* case 2: handled above */
                        *target++=(uint8_t)(diff>>8);
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2: /*fall through*/
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1: /*fall through*/
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                    case 2: /*fall through*/
                        *target++=(uint8_t)(diff>>8);
                    case 1: /*fall through*/
                        *target++=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    /* c<0 only if the input ended after a lead surrogate (see getTrail) */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
}
855
856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
857
858 /**
859 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
860 *
861 * @param b lead byte;
862 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
863 * @return (diff<<2)|count
864 */
865 static inline int32_t
decodeBocu1LeadByte(int32_t b)866 decodeBocu1LeadByte(int32_t b) {
867 int32_t diff, count;
868
869 if(b>=BOCU1_START_NEG_2) {
870 /* positive difference */
871 if(b<BOCU1_START_POS_3) {
872 /* two bytes */
873 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
874 count=1;
875 } else if(b<BOCU1_START_POS_4) {
876 /* three bytes */
877 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
878 count=2;
879 } else {
880 /* four bytes */
881 diff=BOCU1_REACH_POS_3+1;
882 count=3;
883 }
884 } else {
885 /* negative difference */
886 if(b>=BOCU1_START_NEG_3) {
887 /* two bytes */
888 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
889 count=1;
890 } else if(b>BOCU1_MIN) {
891 /* three bytes */
892 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
893 count=2;
894 } else {
895 /* four bytes */
896 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
897 count=3;
898 }
899 }
900
901 /* return the state for decoding the trail byte(s) */
902 return (diff<<2)|count;
903 }
904
905 /**
906 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
907 *
908 * @param count number of remaining trail bytes including this one
909 * @param b trail byte
910 * @return new delta for diff including b - <0 indicates an error
911 *
912 * @see decodeBocu1
913 */
914 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)915 decodeBocu1TrailByte(int32_t count, int32_t b) {
916 if(b<=0x20) {
917 /* skip some C0 controls and make the trail byte range contiguous */
918 b=bocu1ByteToTrail[b];
919 /* b<0 for an illegal trail byte value will result in return<0 below */
920 #if BOCU1_MAX_TRAIL<0xff
921 } else if(b>BOCU1_MAX_TRAIL) {
922 return -99;
923 #endif
924 } else {
925 b-=BOCU1_TRAIL_BYTE_OFFSET;
926 }
927
928 /* add trail byte into difference and decrement count */
929 if(count==1) {
930 return b;
931 } else if(count==2) {
932 return b*BOCU1_TRAIL_COUNT;
933 } else /* count==3 */ {
934 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
935 }
936 }
937
938 static void
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
940 UErrorCode *pErrorCode) {
941 UConverter *cnv;
942 const uint8_t *source, *sourceLimit;
943 UChar *target;
944 const UChar *targetLimit;
945 int32_t *offsets;
946
947 int32_t prev, count, diff, c;
948
949 int8_t byteIndex;
950 uint8_t *bytes;
951
952 int32_t sourceIndex, nextSourceIndex;
953
954 /* set up the local pointers */
955 cnv=pArgs->converter;
956 source=(const uint8_t *)pArgs->source;
957 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
958 target=pArgs->target;
959 targetLimit=pArgs->targetLimit;
960 offsets=pArgs->offsets;
961
962 /* get the converter state from UConverter */
963 prev=(int32_t)cnv->toUnicodeStatus;
964 if(prev==0) {
965 prev=BOCU1_ASCII_PREV;
966 }
967 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
968 count=diff&3;
969 diff>>=2;
970
971 byteIndex=cnv->toULength;
972 bytes=cnv->toUBytes;
973
974 /* sourceIndex=-1 if the current character began in the previous buffer */
975 sourceIndex=byteIndex==0 ? 0 : -1;
976 nextSourceIndex=0;
977
978 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
979 if(count>0 && byteIndex>0 && target<targetLimit) {
980 goto getTrail;
981 }
982
983 fastSingle:
984 /* fast loop for single-byte differences */
985 /* use count as the only loop counter variable */
986 diff=(int32_t)(sourceLimit-source);
987 count=(int32_t)(pArgs->targetLimit-target);
988 if(count>diff) {
989 count=diff;
990 }
991 while(count>0) {
992 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
993 c=prev+(c-BOCU1_MIDDLE);
994 if(c<0x3000) {
995 *target++=(UChar)c;
996 *offsets++=nextSourceIndex++;
997 prev=BOCU1_SIMPLE_PREV(c);
998 } else {
999 break;
1000 }
1001 } else if(c<=0x20) {
1002 if(c!=0x20) {
1003 prev=BOCU1_ASCII_PREV;
1004 }
1005 *target++=(UChar)c;
1006 *offsets++=nextSourceIndex++;
1007 } else {
1008 break;
1009 }
1010 ++source;
1011 --count;
1012 }
1013 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1014
1015 /* decode a sequence of single and lead bytes */
1016 while(source<sourceLimit) {
1017 if(target>=targetLimit) {
1018 /* target is full */
1019 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1020 break;
1021 }
1022
1023 ++nextSourceIndex;
1024 c=*source++;
1025 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1026 /* Write a code point directly from a single-byte difference. */
1027 c=prev+(c-BOCU1_MIDDLE);
1028 if(c<0x3000) {
1029 *target++=(UChar)c;
1030 *offsets++=sourceIndex;
1031 prev=BOCU1_SIMPLE_PREV(c);
1032 sourceIndex=nextSourceIndex;
1033 goto fastSingle;
1034 }
1035 } else if(c<=0x20) {
1036 /*
1037 * Direct-encoded C0 control code or space.
1038 * Reset prev for C0 control codes but not for space.
1039 */
1040 if(c!=0x20) {
1041 prev=BOCU1_ASCII_PREV;
1042 }
1043 *target++=(UChar)c;
1044 *offsets++=sourceIndex;
1045 sourceIndex=nextSourceIndex;
1046 continue;
1047 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1048 /* Optimize two-byte case. */
1049 if(c>=BOCU1_MIDDLE) {
1050 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1051 } else {
1052 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1053 }
1054
1055 /* trail byte */
1056 ++nextSourceIndex;
1057 c=decodeBocu1TrailByte(1, *source++);
1058 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1059 bytes[0]=source[-2];
1060 bytes[1]=source[-1];
1061 byteIndex=2;
1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063 break;
1064 }
1065 } else if(c==BOCU1_RESET) {
1066 /* only reset the state, no code point */
1067 prev=BOCU1_ASCII_PREV;
1068 sourceIndex=nextSourceIndex;
1069 continue;
1070 } else {
1071 /*
1072 * For multi-byte difference lead bytes, set the decoder state
1073 * with the partial difference value from the lead byte and
1074 * with the number of trail bytes.
1075 */
1076 bytes[0]=(uint8_t)c;
1077 byteIndex=1;
1078
1079 diff=decodeBocu1LeadByte(c);
1080 count=diff&3;
1081 diff>>=2;
1082 getTrail:
1083 for(;;) {
1084 if(source>=sourceLimit) {
1085 goto endloop;
1086 }
1087 ++nextSourceIndex;
1088 c=bytes[byteIndex++]=*source++;
1089
1090 /* trail byte in any position */
1091 c=decodeBocu1TrailByte(count, c);
1092 if(c<0) {
1093 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1094 goto endloop;
1095 }
1096
1097 diff+=c;
1098 if(--count==0) {
1099 /* final trail byte, deliver a code point */
1100 byteIndex=0;
1101 c=prev+diff;
1102 if((uint32_t)c>0x10ffff) {
1103 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1104 goto endloop;
1105 }
1106 break;
1107 }
1108 }
1109 }
1110
1111 /* calculate the next prev and output c */
1112 prev=BOCU1_PREV(c);
1113 if(c<=0xffff) {
1114 *target++=(UChar)c;
1115 *offsets++=sourceIndex;
1116 } else {
1117 /* output surrogate pair */
1118 *target++=U16_LEAD(c);
1119 if(target<targetLimit) {
1120 *target++=U16_TRAIL(c);
1121 *offsets++=sourceIndex;
1122 *offsets++=sourceIndex;
1123 } else {
1124 /* target overflow */
1125 *offsets++=sourceIndex;
1126 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1127 cnv->UCharErrorBufferLength=1;
1128 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1129 break;
1130 }
1131 }
1132 sourceIndex=nextSourceIndex;
1133 }
1134 endloop:
1135
1136 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1137 /* set the converter state in UConverter to deal with the next character */
1138 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1139 cnv->mode=0;
1140 } else {
1141 /* set the converter state back into UConverter */
1142 cnv->toUnicodeStatus=(uint32_t)prev;
1143 cnv->mode=(diff<<2)|count;
1144 }
1145 cnv->toULength=byteIndex;
1146
1147 /* write back the updated pointers */
1148 pArgs->source=(const char *)source;
1149 pArgs->target=target;
1150 pArgs->offsets=offsets;
1151 return;
1152 }
1153
1154 /*
1155 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1156 * If a change is made in the original function, then either
1157 * change this function the same way or
1158 * re-copy the original function and remove the variables
1159 * offsets, sourceIndex, and nextSourceIndex.
1160 */
1161 static void
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1163 UErrorCode *pErrorCode) {
1164 UConverter *cnv;
1165 const uint8_t *source, *sourceLimit;
1166 UChar *target;
1167 const UChar *targetLimit;
1168
1169 int32_t prev, count, diff, c;
1170
1171 int8_t byteIndex;
1172 uint8_t *bytes;
1173
1174 U_ALIGN_CODE(16)
1175
1176 /* set up the local pointers */
1177 cnv=pArgs->converter;
1178 source=(const uint8_t *)pArgs->source;
1179 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1180 target=pArgs->target;
1181 targetLimit=pArgs->targetLimit;
1182
1183 /* get the converter state from UConverter */
1184 prev=(int32_t)cnv->toUnicodeStatus;
1185 if(prev==0) {
1186 prev=BOCU1_ASCII_PREV;
1187 }
1188 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1189 count=diff&3;
1190 diff>>=2;
1191
1192 byteIndex=cnv->toULength;
1193 bytes=cnv->toUBytes;
1194
1195 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1196 if(count>0 && byteIndex>0 && target<targetLimit) {
1197 goto getTrail;
1198 }
1199
1200 fastSingle:
1201 /* fast loop for single-byte differences */
1202 /* use count as the only loop counter variable */
1203 diff=(int32_t)(sourceLimit-source);
1204 count=(int32_t)(pArgs->targetLimit-target);
1205 if(count>diff) {
1206 count=diff;
1207 }
1208 while(count>0) {
1209 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1210 c=prev+(c-BOCU1_MIDDLE);
1211 if(c<0x3000) {
1212 *target++=(UChar)c;
1213 prev=BOCU1_SIMPLE_PREV(c);
1214 } else {
1215 break;
1216 }
1217 } else if(c<=0x20) {
1218 if(c!=0x20) {
1219 prev=BOCU1_ASCII_PREV;
1220 }
1221 *target++=(UChar)c;
1222 } else {
1223 break;
1224 }
1225 ++source;
1226 --count;
1227 }
1228
1229 /* decode a sequence of single and lead bytes */
1230 while(source<sourceLimit) {
1231 if(target>=targetLimit) {
1232 /* target is full */
1233 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234 break;
1235 }
1236
1237 c=*source++;
1238 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1239 /* Write a code point directly from a single-byte difference. */
1240 c=prev+(c-BOCU1_MIDDLE);
1241 if(c<0x3000) {
1242 *target++=(UChar)c;
1243 prev=BOCU1_SIMPLE_PREV(c);
1244 goto fastSingle;
1245 }
1246 } else if(c<=0x20) {
1247 /*
1248 * Direct-encoded C0 control code or space.
1249 * Reset prev for C0 control codes but not for space.
1250 */
1251 if(c!=0x20) {
1252 prev=BOCU1_ASCII_PREV;
1253 }
1254 *target++=(UChar)c;
1255 continue;
1256 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1257 /* Optimize two-byte case. */
1258 if(c>=BOCU1_MIDDLE) {
1259 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1260 } else {
1261 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1262 }
1263
1264 /* trail byte */
1265 c=decodeBocu1TrailByte(1, *source++);
1266 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1267 bytes[0]=source[-2];
1268 bytes[1]=source[-1];
1269 byteIndex=2;
1270 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1271 break;
1272 }
1273 } else if(c==BOCU1_RESET) {
1274 /* only reset the state, no code point */
1275 prev=BOCU1_ASCII_PREV;
1276 continue;
1277 } else {
1278 /*
1279 * For multi-byte difference lead bytes, set the decoder state
1280 * with the partial difference value from the lead byte and
1281 * with the number of trail bytes.
1282 */
1283 bytes[0]=(uint8_t)c;
1284 byteIndex=1;
1285
1286 diff=decodeBocu1LeadByte(c);
1287 count=diff&3;
1288 diff>>=2;
1289 getTrail:
1290 for(;;) {
1291 if(source>=sourceLimit) {
1292 goto endloop;
1293 }
1294 c=bytes[byteIndex++]=*source++;
1295
1296 /* trail byte in any position */
1297 c=decodeBocu1TrailByte(count, c);
1298 if(c<0) {
1299 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1300 goto endloop;
1301 }
1302
1303 diff+=c;
1304 if(--count==0) {
1305 /* final trail byte, deliver a code point */
1306 byteIndex=0;
1307 c=prev+diff;
1308 if((uint32_t)c>0x10ffff) {
1309 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1310 goto endloop;
1311 }
1312 break;
1313 }
1314 }
1315 }
1316
1317 /* calculate the next prev and output c */
1318 prev=BOCU1_PREV(c);
1319 if(c<=0xffff) {
1320 *target++=(UChar)c;
1321 } else {
1322 /* output surrogate pair */
1323 *target++=U16_LEAD(c);
1324 if(target<targetLimit) {
1325 *target++=U16_TRAIL(c);
1326 } else {
1327 /* target overflow */
1328 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1329 cnv->UCharErrorBufferLength=1;
1330 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1331 break;
1332 }
1333 }
1334 }
1335 endloop:
1336
1337 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1338 /* set the converter state in UConverter to deal with the next character */
1339 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1340 cnv->mode=0;
1341 } else {
1342 /* set the converter state back into UConverter */
1343 cnv->toUnicodeStatus=(uint32_t)prev;
1344 cnv->mode=(diff<<2)|count;
1345 }
1346 cnv->toULength=byteIndex;
1347
1348 /* write back the updated pointers */
1349 pArgs->source=(const char *)source;
1350 pArgs->target=target;
1351 return;
1352 }
1353
1354 /* miscellaneous ------------------------------------------------------------ */
1355
/*
 * Implementation table for the BOCU-1 converter (type UCNV_BOCU1).
 * Only the four conversion functions defined in this file and
 * ucnv_getCompleteUnicodeSet are supplied; every other slot is NULL.
 * NOTE(review): the meaning of each positional slot comes from the
 * UConverterImpl declaration, which is not visible here - confirm against
 * the header before reordering or adding entries.
 */
1356 static const UConverterImpl _Bocu1Impl={
1357 UCNV_BOCU1,
1358
1359 NULL,
1360 NULL,
1361
1362 NULL,
1363 NULL,
1364 NULL,
1365
1366 _Bocu1ToUnicode, /* bytes to Unicode, no offsets */
1367 _Bocu1ToUnicodeWithOffsets, /* bytes to Unicode, with offsets */
1368 _Bocu1FromUnicode, /* presumably Unicode to bytes, no offsets - defined outside this view */
1369 _Bocu1FromUnicodeWithOffsets, /* presumably Unicode to bytes, with offsets - defined outside this view */
1370 NULL,
1371
1372 NULL,
1373 NULL,
1374 NULL,
1375 NULL,
1376 ucnv_getCompleteUnicodeSet,
1377
1378 NULL,
1379 NULL
1380 };
1381
/*
 * Static descriptive data for the BOCU-1 converter: structure size,
 * converter name "BOCU-1", CCSID 1214, platform/type codes, the
 * minimum/maximum bytes per UChar, and the substitution bytes.
 * NOTE(review): the unlabeled FALSE/0 entries map to fields of
 * UConverterStaticData that are not visible here - confirm in the header.
 */
1382 static const UConverterStaticData _Bocu1StaticData={
1383 sizeof(UConverterStaticData),
1384 "BOCU-1",
1385 1214, /* CCSID for BOCU-1 */
1386 UCNV_IBM, UCNV_BOCU1,
1387 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1388 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1389 FALSE, FALSE,
1390 0,
1391 0,
1392 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1393 };
1394
/*
 * Shared-data object for BOCU-1: ties together the static description
 * (_Bocu1StaticData) and the implementation table (_Bocu1Impl).
 * Unlike the two tables above it is not declared static.
 * NOTE(review): presumably referenced from the converter registry in
 * another file - confirm where it is declared extern.
 */
1395 const UConverterSharedData _Bocu1Data={
1396 sizeof(UConverterSharedData), ~((uint32_t)0), /* structure size, then an all-ones 32-bit value */
1397 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1398 0,
1399 UCNV_MBCS_TABLE_INITIALIZER
1400 };
1401
1402 #endif
1403