1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2002-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: ucnvbocu.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002mar27
16 * created by: Markus W. Scherer
17 *
18 * This is an implementation of the Binary Ordered Compression for Unicode,
19 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20 */
21
22 #include "unicode/utypes.h"
23
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
29 #include "putilimp.h"
30 #include "ucnv_bld.h"
31 #include "ucnv_cnv.h"
32 #include "uassert.h"
33
34 /* BOCU-1 constants and macros ---------------------------------------------- */
35
36 /*
37 * BOCU-1 encodes the code points of a Unicode string as
38 * a sequence of byte-encoded differences (slope detection),
39 * preserving lexical order.
40 *
41 * Optimize the difference-taking for runs of Unicode text within
42 * small scripts:
43 *
44 * Most small scripts are allocated within aligned 128-blocks of Unicode
45 * code points. Lexical order is preserved if the "previous code point" state
46 * is always moved into the middle of such a block.
47 *
48 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
49 * areas into the middle of those areas.
50 *
51 * C0 control codes and space are encoded with their US-ASCII bytes.
52 * "prev" is reset for C0 controls but not for space.
53 */
54
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/*
 * bounding byte values for differences
 *
 * BOCU1_MIN is the lead byte of the negative four-byte sequences,
 * BOCU1_MIDDLE is the single byte that encodes a difference of 0,
 * BOCU1_MAX_LEAD is the lead byte of the positive four-byte sequences,
 * and BOCU1_RESET is the byte that only resets "prev" in the decoder.
 */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 = 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values
 * (the offset 0x21-20 = 13 maps trail values >=20 to bytes >=0x21)
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 = 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/*
 * number of lead bytes for positive and negative 2/3/4-byte sequences
 * (2*64 single bytes + 2*(43+3+1) multi-byte lead bytes = 222 = BOCU1_COUNT)
 */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values for positive differences: 0xd0, 0xfb, 0xfe. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/*
 * The lead byte start values for negative differences: 0x50, 0x25, 0x22.
 * Each value is the lowest lead byte of the next-shorter encoding,
 * so lead bytes below BOCU1_START_NEG_n need at least n bytes.
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The nested ranges widen symmetrically around BOCU1_MIDDLE:
 * the innermost range is single-byte, anything outside all three is 4 bytes.
 * Note that "lead" is evaluated several times.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed 1..3-byte sequences carry their length in the top byte (0x01..0x03);
 * a 4-byte sequence has its lead byte there instead, which is always >0x03.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120
/*
 * Twelve commonly used C0 control codes, and space, never appear as
 * trail bytes; they only ever stand for themselves. This keeps BOCU-1
 * MIME-usable and reasonably safe for ASCII-oriented software.
 *
 * The twelve controls are
 *   00 NUL
 *   07 BEL   08 BS
 *   09 TAB   0a LF   0b VT   0c FF   0d CR
 *   0e SO    0f SI
 *   1a SUB   1b ESC
 *
 * The remaining 20 C0 controls are encoded directly as well (to preserve
 * order) but double as trail bytes in difference encoding (for better
 * compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value 0..BOCU1_TRAIL_COUNT-1 to its
 * external byte: the first BOCU1_TRAIL_CONTROLS_COUNT values go through the
 * bocu1TrailToByte[] table, all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
150
151 /*
152 * Byte value map for control codes,
153 * from external byte values 0x00..0x20
154 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155 * External byte values that are illegal as trail bytes are mapped to -1.
156 */
157 static const int8_t
158 bocu1ByteToTrail[BOCU1_MIN]={
159 /* 0 1 2 3 4 5 6 7 */
160 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
161
162 /* 8 9 a b c d e f */
163 -1, -1, -1, -1, -1, -1, -1, -1,
164
165 /* 10 11 12 13 14 15 16 17 */
166 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
167
168 /* 18 19 1a 1b 1c 1d 1e 1f */
169 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
170
171 /* 20 */
172 -1
173 };
174
175 /*
176 * Byte value map for control codes,
177 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
178 * to external byte values 0x00..0x20.
179 */
180 static const int8_t
181 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182 /* 0 1 2 3 4 5 6 7 */
183 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
184
185 /* 8 9 a b c d e f */
186 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187
188 /* 10 11 12 13 */
189 0x1c, 0x1d, 0x1e, 0x1f
190 };
191
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (for a positive divisor d), i.e. it computes a floor division.
 *
 * For non-negative n, the if() condition is always false.
 *
 * The expansion is a single statement (do { ... } while(0)), so the macro
 * can be used safely as the body of an unbraced if/else.
 * Each argument is evaluated more than once; pass plain variables/constants.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
213
/* Shortcuts that avoid calling packDiff() for the common short encodings. */

/** True if diff fits into the single-byte range BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The byte for a single-byte diff: an offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if diff lies within BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2 (two bytes suffice). */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)

/* BOCU-1 implementation functions ------------------------------------------ */

/* "prev" for small scripts: the middle (+0x40) of the 128-aligned block containing c. */
#define BOCU1_SIMPLE_PREV(c) (BOCU1_ASCII_PREV+((c)&~0x7f))
228
229 /**
230 * Compute the next "previous" value for differencing
231 * from the current code point.
232 *
233 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234 * @return "previous code point" state value
235 */
236 static inline int32_t
bocu1Prev(int32_t c)237 bocu1Prev(int32_t c) {
238 /* compute new prev */
239 if(/* 0x3040<=c && */ c<=0x309f) {
240 /* Hiragana is not 128-aligned */
241 return 0x3070;
242 } else if(0x4e00<=c && c<=0x9fa5) {
243 /* CJK Unihan */
244 return 0x4e00-BOCU1_REACH_NEG_2;
245 } else if(0xac00<=c /* && c<=0xd7a3 */) {
246 /* Korean Hangul */
247 return (0xd7a3+0xac00)/2;
248 } else {
249 /* mostly small scripts */
250 return BOCU1_SIMPLE_PREV(c);
251 }
252 }
253
/**
 * Next "prev" for any code point: only 0x3040..0xd7a3 needs the
 * bocu1Prev() function, everything else uses the 128-block formula inline.
 * The argument is evaluated more than once.
 */
#define BOCU1_PREV(c) ((c)>=0x3040 && (c)<=0xd7a3 ? bocu1Prev(c) : BOCU1_SIMPLE_PREV(c))
256
257 /*
258 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
259 * The UConverter fields are used as follows:
260 *
261 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 *
263 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
264 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265 */
266
267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
268
269 /**
270 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271 * and return a packed integer with them.
272 *
273 * The encoding favors small absolute differences with short encodings
274 * to compress runs of same-script characters.
275 *
276 * Optimized version with unrolled loops and fewer floating-point operations
277 * than the standard packDiff().
278 *
279 * @param diff difference value -0x10ffff..0x10ffff
280 * @return
281 * 0x010000zz for 1-byte sequence zz
282 * 0x0200yyzz for 2-byte sequence yy zz
283 * 0x03xxyyzz for 3-byte sequence xx yy zz
284 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 */
286 static int32_t
packDiff(int32_t diff)287 packDiff(int32_t diff) {
288 int32_t result, m;
289
290 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291 if(diff>=BOCU1_REACH_NEG_1) {
292 /* mostly positive differences, and single-byte negative ones */
293 #if 0 /* single-byte case handled in macros, see below */
294 if(diff<=BOCU1_REACH_POS_1) {
295 /* single byte */
296 return 0x01000000|(BOCU1_MIDDLE+diff);
297 } else
298 #endif
299 if(diff<=BOCU1_REACH_POS_2) {
300 /* two bytes */
301 diff-=BOCU1_REACH_POS_1+1;
302 result=0x02000000;
303
304 m=diff%BOCU1_TRAIL_COUNT;
305 diff/=BOCU1_TRAIL_COUNT;
306 result|=BOCU1_TRAIL_TO_BYTE(m);
307
308 result|=(BOCU1_START_POS_2+diff)<<8;
309 } else if(diff<=BOCU1_REACH_POS_3) {
310 /* three bytes */
311 diff-=BOCU1_REACH_POS_2+1;
312 result=0x03000000;
313
314 m=diff%BOCU1_TRAIL_COUNT;
315 diff/=BOCU1_TRAIL_COUNT;
316 result|=BOCU1_TRAIL_TO_BYTE(m);
317
318 m=diff%BOCU1_TRAIL_COUNT;
319 diff/=BOCU1_TRAIL_COUNT;
320 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
321
322 result|=(BOCU1_START_POS_3+diff)<<16;
323 } else {
324 /* four bytes */
325 diff-=BOCU1_REACH_POS_3+1;
326
327 m=diff%BOCU1_TRAIL_COUNT;
328 diff/=BOCU1_TRAIL_COUNT;
329 result=BOCU1_TRAIL_TO_BYTE(m);
330
331 m=diff%BOCU1_TRAIL_COUNT;
332 diff/=BOCU1_TRAIL_COUNT;
333 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
334
335 /*
336 * We know that / and % would deliver quotient 0 and rest=diff.
337 * Avoid division and modulo for performance.
338 */
339 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
340
341 result|=((uint32_t)BOCU1_START_POS_4)<<24;
342 }
343 } else {
344 /* two- to four-byte negative differences */
345 if(diff>=BOCU1_REACH_NEG_2) {
346 /* two bytes */
347 diff-=BOCU1_REACH_NEG_1;
348 result=0x02000000;
349
350 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351 result|=BOCU1_TRAIL_TO_BYTE(m);
352
353 result|=(BOCU1_START_NEG_2+diff)<<8;
354 } else if(diff>=BOCU1_REACH_NEG_3) {
355 /* three bytes */
356 diff-=BOCU1_REACH_NEG_2;
357 result=0x03000000;
358
359 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360 result|=BOCU1_TRAIL_TO_BYTE(m);
361
362 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
364
365 result|=(BOCU1_START_NEG_3+diff)<<16;
366 } else {
367 /* four bytes */
368 diff-=BOCU1_REACH_NEG_3;
369
370 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371 result=BOCU1_TRAIL_TO_BYTE(m);
372
373 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
375
376 /*
377 * We know that NEGDIVMOD would deliver
378 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379 * Avoid division and modulo for performance.
380 */
381 m=diff+BOCU1_TRAIL_COUNT;
382 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
383
384 result|=BOCU1_MIN<<24;
385 }
386 }
387 return result;
388 }
389
390
391 static void U_CALLCONV
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393 UErrorCode *pErrorCode) {
394 UConverter *cnv;
395 const UChar *source, *sourceLimit;
396 uint8_t *target;
397 int32_t targetCapacity;
398 int32_t *offsets;
399
400 int32_t prev, c, diff;
401
402 int32_t sourceIndex, nextSourceIndex;
403
404 /* set up the local pointers */
405 cnv=pArgs->converter;
406 source=pArgs->source;
407 sourceLimit=pArgs->sourceLimit;
408 target=(uint8_t *)pArgs->target;
409 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410 offsets=pArgs->offsets;
411
412 /* get the converter state from UConverter */
413 c=cnv->fromUChar32;
414 prev=(int32_t)cnv->fromUnicodeStatus;
415 if(prev==0) {
416 prev=BOCU1_ASCII_PREV;
417 }
418
419 /* sourceIndex=-1 if the current character began in the previous buffer */
420 sourceIndex= c==0 ? 0 : -1;
421 nextSourceIndex=0;
422
423 /* conversion loop */
424 if(c!=0 && targetCapacity>0) {
425 goto getTrail;
426 }
427
428 fastSingle:
429 /* fast loop for single-byte differences */
430 /* use only one loop counter variable, targetCapacity, not also source */
431 diff=(int32_t)(sourceLimit-source);
432 if(targetCapacity>diff) {
433 targetCapacity=diff;
434 }
435 while(targetCapacity>0 && (c=*source)<0x3000) {
436 if(c<=0x20) {
437 if(c!=0x20) {
438 prev=BOCU1_ASCII_PREV;
439 }
440 *target++=(uint8_t)c;
441 *offsets++=nextSourceIndex++;
442 ++source;
443 --targetCapacity;
444 } else {
445 diff=c-prev;
446 if(DIFF_IS_SINGLE(diff)) {
447 prev=BOCU1_SIMPLE_PREV(c);
448 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449 *offsets++=nextSourceIndex++;
450 ++source;
451 --targetCapacity;
452 } else {
453 break;
454 }
455 }
456 }
457 /* restore real values */
458 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460
461 /* regular loop for all cases */
462 while(source<sourceLimit) {
463 if(targetCapacity>0) {
464 c=*source++;
465 ++nextSourceIndex;
466
467 if(c<=0x20) {
468 /*
469 * ISO C0 control & space:
470 * Encode directly for MIME compatibility,
471 * and reset state except for space, to not disrupt compression.
472 */
473 if(c!=0x20) {
474 prev=BOCU1_ASCII_PREV;
475 }
476 *target++=(uint8_t)c;
477 *offsets++=sourceIndex;
478 --targetCapacity;
479
480 sourceIndex=nextSourceIndex;
481 continue;
482 }
483
484 if(U16_IS_LEAD(c)) {
485 getTrail:
486 if(source<sourceLimit) {
487 /* test the following code unit */
488 UChar trail=*source;
489 if(U16_IS_TRAIL(trail)) {
490 ++source;
491 ++nextSourceIndex;
492 c=U16_GET_SUPPLEMENTARY(c, trail);
493 }
494 } else {
495 /* no more input */
496 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497 break;
498 }
499 }
500
501 /*
502 * all other Unicode code points c==U+0021..U+10ffff
503 * are encoded with the difference c-prev
504 *
505 * a new prev is computed from c,
506 * placed in the middle of a 0x80-block (for most small scripts) or
507 * in the middle of the Unihan and Hangul blocks
508 * to statistically minimize the following difference
509 */
510 diff=c-prev;
511 prev=BOCU1_PREV(c);
512 if(DIFF_IS_SINGLE(diff)) {
513 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514 *offsets++=sourceIndex;
515 --targetCapacity;
516 sourceIndex=nextSourceIndex;
517 if(c<0x3000) {
518 goto fastSingle;
519 }
520 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521 /* optimize 2-byte case */
522 int32_t m;
523
524 if(diff>=0) {
525 diff-=BOCU1_REACH_POS_1+1;
526 m=diff%BOCU1_TRAIL_COUNT;
527 diff/=BOCU1_TRAIL_COUNT;
528 diff+=BOCU1_START_POS_2;
529 } else {
530 diff-=BOCU1_REACH_NEG_1;
531 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532 diff+=BOCU1_START_NEG_2;
533 }
534 *target++=(uint8_t)diff;
535 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536 *offsets++=sourceIndex;
537 *offsets++=sourceIndex;
538 targetCapacity-=2;
539 sourceIndex=nextSourceIndex;
540 } else {
541 int32_t length; /* will be 2..4 */
542
543 diff=packDiff(diff);
544 length=BOCU1_LENGTH_FROM_PACKED(diff);
545
546 /* write the output character bytes from diff and length */
547 /* from the first if in the loop we know that targetCapacity>0 */
548 if(length<=targetCapacity) {
549 switch(length) {
550 /* each branch falls through to the next one */
551 case 4:
552 *target++=(uint8_t)(diff>>24);
553 *offsets++=sourceIndex;
554 U_FALLTHROUGH;
555 case 3:
556 *target++=(uint8_t)(diff>>16);
557 *offsets++=sourceIndex;
558 U_FALLTHROUGH;
559 case 2:
560 *target++=(uint8_t)(diff>>8);
561 *offsets++=sourceIndex;
562 /* case 1: handled above */
563 *target++=(uint8_t)diff;
564 *offsets++=sourceIndex;
565 U_FALLTHROUGH;
566 default:
567 /* will never occur */
568 break;
569 }
570 targetCapacity-=length;
571 sourceIndex=nextSourceIndex;
572 } else {
573 uint8_t *charErrorBuffer;
574
575 /*
576 * We actually do this backwards here:
577 * In order to save an intermediate variable, we output
578 * first to the overflow buffer what does not fit into the
579 * regular target.
580 */
581 /* we know that 1<=targetCapacity<length<=4 */
582 length-=targetCapacity;
583 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584 switch(length) {
585 /* each branch falls through to the next one */
586 case 3:
587 *charErrorBuffer++=(uint8_t)(diff>>16);
588 U_FALLTHROUGH;
589 case 2:
590 *charErrorBuffer++=(uint8_t)(diff>>8);
591 U_FALLTHROUGH;
592 case 1:
593 *charErrorBuffer=(uint8_t)diff;
594 U_FALLTHROUGH;
595 default:
596 /* will never occur */
597 break;
598 }
599 cnv->charErrorBufferLength=(int8_t)length;
600
601 /* now output what fits into the regular target */
602 diff>>=8*length; /* length was reduced by targetCapacity */
603 switch(targetCapacity) {
604 /* each branch falls through to the next one */
605 case 3:
606 *target++=(uint8_t)(diff>>16);
607 *offsets++=sourceIndex;
608 U_FALLTHROUGH;
609 case 2:
610 *target++=(uint8_t)(diff>>8);
611 *offsets++=sourceIndex;
612 U_FALLTHROUGH;
613 case 1:
614 *target++=(uint8_t)diff;
615 *offsets++=sourceIndex;
616 U_FALLTHROUGH;
617 default:
618 /* will never occur */
619 break;
620 }
621
622 /* target overflow */
623 targetCapacity=0;
624 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625 break;
626 }
627 }
628 } else {
629 /* target is full */
630 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631 break;
632 }
633 }
634
635 /* set the converter state back into UConverter */
636 cnv->fromUChar32= c<0 ? -c : 0;
637 cnv->fromUnicodeStatus=(uint32_t)prev;
638
639 /* write back the updated pointers */
640 pArgs->source=source;
641 pArgs->target=(char *)target;
642 pArgs->offsets=offsets;
643 }
644
645 /*
646 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647 * If a change is made in the original function, then either
648 * change this function the same way or
649 * re-copy the original function and remove the variables
650 * offsets, sourceIndex, and nextSourceIndex.
651 */
652 static void U_CALLCONV
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)653 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654 UErrorCode *pErrorCode) {
655 UConverter *cnv;
656 const UChar *source, *sourceLimit;
657 uint8_t *target;
658 int32_t targetCapacity;
659
660 int32_t prev, c, diff;
661
662 /* set up the local pointers */
663 cnv=pArgs->converter;
664 source=pArgs->source;
665 sourceLimit=pArgs->sourceLimit;
666 target=(uint8_t *)pArgs->target;
667 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668
669 /* get the converter state from UConverter */
670 c=cnv->fromUChar32;
671 prev=(int32_t)cnv->fromUnicodeStatus;
672 if(prev==0) {
673 prev=BOCU1_ASCII_PREV;
674 }
675
676 /* conversion loop */
677 if(c!=0 && targetCapacity>0) {
678 goto getTrail;
679 }
680
681 fastSingle:
682 /* fast loop for single-byte differences */
683 /* use only one loop counter variable, targetCapacity, not also source */
684 diff=(int32_t)(sourceLimit-source);
685 if(targetCapacity>diff) {
686 targetCapacity=diff;
687 }
688 while(targetCapacity>0 && (c=*source)<0x3000) {
689 if(c<=0x20) {
690 if(c!=0x20) {
691 prev=BOCU1_ASCII_PREV;
692 }
693 *target++=(uint8_t)c;
694 } else {
695 diff=c-prev;
696 if(DIFF_IS_SINGLE(diff)) {
697 prev=BOCU1_SIMPLE_PREV(c);
698 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699 } else {
700 break;
701 }
702 }
703 ++source;
704 --targetCapacity;
705 }
706 /* restore real values */
707 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708
709 /* regular loop for all cases */
710 while(source<sourceLimit) {
711 if(targetCapacity>0) {
712 c=*source++;
713
714 if(c<=0x20) {
715 /*
716 * ISO C0 control & space:
717 * Encode directly for MIME compatibility,
718 * and reset state except for space, to not disrupt compression.
719 */
720 if(c!=0x20) {
721 prev=BOCU1_ASCII_PREV;
722 }
723 *target++=(uint8_t)c;
724 --targetCapacity;
725 continue;
726 }
727
728 if(U16_IS_LEAD(c)) {
729 getTrail:
730 if(source<sourceLimit) {
731 /* test the following code unit */
732 UChar trail=*source;
733 if(U16_IS_TRAIL(trail)) {
734 ++source;
735 c=U16_GET_SUPPLEMENTARY(c, trail);
736 }
737 } else {
738 /* no more input */
739 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
740 break;
741 }
742 }
743
744 /*
745 * all other Unicode code points c==U+0021..U+10ffff
746 * are encoded with the difference c-prev
747 *
748 * a new prev is computed from c,
749 * placed in the middle of a 0x80-block (for most small scripts) or
750 * in the middle of the Unihan and Hangul blocks
751 * to statistically minimize the following difference
752 */
753 diff=c-prev;
754 prev=BOCU1_PREV(c);
755 if(DIFF_IS_SINGLE(diff)) {
756 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757 --targetCapacity;
758 if(c<0x3000) {
759 goto fastSingle;
760 }
761 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
762 /* optimize 2-byte case */
763 int32_t m;
764
765 if(diff>=0) {
766 diff-=BOCU1_REACH_POS_1+1;
767 m=diff%BOCU1_TRAIL_COUNT;
768 diff/=BOCU1_TRAIL_COUNT;
769 diff+=BOCU1_START_POS_2;
770 } else {
771 diff-=BOCU1_REACH_NEG_1;
772 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773 diff+=BOCU1_START_NEG_2;
774 }
775 *target++=(uint8_t)diff;
776 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777 targetCapacity-=2;
778 } else {
779 int32_t length; /* will be 2..4 */
780
781 diff=packDiff(diff);
782 length=BOCU1_LENGTH_FROM_PACKED(diff);
783
784 /* write the output character bytes from diff and length */
785 /* from the first if in the loop we know that targetCapacity>0 */
786 if(length<=targetCapacity) {
787 switch(length) {
788 /* each branch falls through to the next one */
789 case 4:
790 *target++=(uint8_t)(diff>>24);
791 U_FALLTHROUGH;
792 case 3:
793 *target++=(uint8_t)(diff>>16);
794 /* case 2: handled above */
795 *target++=(uint8_t)(diff>>8);
796 /* case 1: handled above */
797 *target++=(uint8_t)diff;
798 U_FALLTHROUGH;
799 default:
800 /* will never occur */
801 break;
802 }
803 targetCapacity-=length;
804 } else {
805 uint8_t *charErrorBuffer;
806
807 /*
808 * We actually do this backwards here:
809 * In order to save an intermediate variable, we output
810 * first to the overflow buffer what does not fit into the
811 * regular target.
812 */
813 /* we know that 1<=targetCapacity<length<=4 */
814 length-=targetCapacity;
815 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816 switch(length) {
817 /* each branch falls through to the next one */
818 case 3:
819 *charErrorBuffer++=(uint8_t)(diff>>16);
820 U_FALLTHROUGH;
821 case 2:
822 *charErrorBuffer++=(uint8_t)(diff>>8);
823 U_FALLTHROUGH;
824 case 1:
825 *charErrorBuffer=(uint8_t)diff;
826 U_FALLTHROUGH;
827 default:
828 /* will never occur */
829 break;
830 }
831 cnv->charErrorBufferLength=(int8_t)length;
832
833 /* now output what fits into the regular target */
834 diff>>=8*length; /* length was reduced by targetCapacity */
835 switch(targetCapacity) {
836 /* each branch falls through to the next one */
837 case 3:
838 *target++=(uint8_t)(diff>>16);
839 U_FALLTHROUGH;
840 case 2:
841 *target++=(uint8_t)(diff>>8);
842 U_FALLTHROUGH;
843 case 1:
844 *target++=(uint8_t)diff;
845 U_FALLTHROUGH;
846 default:
847 /* will never occur */
848 break;
849 }
850
851 /* target overflow */
852 targetCapacity=0;
853 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854 break;
855 }
856 }
857 } else {
858 /* target is full */
859 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860 break;
861 }
862 }
863
864 /* set the converter state back into UConverter */
865 cnv->fromUChar32= c<0 ? -c : 0;
866 cnv->fromUnicodeStatus=(uint32_t)prev;
867
868 /* write back the updated pointers */
869 pArgs->source=source;
870 pArgs->target=(char *)target;
871 }
872
873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874
875 /**
876 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877 *
878 * @param b lead byte;
879 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880 * @return (diff<<2)|count
881 */
882 static inline int32_t
decodeBocu1LeadByte(int32_t b)883 decodeBocu1LeadByte(int32_t b) {
884 int32_t diff, count;
885
886 if(b>=BOCU1_START_NEG_2) {
887 /* positive difference */
888 if(b<BOCU1_START_POS_3) {
889 /* two bytes */
890 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
891 count=1;
892 } else if(b<BOCU1_START_POS_4) {
893 /* three bytes */
894 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
895 count=2;
896 } else {
897 /* four bytes */
898 diff=BOCU1_REACH_POS_3+1;
899 count=3;
900 }
901 } else {
902 /* negative difference */
903 if(b>=BOCU1_START_NEG_3) {
904 /* two bytes */
905 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906 count=1;
907 } else if(b>BOCU1_MIN) {
908 /* three bytes */
909 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910 count=2;
911 } else {
912 /* four bytes */
913 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914 count=3;
915 }
916 }
917
918 /* return the state for decoding the trail byte(s) */
919 return (diff<<2)|count;
920 }
921
922 /**
923 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
924 *
925 * @param count number of remaining trail bytes including this one
926 * @param b trail byte
927 * @return new delta for diff including b - <0 indicates an error
928 *
929 * @see decodeBocu1
930 */
931 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)932 decodeBocu1TrailByte(int32_t count, int32_t b) {
933 if(b<=0x20) {
934 /* skip some C0 controls and make the trail byte range contiguous */
935 b=bocu1ByteToTrail[b];
936 /* b<0 for an illegal trail byte value will result in return<0 below */
937 #if BOCU1_MAX_TRAIL<0xff
938 } else if(b>BOCU1_MAX_TRAIL) {
939 return -99;
940 #endif
941 } else {
942 b-=BOCU1_TRAIL_BYTE_OFFSET;
943 }
944
945 /* add trail byte into difference and decrement count */
946 if(count==1) {
947 return b;
948 } else if(count==2) {
949 return b*BOCU1_TRAIL_COUNT;
950 } else /* count==3 */ {
951 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
952 }
953 }
954
955 static void U_CALLCONV
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957 UErrorCode *pErrorCode) {
958 UConverter *cnv;
959 const uint8_t *source, *sourceLimit;
960 UChar *target;
961 const UChar *targetLimit;
962 int32_t *offsets;
963
964 int32_t prev, count, diff, c;
965
966 int8_t byteIndex;
967 uint8_t *bytes;
968
969 int32_t sourceIndex, nextSourceIndex;
970
971 /* set up the local pointers */
972 cnv=pArgs->converter;
973 source=(const uint8_t *)pArgs->source;
974 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975 target=pArgs->target;
976 targetLimit=pArgs->targetLimit;
977 offsets=pArgs->offsets;
978
979 /* get the converter state from UConverter */
980 prev=(int32_t)cnv->toUnicodeStatus;
981 if(prev==0) {
982 prev=BOCU1_ASCII_PREV;
983 }
984 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
985 count=diff&3;
986 diff>>=2;
987
988 byteIndex=cnv->toULength;
989 bytes=cnv->toUBytes;
990
991 /* sourceIndex=-1 if the current character began in the previous buffer */
992 sourceIndex=byteIndex==0 ? 0 : -1;
993 nextSourceIndex=0;
994
995 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996 if(count>0 && byteIndex>0 && target<targetLimit) {
997 goto getTrail;
998 }
999
1000 fastSingle:
1001 /* fast loop for single-byte differences */
1002 /* use count as the only loop counter variable */
1003 diff=(int32_t)(sourceLimit-source);
1004 count=(int32_t)(pArgs->targetLimit-target);
1005 if(count>diff) {
1006 count=diff;
1007 }
1008 while(count>0) {
1009 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010 c=prev+(c-BOCU1_MIDDLE);
1011 if(c<0x3000) {
1012 *target++=(UChar)c;
1013 *offsets++=nextSourceIndex++;
1014 prev=BOCU1_SIMPLE_PREV(c);
1015 } else {
1016 break;
1017 }
1018 } else if(c<=0x20) {
1019 if(c!=0x20) {
1020 prev=BOCU1_ASCII_PREV;
1021 }
1022 *target++=(UChar)c;
1023 *offsets++=nextSourceIndex++;
1024 } else {
1025 break;
1026 }
1027 ++source;
1028 --count;
1029 }
1030 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031
1032 /* decode a sequence of single and lead bytes */
1033 while(source<sourceLimit) {
1034 if(target>=targetLimit) {
1035 /* target is full */
1036 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037 break;
1038 }
1039
1040 ++nextSourceIndex;
1041 c=*source++;
1042 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043 /* Write a code point directly from a single-byte difference. */
1044 c=prev+(c-BOCU1_MIDDLE);
1045 if(c<0x3000) {
1046 *target++=(UChar)c;
1047 *offsets++=sourceIndex;
1048 prev=BOCU1_SIMPLE_PREV(c);
1049 sourceIndex=nextSourceIndex;
1050 goto fastSingle;
1051 }
1052 } else if(c<=0x20) {
1053 /*
1054 * Direct-encoded C0 control code or space.
1055 * Reset prev for C0 control codes but not for space.
1056 */
1057 if(c!=0x20) {
1058 prev=BOCU1_ASCII_PREV;
1059 }
1060 *target++=(UChar)c;
1061 *offsets++=sourceIndex;
1062 sourceIndex=nextSourceIndex;
1063 continue;
1064 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065 /* Optimize two-byte case. */
1066 if(c>=BOCU1_MIDDLE) {
1067 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068 } else {
1069 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070 }
1071
1072 /* trail byte */
1073 ++nextSourceIndex;
1074 c=decodeBocu1TrailByte(1, *source++);
1075 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076 bytes[0]=source[-2];
1077 bytes[1]=source[-1];
1078 byteIndex=2;
1079 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080 break;
1081 }
1082 } else if(c==BOCU1_RESET) {
1083 /* only reset the state, no code point */
1084 prev=BOCU1_ASCII_PREV;
1085 sourceIndex=nextSourceIndex;
1086 continue;
1087 } else {
1088 /*
1089 * For multi-byte difference lead bytes, set the decoder state
1090 * with the partial difference value from the lead byte and
1091 * with the number of trail bytes.
1092 */
1093 bytes[0]=(uint8_t)c;
1094 byteIndex=1;
1095
1096 diff=decodeBocu1LeadByte(c);
1097 count=diff&3;
1098 diff>>=2;
1099 getTrail:
1100 for(;;) {
1101 if(source>=sourceLimit) {
1102 goto endloop;
1103 }
1104 ++nextSourceIndex;
1105 c=bytes[byteIndex++]=*source++;
1106
1107 /* trail byte in any position */
1108 c=decodeBocu1TrailByte(count, c);
1109 if(c<0) {
1110 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111 goto endloop;
1112 }
1113
1114 diff+=c;
1115 if(--count==0) {
1116 /* final trail byte, deliver a code point */
1117 byteIndex=0;
1118 c=prev+diff;
1119 if((uint32_t)c>0x10ffff) {
1120 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121 goto endloop;
1122 }
1123 break;
1124 }
1125 }
1126 }
1127
1128 /* calculate the next prev and output c */
1129 prev=BOCU1_PREV(c);
1130 if(c<=0xffff) {
1131 *target++=(UChar)c;
1132 *offsets++=sourceIndex;
1133 } else {
1134 /* output surrogate pair */
1135 *target++=U16_LEAD(c);
1136 if(target<targetLimit) {
1137 *target++=U16_TRAIL(c);
1138 *offsets++=sourceIndex;
1139 *offsets++=sourceIndex;
1140 } else {
1141 /* target overflow */
1142 *offsets++=sourceIndex;
1143 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144 cnv->UCharErrorBufferLength=1;
1145 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146 break;
1147 }
1148 }
1149 sourceIndex=nextSourceIndex;
1150 }
1151 endloop:
1152
1153 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154 /* set the converter state in UConverter to deal with the next character */
1155 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156 cnv->mode=0;
1157 } else {
1158 /* set the converter state back into UConverter */
1159 cnv->toUnicodeStatus=(uint32_t)prev;
1160 cnv->mode=(diff<<2)|count;
1161 }
1162 cnv->toULength=byteIndex;
1163
1164 /* write back the updated pointers */
1165 pArgs->source=(const char *)source;
1166 pArgs->target=target;
1167 pArgs->offsets=offsets;
1168 return;
1169 }
1170
1171 /*
1172 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173 * If a change is made in the original function, then either
1174 * change this function the same way or
1175 * re-copy the original function and remove the variables
1176 * offsets, sourceIndex, and nextSourceIndex.
1177 */
1178 static void U_CALLCONV
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1179 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180 UErrorCode *pErrorCode) {
1181 UConverter *cnv;
1182 const uint8_t *source, *sourceLimit;
1183 UChar *target;
1184 const UChar *targetLimit;
1185
1186 int32_t prev, count, diff, c;
1187
1188 int8_t byteIndex;
1189 uint8_t *bytes;
1190
1191 /* set up the local pointers */
1192 cnv=pArgs->converter;
1193 source=(const uint8_t *)pArgs->source;
1194 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195 target=pArgs->target;
1196 targetLimit=pArgs->targetLimit;
1197
1198 /* get the converter state from UConverter */
1199 prev=(int32_t)cnv->toUnicodeStatus;
1200 if(prev==0) {
1201 prev=BOCU1_ASCII_PREV;
1202 }
1203 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204 count=diff&3;
1205 diff>>=2;
1206
1207 byteIndex=cnv->toULength;
1208 bytes=cnv->toUBytes;
1209
1210 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211 if(count>0 && byteIndex>0 && target<targetLimit) {
1212 goto getTrail;
1213 }
1214
1215 fastSingle:
1216 /* fast loop for single-byte differences */
1217 /* use count as the only loop counter variable */
1218 diff=(int32_t)(sourceLimit-source);
1219 count=(int32_t)(pArgs->targetLimit-target);
1220 if(count>diff) {
1221 count=diff;
1222 }
1223 while(count>0) {
1224 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225 c=prev+(c-BOCU1_MIDDLE);
1226 if(c<0x3000) {
1227 *target++=(UChar)c;
1228 prev=BOCU1_SIMPLE_PREV(c);
1229 } else {
1230 break;
1231 }
1232 } else if(c<=0x20) {
1233 if(c!=0x20) {
1234 prev=BOCU1_ASCII_PREV;
1235 }
1236 *target++=(UChar)c;
1237 } else {
1238 break;
1239 }
1240 ++source;
1241 --count;
1242 }
1243
1244 /* decode a sequence of single and lead bytes */
1245 while(source<sourceLimit) {
1246 if(target>=targetLimit) {
1247 /* target is full */
1248 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249 break;
1250 }
1251
1252 c=*source++;
1253 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254 /* Write a code point directly from a single-byte difference. */
1255 c=prev+(c-BOCU1_MIDDLE);
1256 if(c<0x3000) {
1257 *target++=(UChar)c;
1258 prev=BOCU1_SIMPLE_PREV(c);
1259 goto fastSingle;
1260 }
1261 } else if(c<=0x20) {
1262 /*
1263 * Direct-encoded C0 control code or space.
1264 * Reset prev for C0 control codes but not for space.
1265 */
1266 if(c!=0x20) {
1267 prev=BOCU1_ASCII_PREV;
1268 }
1269 *target++=(UChar)c;
1270 continue;
1271 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272 /* Optimize two-byte case. */
1273 if(c>=BOCU1_MIDDLE) {
1274 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1275 } else {
1276 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277 }
1278
1279 /* trail byte */
1280 c=decodeBocu1TrailByte(1, *source++);
1281 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1282 bytes[0]=source[-2];
1283 bytes[1]=source[-1];
1284 byteIndex=2;
1285 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286 break;
1287 }
1288 } else if(c==BOCU1_RESET) {
1289 /* only reset the state, no code point */
1290 prev=BOCU1_ASCII_PREV;
1291 continue;
1292 } else {
1293 /*
1294 * For multi-byte difference lead bytes, set the decoder state
1295 * with the partial difference value from the lead byte and
1296 * with the number of trail bytes.
1297 */
1298 bytes[0]=(uint8_t)c;
1299 byteIndex=1;
1300
1301 diff=decodeBocu1LeadByte(c);
1302 count=diff&3;
1303 diff>>=2;
1304 getTrail:
1305 for(;;) {
1306 if(source>=sourceLimit) {
1307 goto endloop;
1308 }
1309 c=bytes[byteIndex++]=*source++;
1310
1311 /* trail byte in any position */
1312 c=decodeBocu1TrailByte(count, c);
1313 if(c<0) {
1314 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315 goto endloop;
1316 }
1317
1318 diff+=c;
1319 if(--count==0) {
1320 /* final trail byte, deliver a code point */
1321 byteIndex=0;
1322 c=prev+diff;
1323 if((uint32_t)c>0x10ffff) {
1324 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325 goto endloop;
1326 }
1327 break;
1328 }
1329 }
1330 }
1331
1332 /* calculate the next prev and output c */
1333 prev=BOCU1_PREV(c);
1334 if(c<=0xffff) {
1335 *target++=(UChar)c;
1336 } else {
1337 /* output surrogate pair */
1338 *target++=U16_LEAD(c);
1339 if(target<targetLimit) {
1340 *target++=U16_TRAIL(c);
1341 } else {
1342 /* target overflow */
1343 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344 cnv->UCharErrorBufferLength=1;
1345 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346 break;
1347 }
1348 }
1349 }
1350 endloop:
1351
1352 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353 /* set the converter state in UConverter to deal with the next character */
1354 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355 cnv->mode=0;
1356 } else {
1357 /* set the converter state back into UConverter */
1358 cnv->toUnicodeStatus=(uint32_t)prev;
1359 cnv->mode=(diff<<2)|count;
1360 }
1361 cnv->toULength=byteIndex;
1362
1363 /* write back the updated pointers */
1364 pArgs->source=(const char *)source;
1365 pArgs->target=target;
1366 return;
1367 }
1368
1369 /* miscellaneous ------------------------------------------------------------ */
1370
1371 static const UConverterImpl _Bocu1Impl={
1372 UCNV_BOCU1,
1373
1374 NULL,
1375 NULL,
1376
1377 NULL,
1378 NULL,
1379 NULL,
1380
1381 _Bocu1ToUnicode,
1382 _Bocu1ToUnicodeWithOffsets,
1383 _Bocu1FromUnicode,
1384 _Bocu1FromUnicodeWithOffsets,
1385 NULL,
1386
1387 NULL,
1388 NULL,
1389 NULL,
1390 NULL,
1391 ucnv_getCompleteUnicodeSet,
1392
1393 NULL,
1394 NULL
1395 };
1396
1397 static const UConverterStaticData _Bocu1StaticData={
1398 sizeof(UConverterStaticData),
1399 "BOCU-1",
1400 1214, /* CCSID for BOCU-1 */
1401 UCNV_IBM, UCNV_BOCU1,
1402 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1403 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1404 FALSE, FALSE,
1405 0,
1406 0,
1407 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1408 };
1409
1410 const UConverterSharedData _Bocu1Data=
1411 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1412
1413 #endif
1414