• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2002-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ucnvbocu.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002mar27
14 *   created by: Markus W. Scherer
15 *
16 *   This is an implementation of the Binary Ordered Compression for Unicode,
17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION
23 
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
27 #include "putilimp.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "uassert.h"
31 
32 /* BOCU-1 constants and macros ---------------------------------------------- */
33 
34 /*
35  * BOCU-1 encodes the code points of a Unicode string as
36  * a sequence of byte-encoded differences (slope detection),
37  * preserving lexical order.
38  *
39  * Optimize the difference-taking for runs of Unicode text within
40  * small scripts:
41  *
42  * Most small scripts are allocated within aligned 128-blocks of Unicode
43  * code points. Lexical order is preserved if the "previous code point" state
44  * is always moved into the middle of such a block.
45  *
46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
47  * areas into the middle of those areas.
48  *
49  * C0 control codes and space are encoded with their US-ASCII bytes.
50  * "prev" is reset for C0 controls but not for space.
51  */
52 
/* "prev" starts out in the middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* byte values that bound the difference encoding */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* how many byte values can be lead bytes */
#define BOCU1_COUNT             (1+BOCU1_MAX_LEAD-BOCU1_MIN)

/*
 * Some C0 control byte values double as trail bytes;
 * the trail byte arithmetic is shifted accordingly.
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* how many byte values can be trail bytes */
#define BOCU1_TRAIL_COUNT       (BOCU1_MAX_TRAIL-BOCU1_TRAIL_BYTE_OFFSET+1)

/*
 * Count of single-byte codes on each side of the middle;
 * the zero difference (==BOCU1_MIDDLE) is counted with the positive ones.
 */
#define BOCU1_SINGLE            64

/* lead byte counts, per sign, for 2-, 3- and 4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* largest and smallest difference that fits into one byte */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* largest and smallest difference that fits into two bytes */
#define BOCU1_REACH_POS_2   (BOCU1_LEAD_2*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1)
#define BOCU1_REACH_NEG_2   (-(BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)+BOCU1_REACH_NEG_1)

/* largest and smallest difference that fits into three bytes */
#define BOCU1_REACH_POS_3   \
    (BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2)

#define BOCU1_REACH_NEG_3   \
    (-(BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)+BOCU1_REACH_NEG_2)

/* first lead byte of each positive multi-byte range */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_SINGLE)
#define BOCU1_START_POS_3   (BOCU1_LEAD_2+BOCU1_START_POS_2)
#define BOCU1_START_POS_4   (BOCU1_LEAD_3+BOCU1_START_POS_3)
     /* ==BOCU1_MAX_LEAD */

/* lowest lead byte of the single-byte range, then of each negative range */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE-BOCU1_SINGLE)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * Sequence length (1..4) implied by a lead byte (!=BOCU1_RESET).
 * The lead byte ranges are nested, so test from the outermost range inwards.
 * Note: evaluates its argument several times.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    (((lead)<BOCU1_START_NEG_4 || BOCU1_START_POS_4<=(lead)) ? 4 : \
     ((lead)<BOCU1_START_NEG_3 || BOCU1_START_POS_3<=(lead)) ? 3 : \
     ((lead)<BOCU1_START_NEG_2 || BOCU1_START_POS_2<=(lead)) ? 2 : 1)

/*
 * Sequence length (1..4) of a packed value as returned by packDiff():
 * the top byte holds the length for 1..3 bytes; anything larger is the
 * first byte of a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)>=0x04000000 ? 4 : (packed)>>24)
118 
119 /*
120  * 12 commonly used C0 control codes (and space) are only used to encode
121  * themselves directly,
122  * which makes BOCU-1 MIME-usable and reasonably safe for
123  * ASCII-oriented software.
124  *
125  * These controls are
126  *  0   NUL
127  *
128  *  7   BEL
129  *  8   BS
130  *
131  *  9   TAB
132  *  a   LF
133  *  b   VT
134  *  c   FF
135  *  d   CR
136  *
137  *  e   SO
138  *  f   SI
139  *
140  * 1a   SUB
141  * 1b   ESC
142  *
143  * The other 20 C0 controls are also encoded directly (to preserve order)
144  * but are also used as trail bytes in difference encoding
145  * (for better compression).
146  */
147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
148 
149 /*
150  * Byte value map for control codes,
151  * from external byte values 0x00..0x20
152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
153  * External byte values that are illegal as trail bytes are mapped to -1.
154  */
155 static const int8_t
156 bocu1ByteToTrail[BOCU1_MIN]={
157 /*  0     1     2     3     4     5     6     7    */
158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
159 
160 /*  8     9     a     b     c     d     e     f    */
161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
162 
163 /*  10    11    12    13    14    15    16    17   */
164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
165 
166 /*  18    19    1a    1b    1c    1d    1e    1f   */
167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
168 
169 /*  20   */
170     -1
171 };
172 
173 /*
174  * Byte value map for control codes,
175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
176  * to external byte values 0x00..0x20.
177  */
178 static const int8_t
179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
180 /*  0     1     2     3     4     5     6     7    */
181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
182 
183 /*  8     9     a     b     c     d     e     f    */
184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
185 
186 /*  10    11    12    13   */
187     0x1c, 0x1d, 0x1e, 0x1f
188 };
189 
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro expands to a
 * single statement and "NEGDIVMOD(...);" is safe in any statement context,
 * including an unbraced if/else.
 * Note: n, d and m are each evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
211 
/* Shortcuts that avoid packDiff() for the short encodings. */

/** True if the difference fits into the single-byte range. */
#define DIFF_IS_SINGLE(diff) ((diff)>=BOCU1_REACH_NEG_1 && (diff)<=BOCU1_REACH_POS_1)

/** The byte that encodes a single-byte difference. */
#define PACK_SINGLE_DIFF(diff) ((diff)+BOCU1_MIDDLE)

/** True if the difference lies within the two-byte reach. */
#define DIFF_IS_DOUBLE(diff) ((diff)>=BOCU1_REACH_NEG_2 && (diff)<=BOCU1_REACH_POS_2)
222 
223 /* BOCU-1 implementation functions ------------------------------------------ */
224 
225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
226 
227 /**
228  * Compute the next "previous" value for differencing
229  * from the current code point.
230  *
231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
232  * @return "previous code point" state value
233  */
234 static inline int32_t
bocu1Prev(int32_t c)235 bocu1Prev(int32_t c) {
236     /* compute new prev */
237     if(/* 0x3040<=c && */ c<=0x309f) {
238         /* Hiragana is not 128-aligned */
239         return 0x3070;
240     } else if(0x4e00<=c && c<=0x9fa5) {
241         /* CJK Unihan */
242         return 0x4e00-BOCU1_REACH_NEG_2;
243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
244         /* Korean Hangul */
245         return (0xd7a3+0xac00)/2;
246     } else {
247         /* mostly small scripts */
248         return BOCU1_SIMPLE_PREV(c);
249     }
250 }
251 
252 /** Fast version of bocu1Prev() for most scripts. */
253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
254 
255 /*
256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
257  * The UConverter fields are used as follows:
258  *
259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
260  *
261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
263  */
264 
265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
266 
267 /**
268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
269  * and return a packed integer with them.
270  *
271  * The encoding favors small absolute differences with short encodings
272  * to compress runs of same-script characters.
273  *
274  * Optimized version with unrolled loops and fewer floating-point operations
275  * than the standard packDiff().
276  *
277  * @param diff difference value -0x10ffff..0x10ffff
278  * @return
279  *      0x010000zz for 1-byte sequence zz
280  *      0x0200yyzz for 2-byte sequence yy zz
281  *      0x03xxyyzz for 3-byte sequence xx yy zz
282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283  */
284 static int32_t
packDiff(int32_t diff)285 packDiff(int32_t diff) {
286     int32_t result, m;
287 
288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
289     if(diff>=BOCU1_REACH_NEG_1) {
290         /* mostly positive differences, and single-byte negative ones */
291 #if 0   /* single-byte case handled in macros, see below */
292         if(diff<=BOCU1_REACH_POS_1) {
293             /* single byte */
294             return 0x01000000|(BOCU1_MIDDLE+diff);
295         } else
296 #endif
297         if(diff<=BOCU1_REACH_POS_2) {
298             /* two bytes */
299             diff-=BOCU1_REACH_POS_1+1;
300             result=0x02000000;
301 
302             m=diff%BOCU1_TRAIL_COUNT;
303             diff/=BOCU1_TRAIL_COUNT;
304             result|=BOCU1_TRAIL_TO_BYTE(m);
305 
306             result|=(BOCU1_START_POS_2+diff)<<8;
307         } else if(diff<=BOCU1_REACH_POS_3) {
308             /* three bytes */
309             diff-=BOCU1_REACH_POS_2+1;
310             result=0x03000000;
311 
312             m=diff%BOCU1_TRAIL_COUNT;
313             diff/=BOCU1_TRAIL_COUNT;
314             result|=BOCU1_TRAIL_TO_BYTE(m);
315 
316             m=diff%BOCU1_TRAIL_COUNT;
317             diff/=BOCU1_TRAIL_COUNT;
318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319 
320             result|=(BOCU1_START_POS_3+diff)<<16;
321         } else {
322             /* four bytes */
323             diff-=BOCU1_REACH_POS_3+1;
324 
325             m=diff%BOCU1_TRAIL_COUNT;
326             diff/=BOCU1_TRAIL_COUNT;
327             result=BOCU1_TRAIL_TO_BYTE(m);
328 
329             m=diff%BOCU1_TRAIL_COUNT;
330             diff/=BOCU1_TRAIL_COUNT;
331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
332 
333             /*
334              * We know that / and % would deliver quotient 0 and rest=diff.
335              * Avoid division and modulo for performance.
336              */
337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338 
339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
340         }
341     } else {
342         /* two- to four-byte negative differences */
343         if(diff>=BOCU1_REACH_NEG_2) {
344             /* two bytes */
345             diff-=BOCU1_REACH_NEG_1;
346             result=0x02000000;
347 
348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
349             result|=BOCU1_TRAIL_TO_BYTE(m);
350 
351             result|=(BOCU1_START_NEG_2+diff)<<8;
352         } else if(diff>=BOCU1_REACH_NEG_3) {
353             /* three bytes */
354             diff-=BOCU1_REACH_NEG_2;
355             result=0x03000000;
356 
357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
358             result|=BOCU1_TRAIL_TO_BYTE(m);
359 
360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362 
363             result|=(BOCU1_START_NEG_3+diff)<<16;
364         } else {
365             /* four bytes */
366             diff-=BOCU1_REACH_NEG_3;
367 
368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
369             result=BOCU1_TRAIL_TO_BYTE(m);
370 
371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
373 
374             /*
375              * We know that NEGDIVMOD would deliver
376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
377              * Avoid division and modulo for performance.
378              */
379             m=diff+BOCU1_TRAIL_COUNT;
380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381 
382             result|=BOCU1_MIN<<24;
383         }
384     }
385     return result;
386 }
387 
388 
389 static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
391                              UErrorCode *pErrorCode) {
392     UConverter *cnv;
393     const UChar *source, *sourceLimit;
394     uint8_t *target;
395     int32_t targetCapacity;
396     int32_t *offsets;
397 
398     int32_t prev, c, diff;
399 
400     int32_t sourceIndex, nextSourceIndex;
401 
402 U_ALIGN_CODE(16)
403 
404     /* set up the local pointers */
405     cnv=pArgs->converter;
406     source=pArgs->source;
407     sourceLimit=pArgs->sourceLimit;
408     target=(uint8_t *)pArgs->target;
409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410     offsets=pArgs->offsets;
411 
412     /* get the converter state from UConverter */
413     c=cnv->fromUChar32;
414     prev=(int32_t)cnv->fromUnicodeStatus;
415     if(prev==0) {
416         prev=BOCU1_ASCII_PREV;
417     }
418 
419     /* sourceIndex=-1 if the current character began in the previous buffer */
420     sourceIndex= c==0 ? 0 : -1;
421     nextSourceIndex=0;
422 
423     /* conversion loop */
424     if(c!=0 && targetCapacity>0) {
425         goto getTrail;
426     }
427 
428 fastSingle:
429     /* fast loop for single-byte differences */
430     /* use only one loop counter variable, targetCapacity, not also source */
431     diff=(int32_t)(sourceLimit-source);
432     if(targetCapacity>diff) {
433         targetCapacity=diff;
434     }
435     while(targetCapacity>0 && (c=*source)<0x3000) {
436         if(c<=0x20) {
437             if(c!=0x20) {
438                 prev=BOCU1_ASCII_PREV;
439             }
440             *target++=(uint8_t)c;
441             *offsets++=nextSourceIndex++;
442             ++source;
443             --targetCapacity;
444         } else {
445             diff=c-prev;
446             if(DIFF_IS_SINGLE(diff)) {
447                 prev=BOCU1_SIMPLE_PREV(c);
448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449                 *offsets++=nextSourceIndex++;
450                 ++source;
451                 --targetCapacity;
452             } else {
453                 break;
454             }
455         }
456     }
457     /* restore real values */
458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460 
461     /* regular loop for all cases */
462     while(source<sourceLimit) {
463         if(targetCapacity>0) {
464             c=*source++;
465             ++nextSourceIndex;
466 
467             if(c<=0x20) {
468                 /*
469                  * ISO C0 control & space:
470                  * Encode directly for MIME compatibility,
471                  * and reset state except for space, to not disrupt compression.
472                  */
473                 if(c!=0x20) {
474                     prev=BOCU1_ASCII_PREV;
475                 }
476                 *target++=(uint8_t)c;
477                 *offsets++=sourceIndex;
478                 --targetCapacity;
479 
480                 sourceIndex=nextSourceIndex;
481                 continue;
482             }
483 
484             if(U16_IS_LEAD(c)) {
485 getTrail:
486                 if(source<sourceLimit) {
487                     /* test the following code unit */
488                     UChar trail=*source;
489                     if(U16_IS_TRAIL(trail)) {
490                         ++source;
491                         ++nextSourceIndex;
492                         c=U16_GET_SUPPLEMENTARY(c, trail);
493                     }
494                 } else {
495                     /* no more input */
496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497                     break;
498                 }
499             }
500 
501             /*
502              * all other Unicode code points c==U+0021..U+10ffff
503              * are encoded with the difference c-prev
504              *
505              * a new prev is computed from c,
506              * placed in the middle of a 0x80-block (for most small scripts) or
507              * in the middle of the Unihan and Hangul blocks
508              * to statistically minimize the following difference
509              */
510             diff=c-prev;
511             prev=BOCU1_PREV(c);
512             if(DIFF_IS_SINGLE(diff)) {
513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514                 *offsets++=sourceIndex;
515                 --targetCapacity;
516                 sourceIndex=nextSourceIndex;
517                 if(c<0x3000) {
518                     goto fastSingle;
519                 }
520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521                 /* optimize 2-byte case */
522                 int32_t m;
523 
524                 if(diff>=0) {
525                     diff-=BOCU1_REACH_POS_1+1;
526                     m=diff%BOCU1_TRAIL_COUNT;
527                     diff/=BOCU1_TRAIL_COUNT;
528                     diff+=BOCU1_START_POS_2;
529                 } else {
530                     diff-=BOCU1_REACH_NEG_1;
531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532                     diff+=BOCU1_START_NEG_2;
533                 }
534                 *target++=(uint8_t)diff;
535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536                 *offsets++=sourceIndex;
537                 *offsets++=sourceIndex;
538                 targetCapacity-=2;
539                 sourceIndex=nextSourceIndex;
540             } else {
541                 int32_t length; /* will be 2..4 */
542 
543                 diff=packDiff(diff);
544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
545 
546                 /* write the output character bytes from diff and length */
547                 /* from the first if in the loop we know that targetCapacity>0 */
548                 if(length<=targetCapacity) {
549                     switch(length) {
550                         /* each branch falls through to the next one */
551                     case 4:
552                         *target++=(uint8_t)(diff>>24);
553                         *offsets++=sourceIndex;
554                     case 3: /*fall through*/
555                         *target++=(uint8_t)(diff>>16);
556                         *offsets++=sourceIndex;
557                     case 2: /*fall through*/
558                         *target++=(uint8_t)(diff>>8);
559                         *offsets++=sourceIndex;
560                     /* case 1: handled above */
561                         *target++=(uint8_t)diff;
562                         *offsets++=sourceIndex;
563                     default:
564                         /* will never occur */
565                         break;
566                     }
567                     targetCapacity-=length;
568                     sourceIndex=nextSourceIndex;
569                 } else {
570                     uint8_t *charErrorBuffer;
571 
572                     /*
573                      * We actually do this backwards here:
574                      * In order to save an intermediate variable, we output
575                      * first to the overflow buffer what does not fit into the
576                      * regular target.
577                      */
578                     /* we know that 1<=targetCapacity<length<=4 */
579                     length-=targetCapacity;
580                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
581                     switch(length) {
582                         /* each branch falls through to the next one */
583                     case 3:
584                         *charErrorBuffer++=(uint8_t)(diff>>16);
585                     case 2: /*fall through*/
586                         *charErrorBuffer++=(uint8_t)(diff>>8);
587                     case 1: /*fall through*/
588                         *charErrorBuffer=(uint8_t)diff;
589                     default:
590                         /* will never occur */
591                         break;
592                     }
593                     cnv->charErrorBufferLength=(int8_t)length;
594 
595                     /* now output what fits into the regular target */
596                     diff>>=8*length; /* length was reduced by targetCapacity */
597                     switch(targetCapacity) {
598                         /* each branch falls through to the next one */
599                     case 3:
600                         *target++=(uint8_t)(diff>>16);
601                         *offsets++=sourceIndex;
602                     case 2: /*fall through*/
603                         *target++=(uint8_t)(diff>>8);
604                         *offsets++=sourceIndex;
605                     case 1: /*fall through*/
606                         *target++=(uint8_t)diff;
607                         *offsets++=sourceIndex;
608                     default:
609                         /* will never occur */
610                         break;
611                     }
612 
613                     /* target overflow */
614                     targetCapacity=0;
615                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
616                     break;
617                 }
618             }
619         } else {
620             /* target is full */
621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
622             break;
623         }
624     }
625 
626     /* set the converter state back into UConverter */
627     cnv->fromUChar32= c<0 ? -c : 0;
628     cnv->fromUnicodeStatus=(uint32_t)prev;
629 
630     /* write back the updated pointers */
631     pArgs->source=source;
632     pArgs->target=(char *)target;
633     pArgs->offsets=offsets;
634 }
635 
636 /*
637  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
638  * If a change is made in the original function, then either
639  * change this function the same way or
640  * re-copy the original function and remove the variables
641  * offsets, sourceIndex, and nextSourceIndex.
642  */
643 static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
645                   UErrorCode *pErrorCode) {
646     UConverter *cnv;
647     const UChar *source, *sourceLimit;
648     uint8_t *target;
649     int32_t targetCapacity;
650 
651     int32_t prev, c, diff;
652 
653     /* set up the local pointers */
654     cnv=pArgs->converter;
655     source=pArgs->source;
656     sourceLimit=pArgs->sourceLimit;
657     target=(uint8_t *)pArgs->target;
658     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
659 
660     /* get the converter state from UConverter */
661     c=cnv->fromUChar32;
662     prev=(int32_t)cnv->fromUnicodeStatus;
663     if(prev==0) {
664         prev=BOCU1_ASCII_PREV;
665     }
666 
667     /* conversion loop */
668     if(c!=0 && targetCapacity>0) {
669         goto getTrail;
670     }
671 
672 fastSingle:
673     /* fast loop for single-byte differences */
674     /* use only one loop counter variable, targetCapacity, not also source */
675     diff=(int32_t)(sourceLimit-source);
676     if(targetCapacity>diff) {
677         targetCapacity=diff;
678     }
679     while(targetCapacity>0 && (c=*source)<0x3000) {
680         if(c<=0x20) {
681             if(c!=0x20) {
682                 prev=BOCU1_ASCII_PREV;
683             }
684             *target++=(uint8_t)c;
685         } else {
686             diff=c-prev;
687             if(DIFF_IS_SINGLE(diff)) {
688                 prev=BOCU1_SIMPLE_PREV(c);
689                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
690             } else {
691                 break;
692             }
693         }
694         ++source;
695         --targetCapacity;
696     }
697     /* restore real values */
698     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
699 
700     /* regular loop for all cases */
701     while(source<sourceLimit) {
702         if(targetCapacity>0) {
703             c=*source++;
704 
705             if(c<=0x20) {
706                 /*
707                  * ISO C0 control & space:
708                  * Encode directly for MIME compatibility,
709                  * and reset state except for space, to not disrupt compression.
710                  */
711                 if(c!=0x20) {
712                     prev=BOCU1_ASCII_PREV;
713                 }
714                 *target++=(uint8_t)c;
715                 --targetCapacity;
716                 continue;
717             }
718 
719             if(U16_IS_LEAD(c)) {
720 getTrail:
721                 if(source<sourceLimit) {
722                     /* test the following code unit */
723                     UChar trail=*source;
724                     if(U16_IS_TRAIL(trail)) {
725                         ++source;
726                         c=U16_GET_SUPPLEMENTARY(c, trail);
727                     }
728                 } else {
729                     /* no more input */
730                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
731                     break;
732                 }
733             }
734 
735             /*
736              * all other Unicode code points c==U+0021..U+10ffff
737              * are encoded with the difference c-prev
738              *
739              * a new prev is computed from c,
740              * placed in the middle of a 0x80-block (for most small scripts) or
741              * in the middle of the Unihan and Hangul blocks
742              * to statistically minimize the following difference
743              */
744             diff=c-prev;
745             prev=BOCU1_PREV(c);
746             if(DIFF_IS_SINGLE(diff)) {
747                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
748                 --targetCapacity;
749                 if(c<0x3000) {
750                     goto fastSingle;
751                 }
752             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
753                 /* optimize 2-byte case */
754                 int32_t m;
755 
756                 if(diff>=0) {
757                     diff-=BOCU1_REACH_POS_1+1;
758                     m=diff%BOCU1_TRAIL_COUNT;
759                     diff/=BOCU1_TRAIL_COUNT;
760                     diff+=BOCU1_START_POS_2;
761                 } else {
762                     diff-=BOCU1_REACH_NEG_1;
763                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
764                     diff+=BOCU1_START_NEG_2;
765                 }
766                 *target++=(uint8_t)diff;
767                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
768                 targetCapacity-=2;
769             } else {
770                 int32_t length; /* will be 2..4 */
771 
772                 diff=packDiff(diff);
773                 length=BOCU1_LENGTH_FROM_PACKED(diff);
774 
775                 /* write the output character bytes from diff and length */
776                 /* from the first if in the loop we know that targetCapacity>0 */
777                 if(length<=targetCapacity) {
778                     switch(length) {
779                         /* each branch falls through to the next one */
780                     case 4:
781                         *target++=(uint8_t)(diff>>24);
782                     case 3: /*fall through*/
783                         *target++=(uint8_t)(diff>>16);
784                     /* case 2: handled above */
785                         *target++=(uint8_t)(diff>>8);
786                     /* case 1: handled above */
787                         *target++=(uint8_t)diff;
788                     default:
789                         /* will never occur */
790                         break;
791                     }
792                     targetCapacity-=length;
793                 } else {
794                     uint8_t *charErrorBuffer;
795 
796                     /*
797                      * We actually do this backwards here:
798                      * In order to save an intermediate variable, we output
799                      * first to the overflow buffer what does not fit into the
800                      * regular target.
801                      */
802                     /* we know that 1<=targetCapacity<length<=4 */
803                     length-=targetCapacity;
804                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
805                     switch(length) {
806                         /* each branch falls through to the next one */
807                     case 3:
808                         *charErrorBuffer++=(uint8_t)(diff>>16);
809                     case 2: /*fall through*/
810                         *charErrorBuffer++=(uint8_t)(diff>>8);
811                     case 1: /*fall through*/
812                         *charErrorBuffer=(uint8_t)diff;
813                     default:
814                         /* will never occur */
815                         break;
816                     }
817                     cnv->charErrorBufferLength=(int8_t)length;
818 
819                     /* now output what fits into the regular target */
820                     diff>>=8*length; /* length was reduced by targetCapacity */
821                     switch(targetCapacity) {
822                         /* each branch falls through to the next one */
823                     case 3:
824                         *target++=(uint8_t)(diff>>16);
825                     case 2: /*fall through*/
826                         *target++=(uint8_t)(diff>>8);
827                     case 1: /*fall through*/
828                         *target++=(uint8_t)diff;
829                     default:
830                         /* will never occur */
831                         break;
832                     }
833 
834                     /* target overflow */
835                     targetCapacity=0;
836                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
837                     break;
838                 }
839             }
840         } else {
841             /* target is full */
842             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
843             break;
844         }
845     }
846 
847     /* set the converter state back into UConverter */
848     cnv->fromUChar32= c<0 ? -c : 0;
849     cnv->fromUnicodeStatus=(uint32_t)prev;
850 
851     /* write back the updated pointers */
852     pArgs->source=source;
853     pArgs->target=(char *)target;
854 }
855 
856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
857 
858 /**
859  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
860  *
861  * @param b lead byte;
862  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
863  * @return (diff<<2)|count
864  */
865 static inline int32_t
decodeBocu1LeadByte(int32_t b)866 decodeBocu1LeadByte(int32_t b) {
867     int32_t diff, count;
868 
869     if(b>=BOCU1_START_NEG_2) {
870         /* positive difference */
871         if(b<BOCU1_START_POS_3) {
872             /* two bytes */
873             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
874             count=1;
875         } else if(b<BOCU1_START_POS_4) {
876             /* three bytes */
877             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
878             count=2;
879         } else {
880             /* four bytes */
881             diff=BOCU1_REACH_POS_3+1;
882             count=3;
883         }
884     } else {
885         /* negative difference */
886         if(b>=BOCU1_START_NEG_3) {
887             /* two bytes */
888             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
889             count=1;
890         } else if(b>BOCU1_MIN) {
891             /* three bytes */
892             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
893             count=2;
894         } else {
895             /* four bytes */
896             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
897             count=3;
898         }
899     }
900 
901     /* return the state for decoding the trail byte(s) */
902     return (diff<<2)|count;
903 }
904 
905 /**
906  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
907  *
908  * @param count number of remaining trail bytes including this one
909  * @param b trail byte
910  * @return new delta for diff including b - <0 indicates an error
911  *
912  * @see decodeBocu1
913  */
914 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)915 decodeBocu1TrailByte(int32_t count, int32_t b) {
916     if(b<=0x20) {
917         /* skip some C0 controls and make the trail byte range contiguous */
918         b=bocu1ByteToTrail[b];
919         /* b<0 for an illegal trail byte value will result in return<0 below */
920 #if BOCU1_MAX_TRAIL<0xff
921     } else if(b>BOCU1_MAX_TRAIL) {
922         return -99;
923 #endif
924     } else {
925         b-=BOCU1_TRAIL_BYTE_OFFSET;
926     }
927 
928     /* add trail byte into difference and decrement count */
929     if(count==1) {
930         return b;
931     } else if(count==2) {
932         return b*BOCU1_TRAIL_COUNT;
933     } else /* count==3 */ {
934         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
935     }
936 }
937 
938 static void
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
940                            UErrorCode *pErrorCode) {
941     UConverter *cnv;
942     const uint8_t *source, *sourceLimit;
943     UChar *target;
944     const UChar *targetLimit;
945     int32_t *offsets;
946 
947     int32_t prev, count, diff, c;
948 
949     int8_t byteIndex;
950     uint8_t *bytes;
951 
952     int32_t sourceIndex, nextSourceIndex;
953 
954     /* set up the local pointers */
955     cnv=pArgs->converter;
956     source=(const uint8_t *)pArgs->source;
957     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
958     target=pArgs->target;
959     targetLimit=pArgs->targetLimit;
960     offsets=pArgs->offsets;
961 
962     /* get the converter state from UConverter */
963     prev=(int32_t)cnv->toUnicodeStatus;
964     if(prev==0) {
965         prev=BOCU1_ASCII_PREV;
966     }
967     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
968     count=diff&3;
969     diff>>=2;
970 
971     byteIndex=cnv->toULength;
972     bytes=cnv->toUBytes;
973 
974     /* sourceIndex=-1 if the current character began in the previous buffer */
975     sourceIndex=byteIndex==0 ? 0 : -1;
976     nextSourceIndex=0;
977 
978     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
979     if(count>0 && byteIndex>0 && target<targetLimit) {
980         goto getTrail;
981     }
982 
983 fastSingle:
984     /* fast loop for single-byte differences */
985     /* use count as the only loop counter variable */
986     diff=(int32_t)(sourceLimit-source);
987     count=(int32_t)(pArgs->targetLimit-target);
988     if(count>diff) {
989         count=diff;
990     }
991     while(count>0) {
992         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
993             c=prev+(c-BOCU1_MIDDLE);
994             if(c<0x3000) {
995                 *target++=(UChar)c;
996                 *offsets++=nextSourceIndex++;
997                 prev=BOCU1_SIMPLE_PREV(c);
998             } else {
999                 break;
1000             }
1001         } else if(c<=0x20) {
1002             if(c!=0x20) {
1003                 prev=BOCU1_ASCII_PREV;
1004             }
1005             *target++=(UChar)c;
1006             *offsets++=nextSourceIndex++;
1007         } else {
1008             break;
1009         }
1010         ++source;
1011         --count;
1012     }
1013     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1014 
1015     /* decode a sequence of single and lead bytes */
1016     while(source<sourceLimit) {
1017         if(target>=targetLimit) {
1018             /* target is full */
1019             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1020             break;
1021         }
1022 
1023         ++nextSourceIndex;
1024         c=*source++;
1025         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1026             /* Write a code point directly from a single-byte difference. */
1027             c=prev+(c-BOCU1_MIDDLE);
1028             if(c<0x3000) {
1029                 *target++=(UChar)c;
1030                 *offsets++=sourceIndex;
1031                 prev=BOCU1_SIMPLE_PREV(c);
1032                 sourceIndex=nextSourceIndex;
1033                 goto fastSingle;
1034             }
1035         } else if(c<=0x20) {
1036             /*
1037              * Direct-encoded C0 control code or space.
1038              * Reset prev for C0 control codes but not for space.
1039              */
1040             if(c!=0x20) {
1041                 prev=BOCU1_ASCII_PREV;
1042             }
1043             *target++=(UChar)c;
1044             *offsets++=sourceIndex;
1045             sourceIndex=nextSourceIndex;
1046             continue;
1047         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1048             /* Optimize two-byte case. */
1049             if(c>=BOCU1_MIDDLE) {
1050                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1051             } else {
1052                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1053             }
1054 
1055             /* trail byte */
1056             ++nextSourceIndex;
1057             c=decodeBocu1TrailByte(1, *source++);
1058             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1059                 bytes[0]=source[-2];
1060                 bytes[1]=source[-1];
1061                 byteIndex=2;
1062                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1063                 break;
1064             }
1065         } else if(c==BOCU1_RESET) {
1066             /* only reset the state, no code point */
1067             prev=BOCU1_ASCII_PREV;
1068             sourceIndex=nextSourceIndex;
1069             continue;
1070         } else {
1071             /*
1072              * For multi-byte difference lead bytes, set the decoder state
1073              * with the partial difference value from the lead byte and
1074              * with the number of trail bytes.
1075              */
1076             bytes[0]=(uint8_t)c;
1077             byteIndex=1;
1078 
1079             diff=decodeBocu1LeadByte(c);
1080             count=diff&3;
1081             diff>>=2;
1082 getTrail:
1083             for(;;) {
1084                 if(source>=sourceLimit) {
1085                     goto endloop;
1086                 }
1087                 ++nextSourceIndex;
1088                 c=bytes[byteIndex++]=*source++;
1089 
1090                 /* trail byte in any position */
1091                 c=decodeBocu1TrailByte(count, c);
1092                 if(c<0) {
1093                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1094                     goto endloop;
1095                 }
1096 
1097                 diff+=c;
1098                 if(--count==0) {
1099                     /* final trail byte, deliver a code point */
1100                     byteIndex=0;
1101                     c=prev+diff;
1102                     if((uint32_t)c>0x10ffff) {
1103                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1104                         goto endloop;
1105                     }
1106                     break;
1107                 }
1108             }
1109         }
1110 
1111         /* calculate the next prev and output c */
1112         prev=BOCU1_PREV(c);
1113         if(c<=0xffff) {
1114             *target++=(UChar)c;
1115             *offsets++=sourceIndex;
1116         } else {
1117             /* output surrogate pair */
1118             *target++=U16_LEAD(c);
1119             if(target<targetLimit) {
1120                 *target++=U16_TRAIL(c);
1121                 *offsets++=sourceIndex;
1122                 *offsets++=sourceIndex;
1123             } else {
1124                 /* target overflow */
1125                 *offsets++=sourceIndex;
1126                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1127                 cnv->UCharErrorBufferLength=1;
1128                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1129                 break;
1130             }
1131         }
1132         sourceIndex=nextSourceIndex;
1133     }
1134 endloop:
1135 
1136     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1137         /* set the converter state in UConverter to deal with the next character */
1138         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1139         cnv->mode=0;
1140     } else {
1141         /* set the converter state back into UConverter */
1142         cnv->toUnicodeStatus=(uint32_t)prev;
1143         cnv->mode=(diff<<2)|count;
1144     }
1145     cnv->toULength=byteIndex;
1146 
1147     /* write back the updated pointers */
1148     pArgs->source=(const char *)source;
1149     pArgs->target=target;
1150     pArgs->offsets=offsets;
1151     return;
1152 }
1153 
1154 /*
1155  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1156  * If a change is made in the original function, then either
1157  * change this function the same way or
1158  * re-copy the original function and remove the variables
1159  * offsets, sourceIndex, and nextSourceIndex.
1160  */
1161 static void
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1163                 UErrorCode *pErrorCode) {
1164     UConverter *cnv;
1165     const uint8_t *source, *sourceLimit;
1166     UChar *target;
1167     const UChar *targetLimit;
1168 
1169     int32_t prev, count, diff, c;
1170 
1171     int8_t byteIndex;
1172     uint8_t *bytes;
1173 
1174 U_ALIGN_CODE(16)
1175 
1176     /* set up the local pointers */
1177     cnv=pArgs->converter;
1178     source=(const uint8_t *)pArgs->source;
1179     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1180     target=pArgs->target;
1181     targetLimit=pArgs->targetLimit;
1182 
1183     /* get the converter state from UConverter */
1184     prev=(int32_t)cnv->toUnicodeStatus;
1185     if(prev==0) {
1186         prev=BOCU1_ASCII_PREV;
1187     }
1188     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1189     count=diff&3;
1190     diff>>=2;
1191 
1192     byteIndex=cnv->toULength;
1193     bytes=cnv->toUBytes;
1194 
1195     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1196     if(count>0 && byteIndex>0 && target<targetLimit) {
1197         goto getTrail;
1198     }
1199 
1200 fastSingle:
1201     /* fast loop for single-byte differences */
1202     /* use count as the only loop counter variable */
1203     diff=(int32_t)(sourceLimit-source);
1204     count=(int32_t)(pArgs->targetLimit-target);
1205     if(count>diff) {
1206         count=diff;
1207     }
1208     while(count>0) {
1209         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1210             c=prev+(c-BOCU1_MIDDLE);
1211             if(c<0x3000) {
1212                 *target++=(UChar)c;
1213                 prev=BOCU1_SIMPLE_PREV(c);
1214             } else {
1215                 break;
1216             }
1217         } else if(c<=0x20) {
1218             if(c!=0x20) {
1219                 prev=BOCU1_ASCII_PREV;
1220             }
1221             *target++=(UChar)c;
1222         } else {
1223             break;
1224         }
1225         ++source;
1226         --count;
1227     }
1228 
1229     /* decode a sequence of single and lead bytes */
1230     while(source<sourceLimit) {
1231         if(target>=targetLimit) {
1232             /* target is full */
1233             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234             break;
1235         }
1236 
1237         c=*source++;
1238         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1239             /* Write a code point directly from a single-byte difference. */
1240             c=prev+(c-BOCU1_MIDDLE);
1241             if(c<0x3000) {
1242                 *target++=(UChar)c;
1243                 prev=BOCU1_SIMPLE_PREV(c);
1244                 goto fastSingle;
1245             }
1246         } else if(c<=0x20) {
1247             /*
1248              * Direct-encoded C0 control code or space.
1249              * Reset prev for C0 control codes but not for space.
1250              */
1251             if(c!=0x20) {
1252                 prev=BOCU1_ASCII_PREV;
1253             }
1254             *target++=(UChar)c;
1255             continue;
1256         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1257             /* Optimize two-byte case. */
1258             if(c>=BOCU1_MIDDLE) {
1259                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1260             } else {
1261                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1262             }
1263 
1264             /* trail byte */
1265             c=decodeBocu1TrailByte(1, *source++);
1266             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1267                 bytes[0]=source[-2];
1268                 bytes[1]=source[-1];
1269                 byteIndex=2;
1270                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1271                 break;
1272             }
1273         } else if(c==BOCU1_RESET) {
1274             /* only reset the state, no code point */
1275             prev=BOCU1_ASCII_PREV;
1276             continue;
1277         } else {
1278             /*
1279              * For multi-byte difference lead bytes, set the decoder state
1280              * with the partial difference value from the lead byte and
1281              * with the number of trail bytes.
1282              */
1283             bytes[0]=(uint8_t)c;
1284             byteIndex=1;
1285 
1286             diff=decodeBocu1LeadByte(c);
1287             count=diff&3;
1288             diff>>=2;
1289 getTrail:
1290             for(;;) {
1291                 if(source>=sourceLimit) {
1292                     goto endloop;
1293                 }
1294                 c=bytes[byteIndex++]=*source++;
1295 
1296                 /* trail byte in any position */
1297                 c=decodeBocu1TrailByte(count, c);
1298                 if(c<0) {
1299                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1300                     goto endloop;
1301                 }
1302 
1303                 diff+=c;
1304                 if(--count==0) {
1305                     /* final trail byte, deliver a code point */
1306                     byteIndex=0;
1307                     c=prev+diff;
1308                     if((uint32_t)c>0x10ffff) {
1309                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1310                         goto endloop;
1311                     }
1312                     break;
1313                 }
1314             }
1315         }
1316 
1317         /* calculate the next prev and output c */
1318         prev=BOCU1_PREV(c);
1319         if(c<=0xffff) {
1320             *target++=(UChar)c;
1321         } else {
1322             /* output surrogate pair */
1323             *target++=U16_LEAD(c);
1324             if(target<targetLimit) {
1325                 *target++=U16_TRAIL(c);
1326             } else {
1327                 /* target overflow */
1328                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1329                 cnv->UCharErrorBufferLength=1;
1330                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1331                 break;
1332             }
1333         }
1334     }
1335 endloop:
1336 
1337     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1338         /* set the converter state in UConverter to deal with the next character */
1339         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1340         cnv->mode=0;
1341     } else {
1342         /* set the converter state back into UConverter */
1343         cnv->toUnicodeStatus=(uint32_t)prev;
1344         cnv->mode=(diff<<2)|count;
1345     }
1346     cnv->toULength=byteIndex;
1347 
1348     /* write back the updated pointers */
1349     pArgs->source=(const char *)source;
1350     pArgs->target=target;
1351     return;
1352 }
1353 
1354 /* miscellaneous ------------------------------------------------------------ */
1355 
1356 static const UConverterImpl _Bocu1Impl={
1357     UCNV_BOCU1,
1358 
1359     NULL,
1360     NULL,
1361 
1362     NULL,
1363     NULL,
1364     NULL,
1365 
1366     _Bocu1ToUnicode,
1367     _Bocu1ToUnicodeWithOffsets,
1368     _Bocu1FromUnicode,
1369     _Bocu1FromUnicodeWithOffsets,
1370     NULL,
1371 
1372     NULL,
1373     NULL,
1374     NULL,
1375     NULL,
1376     ucnv_getCompleteUnicodeSet,
1377 
1378     NULL,
1379     NULL
1380 };
1381 
1382 static const UConverterStaticData _Bocu1StaticData={
1383     sizeof(UConverterStaticData),
1384     "BOCU-1",
1385     1214, /* CCSID for BOCU-1 */
1386     UCNV_IBM, UCNV_BOCU1,
1387     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1388     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1389     FALSE, FALSE,
1390     0,
1391     0,
1392     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1393 };
1394 
1395 const UConverterSharedData _Bocu1Data={
1396     sizeof(UConverterSharedData), ~((uint32_t)0),
1397     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1398     0,
1399     UCNV_MBCS_TABLE_INITIALIZER
1400 };
1401 
1402 #endif
1403