• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2002-2005, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ucnvbocu.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002mar27
14 *   created by: Markus W. Scherer
15 *
16 *   This is an implementation of the Binary Ordered Compression for Unicode,
17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION
23 
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_bld.h"
27 #include "ucnv_cnv.h"
28 
29 /* BOCU-1 constants and macros ---------------------------------------------- */
30 
31 /*
32  * BOCU-1 encodes the code points of a Unicode string as
33  * a sequence of byte-encoded differences (slope detection),
34  * preserving lexical order.
35  *
36  * Optimize the difference-taking for runs of Unicode text within
37  * small scripts:
38  *
39  * Most small scripts are allocated within aligned 128-blocks of Unicode
40  * code points. Lexical order is preserved if the "previous code point" state
41  * is always moved into the middle of such a block.
42  *
43  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
44  * areas into the middle of those areas.
45  *
46  * C0 control codes and space are encoded with their US-ASCII bytes.
47  * "prev" is reset for C0 controls but not for space.
48  */
49 
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0x21..0xfe, i.e. 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes (0x21..0xff plus the 20 control values, i.e. 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters (-64..63). */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters (-10513..10512). */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters (-187660..187659). */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values for positive differences. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* The lead byte start values for negative differences. */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The ranges are tested from the innermost (single-byte) outwards, so each
 * test only needs to exclude what the previous one already accepted.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form:
 * for 1..3 bytes the top byte holds the length itself,
 * for 4 bytes the top byte holds the lead byte (which is >0x03).
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
115 
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value t (0..BOCU1_TRAIL_COUNT-1) to its
 * external byte: values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in
 * bocu1TrailToByte[], all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * t must not be negative (it is used as an array index) and is evaluated
 * more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
145 
146 /*
147  * Byte value map for control codes,
148  * from external byte values 0x00..0x20
149  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
150  * External byte values that are illegal as trail bytes are mapped to -1.
151  */
152 static const int8_t
153 bocu1ByteToTrail[BOCU1_MIN]={
154 /*  0     1     2     3     4     5     6     7    */
155     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
156 
157 /*  8     9     a     b     c     d     e     f    */
158     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
159 
160 /*  10    11    12    13    14    15    16    17   */
161     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
162 
163 /*  18    19    1a    1b    1c    1d    1e    1f   */
164     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
165 
166 /*  20   */
167     -1
168 };
169 
170 /*
171  * Byte value map for control codes,
172  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
173  * to external byte values 0x00..0x20.
174  */
175 static const int8_t
176 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
177 /*  0     1     2     3     4     5     6     7    */
178     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
179 
180 /*  8     9     a     b     c     d     e     f    */
181     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
182 
183 /*  10    11    12    13   */
184     0x1c, 0x1d, 0x1e, 0x1f
185 };
186 
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by
 * a semicolon is exactly one statement, also in an unbraced if/else.
 * n, d and m are each evaluated more than once; pass plain variables or
 * constants only.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
208 
209 /* BOCU-1 implementation functions ------------------------------------------ */
210 
211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
212 
213 /**
214  * Compute the next "previous" value for differencing
215  * from the current code point.
216  *
217  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
218  * @return "previous code point" state value
219  */
220 static U_INLINE int32_t
bocu1Prev(int32_t c)221 bocu1Prev(int32_t c) {
222     /* compute new prev */
223     if(/* 0x3040<=c && */ c<=0x309f) {
224         /* Hiragana is not 128-aligned */
225         return 0x3070;
226     } else if(0x4e00<=c && c<=0x9fa5) {
227         /* CJK Unihan */
228         return 0x4e00-BOCU1_REACH_NEG_2;
229     } else if(0xac00<=c /* && c<=0xd7a3 */) {
230         /* Korean Hangul */
231         return (0xd7a3+0xac00)/2;
232     } else {
233         /* mostly small scripts */
234         return BOCU1_SIMPLE_PREV(c);
235     }
236 }
237 
238 /** Fast version of bocu1Prev() for most scripts. */
239 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
240 
241 /*
242  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
243  * The UConverter fields are used as follows:
244  *
245  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
246  *
247  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
248  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
249  */
250 
251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
252 
253 /**
254  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
255  * and return a packed integer with them.
256  *
257  * The encoding favors small absolut differences with short encodings
258  * to compress runs of same-script characters.
259  *
260  * Optimized version with unrolled loops and fewer floating-point operations
261  * than the standard packDiff().
262  *
263  * @param diff difference value -0x10ffff..0x10ffff
264  * @return
265  *      0x010000zz for 1-byte sequence zz
266  *      0x0200yyzz for 2-byte sequence yy zz
267  *      0x03xxyyzz for 3-byte sequence xx yy zz
268  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
269  */
270 static int32_t
packDiff(int32_t diff)271 packDiff(int32_t diff) {
272     int32_t result, m;
273 
274     if(diff>=BOCU1_REACH_NEG_1) {
275         /* mostly positive differences, and single-byte negative ones */
276 #if 0   /* single-byte case handled in macros, see below */
277         if(diff<=BOCU1_REACH_POS_1) {
278             /* single byte */
279             return 0x01000000|(BOCU1_MIDDLE+diff);
280         } else
281 #endif
282         if(diff<=BOCU1_REACH_POS_2) {
283             /* two bytes */
284             diff-=BOCU1_REACH_POS_1+1;
285             result=0x02000000;
286 
287             m=diff%BOCU1_TRAIL_COUNT;
288             diff/=BOCU1_TRAIL_COUNT;
289             result|=BOCU1_TRAIL_TO_BYTE(m);
290 
291             result|=(BOCU1_START_POS_2+diff)<<8;
292         } else if(diff<=BOCU1_REACH_POS_3) {
293             /* three bytes */
294             diff-=BOCU1_REACH_POS_2+1;
295             result=0x03000000;
296 
297             m=diff%BOCU1_TRAIL_COUNT;
298             diff/=BOCU1_TRAIL_COUNT;
299             result|=BOCU1_TRAIL_TO_BYTE(m);
300 
301             m=diff%BOCU1_TRAIL_COUNT;
302             diff/=BOCU1_TRAIL_COUNT;
303             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
304 
305             result|=(BOCU1_START_POS_3+diff)<<16;
306         } else {
307             /* four bytes */
308             diff-=BOCU1_REACH_POS_3+1;
309 
310             m=diff%BOCU1_TRAIL_COUNT;
311             diff/=BOCU1_TRAIL_COUNT;
312             result=BOCU1_TRAIL_TO_BYTE(m);
313 
314             m=diff%BOCU1_TRAIL_COUNT;
315             diff/=BOCU1_TRAIL_COUNT;
316             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
317 
318             /*
319              * We know that / and % would deliver quotient 0 and rest=diff.
320              * Avoid division and modulo for performance.
321              */
322             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
323 
324             result|=((uint32_t)BOCU1_START_POS_4)<<24;
325         }
326     } else {
327         /* two- to four-byte negative differences */
328         if(diff>=BOCU1_REACH_NEG_2) {
329             /* two bytes */
330             diff-=BOCU1_REACH_NEG_1;
331             result=0x02000000;
332 
333             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
334             result|=BOCU1_TRAIL_TO_BYTE(m);
335 
336             result|=(BOCU1_START_NEG_2+diff)<<8;
337         } else if(diff>=BOCU1_REACH_NEG_3) {
338             /* three bytes */
339             diff-=BOCU1_REACH_NEG_2;
340             result=0x03000000;
341 
342             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
343             result|=BOCU1_TRAIL_TO_BYTE(m);
344 
345             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
346             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
347 
348             result|=(BOCU1_START_NEG_3+diff)<<16;
349         } else {
350             /* four bytes */
351             diff-=BOCU1_REACH_NEG_3;
352 
353             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
354             result=BOCU1_TRAIL_TO_BYTE(m);
355 
356             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
357             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
358 
359             /*
360              * We know that NEGDIVMOD would deliver
361              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
362              * Avoid division and modulo for performance.
363              */
364             m=diff+BOCU1_TRAIL_COUNT;
365             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
366 
367             result|=BOCU1_MIN<<24;
368         }
369     }
370     return result;
371 }
372 
/*
 * Faster versions of packDiff() for single-byte-encoded diff values.
 * Each macro may evaluate its argument more than once.
 */

/** Is a diff value encodable in a single byte? */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte (only valid if DIFF_IS_SINGLE(diff)). */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes (this range includes the single-byte one)? */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
383 
384 static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)385 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
386                              UErrorCode *pErrorCode) {
387     UConverter *cnv;
388     const UChar *source, *sourceLimit;
389     uint8_t *target;
390     int32_t targetCapacity;
391     int32_t *offsets;
392 
393     int32_t prev, c, diff;
394 
395     int32_t sourceIndex, nextSourceIndex;
396 
397 U_ALIGN_CODE(16)
398 
399     /* set up the local pointers */
400     cnv=pArgs->converter;
401     source=pArgs->source;
402     sourceLimit=pArgs->sourceLimit;
403     target=(uint8_t *)pArgs->target;
404     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
405     offsets=pArgs->offsets;
406 
407     /* get the converter state from UConverter */
408     c=cnv->fromUChar32;
409     prev=(int32_t)cnv->fromUnicodeStatus;
410     if(prev==0) {
411         prev=BOCU1_ASCII_PREV;
412     }
413 
414     /* sourceIndex=-1 if the current character began in the previous buffer */
415     sourceIndex= c==0 ? 0 : -1;
416     nextSourceIndex=0;
417 
418     /* conversion loop */
419     if(c!=0 && targetCapacity>0) {
420         goto getTrail;
421     }
422 
423 fastSingle:
424     /* fast loop for single-byte differences */
425     /* use only one loop counter variable, targetCapacity, not also source */
426     diff=(int32_t)(sourceLimit-source);
427     if(targetCapacity>diff) {
428         targetCapacity=diff;
429     }
430     while(targetCapacity>0 && (c=*source)<0x3000) {
431         if(c<=0x20) {
432             if(c!=0x20) {
433                 prev=BOCU1_ASCII_PREV;
434             }
435             *target++=(uint8_t)c;
436             *offsets++=nextSourceIndex++;
437             ++source;
438             --targetCapacity;
439         } else {
440             diff=c-prev;
441             if(DIFF_IS_SINGLE(diff)) {
442                 prev=BOCU1_SIMPLE_PREV(c);
443                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
444                 *offsets++=nextSourceIndex++;
445                 ++source;
446                 --targetCapacity;
447             } else {
448                 break;
449             }
450         }
451     }
452     /* restore real values */
453     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
454     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
455 
456     /* regular loop for all cases */
457     while(source<sourceLimit) {
458         if(targetCapacity>0) {
459             c=*source++;
460             ++nextSourceIndex;
461 
462             if(c<=0x20) {
463                 /*
464                  * ISO C0 control & space:
465                  * Encode directly for MIME compatibility,
466                  * and reset state except for space, to not disrupt compression.
467                  */
468                 if(c!=0x20) {
469                     prev=BOCU1_ASCII_PREV;
470                 }
471                 *target++=(uint8_t)c;
472                 *offsets++=sourceIndex;
473                 --targetCapacity;
474 
475                 sourceIndex=nextSourceIndex;
476                 continue;
477             }
478 
479             if(UTF_IS_LEAD(c)) {
480 getTrail:
481                 if(source<sourceLimit) {
482                     /* test the following code unit */
483                     UChar trail=*source;
484                     if(UTF_IS_SECOND_SURROGATE(trail)) {
485                         ++source;
486                         ++nextSourceIndex;
487                         c=UTF16_GET_PAIR_VALUE(c, trail);
488                     }
489                 } else {
490                     /* no more input */
491                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
492                     break;
493                 }
494             }
495 
496             /*
497              * all other Unicode code points c==U+0021..U+10ffff
498              * are encoded with the difference c-prev
499              *
500              * a new prev is computed from c,
501              * placed in the middle of a 0x80-block (for most small scripts) or
502              * in the middle of the Unihan and Hangul blocks
503              * to statistically minimize the following difference
504              */
505             diff=c-prev;
506             prev=BOCU1_PREV(c);
507             if(DIFF_IS_SINGLE(diff)) {
508                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
509                 *offsets++=sourceIndex;
510                 --targetCapacity;
511                 sourceIndex=nextSourceIndex;
512                 if(c<0x3000) {
513                     goto fastSingle;
514                 }
515             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
516                 /* optimize 2-byte case */
517                 int32_t m;
518 
519                 if(diff>=0) {
520                     diff-=BOCU1_REACH_POS_1+1;
521                     m=diff%BOCU1_TRAIL_COUNT;
522                     diff/=BOCU1_TRAIL_COUNT;
523                     diff+=BOCU1_START_POS_2;
524                 } else {
525                     diff-=BOCU1_REACH_NEG_1;
526                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
527                     diff+=BOCU1_START_NEG_2;
528                 }
529                 *target++=(uint8_t)diff;
530                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
531                 *offsets++=sourceIndex;
532                 *offsets++=sourceIndex;
533                 targetCapacity-=2;
534                 sourceIndex=nextSourceIndex;
535             } else {
536                 int32_t length; /* will be 2..4 */
537 
538                 diff=packDiff(diff);
539                 length=BOCU1_LENGTH_FROM_PACKED(diff);
540 
541                 /* write the output character bytes from diff and length */
542                 /* from the first if in the loop we know that targetCapacity>0 */
543                 if(length<=targetCapacity) {
544                     switch(length) {
545                         /* each branch falls through to the next one */
546                     case 4:
547                         *target++=(uint8_t)(diff>>24);
548                         *offsets++=sourceIndex;
549                     case 3:
550                         *target++=(uint8_t)(diff>>16);
551                         *offsets++=sourceIndex;
552                     case 2:
553                         *target++=(uint8_t)(diff>>8);
554                         *offsets++=sourceIndex;
555                     /* case 1: handled above */
556                         *target++=(uint8_t)diff;
557                         *offsets++=sourceIndex;
558                     default:
559                         /* will never occur */
560                         break;
561                     }
562                     targetCapacity-=length;
563                     sourceIndex=nextSourceIndex;
564                 } else {
565                     uint8_t *charErrorBuffer;
566 
567                     /*
568                      * We actually do this backwards here:
569                      * In order to save an intermediate variable, we output
570                      * first to the overflow buffer what does not fit into the
571                      * regular target.
572                      */
573                     /* we know that 1<=targetCapacity<length<=4 */
574                     length-=targetCapacity;
575                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
576                     switch(length) {
577                         /* each branch falls through to the next one */
578                     case 3:
579                         *charErrorBuffer++=(uint8_t)(diff>>16);
580                     case 2:
581                         *charErrorBuffer++=(uint8_t)(diff>>8);
582                     case 1:
583                         *charErrorBuffer=(uint8_t)diff;
584                     default:
585                         /* will never occur */
586                         break;
587                     }
588                     cnv->charErrorBufferLength=(int8_t)length;
589 
590                     /* now output what fits into the regular target */
591                     diff>>=8*length; /* length was reduced by targetCapacity */
592                     switch(targetCapacity) {
593                         /* each branch falls through to the next one */
594                     case 3:
595                         *target++=(uint8_t)(diff>>16);
596                         *offsets++=sourceIndex;
597                     case 2:
598                         *target++=(uint8_t)(diff>>8);
599                         *offsets++=sourceIndex;
600                     case 1:
601                         *target++=(uint8_t)diff;
602                         *offsets++=sourceIndex;
603                     default:
604                         /* will never occur */
605                         break;
606                     }
607 
608                     /* target overflow */
609                     targetCapacity=0;
610                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
611                     break;
612                 }
613             }
614         } else {
615             /* target is full */
616             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
617             break;
618         }
619     }
620 
621     /* set the converter state back into UConverter */
622     cnv->fromUChar32= c<0 ? -c : 0;
623     cnv->fromUnicodeStatus=(uint32_t)prev;
624 
625     /* write back the updated pointers */
626     pArgs->source=source;
627     pArgs->target=(char *)target;
628     pArgs->offsets=offsets;
629 }
630 
631 /*
632  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
633  * If a change is made in the original function, then either
634  * change this function the same way or
635  * re-copy the original function and remove the variables
636  * offsets, sourceIndex, and nextSourceIndex.
637  */
638 static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)639 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
640                   UErrorCode *pErrorCode) {
641     UConverter *cnv;
642     const UChar *source, *sourceLimit;
643     uint8_t *target;
644     int32_t targetCapacity;
645 
646     int32_t prev, c, diff;
647 
648     /* set up the local pointers */
649     cnv=pArgs->converter;
650     source=pArgs->source;
651     sourceLimit=pArgs->sourceLimit;
652     target=(uint8_t *)pArgs->target;
653     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
654 
655     /* get the converter state from UConverter */
656     c=cnv->fromUChar32;
657     prev=(int32_t)cnv->fromUnicodeStatus;
658     if(prev==0) {
659         prev=BOCU1_ASCII_PREV;
660     }
661 
662     /* conversion loop */
663     if(c!=0 && targetCapacity>0) {
664         goto getTrail;
665     }
666 
667 fastSingle:
668     /* fast loop for single-byte differences */
669     /* use only one loop counter variable, targetCapacity, not also source */
670     diff=(int32_t)(sourceLimit-source);
671     if(targetCapacity>diff) {
672         targetCapacity=diff;
673     }
674     while(targetCapacity>0 && (c=*source)<0x3000) {
675         if(c<=0x20) {
676             if(c!=0x20) {
677                 prev=BOCU1_ASCII_PREV;
678             }
679             *target++=(uint8_t)c;
680         } else {
681             diff=c-prev;
682             if(DIFF_IS_SINGLE(diff)) {
683                 prev=BOCU1_SIMPLE_PREV(c);
684                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
685             } else {
686                 break;
687             }
688         }
689         ++source;
690         --targetCapacity;
691     }
692     /* restore real values */
693     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
694 
695     /* regular loop for all cases */
696     while(source<sourceLimit) {
697         if(targetCapacity>0) {
698             c=*source++;
699 
700             if(c<=0x20) {
701                 /*
702                  * ISO C0 control & space:
703                  * Encode directly for MIME compatibility,
704                  * and reset state except for space, to not disrupt compression.
705                  */
706                 if(c!=0x20) {
707                     prev=BOCU1_ASCII_PREV;
708                 }
709                 *target++=(uint8_t)c;
710                 --targetCapacity;
711                 continue;
712             }
713 
714             if(UTF_IS_LEAD(c)) {
715 getTrail:
716                 if(source<sourceLimit) {
717                     /* test the following code unit */
718                     UChar trail=*source;
719                     if(UTF_IS_SECOND_SURROGATE(trail)) {
720                         ++source;
721                         c=UTF16_GET_PAIR_VALUE(c, trail);
722                     }
723                 } else {
724                     /* no more input */
725                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
726                     break;
727                 }
728             }
729 
730             /*
731              * all other Unicode code points c==U+0021..U+10ffff
732              * are encoded with the difference c-prev
733              *
734              * a new prev is computed from c,
735              * placed in the middle of a 0x80-block (for most small scripts) or
736              * in the middle of the Unihan and Hangul blocks
737              * to statistically minimize the following difference
738              */
739             diff=c-prev;
740             prev=BOCU1_PREV(c);
741             if(DIFF_IS_SINGLE(diff)) {
742                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
743                 --targetCapacity;
744                 if(c<0x3000) {
745                     goto fastSingle;
746                 }
747             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
748                 /* optimize 2-byte case */
749                 int32_t m;
750 
751                 if(diff>=0) {
752                     diff-=BOCU1_REACH_POS_1+1;
753                     m=diff%BOCU1_TRAIL_COUNT;
754                     diff/=BOCU1_TRAIL_COUNT;
755                     diff+=BOCU1_START_POS_2;
756                 } else {
757                     diff-=BOCU1_REACH_NEG_1;
758                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
759                     diff+=BOCU1_START_NEG_2;
760                 }
761                 *target++=(uint8_t)diff;
762                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
763                 targetCapacity-=2;
764             } else {
765                 int32_t length; /* will be 2..4 */
766 
767                 diff=packDiff(diff);
768                 length=BOCU1_LENGTH_FROM_PACKED(diff);
769 
770                 /* write the output character bytes from diff and length */
771                 /* from the first if in the loop we know that targetCapacity>0 */
772                 if(length<=targetCapacity) {
773                     switch(length) {
774                         /* each branch falls through to the next one */
775                     case 4:
776                         *target++=(uint8_t)(diff>>24);
777                     case 3:
778                         *target++=(uint8_t)(diff>>16);
779                     /* case 2: handled above */
780                         *target++=(uint8_t)(diff>>8);
781                     /* case 1: handled above */
782                         *target++=(uint8_t)diff;
783                     default:
784                         /* will never occur */
785                         break;
786                     }
787                     targetCapacity-=length;
788                 } else {
789                     uint8_t *charErrorBuffer;
790 
791                     /*
792                      * We actually do this backwards here:
793                      * In order to save an intermediate variable, we output
794                      * first to the overflow buffer what does not fit into the
795                      * regular target.
796                      */
797                     /* we know that 1<=targetCapacity<length<=4 */
798                     length-=targetCapacity;
799                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
800                     switch(length) {
801                         /* each branch falls through to the next one */
802                     case 3:
803                         *charErrorBuffer++=(uint8_t)(diff>>16);
804                     case 2:
805                         *charErrorBuffer++=(uint8_t)(diff>>8);
806                     case 1:
807                         *charErrorBuffer=(uint8_t)diff;
808                     default:
809                         /* will never occur */
810                         break;
811                     }
812                     cnv->charErrorBufferLength=(int8_t)length;
813 
814                     /* now output what fits into the regular target */
815                     diff>>=8*length; /* length was reduced by targetCapacity */
816                     switch(targetCapacity) {
817                         /* each branch falls through to the next one */
818                     case 3:
819                         *target++=(uint8_t)(diff>>16);
820                     case 2:
821                         *target++=(uint8_t)(diff>>8);
822                     case 1:
823                         *target++=(uint8_t)diff;
824                     default:
825                         /* will never occur */
826                         break;
827                     }
828 
829                     /* target overflow */
830                     targetCapacity=0;
831                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
832                     break;
833                 }
834             }
835         } else {
836             /* target is full */
837             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
838             break;
839         }
840     }
841 
842     /* set the converter state back into UConverter */
843     cnv->fromUChar32= c<0 ? -c : 0;
844     cnv->fromUnicodeStatus=(uint32_t)prev;
845 
846     /* write back the updated pointers */
847     pArgs->source=source;
848     pArgs->target=(char *)target;
849 }
850 
851 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
852 
853 /**
854  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
855  *
856  * @param b lead byte;
857  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
858  * @return (diff<<2)|count
859  */
860 static U_INLINE int32_t
decodeBocu1LeadByte(int32_t b)861 decodeBocu1LeadByte(int32_t b) {
862     int32_t diff, count;
863 
864     if(b>=BOCU1_START_NEG_2) {
865         /* positive difference */
866         if(b<BOCU1_START_POS_3) {
867             /* two bytes */
868             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
869             count=1;
870         } else if(b<BOCU1_START_POS_4) {
871             /* three bytes */
872             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
873             count=2;
874         } else {
875             /* four bytes */
876             diff=BOCU1_REACH_POS_3+1;
877             count=3;
878         }
879     } else {
880         /* negative difference */
881         if(b>=BOCU1_START_NEG_3) {
882             /* two bytes */
883             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
884             count=1;
885         } else if(b>BOCU1_MIN) {
886             /* three bytes */
887             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
888             count=2;
889         } else {
890             /* four bytes */
891             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
892             count=3;
893         }
894     }
895 
896     /* return the state for decoding the trail byte(s) */
897     return (diff<<2)|count;
898 }
899 
900 /**
901  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
902  *
903  * @param count number of remaining trail bytes including this one
904  * @param b trail byte
905  * @return new delta for diff including b - <0 indicates an error
906  *
907  * @see decodeBocu1
908  */
909 static U_INLINE int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)910 decodeBocu1TrailByte(int32_t count, int32_t b) {
911     if(b<=0x20) {
912         /* skip some C0 controls and make the trail byte range contiguous */
913         b=bocu1ByteToTrail[b];
914         /* b<0 for an illegal trail byte value will result in return<0 below */
915 #if BOCU1_MAX_TRAIL<0xff
916     } else if(b>BOCU1_MAX_TRAIL) {
917         return -99;
918 #endif
919     } else {
920         b-=BOCU1_TRAIL_BYTE_OFFSET;
921     }
922 
923     /* add trail byte into difference and decrement count */
924     if(count==1) {
925         return b;
926     } else if(count==2) {
927         return b*BOCU1_TRAIL_COUNT;
928     } else /* count==3 */ {
929         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
930     }
931 }
932 
933 static void
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
935                            UErrorCode *pErrorCode) {
936     UConverter *cnv;
937     const uint8_t *source, *sourceLimit;
938     UChar *target;
939     const UChar *targetLimit;
940     int32_t *offsets;
941 
942     int32_t prev, count, diff, c;
943 
944     int8_t byteIndex;
945     uint8_t *bytes;
946 
947     int32_t sourceIndex, nextSourceIndex;
948 
949     /* set up the local pointers */
950     cnv=pArgs->converter;
951     source=(const uint8_t *)pArgs->source;
952     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
953     target=pArgs->target;
954     targetLimit=pArgs->targetLimit;
955     offsets=pArgs->offsets;
956 
957     /* get the converter state from UConverter */
958     prev=(int32_t)cnv->toUnicodeStatus;
959     if(prev==0) {
960         prev=BOCU1_ASCII_PREV;
961     }
962     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
963     count=diff&3;
964     diff>>=2;
965 
966     byteIndex=cnv->toULength;
967     bytes=cnv->toUBytes;
968 
969     /* sourceIndex=-1 if the current character began in the previous buffer */
970     sourceIndex=byteIndex==0 ? 0 : -1;
971     nextSourceIndex=0;
972 
973     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
974     if(count>0 && byteIndex>0 && target<targetLimit) {
975         goto getTrail;
976     }
977 
978 fastSingle:
979     /* fast loop for single-byte differences */
980     /* use count as the only loop counter variable */
981     diff=(int32_t)(sourceLimit-source);
982     count=(int32_t)(pArgs->targetLimit-target);
983     if(count>diff) {
984         count=diff;
985     }
986     while(count>0) {
987         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
988             c=prev+(c-BOCU1_MIDDLE);
989             if(c<0x3000) {
990                 *target++=(UChar)c;
991                 *offsets++=nextSourceIndex++;
992                 prev=BOCU1_SIMPLE_PREV(c);
993             } else {
994                 break;
995             }
996         } else if(c<=0x20) {
997             if(c!=0x20) {
998                 prev=BOCU1_ASCII_PREV;
999             }
1000             *target++=(UChar)c;
1001             *offsets++=nextSourceIndex++;
1002         } else {
1003             break;
1004         }
1005         ++source;
1006         --count;
1007     }
1008     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1009 
1010     /* decode a sequence of single and lead bytes */
1011     while(source<sourceLimit) {
1012         if(target>=targetLimit) {
1013             /* target is full */
1014             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1015             break;
1016         }
1017 
1018         ++nextSourceIndex;
1019         c=*source++;
1020         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1021             /* Write a code point directly from a single-byte difference. */
1022             c=prev+(c-BOCU1_MIDDLE);
1023             if(c<0x3000) {
1024                 *target++=(UChar)c;
1025                 *offsets++=sourceIndex;
1026                 prev=BOCU1_SIMPLE_PREV(c);
1027                 sourceIndex=nextSourceIndex;
1028                 goto fastSingle;
1029             }
1030         } else if(c<=0x20) {
1031             /*
1032              * Direct-encoded C0 control code or space.
1033              * Reset prev for C0 control codes but not for space.
1034              */
1035             if(c!=0x20) {
1036                 prev=BOCU1_ASCII_PREV;
1037             }
1038             *target++=(UChar)c;
1039             *offsets++=sourceIndex;
1040             sourceIndex=nextSourceIndex;
1041             continue;
1042         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1043             /* Optimize two-byte case. */
1044             if(c>=BOCU1_MIDDLE) {
1045                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1046             } else {
1047                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1048             }
1049 
1050             /* trail byte */
1051             ++nextSourceIndex;
1052             c=decodeBocu1TrailByte(1, *source++);
1053             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1054                 bytes[0]=source[-2];
1055                 bytes[1]=source[-1];
1056                 byteIndex=2;
1057                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1058                 break;
1059             }
1060         } else if(c==BOCU1_RESET) {
1061             /* only reset the state, no code point */
1062             prev=BOCU1_ASCII_PREV;
1063             sourceIndex=nextSourceIndex;
1064             continue;
1065         } else {
1066             /*
1067              * For multi-byte difference lead bytes, set the decoder state
1068              * with the partial difference value from the lead byte and
1069              * with the number of trail bytes.
1070              */
1071             bytes[0]=(uint8_t)c;
1072             byteIndex=1;
1073 
1074             diff=decodeBocu1LeadByte(c);
1075             count=diff&3;
1076             diff>>=2;
1077 getTrail:
1078             for(;;) {
1079                 if(source>=sourceLimit) {
1080                     goto endloop;
1081                 }
1082                 ++nextSourceIndex;
1083                 c=bytes[byteIndex++]=*source++;
1084 
1085                 /* trail byte in any position */
1086                 c=decodeBocu1TrailByte(count, c);
1087                 if(c<0) {
1088                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1089                     goto endloop;
1090                 }
1091 
1092                 diff+=c;
1093                 if(--count==0) {
1094                     /* final trail byte, deliver a code point */
1095                     byteIndex=0;
1096                     c=prev+diff;
1097                     if((uint32_t)c>0x10ffff) {
1098                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1099                         goto endloop;
1100                     }
1101                     break;
1102                 }
1103             }
1104         }
1105 
1106         /* calculate the next prev and output c */
1107         prev=BOCU1_PREV(c);
1108         if(c<=0xffff) {
1109             *target++=(UChar)c;
1110             *offsets++=sourceIndex;
1111         } else {
1112             /* output surrogate pair */
1113             *target++=UTF16_LEAD(c);
1114             if(target<targetLimit) {
1115                 *target++=UTF16_TRAIL(c);
1116                 *offsets++=sourceIndex;
1117                 *offsets++=sourceIndex;
1118             } else {
1119                 /* target overflow */
1120                 *offsets++=sourceIndex;
1121                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1122                 cnv->UCharErrorBufferLength=1;
1123                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1124                 break;
1125             }
1126         }
1127         sourceIndex=nextSourceIndex;
1128     }
1129 endloop:
1130 
1131     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1132         /* set the converter state in UConverter to deal with the next character */
1133         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1134         cnv->mode=0;
1135     } else {
1136         /* set the converter state back into UConverter */
1137         cnv->toUnicodeStatus=(uint32_t)prev;
1138         cnv->mode=(diff<<2)|count;
1139     }
1140     cnv->toULength=byteIndex;
1141 
1142     /* write back the updated pointers */
1143     pArgs->source=(const char *)source;
1144     pArgs->target=target;
1145     pArgs->offsets=offsets;
1146     return;
1147 }
1148 
1149 /*
1150  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1151  * If a change is made in the original function, then either
1152  * change this function the same way or
1153  * re-copy the original function and remove the variables
1154  * offsets, sourceIndex, and nextSourceIndex.
1155  */
1156 static void
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1157 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1158                 UErrorCode *pErrorCode) {
1159     UConverter *cnv;
1160     const uint8_t *source, *sourceLimit;
1161     UChar *target;
1162     const UChar *targetLimit;
1163 
1164     int32_t prev, count, diff, c;
1165 
1166     int8_t byteIndex;
1167     uint8_t *bytes;
1168 
1169 U_ALIGN_CODE(16)
1170 
1171     /* set up the local pointers */
1172     cnv=pArgs->converter;
1173     source=(const uint8_t *)pArgs->source;
1174     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1175     target=pArgs->target;
1176     targetLimit=pArgs->targetLimit;
1177 
1178     /* get the converter state from UConverter */
1179     prev=(int32_t)cnv->toUnicodeStatus;
1180     if(prev==0) {
1181         prev=BOCU1_ASCII_PREV;
1182     }
1183     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1184     count=diff&3;
1185     diff>>=2;
1186 
1187     byteIndex=cnv->toULength;
1188     bytes=cnv->toUBytes;
1189 
1190     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1191     if(count>0 && byteIndex>0 && target<targetLimit) {
1192         goto getTrail;
1193     }
1194 
1195 fastSingle:
1196     /* fast loop for single-byte differences */
1197     /* use count as the only loop counter variable */
1198     diff=(int32_t)(sourceLimit-source);
1199     count=(int32_t)(pArgs->targetLimit-target);
1200     if(count>diff) {
1201         count=diff;
1202     }
1203     while(count>0) {
1204         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1205             c=prev+(c-BOCU1_MIDDLE);
1206             if(c<0x3000) {
1207                 *target++=(UChar)c;
1208                 prev=BOCU1_SIMPLE_PREV(c);
1209             } else {
1210                 break;
1211             }
1212         } else if(c<=0x20) {
1213             if(c!=0x20) {
1214                 prev=BOCU1_ASCII_PREV;
1215             }
1216             *target++=(UChar)c;
1217         } else {
1218             break;
1219         }
1220         ++source;
1221         --count;
1222     }
1223 
1224     /* decode a sequence of single and lead bytes */
1225     while(source<sourceLimit) {
1226         if(target>=targetLimit) {
1227             /* target is full */
1228             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1229             break;
1230         }
1231 
1232         c=*source++;
1233         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1234             /* Write a code point directly from a single-byte difference. */
1235             c=prev+(c-BOCU1_MIDDLE);
1236             if(c<0x3000) {
1237                 *target++=(UChar)c;
1238                 prev=BOCU1_SIMPLE_PREV(c);
1239                 goto fastSingle;
1240             }
1241         } else if(c<=0x20) {
1242             /*
1243              * Direct-encoded C0 control code or space.
1244              * Reset prev for C0 control codes but not for space.
1245              */
1246             if(c!=0x20) {
1247                 prev=BOCU1_ASCII_PREV;
1248             }
1249             *target++=(UChar)c;
1250             continue;
1251         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1252             /* Optimize two-byte case. */
1253             if(c>=BOCU1_MIDDLE) {
1254                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1255             } else {
1256                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1257             }
1258 
1259             /* trail byte */
1260             c=decodeBocu1TrailByte(1, *source++);
1261             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1262                 bytes[0]=source[-2];
1263                 bytes[1]=source[-1];
1264                 byteIndex=2;
1265                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1266                 break;
1267             }
1268         } else if(c==BOCU1_RESET) {
1269             /* only reset the state, no code point */
1270             prev=BOCU1_ASCII_PREV;
1271             continue;
1272         } else {
1273             /*
1274              * For multi-byte difference lead bytes, set the decoder state
1275              * with the partial difference value from the lead byte and
1276              * with the number of trail bytes.
1277              */
1278             bytes[0]=(uint8_t)c;
1279             byteIndex=1;
1280 
1281             diff=decodeBocu1LeadByte(c);
1282             count=diff&3;
1283             diff>>=2;
1284 getTrail:
1285             for(;;) {
1286                 if(source>=sourceLimit) {
1287                     goto endloop;
1288                 }
1289                 c=bytes[byteIndex++]=*source++;
1290 
1291                 /* trail byte in any position */
1292                 c=decodeBocu1TrailByte(count, c);
1293                 if(c<0) {
1294                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1295                     goto endloop;
1296                 }
1297 
1298                 diff+=c;
1299                 if(--count==0) {
1300                     /* final trail byte, deliver a code point */
1301                     byteIndex=0;
1302                     c=prev+diff;
1303                     if((uint32_t)c>0x10ffff) {
1304                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1305                         goto endloop;
1306                     }
1307                     break;
1308                 }
1309             }
1310         }
1311 
1312         /* calculate the next prev and output c */
1313         prev=BOCU1_PREV(c);
1314         if(c<=0xffff) {
1315             *target++=(UChar)c;
1316         } else {
1317             /* output surrogate pair */
1318             *target++=UTF16_LEAD(c);
1319             if(target<targetLimit) {
1320                 *target++=UTF16_TRAIL(c);
1321             } else {
1322                 /* target overflow */
1323                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
1324                 cnv->UCharErrorBufferLength=1;
1325                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1326                 break;
1327             }
1328         }
1329     }
1330 endloop:
1331 
1332     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1333         /* set the converter state in UConverter to deal with the next character */
1334         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1335         cnv->mode=0;
1336     } else {
1337         /* set the converter state back into UConverter */
1338         cnv->toUnicodeStatus=(uint32_t)prev;
1339         cnv->mode=(diff<<2)|count;
1340     }
1341     cnv->toULength=byteIndex;
1342 
1343     /* write back the updated pointers */
1344     pArgs->source=(const char *)source;
1345     pArgs->target=target;
1346     return;
1347 }
1348 
1349 /* miscellaneous ------------------------------------------------------------ */
1350 
1351 static const UConverterImpl _Bocu1Impl={
1352     UCNV_BOCU1,
1353 
1354     NULL,
1355     NULL,
1356 
1357     NULL,
1358     NULL,
1359     NULL,
1360 
1361     _Bocu1ToUnicode,
1362     _Bocu1ToUnicodeWithOffsets,
1363     _Bocu1FromUnicode,
1364     _Bocu1FromUnicodeWithOffsets,
1365     NULL,
1366 
1367     NULL,
1368     NULL,
1369     NULL,
1370     NULL,
1371     ucnv_getCompleteUnicodeSet
1372 };
1373 
1374 static const UConverterStaticData _Bocu1StaticData={
1375     sizeof(UConverterStaticData),
1376     "BOCU-1",
1377     1214, /* CCSID for BOCU-1 */
1378     UCNV_IBM, UCNV_BOCU1,
1379     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1380     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1381     FALSE, FALSE,
1382     0,
1383     0,
1384     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1385 };
1386 
1387 const UConverterSharedData _Bocu1Data={
1388     sizeof(UConverterSharedData), ~((uint32_t)0),
1389     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
1390     0
1391 };
1392 
1393 #endif
1394