// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 2002-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnvbocu.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002mar27
*   created by: Markus W. Scherer
*
*   This is an implementation of the Binary Ordered Compression for Unicode,
*   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
*/
21 
22 #include "unicode/utypes.h"
23 
24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25 
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "unicode/utf16.h"
29 #include "putilimp.h"
30 #include "ucnv_bld.h"
31 #include "ucnv_cnv.h"
32 #include "uassert.h"
33 
/* BOCU-1 constants and macros ---------------------------------------------- */

/*
 * BOCU-1 encodes the code points of a Unicode string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0x21..0xfe, i.e. 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes (0x21..0xff plus the 20 control byte values, i.e. 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form:
 * values below 0x04000000 carry the length (1..3) in the top byte,
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
120 
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value used in the difference calculation
 * to its external byte: values below BOCU1_TRAIL_CONTROLS_COUNT go through
 * the bocu1TrailToByte[] table, all others are shifted by
 * BOCU1_TRAIL_BYTE_OFFSET.
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150 
151 /*
152  * Byte value map for control codes,
153  * from external byte values 0x00..0x20
154  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155  * External byte values that are illegal as trail bytes are mapped to -1.
156  */
157 static const int8_t
158 bocu1ByteToTrail[BOCU1_MIN]={
159 /*  0     1     2     3     4     5     6     7    */
160     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
161 
162 /*  8     9     a     b     c     d     e     f    */
163     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
164 
165 /*  10    11    12    13    14    15    16    17   */
166     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
167 
168 /*  18    19    1a    1b    1c    1d    1e    1f   */
169     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
170 
171 /*  20   */
172     -1
173 };
174 
175 /*
176  * Byte value map for control codes,
177  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
178  * to external byte values 0x00..0x20.
179  */
180 static const int8_t
181 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182 /*  0     1     2     3     4     5     6     7    */
183     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
184 
185 /*  8     9     a     b     c     d     e     f    */
186     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187 
188 /*  10    11    12    13   */
189     0x1c, 0x1d, 0x1e, 0x1f
190 };
191 
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always false.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement; invoke it with a trailing semicolon.
 * Note: each argument is evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
213 
/* Faster versions of packDiff() for single-byte-encoded diff values. */

/** Is a diff value encodable in a single byte? (argument evaluated twice) */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte: an offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/** Is a diff value encodable in two bytes? (argument evaluated twice) */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
224 
225 /* BOCU-1 implementation functions ------------------------------------------ */
226 
227 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228 
229 /**
230  * Compute the next "previous" value for differencing
231  * from the current code point.
232  *
233  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234  * @return "previous code point" state value
235  */
236 static inline int32_t
bocu1Prev(int32_t c)237 bocu1Prev(int32_t c) {
238     /* compute new prev */
239     if(/* 0x3040<=c && */ c<=0x309f) {
240         /* Hiragana is not 128-aligned */
241         return 0x3070;
242     } else if(0x4e00<=c && c<=0x9fa5) {
243         /* CJK Unihan */
244         return 0x4e00-BOCU1_REACH_NEG_2;
245     } else if(0xac00<=c /* && c<=0xd7a3 */) {
246         /* Korean Hangul */
247         return (0xd7a3+0xac00)/2;
248     } else {
249         /* mostly small scripts */
250         return BOCU1_SIMPLE_PREV(c);
251     }
252 }
253 
254 /** Fast version of bocu1Prev() for most scripts. */
255 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256 
/*
 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
 * The UConverter fields are used as follows:
 *
 * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 *
 * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
 * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
 */

/* BOCU-1-from-Unicode conversion functions --------------------------------- */
268 
269 /**
270  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271  * and return a packed integer with them.
272  *
273  * The encoding favors small absolute differences with short encodings
274  * to compress runs of same-script characters.
275  *
276  * Optimized version with unrolled loops and fewer floating-point operations
277  * than the standard packDiff().
278  *
279  * @param diff difference value -0x10ffff..0x10ffff
280  * @return
281  *      0x010000zz for 1-byte sequence zz
282  *      0x0200yyzz for 2-byte sequence yy zz
283  *      0x03xxyyzz for 3-byte sequence xx yy zz
284  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285  */
286 static int32_t
packDiff(int32_t diff)287 packDiff(int32_t diff) {
288     int32_t result, m;
289 
290     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291     if(diff>=BOCU1_REACH_NEG_1) {
292         /* mostly positive differences, and single-byte negative ones */
293 #if 0   /* single-byte case handled in macros, see below */
294         if(diff<=BOCU1_REACH_POS_1) {
295             /* single byte */
296             return 0x01000000|(BOCU1_MIDDLE+diff);
297         } else
298 #endif
299         if(diff<=BOCU1_REACH_POS_2) {
300             /* two bytes */
301             diff-=BOCU1_REACH_POS_1+1;
302             result=0x02000000;
303 
304             m=diff%BOCU1_TRAIL_COUNT;
305             diff/=BOCU1_TRAIL_COUNT;
306             result|=BOCU1_TRAIL_TO_BYTE(m);
307 
308             result|=(BOCU1_START_POS_2+diff)<<8;
309         } else if(diff<=BOCU1_REACH_POS_3) {
310             /* three bytes */
311             diff-=BOCU1_REACH_POS_2+1;
312             result=0x03000000;
313 
314             m=diff%BOCU1_TRAIL_COUNT;
315             diff/=BOCU1_TRAIL_COUNT;
316             result|=BOCU1_TRAIL_TO_BYTE(m);
317 
318             m=diff%BOCU1_TRAIL_COUNT;
319             diff/=BOCU1_TRAIL_COUNT;
320             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
321 
322             result|=(BOCU1_START_POS_3+diff)<<16;
323         } else {
324             /* four bytes */
325             diff-=BOCU1_REACH_POS_3+1;
326 
327             m=diff%BOCU1_TRAIL_COUNT;
328             diff/=BOCU1_TRAIL_COUNT;
329             result=BOCU1_TRAIL_TO_BYTE(m);
330 
331             m=diff%BOCU1_TRAIL_COUNT;
332             diff/=BOCU1_TRAIL_COUNT;
333             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
334 
335             /*
336              * We know that / and % would deliver quotient 0 and rest=diff.
337              * Avoid division and modulo for performance.
338              */
339             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
340 
341             result|=((uint32_t)BOCU1_START_POS_4)<<24;
342         }
343     } else {
344         /* two- to four-byte negative differences */
345         if(diff>=BOCU1_REACH_NEG_2) {
346             /* two bytes */
347             diff-=BOCU1_REACH_NEG_1;
348             result=0x02000000;
349 
350             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351             result|=BOCU1_TRAIL_TO_BYTE(m);
352 
353             result|=(BOCU1_START_NEG_2+diff)<<8;
354         } else if(diff>=BOCU1_REACH_NEG_3) {
355             /* three bytes */
356             diff-=BOCU1_REACH_NEG_2;
357             result=0x03000000;
358 
359             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360             result|=BOCU1_TRAIL_TO_BYTE(m);
361 
362             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
364 
365             result|=(BOCU1_START_NEG_3+diff)<<16;
366         } else {
367             /* four bytes */
368             diff-=BOCU1_REACH_NEG_3;
369 
370             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371             result=BOCU1_TRAIL_TO_BYTE(m);
372 
373             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
375 
376             /*
377              * We know that NEGDIVMOD would deliver
378              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379              * Avoid division and modulo for performance.
380              */
381             m=diff+BOCU1_TRAIL_COUNT;
382             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
383 
384             result|=BOCU1_MIN<<24;
385         }
386     }
387     return result;
388 }
389 
390 
391 static void U_CALLCONV
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393                              UErrorCode *pErrorCode) {
394     UConverter *cnv;
395     const UChar *source, *sourceLimit;
396     uint8_t *target;
397     int32_t targetCapacity;
398     int32_t *offsets;
399 
400     int32_t prev, c, diff;
401 
402     int32_t sourceIndex, nextSourceIndex;
403 
404     /* set up the local pointers */
405     cnv=pArgs->converter;
406     source=pArgs->source;
407     sourceLimit=pArgs->sourceLimit;
408     target=(uint8_t *)pArgs->target;
409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410     offsets=pArgs->offsets;
411 
412     /* get the converter state from UConverter */
413     c=cnv->fromUChar32;
414     prev=(int32_t)cnv->fromUnicodeStatus;
415     if(prev==0) {
416         prev=BOCU1_ASCII_PREV;
417     }
418 
419     /* sourceIndex=-1 if the current character began in the previous buffer */
420     sourceIndex= c==0 ? 0 : -1;
421     nextSourceIndex=0;
422 
423     /* conversion loop */
424     if(c!=0 && targetCapacity>0) {
425         goto getTrail;
426     }
427 
428 fastSingle:
429     /* fast loop for single-byte differences */
430     /* use only one loop counter variable, targetCapacity, not also source */
431     diff=(int32_t)(sourceLimit-source);
432     if(targetCapacity>diff) {
433         targetCapacity=diff;
434     }
435     while(targetCapacity>0 && (c=*source)<0x3000) {
436         if(c<=0x20) {
437             if(c!=0x20) {
438                 prev=BOCU1_ASCII_PREV;
439             }
440             *target++=(uint8_t)c;
441             *offsets++=nextSourceIndex++;
442             ++source;
443             --targetCapacity;
444         } else {
445             diff=c-prev;
446             if(DIFF_IS_SINGLE(diff)) {
447                 prev=BOCU1_SIMPLE_PREV(c);
448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449                 *offsets++=nextSourceIndex++;
450                 ++source;
451                 --targetCapacity;
452             } else {
453                 break;
454             }
455         }
456     }
457     /* restore real values */
458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460 
461     /* regular loop for all cases */
462     while(source<sourceLimit) {
463         if(targetCapacity>0) {
464             c=*source++;
465             ++nextSourceIndex;
466 
467             if(c<=0x20) {
468                 /*
469                  * ISO C0 control & space:
470                  * Encode directly for MIME compatibility,
471                  * and reset state except for space, to not disrupt compression.
472                  */
473                 if(c!=0x20) {
474                     prev=BOCU1_ASCII_PREV;
475                 }
476                 *target++=(uint8_t)c;
477                 *offsets++=sourceIndex;
478                 --targetCapacity;
479 
480                 sourceIndex=nextSourceIndex;
481                 continue;
482             }
483 
484             if(U16_IS_LEAD(c)) {
485 getTrail:
486                 if(source<sourceLimit) {
487                     /* test the following code unit */
488                     UChar trail=*source;
489                     if(U16_IS_TRAIL(trail)) {
490                         ++source;
491                         ++nextSourceIndex;
492                         c=U16_GET_SUPPLEMENTARY(c, trail);
493                     }
494                 } else {
495                     /* no more input */
496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497                     break;
498                 }
499             }
500 
501             /*
502              * all other Unicode code points c==U+0021..U+10ffff
503              * are encoded with the difference c-prev
504              *
505              * a new prev is computed from c,
506              * placed in the middle of a 0x80-block (for most small scripts) or
507              * in the middle of the Unihan and Hangul blocks
508              * to statistically minimize the following difference
509              */
510             diff=c-prev;
511             prev=BOCU1_PREV(c);
512             if(DIFF_IS_SINGLE(diff)) {
513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514                 *offsets++=sourceIndex;
515                 --targetCapacity;
516                 sourceIndex=nextSourceIndex;
517                 if(c<0x3000) {
518                     goto fastSingle;
519                 }
520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521                 /* optimize 2-byte case */
522                 int32_t m;
523 
524                 if(diff>=0) {
525                     diff-=BOCU1_REACH_POS_1+1;
526                     m=diff%BOCU1_TRAIL_COUNT;
527                     diff/=BOCU1_TRAIL_COUNT;
528                     diff+=BOCU1_START_POS_2;
529                 } else {
530                     diff-=BOCU1_REACH_NEG_1;
531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532                     diff+=BOCU1_START_NEG_2;
533                 }
534                 *target++=(uint8_t)diff;
535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536                 *offsets++=sourceIndex;
537                 *offsets++=sourceIndex;
538                 targetCapacity-=2;
539                 sourceIndex=nextSourceIndex;
540             } else {
541                 int32_t length; /* will be 2..4 */
542 
543                 diff=packDiff(diff);
544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
545 
546                 /* write the output character bytes from diff and length */
547                 /* from the first if in the loop we know that targetCapacity>0 */
548                 if(length<=targetCapacity) {
549                     switch(length) {
550                         /* each branch falls through to the next one */
551                     case 4:
552                         *target++=(uint8_t)(diff>>24);
553                         *offsets++=sourceIndex;
554                         U_FALLTHROUGH;
555                     case 3:
556                         *target++=(uint8_t)(diff>>16);
557                         *offsets++=sourceIndex;
558                         U_FALLTHROUGH;
559                     case 2:
560                         *target++=(uint8_t)(diff>>8);
561                         *offsets++=sourceIndex;
562                     /* case 1: handled above */
563                         *target++=(uint8_t)diff;
564                         *offsets++=sourceIndex;
565                         U_FALLTHROUGH;
566                     default:
567                         /* will never occur */
568                         break;
569                     }
570                     targetCapacity-=length;
571                     sourceIndex=nextSourceIndex;
572                 } else {
573                     uint8_t *charErrorBuffer;
574 
575                     /*
576                      * We actually do this backwards here:
577                      * In order to save an intermediate variable, we output
578                      * first to the overflow buffer what does not fit into the
579                      * regular target.
580                      */
581                     /* we know that 1<=targetCapacity<length<=4 */
582                     length-=targetCapacity;
583                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584                     switch(length) {
585                         /* each branch falls through to the next one */
586                     case 3:
587                         *charErrorBuffer++=(uint8_t)(diff>>16);
588                         U_FALLTHROUGH;
589                     case 2:
590                         *charErrorBuffer++=(uint8_t)(diff>>8);
591                         U_FALLTHROUGH;
592                     case 1:
593                         *charErrorBuffer=(uint8_t)diff;
594                         U_FALLTHROUGH;
595                     default:
596                         /* will never occur */
597                         break;
598                     }
599                     cnv->charErrorBufferLength=(int8_t)length;
600 
601                     /* now output what fits into the regular target */
602                     diff>>=8*length; /* length was reduced by targetCapacity */
603                     switch(targetCapacity) {
604                         /* each branch falls through to the next one */
605                     case 3:
606                         *target++=(uint8_t)(diff>>16);
607                         *offsets++=sourceIndex;
608                         U_FALLTHROUGH;
609                     case 2:
610                         *target++=(uint8_t)(diff>>8);
611                         *offsets++=sourceIndex;
612                         U_FALLTHROUGH;
613                     case 1:
614                         *target++=(uint8_t)diff;
615                         *offsets++=sourceIndex;
616                         U_FALLTHROUGH;
617                     default:
618                         /* will never occur */
619                         break;
620                     }
621 
622                     /* target overflow */
623                     targetCapacity=0;
624                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625                     break;
626                 }
627             }
628         } else {
629             /* target is full */
630             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631             break;
632         }
633     }
634 
635     /* set the converter state back into UConverter */
636     cnv->fromUChar32= c<0 ? -c : 0;
637     cnv->fromUnicodeStatus=(uint32_t)prev;
638 
639     /* write back the updated pointers */
640     pArgs->source=source;
641     pArgs->target=(char *)target;
642     pArgs->offsets=offsets;
643 }
644 
645 /*
646  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647  * If a change is made in the original function, then either
648  * change this function the same way or
649  * re-copy the original function and remove the variables
650  * offsets, sourceIndex, and nextSourceIndex.
651  */
652 static void U_CALLCONV
_Bocu1FromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)653 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654                   UErrorCode *pErrorCode) {
655     UConverter *cnv;
656     const UChar *source, *sourceLimit;
657     uint8_t *target;
658     int32_t targetCapacity;
659 
660     int32_t prev, c, diff;
661 
662     /* set up the local pointers */
663     cnv=pArgs->converter;
664     source=pArgs->source;
665     sourceLimit=pArgs->sourceLimit;
666     target=(uint8_t *)pArgs->target;
667     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668 
669     /* get the converter state from UConverter */
670     c=cnv->fromUChar32;
671     prev=(int32_t)cnv->fromUnicodeStatus;
672     if(prev==0) {
673         prev=BOCU1_ASCII_PREV;
674     }
675 
676     /* conversion loop */
677     if(c!=0 && targetCapacity>0) {
678         goto getTrail;
679     }
680 
681 fastSingle:
682     /* fast loop for single-byte differences */
683     /* use only one loop counter variable, targetCapacity, not also source */
684     diff=(int32_t)(sourceLimit-source);
685     if(targetCapacity>diff) {
686         targetCapacity=diff;
687     }
688     while(targetCapacity>0 && (c=*source)<0x3000) {
689         if(c<=0x20) {
690             if(c!=0x20) {
691                 prev=BOCU1_ASCII_PREV;
692             }
693             *target++=(uint8_t)c;
694         } else {
695             diff=c-prev;
696             if(DIFF_IS_SINGLE(diff)) {
697                 prev=BOCU1_SIMPLE_PREV(c);
698                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699             } else {
700                 break;
701             }
702         }
703         ++source;
704         --targetCapacity;
705     }
706     /* restore real values */
707     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708 
709     /* regular loop for all cases */
710     while(source<sourceLimit) {
711         if(targetCapacity>0) {
712             c=*source++;
713 
714             if(c<=0x20) {
715                 /*
716                  * ISO C0 control & space:
717                  * Encode directly for MIME compatibility,
718                  * and reset state except for space, to not disrupt compression.
719                  */
720                 if(c!=0x20) {
721                     prev=BOCU1_ASCII_PREV;
722                 }
723                 *target++=(uint8_t)c;
724                 --targetCapacity;
725                 continue;
726             }
727 
728             if(U16_IS_LEAD(c)) {
729 getTrail:
730                 if(source<sourceLimit) {
731                     /* test the following code unit */
732                     UChar trail=*source;
733                     if(U16_IS_TRAIL(trail)) {
734                         ++source;
735                         c=U16_GET_SUPPLEMENTARY(c, trail);
736                     }
737                 } else {
738                     /* no more input */
739                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
740                     break;
741                 }
742             }
743 
744             /*
745              * all other Unicode code points c==U+0021..U+10ffff
746              * are encoded with the difference c-prev
747              *
748              * a new prev is computed from c,
749              * placed in the middle of a 0x80-block (for most small scripts) or
750              * in the middle of the Unihan and Hangul blocks
751              * to statistically minimize the following difference
752              */
753             diff=c-prev;
754             prev=BOCU1_PREV(c);
755             if(DIFF_IS_SINGLE(diff)) {
756                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757                 --targetCapacity;
758                 if(c<0x3000) {
759                     goto fastSingle;
760                 }
761             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
762                 /* optimize 2-byte case */
763                 int32_t m;
764 
765                 if(diff>=0) {
766                     diff-=BOCU1_REACH_POS_1+1;
767                     m=diff%BOCU1_TRAIL_COUNT;
768                     diff/=BOCU1_TRAIL_COUNT;
769                     diff+=BOCU1_START_POS_2;
770                 } else {
771                     diff-=BOCU1_REACH_NEG_1;
772                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773                     diff+=BOCU1_START_NEG_2;
774                 }
775                 *target++=(uint8_t)diff;
776                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777                 targetCapacity-=2;
778             } else {
779                 int32_t length; /* will be 2..4 */
780 
781                 diff=packDiff(diff);
782                 length=BOCU1_LENGTH_FROM_PACKED(diff);
783 
784                 /* write the output character bytes from diff and length */
785                 /* from the first if in the loop we know that targetCapacity>0 */
786                 if(length<=targetCapacity) {
787                     switch(length) {
788                         /* each branch falls through to the next one */
789                     case 4:
790                         *target++=(uint8_t)(diff>>24);
791                         U_FALLTHROUGH;
792                     case 3:
793                         *target++=(uint8_t)(diff>>16);
794                     /* case 2: handled above */
795                         *target++=(uint8_t)(diff>>8);
796                     /* case 1: handled above */
797                         *target++=(uint8_t)diff;
798                         U_FALLTHROUGH;
799                     default:
800                         /* will never occur */
801                         break;
802                     }
803                     targetCapacity-=length;
804                 } else {
805                     uint8_t *charErrorBuffer;
806 
807                     /*
808                      * We actually do this backwards here:
809                      * In order to save an intermediate variable, we output
810                      * first to the overflow buffer what does not fit into the
811                      * regular target.
812                      */
813                     /* we know that 1<=targetCapacity<length<=4 */
814                     length-=targetCapacity;
815                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816                     switch(length) {
817                         /* each branch falls through to the next one */
818                     case 3:
819                         *charErrorBuffer++=(uint8_t)(diff>>16);
820                         U_FALLTHROUGH;
821                     case 2:
822                         *charErrorBuffer++=(uint8_t)(diff>>8);
823                         U_FALLTHROUGH;
824                     case 1:
825                         *charErrorBuffer=(uint8_t)diff;
826                         U_FALLTHROUGH;
827                     default:
828                         /* will never occur */
829                         break;
830                     }
831                     cnv->charErrorBufferLength=(int8_t)length;
832 
833                     /* now output what fits into the regular target */
834                     diff>>=8*length; /* length was reduced by targetCapacity */
835                     switch(targetCapacity) {
836                         /* each branch falls through to the next one */
837                     case 3:
838                         *target++=(uint8_t)(diff>>16);
839                         U_FALLTHROUGH;
840                     case 2:
841                         *target++=(uint8_t)(diff>>8);
842                         U_FALLTHROUGH;
843                     case 1:
844                         *target++=(uint8_t)diff;
845                         U_FALLTHROUGH;
846                     default:
847                         /* will never occur */
848                         break;
849                     }
850 
851                     /* target overflow */
852                     targetCapacity=0;
853                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854                     break;
855                 }
856             }
857         } else {
858             /* target is full */
859             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860             break;
861         }
862     }
863 
864     /* set the converter state back into UConverter */
865     cnv->fromUChar32= c<0 ? -c : 0;
866     cnv->fromUnicodeStatus=(uint32_t)prev;
867 
868     /* write back the updated pointers */
869     pArgs->source=source;
870     pArgs->target=(char *)target;
871 }
872 
873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874 
875 /**
876  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877  *
878  * @param b lead byte;
879  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880  * @return (diff<<2)|count
881  */
882 static inline int32_t
decodeBocu1LeadByte(int32_t b)883 decodeBocu1LeadByte(int32_t b) {
884     int32_t diff, count;
885 
886     if(b>=BOCU1_START_NEG_2) {
887         /* positive difference */
888         if(b<BOCU1_START_POS_3) {
889             /* two bytes */
890             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
891             count=1;
892         } else if(b<BOCU1_START_POS_4) {
893             /* three bytes */
894             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
895             count=2;
896         } else {
897             /* four bytes */
898             diff=BOCU1_REACH_POS_3+1;
899             count=3;
900         }
901     } else {
902         /* negative difference */
903         if(b>=BOCU1_START_NEG_3) {
904             /* two bytes */
905             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906             count=1;
907         } else if(b>BOCU1_MIN) {
908             /* three bytes */
909             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910             count=2;
911         } else {
912             /* four bytes */
913             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914             count=3;
915         }
916     }
917 
918     /* return the state for decoding the trail byte(s) */
919     return (diff<<2)|count;
920 }
921 
922 /**
923  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
924  *
925  * @param count number of remaining trail bytes including this one
926  * @param b trail byte
927  * @return new delta for diff including b - <0 indicates an error
928  *
929  * @see decodeBocu1
930  */
931 static inline int32_t
decodeBocu1TrailByte(int32_t count,int32_t b)932 decodeBocu1TrailByte(int32_t count, int32_t b) {
933     if(b<=0x20) {
934         /* skip some C0 controls and make the trail byte range contiguous */
935         b=bocu1ByteToTrail[b];
936         /* b<0 for an illegal trail byte value will result in return<0 below */
937 #if BOCU1_MAX_TRAIL<0xff
938     } else if(b>BOCU1_MAX_TRAIL) {
939         return -99;
940 #endif
941     } else {
942         b-=BOCU1_TRAIL_BYTE_OFFSET;
943     }
944 
945     /* add trail byte into difference and decrement count */
946     if(count==1) {
947         return b;
948     } else if(count==2) {
949         return b*BOCU1_TRAIL_COUNT;
950     } else /* count==3 */ {
951         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
952     }
953 }
954 
955 static void U_CALLCONV
_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957                            UErrorCode *pErrorCode) {
958     UConverter *cnv;
959     const uint8_t *source, *sourceLimit;
960     UChar *target;
961     const UChar *targetLimit;
962     int32_t *offsets;
963 
964     int32_t prev, count, diff, c;
965 
966     int8_t byteIndex;
967     uint8_t *bytes;
968 
969     int32_t sourceIndex, nextSourceIndex;
970 
971     /* set up the local pointers */
972     cnv=pArgs->converter;
973     source=(const uint8_t *)pArgs->source;
974     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975     target=pArgs->target;
976     targetLimit=pArgs->targetLimit;
977     offsets=pArgs->offsets;
978 
979     /* get the converter state from UConverter */
980     prev=(int32_t)cnv->toUnicodeStatus;
981     if(prev==0) {
982         prev=BOCU1_ASCII_PREV;
983     }
984     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
985     count=diff&3;
986     diff>>=2;
987 
988     byteIndex=cnv->toULength;
989     bytes=cnv->toUBytes;
990 
991     /* sourceIndex=-1 if the current character began in the previous buffer */
992     sourceIndex=byteIndex==0 ? 0 : -1;
993     nextSourceIndex=0;
994 
995     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996     if(count>0 && byteIndex>0 && target<targetLimit) {
997         goto getTrail;
998     }
999 
1000 fastSingle:
1001     /* fast loop for single-byte differences */
1002     /* use count as the only loop counter variable */
1003     diff=(int32_t)(sourceLimit-source);
1004     count=(int32_t)(pArgs->targetLimit-target);
1005     if(count>diff) {
1006         count=diff;
1007     }
1008     while(count>0) {
1009         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010             c=prev+(c-BOCU1_MIDDLE);
1011             if(c<0x3000) {
1012                 *target++=(UChar)c;
1013                 *offsets++=nextSourceIndex++;
1014                 prev=BOCU1_SIMPLE_PREV(c);
1015             } else {
1016                 break;
1017             }
1018         } else if(c<=0x20) {
1019             if(c!=0x20) {
1020                 prev=BOCU1_ASCII_PREV;
1021             }
1022             *target++=(UChar)c;
1023             *offsets++=nextSourceIndex++;
1024         } else {
1025             break;
1026         }
1027         ++source;
1028         --count;
1029     }
1030     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031 
1032     /* decode a sequence of single and lead bytes */
1033     while(source<sourceLimit) {
1034         if(target>=targetLimit) {
1035             /* target is full */
1036             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037             break;
1038         }
1039 
1040         ++nextSourceIndex;
1041         c=*source++;
1042         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043             /* Write a code point directly from a single-byte difference. */
1044             c=prev+(c-BOCU1_MIDDLE);
1045             if(c<0x3000) {
1046                 *target++=(UChar)c;
1047                 *offsets++=sourceIndex;
1048                 prev=BOCU1_SIMPLE_PREV(c);
1049                 sourceIndex=nextSourceIndex;
1050                 goto fastSingle;
1051             }
1052         } else if(c<=0x20) {
1053             /*
1054              * Direct-encoded C0 control code or space.
1055              * Reset prev for C0 control codes but not for space.
1056              */
1057             if(c!=0x20) {
1058                 prev=BOCU1_ASCII_PREV;
1059             }
1060             *target++=(UChar)c;
1061             *offsets++=sourceIndex;
1062             sourceIndex=nextSourceIndex;
1063             continue;
1064         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065             /* Optimize two-byte case. */
1066             if(c>=BOCU1_MIDDLE) {
1067                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068             } else {
1069                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070             }
1071 
1072             /* trail byte */
1073             ++nextSourceIndex;
1074             c=decodeBocu1TrailByte(1, *source++);
1075             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076                 bytes[0]=source[-2];
1077                 bytes[1]=source[-1];
1078                 byteIndex=2;
1079                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080                 break;
1081             }
1082         } else if(c==BOCU1_RESET) {
1083             /* only reset the state, no code point */
1084             prev=BOCU1_ASCII_PREV;
1085             sourceIndex=nextSourceIndex;
1086             continue;
1087         } else {
1088             /*
1089              * For multi-byte difference lead bytes, set the decoder state
1090              * with the partial difference value from the lead byte and
1091              * with the number of trail bytes.
1092              */
1093             bytes[0]=(uint8_t)c;
1094             byteIndex=1;
1095 
1096             diff=decodeBocu1LeadByte(c);
1097             count=diff&3;
1098             diff>>=2;
1099 getTrail:
1100             for(;;) {
1101                 if(source>=sourceLimit) {
1102                     goto endloop;
1103                 }
1104                 ++nextSourceIndex;
1105                 c=bytes[byteIndex++]=*source++;
1106 
1107                 /* trail byte in any position */
1108                 c=decodeBocu1TrailByte(count, c);
1109                 if(c<0) {
1110                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111                     goto endloop;
1112                 }
1113 
1114                 diff+=c;
1115                 if(--count==0) {
1116                     /* final trail byte, deliver a code point */
1117                     byteIndex=0;
1118                     c=prev+diff;
1119                     if((uint32_t)c>0x10ffff) {
1120                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121                         goto endloop;
1122                     }
1123                     break;
1124                 }
1125             }
1126         }
1127 
1128         /* calculate the next prev and output c */
1129         prev=BOCU1_PREV(c);
1130         if(c<=0xffff) {
1131             *target++=(UChar)c;
1132             *offsets++=sourceIndex;
1133         } else {
1134             /* output surrogate pair */
1135             *target++=U16_LEAD(c);
1136             if(target<targetLimit) {
1137                 *target++=U16_TRAIL(c);
1138                 *offsets++=sourceIndex;
1139                 *offsets++=sourceIndex;
1140             } else {
1141                 /* target overflow */
1142                 *offsets++=sourceIndex;
1143                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144                 cnv->UCharErrorBufferLength=1;
1145                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146                 break;
1147             }
1148         }
1149         sourceIndex=nextSourceIndex;
1150     }
1151 endloop:
1152 
1153     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154         /* set the converter state in UConverter to deal with the next character */
1155         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156         cnv->mode=0;
1157     } else {
1158         /* set the converter state back into UConverter */
1159         cnv->toUnicodeStatus=(uint32_t)prev;
1160         cnv->mode=(diff<<2)|count;
1161     }
1162     cnv->toULength=byteIndex;
1163 
1164     /* write back the updated pointers */
1165     pArgs->source=(const char *)source;
1166     pArgs->target=target;
1167     pArgs->offsets=offsets;
1168     return;
1169 }
1170 
1171 /*
1172  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
1173  * If a change is made in the original function, then either
1174  * change this function the same way or
1175  * re-copy the original function and remove the variables
1176  * offsets, sourceIndex, and nextSourceIndex.
1177  */
1178 static void U_CALLCONV
_Bocu1ToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)1179 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180                 UErrorCode *pErrorCode) {
1181     UConverter *cnv;
1182     const uint8_t *source, *sourceLimit;
1183     UChar *target;
1184     const UChar *targetLimit;
1185 
1186     int32_t prev, count, diff, c;
1187 
1188     int8_t byteIndex;
1189     uint8_t *bytes;
1190 
1191     /* set up the local pointers */
1192     cnv=pArgs->converter;
1193     source=(const uint8_t *)pArgs->source;
1194     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195     target=pArgs->target;
1196     targetLimit=pArgs->targetLimit;
1197 
1198     /* get the converter state from UConverter */
1199     prev=(int32_t)cnv->toUnicodeStatus;
1200     if(prev==0) {
1201         prev=BOCU1_ASCII_PREV;
1202     }
1203     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204     count=diff&3;
1205     diff>>=2;
1206 
1207     byteIndex=cnv->toULength;
1208     bytes=cnv->toUBytes;
1209 
1210     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211     if(count>0 && byteIndex>0 && target<targetLimit) {
1212         goto getTrail;
1213     }
1214 
1215 fastSingle:
1216     /* fast loop for single-byte differences */
1217     /* use count as the only loop counter variable */
1218     diff=(int32_t)(sourceLimit-source);
1219     count=(int32_t)(pArgs->targetLimit-target);
1220     if(count>diff) {
1221         count=diff;
1222     }
1223     while(count>0) {
1224         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1225             c=prev+(c-BOCU1_MIDDLE);
1226             if(c<0x3000) {
1227                 *target++=(UChar)c;
1228                 prev=BOCU1_SIMPLE_PREV(c);
1229             } else {
1230                 break;
1231             }
1232         } else if(c<=0x20) {
1233             if(c!=0x20) {
1234                 prev=BOCU1_ASCII_PREV;
1235             }
1236             *target++=(UChar)c;
1237         } else {
1238             break;
1239         }
1240         ++source;
1241         --count;
1242     }
1243 
1244     /* decode a sequence of single and lead bytes */
1245     while(source<sourceLimit) {
1246         if(target>=targetLimit) {
1247             /* target is full */
1248             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249             break;
1250         }
1251 
1252         c=*source++;
1253         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254             /* Write a code point directly from a single-byte difference. */
1255             c=prev+(c-BOCU1_MIDDLE);
1256             if(c<0x3000) {
1257                 *target++=(UChar)c;
1258                 prev=BOCU1_SIMPLE_PREV(c);
1259                 goto fastSingle;
1260             }
1261         } else if(c<=0x20) {
1262             /*
1263              * Direct-encoded C0 control code or space.
1264              * Reset prev for C0 control codes but not for space.
1265              */
1266             if(c!=0x20) {
1267                 prev=BOCU1_ASCII_PREV;
1268             }
1269             *target++=(UChar)c;
1270             continue;
1271         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1272             /* Optimize two-byte case. */
1273             if(c>=BOCU1_MIDDLE) {
1274                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1275             } else {
1276                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1277             }
1278 
1279             /* trail byte */
1280             c=decodeBocu1TrailByte(1, *source++);
1281             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1282                 bytes[0]=source[-2];
1283                 bytes[1]=source[-1];
1284                 byteIndex=2;
1285                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286                 break;
1287             }
1288         } else if(c==BOCU1_RESET) {
1289             /* only reset the state, no code point */
1290             prev=BOCU1_ASCII_PREV;
1291             continue;
1292         } else {
1293             /*
1294              * For multi-byte difference lead bytes, set the decoder state
1295              * with the partial difference value from the lead byte and
1296              * with the number of trail bytes.
1297              */
1298             bytes[0]=(uint8_t)c;
1299             byteIndex=1;
1300 
1301             diff=decodeBocu1LeadByte(c);
1302             count=diff&3;
1303             diff>>=2;
1304 getTrail:
1305             for(;;) {
1306                 if(source>=sourceLimit) {
1307                     goto endloop;
1308                 }
1309                 c=bytes[byteIndex++]=*source++;
1310 
1311                 /* trail byte in any position */
1312                 c=decodeBocu1TrailByte(count, c);
1313                 if(c<0) {
1314                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315                     goto endloop;
1316                 }
1317 
1318                 diff+=c;
1319                 if(--count==0) {
1320                     /* final trail byte, deliver a code point */
1321                     byteIndex=0;
1322                     c=prev+diff;
1323                     if((uint32_t)c>0x10ffff) {
1324                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325                         goto endloop;
1326                     }
1327                     break;
1328                 }
1329             }
1330         }
1331 
1332         /* calculate the next prev and output c */
1333         prev=BOCU1_PREV(c);
1334         if(c<=0xffff) {
1335             *target++=(UChar)c;
1336         } else {
1337             /* output surrogate pair */
1338             *target++=U16_LEAD(c);
1339             if(target<targetLimit) {
1340                 *target++=U16_TRAIL(c);
1341             } else {
1342                 /* target overflow */
1343                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344                 cnv->UCharErrorBufferLength=1;
1345                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346                 break;
1347             }
1348         }
1349     }
1350 endloop:
1351 
1352     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353         /* set the converter state in UConverter to deal with the next character */
1354         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355         cnv->mode=0;
1356     } else {
1357         /* set the converter state back into UConverter */
1358         cnv->toUnicodeStatus=(uint32_t)prev;
1359         cnv->mode=(diff<<2)|count;
1360     }
1361     cnv->toULength=byteIndex;
1362 
1363     /* write back the updated pointers */
1364     pArgs->source=(const char *)source;
1365     pArgs->target=target;
1366     return;
1367 }
1368 
1369 /* miscellaneous ------------------------------------------------------------ */
1370 
1371 static const UConverterImpl _Bocu1Impl={
1372     UCNV_BOCU1,
1373 
1374     NULL,
1375     NULL,
1376 
1377     NULL,
1378     NULL,
1379     NULL,
1380 
1381     _Bocu1ToUnicode,
1382     _Bocu1ToUnicodeWithOffsets,
1383     _Bocu1FromUnicode,
1384     _Bocu1FromUnicodeWithOffsets,
1385     NULL,
1386 
1387     NULL,
1388     NULL,
1389     NULL,
1390     NULL,
1391     ucnv_getCompleteUnicodeSet,
1392 
1393     NULL,
1394     NULL
1395 };
1396 
1397 static const UConverterStaticData _Bocu1StaticData={
1398     sizeof(UConverterStaticData),
1399     "BOCU-1",
1400     1214, /* CCSID for BOCU-1 */
1401     UCNV_IBM, UCNV_BOCU1,
1402     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1403     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
1404     FALSE, FALSE,
1405     0,
1406     0,
1407     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1408 };
1409 
1410 const UConverterSharedData _Bocu1Data=
1411         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
1412 
1413 #endif
1414