• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2002-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  bocu1tst.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002may27
14 *   created by: Markus W. Scherer
15 *
16 *   This is the reference implementation of BOCU-1,
17 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
18 *   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
19 *   The files bocu1.h and bocu1.c from the design folder are taken
20 *   verbatim (minus copyright and #include) and copied together into this file.
21 *   The reference code and some of the reference bocu1tst.c
22 *   is modified to run as part of the ICU cintltst
23 *   test framework (minus main(), log_ln() etc. instead of printf()).
24 *
25 *   This reference implementation is used here to verify
26 *   the ICU BOCU-1 implementation, which is
27 *   adapted for ICU conversion APIs and optimized.
28 *   ### links in design doc to here and to ucnvbocu.c
29 */
30 
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/ucnv.h"
34 #include "cmemory.h"
35 #include "cintltst.h"
36 
/*
 * Number of elements in an array object.
 * Only meaningful for true arrays: applied to a pointer (including an
 * array-typed function parameter) it divides the pointer size instead.
 */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
38 
39 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
40 
41 /* BOCU-1 constants and macros ---------------------------------------------- */
42 
43 /*
44  * BOCU-1 encodes the code points of a Unicode string as
45  * a sequence of byte-encoded differences (slope detection),
46  * preserving lexical order.
47  *
48  * Optimize the difference-taking for runs of Unicode text within
49  * small scripts:
50  *
51  * Most small scripts are allocated within aligned 128-blocks of Unicode
52  * code points. Lexical order is preserved if the "previous code point" state
53  * is always moved into the middle of such a block.
54  *
55  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
56  * areas into the middle of those areas.
57  *
58  * C0 control codes and space are encoded with their US-ASCII bytes.
59  * "prev" is reset for C0 controls but not for space.
60  */
61 
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/*
 * bounding byte values for differences:
 * BOCU1_MIN is the lowest lead byte, BOCU1_MIDDLE is the single byte for a
 * difference of 0, BOCU1_MAX_LEAD is the highest lead byte
 */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
/* in lead position, this byte only resets the decoder state (no code point) */
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
/* subtracted from a trail byte >0x20 to get its trail value (0x21-20 == 13) */
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243); type long because of BOCU1_MAX_TRAIL */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values for positive differences: 0xd0, 0xfb, 0xfe. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */

/*
 * The lead byte start values for negative differences: 0x50, 0x25, 0x22.
 * Each is the lowest lead byte of the next-shorter length, so the lead bytes
 * of an n-byte negative sequence are those below BOCU1_START_NEG_n.
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* BOCU1_START_NEG_4==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The argument is evaluated several times; pass an expression without side effects.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length (0..3) in the top byte;
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 * A packed value of 0 therefore yields length 0.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
129 
130 /*
131  * 12 commonly used C0 control codes (and space) are only used to encode
132  * themselves directly,
133  * which makes BOCU-1 MIME-usable and reasonably safe for
134  * ASCII-oriented software.
135  *
136  * These controls are
137  *  0   NUL
138  *
139  *  7   BEL
140  *  8   BS
141  *
142  *  9   TAB
143  *  a   LF
144  *  b   VT
145  *  c   FF
146  *  d   CR
147  *
148  *  e   SO
149  *  f   SI
150  *
151  * 1a   SUB
152  * 1b   ESC
153  *
154  * The other 20 C0 controls are also encoded directly (to preserve order)
155  * but are also used as trail bytes in difference encoding
156  * (for better compression).
157  */
/*
 * Map a trail value t (0..BOCU1_TRAIL_COUNT-1) to its external byte value:
 * values 0..19 go through the bocu1TrailToByte[] table of C0 control bytes,
 * larger values are shifted up by BOCU1_TRAIL_BYTE_OFFSET into 0x21..0xff.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
 * This is the inverse of bocu1TrailToByte[] below.
 */
static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/*  0     1     2     3     4     5     6     7    */
    -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

/*  8     9     a     b     c     d     e     f    */
    -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,

/*  10    11    12    13    14    15    16    17   */
    0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

/*  18    19    1a    1b    1c    1d    1e    1f   */
    0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,

/*  20   */
    -1
};

/*
 * Byte value map for control codes,
 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
 * to external byte values 0x00..0x20.
 * The entries are strictly increasing, so trail value order equals byte order.
 */
static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/*  0     1     2     3     4     5     6     7    */
    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

/*  8     9     a     b     c     d     e     f    */
    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

/*  10    11    12    13   */
    0x1c, 0x1d, 0x1e, 0x1f
};
200 
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0
 * (floor division instead of truncation toward zero).
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that "NEGDIVMOD(...);" behaves
 * as one statement, including as the unbraced branch of an if/else.
 * n, d and m are each evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
222 
/*
 * State for the BOCU-1 decoder function.
 * All-zero is the accepted initial state; decodeBocu1() replaces a prev of 0
 * with the real initial value.
 */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state that differences are added to */
    int32_t count;  /* number of trail bytes still expected; 0 in lead position */
    int32_t diff;   /* partial difference accumulated from lead and trail bytes */
} Bocu1Rx;
229 
230 /* Function prototypes ------------------------------------------------------ */
231 
/* see bocu1.c */

/* Pack a difference value into the 1..4 bytes of its BOCU-1 encoding. */
U_CFUNC int32_t
packDiff(int32_t diff);

/* Encode one code point relative to *pPrev and update *pPrev; returns 0 on bad arguments. */
U_CFUNC int32_t
encodeBocu1(int32_t *pPrev, int32_t c);

/* Feed one byte to the decoder: returns a code point, -1 (state change only) or <-1 (error). */
U_CFUNC int32_t
decodeBocu1(Bocu1Rx *pRx, uint8_t b);
241 
242 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
243 
244 /* BOCU-1 implementation functions ------------------------------------------ */
245 
246 /**
247  * Compute the next "previous" value for differencing
248  * from the current code point.
249  *
250  * @param c current code point, 0..0x10ffff
251  * @return "previous code point" state value
252  */
253 static U_INLINE int32_t
bocu1Prev(int32_t c)254 bocu1Prev(int32_t c) {
255     /* compute new prev */
256     if(0x3040<=c && c<=0x309f) {
257         /* Hiragana is not 128-aligned */
258         return 0x3070;
259     } else if(0x4e00<=c && c<=0x9fa5) {
260         /* CJK Unihan */
261         return 0x4e00-BOCU1_REACH_NEG_2;
262     } else if(0xac00<=c && c<=0xd7a3) {
263         /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
264         return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
265     } else {
266         /* mostly small scripts */
267         return (c&~0x7f)+BOCU1_ASCII_PREV;
268     }
269 }
270 
271 /**
272  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
273  * and return a packed integer with them.
274  *
275  * The encoding favors small absolut differences with short encodings
276  * to compress runs of same-script characters.
277  *
278  * @param diff difference value -0x10ffff..0x10ffff
279  * @return
280  *      0x010000zz for 1-byte sequence zz
281  *      0x0200yyzz for 2-byte sequence yy zz
282  *      0x03xxyyzz for 3-byte sequence xx yy zz
283  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
284  */
285 U_CFUNC int32_t
packDiff(int32_t diff)286 packDiff(int32_t diff) {
287     int32_t result, m, lead, count, shift;
288 
289     if(diff>=BOCU1_REACH_NEG_1) {
290         /* mostly positive differences, and single-byte negative ones */
291         if(diff<=BOCU1_REACH_POS_1) {
292             /* single byte */
293             return 0x01000000|(BOCU1_MIDDLE+diff);
294         } else if(diff<=BOCU1_REACH_POS_2) {
295             /* two bytes */
296             diff-=BOCU1_REACH_POS_1+1;
297             lead=BOCU1_START_POS_2;
298             count=1;
299         } else if(diff<=BOCU1_REACH_POS_3) {
300             /* three bytes */
301             diff-=BOCU1_REACH_POS_2+1;
302             lead=BOCU1_START_POS_3;
303             count=2;
304         } else {
305             /* four bytes */
306             diff-=BOCU1_REACH_POS_3+1;
307             lead=BOCU1_START_POS_4;
308             count=3;
309         }
310     } else {
311         /* two- and four-byte negative differences */
312         if(diff>=BOCU1_REACH_NEG_2) {
313             /* two bytes */
314             diff-=BOCU1_REACH_NEG_1;
315             lead=BOCU1_START_NEG_2;
316             count=1;
317         } else if(diff>=BOCU1_REACH_NEG_3) {
318             /* three bytes */
319             diff-=BOCU1_REACH_NEG_2;
320             lead=BOCU1_START_NEG_3;
321             count=2;
322         } else {
323             /* four bytes */
324             diff-=BOCU1_REACH_NEG_3;
325             lead=BOCU1_START_NEG_4;
326             count=3;
327         }
328     }
329 
330     /* encode the length of the packed result */
331     if(count<3) {
332         result=(count+1)<<24;
333     } else /* count==3, MSB used for the lead byte */ {
334         result=0;
335     }
336 
337     /* calculate trail bytes like digits in itoa() */
338     shift=0;
339     do {
340         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
341         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
342         shift+=8;
343     } while(--count>0);
344 
345     /* add lead byte */
346     result|=(lead+diff)<<shift;
347 
348     return result;
349 }
350 
351 /**
352  * BOCU-1 encoder function.
353  *
354  * @param pPrev pointer to the integer that holds
355  *        the "previous code point" state;
356  *        the initial value should be 0 which
357  *        encodeBocu1 will set to the actual BOCU-1 initial state value
358  * @param c the code point to encode
359  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
360  *         or 0 if an error occurs
361  *
362  * @see packDiff
363  */
364 U_CFUNC int32_t
encodeBocu1(int32_t * pPrev,int32_t c)365 encodeBocu1(int32_t *pPrev, int32_t c) {
366     int32_t prev;
367 
368     if(pPrev==NULL || c<0 || c>0x10ffff) {
369         /* illegal argument */
370         return 0;
371     }
372 
373     prev=*pPrev;
374     if(prev==0) {
375         /* lenient handling of initial value 0 */
376         prev=*pPrev=BOCU1_ASCII_PREV;
377     }
378 
379     if(c<=0x20) {
380         /*
381          * ISO C0 control & space:
382          * Encode directly for MIME compatibility,
383          * and reset state except for space, to not disrupt compression.
384          */
385         if(c!=0x20) {
386             *pPrev=BOCU1_ASCII_PREV;
387         }
388         return 0x01000000|c;
389     }
390 
391     /*
392      * all other Unicode code points c==U+0021..U+10ffff
393      * are encoded with the difference c-prev
394      *
395      * a new prev is computed from c,
396      * placed in the middle of a 0x80-block (for most small scripts) or
397      * in the middle of the Unihan and Hangul blocks
398      * to statistically minimize the following difference
399      */
400     *pPrev=bocu1Prev(c);
401     return packDiff(c-prev);
402 }
403 
404 /**
405  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
406  *
407  * @param pRx pointer to the decoder state structure
408  * @param b lead byte;
409  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
410  * @return -1 (state change only)
411  *
412  * @see decodeBocu1
413  */
414 static int32_t
decodeBocu1LeadByte(Bocu1Rx * pRx,uint8_t b)415 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
416     int32_t c, count;
417 
418     if(b>=BOCU1_START_NEG_2) {
419         /* positive difference */
420         if(b<BOCU1_START_POS_3) {
421             /* two bytes */
422             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
423             count=1;
424         } else if(b<BOCU1_START_POS_4) {
425             /* three bytes */
426             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
427             count=2;
428         } else {
429             /* four bytes */
430             c=BOCU1_REACH_POS_3+1;
431             count=3;
432         }
433     } else {
434         /* negative difference */
435         if(b>=BOCU1_START_NEG_3) {
436             /* two bytes */
437             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
438             count=1;
439         } else if(b>BOCU1_MIN) {
440             /* three bytes */
441             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
442             count=2;
443         } else {
444             /* four bytes */
445             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
446             count=3;
447         }
448     }
449 
450     /* set the state for decoding the trail byte(s) */
451     pRx->diff=c;
452     pRx->count=count;
453     return -1;
454 }
455 
456 /**
457  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
458  *
459  * @param pRx pointer to the decoder state structure
460  * @param b trail byte
461  * @return result value, same as decodeBocu1
462  *
463  * @see decodeBocu1
464  */
465 static int32_t
decodeBocu1TrailByte(Bocu1Rx * pRx,uint8_t b)466 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
467     int32_t t, c, count;
468 
469     if(b<=0x20) {
470         /* skip some C0 controls and make the trail byte range contiguous */
471         t=bocu1ByteToTrail[b];
472         if(t<0) {
473             /* illegal trail byte value */
474             pRx->prev=BOCU1_ASCII_PREV;
475             pRx->count=0;
476             return -99;
477         }
478 #if BOCU1_MAX_TRAIL<0xff
479     } else if(b>BOCU1_MAX_TRAIL) {
480         return -99;
481 #endif
482     } else {
483         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
484     }
485 
486     /* add trail byte into difference and decrement count */
487     c=pRx->diff;
488     count=pRx->count;
489 
490     if(count==1) {
491         /* final trail byte, deliver a code point */
492         c=pRx->prev+c+t;
493         if(0<=c && c<=0x10ffff) {
494             /* valid code point result */
495             pRx->prev=bocu1Prev(c);
496             pRx->count=0;
497             return c;
498         } else {
499             /* illegal code point result */
500             pRx->prev=BOCU1_ASCII_PREV;
501             pRx->count=0;
502             return -99;
503         }
504     }
505 
506     /* intermediate trail byte */
507     if(count==2) {
508         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
509     } else /* count==3 */ {
510         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
511     }
512     pRx->count=count-1;
513     return -1;
514 }
515 
516 /**
517  * BOCU-1 decoder function.
518  *
519  * @param pRx pointer to the decoder state structure;
520  *        the initial values should be 0 which
521  *        decodeBocu1 will set to actual initial state values
522  * @param b an input byte
523  * @return
524  *      0..0x10ffff for a result code point
525  *      -1 if only the state changed without code point output
526  *     <-1 if an error occurs
527  */
528 U_CFUNC int32_t
decodeBocu1(Bocu1Rx * pRx,uint8_t b)529 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
530     int32_t prev, c, count;
531 
532     if(pRx==NULL) {
533         /* illegal argument */
534         return -99;
535     }
536 
537     prev=pRx->prev;
538     if(prev==0) {
539         /* lenient handling of initial 0 values */
540         prev=pRx->prev=BOCU1_ASCII_PREV;
541         count=pRx->count=0;
542     } else {
543         count=pRx->count;
544     }
545 
546     if(count==0) {
547         /* byte in lead position */
548         if(b<=0x20) {
549             /*
550              * Direct-encoded C0 control code or space.
551              * Reset prev for C0 control codes but not for space.
552              */
553             if(b!=0x20) {
554                 pRx->prev=BOCU1_ASCII_PREV;
555             }
556             return b;
557         }
558 
559         /*
560          * b is a difference lead byte.
561          *
562          * Return a code point directly from a single-byte difference.
563          *
564          * For multi-byte difference lead bytes, set the decoder state
565          * with the partial difference value from the lead byte and
566          * with the number of trail bytes.
567          *
568          * For four-byte differences, the signedness also affects the
569          * first trail byte, which has special handling farther below.
570          */
571         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
572             /* single-byte difference */
573             c=prev+((int32_t)b-BOCU1_MIDDLE);
574             pRx->prev=bocu1Prev(c);
575             return c;
576         } else if(b==BOCU1_RESET) {
577             /* only reset the state, no code point */
578             pRx->prev=BOCU1_ASCII_PREV;
579             return -1;
580         } else {
581             return decodeBocu1LeadByte(pRx, b);
582         }
583     } else {
584         /* trail byte in any position */
585         return decodeBocu1TrailByte(pRx, b);
586     }
587 }
588 
589 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
590 
591 /* test code ---------------------------------------------------------------- */
592 
593 /* test code options */
594 
/*
 * ignore comma when processing name lists in testText()
 * NOTE(review): testText() and any use of this option are not in the visible
 * part of this file - confirm whether the define is still needed.
 */
#define TEST_IGNORE_COMMA       1
597 
/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * most significant (lead) byte first, without overflow check.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array; must have room for up to 4 bytes
 * @return number of bytes written (0 if the packed length is 0)
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* emit bytes from bit position 8*(count-1) down to 0 */
    for(shift=8*(count-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return count;
}
627 
628 /**
629  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
630  * the difference to initialPrev.
631  * Used only for round-trip testing of the difference encoding and decoding.
632  * Test function.
633  *
634  * @param initialPrev bogus "previous code point" value to make sure that
635  *                    the resulting code point is in the range 0..0x10ffff
636  * @param packed packed BOCU-1 byte sequence
637  * @return the difference to initialPrev
638  *
639  * @see packDiff
640  * @see writeDiff
641  */
642 static int32_t
unpackDiff(int32_t initialPrev,int32_t packed)643 unpackDiff(int32_t initialPrev, int32_t packed) {
644     Bocu1Rx rx={ 0, 0, 0 };
645     int32_t count;
646 
647     rx.prev=initialPrev;
648     count=BOCU1_LENGTH_FROM_PACKED(packed);
649     switch(count) {
650     case 4:
651         decodeBocu1(&rx, (uint8_t)(packed>>24));
652     case 3:
653         decodeBocu1(&rx, (uint8_t)(packed>>16));
654     case 2:
655         decodeBocu1(&rx, (uint8_t)(packed>>8));
656     case 1:
657         /* subtract initial prev */
658         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
659     default:
660         return -0x7fffffff;
661     }
662 }
663 
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding and reports
 * a mismatch with log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* the casts make the variadic arguments match the %ld/%lx conversions */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}
699 
700 /**
701  * Encode a UTF-16 string in BOCU-1.
702  * Does not check for overflows, but otherwise useful function.
703  *
704  * @param s input UTF-16 string
705  * @param length number of UChar code units in s
706  * @param p pointer to output byte array
707  * @return number of bytes output
708  */
709 static int32_t
writeString(const UChar * s,int32_t length,uint8_t * p)710 writeString(const UChar *s, int32_t length, uint8_t *p) {
711     uint8_t *p0;
712     int32_t c, prev, i;
713 
714     prev=0;
715     p0=p;
716     i=0;
717     while(i<length) {
718         UTF_NEXT_CHAR(s, i, length, c);
719         p+=writePacked(encodeBocu1(&prev, c), p);
720     }
721     return (int32_t)(p-p0);
722 }
723 
724 /**
725  * Decode a BOCU-1 byte sequence to a UTF-16 string.
726  * Does not check for overflows, but otherwise useful function.
727  *
728  * @param p pointer to input BOCU-1 bytes
729  * @param length number of input bytes
730  * @param s point to output UTF-16 string array
731  * @return number of UChar code units output
732  */
733 static int32_t
readString(const uint8_t * p,int32_t length,UChar * s)734 readString(const uint8_t *p, int32_t length, UChar *s) {
735     Bocu1Rx rx={ 0, 0, 0 };
736     int32_t c, i, sLength;
737 
738     i=sLength=0;
739     while(i<length) {
740         c=decodeBocu1(&rx, p[i++]);
741         if(c<-1) {
742             log_err("error: readString detects encoding error at string index %ld\n", i);
743             return -1;
744         }
745         if(c>=0) {
746             UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
747         }
748     }
749     return sLength;
750 }
751 
752 static U_INLINE char
hexDigit(uint8_t digit)753 hexDigit(uint8_t digit) {
754     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
755 }
756 
/**
 * Pretty-print 0-terminated byte values as " xx" hex pairs and pad the
 * result with spaces to the width of 5 bytes (15 characters).
 * Helper function for test output. Does not check for overflow of out.
 *
 * @param bytes 0-terminated byte array to print
 * @param out output buffer for the 0-terminated text
 */
static void
printBytes(uint8_t *bytes, char *out) {
    int printed=0, padding;
    uint8_t b;

    for(b=*bytes++; b!=0; b=*bytes++) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++printed;
    }

    /* three characters per missing byte, so that columns line up */
    for(padding=3*(5-printed); padding>0; --padding) {
        *out++=' ';
    }
    *out=0;
}
782 
783 /**
784  * Basic BOCU-1 test function, called when there are no command line arguments.
785  * Prints some of the #define values and performs round-trip tests of the
786  * difference encoding and decoding.
787  */
788 static void
TestBOCU1RefDiff(void)789 TestBOCU1RefDiff(void) {
790     char buf1[80], buf2[80];
791     uint8_t prev[5], level[5];
792     int32_t i, cmp, countErrors;
793 
794     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
795     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
796     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
797 
798     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
799     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
800     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
801 
802     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
803     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
804     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
805 
806     /* test packDiff() & unpackDiff() with some specific values */
807     writeDiff(0, level);
808     writeDiff(1, level);
809     writeDiff(65, level);
810     writeDiff(130, level);
811     writeDiff(30000, level);
812     writeDiff(1000000, level);
813     writeDiff(-65, level);
814     writeDiff(-130, level);
815     writeDiff(-30000, level);
816     writeDiff(-1000000, level);
817 
818     /* test that each value is smaller than any following one */
819     countErrors=0;
820     i=-0x10ffff;
821     *writeDiff(i, prev)=0;
822 
823     /* show first number and bytes */
824     printBytes(prev, buf1);
825     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
826 
827     for(++i; i<=0x10ffff; ++i) {
828         *writeDiff(i, level)=0;
829         cmp=strcmp((const char *)prev, (const char *)level);
830         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
831             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
832                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
833         }
834         if(cmp<0) {
835             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
836                 /*
837                  * if the result is good, then print only if the length changed
838                  * to get little but interesting output
839                  */
840                 printBytes(prev, buf1);
841                 printBytes(level, buf2);
842                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
843             }
844         } else {
845             ++countErrors;
846             printBytes(prev, buf1);
847             printBytes(level, buf2);
848             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
849         }
850         /* remember the previous bytes */
851         memcpy(prev, level, 4);
852     }
853 
854     /* show last number and bytes */
855     printBytes((uint8_t *)"", buf1);
856     printBytes(prev, buf2);
857     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
858 
859     if(countErrors==0) {
860         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
861     } else {
862         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
863     }
864 
865     /* output signature byte sequence */
866     i=0;
867     writePacked(encodeBocu1(&i, 0xfeff), level);
868     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
869             level[0], level[1], level[2]);
870 }
871 
872 /* cintltst code ------------------------------------------------------------ */
873 
/*
 * Capacity of each scratch buffer allocated by roundtripBOCU1():
 * bytes for the BOCU-1 buffers, UChars for the UTF-16 buffers.
 */
static const int32_t DEFAULT_BUFFER_SIZE = 30000;
875 
876 
877 /* test one string with the ICU and the reference BOCU-1 implementations */
878 static void
roundtripBOCU1(UConverter * bocu1,int32_t number,const UChar * text,int32_t length)879 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
880     UChar *roundtripRef, *roundtripICU;
881     char *bocu1Ref, *bocu1ICU;
882 
883     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
884     UErrorCode errorCode;
885 
886     roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
887     roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
888     bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
889     bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
890 
891     /* Unicode -> BOCU-1 */
892     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
893 
894     errorCode=U_ZERO_ERROR;
895     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
896     if(U_FAILURE(errorCode)) {
897         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
898         return;
899     }
900 
901     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
902         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
903         return;
904     }
905 
906     /* BOCU-1 -> Unicode */
907     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
908     if(roundtripRefLength<0) {
909         free(roundtripICU);
910         return; /* readString() found an error and reported it */
911     }
912 
913     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
914     if(U_FAILURE(errorCode)) {
915         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
916         return;
917     }
918 
919     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
920         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
921         return;
922     }
923     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
924         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
925         return;
926     }
927     free(roundtripRef);
928     free(roundtripICU);
929     free(bocu1Ref);
930     free(bocu1ICU);
931 }
932 
933 static const UChar feff[]={ 0xfeff };
934 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
935 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
936 static const UChar nul[]={ 0 };
937 static const UChar latin[]={ 0xdf, 0xe6 };
938 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
939 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
940 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
941 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
942 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
943 static const UChar plane1[]={ 0xd800, 0xdc00 };
944 static const UChar plane2[]={ 0xd845, 0xdddd };
945 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
946 static const UChar plane16[]={ 0xdbff, 0xdfff };
947 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
948 
949 static const struct {
950     const UChar *s;
951     int32_t length;
952 } strings[]={
953     { feff,         LENGTHOF(feff) },
954     { ascii,        LENGTHOF(ascii) },
955     { crlf,         LENGTHOF(crlf) },
956     { nul,          LENGTHOF(nul) },
957     { latin,        LENGTHOF(latin) },
958     { devanagari,   LENGTHOF(devanagari) },
959     { hiragana,     LENGTHOF(hiragana) },
960     { unihan,       LENGTHOF(unihan) },
961     { hangul,       LENGTHOF(hangul) },
962     { surrogates,   LENGTHOF(surrogates) },
963     { plane1,       LENGTHOF(plane1) },
964     { plane2,       LENGTHOF(plane2) },
965     { plane15,      LENGTHOF(plane15) },
966     { plane16,      LENGTHOF(plane16) },
967     { c0,           LENGTHOF(c0) }
968 };
969 
970 /*
971  * Verify that the ICU BOCU-1 implementation produces the same results as
972  * the reference implementation from the design folder.
973  * Generate some texts and convert them with both converters, verifying
974  * identical results and roundtripping.
975  */
976 static void
TestBOCU1(void)977 TestBOCU1(void) {
978     UChar *text;
979     int32_t i, length;
980 
981     UConverter *bocu1;
982     UErrorCode errorCode;
983 
984     errorCode=U_ZERO_ERROR;
985     bocu1=ucnv_open("BOCU-1", &errorCode);
986     if(U_FAILURE(errorCode)) {
987         log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
988         return;
989     }
990 
991     text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
992 
993     /* text 1: each of strings[] once */
994     length=0;
995     for(i=0; i<LENGTHOF(strings); ++i) {
996         u_memcpy(text+length, strings[i].s, strings[i].length);
997         length+=strings[i].length;
998     }
999     roundtripBOCU1(bocu1, 1, text, length);
1000 
1001     /* text 2: each of strings[] twice */
1002     length=0;
1003     for(i=0; i<LENGTHOF(strings); ++i) {
1004         u_memcpy(text+length, strings[i].s, strings[i].length);
1005         length+=strings[i].length;
1006         u_memcpy(text+length, strings[i].s, strings[i].length);
1007         length+=strings[i].length;
1008     }
1009     roundtripBOCU1(bocu1, 2, text, length);
1010 
1011     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1012     length=0;
1013     for(i=1; length<5000; i+=7) {
1014         if(i>=LENGTHOF(strings)) {
1015             i-=LENGTHOF(strings);
1016         }
1017         u_memcpy(text+length, strings[i].s, strings[i].length);
1018         length+=strings[i].length;
1019     }
1020     roundtripBOCU1(bocu1, 3, text, length);
1021 
1022     ucnv_close(bocu1);
1023     free(text);
1024 }
1025 
1026 U_CFUNC void addBOCU1Tests(TestNode** root);
1027 
1028 U_CFUNC void
addBOCU1Tests(TestNode ** root)1029 addBOCU1Tests(TestNode** root) {
1030     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1031     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1032 }
1033