• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2002-2015, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  bocu1tst.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002may27
14 *   created by: Markus W. Scherer
15 *
16 *   This is the reference implementation of BOCU-1,
17 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
18 *   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
19 *   The files bocu1.h and bocu1.c from the design folder are taken
20 *   verbatim (minus copyright and #include) and copied together into this file.
21 *   The reference code and some of the reference bocu1tst.c
22 *   is modified to run as part of the ICU cintltst
23 *   test framework (minus main(), log_ln() etc. instead of printf()).
24 *
25 *   This reference implementation is used here to verify
26 *   the ICU BOCU-1 implementation, which is
27 *   adapted for ICU conversion APIs and optimized.
28 *   ### links in design doc to here and to ucnvbocu.c
29 */
30 
31 #include "unicode/utypes.h"
32 #include "unicode/ustring.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/utf16.h"
35 #include "cmemory.h"
36 #include "cintltst.h"
37 
38 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
39 
40 /* BOCU-1 constants and macros ---------------------------------------------- */
41 
42 /*
43  * BOCU-1 encodes the code points of a Unicode string as
44  * a sequence of byte-encoded differences (slope detection),
45  * preserving lexical order.
46  *
47  * Optimize the difference-taking for runs of Unicode text within
48  * small scripts:
49  *
50  * Most small scripts are allocated within aligned 128-blocks of Unicode
51  * code points. Lexical order is preserved if the "previous code point" state
52  * is always moved into the middle of such a block.
53  *
54  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
55  * areas into the middle of those areas.
56  *
57  * C0 control codes and space are encoded with their US-ASCII bytes.
58  * "prev" is reset for C0 controls but not for space.
59  */
60 
/* Start value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV        0x40

/* Byte values that bound the difference encoding. */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/*
 * BOCU1_MAX_TRAIL carries an L suffix so that arithmetic involving it
 * is done in long and therefore also works on 16-bit compilers.
 */
#define BOCU1_MAX_TRAIL         0xffL
#define BOCU1_RESET             0xff

/* How many lead byte values exist. */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many trail byte values exist. */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of single-byte codes on the positive side and on the negative side;
 * the zero difference (byte BOCU1_MIDDLE) is counted as positive.
 */
#define BOCU1_SINGLE            64

/* Lead bytes available per sign for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* Largest and smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* Largest and smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest and smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3   (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* Lower lead byte boundary of each negative range. */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * Sequence length (1..4) implied by a lead byte other than BOCU1_RESET.
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length stored in a packed value (see packDiff()):
 * the top byte holds the length for 1..3 bytes; anything larger is a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
128 
129 /*
130  * 12 commonly used C0 control codes (and space) are only used to encode
131  * themselves directly,
132  * which makes BOCU-1 MIME-usable and reasonably safe for
133  * ASCII-oriented software.
134  *
135  * These controls are
136  *  0   NUL
137  *
138  *  7   BEL
139  *  8   BS
140  *
141  *  9   TAB
142  *  a   LF
143  *  b   VT
144  *  c   FF
145  *  d   CR
146  *
147  *  e   SO
148  *  f   SI
149  *
150  * 1a   SUB
151  * 1b   ESC
152  *
153  * The other 20 C0 controls are also encoded directly (to preserve order)
154  * but are also used as trail bytes in difference encoding
155  * (for better compression).
156  */
/* Map a trail value to its byte: values below the control count go through the table. */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? \
        (t)+BOCU1_TRAIL_BYTE_OFFSET : \
        bocu1TrailToByte[t])
158 
159 /*
160  * Byte value map for control codes,
161  * from external byte values 0x00..0x20
162  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
163  * External byte values that are illegal as trail bytes are mapped to -1.
164  */
165 static const int8_t
166 bocu1ByteToTrail[BOCU1_MIN]={
167 /*  0     1     2     3     4     5     6     7    */
168     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
169 
170 /*  8     9     a     b     c     d     e     f    */
171     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
172 
173 /*  10    11    12    13    14    15    16    17   */
174     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
175 
176 /*  18    19    1a    1b    1c    1d    1e    1f   */
177     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
178 
179 /*  20   */
180     -1
181 };
182 
183 /*
184  * Byte value map for control codes,
185  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
186  * to external byte values 0x00..0x20.
187  */
188 static const int8_t
189 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
190 /*  0     1     2     3     4     5     6     7    */
191     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
192 
193 /*  8     9     a     b     c     d     e     f    */
194     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
195 
196 /*  10    11    12    13   */
197     0x1c, 0x1d, 0x1e, 0x1f
198 };
199 
/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro, followed by
 * a semicolon, behaves as one statement (for example in an if/else).
 * Note: each argument is evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
221 
/* Decoder state that decodeBocu1() keeps between input bytes. */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state; 0 is treated as "not yet initialized" */
    int32_t count;  /* trail bytes still expected; 0 when the next byte is a lead byte */
    int32_t diff;   /* partial difference accumulated from the bytes seen so far */
} Bocu1Rx;
228 
229 /* Function prototypes ------------------------------------------------------ */
230 
231 /* see bocu1.c */
232 U_CFUNC int32_t
233 packDiff(int32_t diff);
234 
235 U_CFUNC int32_t
236 encodeBocu1(int32_t *pPrev, int32_t c);
237 
238 U_CFUNC int32_t
239 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
240 
241 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
242 
243 /* BOCU-1 implementation functions ------------------------------------------ */
244 
245 /**
246  * Compute the next "previous" value for differencing
247  * from the current code point.
248  *
249  * @param c current code point, 0..0x10ffff
250  * @return "previous code point" state value
251  */
252 static int32_t
bocu1Prev(int32_t c)253 bocu1Prev(int32_t c) {
254     /* compute new prev */
255     if(0x3040<=c && c<=0x309f) {
256         /* Hiragana is not 128-aligned */
257         return 0x3070;
258     } else if(0x4e00<=c && c<=0x9fa5) {
259         /* CJK Unihan */
260         return 0x4e00-BOCU1_REACH_NEG_2;
261     } else if(0xac00<=c && c<=0xd7a3) {
262         /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
263         return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
264     } else {
265         /* mostly small scripts */
266         return (c&~0x7f)+BOCU1_ASCII_PREV;
267     }
268 }
269 
270 /**
271  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
272  * and return a packed integer with them.
273  *
274  * The encoding favors small absolut differences with short encodings
275  * to compress runs of same-script characters.
276  *
277  * @param diff difference value -0x10ffff..0x10ffff
278  * @return
279  *      0x010000zz for 1-byte sequence zz
280  *      0x0200yyzz for 2-byte sequence yy zz
281  *      0x03xxyyzz for 3-byte sequence xx yy zz
282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
283  */
284 U_CFUNC int32_t
packDiff(int32_t diff)285 packDiff(int32_t diff) {
286     int32_t result, m, lead, count, shift;
287 
288     if(diff>=BOCU1_REACH_NEG_1) {
289         /* mostly positive differences, and single-byte negative ones */
290         if(diff<=BOCU1_REACH_POS_1) {
291             /* single byte */
292             return 0x01000000|(BOCU1_MIDDLE+diff);
293         } else if(diff<=BOCU1_REACH_POS_2) {
294             /* two bytes */
295             diff-=BOCU1_REACH_POS_1+1;
296             lead=BOCU1_START_POS_2;
297             count=1;
298         } else if(diff<=BOCU1_REACH_POS_3) {
299             /* three bytes */
300             diff-=BOCU1_REACH_POS_2+1;
301             lead=BOCU1_START_POS_3;
302             count=2;
303         } else {
304             /* four bytes */
305             diff-=BOCU1_REACH_POS_3+1;
306             lead=BOCU1_START_POS_4;
307             count=3;
308         }
309     } else {
310         /* two- and four-byte negative differences */
311         if(diff>=BOCU1_REACH_NEG_2) {
312             /* two bytes */
313             diff-=BOCU1_REACH_NEG_1;
314             lead=BOCU1_START_NEG_2;
315             count=1;
316         } else if(diff>=BOCU1_REACH_NEG_3) {
317             /* three bytes */
318             diff-=BOCU1_REACH_NEG_2;
319             lead=BOCU1_START_NEG_3;
320             count=2;
321         } else {
322             /* four bytes */
323             diff-=BOCU1_REACH_NEG_3;
324             lead=BOCU1_START_NEG_4;
325             count=3;
326         }
327     }
328 
329     /* encode the length of the packed result */
330     if(count<3) {
331         result=(count+1)<<24;
332     } else /* count==3, MSB used for the lead byte */ {
333         result=0;
334     }
335 
336     /* calculate trail bytes like digits in itoa() */
337     shift=0;
338     do {
339         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
340         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
341         shift+=8;
342     } while(--count>0);
343 
344     /* add lead byte */
345     result|=(lead+diff)<<shift;
346 
347     return result;
348 }
349 
350 /**
351  * BOCU-1 encoder function.
352  *
353  * @param pPrev pointer to the integer that holds
354  *        the "previous code point" state;
355  *        the initial value should be 0 which
356  *        encodeBocu1 will set to the actual BOCU-1 initial state value
357  * @param c the code point to encode
358  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
359  *         or 0 if an error occurs
360  *
361  * @see packDiff
362  */
363 U_CFUNC int32_t
encodeBocu1(int32_t * pPrev,int32_t c)364 encodeBocu1(int32_t *pPrev, int32_t c) {
365     int32_t prev;
366 
367     if(pPrev==NULL || c<0 || c>0x10ffff) {
368         /* illegal argument */
369         return 0;
370     }
371 
372     prev=*pPrev;
373     if(prev==0) {
374         /* lenient handling of initial value 0 */
375         prev=*pPrev=BOCU1_ASCII_PREV;
376     }
377 
378     if(c<=0x20) {
379         /*
380          * ISO C0 control & space:
381          * Encode directly for MIME compatibility,
382          * and reset state except for space, to not disrupt compression.
383          */
384         if(c!=0x20) {
385             *pPrev=BOCU1_ASCII_PREV;
386         }
387         return 0x01000000|c;
388     }
389 
390     /*
391      * all other Unicode code points c==U+0021..U+10ffff
392      * are encoded with the difference c-prev
393      *
394      * a new prev is computed from c,
395      * placed in the middle of a 0x80-block (for most small scripts) or
396      * in the middle of the Unihan and Hangul blocks
397      * to statistically minimize the following difference
398      */
399     *pPrev=bocu1Prev(c);
400     return packDiff(c-prev);
401 }
402 
403 /**
404  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
405  *
406  * @param pRx pointer to the decoder state structure
407  * @param b lead byte;
408  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
409  * @return -1 (state change only)
410  *
411  * @see decodeBocu1
412  */
413 static int32_t
decodeBocu1LeadByte(Bocu1Rx * pRx,uint8_t b)414 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
415     int32_t c, count;
416 
417     if(b>=BOCU1_START_NEG_2) {
418         /* positive difference */
419         if(b<BOCU1_START_POS_3) {
420             /* two bytes */
421             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
422             count=1;
423         } else if(b<BOCU1_START_POS_4) {
424             /* three bytes */
425             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
426             count=2;
427         } else {
428             /* four bytes */
429             c=BOCU1_REACH_POS_3+1;
430             count=3;
431         }
432     } else {
433         /* negative difference */
434         if(b>=BOCU1_START_NEG_3) {
435             /* two bytes */
436             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
437             count=1;
438         } else if(b>BOCU1_MIN) {
439             /* three bytes */
440             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
441             count=2;
442         } else {
443             /* four bytes */
444             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
445             count=3;
446         }
447     }
448 
449     /* set the state for decoding the trail byte(s) */
450     pRx->diff=c;
451     pRx->count=count;
452     return -1;
453 }
454 
455 /**
456  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
457  *
458  * @param pRx pointer to the decoder state structure
459  * @param b trail byte
460  * @return result value, same as decodeBocu1
461  *
462  * @see decodeBocu1
463  */
464 static int32_t
decodeBocu1TrailByte(Bocu1Rx * pRx,uint8_t b)465 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
466     int32_t t, c, count;
467 
468     if(b<=0x20) {
469         /* skip some C0 controls and make the trail byte range contiguous */
470         t=bocu1ByteToTrail[b];
471         if(t<0) {
472             /* illegal trail byte value */
473             pRx->prev=BOCU1_ASCII_PREV;
474             pRx->count=0;
475             return -99;
476         }
477 #if BOCU1_MAX_TRAIL<0xff
478     } else if(b>BOCU1_MAX_TRAIL) {
479         return -99;
480 #endif
481     } else {
482         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
483     }
484 
485     /* add trail byte into difference and decrement count */
486     c=pRx->diff;
487     count=pRx->count;
488 
489     if(count==1) {
490         /* final trail byte, deliver a code point */
491         c=pRx->prev+c+t;
492         if(0<=c && c<=0x10ffff) {
493             /* valid code point result */
494             pRx->prev=bocu1Prev(c);
495             pRx->count=0;
496             return c;
497         } else {
498             /* illegal code point result */
499             pRx->prev=BOCU1_ASCII_PREV;
500             pRx->count=0;
501             return -99;
502         }
503     }
504 
505     /* intermediate trail byte */
506     if(count==2) {
507         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
508     } else /* count==3 */ {
509         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
510     }
511     pRx->count=count-1;
512     return -1;
513 }
514 
515 /**
516  * BOCU-1 decoder function.
517  *
518  * @param pRx pointer to the decoder state structure;
519  *        the initial values should be 0 which
520  *        decodeBocu1 will set to actual initial state values
521  * @param b an input byte
522  * @return
523  *      0..0x10ffff for a result code point
524  *      -1 if only the state changed without code point output
525  *     <-1 if an error occurs
526  */
527 U_CFUNC int32_t
decodeBocu1(Bocu1Rx * pRx,uint8_t b)528 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
529     int32_t prev, c, count;
530 
531     if(pRx==NULL) {
532         /* illegal argument */
533         return -99;
534     }
535 
536     prev=pRx->prev;
537     if(prev==0) {
538         /* lenient handling of initial 0 values */
539         prev=pRx->prev=BOCU1_ASCII_PREV;
540         count=pRx->count=0;
541     } else {
542         count=pRx->count;
543     }
544 
545     if(count==0) {
546         /* byte in lead position */
547         if(b<=0x20) {
548             /*
549              * Direct-encoded C0 control code or space.
550              * Reset prev for C0 control codes but not for space.
551              */
552             if(b!=0x20) {
553                 pRx->prev=BOCU1_ASCII_PREV;
554             }
555             return b;
556         }
557 
558         /*
559          * b is a difference lead byte.
560          *
561          * Return a code point directly from a single-byte difference.
562          *
563          * For multi-byte difference lead bytes, set the decoder state
564          * with the partial difference value from the lead byte and
565          * with the number of trail bytes.
566          *
567          * For four-byte differences, the signedness also affects the
568          * first trail byte, which has special handling farther below.
569          */
570         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
571             /* single-byte difference */
572             c=prev+((int32_t)b-BOCU1_MIDDLE);
573             pRx->prev=bocu1Prev(c);
574             return c;
575         } else if(b==BOCU1_RESET) {
576             /* only reset the state, no code point */
577             pRx->prev=BOCU1_ASCII_PREV;
578             return -1;
579         } else {
580             return decodeBocu1LeadByte(pRx, b);
581         }
582     } else {
583         /* trail byte in any position */
584         return decodeBocu1TrailByte(pRx, b);
585     }
586 }
587 
588 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
589 
590 /* test code ---------------------------------------------------------------- */
591 
592 /* test code options */
593 
/* when set to 1: ignore commas while processing name lists in testText() */
#define TEST_IGNORE_COMMA       1
596 
/**
 * Write a packed BOCU-1 byte sequence into a byte array,
 * without overflow check.
 * Test function.
 *
 * The bytes are extracted from an unsigned copy of the packed value:
 * four-byte sequences with a lead byte >=0x80 are negative as int32_t,
 * and right-shifting a negative signed value is implementation-defined.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array; must have room for up to 4 bytes
 * @return number of bytes written (0 if packed encodes no length)
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    uint32_t bits=(uint32_t)packed;
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t remaining;

    /* emit the most significant of the count bytes first */
    for(remaining=count; remaining>0; --remaining) {
        *p++=(uint8_t)(bits>>(8*(remaining-1)));
    }

    return count;
}
626 
627 /**
628  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
629  * the difference to initialPrev.
630  * Used only for round-trip testing of the difference encoding and decoding.
631  * Test function.
632  *
633  * @param initialPrev bogus "previous code point" value to make sure that
634  *                    the resulting code point is in the range 0..0x10ffff
635  * @param packed packed BOCU-1 byte sequence
636  * @return the difference to initialPrev
637  *
638  * @see packDiff
639  * @see writeDiff
640  */
641 static int32_t
unpackDiff(int32_t initialPrev,int32_t packed)642 unpackDiff(int32_t initialPrev, int32_t packed) {
643     Bocu1Rx rx={ 0, 0, 0 };
644     int32_t count;
645 
646     rx.prev=initialPrev;
647     count=BOCU1_LENGTH_FROM_PACKED(packed);
648     switch(count) {
649     case 4:
650         decodeBocu1(&rx, (uint8_t)(packed>>24));
651     case 3:
652         decodeBocu1(&rx, (uint8_t)(packed>>16));
653     case 2:
654         decodeBocu1(&rx, (uint8_t)(packed>>8));
655     case 1:
656         /* subtract initial prev */
657         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
658     default:
659         return -0x7fffffff;
660     }
661 }
662 
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding
 * and reports a mismatch with log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, unpacked;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    unpacked=unpackDiff(initialPrev, packed);
    if(diff!=unpacked) {
        /* the casts make the int32_t arguments match the %ld/%lx conversions */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)unpacked);
    }
    return p+writePacked(packed, p);
}
698 
699 /**
700  * Encode a UTF-16 string in BOCU-1.
701  * Does not check for overflows, but otherwise useful function.
702  *
703  * @param s input UTF-16 string
704  * @param length number of UChar code units in s
705  * @param p pointer to output byte array
706  * @return number of bytes output
707  */
708 static int32_t
writeString(const UChar * s,int32_t length,uint8_t * p)709 writeString(const UChar *s, int32_t length, uint8_t *p) {
710     uint8_t *p0;
711     int32_t c, prev, i;
712 
713     prev=0;
714     p0=p;
715     i=0;
716     while(i<length) {
717         U16_NEXT(s, i, length, c);
718         p+=writePacked(encodeBocu1(&prev, c), p);
719     }
720     return (int32_t)(p-p0);
721 }
722 
723 /**
724  * Decode a BOCU-1 byte sequence to a UTF-16 string.
725  * Does not check for overflows, but otherwise useful function.
726  *
727  * @param p pointer to input BOCU-1 bytes
728  * @param length number of input bytes
729  * @param s point to output UTF-16 string array
730  * @return number of UChar code units output
731  */
732 static int32_t
readString(const uint8_t * p,int32_t length,UChar * s)733 readString(const uint8_t *p, int32_t length, UChar *s) {
734     Bocu1Rx rx={ 0, 0, 0 };
735     int32_t c, i, sLength;
736 
737     i=sLength=0;
738     while(i<length) {
739         c=decodeBocu1(&rx, p[i++]);
740         if(c<-1) {
741             log_err("error: readString detects encoding error at string index %ld\n", i);
742             return -1;
743         }
744         if(c>=0) {
745             U16_APPEND_UNSAFE(s, sLength, c);
746         }
747     }
748     return sLength;
749 }
750 
/* Convert a value 0..15 into its lowercase hexadecimal digit character. */
static char
hexDigit(uint8_t digit) {
    if(digit<10) {
        return (char)('0'+digit);
    }
    return (char)('a'+(digit-10));
}
755 
/**
 * Format 0-terminated byte values as " xx" hex groups.
 * The text is padded with spaces to the width of 5 groups and 0-terminated.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print
 * @param out output buffer for the resulting C string; not checked for overflow
 */
static void
printBytes(uint8_t *bytes, char *out) {
    int groups=0;
    int pad;
    uint8_t value;

    for(; (value=*bytes)!=0; ++bytes, ++groups) {
        out[0]=' ';
        out[1]=hexDigit((uint8_t)(value>>4));
        out[2]=hexDigit((uint8_t)(value&0xf));
        out+=3;
    }

    /* fill up to 5 groups of 3 characters each */
    for(pad=3*(5-groups); pad>0; --pad) {
        *out++=' ';
    }
    *out=0;
}
781 
782 /**
783  * Basic BOCU-1 test function, called when there are no command line arguments.
784  * Prints some of the #define values and performs round-trip tests of the
785  * difference encoding and decoding.
786  */
787 static void
TestBOCU1RefDiff(void)788 TestBOCU1RefDiff(void) {
789     char buf1[80], buf2[80];
790     uint8_t prev[5], level[5];
791     int32_t i, cmp, countErrors;
792 
793     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
794     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
795     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
796 
797     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
798     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
799     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
800 
801     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
802     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
803     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
804 
805     /* test packDiff() & unpackDiff() with some specific values */
806     writeDiff(0, level);
807     writeDiff(1, level);
808     writeDiff(65, level);
809     writeDiff(130, level);
810     writeDiff(30000, level);
811     writeDiff(1000000, level);
812     writeDiff(-65, level);
813     writeDiff(-130, level);
814     writeDiff(-30000, level);
815     writeDiff(-1000000, level);
816 
817     /* test that each value is smaller than any following one */
818     countErrors=0;
819     i=-0x10ffff;
820     *writeDiff(i, prev)=0;
821 
822     /* show first number and bytes */
823     printBytes(prev, buf1);
824     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
825 
826     for(++i; i<=0x10ffff; ++i) {
827         *writeDiff(i, level)=0;
828         cmp=strcmp((const char *)prev, (const char *)level);
829         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
830             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
831                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
832         }
833         if(cmp<0) {
834             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
835                 /*
836                  * if the result is good, then print only if the length changed
837                  * to get little but interesting output
838                  */
839                 printBytes(prev, buf1);
840                 printBytes(level, buf2);
841                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
842             }
843         } else {
844             ++countErrors;
845             printBytes(prev, buf1);
846             printBytes(level, buf2);
847             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
848         }
849         /* remember the previous bytes */
850         memcpy(prev, level, 4);
851     }
852 
853     /* show last number and bytes */
854     printBytes((uint8_t *)"", buf1);
855     printBytes(prev, buf2);
856     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
857 
858     if(countErrors==0) {
859         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
860     } else {
861         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
862     }
863 
864     /* output signature byte sequence */
865     i=0;
866     writePacked(encodeBocu1(&i, 0xfeff), level);
867     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
868             level[0], level[1], level[2]);
869 }
870 
871 /* cintltst code ------------------------------------------------------------ */
872 
873 static const int32_t DEFAULT_BUFFER_SIZE = 30000;
874 
875 
876 /* test one string with the ICU and the reference BOCU-1 implementations */
877 static void
roundtripBOCU1(UConverter * bocu1,int32_t number,const UChar * text,int32_t length)878 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
879     UChar *roundtripRef, *roundtripICU;
880     char *bocu1Ref, *bocu1ICU;
881 
882     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
883     UErrorCode errorCode;
884 
885     roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
886     roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
887     bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
888     bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
889 
890     /* Unicode -> BOCU-1 */
891     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
892 
893     errorCode=U_ZERO_ERROR;
894     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
895     if(U_FAILURE(errorCode)) {
896         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
897         goto cleanup;
898     }
899 
900     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
901         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
902         goto cleanup;
903     }
904 
905     /* BOCU-1 -> Unicode */
906     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
907     if(roundtripRefLength<0) {
908         goto cleanup; /* readString() found an error and reported it */
909     }
910 
911     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
912     if(U_FAILURE(errorCode)) {
913         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
914         goto cleanup;
915     }
916 
917     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
918         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
919         goto cleanup;
920     }
921     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
922         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
923         goto cleanup;
924     }
925 cleanup:
926     free(roundtripRef);
927     free(roundtripICU);
928     free(bocu1Ref);
929     free(bocu1ICU);
930 }
931 
932 static const UChar feff[]={ 0xfeff };
933 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
934 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
935 static const UChar nul[]={ 0 };
936 static const UChar latin[]={ 0xdf, 0xe6 };
937 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
938 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
939 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
940 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
941 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
942 static const UChar plane1[]={ 0xd800, 0xdc00 };
943 static const UChar plane2[]={ 0xd845, 0xdddd };
944 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
945 static const UChar plane16[]={ 0xdbff, 0xdfff };
946 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
947 
948 static const struct {
949     const UChar *s;
950     int32_t length;
951 } strings[]={
952     { feff,         UPRV_LENGTHOF(feff) },
953     { ascii,        UPRV_LENGTHOF(ascii) },
954     { crlf,         UPRV_LENGTHOF(crlf) },
955     { nul,          UPRV_LENGTHOF(nul) },
956     { latin,        UPRV_LENGTHOF(latin) },
957     { devanagari,   UPRV_LENGTHOF(devanagari) },
958     { hiragana,     UPRV_LENGTHOF(hiragana) },
959     { unihan,       UPRV_LENGTHOF(unihan) },
960     { hangul,       UPRV_LENGTHOF(hangul) },
961     { surrogates,   UPRV_LENGTHOF(surrogates) },
962     { plane1,       UPRV_LENGTHOF(plane1) },
963     { plane2,       UPRV_LENGTHOF(plane2) },
964     { plane15,      UPRV_LENGTHOF(plane15) },
965     { plane16,      UPRV_LENGTHOF(plane16) },
966     { c0,           UPRV_LENGTHOF(c0) }
967 };
968 
969 /*
970  * Verify that the ICU BOCU-1 implementation produces the same results as
971  * the reference implementation from the design folder.
972  * Generate some texts and convert them with both converters, verifying
973  * identical results and roundtripping.
974  */
975 static void
TestBOCU1(void)976 TestBOCU1(void) {
977     UChar *text;
978     int32_t i, length;
979 
980     UConverter *bocu1;
981     UErrorCode errorCode;
982 
983     errorCode=U_ZERO_ERROR;
984     bocu1=ucnv_open("BOCU-1", &errorCode);
985     if(U_FAILURE(errorCode)) {
986         log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
987         return;
988     }
989 
990     text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
991 
992     /* text 1: each of strings[] once */
993     length=0;
994     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
995         u_memcpy(text+length, strings[i].s, strings[i].length);
996         length+=strings[i].length;
997     }
998     roundtripBOCU1(bocu1, 1, text, length);
999 
1000     /* text 2: each of strings[] twice */
1001     length=0;
1002     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
1003         u_memcpy(text+length, strings[i].s, strings[i].length);
1004         length+=strings[i].length;
1005         u_memcpy(text+length, strings[i].s, strings[i].length);
1006         length+=strings[i].length;
1007     }
1008     roundtripBOCU1(bocu1, 2, text, length);
1009 
1010     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1011     length=0;
1012     for(i=1; length<5000; i+=7) {
1013         if(i>=UPRV_LENGTHOF(strings)) {
1014             i-=UPRV_LENGTHOF(strings);
1015         }
1016         u_memcpy(text+length, strings[i].s, strings[i].length);
1017         length+=strings[i].length;
1018     }
1019     roundtripBOCU1(bocu1, 3, text, length);
1020 
1021     ucnv_close(bocu1);
1022     free(text);
1023 }
1024 
1025 U_CFUNC void addBOCU1Tests(TestNode** root);
1026 
1027 U_CFUNC void
addBOCU1Tests(TestNode ** root)1028 addBOCU1Tests(TestNode** root) {
1029     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1030     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1031 }
1032