1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2002-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: bocu1tst.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002may27
16 * created by: Markus W. Scherer
17 *
18 * This is the reference implementation of BOCU-1,
19 * the MIME-friendly form of the Binary Ordered Compression for Unicode,
20 * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
21 * The files bocu1.h and bocu1.c from the design folder are taken
22 * verbatim (minus copyright and #include) and copied together into this file.
23 * The reference code and some of the reference bocu1tst.c
24 * is modified to run as part of the ICU cintltst
25 * test framework (minus main(), log_ln() etc. instead of printf()).
26 *
27 * This reference implementation is used here to verify
28 * the ICU BOCU-1 implementation, which is
29 * adapted for ICU conversion APIs and optimized.
30 * ### links in design doc to here and to ucnvbocu.c
31 */
32
33 #include <stdbool.h>
34
35 #include "unicode/utypes.h"
36 #include "unicode/ustring.h"
37 #include "unicode/ucnv.h"
38 #include "unicode/utf16.h"
39 #include "cmemory.h"
40 #include "cintltst.h"
41
42 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
43
44 /* BOCU-1 constants and macros ---------------------------------------------- */
45
46 /*
47 * BOCU-1 encodes the code points of a Unicode string as
48 * a sequence of byte-encoded differences (slope detection),
49 * preserving lexical order.
50 *
51 * Optimize the difference-taking for runs of Unicode text within
52 * small scripts:
53 *
54 * Most small scripts are allocated within aligned 128-blocks of Unicode
55 * code points. Lexical order is preserved if the "previous code point" state
56 * is always moved into the middle of such a block.
57 *
58 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
59 * areas into the middle of those areas.
60 *
61 * C0 control codes and space are encoded with their US-ASCII bytes.
62 * "prev" is reset for C0 controls but not for space.
63 */
64
/* Initial value of the "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV            0x40

/* Byte values that bound the difference lead bytes. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe

/*
 * Highest trail byte value.
 * The L suffix makes computations with BOCU1_MAX_TRAIL work on 16-bit compilers;
 * it also gives every expression built from it the type long.
 */
#define BOCU1_MAX_TRAIL             0xffL

/* Lead byte value that only resets the decoder state (no code point). */
#define BOCU1_RESET                 0xff

/* Number of lead byte values, BOCU1_MIN..BOCU1_MAX_LEAD. */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values are reused as trail bytes; adjust the counts for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* Number of trail byte values. */
#define BOCU1_TRAIL_COUNT           ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of positive and of negative single-byte codes
 * (0==BOCU1_MIDDLE is counted among the positive ones).
 */
#define BOCU1_SINGLE                64

/* Number of lead bytes for positive and for negative 2/3/4-byte sequences. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Difference values reachable with a single byte. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Difference values reachable with two bytes. */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Difference values reachable with three bytes. */
#define BOCU1_REACH_POS_3           (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3           (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range; BOCU1_START_POS_4==BOCU1_MAX_LEAD. */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)

/* Lower lead byte bound of each negative range; BOCU1_START_NEG_4==BOCU1_MIN+1. */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)

/* Length (1..4) of a byte sequence, determined from its lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Length of a byte sequence, determined from its packed form:
 * the top byte holds the length unless it is a lead byte (>=0x04) of a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
132
133 /*
134 * 12 commonly used C0 control codes (and space) are only used to encode
135 * themselves directly,
136 * which makes BOCU-1 MIME-usable and reasonably safe for
137 * ASCII-oriented software.
138 *
139 * These controls are
140 * 0 NUL
141 *
142 * 7 BEL
143 * 8 BS
144 *
145 * 9 TAB
146 * a LF
147 * b VT
148 * c FF
149 * d CR
150 *
151 * e SO
152 * f SI
153 *
154 * 1a SUB
155 * 1b ESC
156 *
157 * The other 20 C0 controls are also encoded directly (to preserve order)
158 * but are also used as trail bytes in difference encoding
159 * (for better compression).
160 */
/*
 * Convert a trail value from the difference calculation to its external byte:
 * the lowest BOCU1_TRAIL_CONTROLS_COUNT values go through the control-code table,
 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * Note: evaluates t more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)<BOCU1_TRAIL_CONTROLS_COUNT ? bocu1TrailToByte[t] : (t)+BOCU1_TRAIL_BYTE_OFFSET)
162
163 /*
164 * Byte value map for control codes,
165 * from external byte values 0x00..0x20
166 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
167 * External byte values that are illegal as trail bytes are mapped to -1.
168 */
169 static const int8_t
170 bocu1ByteToTrail[BOCU1_MIN]={
171 /* 0 1 2 3 4 5 6 7 */
172 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
173
174 /* 8 9 a b c d e f */
175 -1, -1, -1, -1, -1, -1, -1, -1,
176
177 /* 10 11 12 13 14 15 16 17 */
178 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
179
180 /* 18 19 1a 1b 1c 1d 1e 1f */
181 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
182
183 /* 20 */
184 -1
185 };
186
187 /*
188 * Byte value map for control codes,
189 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
190 * to external byte values 0x00..0x20.
191 */
192 static const int8_t
193 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
194 /* 0 1 2 3 4 5 6 7 */
195 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
196
197 /* 8 9 a b c d e f */
198 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
199
200 /* 10 11 12 13 */
201 0x1c, 0x1d, 0x1e, 0x1f
202 };
203
/**
 * Division with remainder where the remainder is never negative.
 *
 * Integer division and modulo with a negative numerator yield a negative
 * remainder and a quotient that is one more than what is needed here.
 * This macro adjusts both results so that the remainder m is always >=0.
 * For non-negative n the adjustment never happens.
 *
 * All three arguments are evaluated more than once.
 *
 * @param n Number to be split into quotient and remainder;
 *          modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the remainder (modulo result).
 */
#define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        (m)+=(d); \
        --(n); \
    } \
} UPRV_BLOCK_MACRO_END
225
/*
 * Decoder state for decodeBocu1().
 * The member order matters: the test code initializes it positionally.
 */
typedef struct Bocu1Rx {
    int32_t prev;   /* "previous code point" state; 0 means "not initialized yet" */
    int32_t count;  /* number of trail bytes still expected; 0 in lead byte position */
    int32_t diff;   /* partial difference collected from the bytes seen so far */
} Bocu1Rx;
232
233 /* Function prototypes ------------------------------------------------------ */
234
235 /* see bocu1.c */
236 U_CFUNC int32_t
237 packDiff(int32_t diff);
238
239 U_CFUNC int32_t
240 encodeBocu1(int32_t *pPrev, int32_t c);
241
242 U_CFUNC int32_t
243 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
244
245 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
246
247 /* BOCU-1 implementation functions ------------------------------------------ */
248
249 /**
250 * Compute the next "previous" value for differencing
251 * from the current code point.
252 *
253 * @param c current code point, 0..0x10ffff
254 * @return "previous code point" state value
255 */
256 static int32_t
bocu1Prev(int32_t c)257 bocu1Prev(int32_t c) {
258 /* compute new prev */
259 if(0x3040<=c && c<=0x309f) {
260 /* Hiragana is not 128-aligned */
261 return 0x3070;
262 } else if(0x4e00<=c && c<=0x9fa5) {
263 /* CJK Unihan */
264 return 0x4e00-BOCU1_REACH_NEG_2;
265 } else if(0xac00<=c && c<=0xd7a3) {
266 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
267 return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
268 } else {
269 /* mostly small scripts */
270 return (c&~0x7f)+BOCU1_ASCII_PREV;
271 }
272 }
273
274 /**
275 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
276 * and return a packed integer with them.
277 *
278 * The encoding favors small absolute differences with short encodings
279 * to compress runs of same-script characters.
280 *
281 * @param diff difference value -0x10ffff..0x10ffff
282 * @return
283 * 0x010000zz for 1-byte sequence zz
284 * 0x0200yyzz for 2-byte sequence yy zz
285 * 0x03xxyyzz for 3-byte sequence xx yy zz
286 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
287 */
288 U_CFUNC int32_t
packDiff(int32_t diff)289 packDiff(int32_t diff) {
290 int32_t result, m, lead, count, shift;
291
292 if(diff>=BOCU1_REACH_NEG_1) {
293 /* mostly positive differences, and single-byte negative ones */
294 if(diff<=BOCU1_REACH_POS_1) {
295 /* single byte */
296 return 0x01000000|(BOCU1_MIDDLE+diff);
297 } else if(diff<=BOCU1_REACH_POS_2) {
298 /* two bytes */
299 diff-=BOCU1_REACH_POS_1+1;
300 lead=BOCU1_START_POS_2;
301 count=1;
302 } else if(diff<=BOCU1_REACH_POS_3) {
303 /* three bytes */
304 diff-=BOCU1_REACH_POS_2+1;
305 lead=BOCU1_START_POS_3;
306 count=2;
307 } else {
308 /* four bytes */
309 diff-=BOCU1_REACH_POS_3+1;
310 lead=BOCU1_START_POS_4;
311 count=3;
312 }
313 } else {
314 /* two- and four-byte negative differences */
315 if(diff>=BOCU1_REACH_NEG_2) {
316 /* two bytes */
317 diff-=BOCU1_REACH_NEG_1;
318 lead=BOCU1_START_NEG_2;
319 count=1;
320 } else if(diff>=BOCU1_REACH_NEG_3) {
321 /* three bytes */
322 diff-=BOCU1_REACH_NEG_2;
323 lead=BOCU1_START_NEG_3;
324 count=2;
325 } else {
326 /* four bytes */
327 diff-=BOCU1_REACH_NEG_3;
328 lead=BOCU1_START_NEG_4;
329 count=3;
330 }
331 }
332
333 /* encode the length of the packed result */
334 if(count<3) {
335 result=(count+1)<<24;
336 } else /* count==3, MSB used for the lead byte */ {
337 result=0;
338 }
339
340 /* calculate trail bytes like digits in itoa() */
341 shift=0;
342 do {
343 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
344 result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
345 shift+=8;
346 } while(--count>0);
347
348 /* add lead byte */
349 result |= (uint32_t)(lead+diff)<<shift;
350
351 return result;
352 }
353
354 /**
355 * BOCU-1 encoder function.
356 *
357 * @param pPrev pointer to the integer that holds
358 * the "previous code point" state;
359 * the initial value should be 0 which
360 * encodeBocu1 will set to the actual BOCU-1 initial state value
361 * @param c the code point to encode
362 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
363 * or 0 if an error occurs
364 *
365 * @see packDiff
366 */
367 U_CFUNC int32_t
encodeBocu1(int32_t * pPrev,int32_t c)368 encodeBocu1(int32_t *pPrev, int32_t c) {
369 int32_t prev;
370
371 if(pPrev==NULL || c<0 || c>0x10ffff) {
372 /* illegal argument */
373 return 0;
374 }
375
376 prev=*pPrev;
377 if(prev==0) {
378 /* lenient handling of initial value 0 */
379 prev=*pPrev=BOCU1_ASCII_PREV;
380 }
381
382 if(c<=0x20) {
383 /*
384 * ISO C0 control & space:
385 * Encode directly for MIME compatibility,
386 * and reset state except for space, to not disrupt compression.
387 */
388 if(c!=0x20) {
389 *pPrev=BOCU1_ASCII_PREV;
390 }
391 return 0x01000000|c;
392 }
393
394 /*
395 * all other Unicode code points c==U+0021..U+10ffff
396 * are encoded with the difference c-prev
397 *
398 * a new prev is computed from c,
399 * placed in the middle of a 0x80-block (for most small scripts) or
400 * in the middle of the Unihan and Hangul blocks
401 * to statistically minimize the following difference
402 */
403 *pPrev=bocu1Prev(c);
404 return packDiff(c-prev);
405 }
406
407 /**
408 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
409 *
410 * @param pRx pointer to the decoder state structure
411 * @param b lead byte;
412 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
413 * @return -1 (state change only)
414 *
415 * @see decodeBocu1
416 */
417 static int32_t
decodeBocu1LeadByte(Bocu1Rx * pRx,uint8_t b)418 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
419 int32_t c, count;
420
421 if(b>=BOCU1_START_NEG_2) {
422 /* positive difference */
423 if(b<BOCU1_START_POS_3) {
424 /* two bytes */
425 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
426 count=1;
427 } else if(b<BOCU1_START_POS_4) {
428 /* three bytes */
429 c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
430 count=2;
431 } else {
432 /* four bytes */
433 c=BOCU1_REACH_POS_3+1;
434 count=3;
435 }
436 } else {
437 /* negative difference */
438 if(b>=BOCU1_START_NEG_3) {
439 /* two bytes */
440 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
441 count=1;
442 } else if(b>BOCU1_MIN) {
443 /* three bytes */
444 c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
445 count=2;
446 } else {
447 /* four bytes */
448 c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
449 count=3;
450 }
451 }
452
453 /* set the state for decoding the trail byte(s) */
454 pRx->diff=c;
455 pRx->count=count;
456 return -1;
457 }
458
459 /**
460 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
461 *
462 * @param pRx pointer to the decoder state structure
463 * @param b trail byte
464 * @return result value, same as decodeBocu1
465 *
466 * @see decodeBocu1
467 */
468 static int32_t
decodeBocu1TrailByte(Bocu1Rx * pRx,uint8_t b)469 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
470 int32_t t, c, count;
471
472 if(b<=0x20) {
473 /* skip some C0 controls and make the trail byte range contiguous */
474 t=bocu1ByteToTrail[b];
475 if(t<0) {
476 /* illegal trail byte value */
477 pRx->prev=BOCU1_ASCII_PREV;
478 pRx->count=0;
479 return -99;
480 }
481 #if BOCU1_MAX_TRAIL<0xff
482 } else if(b>BOCU1_MAX_TRAIL) {
483 return -99;
484 #endif
485 } else {
486 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
487 }
488
489 /* add trail byte into difference and decrement count */
490 c=pRx->diff;
491 count=pRx->count;
492
493 if(count==1) {
494 /* final trail byte, deliver a code point */
495 c=pRx->prev+c+t;
496 if(0<=c && c<=0x10ffff) {
497 /* valid code point result */
498 pRx->prev=bocu1Prev(c);
499 pRx->count=0;
500 return c;
501 } else {
502 /* illegal code point result */
503 pRx->prev=BOCU1_ASCII_PREV;
504 pRx->count=0;
505 return -99;
506 }
507 }
508
509 /* intermediate trail byte */
510 if(count==2) {
511 pRx->diff=c+t*BOCU1_TRAIL_COUNT;
512 } else /* count==3 */ {
513 pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
514 }
515 pRx->count=count-1;
516 return -1;
517 }
518
519 /**
520 * BOCU-1 decoder function.
521 *
522 * @param pRx pointer to the decoder state structure;
523 * the initial values should be 0 which
524 * decodeBocu1 will set to actual initial state values
525 * @param b an input byte
526 * @return
527 * 0..0x10ffff for a result code point
528 * -1 if only the state changed without code point output
529 * <-1 if an error occurs
530 */
531 U_CFUNC int32_t
decodeBocu1(Bocu1Rx * pRx,uint8_t b)532 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
533 int32_t prev, c, count;
534
535 if(pRx==NULL) {
536 /* illegal argument */
537 return -99;
538 }
539
540 prev=pRx->prev;
541 if(prev==0) {
542 /* lenient handling of initial 0 values */
543 prev=pRx->prev=BOCU1_ASCII_PREV;
544 count=pRx->count=0;
545 } else {
546 count=pRx->count;
547 }
548
549 if(count==0) {
550 /* byte in lead position */
551 if(b<=0x20) {
552 /*
553 * Direct-encoded C0 control code or space.
554 * Reset prev for C0 control codes but not for space.
555 */
556 if(b!=0x20) {
557 pRx->prev=BOCU1_ASCII_PREV;
558 }
559 return b;
560 }
561
562 /*
563 * b is a difference lead byte.
564 *
565 * Return a code point directly from a single-byte difference.
566 *
567 * For multi-byte difference lead bytes, set the decoder state
568 * with the partial difference value from the lead byte and
569 * with the number of trail bytes.
570 *
571 * For four-byte differences, the signedness also affects the
572 * first trail byte, which has special handling farther below.
573 */
574 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
575 /* single-byte difference */
576 c=prev+((int32_t)b-BOCU1_MIDDLE);
577 pRx->prev=bocu1Prev(c);
578 return c;
579 } else if(b==BOCU1_RESET) {
580 /* only reset the state, no code point */
581 pRx->prev=BOCU1_ASCII_PREV;
582 return -1;
583 } else {
584 return decodeBocu1LeadByte(pRx, b);
585 }
586 } else {
587 /* trail byte in any position */
588 return decodeBocu1TrailByte(pRx, b);
589 }
590 }
591
592 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
593
594 /* test code ---------------------------------------------------------------- */
595
596 /* test code options */
597
/*
 * ignore comma when processing name lists in testText()
 * (an option of the reference test program; testText() itself does not appear in this file)
 */
599 #define TEST_IGNORE_COMMA 1
600
/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * lead byte first. There is no overflow check.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array, must have room for up to 4 bytes
 * @return number of bytes written (0 if packed has length 0)
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    const int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* emit the bytes from the most significant one used down to the lowest */
    for(shift=8*(count-1); shift>=0; shift-=8) {
        *p++=(uint8_t)(packed>>shift);
    }

    return count;
}
630
631 /**
632 * Unpack a packed BOCU-1 non-C0/space byte sequence and get
633 * the difference to initialPrev.
634 * Used only for round-trip testing of the difference encoding and decoding.
635 * Test function.
636 *
637 * @param initialPrev bogus "previous code point" value to make sure that
638 * the resulting code point is in the range 0..0x10ffff
639 * @param packed packed BOCU-1 byte sequence
640 * @return the difference to initialPrev
641 *
642 * @see packDiff
643 * @see writeDiff
644 */
645 static int32_t
unpackDiff(int32_t initialPrev,int32_t packed)646 unpackDiff(int32_t initialPrev, int32_t packed) {
647 Bocu1Rx rx={ 0, 0, 0 };
648 int32_t count;
649
650 rx.prev=initialPrev;
651 count=BOCU1_LENGTH_FROM_PACKED(packed);
652 switch(count) {
653 case 4:
654 decodeBocu1(&rx, (uint8_t)(packed>>24));
655 case 3:
656 decodeBocu1(&rx, (uint8_t)(packed>>16));
657 case 2:
658 decodeBocu1(&rx, (uint8_t)(packed>>8));
659 case 1:
660 /* subtract initial prev */
661 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
662 default:
663 return -0x7fffffff;
664 }
665 }
666
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks that the difference round-trips through packDiff()/unpackDiff()
 * and reports a test error if it does not.
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array (no overflow check)
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    int32_t packed, initialPrev, roundtrip;

    /* generate the difference as a packed value */
    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * the code point range check in decodeBocu1()
     */
    initialPrev= diff<=0 ? 0x10ffff : -1;

    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* int32_t is not necessarily long: cast to match the %ld/%lx conversions */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }

    /* serialize the packed value */
    return p+writePacked(packed, p);
}
702
703 /**
704 * Encode a UTF-16 string in BOCU-1.
705 * Does not check for overflows, but otherwise useful function.
706 *
707 * @param s input UTF-16 string
708 * @param length number of UChar code units in s
709 * @param p pointer to output byte array
710 * @return number of bytes output
711 */
712 static int32_t
writeString(const UChar * s,int32_t length,uint8_t * p)713 writeString(const UChar *s, int32_t length, uint8_t *p) {
714 uint8_t *p0;
715 int32_t c, prev, i;
716
717 prev=0;
718 p0=p;
719 i=0;
720 while(i<length) {
721 U16_NEXT(s, i, length, c);
722 p+=writePacked(encodeBocu1(&prev, c), p);
723 }
724 return (int32_t)(p-p0);
725 }
726
727 /**
728 * Decode a BOCU-1 byte sequence to a UTF-16 string.
729 * Does not check for overflows, but otherwise useful function.
730 *
731 * @param p pointer to input BOCU-1 bytes
732 * @param length number of input bytes
733 * @param s point to output UTF-16 string array
734 * @return number of UChar code units output
735 */
736 static int32_t
readString(const uint8_t * p,int32_t length,UChar * s)737 readString(const uint8_t *p, int32_t length, UChar *s) {
738 Bocu1Rx rx={ 0, 0, 0 };
739 int32_t c, i, sLength;
740
741 i=sLength=0;
742 while(i<length) {
743 c=decodeBocu1(&rx, p[i++]);
744 if(c<-1) {
745 log_err("error: readString detects encoding error at string index %ld\n", i);
746 return -1;
747 }
748 if(c>=0) {
749 U16_APPEND_UNSAFE(s, sLength, c);
750 }
751 }
752 return sLength;
753 }
754
/* Convert a value 0..15 to its lowercase hexadecimal digit character. */
static char
hexDigit(uint8_t digit) {
    if(digit<=9) {
        return (char)('0'+digit);
    }
    return (char)('a'-10+digit);
}
759
/**
 * Pretty-print 0-terminated byte values as " xx" groups,
 * padded with spaces to the width of 5 bytes.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print; it is only read
 * @param out   output buffer for the NUL-terminated text; there is no
 *              overflow check, so it needs 3 chars per input byte
 *              (at least 15 because of the padding) plus 1 for the NUL
 */
static void
printBytes(const uint8_t *bytes, char *out) {
    int count=0;
    uint8_t b;

    while((b=*bytes++)!=0) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++count;
    }

    /* fill up to 5 columns of 3 chars each; nothing to do for 5 or more bytes */
    for(count=3*(5-count); count>0; --count) {
        *out++=' ';
    }
    *out=0;
}
785
786 /**
787 * Basic BOCU-1 test function, called when there are no command line arguments.
788 * Prints some of the #define values and performs round-trip tests of the
789 * difference encoding and decoding.
790 */
791 static void
TestBOCU1RefDiff(void)792 TestBOCU1RefDiff(void) {
793 char buf1[80], buf2[80];
794 uint8_t prev[5], level[5];
795 int32_t i, cmp, countErrors;
796
797 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
798 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
799 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
800
801 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
802 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
803 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
804
805 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);
806 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
807 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
808
809 /* test packDiff() & unpackDiff() with some specific values */
810 writeDiff(0, level);
811 writeDiff(1, level);
812 writeDiff(65, level);
813 writeDiff(130, level);
814 writeDiff(30000, level);
815 writeDiff(1000000, level);
816 writeDiff(-65, level);
817 writeDiff(-130, level);
818 writeDiff(-30000, level);
819 writeDiff(-1000000, level);
820
821 /* test that each value is smaller than any following one */
822 countErrors=0;
823 i=-0x10ffff;
824 *writeDiff(i, prev)=0;
825
826 /* show first number and bytes */
827 printBytes(prev, buf1);
828 log_verbose(" wD(%8ld) %s\n", i, buf1);
829
830 for(++i; i<=0x10ffff; ++i) {
831 *writeDiff(i, level)=0;
832 cmp=strcmp((const char *)prev, (const char *)level);
833 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
834 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
835 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
836 }
837 if(cmp<0) {
838 if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
839 /*
840 * if the result is good, then print only if the length changed
841 * to get little but interesting output
842 */
843 printBytes(prev, buf1);
844 printBytes(level, buf2);
845 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
846 }
847 } else {
848 ++countErrors;
849 printBytes(prev, buf1);
850 printBytes(level, buf2);
851 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);
852 }
853 /* remember the previous bytes */
854 memcpy(prev, level, 4);
855 }
856
857 /* show last number and bytes */
858 printBytes((uint8_t *)"", buf1);
859 printBytes(prev, buf2);
860 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2);
861
862 if(countErrors==0) {
863 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
864 } else {
865 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
866 }
867
868 /* output signature byte sequence */
869 i=0;
870 writePacked(encodeBocu1(&i, 0xfeff), level);
871 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
872 level[0], level[1], level[2]);
873 }
874
875 /* cintltst code ------------------------------------------------------------ */
876
877 static const int32_t DEFAULT_BUFFER_SIZE = 30000;
878
879
880 /* test one string with the ICU and the reference BOCU-1 implementations */
881 static void
roundtripBOCU1(UConverter * bocu1,int32_t number,const UChar * text,int32_t length)882 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
883 UChar *roundtripRef, *roundtripICU;
884 char *bocu1Ref, *bocu1ICU;
885
886 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
887 UErrorCode errorCode;
888
889 roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
890 roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
891 bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
892 bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
893
894 /* Unicode -> BOCU-1 */
895 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
896
897 errorCode=U_ZERO_ERROR;
898 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
899 if(U_FAILURE(errorCode)) {
900 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
901 goto cleanup;
902 }
903
904 if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
905 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
906 goto cleanup;
907 }
908
909 /* BOCU-1 -> Unicode */
910 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
911 if(roundtripRefLength<0) {
912 goto cleanup; /* readString() found an error and reported it */
913 }
914
915 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
916 if(U_FAILURE(errorCode)) {
917 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
918 goto cleanup;
919 }
920
921 if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
922 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
923 goto cleanup;
924 }
925 if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
926 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
927 goto cleanup;
928 }
929 cleanup:
930 free(roundtripRef);
931 free(roundtripICU);
932 free(bocu1Ref);
933 free(bocu1ICU);
934 }
935
936 static const UChar feff[]={ 0xfeff };
937 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
938 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
939 static const UChar nul[]={ 0 };
940 static const UChar latin[]={ 0xdf, 0xe6 };
941 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
942 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
943 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
944 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
945 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
946 static const UChar plane1[]={ 0xd800, 0xdc00 };
947 static const UChar plane2[]={ 0xd845, 0xdddd };
948 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
949 static const UChar plane16[]={ 0xdbff, 0xdfff };
950 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
951
952 static const struct {
953 const UChar *s;
954 int32_t length;
955 } strings[]={
956 { feff, UPRV_LENGTHOF(feff) },
957 { ascii, UPRV_LENGTHOF(ascii) },
958 { crlf, UPRV_LENGTHOF(crlf) },
959 { nul, UPRV_LENGTHOF(nul) },
960 { latin, UPRV_LENGTHOF(latin) },
961 { devanagari, UPRV_LENGTHOF(devanagari) },
962 { hiragana, UPRV_LENGTHOF(hiragana) },
963 { unihan, UPRV_LENGTHOF(unihan) },
964 { hangul, UPRV_LENGTHOF(hangul) },
965 { surrogates, UPRV_LENGTHOF(surrogates) },
966 { plane1, UPRV_LENGTHOF(plane1) },
967 { plane2, UPRV_LENGTHOF(plane2) },
968 { plane15, UPRV_LENGTHOF(plane15) },
969 { plane16, UPRV_LENGTHOF(plane16) },
970 { c0, UPRV_LENGTHOF(c0) }
971 };
972
973 /*
974 * Verify that the ICU BOCU-1 implementation produces the same results as
975 * the reference implementation from the design folder.
976 * Generate some texts and convert them with both converters, verifying
977 * identical results and roundtripping.
978 */
979 static void
TestBOCU1(void)980 TestBOCU1(void) {
981 UChar *text;
982 int32_t i, length;
983
984 UConverter *bocu1;
985 UErrorCode errorCode;
986
987 errorCode=U_ZERO_ERROR;
988 bocu1=ucnv_open("BOCU-1", &errorCode);
989 if(U_FAILURE(errorCode)) {
990 log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
991 return;
992 }
993
994 text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
995
996 /* text 1: each of strings[] once */
997 length=0;
998 for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
999 u_memcpy(text+length, strings[i].s, strings[i].length);
1000 length+=strings[i].length;
1001 }
1002 roundtripBOCU1(bocu1, 1, text, length);
1003
1004 /* text 2: each of strings[] twice */
1005 length=0;
1006 for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
1007 u_memcpy(text+length, strings[i].s, strings[i].length);
1008 length+=strings[i].length;
1009 u_memcpy(text+length, strings[i].s, strings[i].length);
1010 length+=strings[i].length;
1011 }
1012 roundtripBOCU1(bocu1, 2, text, length);
1013
1014 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
1015 length=0;
1016 for(i=1; length<5000; i+=7) {
1017 if(i>=UPRV_LENGTHOF(strings)) {
1018 i-=UPRV_LENGTHOF(strings);
1019 }
1020 u_memcpy(text+length, strings[i].s, strings[i].length);
1021 length+=strings[i].length;
1022 }
1023 roundtripBOCU1(bocu1, 3, text, length);
1024
1025 ucnv_close(bocu1);
1026 free(text);
1027 }
1028
1029 U_CFUNC void addBOCU1Tests(TestNode** root);
1030
1031 U_CFUNC void
addBOCU1Tests(TestNode ** root)1032 addBOCU1Tests(TestNode** root) {
1033 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
1034 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
1035 }
1036