1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucm.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003jun20
14 * created by: Markus W. Scherer
15 *
16 * This file reads a .ucm file, stores its mappings and sorts them.
17 * It implements handling of Unicode conversion mappings from .ucm files
18 * for makeconv, canonucm, rptp2ucm, etc.
19 *
20 * Unicode code point sequences with a length of more than 1,
21 * as well as byte sequences with more than 4 bytes or more than one complete
22 * character sequence are handled to support m:n mappings.
23 */
24
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "cstring.h"
28 #include "cmemory.h"
29 #include "filestrm.h"
30 #include "uarrsort.h"
31 #include "ucnvmbcs.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_ext.h"
34 #include "uparse.h"
35 #include "ucm.h"
36 #include <stdio.h>
37
38 #if !UCONFIG_NO_CONVERSION
39
40 /* -------------------------------------------------------------------------- */
41
42 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
44 int32_t j;
45
46 for(j=0; j<m->uLen; ++j) {
47 fprintf(f, "<U%04lX>", (long)codePoints[j]);
48 }
49
50 fputc(' ', f);
51
52 for(j=0; j<m->bLen; ++j) {
53 fprintf(f, "\\x%02X", bytes[j]);
54 }
55
56 if(m->f>=0) {
57 fprintf(f, " |%u\n", m->f);
58 } else {
59 fputs("\n", f);
60 }
61 }
62
63 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
65 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
66 }
67
68 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
70 UCMapping *m;
71 int32_t i, length;
72
73 m=table->mappings;
74 length=table->mappingsLength;
75 if(byUnicode) {
76 for(i=0; i<length; ++m, ++i) {
77 ucm_printMapping(table, m, f);
78 }
79 } else {
80 const int32_t *map=table->reverseMap;
81 for(i=0; i<length; ++i) {
82 ucm_printMapping(table, m+map[i], f);
83 }
84 }
85 }
86
87 /* mapping comparisons ------------------------------------------------------ */
88
89 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)90 compareUnicode(UCMTable *lTable, const UCMapping *l,
91 UCMTable *rTable, const UCMapping *r) {
92 const UChar32 *lu, *ru;
93 int32_t result, i, length;
94
95 if(l->uLen==1 && r->uLen==1) {
96 /* compare two single code points */
97 return l->u-r->u;
98 }
99
100 /* get pointers to the code point sequences */
101 lu=UCM_GET_CODE_POINTS(lTable, l);
102 ru=UCM_GET_CODE_POINTS(rTable, r);
103
104 /* get the minimum length */
105 if(l->uLen<=r->uLen) {
106 length=l->uLen;
107 } else {
108 length=r->uLen;
109 }
110
111 /* compare the code points */
112 for(i=0; i<length; ++i) {
113 result=lu[i]-ru[i];
114 if(result!=0) {
115 return result;
116 }
117 }
118
119 /* compare the lengths */
120 return l->uLen-r->uLen;
121 }
122
123 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)124 compareBytes(UCMTable *lTable, const UCMapping *l,
125 UCMTable *rTable, const UCMapping *r,
126 UBool lexical) {
127 const uint8_t *lb, *rb;
128 int32_t result, i, length;
129
130 /*
131 * A lexical comparison is used for sorting in the builder, to allow
132 * an efficient search for a byte sequence that could be a prefix
133 * of a previously entered byte sequence.
134 *
135 * Comparing by lengths first is for compatibility with old .ucm tools
136 * like canonucm and rptp2ucm.
137 */
138 if(lexical) {
139 /* get the minimum length and continue */
140 if(l->bLen<=r->bLen) {
141 length=l->bLen;
142 } else {
143 length=r->bLen;
144 }
145 } else {
146 /* compare lengths first */
147 result=l->bLen-r->bLen;
148 if(result!=0) {
149 return result;
150 } else {
151 length=l->bLen;
152 }
153 }
154
155 /* get pointers to the byte sequences */
156 lb=UCM_GET_BYTES(lTable, l);
157 rb=UCM_GET_BYTES(rTable, r);
158
159 /* compare the bytes */
160 for(i=0; i<length; ++i) {
161 result=lb[i]-rb[i];
162 if(result!=0) {
163 return result;
164 }
165 }
166
167 /* compare the lengths */
168 return l->bLen-r->bLen;
169 }
170
171 /* compare UCMappings for sorting */
172 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)173 compareMappings(UCMTable *lTable, const UCMapping *l,
174 UCMTable *rTable, const UCMapping *r,
175 UBool uFirst) {
176 int32_t result;
177
178 /* choose which side to compare first */
179 if(uFirst) {
180 /* Unicode then bytes */
181 result=compareUnicode(lTable, l, rTable, r);
182 if(result==0) {
183 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
184 }
185 } else {
186 /* bytes then Unicode */
187 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
188 if(result==0) {
189 result=compareUnicode(lTable, l, rTable, r);
190 }
191 }
192
193 if(result!=0) {
194 return result;
195 }
196
197 /* compare the flags */
198 return l->f-r->f;
199 }
200
201 /* sorting by Unicode first sorts mappings directly */
202 static int32_t
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
204 return compareMappings(
205 (UCMTable *)context, (const UCMapping *)left,
206 (UCMTable *)context, (const UCMapping *)right, TRUE);
207 }
208
209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
210 static int32_t
compareMappingsBytesFirst(const void * context,const void * left,const void * right)211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
212 UCMTable *table=(UCMTable *)context;
213 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
214 return compareMappings(
215 table, table->mappings+l,
216 table, table->mappings+r, FALSE);
217 }
218
219 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)220 ucm_sortTable(UCMTable *t) {
221 UErrorCode errorCode;
222 int32_t i;
223
224 if(t->isSorted) {
225 return;
226 }
227
228 errorCode=U_ZERO_ERROR;
229
230 /* 1. sort by Unicode first */
231 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
232 compareMappingsUnicodeFirst, t,
233 FALSE, &errorCode);
234
235 /* build the reverseMap */
236 if(t->reverseMap==NULL) {
237 /*
238 * allocate mappingsCapacity instead of mappingsLength so that
239 * if mappings are added, the reverseMap need not be
240 * reallocated each time
241 * (see ucm_moveMappings() and ucm_addMapping())
242 */
243 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
244 if(t->reverseMap==NULL) {
245 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
246 exit(U_MEMORY_ALLOCATION_ERROR);
247 }
248 }
249 for(i=0; i<t->mappingsLength; ++i) {
250 t->reverseMap[i]=i;
251 }
252
253 /* 2. sort reverseMap by mappings bytes first */
254 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
255 compareMappingsBytesFirst, t,
256 FALSE, &errorCode);
257
258 if(U_FAILURE(errorCode)) {
259 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
260 u_errorName(errorCode));
261 exit(errorCode);
262 }
263
264 t->isSorted=TRUE;
265 }
266
267 /*
268 * remove mappings with their move flag set from the base table
269 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
270 */
271 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)272 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
273 UCMapping *mb, *mbLimit;
274 int8_t flag;
275
276 mb=base->mappings;
277 mbLimit=mb+base->mappingsLength;
278
279 while(mb<mbLimit) {
280 flag=mb->moveFlag;
281 if(flag!=0) {
282 /* reset the move flag */
283 mb->moveFlag=0;
284
285 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
286 /* add the mapping to the extension table */
287 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
288 }
289
290 /* remove this mapping: move the last base mapping down and overwrite the current one */
291 if(mb<(mbLimit-1)) {
292 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
293 }
294 --mbLimit;
295 --base->mappingsLength;
296 base->isSorted=FALSE;
297 } else {
298 ++mb;
299 }
300 }
301 }
302
/* result bits of checkBaseExtUnicode()/checkBaseExtBytes(); they are ORed together */
enum {
    NEEDS_MOVE=0x1,     /* at least one mapping got its moveFlag set */
    HAS_ERRORS=0x2      /* at least one error was reported */
};
307
308 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)309 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
310 UBool moveToExt, UBool intersectBase) {
311 UCMapping *mb, *me, *mbLimit, *meLimit;
312 int32_t cmp;
313 uint8_t result;
314
315 mb=base->mappings;
316 mbLimit=mb+base->mappingsLength;
317
318 me=ext->mappings;
319 meLimit=me+ext->mappingsLength;
320
321 result=0;
322
323 for(;;) {
324 /* skip irrelevant mappings on both sides */
325 for(;;) {
326 if(mb==mbLimit) {
327 return result;
328 }
329
330 if(0<=mb->f && mb->f<=2) {
331 break;
332 }
333
334 ++mb;
335 }
336
337 for(;;) {
338 if(me==meLimit) {
339 return result;
340 }
341
342 if(0<=me->f && me->f<=2) {
343 break;
344 }
345
346 ++me;
347 }
348
349 /* compare the base and extension mappings */
350 cmp=compareUnicode(base, mb, ext, me);
351 if(cmp<0) {
352 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
353 /*
354 * mapping in base but not in ext, move it
355 *
356 * if ext is DBCS, move DBCS mappings here
357 * and check SBCS ones for Unicode prefix below
358 */
359 mb->moveFlag|=UCM_MOVE_TO_EXT;
360 result|=NEEDS_MOVE;
361
362 /* does mb map from an input sequence that is a prefix of me's? */
363 } else if( mb->uLen<me->uLen &&
364 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
365 ) {
366 if(moveToExt) {
367 /* mark this mapping to be moved to the extension table */
368 mb->moveFlag|=UCM_MOVE_TO_EXT;
369 result|=NEEDS_MOVE;
370 } else {
371 fprintf(stderr,
372 "ucm error: the base table contains a mapping whose input sequence\n"
373 " is a prefix of the input sequence of an extension mapping\n");
374 ucm_printMapping(base, mb, stderr);
375 ucm_printMapping(ext, me, stderr);
376 result|=HAS_ERRORS;
377 }
378 }
379
380 ++mb;
381 } else if(cmp==0) {
382 /*
383 * same output: remove the extension mapping,
384 * otherwise treat as an error
385 */
386 if( mb->f==me->f && mb->bLen==me->bLen &&
387 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
388 ) {
389 me->moveFlag|=UCM_REMOVE_MAPPING;
390 result|=NEEDS_MOVE;
391 } else if(intersectBase) {
392 /* mapping in base but not in ext, move it */
393 mb->moveFlag|=UCM_MOVE_TO_EXT;
394 result|=NEEDS_MOVE;
395 } else {
396 fprintf(stderr,
397 "ucm error: the base table contains a mapping whose input sequence\n"
398 " is the same as the input sequence of an extension mapping\n"
399 " but it maps differently\n");
400 ucm_printMapping(base, mb, stderr);
401 ucm_printMapping(ext, me, stderr);
402 result|=HAS_ERRORS;
403 }
404
405 ++mb;
406 } else /* cmp>0 */ {
407 ++me;
408 }
409 }
410 }
411
412 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)413 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
414 UBool moveToExt, UBool intersectBase) {
415 UCMapping *mb, *me;
416 int32_t *baseMap, *extMap;
417 int32_t b, e, bLimit, eLimit, cmp;
418 uint8_t result;
419 UBool isSISO;
420
421 baseMap=base->reverseMap;
422 extMap=ext->reverseMap;
423
424 b=e=0;
425 bLimit=base->mappingsLength;
426 eLimit=ext->mappingsLength;
427
428 result=0;
429
430 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
431
432 for(;;) {
433 /* skip irrelevant mappings on both sides */
434 for(;; ++b) {
435 if(b==bLimit) {
436 return result;
437 }
438 mb=base->mappings+baseMap[b];
439
440 if(intersectBase==2 && mb->bLen==1) {
441 /*
442 * comparing a base against a DBCS extension:
443 * leave SBCS base mappings alone
444 */
445 continue;
446 }
447
448 if(mb->f==0 || mb->f==3) {
449 break;
450 }
451 }
452
453 for(;;) {
454 if(e==eLimit) {
455 return result;
456 }
457 me=ext->mappings+extMap[e];
458
459 if(me->f==0 || me->f==3) {
460 break;
461 }
462
463 ++e;
464 }
465
466 /* compare the base and extension mappings */
467 cmp=compareBytes(base, mb, ext, me, TRUE);
468 if(cmp<0) {
469 if(intersectBase) {
470 /* mapping in base but not in ext, move it */
471 mb->moveFlag|=UCM_MOVE_TO_EXT;
472 result|=NEEDS_MOVE;
473
474 /*
475 * does mb map from an input sequence that is a prefix of me's?
476 * for SI/SO tables, a single byte is never a prefix because it
477 * occurs in a separate single-byte state
478 */
479 } else if( mb->bLen<me->bLen &&
480 (!isSISO || mb->bLen>1) &&
481 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
482 ) {
483 if(moveToExt) {
484 /* mark this mapping to be moved to the extension table */
485 mb->moveFlag|=UCM_MOVE_TO_EXT;
486 result|=NEEDS_MOVE;
487 } else {
488 fprintf(stderr,
489 "ucm error: the base table contains a mapping whose input sequence\n"
490 " is a prefix of the input sequence of an extension mapping\n");
491 ucm_printMapping(base, mb, stderr);
492 ucm_printMapping(ext, me, stderr);
493 result|=HAS_ERRORS;
494 }
495 }
496
497 ++b;
498 } else if(cmp==0) {
499 /*
500 * same output: remove the extension mapping,
501 * otherwise treat as an error
502 */
503 if( mb->f==me->f && mb->uLen==me->uLen &&
504 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
505 ) {
506 me->moveFlag|=UCM_REMOVE_MAPPING;
507 result|=NEEDS_MOVE;
508 } else if(intersectBase) {
509 /* mapping in base but not in ext, move it */
510 mb->moveFlag|=UCM_MOVE_TO_EXT;
511 result|=NEEDS_MOVE;
512 } else {
513 fprintf(stderr,
514 "ucm error: the base table contains a mapping whose input sequence\n"
515 " is the same as the input sequence of an extension mapping\n"
516 " but it maps differently\n");
517 ucm_printMapping(base, mb, stderr);
518 ucm_printMapping(ext, me, stderr);
519 result|=HAS_ERRORS;
520 }
521
522 ++b;
523 } else /* cmp>0 */ {
524 ++e;
525 }
526 }
527 }
528
529 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)530 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
531 UCMapping *m, *mLimit;
532 int32_t count;
533 UBool isOK;
534
535 m=table->mappings;
536 mLimit=m+table->mappingsLength;
537 isOK=TRUE;
538
539 while(m<mLimit) {
540 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
541 if(count<1) {
542 ucm_printMapping(table, m, stderr);
543 isOK=FALSE;
544 }
545 ++m;
546 }
547
548 return isOK;
549 }
550
551 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)552 ucm_checkBaseExt(UCMStates *baseStates,
553 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
554 UBool intersectBase) {
555 uint8_t result;
556
557 /* if we have an extension table, we must always use precision flags */
558 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
559 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
560 return FALSE;
561 }
562 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
563 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
564 return FALSE;
565 }
566
567 /* checking requires both tables to be sorted */
568 ucm_sortTable(base);
569 ucm_sortTable(ext);
570
571 /* check */
572 result=
573 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
574 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
575
576 if(result&HAS_ERRORS) {
577 return FALSE;
578 }
579
580 if(result&NEEDS_MOVE) {
581 ucm_moveMappings(ext, NULL);
582 ucm_moveMappings(base, moveTarget);
583 ucm_sortTable(base);
584 ucm_sortTable(ext);
585 if(moveTarget!=NULL) {
586 ucm_sortTable(moveTarget);
587 }
588 }
589
590 return TRUE;
591 }
592
593 /* merge tables for rptp2ucm ------------------------------------------------ */
594
595 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)596 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
597 const uint8_t *subchar, int32_t subcharLength,
598 uint8_t subchar1) {
599 UCMapping *fromUMapping, *toUMapping;
600 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
601
602 ucm_sortTable(fromUTable);
603 ucm_sortTable(toUTable);
604
605 fromUMapping=fromUTable->mappings;
606 toUMapping=toUTable->mappings;
607
608 fromUTop=fromUTable->mappingsLength;
609 toUTop=toUTable->mappingsLength;
610
611 fromUIndex=toUIndex=0;
612
613 while(fromUIndex<fromUTop && toUIndex<toUTop) {
614 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
615 if(cmp==0) {
616 /* equal: roundtrip, nothing to do (flags are initially 0) */
617 ++fromUMapping;
618 ++toUMapping;
619
620 ++fromUIndex;
621 ++toUIndex;
622 } else if(cmp<0) {
623 /*
624 * the fromU mapping does not have a toU counterpart:
625 * fallback Unicode->codepage
626 */
627 if( (fromUMapping->bLen==subcharLength &&
628 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
629 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
630 ) {
631 fromUMapping->f=2; /* SUB mapping */
632 } else {
633 fromUMapping->f=1; /* normal fallback */
634 }
635
636 ++fromUMapping;
637 ++fromUIndex;
638 } else {
639 /*
640 * the toU mapping does not have a fromU counterpart:
641 * (reverse) fallback codepage->Unicode, copy it to the fromU table
642 */
643
644 /* ignore reverse fallbacks to Unicode SUB */
645 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
646 toUMapping->f=3; /* reverse fallback */
647 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
648
649 /* the table may have been reallocated */
650 fromUMapping=fromUTable->mappings+fromUIndex;
651 }
652
653 ++toUMapping;
654 ++toUIndex;
655 }
656 }
657
658 /* either one or both tables are exhausted */
659 while(fromUIndex<fromUTop) {
660 /* leftover fromU mappings are fallbacks */
661 if( (fromUMapping->bLen==subcharLength &&
662 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
663 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
664 ) {
665 fromUMapping->f=2; /* SUB mapping */
666 } else {
667 fromUMapping->f=1; /* normal fallback */
668 }
669
670 ++fromUMapping;
671 ++fromUIndex;
672 }
673
674 while(toUIndex<toUTop) {
675 /* leftover toU mappings are reverse fallbacks */
676
677 /* ignore reverse fallbacks to Unicode SUB */
678 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
679 toUMapping->f=3; /* reverse fallback */
680 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
681 }
682
683 ++toUMapping;
684 ++toUIndex;
685 }
686
687 fromUTable->isSorted=FALSE;
688 }
689
690 /* separate extension mappings out of base table for rptp2ucm --------------- */
691
692 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)693 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
694 UCMTable *table;
695 UCMapping *m, *mLimit;
696 int32_t type;
697 UBool needsMove, isOK;
698
699 table=ucm->base;
700 m=table->mappings;
701 mLimit=m+table->mappingsLength;
702
703 needsMove=FALSE;
704 isOK=TRUE;
705
706 for(; m<mLimit; ++m) {
707 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
708 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
709 ucm_printMapping(table, m, stderr);
710 m->moveFlag|=UCM_REMOVE_MAPPING;
711 needsMove=TRUE;
712 continue;
713 }
714
715 type=ucm_mappingType(
716 &ucm->states, m,
717 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
718 if(type<0) {
719 /* illegal byte sequence */
720 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
721 isOK=FALSE;
722 } else if(type>0) {
723 m->moveFlag|=UCM_MOVE_TO_EXT;
724 needsMove=TRUE;
725 }
726 }
727
728 if(!isOK) {
729 return FALSE;
730 }
731 if(needsMove) {
732 ucm_moveMappings(ucm->base, ucm->ext);
733 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
734 } else {
735 ucm_sortTable(ucm->base);
736 return TRUE;
737 }
738 }
739
740 /* ucm parser --------------------------------------------------------------- */
741
742 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)743 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
744 const char *s=*ps;
745 char *end;
746 uint8_t byte;
747 int8_t bLen;
748
749 bLen=0;
750 for(;;) {
751 /* skip an optional plus sign */
752 if(bLen>0 && *s=='+') {
753 ++s;
754 }
755 if(*s!='\\') {
756 break;
757 }
758
759 if( s[1]!='x' ||
760 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
761 ) {
762 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
763 return -1;
764 }
765
766 if(bLen==UCNV_EXT_MAX_BYTES) {
767 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
768 return -1;
769 }
770 bytes[bLen++]=byte;
771 s=end;
772 }
773
774 *ps=s;
775 return bLen;
776 }
777
778 /* parse a mapping line; must not be empty */
779 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)780 ucm_parseMappingLine(UCMapping *m,
781 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
782 uint8_t bytes[UCNV_EXT_MAX_BYTES],
783 const char *line) {
784 const char *s;
785 char *end;
786 UChar32 cp;
787 int32_t u16Length;
788 int8_t uLen, bLen, f;
789
790 s=line;
791 uLen=bLen=0;
792
793 /* parse code points */
794 for(;;) {
795 /* skip an optional plus sign */
796 if(uLen>0 && *s=='+') {
797 ++s;
798 }
799 if(*s!='<') {
800 break;
801 }
802
803 if( s[1]!='U' ||
804 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
805 *end!='>'
806 ) {
807 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
808 return FALSE;
809 }
810 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
811 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
812 return FALSE;
813 }
814
815 if(uLen==UCNV_EXT_MAX_UCHARS) {
816 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
817 return FALSE;
818 }
819 codePoints[uLen++]=cp;
820 s=end+1;
821 }
822
823 if(uLen==0) {
824 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
825 return FALSE;
826 } else if(uLen==1) {
827 m->u=codePoints[0];
828 } else {
829 UErrorCode errorCode=U_ZERO_ERROR;
830 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
831 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
832 u16Length>UCNV_EXT_MAX_UCHARS
833 ) {
834 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
835 return FALSE;
836 }
837 }
838
839 s=u_skipWhitespace(s);
840
841 /* parse bytes */
842 bLen=ucm_parseBytes(bytes, line, &s);
843
844 if(bLen<0) {
845 return FALSE;
846 } else if(bLen==0) {
847 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
848 return FALSE;
849 } else if(bLen<=4) {
850 uprv_memcpy(m->b.bytes, bytes, bLen);
851 }
852
853 /* skip everything until the fallback indicator, even the start of a comment */
854 for(;;) {
855 if(*s==0) {
856 f=-1; /* no fallback indicator */
857 break;
858 } else if(*s=='|') {
859 f=(int8_t)(s[1]-'0');
860 if((uint8_t)f>3) {
861 fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
862 return FALSE;
863 }
864 break;
865 }
866 ++s;
867 }
868
869 m->uLen=uLen;
870 m->bLen=bLen;
871 m->f=f;
872 return TRUE;
873 }
874
875 /* general APIs ------------------------------------------------------------- */
876
877 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()878 ucm_openTable() {
879 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
880 if(table==NULL) {
881 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
882 exit(U_MEMORY_ALLOCATION_ERROR);
883 }
884
885 memset(table, 0, sizeof(UCMTable));
886 return table;
887 }
888
889 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)890 ucm_closeTable(UCMTable *table) {
891 if(table!=NULL) {
892 uprv_free(table->mappings);
893 uprv_free(table->codePoints);
894 uprv_free(table->bytes);
895 uprv_free(table->reverseMap);
896 uprv_free(table);
897 }
898 }
899
900 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)901 ucm_resetTable(UCMTable *table) {
902 if(table!=NULL) {
903 table->mappingsLength=0;
904 table->flagsType=0;
905 table->unicodeMask=0;
906 table->bytesLength=table->codePointsLength=0;
907 table->isSorted=FALSE;
908 }
909 }
910
911 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])912 ucm_addMapping(UCMTable *table,
913 UCMapping *m,
914 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
915 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
916 UCMapping *tm;
917 UChar32 c;
918 int32_t index;
919
920 if(table->mappingsLength>=table->mappingsCapacity) {
921 /* make the mappings array larger */
922 if(table->mappingsCapacity==0) {
923 table->mappingsCapacity=1000;
924 } else {
925 table->mappingsCapacity*=10;
926 }
927 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
928 table->mappingsCapacity*sizeof(UCMapping));
929 if(table->mappings==NULL) {
930 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
931 (int)table->mappingsCapacity);
932 exit(U_MEMORY_ALLOCATION_ERROR);
933 }
934
935 if(table->reverseMap!=NULL) {
936 /* the reverseMap must be reallocated in a new sort */
937 uprv_free(table->reverseMap);
938 table->reverseMap=NULL;
939 }
940 }
941
942 if(m->uLen>1 && table->codePointsCapacity==0) {
943 table->codePointsCapacity=10000;
944 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
945 if(table->codePoints==NULL) {
946 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
947 (int)table->codePointsCapacity);
948 exit(U_MEMORY_ALLOCATION_ERROR);
949 }
950 }
951
952 if(m->bLen>4 && table->bytesCapacity==0) {
953 table->bytesCapacity=10000;
954 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
955 if(table->bytes==NULL) {
956 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
957 (int)table->bytesCapacity);
958 exit(U_MEMORY_ALLOCATION_ERROR);
959 }
960 }
961
962 if(m->uLen>1) {
963 index=table->codePointsLength;
964 table->codePointsLength+=m->uLen;
965 if(table->codePointsLength>table->codePointsCapacity) {
966 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
967 exit(U_MEMORY_ALLOCATION_ERROR);
968 }
969
970 uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4);
971 m->u=index;
972 }
973
974 if(m->bLen>4) {
975 index=table->bytesLength;
976 table->bytesLength+=m->bLen;
977 if(table->bytesLength>table->bytesCapacity) {
978 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
979 exit(U_MEMORY_ALLOCATION_ERROR);
980 }
981
982 uprv_memcpy(table->bytes+index, bytes, m->bLen);
983 m->b.index=index;
984 }
985
986 /* set unicodeMask */
987 for(index=0; index<m->uLen; ++index) {
988 c=codePoints[index];
989 if(c>=0x10000) {
990 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
991 } else if(U_IS_SURROGATE(c)) {
992 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
993 }
994 }
995
996 /* set flagsType */
997 if(m->f<0) {
998 table->flagsType|=UCM_FLAGS_IMPLICIT;
999 } else {
1000 table->flagsType|=UCM_FLAGS_EXPLICIT;
1001 }
1002
1003 tm=table->mappings+table->mappingsLength++;
1004 uprv_memcpy(tm, m, sizeof(UCMapping));
1005
1006 table->isSorted=FALSE;
1007 }
1008
1009 U_CAPI UCMFile * U_EXPORT2
ucm_open()1010 ucm_open() {
1011 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1012 if(ucm==NULL) {
1013 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1014 exit(U_MEMORY_ALLOCATION_ERROR);
1015 }
1016
1017 memset(ucm, 0, sizeof(UCMFile));
1018
1019 ucm->base=ucm_openTable();
1020 ucm->ext=ucm_openTable();
1021
1022 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1023 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1024 ucm->states.outputType=-1;
1025 ucm->states.minCharLength=ucm->states.maxCharLength=1;
1026
1027 return ucm;
1028 }
1029
1030 U_CAPI void U_EXPORT2
ucm_close(UCMFile * ucm)1031 ucm_close(UCMFile *ucm) {
1032 if(ucm!=NULL) {
1033 uprv_free(ucm->base);
1034 uprv_free(ucm->ext);
1035 uprv_free(ucm);
1036 }
1037 }
1038
1039 U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1040 ucm_mappingType(UCMStates *baseStates,
1041 UCMapping *m,
1042 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1043 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1044 /* check validity of the bytes and count the characters in them */
1045 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1046 if(count<1) {
1047 /* illegal byte sequence */
1048 return -1;
1049 }
1050
1051 /*
1052 * Suitable for an ICU conversion base table means:
1053 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1054 * - SBCS: any 1:1 mapping
1055 * (the table stores additional bits to distinguish mapping types)
1056 * - MBCS: not a |2 SUB mapping for <subchar1>
1057 * - MBCS: not a |1 fallback to 0x00
1058 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1059 *
1060 * Further restrictions for fromUnicode tables
1061 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1062 *
1063 * All of the MBCS fromUnicode specific tests could be removed from here,
1064 * but the ones above are for unusual mappings, and removing the tests
1065 * from here would change canonucm output which seems gratuitous.
1066 * (Markus Scherer 2006-nov-28)
1067 *
1068 * Exception: All implicit mappings (f<0) that need to be moved
1069 * because of fromUnicode restrictions _must_ be moved here because
1070 * makeconv uses a hack for moving mappings only for the fromUnicode table
1071 * that only works with non-negative values of f.
1072 */
1073 if( m->uLen==1 && count==1 &&
1074 (baseStates->maxCharLength==1 ||
1075 !((m->f==2 && m->bLen==1) ||
1076 (m->f==1 && bytes[0]==0) ||
1077 (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1078 ) {
1079 return 0; /* suitable for a base table */
1080 } else {
1081 return 1; /* needs to go into an extension table */
1082 }
1083 }
1084
1085 U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile * ucm,UBool forBase,UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1086 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1087 UCMapping *m,
1088 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1089 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1090 int32_t type;
1091
1092 if(m->f==2 && m->uLen>1) {
1093 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1094 printMapping(m, codePoints, bytes, stderr);
1095 return FALSE;
1096 }
1097
1098 if(baseStates!=NULL) {
1099 /* check validity of the bytes and count the characters in them */
1100 type=ucm_mappingType(baseStates, m, codePoints, bytes);
1101 if(type<0) {
1102 /* illegal byte sequence */
1103 printMapping(m, codePoints, bytes, stderr);
1104 return FALSE;
1105 }
1106 } else {
1107 /* not used - adding a mapping for an extension-only table before its base table is read */
1108 type=1;
1109 }
1110
1111 /*
1112 * Add the mapping to the base table if this is requested and suitable.
1113 * Otherwise, add it to the extension table.
1114 */
1115 if(forBase && type==0) {
1116 ucm_addMapping(ucm->base, m, codePoints, bytes);
1117 } else {
1118 ucm_addMapping(ucm->ext, m, codePoints, bytes);
1119 }
1120
1121 return TRUE;
1122 }
1123
1124 U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile * ucm,const char * line,UBool forBase,UCMStates * baseStates)1125 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1126 UCMapping m={ 0 };
1127 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1128 uint8_t bytes[UCNV_EXT_MAX_BYTES];
1129
1130 const char *s;
1131
1132 /* ignore empty and comment lines */
1133 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1134 return TRUE;
1135 }
1136
1137 return
1138 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1139 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1140 }
1141
1142 U_CAPI void U_EXPORT2
ucm_readTable(UCMFile * ucm,FileStream * convFile,UBool forBase,UCMStates * baseStates,UErrorCode * pErrorCode)1143 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1144 UBool forBase, UCMStates *baseStates,
1145 UErrorCode *pErrorCode) {
1146 char line[500];
1147 char *end;
1148 UBool isOK;
1149
1150 if(U_FAILURE(*pErrorCode)) {
1151 return;
1152 }
1153
1154 isOK=TRUE;
1155
1156 for(;;) {
1157 /* read the next line */
1158 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1159 fprintf(stderr, "incomplete charmap section\n");
1160 isOK=FALSE;
1161 break;
1162 }
1163
1164 /* remove CR LF */
1165 end=uprv_strchr(line, 0);
1166 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1167 --end;
1168 }
1169 *end=0;
1170
1171 /* ignore empty and comment lines */
1172 if(line[0]==0 || line[0]=='#') {
1173 continue;
1174 }
1175
1176 /* stop at the end of the mapping table */
1177 if(0==uprv_strcmp(line, "END CHARMAP")) {
1178 break;
1179 }
1180
1181 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1182 }
1183
1184 if(!isOK) {
1185 *pErrorCode=U_INVALID_TABLE_FORMAT;
1186 }
1187 }
1188 #endif
1189