1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucm.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003jun20
16 * created by: Markus W. Scherer
17 *
18 * This file reads a .ucm file, stores its mappings and sorts them.
19 * It implements handling of Unicode conversion mappings from .ucm files
20 * for makeconv, canonucm, rptp2ucm, etc.
21 *
22 * Unicode code point sequences with a length of more than 1,
23 * as well as byte sequences with more than 4 bytes or more than one complete
24 * character sequence are handled to support m:n mappings.
25 */
26
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "filestrm.h"
32 #include "uarrsort.h"
33 #include "ucnvmbcs.h"
34 #include "ucnv_bld.h"
35 #include "ucnv_ext.h"
36 #include "uparse.h"
37 #include "ucm.h"
38 #include <stdio.h>
39
40 #if !UCONFIG_NO_CONVERSION
41
42 /* -------------------------------------------------------------------------- */
43
44 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
46 int32_t j;
47
48 for(j=0; j<m->uLen; ++j) {
49 fprintf(f, "<U%04lX>", (long)codePoints[j]);
50 }
51
52 fputc(' ', f);
53
54 for(j=0; j<m->bLen; ++j) {
55 fprintf(f, "\\x%02X", bytes[j]);
56 }
57
58 if(m->f>=0) {
59 fprintf(f, " |%u\n", m->f);
60 } else {
61 fputs("\n", f);
62 }
63 }
64
65 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
67 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
68 }
69
70 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
72 UCMapping *m;
73 int32_t i, length;
74
75 m=table->mappings;
76 length=table->mappingsLength;
77 if(byUnicode) {
78 for(i=0; i<length; ++m, ++i) {
79 ucm_printMapping(table, m, f);
80 }
81 } else {
82 const int32_t *map=table->reverseMap;
83 for(i=0; i<length; ++i) {
84 ucm_printMapping(table, m+map[i], f);
85 }
86 }
87 }
88
89 /* mapping comparisons ------------------------------------------------------ */
90
91 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)92 compareUnicode(UCMTable *lTable, const UCMapping *l,
93 UCMTable *rTable, const UCMapping *r) {
94 const UChar32 *lu, *ru;
95 int32_t result, i, length;
96
97 if(l->uLen==1 && r->uLen==1) {
98 /* compare two single code points */
99 return l->u-r->u;
100 }
101
102 /* get pointers to the code point sequences */
103 lu=UCM_GET_CODE_POINTS(lTable, l);
104 ru=UCM_GET_CODE_POINTS(rTable, r);
105
106 /* get the minimum length */
107 if(l->uLen<=r->uLen) {
108 length=l->uLen;
109 } else {
110 length=r->uLen;
111 }
112
113 /* compare the code points */
114 for(i=0; i<length; ++i) {
115 result=lu[i]-ru[i];
116 if(result!=0) {
117 return result;
118 }
119 }
120
121 /* compare the lengths */
122 return l->uLen-r->uLen;
123 }
124
125 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)126 compareBytes(UCMTable *lTable, const UCMapping *l,
127 UCMTable *rTable, const UCMapping *r,
128 UBool lexical) {
129 const uint8_t *lb, *rb;
130 int32_t result, i, length;
131
132 /*
133 * A lexical comparison is used for sorting in the builder, to allow
134 * an efficient search for a byte sequence that could be a prefix
135 * of a previously entered byte sequence.
136 *
137 * Comparing by lengths first is for compatibility with old .ucm tools
138 * like canonucm and rptp2ucm.
139 */
140 if(lexical) {
141 /* get the minimum length and continue */
142 if(l->bLen<=r->bLen) {
143 length=l->bLen;
144 } else {
145 length=r->bLen;
146 }
147 } else {
148 /* compare lengths first */
149 result=l->bLen-r->bLen;
150 if(result!=0) {
151 return result;
152 } else {
153 length=l->bLen;
154 }
155 }
156
157 /* get pointers to the byte sequences */
158 lb=UCM_GET_BYTES(lTable, l);
159 rb=UCM_GET_BYTES(rTable, r);
160
161 /* compare the bytes */
162 for(i=0; i<length; ++i) {
163 result=lb[i]-rb[i];
164 if(result!=0) {
165 return result;
166 }
167 }
168
169 /* compare the lengths */
170 return l->bLen-r->bLen;
171 }
172
173 /* compare UCMappings for sorting */
174 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)175 compareMappings(UCMTable *lTable, const UCMapping *l,
176 UCMTable *rTable, const UCMapping *r,
177 UBool uFirst) {
178 int32_t result;
179
180 /* choose which side to compare first */
181 if(uFirst) {
182 /* Unicode then bytes */
183 result=compareUnicode(lTable, l, rTable, r);
184 if(result==0) {
185 result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */
186 }
187 } else {
188 /* bytes then Unicode */
189 result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */
190 if(result==0) {
191 result=compareUnicode(lTable, l, rTable, r);
192 }
193 }
194
195 if(result!=0) {
196 return result;
197 }
198
199 /* compare the flags */
200 return l->f-r->f;
201 }
202 U_CDECL_BEGIN
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t U_CALLCONV
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206 return compareMappings(
207 (UCMTable *)context, (const UCMapping *)left,
208 (UCMTable *)context, (const UCMapping *)right, true);
209 }
210
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t U_CALLCONV
compareMappingsBytesFirst(const void * context,const void * left,const void * right)213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
214 UCMTable *table=(UCMTable *)context;
215 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216 return compareMappings(
217 table, table->mappings+l,
218 table, table->mappings+r, false);
219 }
220 U_CDECL_END
221
222 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)223 ucm_sortTable(UCMTable *t) {
224 UErrorCode errorCode;
225 int32_t i;
226
227 if(t->isSorted) {
228 return;
229 }
230
231 errorCode=U_ZERO_ERROR;
232
233 /* 1. sort by Unicode first */
234 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
235 compareMappingsUnicodeFirst, t,
236 false, &errorCode);
237
238 /* build the reverseMap */
239 if(t->reverseMap==NULL) {
240 /*
241 * allocate mappingsCapacity instead of mappingsLength so that
242 * if mappings are added, the reverseMap need not be
243 * reallocated each time
244 * (see ucm_moveMappings() and ucm_addMapping())
245 */
246 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
247 if(t->reverseMap==NULL) {
248 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
249 exit(U_MEMORY_ALLOCATION_ERROR);
250 }
251 }
252 for(i=0; i<t->mappingsLength; ++i) {
253 t->reverseMap[i]=i;
254 }
255
256 /* 2. sort reverseMap by mappings bytes first */
257 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
258 compareMappingsBytesFirst, t,
259 false, &errorCode);
260
261 if(U_FAILURE(errorCode)) {
262 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
263 u_errorName(errorCode));
264 exit(errorCode);
265 }
266
267 t->isSorted=true;
268 }
269
270 /*
271 * remove mappings with their move flag set from the base table
272 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
273 */
274 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)275 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
276 UCMapping *mb, *mbLimit;
277 int8_t flag;
278
279 mb=base->mappings;
280 mbLimit=mb+base->mappingsLength;
281
282 while(mb<mbLimit) {
283 flag=mb->moveFlag;
284 if(flag!=0) {
285 /* reset the move flag */
286 mb->moveFlag=0;
287
288 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
289 /* add the mapping to the extension table */
290 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
291 }
292
293 /* remove this mapping: move the last base mapping down and overwrite the current one */
294 if(mb<(mbLimit-1)) {
295 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
296 }
297 --mbLimit;
298 --base->mappingsLength;
299 base->isSorted=false;
300 } else {
301 ++mb;
302 }
303 }
304 }
305
/* result bits of checkBaseExtUnicode() and checkBaseExtBytes(); they can be combined */
enum {
    /* at least one mapping was flagged to be moved or removed */
    NEEDS_MOVE=0x1,
    /* at least one error was reported */
    HAS_ERRORS=0x2
};
310
311 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)312 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
313 UBool moveToExt, UBool intersectBase) {
314 (void)baseStates;
315
316 UCMapping *mb, *me, *mbLimit, *meLimit;
317 int32_t cmp;
318 uint8_t result;
319
320 mb=base->mappings;
321 mbLimit=mb+base->mappingsLength;
322
323 me=ext->mappings;
324 meLimit=me+ext->mappingsLength;
325
326 result=0;
327
328 for(;;) {
329 /* skip irrelevant mappings on both sides */
330 for(;;) {
331 if(mb==mbLimit) {
332 return result;
333 }
334
335 if((0<=mb->f && mb->f<=2) || mb->f==4) {
336 break;
337 }
338
339 ++mb;
340 }
341
342 for(;;) {
343 if(me==meLimit) {
344 return result;
345 }
346
347 if((0<=me->f && me->f<=2) || me->f==4) {
348 break;
349 }
350
351 ++me;
352 }
353
354 /* compare the base and extension mappings */
355 cmp=compareUnicode(base, mb, ext, me);
356 if(cmp<0) {
357 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
358 /*
359 * mapping in base but not in ext, move it
360 *
361 * if ext is DBCS, move DBCS mappings here
362 * and check SBCS ones for Unicode prefix below
363 */
364 mb->moveFlag|=UCM_MOVE_TO_EXT;
365 result|=NEEDS_MOVE;
366
367 /* does mb map from an input sequence that is a prefix of me's? */
368 } else if( mb->uLen<me->uLen &&
369 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
370 ) {
371 if(moveToExt) {
372 /* mark this mapping to be moved to the extension table */
373 mb->moveFlag|=UCM_MOVE_TO_EXT;
374 result|=NEEDS_MOVE;
375 } else {
376 fprintf(stderr,
377 "ucm error: the base table contains a mapping whose input sequence\n"
378 " is a prefix of the input sequence of an extension mapping\n");
379 ucm_printMapping(base, mb, stderr);
380 ucm_printMapping(ext, me, stderr);
381 result|=HAS_ERRORS;
382 }
383 }
384
385 ++mb;
386 } else if(cmp==0) {
387 /*
388 * same output: remove the extension mapping,
389 * otherwise treat as an error
390 */
391 if( mb->f==me->f && mb->bLen==me->bLen &&
392 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
393 ) {
394 me->moveFlag|=UCM_REMOVE_MAPPING;
395 result|=NEEDS_MOVE;
396 } else if(intersectBase) {
397 /* mapping in base but not in ext, move it */
398 mb->moveFlag|=UCM_MOVE_TO_EXT;
399 result|=NEEDS_MOVE;
400 } else {
401 fprintf(stderr,
402 "ucm error: the base table contains a mapping whose input sequence\n"
403 " is the same as the input sequence of an extension mapping\n"
404 " but it maps differently\n");
405 ucm_printMapping(base, mb, stderr);
406 ucm_printMapping(ext, me, stderr);
407 result|=HAS_ERRORS;
408 }
409
410 ++mb;
411 } else /* cmp>0 */ {
412 ++me;
413 }
414 }
415 }
416
417 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)418 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
419 UBool moveToExt, UBool intersectBase) {
420 UCMapping *mb, *me;
421 int32_t *baseMap, *extMap;
422 int32_t b, e, bLimit, eLimit, cmp;
423 uint8_t result;
424 UBool isSISO;
425
426 baseMap=base->reverseMap;
427 extMap=ext->reverseMap;
428
429 b=e=0;
430 bLimit=base->mappingsLength;
431 eLimit=ext->mappingsLength;
432
433 result=0;
434
435 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
436
437 for(;;) {
438 /* skip irrelevant mappings on both sides */
439 for(;; ++b) {
440 if(b==bLimit) {
441 return result;
442 }
443 mb=base->mappings+baseMap[b];
444
445 if(intersectBase==2 && mb->bLen==1) {
446 /*
447 * comparing a base against a DBCS extension:
448 * leave SBCS base mappings alone
449 */
450 continue;
451 }
452
453 if(mb->f==0 || mb->f==3) {
454 break;
455 }
456 }
457
458 for(;;) {
459 if(e==eLimit) {
460 return result;
461 }
462 me=ext->mappings+extMap[e];
463
464 if(me->f==0 || me->f==3) {
465 break;
466 }
467
468 ++e;
469 }
470
471 /* compare the base and extension mappings */
472 cmp=compareBytes(base, mb, ext, me, true);
473 if(cmp<0) {
474 if(intersectBase) {
475 /* mapping in base but not in ext, move it */
476 mb->moveFlag|=UCM_MOVE_TO_EXT;
477 result|=NEEDS_MOVE;
478
479 /*
480 * does mb map from an input sequence that is a prefix of me's?
481 * for SI/SO tables, a single byte is never a prefix because it
482 * occurs in a separate single-byte state
483 */
484 } else if( mb->bLen<me->bLen &&
485 (!isSISO || mb->bLen>1) &&
486 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
487 ) {
488 if(moveToExt) {
489 /* mark this mapping to be moved to the extension table */
490 mb->moveFlag|=UCM_MOVE_TO_EXT;
491 result|=NEEDS_MOVE;
492 } else {
493 fprintf(stderr,
494 "ucm error: the base table contains a mapping whose input sequence\n"
495 " is a prefix of the input sequence of an extension mapping\n");
496 ucm_printMapping(base, mb, stderr);
497 ucm_printMapping(ext, me, stderr);
498 result|=HAS_ERRORS;
499 }
500 }
501
502 ++b;
503 } else if(cmp==0) {
504 /*
505 * same output: remove the extension mapping,
506 * otherwise treat as an error
507 */
508 if( mb->f==me->f && mb->uLen==me->uLen &&
509 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
510 ) {
511 me->moveFlag|=UCM_REMOVE_MAPPING;
512 result|=NEEDS_MOVE;
513 } else if(intersectBase) {
514 /* mapping in base but not in ext, move it */
515 mb->moveFlag|=UCM_MOVE_TO_EXT;
516 result|=NEEDS_MOVE;
517 } else {
518 fprintf(stderr,
519 "ucm error: the base table contains a mapping whose input sequence\n"
520 " is the same as the input sequence of an extension mapping\n"
521 " but it maps differently\n");
522 ucm_printMapping(base, mb, stderr);
523 ucm_printMapping(ext, me, stderr);
524 result|=HAS_ERRORS;
525 }
526
527 ++b;
528 } else /* cmp>0 */ {
529 ++e;
530 }
531 }
532 }
533
534 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)535 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
536 UCMapping *m, *mLimit;
537 int32_t count;
538 UBool isOK;
539
540 m=table->mappings;
541 mLimit=m+table->mappingsLength;
542 isOK=true;
543
544 while(m<mLimit) {
545 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
546 if(count<1) {
547 ucm_printMapping(table, m, stderr);
548 isOK=false;
549 }
550 ++m;
551 }
552
553 return isOK;
554 }
555
556 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)557 ucm_checkBaseExt(UCMStates *baseStates,
558 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
559 UBool intersectBase) {
560 uint8_t result;
561
562 /* if we have an extension table, we must always use precision flags */
563 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
564 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
565 return false;
566 }
567 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
568 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
569 return false;
570 }
571
572 /* checking requires both tables to be sorted */
573 ucm_sortTable(base);
574 ucm_sortTable(ext);
575
576 /* check */
577 result=
578 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
579 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
580
581 if(result&HAS_ERRORS) {
582 return false;
583 }
584
585 if(result&NEEDS_MOVE) {
586 ucm_moveMappings(ext, NULL);
587 ucm_moveMappings(base, moveTarget);
588 ucm_sortTable(base);
589 ucm_sortTable(ext);
590 if(moveTarget!=NULL) {
591 ucm_sortTable(moveTarget);
592 }
593 }
594
595 return true;
596 }
597
598 /* merge tables for rptp2ucm ------------------------------------------------ */
599
600 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)601 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
602 const uint8_t *subchar, int32_t subcharLength,
603 uint8_t subchar1) {
604 UCMapping *fromUMapping, *toUMapping;
605 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
606
607 ucm_sortTable(fromUTable);
608 ucm_sortTable(toUTable);
609
610 fromUMapping=fromUTable->mappings;
611 toUMapping=toUTable->mappings;
612
613 fromUTop=fromUTable->mappingsLength;
614 toUTop=toUTable->mappingsLength;
615
616 fromUIndex=toUIndex=0;
617
618 while(fromUIndex<fromUTop && toUIndex<toUTop) {
619 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true);
620 if(cmp==0) {
621 /* equal: roundtrip, nothing to do (flags are initially 0) */
622 ++fromUMapping;
623 ++toUMapping;
624
625 ++fromUIndex;
626 ++toUIndex;
627 } else if(cmp<0) {
628 /*
629 * the fromU mapping does not have a toU counterpart:
630 * fallback Unicode->codepage
631 */
632 if( (fromUMapping->bLen==subcharLength &&
633 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
634 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
635 ) {
636 fromUMapping->f=2; /* SUB mapping */
637 } else {
638 fromUMapping->f=1; /* normal fallback */
639 }
640
641 ++fromUMapping;
642 ++fromUIndex;
643 } else {
644 /*
645 * the toU mapping does not have a fromU counterpart:
646 * (reverse) fallback codepage->Unicode, copy it to the fromU table
647 */
648
649 /* ignore reverse fallbacks to Unicode SUB */
650 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
651 toUMapping->f=3; /* reverse fallback */
652 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
653
654 /* the table may have been reallocated */
655 fromUMapping=fromUTable->mappings+fromUIndex;
656 }
657
658 ++toUMapping;
659 ++toUIndex;
660 }
661 }
662
663 /* either one or both tables are exhausted */
664 while(fromUIndex<fromUTop) {
665 /* leftover fromU mappings are fallbacks */
666 if( (fromUMapping->bLen==subcharLength &&
667 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
668 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
669 ) {
670 fromUMapping->f=2; /* SUB mapping */
671 } else {
672 fromUMapping->f=1; /* normal fallback */
673 }
674
675 ++fromUMapping;
676 ++fromUIndex;
677 }
678
679 while(toUIndex<toUTop) {
680 /* leftover toU mappings are reverse fallbacks */
681
682 /* ignore reverse fallbacks to Unicode SUB */
683 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
684 toUMapping->f=3; /* reverse fallback */
685 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
686 }
687
688 ++toUMapping;
689 ++toUIndex;
690 }
691
692 fromUTable->isSorted=false;
693 }
694
695 /* separate extension mappings out of base table for rptp2ucm --------------- */
696
697 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)698 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
699 UCMTable *table;
700 UCMapping *m, *mLimit;
701 int32_t type;
702 UBool needsMove, isOK;
703
704 table=ucm->base;
705 m=table->mappings;
706 mLimit=m+table->mappingsLength;
707
708 needsMove=false;
709 isOK=true;
710
711 for(; m<mLimit; ++m) {
712 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
713 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
714 ucm_printMapping(table, m, stderr);
715 m->moveFlag|=UCM_REMOVE_MAPPING;
716 needsMove=true;
717 continue;
718 }
719
720 type=ucm_mappingType(
721 &ucm->states, m,
722 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
723 if(type<0) {
724 /* illegal byte sequence */
725 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
726 isOK=false;
727 } else if(type>0) {
728 m->moveFlag|=UCM_MOVE_TO_EXT;
729 needsMove=true;
730 }
731 }
732
733 if(!isOK) {
734 return false;
735 }
736 if(needsMove) {
737 ucm_moveMappings(ucm->base, ucm->ext);
738 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false);
739 } else {
740 ucm_sortTable(ucm->base);
741 return true;
742 }
743 }
744
745 /* ucm parser --------------------------------------------------------------- */
746
747 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
749 const char *s=*ps;
750 char *end;
751 uint8_t byte;
752 int8_t bLen;
753
754 bLen=0;
755 for(;;) {
756 /* skip an optional plus sign */
757 if(bLen>0 && *s=='+') {
758 ++s;
759 }
760 if(*s!='\\') {
761 break;
762 }
763
764 if( s[1]!='x' ||
765 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
766 ) {
767 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
768 return -1;
769 }
770
771 if(bLen==UCNV_EXT_MAX_BYTES) {
772 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
773 return -1;
774 }
775 bytes[bLen++]=byte;
776 s=end;
777 }
778
779 *ps=s;
780 return bLen;
781 }
782
783 /* parse a mapping line; must not be empty */
784 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)785 ucm_parseMappingLine(UCMapping *m,
786 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
787 uint8_t bytes[UCNV_EXT_MAX_BYTES],
788 const char *line) {
789 const char *s;
790 char *end;
791 UChar32 cp;
792 int32_t u16Length;
793 int8_t uLen, bLen, f;
794
795 s=line;
796 uLen=bLen=0;
797
798 /* parse code points */
799 for(;;) {
800 /* skip an optional plus sign */
801 if(uLen>0 && *s=='+') {
802 ++s;
803 }
804 if(*s!='<') {
805 break;
806 }
807
808 if( s[1]!='U' ||
809 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
810 *end!='>'
811 ) {
812 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
813 return false;
814 }
815 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
816 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
817 return false;
818 }
819
820 if(uLen==UCNV_EXT_MAX_UCHARS) {
821 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
822 return false;
823 }
824 codePoints[uLen++]=cp;
825 s=end+1;
826 }
827
828 if(uLen==0) {
829 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
830 return false;
831 } else if(uLen==1) {
832 m->u=codePoints[0];
833 } else {
834 UErrorCode errorCode=U_ZERO_ERROR;
835 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
836 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
837 u16Length>UCNV_EXT_MAX_UCHARS
838 ) {
839 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
840 return false;
841 }
842 }
843
844 s=u_skipWhitespace(s);
845
846 /* parse bytes */
847 bLen=ucm_parseBytes(bytes, line, &s);
848
849 if(bLen<0) {
850 return false;
851 } else if(bLen==0) {
852 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
853 return false;
854 } else if(bLen<=4) {
855 uprv_memcpy(m->b.bytes, bytes, bLen);
856 }
857
858 /* skip everything until the fallback indicator, even the start of a comment */
859 for(;;) {
860 if(*s==0) {
861 f=-1; /* no fallback indicator */
862 break;
863 } else if(*s=='|') {
864 f=(int8_t)(s[1]-'0');
865 if((uint8_t)f>4) {
866 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
867 return false;
868 }
869 break;
870 }
871 ++s;
872 }
873
874 m->uLen=uLen;
875 m->bLen=bLen;
876 m->f=f;
877 return true;
878 }
879
880 /* general APIs ------------------------------------------------------------- */
881
882 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()883 ucm_openTable() {
884 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
885 if(table==NULL) {
886 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
887 exit(U_MEMORY_ALLOCATION_ERROR);
888 }
889
890 memset(table, 0, sizeof(UCMTable));
891 return table;
892 }
893
894 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)895 ucm_closeTable(UCMTable *table) {
896 if(table!=NULL) {
897 uprv_free(table->mappings);
898 uprv_free(table->codePoints);
899 uprv_free(table->bytes);
900 uprv_free(table->reverseMap);
901 uprv_free(table);
902 }
903 }
904
905 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)906 ucm_resetTable(UCMTable *table) {
907 if(table!=NULL) {
908 table->mappingsLength=0;
909 table->flagsType=0;
910 table->unicodeMask=0;
911 table->bytesLength=table->codePointsLength=0;
912 table->isSorted=false;
913 }
914 }
915
916 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])917 ucm_addMapping(UCMTable *table,
918 UCMapping *m,
919 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
920 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
921 UCMapping *tm;
922 UChar32 c;
923 int32_t idx;
924
925 if(table->mappingsLength>=table->mappingsCapacity) {
926 /* make the mappings array larger */
927 if(table->mappingsCapacity==0) {
928 table->mappingsCapacity=1000;
929 } else {
930 table->mappingsCapacity*=10;
931 }
932 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
933 table->mappingsCapacity*sizeof(UCMapping));
934 if(table->mappings==NULL) {
935 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
936 (int)table->mappingsCapacity);
937 exit(U_MEMORY_ALLOCATION_ERROR);
938 }
939
940 if(table->reverseMap!=NULL) {
941 /* the reverseMap must be reallocated in a new sort */
942 uprv_free(table->reverseMap);
943 table->reverseMap=NULL;
944 }
945 }
946
947 if(m->uLen>1 && table->codePointsCapacity==0) {
948 table->codePointsCapacity=10000;
949 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
950 if(table->codePoints==NULL) {
951 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
952 (int)table->codePointsCapacity);
953 exit(U_MEMORY_ALLOCATION_ERROR);
954 }
955 }
956
957 if(m->bLen>4 && table->bytesCapacity==0) {
958 table->bytesCapacity=10000;
959 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
960 if(table->bytes==NULL) {
961 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
962 (int)table->bytesCapacity);
963 exit(U_MEMORY_ALLOCATION_ERROR);
964 }
965 }
966
967 if(m->uLen>1) {
968 idx=table->codePointsLength;
969 table->codePointsLength+=m->uLen;
970 if(table->codePointsLength>table->codePointsCapacity) {
971 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
972 exit(U_MEMORY_ALLOCATION_ERROR);
973 }
974
975 uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
976 m->u=idx;
977 }
978
979 if(m->bLen>4) {
980 idx=table->bytesLength;
981 table->bytesLength+=m->bLen;
982 if(table->bytesLength>table->bytesCapacity) {
983 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
984 exit(U_MEMORY_ALLOCATION_ERROR);
985 }
986
987 uprv_memcpy(table->bytes+idx, bytes, m->bLen);
988 m->b.idx=idx;
989 }
990
991 /* set unicodeMask */
992 for(idx=0; idx<m->uLen; ++idx) {
993 c=codePoints[idx];
994 if(c>=0x10000) {
995 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
996 } else if(U_IS_SURROGATE(c)) {
997 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
998 }
999 }
1000
1001 /* set flagsType */
1002 if(m->f<0) {
1003 table->flagsType|=UCM_FLAGS_IMPLICIT;
1004 } else {
1005 table->flagsType|=UCM_FLAGS_EXPLICIT;
1006 }
1007
1008 tm=table->mappings+table->mappingsLength++;
1009 uprv_memcpy(tm, m, sizeof(UCMapping));
1010
1011 table->isSorted=false;
1012 }
1013
1014 U_CAPI UCMFile * U_EXPORT2
ucm_open()1015 ucm_open() {
1016 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1017 if(ucm==NULL) {
1018 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1019 exit(U_MEMORY_ALLOCATION_ERROR);
1020 }
1021
1022 memset(ucm, 0, sizeof(UCMFile));
1023
1024 ucm->base=ucm_openTable();
1025 ucm->ext=ucm_openTable();
1026
1027 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1028 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1029 ucm->states.outputType=-1;
1030 ucm->states.minCharLength=ucm->states.maxCharLength=1;
1031
1032 return ucm;
1033 }
1034
1035 U_CAPI void U_EXPORT2
ucm_close(UCMFile * ucm)1036 ucm_close(UCMFile *ucm) {
1037 if(ucm!=NULL) {
1038 ucm_closeTable(ucm->base);
1039 ucm_closeTable(ucm->ext);
1040 uprv_free(ucm);
1041 }
1042 }
1043
1044 U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1045 ucm_mappingType(UCMStates *baseStates,
1046 UCMapping *m,
1047 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1048 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1049 (void)codePoints;
1050 /* check validity of the bytes and count the characters in them */
1051 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1052 if(count<1) {
1053 /* illegal byte sequence */
1054 return -1;
1055 }
1056
1057 /*
1058 * Suitable for an ICU conversion base table means:
1059 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1060 * - precision flag 0..3
1061 * - SBCS: any 1:1 mapping
1062 * (the table stores additional bits to distinguish mapping types)
1063 * - MBCS: not a |2 SUB mapping for <subchar1>
1064 * - MBCS: not a |1 fallback to 0x00
1065 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1066 *
1067 * Further restrictions for fromUnicode tables
1068 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1069 *
1070 * All of the MBCS fromUnicode specific tests could be removed from here,
1071 * but the ones above are for unusual mappings, and removing the tests
1072 * from here would change canonucm output which seems gratuitous.
1073 * (Markus Scherer 2006-nov-28)
1074 *
1075 * Exception: All implicit mappings (f<0) that need to be moved
1076 * because of fromUnicode restrictions _must_ be moved here because
1077 * makeconv uses a hack for moving mappings only for the fromUnicode table
1078 * that only works with non-negative values of f.
1079 */
1080 if( m->uLen==1 && count==1 && m->f<=3 &&
1081 (baseStates->maxCharLength==1 ||
1082 !((m->f==2 && m->bLen==1) ||
1083 (m->f==1 && bytes[0]==0) ||
1084 (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1085 ) {
1086 return 0; /* suitable for a base table */
1087 } else {
1088 return 1; /* needs to go into an extension table */
1089 }
1090 }
1091
1092 U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile * ucm,UBool forBase,UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1093 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1094 UCMapping *m,
1095 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1096 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1097 int32_t type;
1098
1099 if(m->f==2 && m->uLen>1) {
1100 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1101 printMapping(m, codePoints, bytes, stderr);
1102 return false;
1103 }
1104
1105 if(baseStates!=NULL) {
1106 /* check validity of the bytes and count the characters in them */
1107 type=ucm_mappingType(baseStates, m, codePoints, bytes);
1108 if(type<0) {
1109 /* illegal byte sequence */
1110 printMapping(m, codePoints, bytes, stderr);
1111 return false;
1112 }
1113 } else {
1114 /* not used - adding a mapping for an extension-only table before its base table is read */
1115 type=1;
1116 }
1117
1118 /*
1119 * Add the mapping to the base table if this is requested and suitable.
1120 * Otherwise, add it to the extension table.
1121 */
1122 if(forBase && type==0) {
1123 ucm_addMapping(ucm->base, m, codePoints, bytes);
1124 } else {
1125 ucm_addMapping(ucm->ext, m, codePoints, bytes);
1126 }
1127
1128 return true;
1129 }
1130
1131 U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile * ucm,const char * line,UBool forBase,UCMStates * baseStates)1132 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1133 UCMapping m={ 0, {0}, 0, 0, 0, 0 };
1134 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1135 uint8_t bytes[UCNV_EXT_MAX_BYTES];
1136
1137 const char *s;
1138
1139 /* ignore empty and comment lines */
1140 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1141 return true;
1142 }
1143
1144 return
1145 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1146 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1147 }
1148
1149 U_CAPI void U_EXPORT2
ucm_readTable(UCMFile * ucm,FileStream * convFile,UBool forBase,UCMStates * baseStates,UErrorCode * pErrorCode)1150 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1151 UBool forBase, UCMStates *baseStates,
1152 UErrorCode *pErrorCode) {
1153 char line[500];
1154 char *end;
1155 UBool isOK;
1156
1157 if(U_FAILURE(*pErrorCode)) {
1158 return;
1159 }
1160
1161 isOK=true;
1162
1163 for(;;) {
1164 /* read the next line */
1165 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1166 fprintf(stderr, "incomplete charmap section\n");
1167 isOK=false;
1168 break;
1169 }
1170
1171 /* remove CR LF */
1172 end=uprv_strchr(line, 0);
1173 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1174 --end;
1175 }
1176 *end=0;
1177
1178 /* ignore empty and comment lines */
1179 if(line[0]==0 || line[0]=='#') {
1180 continue;
1181 }
1182
1183 /* stop at the end of the mapping table */
1184 if(0==uprv_strcmp(line, "END CHARMAP")) {
1185 break;
1186 }
1187
1188 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1189 }
1190
1191 if(!isOK) {
1192 *pErrorCode=U_INVALID_TABLE_FORMAT;
1193 }
1194 }
1195 #endif
1196