1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucm.c
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003jun20
16 * created by: Markus W. Scherer
17 *
18 * This file reads a .ucm file, stores its mappings and sorts them.
19 * It implements handling of Unicode conversion mappings from .ucm files
20 * for makeconv, canonucm, rptp2ucm, etc.
21 *
22 * Unicode code point sequences with a length of more than 1,
23 * as well as byte sequences with more than 4 bytes or more than one complete
24 * character sequence are handled to support m:n mappings.
25 */
26
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "filestrm.h"
32 #include "uarrsort.h"
33 #include "ucnvmbcs.h"
34 #include "ucnv_bld.h"
35 #include "ucnv_ext.h"
36 #include "uparse.h"
37 #include "ucm.h"
38 #include <stdio.h>
39
40 #if !UCONFIG_NO_CONVERSION
41
42 /* -------------------------------------------------------------------------- */
43
44 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
46 int32_t j;
47
48 for(j=0; j<m->uLen; ++j) {
49 fprintf(f, "<U%04lX>", (long)codePoints[j]);
50 }
51
52 fputc(' ', f);
53
54 for(j=0; j<m->bLen; ++j) {
55 fprintf(f, "\\x%02X", bytes[j]);
56 }
57
58 if(m->f>=0) {
59 fprintf(f, " |%u\n", m->f);
60 } else {
61 fputs("\n", f);
62 }
63 }
64
65 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
67 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
68 }
69
70 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
72 UCMapping *m;
73 int32_t i, length;
74
75 m=table->mappings;
76 length=table->mappingsLength;
77 if(byUnicode) {
78 for(i=0; i<length; ++m, ++i) {
79 ucm_printMapping(table, m, f);
80 }
81 } else {
82 const int32_t *map=table->reverseMap;
83 for(i=0; i<length; ++i) {
84 ucm_printMapping(table, m+map[i], f);
85 }
86 }
87 }
88
89 /* mapping comparisons ------------------------------------------------------ */
90
91 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)92 compareUnicode(UCMTable *lTable, const UCMapping *l,
93 UCMTable *rTable, const UCMapping *r) {
94 const UChar32 *lu, *ru;
95 int32_t result, i, length;
96
97 if(l->uLen==1 && r->uLen==1) {
98 /* compare two single code points */
99 return l->u-r->u;
100 }
101
102 /* get pointers to the code point sequences */
103 lu=UCM_GET_CODE_POINTS(lTable, l);
104 ru=UCM_GET_CODE_POINTS(rTable, r);
105
106 /* get the minimum length */
107 if(l->uLen<=r->uLen) {
108 length=l->uLen;
109 } else {
110 length=r->uLen;
111 }
112
113 /* compare the code points */
114 for(i=0; i<length; ++i) {
115 result=lu[i]-ru[i];
116 if(result!=0) {
117 return result;
118 }
119 }
120
121 /* compare the lengths */
122 return l->uLen-r->uLen;
123 }
124
125 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)126 compareBytes(UCMTable *lTable, const UCMapping *l,
127 UCMTable *rTable, const UCMapping *r,
128 UBool lexical) {
129 const uint8_t *lb, *rb;
130 int32_t result, i, length;
131
132 /*
133 * A lexical comparison is used for sorting in the builder, to allow
134 * an efficient search for a byte sequence that could be a prefix
135 * of a previously entered byte sequence.
136 *
137 * Comparing by lengths first is for compatibility with old .ucm tools
138 * like canonucm and rptp2ucm.
139 */
140 if(lexical) {
141 /* get the minimum length and continue */
142 if(l->bLen<=r->bLen) {
143 length=l->bLen;
144 } else {
145 length=r->bLen;
146 }
147 } else {
148 /* compare lengths first */
149 result=l->bLen-r->bLen;
150 if(result!=0) {
151 return result;
152 } else {
153 length=l->bLen;
154 }
155 }
156
157 /* get pointers to the byte sequences */
158 lb=UCM_GET_BYTES(lTable, l);
159 rb=UCM_GET_BYTES(rTable, r);
160
161 /* compare the bytes */
162 for(i=0; i<length; ++i) {
163 result=lb[i]-rb[i];
164 if(result!=0) {
165 return result;
166 }
167 }
168
169 /* compare the lengths */
170 return l->bLen-r->bLen;
171 }
172
173 /* compare UCMappings for sorting */
174 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)175 compareMappings(UCMTable *lTable, const UCMapping *l,
176 UCMTable *rTable, const UCMapping *r,
177 UBool uFirst) {
178 int32_t result;
179
180 /* choose which side to compare first */
181 if(uFirst) {
182 /* Unicode then bytes */
183 result=compareUnicode(lTable, l, rTable, r);
184 if(result==0) {
185 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
186 }
187 } else {
188 /* bytes then Unicode */
189 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
190 if(result==0) {
191 result=compareUnicode(lTable, l, rTable, r);
192 }
193 }
194
195 if(result!=0) {
196 return result;
197 }
198
199 /* compare the flags */
200 return l->f-r->f;
201 }
202
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206 return compareMappings(
207 (UCMTable *)context, (const UCMapping *)left,
208 (UCMTable *)context, (const UCMapping *)right, TRUE);
209 }
210
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t
compareMappingsBytesFirst(const void * context,const void * left,const void * right)213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
214 UCMTable *table=(UCMTable *)context;
215 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216 return compareMappings(
217 table, table->mappings+l,
218 table, table->mappings+r, FALSE);
219 }
220
221 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)222 ucm_sortTable(UCMTable *t) {
223 UErrorCode errorCode;
224 int32_t i;
225
226 if(t->isSorted) {
227 return;
228 }
229
230 errorCode=U_ZERO_ERROR;
231
232 /* 1. sort by Unicode first */
233 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
234 compareMappingsUnicodeFirst, t,
235 FALSE, &errorCode);
236
237 /* build the reverseMap */
238 if(t->reverseMap==NULL) {
239 /*
240 * allocate mappingsCapacity instead of mappingsLength so that
241 * if mappings are added, the reverseMap need not be
242 * reallocated each time
243 * (see ucm_moveMappings() and ucm_addMapping())
244 */
245 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
246 if(t->reverseMap==NULL) {
247 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
248 exit(U_MEMORY_ALLOCATION_ERROR);
249 }
250 }
251 for(i=0; i<t->mappingsLength; ++i) {
252 t->reverseMap[i]=i;
253 }
254
255 /* 2. sort reverseMap by mappings bytes first */
256 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
257 compareMappingsBytesFirst, t,
258 FALSE, &errorCode);
259
260 if(U_FAILURE(errorCode)) {
261 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
262 u_errorName(errorCode));
263 exit(errorCode);
264 }
265
266 t->isSorted=TRUE;
267 }
268
269 /*
270 * remove mappings with their move flag set from the base table
271 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
272 */
273 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)274 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
275 UCMapping *mb, *mbLimit;
276 int8_t flag;
277
278 mb=base->mappings;
279 mbLimit=mb+base->mappingsLength;
280
281 while(mb<mbLimit) {
282 flag=mb->moveFlag;
283 if(flag!=0) {
284 /* reset the move flag */
285 mb->moveFlag=0;
286
287 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
288 /* add the mapping to the extension table */
289 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
290 }
291
292 /* remove this mapping: move the last base mapping down and overwrite the current one */
293 if(mb<(mbLimit-1)) {
294 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
295 }
296 --mbLimit;
297 --base->mappingsLength;
298 base->isSorted=FALSE;
299 } else {
300 ++mb;
301 }
302 }
303 }
304
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    /* some mapping got a move/remove flag; ucm_moveMappings() has to run */
    NEEDS_MOVE=0x01,
    /* a conflict between base and extension mappings was reported */
    HAS_ERRORS=0x02
};
309
310 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)311 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
312 UBool moveToExt, UBool intersectBase) {
313 UCMapping *mb, *me, *mbLimit, *meLimit;
314 int32_t cmp;
315 uint8_t result;
316
317 mb=base->mappings;
318 mbLimit=mb+base->mappingsLength;
319
320 me=ext->mappings;
321 meLimit=me+ext->mappingsLength;
322
323 result=0;
324
325 for(;;) {
326 /* skip irrelevant mappings on both sides */
327 for(;;) {
328 if(mb==mbLimit) {
329 return result;
330 }
331
332 if((0<=mb->f && mb->f<=2) || mb->f==4) {
333 break;
334 }
335
336 ++mb;
337 }
338
339 for(;;) {
340 if(me==meLimit) {
341 return result;
342 }
343
344 if((0<=me->f && me->f<=2) || me->f==4) {
345 break;
346 }
347
348 ++me;
349 }
350
351 /* compare the base and extension mappings */
352 cmp=compareUnicode(base, mb, ext, me);
353 if(cmp<0) {
354 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
355 /*
356 * mapping in base but not in ext, move it
357 *
358 * if ext is DBCS, move DBCS mappings here
359 * and check SBCS ones for Unicode prefix below
360 */
361 mb->moveFlag|=UCM_MOVE_TO_EXT;
362 result|=NEEDS_MOVE;
363
364 /* does mb map from an input sequence that is a prefix of me's? */
365 } else if( mb->uLen<me->uLen &&
366 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
367 ) {
368 if(moveToExt) {
369 /* mark this mapping to be moved to the extension table */
370 mb->moveFlag|=UCM_MOVE_TO_EXT;
371 result|=NEEDS_MOVE;
372 } else {
373 fprintf(stderr,
374 "ucm error: the base table contains a mapping whose input sequence\n"
375 " is a prefix of the input sequence of an extension mapping\n");
376 ucm_printMapping(base, mb, stderr);
377 ucm_printMapping(ext, me, stderr);
378 result|=HAS_ERRORS;
379 }
380 }
381
382 ++mb;
383 } else if(cmp==0) {
384 /*
385 * same output: remove the extension mapping,
386 * otherwise treat as an error
387 */
388 if( mb->f==me->f && mb->bLen==me->bLen &&
389 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
390 ) {
391 me->moveFlag|=UCM_REMOVE_MAPPING;
392 result|=NEEDS_MOVE;
393 } else if(intersectBase) {
394 /* mapping in base but not in ext, move it */
395 mb->moveFlag|=UCM_MOVE_TO_EXT;
396 result|=NEEDS_MOVE;
397 } else {
398 fprintf(stderr,
399 "ucm error: the base table contains a mapping whose input sequence\n"
400 " is the same as the input sequence of an extension mapping\n"
401 " but it maps differently\n");
402 ucm_printMapping(base, mb, stderr);
403 ucm_printMapping(ext, me, stderr);
404 result|=HAS_ERRORS;
405 }
406
407 ++mb;
408 } else /* cmp>0 */ {
409 ++me;
410 }
411 }
412 }
413
414 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)415 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
416 UBool moveToExt, UBool intersectBase) {
417 UCMapping *mb, *me;
418 int32_t *baseMap, *extMap;
419 int32_t b, e, bLimit, eLimit, cmp;
420 uint8_t result;
421 UBool isSISO;
422
423 baseMap=base->reverseMap;
424 extMap=ext->reverseMap;
425
426 b=e=0;
427 bLimit=base->mappingsLength;
428 eLimit=ext->mappingsLength;
429
430 result=0;
431
432 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
433
434 for(;;) {
435 /* skip irrelevant mappings on both sides */
436 for(;; ++b) {
437 if(b==bLimit) {
438 return result;
439 }
440 mb=base->mappings+baseMap[b];
441
442 if(intersectBase==2 && mb->bLen==1) {
443 /*
444 * comparing a base against a DBCS extension:
445 * leave SBCS base mappings alone
446 */
447 continue;
448 }
449
450 if(mb->f==0 || mb->f==3) {
451 break;
452 }
453 }
454
455 for(;;) {
456 if(e==eLimit) {
457 return result;
458 }
459 me=ext->mappings+extMap[e];
460
461 if(me->f==0 || me->f==3) {
462 break;
463 }
464
465 ++e;
466 }
467
468 /* compare the base and extension mappings */
469 cmp=compareBytes(base, mb, ext, me, TRUE);
470 if(cmp<0) {
471 if(intersectBase) {
472 /* mapping in base but not in ext, move it */
473 mb->moveFlag|=UCM_MOVE_TO_EXT;
474 result|=NEEDS_MOVE;
475
476 /*
477 * does mb map from an input sequence that is a prefix of me's?
478 * for SI/SO tables, a single byte is never a prefix because it
479 * occurs in a separate single-byte state
480 */
481 } else if( mb->bLen<me->bLen &&
482 (!isSISO || mb->bLen>1) &&
483 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
484 ) {
485 if(moveToExt) {
486 /* mark this mapping to be moved to the extension table */
487 mb->moveFlag|=UCM_MOVE_TO_EXT;
488 result|=NEEDS_MOVE;
489 } else {
490 fprintf(stderr,
491 "ucm error: the base table contains a mapping whose input sequence\n"
492 " is a prefix of the input sequence of an extension mapping\n");
493 ucm_printMapping(base, mb, stderr);
494 ucm_printMapping(ext, me, stderr);
495 result|=HAS_ERRORS;
496 }
497 }
498
499 ++b;
500 } else if(cmp==0) {
501 /*
502 * same output: remove the extension mapping,
503 * otherwise treat as an error
504 */
505 if( mb->f==me->f && mb->uLen==me->uLen &&
506 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
507 ) {
508 me->moveFlag|=UCM_REMOVE_MAPPING;
509 result|=NEEDS_MOVE;
510 } else if(intersectBase) {
511 /* mapping in base but not in ext, move it */
512 mb->moveFlag|=UCM_MOVE_TO_EXT;
513 result|=NEEDS_MOVE;
514 } else {
515 fprintf(stderr,
516 "ucm error: the base table contains a mapping whose input sequence\n"
517 " is the same as the input sequence of an extension mapping\n"
518 " but it maps differently\n");
519 ucm_printMapping(base, mb, stderr);
520 ucm_printMapping(ext, me, stderr);
521 result|=HAS_ERRORS;
522 }
523
524 ++b;
525 } else /* cmp>0 */ {
526 ++e;
527 }
528 }
529 }
530
531 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)532 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
533 UCMapping *m, *mLimit;
534 int32_t count;
535 UBool isOK;
536
537 m=table->mappings;
538 mLimit=m+table->mappingsLength;
539 isOK=TRUE;
540
541 while(m<mLimit) {
542 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
543 if(count<1) {
544 ucm_printMapping(table, m, stderr);
545 isOK=FALSE;
546 }
547 ++m;
548 }
549
550 return isOK;
551 }
552
553 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)554 ucm_checkBaseExt(UCMStates *baseStates,
555 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
556 UBool intersectBase) {
557 uint8_t result;
558
559 /* if we have an extension table, we must always use precision flags */
560 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
561 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
562 return FALSE;
563 }
564 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
565 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
566 return FALSE;
567 }
568
569 /* checking requires both tables to be sorted */
570 ucm_sortTable(base);
571 ucm_sortTable(ext);
572
573 /* check */
574 result=
575 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
576 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
577
578 if(result&HAS_ERRORS) {
579 return FALSE;
580 }
581
582 if(result&NEEDS_MOVE) {
583 ucm_moveMappings(ext, NULL);
584 ucm_moveMappings(base, moveTarget);
585 ucm_sortTable(base);
586 ucm_sortTable(ext);
587 if(moveTarget!=NULL) {
588 ucm_sortTable(moveTarget);
589 }
590 }
591
592 return TRUE;
593 }
594
595 /* merge tables for rptp2ucm ------------------------------------------------ */
596
597 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)598 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
599 const uint8_t *subchar, int32_t subcharLength,
600 uint8_t subchar1) {
601 UCMapping *fromUMapping, *toUMapping;
602 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
603
604 ucm_sortTable(fromUTable);
605 ucm_sortTable(toUTable);
606
607 fromUMapping=fromUTable->mappings;
608 toUMapping=toUTable->mappings;
609
610 fromUTop=fromUTable->mappingsLength;
611 toUTop=toUTable->mappingsLength;
612
613 fromUIndex=toUIndex=0;
614
615 while(fromUIndex<fromUTop && toUIndex<toUTop) {
616 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
617 if(cmp==0) {
618 /* equal: roundtrip, nothing to do (flags are initially 0) */
619 ++fromUMapping;
620 ++toUMapping;
621
622 ++fromUIndex;
623 ++toUIndex;
624 } else if(cmp<0) {
625 /*
626 * the fromU mapping does not have a toU counterpart:
627 * fallback Unicode->codepage
628 */
629 if( (fromUMapping->bLen==subcharLength &&
630 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
631 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
632 ) {
633 fromUMapping->f=2; /* SUB mapping */
634 } else {
635 fromUMapping->f=1; /* normal fallback */
636 }
637
638 ++fromUMapping;
639 ++fromUIndex;
640 } else {
641 /*
642 * the toU mapping does not have a fromU counterpart:
643 * (reverse) fallback codepage->Unicode, copy it to the fromU table
644 */
645
646 /* ignore reverse fallbacks to Unicode SUB */
647 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
648 toUMapping->f=3; /* reverse fallback */
649 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
650
651 /* the table may have been reallocated */
652 fromUMapping=fromUTable->mappings+fromUIndex;
653 }
654
655 ++toUMapping;
656 ++toUIndex;
657 }
658 }
659
660 /* either one or both tables are exhausted */
661 while(fromUIndex<fromUTop) {
662 /* leftover fromU mappings are fallbacks */
663 if( (fromUMapping->bLen==subcharLength &&
664 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
665 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
666 ) {
667 fromUMapping->f=2; /* SUB mapping */
668 } else {
669 fromUMapping->f=1; /* normal fallback */
670 }
671
672 ++fromUMapping;
673 ++fromUIndex;
674 }
675
676 while(toUIndex<toUTop) {
677 /* leftover toU mappings are reverse fallbacks */
678
679 /* ignore reverse fallbacks to Unicode SUB */
680 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
681 toUMapping->f=3; /* reverse fallback */
682 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
683 }
684
685 ++toUMapping;
686 ++toUIndex;
687 }
688
689 fromUTable->isSorted=FALSE;
690 }
691
692 /* separate extension mappings out of base table for rptp2ucm --------------- */
693
694 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)695 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
696 UCMTable *table;
697 UCMapping *m, *mLimit;
698 int32_t type;
699 UBool needsMove, isOK;
700
701 table=ucm->base;
702 m=table->mappings;
703 mLimit=m+table->mappingsLength;
704
705 needsMove=FALSE;
706 isOK=TRUE;
707
708 for(; m<mLimit; ++m) {
709 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
710 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
711 ucm_printMapping(table, m, stderr);
712 m->moveFlag|=UCM_REMOVE_MAPPING;
713 needsMove=TRUE;
714 continue;
715 }
716
717 type=ucm_mappingType(
718 &ucm->states, m,
719 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
720 if(type<0) {
721 /* illegal byte sequence */
722 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
723 isOK=FALSE;
724 } else if(type>0) {
725 m->moveFlag|=UCM_MOVE_TO_EXT;
726 needsMove=TRUE;
727 }
728 }
729
730 if(!isOK) {
731 return FALSE;
732 }
733 if(needsMove) {
734 ucm_moveMappings(ucm->base, ucm->ext);
735 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
736 } else {
737 ucm_sortTable(ucm->base);
738 return TRUE;
739 }
740 }
741
742 /* ucm parser --------------------------------------------------------------- */
743
744 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)745 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
746 const char *s=*ps;
747 char *end;
748 uint8_t byte;
749 int8_t bLen;
750
751 bLen=0;
752 for(;;) {
753 /* skip an optional plus sign */
754 if(bLen>0 && *s=='+') {
755 ++s;
756 }
757 if(*s!='\\') {
758 break;
759 }
760
761 if( s[1]!='x' ||
762 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
763 ) {
764 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
765 return -1;
766 }
767
768 if(bLen==UCNV_EXT_MAX_BYTES) {
769 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
770 return -1;
771 }
772 bytes[bLen++]=byte;
773 s=end;
774 }
775
776 *ps=s;
777 return bLen;
778 }
779
780 /* parse a mapping line; must not be empty */
781 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)782 ucm_parseMappingLine(UCMapping *m,
783 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
784 uint8_t bytes[UCNV_EXT_MAX_BYTES],
785 const char *line) {
786 const char *s;
787 char *end;
788 UChar32 cp;
789 int32_t u16Length;
790 int8_t uLen, bLen, f;
791
792 s=line;
793 uLen=bLen=0;
794
795 /* parse code points */
796 for(;;) {
797 /* skip an optional plus sign */
798 if(uLen>0 && *s=='+') {
799 ++s;
800 }
801 if(*s!='<') {
802 break;
803 }
804
805 if( s[1]!='U' ||
806 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
807 *end!='>'
808 ) {
809 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
810 return FALSE;
811 }
812 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
813 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
814 return FALSE;
815 }
816
817 if(uLen==UCNV_EXT_MAX_UCHARS) {
818 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
819 return FALSE;
820 }
821 codePoints[uLen++]=cp;
822 s=end+1;
823 }
824
825 if(uLen==0) {
826 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
827 return FALSE;
828 } else if(uLen==1) {
829 m->u=codePoints[0];
830 } else {
831 UErrorCode errorCode=U_ZERO_ERROR;
832 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
833 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
834 u16Length>UCNV_EXT_MAX_UCHARS
835 ) {
836 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
837 return FALSE;
838 }
839 }
840
841 s=u_skipWhitespace(s);
842
843 /* parse bytes */
844 bLen=ucm_parseBytes(bytes, line, &s);
845
846 if(bLen<0) {
847 return FALSE;
848 } else if(bLen==0) {
849 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
850 return FALSE;
851 } else if(bLen<=4) {
852 uprv_memcpy(m->b.bytes, bytes, bLen);
853 }
854
855 /* skip everything until the fallback indicator, even the start of a comment */
856 for(;;) {
857 if(*s==0) {
858 f=-1; /* no fallback indicator */
859 break;
860 } else if(*s=='|') {
861 f=(int8_t)(s[1]-'0');
862 if((uint8_t)f>4) {
863 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
864 return FALSE;
865 }
866 break;
867 }
868 ++s;
869 }
870
871 m->uLen=uLen;
872 m->bLen=bLen;
873 m->f=f;
874 return TRUE;
875 }
876
877 /* general APIs ------------------------------------------------------------- */
878
879 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()880 ucm_openTable() {
881 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
882 if(table==NULL) {
883 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
884 exit(U_MEMORY_ALLOCATION_ERROR);
885 }
886
887 memset(table, 0, sizeof(UCMTable));
888 return table;
889 }
890
891 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)892 ucm_closeTable(UCMTable *table) {
893 if(table!=NULL) {
894 uprv_free(table->mappings);
895 uprv_free(table->codePoints);
896 uprv_free(table->bytes);
897 uprv_free(table->reverseMap);
898 uprv_free(table);
899 }
900 }
901
902 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)903 ucm_resetTable(UCMTable *table) {
904 if(table!=NULL) {
905 table->mappingsLength=0;
906 table->flagsType=0;
907 table->unicodeMask=0;
908 table->bytesLength=table->codePointsLength=0;
909 table->isSorted=FALSE;
910 }
911 }
912
913 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])914 ucm_addMapping(UCMTable *table,
915 UCMapping *m,
916 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
917 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
918 UCMapping *tm;
919 UChar32 c;
920 int32_t idx;
921
922 if(table->mappingsLength>=table->mappingsCapacity) {
923 /* make the mappings array larger */
924 if(table->mappingsCapacity==0) {
925 table->mappingsCapacity=1000;
926 } else {
927 table->mappingsCapacity*=10;
928 }
929 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
930 table->mappingsCapacity*sizeof(UCMapping));
931 if(table->mappings==NULL) {
932 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
933 (int)table->mappingsCapacity);
934 exit(U_MEMORY_ALLOCATION_ERROR);
935 }
936
937 if(table->reverseMap!=NULL) {
938 /* the reverseMap must be reallocated in a new sort */
939 uprv_free(table->reverseMap);
940 table->reverseMap=NULL;
941 }
942 }
943
944 if(m->uLen>1 && table->codePointsCapacity==0) {
945 table->codePointsCapacity=10000;
946 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
947 if(table->codePoints==NULL) {
948 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
949 (int)table->codePointsCapacity);
950 exit(U_MEMORY_ALLOCATION_ERROR);
951 }
952 }
953
954 if(m->bLen>4 && table->bytesCapacity==0) {
955 table->bytesCapacity=10000;
956 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
957 if(table->bytes==NULL) {
958 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
959 (int)table->bytesCapacity);
960 exit(U_MEMORY_ALLOCATION_ERROR);
961 }
962 }
963
964 if(m->uLen>1) {
965 idx=table->codePointsLength;
966 table->codePointsLength+=m->uLen;
967 if(table->codePointsLength>table->codePointsCapacity) {
968 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
969 exit(U_MEMORY_ALLOCATION_ERROR);
970 }
971
972 uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
973 m->u=idx;
974 }
975
976 if(m->bLen>4) {
977 idx=table->bytesLength;
978 table->bytesLength+=m->bLen;
979 if(table->bytesLength>table->bytesCapacity) {
980 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
981 exit(U_MEMORY_ALLOCATION_ERROR);
982 }
983
984 uprv_memcpy(table->bytes+idx, bytes, m->bLen);
985 m->b.idx=idx;
986 }
987
988 /* set unicodeMask */
989 for(idx=0; idx<m->uLen; ++idx) {
990 c=codePoints[idx];
991 if(c>=0x10000) {
992 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
993 } else if(U_IS_SURROGATE(c)) {
994 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
995 }
996 }
997
998 /* set flagsType */
999 if(m->f<0) {
1000 table->flagsType|=UCM_FLAGS_IMPLICIT;
1001 } else {
1002 table->flagsType|=UCM_FLAGS_EXPLICIT;
1003 }
1004
1005 tm=table->mappings+table->mappingsLength++;
1006 uprv_memcpy(tm, m, sizeof(UCMapping));
1007
1008 table->isSorted=FALSE;
1009 }
1010
1011 U_CAPI UCMFile * U_EXPORT2
ucm_open()1012 ucm_open() {
1013 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1014 if(ucm==NULL) {
1015 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1016 exit(U_MEMORY_ALLOCATION_ERROR);
1017 }
1018
1019 memset(ucm, 0, sizeof(UCMFile));
1020
1021 ucm->base=ucm_openTable();
1022 ucm->ext=ucm_openTable();
1023
1024 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1025 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1026 ucm->states.outputType=-1;
1027 ucm->states.minCharLength=ucm->states.maxCharLength=1;
1028
1029 return ucm;
1030 }
1031
1032 U_CAPI void U_EXPORT2
ucm_close(UCMFile * ucm)1033 ucm_close(UCMFile *ucm) {
1034 if(ucm!=NULL) {
1035 ucm_closeTable(ucm->base);
1036 ucm_closeTable(ucm->ext);
1037 uprv_free(ucm);
1038 }
1039 }
1040
1041 U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1042 ucm_mappingType(UCMStates *baseStates,
1043 UCMapping *m,
1044 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1045 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1046 /* check validity of the bytes and count the characters in them */
1047 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1048 if(count<1) {
1049 /* illegal byte sequence */
1050 return -1;
1051 }
1052
1053 /*
1054 * Suitable for an ICU conversion base table means:
1055 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1056 * - precision flag 0..3
1057 * - SBCS: any 1:1 mapping
1058 * (the table stores additional bits to distinguish mapping types)
1059 * - MBCS: not a |2 SUB mapping for <subchar1>
1060 * - MBCS: not a |1 fallback to 0x00
1061 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1062 *
1063 * Further restrictions for fromUnicode tables
1064 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1065 *
1066 * All of the MBCS fromUnicode specific tests could be removed from here,
1067 * but the ones above are for unusual mappings, and removing the tests
1068 * from here would change canonucm output which seems gratuitous.
1069 * (Markus Scherer 2006-nov-28)
1070 *
1071 * Exception: All implicit mappings (f<0) that need to be moved
1072 * because of fromUnicode restrictions _must_ be moved here because
1073 * makeconv uses a hack for moving mappings only for the fromUnicode table
1074 * that only works with non-negative values of f.
1075 */
1076 if( m->uLen==1 && count==1 && m->f<=3 &&
1077 (baseStates->maxCharLength==1 ||
1078 !((m->f==2 && m->bLen==1) ||
1079 (m->f==1 && bytes[0]==0) ||
1080 (m->f<=1 && m->bLen>1 && bytes[0]==0)))
1081 ) {
1082 return 0; /* suitable for a base table */
1083 } else {
1084 return 1; /* needs to go into an extension table */
1085 }
1086 }
1087
1088 U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile * ucm,UBool forBase,UCMStates * baseStates,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])1089 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1090 UCMapping *m,
1091 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1092 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1093 int32_t type;
1094
1095 if(m->f==2 && m->uLen>1) {
1096 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1097 printMapping(m, codePoints, bytes, stderr);
1098 return FALSE;
1099 }
1100
1101 if(baseStates!=NULL) {
1102 /* check validity of the bytes and count the characters in them */
1103 type=ucm_mappingType(baseStates, m, codePoints, bytes);
1104 if(type<0) {
1105 /* illegal byte sequence */
1106 printMapping(m, codePoints, bytes, stderr);
1107 return FALSE;
1108 }
1109 } else {
1110 /* not used - adding a mapping for an extension-only table before its base table is read */
1111 type=1;
1112 }
1113
1114 /*
1115 * Add the mapping to the base table if this is requested and suitable.
1116 * Otherwise, add it to the extension table.
1117 */
1118 if(forBase && type==0) {
1119 ucm_addMapping(ucm->base, m, codePoints, bytes);
1120 } else {
1121 ucm_addMapping(ucm->ext, m, codePoints, bytes);
1122 }
1123
1124 return TRUE;
1125 }
1126
1127 U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile * ucm,const char * line,UBool forBase,UCMStates * baseStates)1128 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1129 UCMapping m={ 0, {0}, 0, 0, 0, 0 };
1130 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1131 uint8_t bytes[UCNV_EXT_MAX_BYTES];
1132
1133 const char *s;
1134
1135 /* ignore empty and comment lines */
1136 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1137 return TRUE;
1138 }
1139
1140 return
1141 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1142 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1143 }
1144
1145 U_CAPI void U_EXPORT2
ucm_readTable(UCMFile * ucm,FileStream * convFile,UBool forBase,UCMStates * baseStates,UErrorCode * pErrorCode)1146 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1147 UBool forBase, UCMStates *baseStates,
1148 UErrorCode *pErrorCode) {
1149 char line[500];
1150 char *end;
1151 UBool isOK;
1152
1153 if(U_FAILURE(*pErrorCode)) {
1154 return;
1155 }
1156
1157 isOK=TRUE;
1158
1159 for(;;) {
1160 /* read the next line */
1161 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1162 fprintf(stderr, "incomplete charmap section\n");
1163 isOK=FALSE;
1164 break;
1165 }
1166
1167 /* remove CR LF */
1168 end=uprv_strchr(line, 0);
1169 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1170 --end;
1171 }
1172 *end=0;
1173
1174 /* ignore empty and comment lines */
1175 if(line[0]==0 || line[0]=='#') {
1176 continue;
1177 }
1178
1179 /* stop at the end of the mapping table */
1180 if(0==uprv_strcmp(line, "END CHARMAP")) {
1181 break;
1182 }
1183
1184 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1185 }
1186
1187 if(!isOK) {
1188 *pErrorCode=U_INVALID_TABLE_FORMAT;
1189 }
1190 }
1191 #endif
1192