1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucmstate.c
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003oct09
16 * created by: Markus W. Scherer
17 *
18 * This file handles ICU .ucm file state information as part of the ucm module.
19 * Most of this code used to be in makeconv.c.
20 */
21
22 #include "unicode/utypes.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "uarrsort.h"
26 #include "ucnvmbcs.h"
27 #include "ucnv_ext.h"
28 #include "uparse.h"
29 #include "ucm.h"
30 #include <stdio.h>
31
32 #if !UCONFIG_NO_CONVERSION
33
34 /* MBCS state handling ------------------------------------------------------ */
35
36 /*
37 * state table row grammar (ebnf-style):
38 * (whitespace is allowed between all tokens)
39 *
40 * row=[[firstentry ','] entry (',' entry)*]
41 * firstentry="initial" | "surrogates"
42 * (initial state (default for state 0), output is all surrogate pairs)
43 * entry=range [':' nextstate] ['.' action]
44 * range=number ['-' number]
45 * nextstate=number
46 * (0..7f)
47 * action='u' | 's' | 'p' | 'i'
48 * (unassigned, state change only, surrogate pair, illegal)
49 * number=(1- or 2-digit hexadecimal number)
50 */
51 static const char *
parseState(const char * s,int32_t state[256],uint32_t * pFlags)52 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
53 const char *t;
54 uint32_t start, end, i;
55 int32_t entry;
56
57 /* initialize the state: all illegal with U+ffff */
58 for(i=0; i<256; ++i) {
59 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
60 }
61
62 /* skip leading white space */
63 s=u_skipWhitespace(s);
64
65 /* is there an "initial" or "surrogates" directive? */
66 if(uprv_strncmp("initial", s, 7)==0) {
67 *pFlags=MBCS_STATE_FLAG_DIRECT;
68 s=u_skipWhitespace(s+7);
69 if(*s++!=',') {
70 return s-1;
71 }
72 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
73 *pFlags=MBCS_STATE_FLAG_SURROGATES;
74 s=u_skipWhitespace(s+10);
75 if(*s++!=',') {
76 return s-1;
77 }
78 } else if(*s==0) {
79 /* empty state row: all-illegal */
80 return NULL;
81 }
82
83 for(;;) {
84 /* read an entry, the start of the range first */
85 s=u_skipWhitespace(s);
86 start=uprv_strtoul(s, (char **)&t, 16);
87 if(s==t || 0xff<start) {
88 return s;
89 }
90 s=u_skipWhitespace(t);
91
92 /* read the end of the range if there is one */
93 if(*s=='-') {
94 s=u_skipWhitespace(s+1);
95 end=uprv_strtoul(s, (char **)&t, 16);
96 if(s==t || end<start || 0xff<end) {
97 return s;
98 }
99 s=u_skipWhitespace(t);
100 } else {
101 end=start;
102 }
103
104 /* determine the state entrys for this range */
105 if(*s!=':' && *s!='.') {
106 /* the default is: final state with valid entries */
107 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
108 } else {
109 entry=MBCS_ENTRY_TRANSITION(0, 0);
110 if(*s==':') {
111 /* get the next state, default to 0 */
112 s=u_skipWhitespace(s+1);
113 i=uprv_strtoul(s, (char **)&t, 16);
114 if(s!=t) {
115 if(0x7f<i) {
116 return s;
117 }
118 s=u_skipWhitespace(t);
119 entry=MBCS_ENTRY_SET_STATE(entry, i);
120 }
121 }
122
123 /* get the state action, default to valid */
124 if(*s=='.') {
125 /* this is a final state */
126 entry=MBCS_ENTRY_SET_FINAL(entry);
127
128 s=u_skipWhitespace(s+1);
129 if(*s=='u') {
130 /* unassigned set U+fffe */
131 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
132 s=u_skipWhitespace(s+1);
133 } else if(*s=='p') {
134 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
135 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
136 } else {
137 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
138 }
139 s=u_skipWhitespace(s+1);
140 } else if(*s=='s') {
141 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
142 s=u_skipWhitespace(s+1);
143 } else if(*s=='i') {
144 /* illegal set U+ffff */
145 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
146 s=u_skipWhitespace(s+1);
147 } else {
148 /* default to valid */
149 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
150 }
151 } else {
152 /* this is an intermediate state, nothing to do */
153 }
154 }
155
156 /* adjust "final valid" states according to the state flags */
157 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
158 switch(*pFlags) {
159 case 0:
160 /* no adjustment */
161 break;
162 case MBCS_STATE_FLAG_DIRECT:
163 /* set the valid-direct code point to "unassigned"==0xfffe */
164 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
165 break;
166 case MBCS_STATE_FLAG_SURROGATES:
167 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
168 break;
169 default:
170 break;
171 }
172 }
173
174 /* set this entry for the range */
175 for(i=start; i<=end; ++i) {
176 state[i]=entry;
177 }
178
179 if(*s==',') {
180 ++s;
181 } else {
182 return *s==0 ? NULL : s;
183 }
184 }
185 }
186
187 U_CAPI void U_EXPORT2
ucm_addState(UCMStates * states,const char * s)188 ucm_addState(UCMStates *states, const char *s) {
189 const char *error;
190
191 if(states->countStates==MBCS_MAX_STATE_COUNT) {
192 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
193 exit(U_INVALID_TABLE_FORMAT);
194 }
195
196 error=parseState(s, states->stateTable[states->countStates],
197 &states->stateFlags[states->countStates]);
198 if(error!=NULL) {
199 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
200 exit(U_INVALID_TABLE_FORMAT);
201 }
202
203 ++states->countStates;
204 }
205
206 U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile * ucm,char * line,char ** pKey,char ** pValue)207 ucm_parseHeaderLine(UCMFile *ucm,
208 char *line, char **pKey, char **pValue) {
209 UCMStates *states;
210 char *s, *end;
211 char c;
212
213 states=&ucm->states;
214
215 /* remove comments and trailing CR and LF and remove whitespace from the end */
216 for(end=line; (c=*end)!=0; ++end) {
217 if(c=='#' || c=='\r' || c=='\n') {
218 break;
219 }
220 }
221 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
222 --end;
223 }
224 *end=0;
225
226 /* skip leading white space and ignore empty lines */
227 s=(char *)u_skipWhitespace(line);
228 if(*s==0) {
229 return TRUE;
230 }
231
232 /* stop at the beginning of the mapping section */
233 if(uprv_memcmp(s, "CHARMAP", 7)==0) {
234 return FALSE;
235 }
236
237 /* get the key name, bracketed in <> */
238 if(*s!='<') {
239 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
240 exit(U_INVALID_TABLE_FORMAT);
241 }
242 *pKey=++s;
243 while(*s!='>') {
244 if(*s==0) {
245 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
246 exit(U_INVALID_TABLE_FORMAT);
247 }
248 ++s;
249 }
250 *s=0;
251
252 /* get the value string, possibly quoted */
253 s=(char *)u_skipWhitespace(s+1);
254 if(*s!='"') {
255 *pValue=s;
256 } else {
257 /* remove the quotes */
258 *pValue=s+1;
259 if(end>*pValue && *(end-1)=='"') {
260 *--end=0;
261 }
262 }
263
264 /* collect the information from the header field, ignore unknown keys */
265 if(uprv_strcmp(*pKey, "uconv_class")==0) {
266 if(uprv_strcmp(*pValue, "DBCS")==0) {
267 states->conversionType=UCNV_DBCS;
268 } else if(uprv_strcmp(*pValue, "SBCS")==0) {
269 states->conversionType = UCNV_SBCS;
270 } else if(uprv_strcmp(*pValue, "MBCS")==0) {
271 states->conversionType = UCNV_MBCS;
272 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
273 states->conversionType = UCNV_EBCDIC_STATEFUL;
274 } else {
275 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
276 exit(U_INVALID_TABLE_FORMAT);
277 }
278 return TRUE;
279 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
280 c=**pValue;
281 if('1'<=c && c<='4' && (*pValue)[1]==0) {
282 states->maxCharLength=(int8_t)(c-'0');
283 states->outputType=(int8_t)(states->maxCharLength-1);
284 } else {
285 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
286 exit(U_INVALID_TABLE_FORMAT);
287 }
288 return TRUE;
289 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
290 c=**pValue;
291 if('1'<=c && c<='4' && (*pValue)[1]==0) {
292 states->minCharLength=(int8_t)(c-'0');
293 } else {
294 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
295 exit(U_INVALID_TABLE_FORMAT);
296 }
297 return TRUE;
298 } else if(uprv_strcmp(*pKey, "icu:state")==0) {
299 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
300 switch(states->conversionType) {
301 case UCNV_SBCS:
302 case UCNV_DBCS:
303 case UCNV_EBCDIC_STATEFUL:
304 states->conversionType=UCNV_MBCS;
305 break;
306 case UCNV_MBCS:
307 break;
308 default:
309 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
310 exit(U_INVALID_TABLE_FORMAT);
311 }
312
313 if(states->maxCharLength==0) {
314 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
315 exit(U_INVALID_TABLE_FORMAT);
316 }
317 ucm_addState(states, *pValue);
318 return TRUE;
319 } else if(uprv_strcmp(*pKey, "icu:base")==0) {
320 if(**pValue==0) {
321 fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
322 exit(U_INVALID_TABLE_FORMAT);
323 }
324 uprv_strcpy(ucm->baseName, *pValue);
325 return TRUE;
326 }
327
328 return FALSE;
329 }
330
331 /* post-processing ---------------------------------------------------------- */
332
333 static int32_t
sumUpStates(UCMStates * states)334 sumUpStates(UCMStates *states) {
335 int32_t entry, sum, state, cell, count;
336 UBool allStatesReady;
337
338 /*
339 * Sum up the offsets for all states.
340 * In each final state (where there are only final entries),
341 * the offsets add up directly.
342 * In all other state table rows, for each transition entry to another state,
343 * the offsets sum of that state needs to be added.
344 * This is achieved in at most countStates iterations.
345 */
346 allStatesReady=FALSE;
347 for(count=states->countStates; !allStatesReady && count>=0; --count) {
348 allStatesReady=TRUE;
349 for(state=states->countStates-1; state>=0; --state) {
350 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
351 allStatesReady=FALSE;
352 sum=0;
353
354 /* at first, add up only the final delta offsets to keep them <512 */
355 for(cell=0; cell<256; ++cell) {
356 entry=states->stateTable[state][cell];
357 if(MBCS_ENTRY_IS_FINAL(entry)) {
358 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
359 case MBCS_STATE_VALID_16:
360 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
361 sum+=1;
362 break;
363 case MBCS_STATE_VALID_16_PAIR:
364 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
365 sum+=2;
366 break;
367 default:
368 /* no addition */
369 break;
370 }
371 }
372 }
373
374 /* now, add up the delta offsets for the transitional entries */
375 for(cell=0; cell<256; ++cell) {
376 entry=states->stateTable[state][cell];
377 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
378 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
379 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
380 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
381 } else {
382 /* that next state does not have a sum yet, we cannot finish the one for this state */
383 sum=-1;
384 break;
385 }
386 }
387 }
388
389 if(sum!=-1) {
390 states->stateOffsetSum[state]=sum;
391 states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
392 }
393 }
394 }
395 }
396
397 if(!allStatesReady) {
398 fprintf(stderr, "ucm error: the state table contains loops\n");
399 exit(U_INVALID_TABLE_FORMAT);
400 }
401
402 /*
403 * For all "direct" (i.e., initial) states>0,
404 * the offsets need to be increased by the sum of
405 * the previous initial states.
406 */
407 sum=states->stateOffsetSum[0];
408 for(state=1; state<states->countStates; ++state) {
409 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
410 int32_t sum2=sum;
411 sum+=states->stateOffsetSum[state];
412 for(cell=0; cell<256; ++cell) {
413 entry=states->stateTable[state][cell];
414 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
415 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
416 }
417 }
418 }
419 }
420
421 /* round up to the next even number to have the following data 32-bit-aligned */
422 return states->countToUCodeUnits=(sum+1)&~1;
423 }
424
425 U_CAPI void U_EXPORT2
ucm_processStates(UCMStates * states,UBool ignoreSISOCheck)426 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
427 int32_t entry, state, cell, count;
428
429 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
430 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
431 exit(U_INVALID_TABLE_FORMAT);
432 }
433
434 if(states->countStates==0) {
435 switch(states->conversionType) {
436 case UCNV_SBCS:
437 /* SBCS: use MBCS data structure with a default state table */
438 if(states->maxCharLength!=1) {
439 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
440 exit(U_INVALID_TABLE_FORMAT);
441 }
442 states->conversionType=UCNV_MBCS;
443 ucm_addState(states, "0-ff");
444 break;
445 case UCNV_MBCS:
446 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
447 exit(U_INVALID_TABLE_FORMAT);
448 break;
449 case UCNV_EBCDIC_STATEFUL:
450 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
451 if(states->minCharLength!=1 || states->maxCharLength!=2) {
452 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
453 exit(U_INVALID_TABLE_FORMAT);
454 }
455 states->conversionType=UCNV_MBCS;
456 ucm_addState(states, "0-ff, e:1.s, f:0.s");
457 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
458 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
459 ucm_addState(states, "0-ff:1.i, 40:1.");
460 ucm_addState(states, "0-ff:1.i");
461 break;
462 case UCNV_DBCS:
463 /* DBCS: use MBCS data structure with a default state table */
464 if(states->minCharLength!=2 || states->maxCharLength!=2) {
465 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
466 exit(U_INVALID_TABLE_FORMAT);
467 }
468 states->conversionType = UCNV_MBCS;
469 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
470 ucm_addState(states, "41-fe");
471 ucm_addState(states, "40");
472 ucm_addState(states, "");
473 break;
474 default:
475 fprintf(stderr, "ucm error: unknown charset structure\n");
476 exit(U_INVALID_TABLE_FORMAT);
477 break;
478 }
479 }
480
481 /*
482 * check that the min/max character lengths are reasonable;
483 * to do this right, all paths through the state table would have to be
484 * recursively walked while keeping track of the sequence lengths,
485 * but these simple checks cover most state tables in practice
486 */
487 if(states->maxCharLength<states->minCharLength) {
488 fprintf(stderr, "ucm error: max B/char < min B/char\n");
489 exit(U_INVALID_TABLE_FORMAT);
490 }
491
492 /* count non-direct states and compare with max B/char */
493 count=0;
494 for(state=0; state<states->countStates; ++state) {
495 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
496 ++count;
497 }
498 }
499 if(states->maxCharLength>count+1) {
500 fprintf(stderr, "ucm error: max B/char too large\n");
501 exit(U_INVALID_TABLE_FORMAT);
502 }
503
504 if(states->minCharLength==1) {
505 int32_t action;
506
507 /*
508 * if there are single-byte characters,
509 * then the initial state must have direct result states
510 */
511 for(cell=0; cell<256; ++cell) {
512 entry=states->stateTable[0][cell];
513 if( MBCS_ENTRY_IS_FINAL(entry) &&
514 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
515 action==MBCS_STATE_UNASSIGNED)
516 ) {
517 break;
518 }
519 }
520
521 if(cell==256) {
522 fprintf(stderr, "ucm warning: min B/char too small\n");
523 }
524 }
525
526 /*
527 * make sure that all "next state" values are within limits
528 * and that all next states after final ones have the "direct"
529 * flag of initial states
530 */
531 for(state=states->countStates-1; state>=0; --state) {
532 for(cell=0; cell<256; ++cell) {
533 entry=states->stateTable[state][cell];
534 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
535 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
536 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
537 exit(U_INVALID_TABLE_FORMAT);
538 }
539 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
540 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
541 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
542 exit(U_INVALID_TABLE_FORMAT);
543 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
544 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
545 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
546 exit(U_INVALID_TABLE_FORMAT);
547 }
548 }
549 }
550
551 /* is this an SI/SO (like EBCDIC-stateful) state table? */
552 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
553 if(states->maxCharLength!=2) {
554 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
555 exit(U_INVALID_TABLE_FORMAT);
556 }
557 if(states->countStates<3) {
558 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
559 exit(U_INVALID_TABLE_FORMAT);
560 }
561 /* are the SI/SO all in the right places? */
562 if( ignoreSISOCheck ||
563 (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
564 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
565 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
566 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
567 ) {
568 states->outputType=MBCS_OUTPUT_2_SISO;
569 } else {
570 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
571 exit(U_INVALID_TABLE_FORMAT);
572 }
573 state=2;
574 } else {
575 state=1;
576 }
577
578 /* check that no unexpected state is a "direct" one */
579 while(state<states->countStates) {
580 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
581 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
582 exit(U_INVALID_TABLE_FORMAT);
583 }
584 ++state;
585 }
586
587 sumUpStates(states);
588 }
589
590 /* find a fallback for this offset; return the index or -1 if not found */
591 U_CAPI int32_t U_EXPORT2
ucm_findFallback(_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,uint32_t offset)592 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
593 uint32_t offset) {
594 int32_t i;
595
596 if(countToUFallbacks==0) {
597 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
598 return -1;
599 }
600
601 /* do a linear search for the fallback mapping (the table is not yet sorted) */
602 for(i=0; i<countToUFallbacks; ++i) {
603 if(offset==toUFallbacks[i].offset) {
604 return i;
605 }
606 }
607 return -1;
608 }
609
610 /*
611 * This function tries to compact toUnicode tables for 2-byte codepages
612 * by finding lead bytes with all-unassigned trail bytes and adding another state
613 * for them.
614 */
615 static void
compactToUnicode2(UCMStates * states,uint16_t ** pUnicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,UBool verbose)616 compactToUnicode2(UCMStates *states,
617 uint16_t **pUnicodeCodeUnits,
618 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
619 UBool verbose) {
620 int32_t (*oldStateTable)[256];
621 uint16_t count[256];
622 uint16_t *oldUnicodeCodeUnits;
623 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
624 int32_t i, j, leadState, trailState, newState, fallback;
625 uint16_t unit;
626
627 /* find the lead state */
628 if(states->outputType==MBCS_OUTPUT_2_SISO) {
629 /* use the DBCS lead state for SI/SO codepages */
630 leadState=1;
631 } else {
632 leadState=0;
633 }
634
635 /* find the main trail state: the most used target state */
636 uprv_memset(count, 0, sizeof(count));
637 for(i=0; i<256; ++i) {
638 entry=states->stateTable[leadState][i];
639 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
640 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
641 }
642 }
643 trailState=0;
644 for(i=1; i<states->countStates; ++i) {
645 if(count[i]>count[trailState]) {
646 trailState=i;
647 }
648 }
649
650 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
651 uprv_memset(count, 0, sizeof(count));
652 savings=0;
653 /* for each lead byte */
654 for(i=0; i<256; ++i) {
655 entry=states->stateTable[leadState][i];
656 if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
657 /* the offset is different for each lead byte */
658 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
659 /* for each trail byte for this lead byte */
660 for(j=0; j<256; ++j) {
661 entry=states->stateTable[trailState][j];
662 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
663 case MBCS_STATE_VALID_16:
664 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
665 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
666 ++count[i];
667 } else {
668 j=999; /* do not count for this lead byte because there are assignments */
669 }
670 break;
671 case MBCS_STATE_VALID_16_PAIR:
672 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
673 if((*pUnicodeCodeUnits)[entry]==0xfffe) {
674 count[i]+=2;
675 } else {
676 j=999; /* do not count for this lead byte because there are assignments */
677 }
678 break;
679 default:
680 break;
681 }
682 }
683 if(j==256) {
684 /* all trail bytes for this lead byte are unassigned */
685 savings+=count[i];
686 } else {
687 count[i]=0;
688 }
689 }
690 }
691 /* subtract from the possible savings the cost of an additional state */
692 savings=savings*2-1024; /* count bytes, not 16-bit words */
693 if(savings<=0) {
694 return;
695 }
696 if(verbose) {
697 printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
698 }
699 if(states->countStates>=MBCS_MAX_STATE_COUNT) {
700 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
701 return;
702 }
703
704 /* make a copy of the state table */
705 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
706 if(oldStateTable==NULL) {
707 fprintf(stderr, "cannot compact toUnicode: out of memory\n");
708 return;
709 }
710 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
711
712 /* add the new state */
713 /*
714 * this function does not catch the degenerate case where all lead bytes
715 * have all-unassigned trail bytes and the lead state could be removed
716 */
717 newState=states->countStates++;
718 states->stateFlags[newState]=0;
719 /* copy the old trail state, turning all assigned states into unassigned ones */
720 for(i=0; i<256; ++i) {
721 entry=states->stateTable[trailState][i];
722 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
723 case MBCS_STATE_VALID_16:
724 case MBCS_STATE_VALID_16_PAIR:
725 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
726 break;
727 default:
728 states->stateTable[newState][i]=entry;
729 break;
730 }
731 }
732
733 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
734 for(i=0; i<256; ++i) {
735 if(count[i]>0) {
736 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
737 }
738 }
739
740 /* sum up the new state table */
741 for(i=0; i<states->countStates; ++i) {
742 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
743 }
744 sum=sumUpStates(states);
745
746 /* allocate a new, smaller code units array */
747 oldUnicodeCodeUnits=*pUnicodeCodeUnits;
748 if(sum==0) {
749 *pUnicodeCodeUnits=NULL;
750 if(oldUnicodeCodeUnits!=NULL) {
751 uprv_free(oldUnicodeCodeUnits);
752 }
753 uprv_free(oldStateTable);
754 return;
755 }
756 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
757 if(*pUnicodeCodeUnits==NULL) {
758 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
759 (long)sum);
760 /* revert to the old state table */
761 *pUnicodeCodeUnits=oldUnicodeCodeUnits;
762 --states->countStates;
763 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
764 uprv_free(oldStateTable);
765 return;
766 }
767 for(i=0; i<sum; ++i) {
768 (*pUnicodeCodeUnits)[i]=0xfffe;
769 }
770
771 /* copy the code units for all assigned characters */
772 /*
773 * The old state table has the same lead _and_ trail states for assigned characters!
774 * The differences are in the offsets, and in the trail states for some unassigned characters.
775 * For each character with an assigned state in the new table, it was assigned in the old one.
776 * Only still-assigned characters are copied.
777 * Note that fallback mappings need to get their offset values adjusted.
778 */
779
780 /* for each initial state */
781 for(leadState=0; leadState<states->countStates; ++leadState) {
782 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
783 /* for each lead byte from there */
784 for(i=0; i<256; ++i) {
785 entry=states->stateTable[leadState][i];
786 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
787 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
788 /* the new state does not have assigned states */
789 if(trailState!=newState) {
790 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
791 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
792 /* for each trail byte */
793 for(j=0; j<256; ++j) {
794 entry=states->stateTable[trailState][j];
795 /* copy assigned-character code units and adjust fallback offsets */
796 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
797 case MBCS_STATE_VALID_16:
798 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
799 /* find the old offset according to the old state table */
800 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
801 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
802 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
803 toUFallbacks[fallback].offset=0x80000000|offset;
804 }
805 break;
806 case MBCS_STATE_VALID_16_PAIR:
807 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
808 /* find the old offset according to the old state table */
809 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
810 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
811 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
812 break;
813 default:
814 break;
815 }
816 }
817 }
818 }
819 }
820 }
821 }
822
823 /* remove temporary flags from fallback offsets that protected them from being modified twice */
824 for(i=0; i<countToUFallbacks; ++i) {
825 toUFallbacks[i].offset&=0x7fffffff;
826 }
827
828 /* free temporary memory */
829 uprv_free(oldUnicodeCodeUnits);
830 uprv_free(oldStateTable);
831 }
832
833 /*
834 * recursive sub-function of compactToUnicodeHelper()
835 * returns:
836 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
837 * if all sequences from this state are unassigned, returns the
838 * <0 there are assignments in unicodeCodeUnits[]
839 * 0 no use of unicodeCodeUnits[]
840 */
841 static int32_t
findUnassigned(UCMStates * states,uint16_t * unicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,int32_t state,int32_t offset,uint32_t b)842 findUnassigned(UCMStates *states,
843 uint16_t *unicodeCodeUnits,
844 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
845 int32_t state, int32_t offset, uint32_t b) {
846 int32_t i, entry, savings, localSavings, belowSavings;
847 UBool haveAssigned;
848
849 localSavings=belowSavings=0;
850 haveAssigned=FALSE;
851 for(i=0; i<256; ++i) {
852 entry=states->stateTable[state][i];
853 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
854 savings=findUnassigned(states,
855 unicodeCodeUnits,
856 toUFallbacks, countToUFallbacks,
857 MBCS_ENTRY_TRANSITION_STATE(entry),
858 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
859 (b<<8)|(uint32_t)i);
860 if(savings<0) {
861 haveAssigned=TRUE;
862 } else if(savings>0) {
863 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
864 (unsigned long)((b<<8)|i), (long)state, (long)savings);
865 belowSavings+=savings;
866 }
867 } else if(!haveAssigned) {
868 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
869 case MBCS_STATE_VALID_16:
870 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
871 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
872 localSavings+=2;
873 } else {
874 haveAssigned=TRUE;
875 }
876 break;
877 case MBCS_STATE_VALID_16_PAIR:
878 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
879 if(unicodeCodeUnits[entry]==0xfffe) {
880 localSavings+=4;
881 } else {
882 haveAssigned=TRUE;
883 }
884 break;
885 default:
886 break;
887 }
888 }
889 }
890 if(haveAssigned) {
891 return -1;
892 } else {
893 return localSavings+belowSavings;
894 }
895 }
896
897 /* helper function for finding compaction opportunities */
898 static void
compactToUnicodeHelper(UCMStates * states,uint16_t * unicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks)899 compactToUnicodeHelper(UCMStates *states,
900 uint16_t *unicodeCodeUnits,
901 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
902 int32_t state, savings;
903
904 /* for each initial state */
905 for(state=0; state<states->countStates; ++state) {
906 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
907 savings=findUnassigned(states,
908 unicodeCodeUnits,
909 toUFallbacks, countToUFallbacks,
910 state, 0, 0);
911 if(savings>0) {
912 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
913 (long)state, (long)savings);
914 }
915 }
916 }
917 }
918
919 static int32_t
compareFallbacks(const void * context,const void * fb1,const void * fb2)920 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
921 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
922 }
923
924 U_CAPI void U_EXPORT2
ucm_optimizeStates(UCMStates * states,uint16_t ** pUnicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,UBool verbose)925 ucm_optimizeStates(UCMStates *states,
926 uint16_t **pUnicodeCodeUnits,
927 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
928 UBool verbose) {
929 UErrorCode errorCode;
930 int32_t state, cell, entry;
931
932 /* test each state table entry */
933 for(state=0; state<states->countStates; ++state) {
934 for(cell=0; cell<256; ++cell) {
935 entry=states->stateTable[state][cell];
936 /*
937 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
938 * and the code point is "unassigned" (0xfffe), then change it to
939 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
940 */
941 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
942 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
943 }
944 }
945 }
946
947 /* try to compact the toUnicode tables */
948 if(states->maxCharLength==2) {
949 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
950 } else if(states->maxCharLength>2) {
951 if(verbose) {
952 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
953 }
954 }
955
956 /* sort toUFallbacks */
957 /*
958 * It should be safe to sort them before compactToUnicode2() is called,
959 * because it should not change the relative order of the offset values
960 * that it adjusts, but they need to be sorted at some point, and
961 * it is safest here.
962 */
963 if(countToUFallbacks>0) {
964 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
965 uprv_sortArray(toUFallbacks, countToUFallbacks,
966 sizeof(_MBCSToUFallback),
967 compareFallbacks, NULL, FALSE, &errorCode);
968 }
969 }
970
971 /* use a complete state table ----------------------------------------------- */
972
973 U_CAPI int32_t U_EXPORT2
ucm_countChars(UCMStates * states,const uint8_t * bytes,int32_t length)974 ucm_countChars(UCMStates *states,
975 const uint8_t *bytes, int32_t length) {
976 uint32_t offset;
977 int32_t i, entry, count;
978 uint8_t state;
979
980 offset=0;
981 count=0;
982 state=0;
983
984 if(states->countStates==0) {
985 fprintf(stderr, "ucm error: there is no state information!\n");
986 return -1;
987 }
988
989 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
990 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
991 state=1;
992 }
993
994 /*
995 * Walk down the state table like in conversion,
996 * much like getNextUChar().
997 * We assume that c<=0x10ffff.
998 */
999 for(i=0; i<length; ++i) {
1000 entry=states->stateTable[state][bytes[i]];
1001 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1002 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1003 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1004 } else {
1005 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1006 case MBCS_STATE_ILLEGAL:
1007 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
1008 return -1;
1009 case MBCS_STATE_CHANGE_ONLY:
1010 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
1011 return -1;
1012 case MBCS_STATE_UNASSIGNED:
1013 case MBCS_STATE_FALLBACK_DIRECT_16:
1014 case MBCS_STATE_VALID_DIRECT_16:
1015 case MBCS_STATE_FALLBACK_DIRECT_20:
1016 case MBCS_STATE_VALID_DIRECT_20:
1017 case MBCS_STATE_VALID_16:
1018 case MBCS_STATE_VALID_16_PAIR:
1019 /* count a complete character and prepare for a new one */
1020 ++count;
1021 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1022 offset=0;
1023 break;
1024 default:
1025 /* reserved, must never occur */
1026 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
1027 return -1;
1028 }
1029 }
1030 }
1031
1032 if(offset!=0) {
1033 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
1034 return -1;
1035 }
1036
1037 /*
1038 * for SI/SO (like EBCDIC-stateful), multiple-character results
1039 * must consist of only double-byte sequences
1040 */
1041 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
1042 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
1043 return -1;
1044 }
1045
1046 return count;
1047 }
1048 #endif
1049
1050