1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucmstate.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003oct09
16 * created by: Markus W. Scherer
17 *
18 * This file handles ICU .ucm file state information as part of the ucm module.
19 * Most of this code used to be in makeconv.c.
20 */
21
22 #include "unicode/utypes.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "uarrsort.h"
26 #include "ucnvmbcs.h"
27 #include "ucnv_ext.h"
28 #include "uparse.h"
29 #include "ucm.h"
30 #include <stdio.h>
31
32 #if !UCONFIG_NO_CONVERSION
33
34 /* MBCS state handling ------------------------------------------------------ */
35
36 /*
37 * state table row grammar (ebnf-style):
38 * (whitespace is allowed between all tokens)
39 *
40 * row=[[firstentry ','] entry (',' entry)*]
41 * firstentry="initial" | "surrogates"
42 * (initial state (default for state 0), output is all surrogate pairs)
43 * entry=range [':' nextstate] ['.' action]
44 * range=number ['-' number]
45 * nextstate=number
46 * (0..7f)
47 * action='u' | 's' | 'p' | 'i'
48 * (unassigned, state change only, surrogate pair, illegal)
49 * number=(1- or 2-digit hexadecimal number)
50 */
51 static const char *
parseState(const char * s,int32_t state[256],uint32_t * pFlags)52 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
53 const char *t;
54 uint32_t start, end, i;
55 int32_t entry;
56
57 /* initialize the state: all illegal with U+ffff */
58 for(i=0; i<256; ++i) {
59 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
60 }
61
62 /* skip leading white space */
63 s=u_skipWhitespace(s);
64
65 /* is there an "initial" or "surrogates" directive? */
66 if(uprv_strncmp("initial", s, 7)==0) {
67 *pFlags=MBCS_STATE_FLAG_DIRECT;
68 s=u_skipWhitespace(s+7);
69 if(*s++!=',') {
70 return s-1;
71 }
72 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
73 *pFlags=MBCS_STATE_FLAG_SURROGATES;
74 s=u_skipWhitespace(s+10);
75 if(*s++!=',') {
76 return s-1;
77 }
78 } else if(*s==0) {
79 /* empty state row: all-illegal */
80 return NULL;
81 }
82
83 for(;;) {
84 /* read an entry, the start of the range first */
85 s=u_skipWhitespace(s);
86 start=uprv_strtoul(s, (char **)&t, 16);
87 if(s==t || 0xff<start) {
88 return s;
89 }
90 s=u_skipWhitespace(t);
91
92 /* read the end of the range if there is one */
93 if(*s=='-') {
94 s=u_skipWhitespace(s+1);
95 end=uprv_strtoul(s, (char **)&t, 16);
96 if(s==t || end<start || 0xff<end) {
97 return s;
98 }
99 s=u_skipWhitespace(t);
100 } else {
101 end=start;
102 }
103
104 /* determine the state entry for this range */
105 if(*s!=':' && *s!='.') {
106 /* the default is: final state with valid entries */
107 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
108 } else {
109 entry=MBCS_ENTRY_TRANSITION(0, 0);
110 if(*s==':') {
111 /* get the next state, default to 0 */
112 s=u_skipWhitespace(s+1);
113 i=uprv_strtoul(s, (char **)&t, 16);
114 if(s!=t) {
115 if(0x7f<i) {
116 return s;
117 }
118 s=u_skipWhitespace(t);
119 entry=MBCS_ENTRY_SET_STATE(entry, i);
120 }
121 }
122
123 /* get the state action, default to valid */
124 if(*s=='.') {
125 /* this is a final state */
126 entry=MBCS_ENTRY_SET_FINAL(entry);
127
128 s=u_skipWhitespace(s+1);
129 if(*s=='u') {
130 /* unassigned set U+fffe */
131 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
132 s=u_skipWhitespace(s+1);
133 } else if(*s=='p') {
134 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
135 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
136 } else {
137 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
138 }
139 s=u_skipWhitespace(s+1);
140 } else if(*s=='s') {
141 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
142 s=u_skipWhitespace(s+1);
143 } else if(*s=='i') {
144 /* illegal set U+ffff */
145 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
146 s=u_skipWhitespace(s+1);
147 } else {
148 /* default to valid */
149 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
150 }
151 } else {
152 /* this is an intermediate state, nothing to do */
153 }
154 }
155
156 /* adjust "final valid" states according to the state flags */
157 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
158 switch(*pFlags) {
159 case 0:
160 /* no adjustment */
161 break;
162 case MBCS_STATE_FLAG_DIRECT:
163 /* set the valid-direct code point to "unassigned"==0xfffe */
164 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
165 break;
166 case MBCS_STATE_FLAG_SURROGATES:
167 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
168 break;
169 default:
170 break;
171 }
172 }
173
174 /* set this entry for the range */
175 for(i=start; i<=end; ++i) {
176 state[i]=entry;
177 }
178
179 if(*s==',') {
180 ++s;
181 } else {
182 return *s==0 ? NULL : s;
183 }
184 }
185 }
186
187 U_CAPI void U_EXPORT2
ucm_addState(UCMStates * states,const char * s)188 ucm_addState(UCMStates *states, const char *s) {
189 const char *error;
190
191 if(states->countStates==MBCS_MAX_STATE_COUNT) {
192 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
193 exit(U_INVALID_TABLE_FORMAT);
194 }
195
196 error=parseState(s, states->stateTable[states->countStates],
197 &states->stateFlags[states->countStates]);
198 if(error!=NULL) {
199 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
200 exit(U_INVALID_TABLE_FORMAT);
201 }
202
203 ++states->countStates;
204 }
205
206 U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile * ucm,char * line,char ** pKey,char ** pValue)207 ucm_parseHeaderLine(UCMFile *ucm,
208 char *line, char **pKey, char **pValue) {
209 UCMStates *states;
210 char *s, *end;
211 char c;
212
213 states=&ucm->states;
214
215 /* remove comments and trailing CR and LF and remove whitespace from the end */
216 for(end=line; (c=*end)!=0; ++end) {
217 if(c=='#' || c=='\r' || c=='\n') {
218 break;
219 }
220 }
221 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
222 --end;
223 }
224 *end=0;
225
226 /* skip leading white space and ignore empty lines */
227 s=(char *)u_skipWhitespace(line);
228 if(*s==0) {
229 return true;
230 }
231
232 /* stop at the beginning of the mapping section */
233 if(uprv_memcmp(s, "CHARMAP", 7)==0) {
234 return false;
235 }
236
237 /* get the key name, bracketed in <> */
238 if(*s!='<') {
239 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
240 exit(U_INVALID_TABLE_FORMAT);
241 }
242 *pKey=++s;
243 while(*s!='>') {
244 if(*s==0) {
245 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
246 exit(U_INVALID_TABLE_FORMAT);
247 }
248 ++s;
249 }
250 *s=0;
251
252 /* get the value string, possibly quoted */
253 s=(char *)u_skipWhitespace(s+1);
254 if(*s!='"') {
255 *pValue=s;
256 } else {
257 /* remove the quotes */
258 *pValue=s+1;
259 if(end>*pValue && *(end-1)=='"') {
260 *--end=0;
261 }
262 }
263
264 /* collect the information from the header field, ignore unknown keys */
265 if(uprv_strcmp(*pKey, "uconv_class")==0) {
266 if(uprv_strcmp(*pValue, "DBCS")==0) {
267 states->conversionType=UCNV_DBCS;
268 } else if(uprv_strcmp(*pValue, "SBCS")==0) {
269 states->conversionType = UCNV_SBCS;
270 } else if(uprv_strcmp(*pValue, "MBCS")==0) {
271 states->conversionType = UCNV_MBCS;
272 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
273 states->conversionType = UCNV_EBCDIC_STATEFUL;
274 } else {
275 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
276 exit(U_INVALID_TABLE_FORMAT);
277 }
278 return true;
279 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
280 c=**pValue;
281 if('1'<=c && c<='4' && (*pValue)[1]==0) {
282 states->maxCharLength=(int8_t)(c-'0');
283 states->outputType=(int8_t)(states->maxCharLength-1);
284 } else {
285 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
286 exit(U_INVALID_TABLE_FORMAT);
287 }
288 return true;
289 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
290 c=**pValue;
291 if('1'<=c && c<='4' && (*pValue)[1]==0) {
292 states->minCharLength=(int8_t)(c-'0');
293 } else {
294 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
295 exit(U_INVALID_TABLE_FORMAT);
296 }
297 return true;
298 } else if(uprv_strcmp(*pKey, "icu:state")==0) {
299 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
300 switch(states->conversionType) {
301 case UCNV_SBCS:
302 case UCNV_DBCS:
303 case UCNV_EBCDIC_STATEFUL:
304 states->conversionType=UCNV_MBCS;
305 break;
306 case UCNV_MBCS:
307 break;
308 default:
309 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
310 exit(U_INVALID_TABLE_FORMAT);
311 }
312
313 if(states->maxCharLength==0) {
314 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
315 exit(U_INVALID_TABLE_FORMAT);
316 }
317 ucm_addState(states, *pValue);
318 return true;
319 } else if(uprv_strcmp(*pKey, "icu:base")==0) {
320 if(**pValue==0) {
321 fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
322 exit(U_INVALID_TABLE_FORMAT);
323 }
324 uprv_strcpy(ucm->baseName, *pValue);
325 return true;
326 }
327
328 return false;
329 }
330
331 /* post-processing ---------------------------------------------------------- */
332
333 static int32_t
sumUpStates(UCMStates * states)334 sumUpStates(UCMStates *states) {
335 int32_t entry, sum, state, cell, count;
336 UBool allStatesReady;
337
338 /*
339 * Sum up the offsets for all states.
340 * In each final state (where there are only final entries),
341 * the offsets add up directly.
342 * In all other state table rows, for each transition entry to another state,
343 * the offsets sum of that state needs to be added.
344 * This is achieved in at most countStates iterations.
345 */
346 allStatesReady=false;
347 for(count=states->countStates; !allStatesReady && count>=0; --count) {
348 allStatesReady=true;
349 for(state=states->countStates-1; state>=0; --state) {
350 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
351 allStatesReady=false;
352 sum=0;
353
354 /* at first, add up only the final delta offsets to keep them <512 */
355 for(cell=0; cell<256; ++cell) {
356 entry=states->stateTable[state][cell];
357 if(MBCS_ENTRY_IS_FINAL(entry)) {
358 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
359 case MBCS_STATE_VALID_16:
360 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
361 sum+=1;
362 break;
363 case MBCS_STATE_VALID_16_PAIR:
364 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
365 sum+=2;
366 break;
367 default:
368 /* no addition */
369 break;
370 }
371 }
372 }
373
374 /* now, add up the delta offsets for the transitional entries */
375 for(cell=0; cell<256; ++cell) {
376 entry=states->stateTable[state][cell];
377 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
378 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
379 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
380 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
381 } else {
382 /* that next state does not have a sum yet, we cannot finish the one for this state */
383 sum=-1;
384 break;
385 }
386 }
387 }
388
389 if(sum!=-1) {
390 states->stateOffsetSum[state]=sum;
391 states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
392 }
393 }
394 }
395 }
396
397 if(!allStatesReady) {
398 fprintf(stderr, "ucm error: the state table contains loops\n");
399 exit(U_INVALID_TABLE_FORMAT);
400 }
401
402 /*
403 * For all "direct" (i.e., initial) states>0,
404 * the offsets need to be increased by the sum of
405 * the previous initial states.
406 */
407 sum=states->stateOffsetSum[0];
408 for(state=1; state<states->countStates; ++state) {
409 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
410 int32_t sum2=sum;
411 sum+=states->stateOffsetSum[state];
412 for(cell=0; cell<256; ++cell) {
413 entry=states->stateTable[state][cell];
414 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
415 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
416 }
417 }
418 }
419 }
420
421 /* round up to the next even number to have the following data 32-bit-aligned */
422 return states->countToUCodeUnits=(sum+1)&~1;
423 }
424
425 U_CAPI void U_EXPORT2
ucm_processStates(UCMStates * states,UBool ignoreSISOCheck)426 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
427 int32_t entry, state, cell, count;
428
429 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
430 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
431 exit(U_INVALID_TABLE_FORMAT);
432 }
433
434 if(states->countStates==0) {
435 switch(states->conversionType) {
436 case UCNV_SBCS:
437 /* SBCS: use MBCS data structure with a default state table */
438 if(states->maxCharLength!=1) {
439 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
440 exit(U_INVALID_TABLE_FORMAT);
441 }
442 states->conversionType=UCNV_MBCS;
443 ucm_addState(states, "0-ff");
444 break;
445 case UCNV_MBCS:
446 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
447 exit(U_INVALID_TABLE_FORMAT);
448 break;
449 case UCNV_EBCDIC_STATEFUL:
450 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
451 if(states->minCharLength!=1 || states->maxCharLength!=2) {
452 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
453 exit(U_INVALID_TABLE_FORMAT);
454 }
455 states->conversionType=UCNV_MBCS;
456 ucm_addState(states, "0-ff, e:1.s, f:0.s");
457 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
458 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
459 ucm_addState(states, "0-ff:1.i, 40:1.");
460 ucm_addState(states, "0-ff:1.i");
461 break;
462 case UCNV_DBCS:
463 /* DBCS: use MBCS data structure with a default state table */
464 if(states->minCharLength!=2 || states->maxCharLength!=2) {
465 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
466 exit(U_INVALID_TABLE_FORMAT);
467 }
468 states->conversionType = UCNV_MBCS;
469 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
470 ucm_addState(states, "41-fe");
471 ucm_addState(states, "40");
472 ucm_addState(states, "");
473 break;
474 default:
475 fprintf(stderr, "ucm error: unknown charset structure\n");
476 exit(U_INVALID_TABLE_FORMAT);
477 break;
478 }
479 }
480
481 /*
482 * check that the min/max character lengths are reasonable;
483 * to do this right, all paths through the state table would have to be
484 * recursively walked while keeping track of the sequence lengths,
485 * but these simple checks cover most state tables in practice
486 */
487 if(states->maxCharLength<states->minCharLength) {
488 fprintf(stderr, "ucm error: max B/char < min B/char\n");
489 exit(U_INVALID_TABLE_FORMAT);
490 }
491
492 /* count non-direct states and compare with max B/char */
493 count=0;
494 for(state=0; state<states->countStates; ++state) {
495 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
496 ++count;
497 }
498 }
499 if(states->maxCharLength>count+1) {
500 fprintf(stderr, "ucm error: max B/char too large\n");
501 exit(U_INVALID_TABLE_FORMAT);
502 }
503
504 if(states->minCharLength==1) {
505 int32_t action;
506
507 /*
508 * if there are single-byte characters,
509 * then the initial state must have direct result states
510 */
511 for(cell=0; cell<256; ++cell) {
512 entry=states->stateTable[0][cell];
513 if( MBCS_ENTRY_IS_FINAL(entry) &&
514 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
515 action==MBCS_STATE_UNASSIGNED)
516 ) {
517 break;
518 }
519 }
520
521 if(cell==256) {
522 fprintf(stderr, "ucm warning: min B/char too small\n");
523 }
524 }
525
526 /*
527 * make sure that all "next state" values are within limits
528 * and that all next states after final ones have the "direct"
529 * flag of initial states
530 */
531 for(state=states->countStates-1; state>=0; --state) {
532 for(cell=0; cell<256; ++cell) {
533 entry=states->stateTable[state][cell];
534 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
535 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
536 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
537 exit(U_INVALID_TABLE_FORMAT);
538 }
539 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
540 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
541 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
542 exit(U_INVALID_TABLE_FORMAT);
543 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
544 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
545 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
546 exit(U_INVALID_TABLE_FORMAT);
547 }
548 }
549 }
550
551 /* is this an SI/SO (like EBCDIC-stateful) state table? */
552 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
553 if(states->maxCharLength!=2) {
554 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
555 exit(U_INVALID_TABLE_FORMAT);
556 }
557 if(states->countStates<3) {
558 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
559 exit(U_INVALID_TABLE_FORMAT);
560 }
561 /* are the SI/SO all in the right places? */
562 if( ignoreSISOCheck ||
563 (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
564 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
565 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
566 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
567 ) {
568 states->outputType=MBCS_OUTPUT_2_SISO;
569 } else {
570 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
571 exit(U_INVALID_TABLE_FORMAT);
572 }
573 state=2;
574 } else {
575 state=1;
576 }
577
578 /* check that no unexpected state is a "direct" one */
579 while(state<states->countStates) {
580 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
581 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
582 exit(U_INVALID_TABLE_FORMAT);
583 }
584 ++state;
585 }
586
587 sumUpStates(states);
588 }
589
590 /* find a fallback for this offset; return the index or -1 if not found */
591 U_CAPI int32_t U_EXPORT2
ucm_findFallback(_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,uint32_t offset)592 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
593 uint32_t offset) {
594 int32_t i;
595
596 if(countToUFallbacks==0) {
597 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
598 return -1;
599 }
600
601 /* do a linear search for the fallback mapping (the table is not yet sorted) */
602 for(i=0; i<countToUFallbacks; ++i) {
603 if(offset==toUFallbacks[i].offset) {
604 return i;
605 }
606 }
607 return -1;
608 }
609
610 /*
611 * This function tries to compact toUnicode tables for 2-byte codepages
612 * by finding lead bytes with all-unassigned trail bytes and adding another state
613 * for them.
614 */
615 static void
compactToUnicode2(UCMStates * states,uint16_t ** pUnicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,UBool verbose)616 compactToUnicode2(UCMStates *states,
617 uint16_t **pUnicodeCodeUnits,
618 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
619 UBool verbose) {
620 int32_t (*oldStateTable)[256];
621 uint16_t count[256];
622 uint16_t *oldUnicodeCodeUnits;
623 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
624 int32_t i, j, leadState, trailState, newState, fallback;
625 uint16_t unit;
626
627 /* find the lead state */
628 if(states->outputType==MBCS_OUTPUT_2_SISO) {
629 /* use the DBCS lead state for SI/SO codepages */
630 leadState=1;
631 } else {
632 leadState=0;
633 }
634
635 /* find the main trail state: the most used target state */
636 uprv_memset(count, 0, sizeof(count));
637 for(i=0; i<256; ++i) {
638 entry=states->stateTable[leadState][i];
639 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
640 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
641 }
642 }
643 trailState=0;
644 for(i=1; i<states->countStates; ++i) {
645 if(count[i]>count[trailState]) {
646 trailState=i;
647 }
648 }
649
650 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
651 uprv_memset(count, 0, sizeof(count));
652 savings=0;
653 /* for each lead byte */
654 for(i=0; i<256; ++i) {
655 entry=states->stateTable[leadState][i];
656 if(MBCS_ENTRY_IS_TRANSITION(entry) &&
657 (MBCS_ENTRY_TRANSITION_STATE(entry))==static_cast<uint32_t>(trailState)) {
658 /* the offset is different for each lead byte */
659 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
660 /* for each trail byte for this lead byte */
661 for(j=0; j<256; ++j) {
662 entry=states->stateTable[trailState][j];
663 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
664 case MBCS_STATE_VALID_16:
665 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
666 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
667 ++count[i];
668 } else {
669 j=999; /* do not count for this lead byte because there are assignments */
670 }
671 break;
672 case MBCS_STATE_VALID_16_PAIR:
673 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
674 if((*pUnicodeCodeUnits)[entry]==0xfffe) {
675 count[i]+=2;
676 } else {
677 j=999; /* do not count for this lead byte because there are assignments */
678 }
679 break;
680 default:
681 break;
682 }
683 }
684 if(j==256) {
685 /* all trail bytes for this lead byte are unassigned */
686 savings+=count[i];
687 } else {
688 count[i]=0;
689 }
690 }
691 }
692 /* subtract from the possible savings the cost of an additional state */
693 savings=savings*2-1024; /* count bytes, not 16-bit words */
694 if(savings<=0) {
695 return;
696 }
697 if(verbose) {
698 printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
699 }
700 if(states->countStates>=MBCS_MAX_STATE_COUNT) {
701 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
702 return;
703 }
704
705 /* make a copy of the state table */
706 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
707 if(oldStateTable==NULL) {
708 fprintf(stderr, "cannot compact toUnicode: out of memory\n");
709 return;
710 }
711 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
712
713 /* add the new state */
714 /*
715 * this function does not catch the degenerate case where all lead bytes
716 * have all-unassigned trail bytes and the lead state could be removed
717 */
718 newState=states->countStates++;
719 states->stateFlags[newState]=0;
720 /* copy the old trail state, turning all assigned states into unassigned ones */
721 for(i=0; i<256; ++i) {
722 entry=states->stateTable[trailState][i];
723 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
724 case MBCS_STATE_VALID_16:
725 case MBCS_STATE_VALID_16_PAIR:
726 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
727 break;
728 default:
729 states->stateTable[newState][i]=entry;
730 break;
731 }
732 }
733
734 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
735 for(i=0; i<256; ++i) {
736 if(count[i]>0) {
737 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
738 }
739 }
740
741 /* sum up the new state table */
742 for(i=0; i<states->countStates; ++i) {
743 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
744 }
745 sum=sumUpStates(states);
746
747 /* allocate a new, smaller code units array */
748 oldUnicodeCodeUnits=*pUnicodeCodeUnits;
749 if(sum==0) {
750 *pUnicodeCodeUnits=NULL;
751 if(oldUnicodeCodeUnits!=NULL) {
752 uprv_free(oldUnicodeCodeUnits);
753 }
754 uprv_free(oldStateTable);
755 return;
756 }
757 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
758 if(*pUnicodeCodeUnits==NULL) {
759 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
760 (long)sum);
761 /* revert to the old state table */
762 *pUnicodeCodeUnits=oldUnicodeCodeUnits;
763 --states->countStates;
764 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
765 uprv_free(oldStateTable);
766 return;
767 }
768 for(i=0; i<sum; ++i) {
769 (*pUnicodeCodeUnits)[i]=0xfffe;
770 }
771
772 /* copy the code units for all assigned characters */
773 /*
774 * The old state table has the same lead _and_ trail states for assigned characters!
775 * The differences are in the offsets, and in the trail states for some unassigned characters.
776 * For each character with an assigned state in the new table, it was assigned in the old one.
777 * Only still-assigned characters are copied.
778 * Note that fallback mappings need to get their offset values adjusted.
779 */
780
781 /* for each initial state */
782 for(leadState=0; leadState<states->countStates; ++leadState) {
783 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
784 /* for each lead byte from there */
785 for(i=0; i<256; ++i) {
786 entry=states->stateTable[leadState][i];
787 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
788 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
789 /* the new state does not have assigned states */
790 if(trailState!=newState) {
791 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
792 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
793 /* for each trail byte */
794 for(j=0; j<256; ++j) {
795 entry=states->stateTable[trailState][j];
796 /* copy assigned-character code units and adjust fallback offsets */
797 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
798 case MBCS_STATE_VALID_16:
799 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
800 /* find the old offset according to the old state table */
801 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
802 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
803 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
804 toUFallbacks[fallback].offset=0x80000000|offset;
805 }
806 break;
807 case MBCS_STATE_VALID_16_PAIR:
808 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
809 /* find the old offset according to the old state table */
810 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
811 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
812 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
813 break;
814 default:
815 break;
816 }
817 }
818 }
819 }
820 }
821 }
822 }
823
824 /* remove temporary flags from fallback offsets that protected them from being modified twice */
825 for(i=0; i<countToUFallbacks; ++i) {
826 toUFallbacks[i].offset&=0x7fffffff;
827 }
828
829 /* free temporary memory */
830 uprv_free(oldUnicodeCodeUnits);
831 uprv_free(oldStateTable);
832 }
833
834 /*
835 * recursive sub-function of compactToUnicodeHelper()
836 * returns:
837 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
838 * if all sequences from this state are unassigned, returns the
839 * <0 there are assignments in unicodeCodeUnits[]
840 * 0 no use of unicodeCodeUnits[]
841 */
842 static int32_t
findUnassigned(UCMStates * states,uint16_t * unicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,int32_t state,int32_t offset,uint32_t b)843 findUnassigned(UCMStates *states,
844 uint16_t *unicodeCodeUnits,
845 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
846 int32_t state, int32_t offset, uint32_t b) {
847 int32_t i, entry, savings, localSavings, belowSavings;
848 UBool haveAssigned;
849
850 localSavings=belowSavings=0;
851 haveAssigned=false;
852 for(i=0; i<256; ++i) {
853 entry=states->stateTable[state][i];
854 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
855 savings=findUnassigned(states,
856 unicodeCodeUnits,
857 toUFallbacks, countToUFallbacks,
858 MBCS_ENTRY_TRANSITION_STATE(entry),
859 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
860 (b<<8)|(uint32_t)i);
861 if(savings<0) {
862 haveAssigned=true;
863 } else if(savings>0) {
864 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
865 (unsigned long)((b<<8)|i), (long)state, (long)savings);
866 belowSavings+=savings;
867 }
868 } else if(!haveAssigned) {
869 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
870 case MBCS_STATE_VALID_16:
871 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
872 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
873 localSavings+=2;
874 } else {
875 haveAssigned=true;
876 }
877 break;
878 case MBCS_STATE_VALID_16_PAIR:
879 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
880 if(unicodeCodeUnits[entry]==0xfffe) {
881 localSavings+=4;
882 } else {
883 haveAssigned=true;
884 }
885 break;
886 default:
887 break;
888 }
889 }
890 }
891 if(haveAssigned) {
892 return -1;
893 } else {
894 return localSavings+belowSavings;
895 }
896 }
897
898 /* helper function for finding compaction opportunities */
899 static void
compactToUnicodeHelper(UCMStates * states,uint16_t * unicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks)900 compactToUnicodeHelper(UCMStates *states,
901 uint16_t *unicodeCodeUnits,
902 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
903 int32_t state, savings;
904
905 /* for each initial state */
906 for(state=0; state<states->countStates; ++state) {
907 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
908 savings=findUnassigned(states,
909 unicodeCodeUnits,
910 toUFallbacks, countToUFallbacks,
911 state, 0, 0);
912 if(savings>0) {
913 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
914 (long)state, (long)savings);
915 }
916 }
917 }
918 }
919
920 U_CDECL_BEGIN
921 static int32_t U_CALLCONV
compareFallbacks(const void * context,const void * fb1,const void * fb2)922 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
923 (void)context;
924 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
925 }
926 U_CDECL_END
927
928 U_CAPI void U_EXPORT2
ucm_optimizeStates(UCMStates * states,uint16_t ** pUnicodeCodeUnits,_MBCSToUFallback * toUFallbacks,int32_t countToUFallbacks,UBool verbose)929 ucm_optimizeStates(UCMStates *states,
930 uint16_t **pUnicodeCodeUnits,
931 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
932 UBool verbose) {
933 UErrorCode errorCode;
934 int32_t state, cell, entry;
935
936 /* test each state table entry */
937 for(state=0; state<states->countStates; ++state) {
938 for(cell=0; cell<256; ++cell) {
939 entry=states->stateTable[state][cell];
940 /*
941 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
942 * and the code point is "unassigned" (0xfffe), then change it to
943 * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
944 */
945 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
946 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
947 }
948 }
949 }
950
951 /* try to compact the toUnicode tables */
952 if(states->maxCharLength==2) {
953 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
954 } else if(states->maxCharLength>2) {
955 if(verbose) {
956 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
957 }
958 }
959
960 /* sort toUFallbacks */
961 /*
962 * It should be safe to sort them before compactToUnicode2() is called,
963 * because it should not change the relative order of the offset values
964 * that it adjusts, but they need to be sorted at some point, and
965 * it is safest here.
966 */
967 if(countToUFallbacks>0) {
968 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
969 uprv_sortArray(toUFallbacks, countToUFallbacks,
970 sizeof(_MBCSToUFallback),
971 compareFallbacks, NULL, false, &errorCode);
972 }
973 }
974
975 /* use a complete state table ----------------------------------------------- */
976
977 U_CAPI int32_t U_EXPORT2
ucm_countChars(UCMStates * states,const uint8_t * bytes,int32_t length)978 ucm_countChars(UCMStates *states,
979 const uint8_t *bytes, int32_t length) {
980 uint32_t offset;
981 int32_t i, entry, count;
982 uint8_t state;
983
984 offset=0;
985 count=0;
986 state=0;
987
988 if(states->countStates==0) {
989 fprintf(stderr, "ucm error: there is no state information!\n");
990 return -1;
991 }
992
993 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
994 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
995 state=1;
996 }
997
998 /*
999 * Walk down the state table like in conversion,
1000 * much like getNextUChar().
1001 * We assume that c<=0x10ffff.
1002 */
1003 for(i=0; i<length; ++i) {
1004 entry=states->stateTable[state][bytes[i]];
1005 if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1006 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1007 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1008 } else {
1009 switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1010 case MBCS_STATE_ILLEGAL:
1011 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
1012 return -1;
1013 case MBCS_STATE_CHANGE_ONLY:
1014 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
1015 return -1;
1016 case MBCS_STATE_UNASSIGNED:
1017 case MBCS_STATE_FALLBACK_DIRECT_16:
1018 case MBCS_STATE_VALID_DIRECT_16:
1019 case MBCS_STATE_FALLBACK_DIRECT_20:
1020 case MBCS_STATE_VALID_DIRECT_20:
1021 case MBCS_STATE_VALID_16:
1022 case MBCS_STATE_VALID_16_PAIR:
1023 /* count a complete character and prepare for a new one */
1024 ++count;
1025 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
1026 offset=0;
1027 break;
1028 default:
1029 /* reserved, must never occur */
1030 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
1031 return -1;
1032 }
1033 }
1034 }
1035
1036 if(offset!=0) {
1037 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
1038 return -1;
1039 }
1040
1041 /*
1042 * for SI/SO (like EBCDIC-stateful), multiple-character results
1043 * must consist of only double-byte sequences
1044 */
1045 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
1046 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
1047 return -1;
1048 }
1049
1050 return count;
1051 }
1052 #endif
1053
1054