// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  genmbcs.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000jul06
*   created by: Markus W. Scherer
*/
18 
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "cstring.h"
#include "cmemory.h"
#include "unewdata.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
#include "toolutil.h"
30 
/*
 * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
 * Reduce tests for maxCharLength.
 */
35 
36 struct MBCSData {
37     NewConverter newConverter;
38 
39     UCMFile *ucm;
40 
41     /* toUnicode (state table in ucm->states) */
42     _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
43     int32_t countToUFallbacks;
44     uint16_t *unicodeCodeUnits;
45 
46     /* fromUnicode */
47     uint16_t stage1[MBCS_STAGE_1_SIZE];
48     uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
49     uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
50     uint8_t *fromUBytes;
51     uint32_t stage2Top, stage3Top;
52 
53     /* fromUTF8 */
54     uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT];  /* allow for utf8Max=0xffff */
55 
56     /*
57      * Maximum UTF-8-friendly code point.
58      * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
59      * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
60      */
61     uint16_t utf8Max;
62 
63     UBool utf8Friendly;
64     UBool omitFromU;
65 };
66 
67 /* prototypes */
68 U_CDECL_BEGIN
69 static void
70 MBCSClose(NewConverter *cnvData);
71 
72 static UBool
73 MBCSStartMappings(MBCSData *mbcsData);
74 
75 static UBool
76 MBCSAddToUnicode(MBCSData *mbcsData,
77                  const uint8_t *bytes, int32_t length,
78                  UChar32 c,
79                  int8_t flag);
80 
81 static UBool
82 MBCSIsValid(NewConverter *cnvData,
83             const uint8_t *bytes, int32_t length);
84 
85 static UBool
86 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
87                          const uint8_t *bytes, int32_t length,
88                          UChar32 c,
89                          int8_t flag);
90 
91 static UBool
92 MBCSAddFromUnicode(MBCSData *mbcsData,
93                    const uint8_t *bytes, int32_t length,
94                    UChar32 c,
95                    int8_t flag);
96 
97 static void
98 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
99 
100 static UBool
101 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
102 
103 static uint32_t
104 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
105           UNewDataMemory *pData, int32_t tableType);
106 U_CDECL_END
107 
/* helper ------------------------------------------------------------------- */
109 
/*
 * Convert a nibble value to its lowercase hexadecimal digit.
 * The caller must pass a value in 0..15; larger values are not checked.
 */
static inline char
hexDigit(uint8_t digit) {
    return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
}

/*
 * Format length bytes as a NUL-terminated lowercase hex string
 * (two digits per byte, no separators) and return buffer.
 * buffer must have room for 2*length+1 chars; no bounds check is done here.
 * A length of 0 or less yields the empty string.
 */
static inline char *
printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
    char *s=buffer;
    while(length>0) {
        *s++=hexDigit((uint8_t)(*bytes>>4));
        *s++=hexDigit((uint8_t)(*bytes&0xf));
        ++bytes;
        --length;
    }

    *s=0;
    return buffer;
}
128 
/* implementation ----------------------------------------------------------- */
130 
131 static MBCSData gDummy;
132 
133 
134 U_CFUNC const MBCSData *
MBCSGetDummy()135 MBCSGetDummy() {
136     uprv_memset(&gDummy, 0, sizeof(MBCSData));
137 
138     /*
139      * Set "pessimistic" values which may sometimes move too many
140      * mappings to the extension table (but never too few).
141      * These values cause MBCSOkForBaseFromUnicode() to return false for the
142      * largest set of mappings.
143      * Assume maxCharLength>1.
144      */
145     gDummy.utf8Friendly=true;
146     if(SMALL) {
147         gDummy.utf8Max=0xffff;
148         gDummy.omitFromU=true;
149     } else {
150         gDummy.utf8Max=MBCS_UTF8_MAX;
151     }
152     return &gDummy;
153 }
154 
155 static void
MBCSInit(MBCSData * mbcsData,UCMFile * ucm)156 MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
157     uprv_memset(mbcsData, 0, sizeof(MBCSData));
158 
159     mbcsData->ucm=ucm; /* aliased, not owned */
160 
161     mbcsData->newConverter.close=MBCSClose;
162     mbcsData->newConverter.isValid=MBCSIsValid;
163     mbcsData->newConverter.addTable=MBCSAddTable;
164     mbcsData->newConverter.write=MBCSWrite;
165 }
166 
167 U_CFUNC NewConverter *
MBCSOpen(UCMFile * ucm)168 MBCSOpen(UCMFile *ucm) {
169     MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
170     if(mbcsData==NULL) {
171         printf("out of memory\n");
172         exit(U_MEMORY_ALLOCATION_ERROR);
173     }
174 
175     MBCSInit(mbcsData, ucm);
176     return &mbcsData->newConverter;
177 }
178 
179 static void
MBCSDestruct(MBCSData * mbcsData)180 MBCSDestruct(MBCSData *mbcsData) {
181     uprv_free(mbcsData->unicodeCodeUnits);
182     uprv_free(mbcsData->fromUBytes);
183 }
184 
185 U_CDECL_BEGIN
186 static void
MBCSClose(NewConverter * cnvData)187 MBCSClose(NewConverter *cnvData) {
188     MBCSData *mbcsData=(MBCSData *)cnvData;
189     if(mbcsData!=NULL) {
190         MBCSDestruct(mbcsData);
191         uprv_free(mbcsData);
192     }
193 }
194 U_CDECL_END
195 
196 static UBool
MBCSStartMappings(MBCSData * mbcsData)197 MBCSStartMappings(MBCSData *mbcsData) {
198     int32_t i, sum, maxCharLength,
199             stage2NullLength, stage2AllocLength,
200             stage3NullLength, stage3AllocLength;
201 
202     /* toUnicode */
203 
204     /* allocate the code unit array and prefill it with "unassigned" values */
205     sum=mbcsData->ucm->states.countToUCodeUnits;
206     if(VERBOSE) {
207         printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum);
208     }
209 
210     if(sum>0) {
211         mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
212         if(mbcsData->unicodeCodeUnits==NULL) {
213             fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
214                 (long)sum);
215             return false;
216         }
217         for(i=0; i<sum; ++i) {
218             mbcsData->unicodeCodeUnits[i]=0xfffe;
219         }
220     }
221 
222     /* fromUnicode */
223     maxCharLength=mbcsData->ucm->states.maxCharLength;
224 
225     /* allocate the codepage mappings and preset the first 16 characters to 0 */
226     if(maxCharLength==1) {
227         /* allocate 64k 16-bit results for single-byte codepages */
228         sum=0x20000;
229     } else {
230         /* allocate 1M * maxCharLength bytes for at most 1M mappings */
231         sum=0x100000*maxCharLength;
232     }
233     mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
234     if(mbcsData->fromUBytes==NULL) {
235         fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
236         return false;
237     }
238     uprv_memset(mbcsData->fromUBytes, 0, sum);
239 
240     /*
241      * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
242      * See ucnvmbcs.h for details.
243      *
244      * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
245      * assumes that the initial stage 2/3 blocks are the all-unassigned ones.
246      * Therefore, we refine the data structure while maintaining this placement
247      * even though it would be convenient to allocate the ASCII block at the
248      * beginning of stage 3, for example.
249      *
250      * UTF-8-friendly fromUnicode tries work from sorted tables and are built
251      * pre-compacted, overlapping adjacent stage 2/3 blocks.
252      * This is necessary because the block allocation and compaction changes
253      * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
254      * stage table uses direct indexes into stage 3, without a multiplier and
255      * thus with a smaller reach.
256      *
257      * Non-UTF-8-friendly fromUnicode tries work from unsorted tables
258      * (because implicit precision is used), and are compacted
259      * in post-processing.
260      *
261      * Preallocation for UTF-8-friendly fromUnicode tries:
262      *
263      * Stage 3:
264      * 64-entry all-unassigned first block followed by ASCII (128 entries).
265      *
266      * Stage 2:
267      * 64-entry all-unassigned first block followed by preallocated
268      * 64-block for ASCII.
269      */
270 
271     /* Preallocate ASCII as a linear 128-entry stage 3 block. */
272     stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
273     stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
274 
275     stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
276     stage3AllocLength=128; /* ASCII U+0000..U+007f */
277 
278     /* Initialize stage 1 for the preallocated blocks. */
279     sum=stage2NullLength;
280     for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
281         mbcsData->stage1[i]=sum;
282         sum+=MBCS_STAGE_2_BLOCK_SIZE;
283     }
284     mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
285 
286     /*
287      * Stage 2 indexes count 16-blocks in stage 3 as follows:
288      * SBCS: directly, indexes increment by 16
289      * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
290      * MBCS UTF-8: directly, indexes increment by 16
291      */
292     if(maxCharLength==1) {
293         sum=stage3NullLength;
294         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
295             mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
296             sum+=MBCS_STAGE_3_BLOCK_SIZE;
297         }
298     } else {
299         sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
300         for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
301             mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
302             sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
303         }
304     }
305 
306     sum=stage3NullLength;
307     for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
308         mbcsData->stageUTF8[i]=sum;
309         sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
310     }
311 
312     /*
313      * Allocate a 64-entry all-unassigned first stage 3 block,
314      * for UTF-8-friendly lookup with a trail byte,
315      * plus 128 entries for ASCII.
316      */
317     mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
318 
319     return true;
320 }
321 
322 /* return true for success */
323 static UBool
setFallback(MBCSData * mbcsData,uint32_t offset,UChar32 c)324 setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
325     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
326     if(i>=0) {
327         /* if there is already a fallback for this offset, then overwrite it */
328         mbcsData->toUFallbacks[i].codePoint=c;
329         return true;
330     } else {
331         /* if there is no fallback for this offset, then add one */
332         i=mbcsData->countToUFallbacks;
333         if(i>=MBCS_MAX_FALLBACK_COUNT) {
334             fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c);
335             return false;
336         } else {
337             mbcsData->toUFallbacks[i].offset=offset;
338             mbcsData->toUFallbacks[i].codePoint=c;
339             mbcsData->countToUFallbacks=i+1;
340             return true;
341         }
342     }
343 }
344 
345 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
346 static int32_t
removeFallback(MBCSData * mbcsData,uint32_t offset)347 removeFallback(MBCSData *mbcsData, uint32_t offset) {
348     int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
349     if(i>=0) {
350         _MBCSToUFallback *toUFallbacks;
351         int32_t limit, old;
352 
353         toUFallbacks=mbcsData->toUFallbacks;
354         limit=mbcsData->countToUFallbacks;
355         old=(int32_t)toUFallbacks[i].codePoint;
356 
357         /* copy the last fallback entry here to keep the list contiguous */
358         toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
359         toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
360         mbcsData->countToUFallbacks=limit-1;
361         return old;
362     } else {
363         return -1;
364     }
365 }
366 
367 /*
368  * isFallback is almost a boolean:
369  * 1 (true)  this is a fallback mapping
370  * 0 (false) this is a precise mapping
371  * -1        the precision of this mapping is not specified
372  */
373 static UBool
MBCSAddToUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)374 MBCSAddToUnicode(MBCSData *mbcsData,
375                  const uint8_t *bytes, int32_t length,
376                  UChar32 c,
377                  int8_t flag) {
378     char buffer[10];
379     uint32_t offset=0;
380     int32_t i=0, entry, old;
381     uint8_t state=0;
382 
383     if(mbcsData->ucm->states.countStates==0) {
384         fprintf(stderr, "error: there is no state information!\n");
385         return false;
386     }
387 
388     /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
389     if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
390         state=1;
391     }
392 
393     /*
394      * Walk down the state table like in conversion,
395      * much like getNextUChar().
396      * We assume that c<=0x10ffff.
397      */
398     for(i=0;;) {
399         entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
400         if(MBCS_ENTRY_IS_TRANSITION(entry)) {
401             if(i==length) {
402                 fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
403                     (short)state, printBytes(buffer, bytes, length), (int)c);
404                 return false;
405             }
406             state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
407             offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
408         } else {
409             if(i<length) {
410                 fprintf(stderr, "error: byte sequence too long by %d bytes, final state %u: 0x%s (U+%x)\n",
411                     (int)(length-i), state, printBytes(buffer, bytes, length), (int)c);
412                 return false;
413             }
414             switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
415             case MBCS_STATE_ILLEGAL:
416                 fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
417                     (int)c, printBytes(buffer, bytes, length));
418                 return false;
419             case MBCS_STATE_CHANGE_ONLY:
420                 fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
421                     (int)c, printBytes(buffer, bytes, length));
422                 return false;
423             case MBCS_STATE_UNASSIGNED:
424                 fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
425                     (int)c, printBytes(buffer, bytes, length));
426                 return false;
427             case MBCS_STATE_FALLBACK_DIRECT_16:
428             case MBCS_STATE_VALID_DIRECT_16:
429             case MBCS_STATE_FALLBACK_DIRECT_20:
430             case MBCS_STATE_VALID_DIRECT_20:
431                 if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
432                     /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
433                     if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
434                         old=MBCS_ENTRY_FINAL_VALUE(entry);
435                     } else {
436                         old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
437                     }
438                     if(flag>=0) {
439                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
440                             (int)c, printBytes(buffer, bytes, length), (int)old);
441                         return false;
442                     } else if(VERBOSE) {
443                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
444                             (int)c, printBytes(buffer, bytes, length), (int)old);
445                     }
446                     /*
447                      * Continue after the above warning
448                      * if the precision of the mapping is unspecified.
449                      */
450                 }
451                 /* reassign the correct action code */
452                 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
453 
454                 /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
455                 if(c<=0xffff) {
456                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
457                 } else {
458                     entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
459                 }
460                 mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
461                 break;
462             case MBCS_STATE_VALID_16:
463                 /* bits 26..16 are not used, 0 */
464                 /* bits 15..7 contain the final offset delta to one 16-bit code unit */
465                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
466                 /* check that this byte sequence is still unassigned */
467                 if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
468                     if(flag>=0) {
469                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
470                             (int)c, printBytes(buffer, bytes, length), (int)old);
471                         return false;
472                     } else if(VERBOSE) {
473                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
474                             (int)c, printBytes(buffer, bytes, length), (int)old);
475                     }
476                 }
477                 if(c>=0x10000) {
478                     fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
479                         (int)c, printBytes(buffer, bytes, length));
480                     return false;
481                 }
482                 if(flag>0) {
483                     /* assign only if there is no precise mapping */
484                     if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
485                         return setFallback(mbcsData, offset, c);
486                     }
487                 } else {
488                     mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
489                 }
490                 break;
491             case MBCS_STATE_VALID_16_PAIR:
492                 /* bits 26..16 are not used, 0 */
493                 /* bits 15..7 contain the final offset delta to two 16-bit code units */
494                 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
495                 /* check that this byte sequence is still unassigned */
496                 old=mbcsData->unicodeCodeUnits[offset];
497                 if(old<0xfffe) {
498                     int32_t real;
499                     if(old<0xd800) {
500                         real=old;
501                     } else if(old<=0xdfff) {
502                         real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
503                     } else /* old<=0xe001 */ {
504                         real=mbcsData->unicodeCodeUnits[offset+1];
505                     }
506                     if(flag>=0) {
507                         fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
508                             (int)c, printBytes(buffer, bytes, length), (int)real);
509                         return false;
510                     } else if(VERBOSE) {
511                         fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
512                             (int)c, printBytes(buffer, bytes, length), (int)real);
513                     }
514                 }
515                 if(flag>0) {
516                     /* assign only if there is no precise mapping */
517                     if(old<=0xdbff || old==0xe000) {
518                         /* do nothing */
519                     } else if(c<=0xffff) {
520                         /* set a BMP fallback code point as a pair with 0xe001 */
521                         mbcsData->unicodeCodeUnits[offset++]=0xe001;
522                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
523                     } else {
524                         /* set a fallback surrogate pair with two second surrogates */
525                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10));
526                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
527                     }
528                 } else {
529                     if(c<0xd800) {
530                         /* set a BMP code point */
531                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
532                     } else if(c<=0xffff) {
533                         /* set a BMP code point above 0xd800 as a pair with 0xe000 */
534                         mbcsData->unicodeCodeUnits[offset++]=0xe000;
535                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
536                     } else {
537                         /* set a surrogate pair */
538                         mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10));
539                         mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
540                     }
541                 }
542                 break;
543             default:
544                 /* reserved, must never occur */
545                 fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
546                     (int)entry, printBytes(buffer, bytes, length), (int)c);
547                 return false;
548             }
549 
550             return true;
551         }
552     }
553 }
554 
555 U_CDECL_BEGIN
556 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
557 static UBool
MBCSIsValid(NewConverter * cnvData,const uint8_t * bytes,int32_t length)558 MBCSIsValid(NewConverter *cnvData,
559             const uint8_t *bytes, int32_t length) {
560     MBCSData *mbcsData=(MBCSData *)cnvData;
561 
562     return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length));
563 }
564 U_CDECL_END
565 static UBool
MBCSSingleAddFromUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t,UChar32 c,int8_t flag)566 MBCSSingleAddFromUnicode(MBCSData *mbcsData,
567                          const uint8_t *bytes, int32_t /*length*/,
568                          UChar32 c,
569                          int8_t flag) {
570     uint16_t *stage3, *p;
571     uint32_t idx;
572     uint16_t old;
573     uint8_t b;
574 
575     uint32_t blockSize, newTop, i, nextOffset, newBlock, min;
576 
577     /* ignore |2 SUB mappings */
578     if(flag==2) {
579         return true;
580     }
581 
582     /*
583      * Walk down the triple-stage compact array ("trie") and
584      * allocate parts as necessary.
585      * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
586      * We assume that length<=maxCharLength and that c<=0x10ffff.
587      */
588     stage3=(uint16_t *)mbcsData->fromUBytes;
589     b=*bytes;
590 
591     /* inspect stage 1 */
592     idx=c>>MBCS_STAGE_1_SHIFT;
593     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
594         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
595     } else {
596         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
597     }
598     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
599         /* allocate another block in stage 2 */
600         newBlock=mbcsData->stage2Top;
601         if(mbcsData->utf8Friendly) {
602             min=newBlock-nextOffset; /* minimum block start with overlap */
603             while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
604                 --newBlock;
605             }
606         }
607         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
608 
609         if(newTop>MBCS_MAX_STAGE_2_TOP) {
610             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
611             return false;
612         }
613 
614         /*
615          * each stage 2 block contains 64 16-bit words:
616          * 6 code point bits 9..4 with 1 stage 3 index
617          */
618         mbcsData->stage1[idx]=(uint16_t)newBlock;
619         mbcsData->stage2Top=newTop;
620     }
621 
622     /* inspect stage 2 */
623     idx=mbcsData->stage1[idx]+nextOffset;
624     if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
625         /* allocate 64-entry blocks for UTF-8-friendly lookup */
626         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
627         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
628     } else {
629         blockSize=MBCS_STAGE_3_BLOCK_SIZE;
630         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
631     }
632     if(mbcsData->stage2Single[idx]==0) {
633         /* allocate another block in stage 3 */
634         newBlock=mbcsData->stage3Top;
635         if(mbcsData->utf8Friendly) {
636             min=newBlock-nextOffset; /* minimum block start with overlap */
637             while(min<newBlock && stage3[newBlock-1]==0) {
638                 --newBlock;
639             }
640         }
641         newTop=newBlock+blockSize;
642 
643         if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
644             fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
645             return false;
646         }
647         /* each block has 16 uint16_t entries */
648         i=idx;
649         while(newBlock<newTop) {
650             mbcsData->stage2Single[i++]=(uint16_t)newBlock;
651             newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
652         }
653         mbcsData->stage3Top=newTop; /* ==newBlock */
654     }
655 
656     /* write the codepage entry into stage 3 and get the previous entry */
657     p=stage3+mbcsData->stage2Single[idx]+nextOffset;
658     old=*p;
659     if(flag<=0) {
660         *p=(uint16_t)(0xf00|b);
661     } else if(IS_PRIVATE_USE(c)) {
662         *p=(uint16_t)(0xc00|b);
663     } else {
664         *p=(uint16_t)(0x800|b);
665     }
666 
667     /* check that this Unicode code point was still unassigned */
668     if(old>=0x100) {
669         if(flag>=0) {
670             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
671                 (int)c, b, old&0xff);
672             return false;
673         } else if(VERBOSE) {
674             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
675                 (int)c, b, old&0xff);
676         }
677         /* continue after the above warning if the precision of the mapping is unspecified */
678     }
679 
680     return true;
681 }
682 
683 static UBool
MBCSAddFromUnicode(MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)684 MBCSAddFromUnicode(MBCSData *mbcsData,
685                    const uint8_t *bytes, int32_t length,
686                    UChar32 c,
687                    int8_t flag) {
688     char buffer[10];
689     const uint8_t *pb;
690     uint8_t *stage3, *p;
691     uint32_t idx, b, old, stage3Index;
692     int32_t maxCharLength;
693 
694     uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
695 
696     maxCharLength=mbcsData->ucm->states.maxCharLength;
697 
698     if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
699         (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf))
700     ) {
701         fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
702             (int)c, printBytes(buffer, bytes, length));
703         return false;
704     }
705 
706     if(flag==1 && length==1 && *bytes==0) {
707         fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
708             (int)c, *bytes);
709         return false;
710     }
711 
712     /*
713      * Walk down the triple-stage compact array ("trie") and
714      * allocate parts as necessary.
715      * Note that the first stage 2 and 3 blocks are reserved for
716      * all-unassigned mappings.
717      * We assume that length<=maxCharLength and that c<=0x10ffff.
718      */
719     stage3=mbcsData->fromUBytes;
720 
721     /* inspect stage 1 */
722     idx=c>>MBCS_STAGE_1_SHIFT;
723     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
724         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
725     } else {
726         nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
727     }
728     if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
729         /* allocate another block in stage 2 */
730         newBlock=mbcsData->stage2Top;
731         if(mbcsData->utf8Friendly) {
732             min=newBlock-nextOffset; /* minimum block start with overlap */
733             while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
734                 --newBlock;
735             }
736         }
737         newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
738 
739         if(newTop>MBCS_MAX_STAGE_2_TOP) {
740             fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
741                 (int)c, printBytes(buffer, bytes, length));
742             return false;
743         }
744 
745         /*
746          * each stage 2 block contains 64 32-bit words:
747          * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
748          */
749         i=idx;
750         while(newBlock<newTop) {
751             mbcsData->stage1[i++]=(uint16_t)newBlock;
752             newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
753         }
754         mbcsData->stage2Top=newTop; /* ==newBlock */
755     }
756 
757     /* inspect stage 2 */
758     idx=mbcsData->stage1[idx]+nextOffset;
759     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
760         /* allocate 64-entry blocks for UTF-8-friendly lookup */
761         blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
762         nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
763     } else {
764         blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
765         nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
766     }
767     if(mbcsData->stage2[idx]==0) {
768         /* allocate another block in stage 3 */
769         newBlock=mbcsData->stage3Top;
770         if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
771             /*
772              * Overlap stage 3 blocks only in multiples of 16-entry blocks
773              * because of the indexing granularity in stage 2.
774              */
775             maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
776             for(overlap=0;
777                 overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
778                 ++overlap) {}
779 
780             overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
781             overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
782 
783             newBlock-=overlap;
784         }
785         newTop=newBlock+blockSize;
786 
787         if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) {
788             fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
789                 (int)c, printBytes(buffer, bytes, length));
790             return false;
791         }
792         /* each block has 16*maxCharLength bytes */
793         i=idx;
794         while(newBlock<newTop) {
795             mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
796             newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
797         }
798         mbcsData->stage3Top=newTop; /* ==newBlock */
799     }
800 
801     stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[idx];
802 
803     /* Build an alternate, UTF-8-friendly stage table as well. */
804     if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
805         /* Overflow for uint16_t entries in stageUTF8? */
806         if(stage3Index>0xffff) {
807             /*
808              * This can occur only if the mapping table is nearly perfectly filled and if
809              * utf8Max==0xffff.
810              * (There is no known charset like this. GB 18030 does not map
811              * surrogate code points and LMBCS does not map 256 PUA code points.)
812              *
813              * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
814              * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
815              * because we have a sorted table and there are at most MBCS_UTF8_LIMIT
816              * mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
817              * the initial all-unassigned block in stage3.
818              *
819              * Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
820              *
821              * (See svn revision 20866 of the markus/ucnvutf8 feature branch for
822              * code that causes MBCSAddTable() to rebuild the table not utf8Friendly
823              * in case of overflow. That code was not tested.)
824              */
825             mbcsData->utf8Max=0xfeff;
826         } else {
827             /*
828              * The stage 3 block has been assigned for the regular trie.
829              * Just copy its index into stageUTF8[], without the granularity.
830              */
831             mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
832         }
833     }
834 
835     /* write the codepage bytes into stage 3 and get the previous bytes */
836 
837     /* assemble the bytes into a single integer */
838     pb=bytes;
839     b=0;
840     switch(length) {
841     case 4:
842         b=*pb++;
843         U_FALLTHROUGH;
844     case 3:
845         b=(b<<8)|*pb++;
846         U_FALLTHROUGH;
847     case 2:
848         b=(b<<8)|*pb++;
849         U_FALLTHROUGH;
850     case 1:
851     default:
852         b=(b<<8)|*pb++;
853         break;
854     }
855 
856     old=0;
857     p=stage3+(stage3Index+nextOffset)*maxCharLength;
858     switch(maxCharLength) {
859     case 2:
860         old=*(uint16_t *)p;
861         *(uint16_t *)p=(uint16_t)b;
862         break;
863     case 3:
864         old=(uint32_t)*p<<16;
865         *p++=(uint8_t)(b>>16);
866         old|=(uint32_t)*p<<8;
867         *p++=(uint8_t)(b>>8);
868         old|=*p;
869         *p=(uint8_t)b;
870         break;
871     case 4:
872         old=*(uint32_t *)p;
873         *(uint32_t *)p=b;
874         break;
875     default:
876         /* will never occur */
877         break;
878     }
879 
880     /* check that this Unicode code point was still unassigned */
881     if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
882         if(flag>=0) {
883             fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
884                 (int)c, printBytes(buffer, bytes, length), (int)old);
885             return false;
886         } else if(VERBOSE) {
887             fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
888                 (int)c, printBytes(buffer, bytes, length), (int)old);
889         }
890         /* continue after the above warning if the precision of the mapping is
891            unspecified */
892     }
893     if(flag<=0) {
894         /* set the roundtrip flag */
895         mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
896     }
897 
898     return true;
899 }
900 
901 U_CFUNC UBool
MBCSOkForBaseFromUnicode(const MBCSData * mbcsData,const uint8_t * bytes,int32_t length,UChar32 c,int8_t flag)902 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
903                          const uint8_t *bytes, int32_t length,
904                          UChar32 c, int8_t flag) {
905     /*
906      * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under
907      * the following conditions:
908      *
909      * - a |2 SUB mapping for <subchar1> (no base table data structure for them)
910      * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry)
911      * - a multi-byte mapping with leading 0x00 bytes (no explicit length field)
912      *
913      * Some of these tests are redundant with ucm_mappingType().
914      */
915     if( (flag==2 && length==1) ||
916         (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */
917         (flag<=1 && length>1 && bytes[0]==0)
918     ) {
919         return false;
920     }
921 
922     /*
923      * Additional restrictions for UTF-8-friendly fromUnicode tables,
924      * for code points up to the maximum optimized one:
925      *
926      * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
927      * - any |1 fallback (no roundtrip flags in the optimized table)
928      */
929     if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
930         return false;
931     }
932 
933     /*
934      * If we omit the fromUnicode data, we can only store roundtrips there
935      * because only they are recoverable from the toUnicode data.
936      * Fallbacks must go into the extension table.
937      */
938     if(mbcsData->omitFromU && flag!=0) {
939         return false;
940     }
941 
942     /* All other mappings do fit into the base table. */
943     return true;
944 }
945 
946 U_CDECL_BEGIN
947 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
948 static UBool
MBCSAddTable(NewConverter * cnvData,UCMTable * table,UConverterStaticData * staticData)949 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
950     MBCSData *mbcsData;
951     UCMapping *m;
952     UChar32 c;
953     int32_t i, maxCharLength;
954     int8_t f;
955     UBool isOK, utf8Friendly;
956 
957     staticData->unicodeMask=table->unicodeMask;
958     if(staticData->unicodeMask==3) {
959         fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
960         return false;
961     }
962 
963     staticData->conversionType=UCNV_MBCS;
964 
965     mbcsData=(MBCSData *)cnvData;
966     maxCharLength=mbcsData->ucm->states.maxCharLength;
967 
968     /*
969      * Generation of UTF-8-friendly data requires
970      * a sorted table, which makeconv generates when explicit precision
971      * indicators are used.
972      */
973     mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
974     if(utf8Friendly) {
975         mbcsData->utf8Max=MBCS_UTF8_MAX;
976         if(SMALL && maxCharLength>1) {
977             mbcsData->omitFromU=true;
978         }
979     } else {
980         mbcsData->utf8Max=0;
981         if(SMALL && maxCharLength>1) {
982             fprintf(stderr,
983                 "makeconv warning: --small not available for .ucm files without |0 etc.\n");
984         }
985     }
986 
987     if(!MBCSStartMappings(mbcsData)) {
988         return false;
989     }
990 
991     staticData->hasFromUnicodeFallback=false;
992     staticData->hasToUnicodeFallback=false;
993 
994     isOK=true;
995 
996     m=table->mappings;
997     for(i=0; i<table->mappingsLength; ++m, ++i) {
998         c=m->u;
999         f=m->f;
1000 
1001         /*
1002          * Small optimization for --small .cnv files:
1003          *
1004          * If there are fromUnicode mappings above MBCS_UTF8_MAX,
1005          * then the file size will be smaller if we make utf8Max larger
1006          * because the size increase in stageUTF8 will be more than balanced by
1007          * how much less of stage2 needs to be stored.
1008          *
1009          * There is no point in doing this incrementally because stageUTF8
1010          * uses so much less space per block than stage2,
1011          * so we immediately increase utf8Max to 0xffff.
1012          *
1013          * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
1014          * sets it to that value when stageUTF8 overflows.
1015          */
1016         if( mbcsData->omitFromU && f<=1 &&
1017             mbcsData->utf8Max<c && c<=0xffff &&
1018             mbcsData->utf8Max<0xfeff
1019         ) {
1020             mbcsData->utf8Max=0xffff;
1021         }
1022 
1023         switch(f) {
1024         case -1:
1025             /* there was no precision/fallback indicator */
1026             /* fall through to set the mappings */
1027             U_FALLTHROUGH;
1028         case 0:
1029             /* set roundtrip mappings */
1030             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1031 
1032             if(maxCharLength==1) {
1033                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1034             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
1035                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1036             } else {
1037                 m->f|=MBCS_FROM_U_EXT_FLAG;
1038                 m->moveFlag=UCM_MOVE_TO_EXT;
1039             }
1040             break;
1041         case 1:
1042             /* set only a fallback mapping from Unicode to codepage */
1043             if(maxCharLength==1) {
1044                 staticData->hasFromUnicodeFallback=true;
1045                 isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1046             } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
1047                 staticData->hasFromUnicodeFallback=true;
1048                 isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1049             } else {
1050                 m->f|=MBCS_FROM_U_EXT_FLAG;
1051                 m->moveFlag=UCM_MOVE_TO_EXT;
1052             }
1053             break;
1054         case 2:
1055             /* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
1056             if(maxCharLength>1 && m->bLen==1) {
1057                 m->f|=MBCS_FROM_U_EXT_FLAG;
1058                 m->moveFlag=UCM_MOVE_TO_EXT;
1059             }
1060             break;
1061         case 3:
1062             /* set only a fallback mapping from codepage to Unicode */
1063             staticData->hasToUnicodeFallback=true;
1064             isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
1065             break;
1066         case 4:
1067             /* move "good one-way" mappings to the extension table */
1068             m->f|=MBCS_FROM_U_EXT_FLAG;
1069             m->moveFlag=UCM_MOVE_TO_EXT;
1070             break;
1071         default:
1072             /* will not occur because the parser checked it already */
1073             fprintf(stderr, "error: illegal fallback indicator %d\n", f);
1074             return false;
1075         }
1076     }
1077 
1078     MBCSPostprocess(mbcsData, staticData);
1079 
1080     return isOK;
1081 }
1082 U_CDECL_END
1083 static UBool
transformEUC(MBCSData * mbcsData)1084 transformEUC(MBCSData *mbcsData) {
1085     uint8_t *p8;
1086     uint32_t i, value, oldLength, old3Top;
1087     uint8_t b;
1088 
1089     oldLength=mbcsData->ucm->states.maxCharLength;
1090     if(oldLength<3) {
1091         return false;
1092     }
1093 
1094     old3Top=mbcsData->stage3Top;
1095 
1096     /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
1097 
1098     /* test if all first bytes are in {0, 0x8e, 0x8f} */
1099     p8=mbcsData->fromUBytes;
1100 
1101 #if !U_IS_BIG_ENDIAN
1102     if(oldLength==4) {
1103         p8+=3;
1104     }
1105 #endif
1106 
1107     for(i=0; i<old3Top; i+=oldLength) {
1108         b=p8[i];
1109         if(b!=0 && b!=0x8e && b!=0x8f) {
1110             /* some first byte does not fit the EUC pattern, nothing to be done */
1111             return false;
1112         }
1113     }
1114     /* restore p if it was modified above */
1115     p8=mbcsData->fromUBytes;
1116 
1117     /* modify outputType and adjust stage3Top */
1118     mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
1119     mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength;
1120 
1121     /*
1122      * EUC-encode all byte sequences;
1123      * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
1124      * p. 161 in chapter 4 "Encoding Methods"
1125      *
1126      * This also must reverse the byte order if the platform is little-endian!
1127      */
1128     if(oldLength==3) {
1129         uint16_t *q=(uint16_t *)p8;
1130         for(i=0; i<old3Top; i+=oldLength) {
1131             b=*p8;
1132             if(b==0) {
1133                 /* short sequences are stored directly */
1134                 /* code set 0 or 1 */
1135                 (*q++)=(uint16_t)((p8[1]<<8)|p8[2]);
1136             } else if(b==0x8e) {
1137                 /* code set 2 */
1138                 (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]);
1139             } else /* b==0x8f */ {
1140                 /* code set 3 */
1141                 (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f));
1142             }
1143             p8+=3;
1144         }
1145     } else /* oldLength==4 */ {
1146         uint8_t *q=p8;
1147         uint32_t *p32=(uint32_t *)p8;
1148         for(i=0; i<old3Top; i+=4) {
1149             value=(*p32++);
1150             if(value<=0xffffff) {
1151                 /* short sequences are stored directly */
1152                 /* code set 0 or 1 */
1153                 (*q++)=(uint8_t)(value>>16);
1154                 (*q++)=(uint8_t)(value>>8);
1155                 (*q++)=(uint8_t)value;
1156             } else if(value<=0x8effffff) {
1157                 /* code set 2 */
1158                 (*q++)=(uint8_t)((value>>16)&0x7f);
1159                 (*q++)=(uint8_t)(value>>8);
1160                 (*q++)=(uint8_t)value;
1161             } else /* first byte is 0x8f */ {
1162                 /* code set 3 */
1163                 (*q++)=(uint8_t)(value>>16);
1164                 (*q++)=(uint8_t)((value>>8)&0x7f);
1165                 (*q++)=(uint8_t)value;
1166             }
1167         }
1168     }
1169 
1170     return true;
1171 }
1172 
1173 /*
1174  * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
1175  * as possible. Overlapping is done on unassigned head and tail
1176  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1177  * Stage 1 indexes need to be adjusted accordingly.
1178  * This function is very similar to genprops/store.c/compactStage().
1179  */
1180 static void
singleCompactStage2(MBCSData * mbcsData)1181 singleCompactStage2(MBCSData *mbcsData) {
1182     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1183     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1184     uint16_t i, start, prevEnd, newStart;
1185 
1186     /* enter the all-unassigned first stage 2 block into the map */
1187     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1188 
1189     /* begin with the first block after the all-unassigned one */
1190     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1191     while(start<mbcsData->stage2Top) {
1192         prevEnd=(uint16_t)(newStart-1);
1193 
1194         /* find the size of the overlap */
1195         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
1196 
1197         if(i>0) {
1198             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1199 
1200             /* move the non-overlapping indexes to their new positions */
1201             start+=i;
1202             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1203                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1204             }
1205         } else if(newStart<start) {
1206             /* move the indexes to their new positions */
1207             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1208             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1209                 mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1210             }
1211         } else /* no overlap && newStart==start */ {
1212             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1213             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1214         }
1215     }
1216 
1217     /* adjust stage2Top */
1218     if(VERBOSE && newStart<mbcsData->stage2Top) {
1219         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1220                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1221                 (long)(mbcsData->stage2Top-newStart)*2);
1222     }
1223     mbcsData->stage2Top=newStart;
1224 
1225     /* now adjust stage 1 */
1226     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1227         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1228     }
1229 }
1230 
1231 /* Compact stage 3 for SBCS - same algorithm as above. */
1232 static void
singleCompactStage3(MBCSData * mbcsData)1233 singleCompactStage3(MBCSData *mbcsData) {
1234     uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;
1235 
1236     /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
1237     uint16_t map[0x1000];
1238     uint16_t i, start, prevEnd, newStart;
1239 
1240     /* enter the all-unassigned first stage 3 block into the map */
1241     map[0]=0;
1242 
1243     /* begin with the first block after the all-unassigned one */
1244     start=newStart=16;
1245     while(start<mbcsData->stage3Top) {
1246         prevEnd=(uint16_t)(newStart-1);
1247 
1248         /* find the size of the overlap */
1249         for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
1250 
1251         if(i>0) {
1252             map[start>>4]=(uint16_t)(newStart-i);
1253 
1254             /* move the non-overlapping indexes to their new positions */
1255             start+=i;
1256             for(i=(uint16_t)(16-i); i>0; --i) {
1257                 stage3[newStart++]=stage3[start++];
1258             }
1259         } else if(newStart<start) {
1260             /* move the indexes to their new positions */
1261             map[start>>4]=newStart;
1262             for(i=16; i>0; --i) {
1263                 stage3[newStart++]=stage3[start++];
1264             }
1265         } else /* no overlap && newStart==start */ {
1266             map[start>>4]=start;
1267             start=newStart+=16;
1268         }
1269     }
1270 
1271     /* adjust stage3Top */
1272     if(VERBOSE && newStart<mbcsData->stage3Top) {
1273         printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
1274                 (unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
1275                 (long)(mbcsData->stage3Top-newStart)*2);
1276     }
1277     mbcsData->stage3Top=newStart;
1278 
1279     /* now adjust stage 2 */
1280     for(i=0; i<mbcsData->stage2Top; ++i) {
1281         mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
1282     }
1283 }
1284 
1285 /*
1286  * Compact stage 2 by overlapping adjacent stage 2 blocks as far
1287  * as possible. Overlapping is done on unassigned head and tail
1288  * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1289  * Stage 1 indexes need to be adjusted accordingly.
1290  * This function is very similar to genprops/store.c/compactStage().
1291  */
1292 static void
compactStage2(MBCSData * mbcsData)1293 compactStage2(MBCSData *mbcsData) {
1294     /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1295     uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1296     uint16_t i, start, prevEnd, newStart;
1297 
1298     /* enter the all-unassigned first stage 2 block into the map */
1299     map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1300 
1301     /* begin with the first block after the all-unassigned one */
1302     start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1303     while(start<mbcsData->stage2Top) {
1304         prevEnd=(uint16_t)(newStart-1);
1305 
1306         /* find the size of the overlap */
1307         for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
1308 
1309         if(i>0) {
1310             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1311 
1312             /* move the non-overlapping indexes to their new positions */
1313             start+=i;
1314             for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1315                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1316             }
1317         } else if(newStart<start) {
1318             /* move the indexes to their new positions */
1319             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1320             for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1321                 mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1322             }
1323         } else /* no overlap && newStart==start */ {
1324             map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1325             start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1326         }
1327     }
1328 
1329     /* adjust stage2Top */
1330     if(VERBOSE && newStart<mbcsData->stage2Top) {
1331         printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1332                 (unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1333                 (long)(mbcsData->stage2Top-newStart)*4);
1334     }
1335     mbcsData->stage2Top=newStart;
1336 
1337     /* now adjust stage 1 */
1338     for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1339         mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1340     }
1341 }
1342 
1343 static void
MBCSPostprocess(MBCSData * mbcsData,const UConverterStaticData *)1344 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) {
1345     UCMStates *states;
1346     int32_t maxCharLength, stage3Width;
1347 
1348     states=&mbcsData->ucm->states;
1349     stage3Width=maxCharLength=states->maxCharLength;
1350 
1351     ucm_optimizeStates(states,
1352                        &mbcsData->unicodeCodeUnits,
1353                        mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
1354                        VERBOSE);
1355 
1356     /* try to compact the fromUnicode tables */
1357     if(transformEUC(mbcsData)) {
1358         --stage3Width;
1359     }
1360 
1361     /*
1362      * UTF-8-friendly tries are built precompacted, to cope with variable
1363      * stage 3 allocation block sizes.
1364      *
1365      * Tables without precision indicators cannot be built that way,
1366      * because if a block was overlapped with a previous one, then a smaller
1367      * code point for the same block would not fit.
1368      * Therefore, such tables are not marked UTF-8-friendly and must be
1369      * compacted after all mappings are entered.
1370      */
1371     if(!mbcsData->utf8Friendly) {
1372         if(maxCharLength==1) {
1373             singleCompactStage3(mbcsData);
1374             singleCompactStage2(mbcsData);
1375         } else {
1376             compactStage2(mbcsData);
1377         }
1378     }
1379 
1380     if(VERBOSE) {
1381         /*uint32_t c, i1, i2, i2Limit, i3;*/
1382 
1383         printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
1384                maxCharLength==1 ? "16" : "32",
1385                (unsigned long)mbcsData->stage2Top,
1386                (unsigned long)mbcsData->stage2Top);
1387         printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
1388                (int)stage3Width,
1389                (unsigned long)mbcsData->stage3Top/stage3Width,
1390                (unsigned long)mbcsData->stage3Top/stage3Width);
1391 #if 0
1392         c=0;
1393         for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
1394             i2=mbcsData->stage1[i1];
1395             if(i2==0) {
1396                 c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
1397                 continue;
1398             }
1399             for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
1400                 if(maxCharLength==1) {
1401                     i3=mbcsData->stage2Single[i2];
1402                 } else {
1403                     i3=(uint16_t)mbcsData->stage2[i2];
1404                 }
1405                 if(i3==0) {
1406                     c+=MBCS_STAGE_3_BLOCK_SIZE;
1407                     continue;
1408                 }
1409                 printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
1410                        (unsigned long)c,
1411                        (unsigned long)i1,
1412                        (unsigned long)i2,
1413                        (unsigned long)i3);
1414                 c+=MBCS_STAGE_3_BLOCK_SIZE;
1415             }
1416         }
1417 #endif
1418     }
1419 }
1420 
1421 U_CDECL_BEGIN
1422 static uint32_t
MBCSWrite(NewConverter * cnvData,const UConverterStaticData * staticData,UNewDataMemory * pData,int32_t tableType)1423 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
1424           UNewDataMemory *pData, int32_t tableType) {
1425     MBCSData *mbcsData=(MBCSData *)cnvData;
1426     uint32_t stage2Start, stage2Length;
1427     uint32_t top, stageUTF8Length=0;
1428     int32_t i, stage1Top;
1429     uint32_t headerLength;
1430 
1431     _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER;
1432 
1433     stage2Length=mbcsData->stage2Top;
1434     if(mbcsData->omitFromU) {
1435         /* find how much of stage2 can be omitted */
1436         int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
1437         uint32_t st2=0; /*initialized it to avoid compiler warnings */
1438 
1439         i=utf8Limit>>MBCS_STAGE_1_SHIFT;
1440         if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
1441             /* utf8Limit is in the middle of an existing stage 2 block */
1442             stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
1443         } else {
1444             /* find the last stage2 block with mappings before utf8Limit */
1445             while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
1446             /* stage2 up to the end of this block corresponds to stageUTF8 */
1447             stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
1448         }
1449         header.options|=MBCS_OPT_NO_FROM_U;
1450         header.fullStage2Length=stage2Length;
1451         stage2Length-=stage2Start;
1452         if(VERBOSE) {
1453             printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
1454                     (unsigned long)stage2Start,
1455                     (unsigned long)mbcsData->stage2Top,
1456                     (unsigned long)mbcsData->stage3Top);
1457             printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top);
1458         }
1459     } else {
1460         stage2Start=0;
1461     }
1462 
1463     if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1464         stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1465     } else {
1466         stage1Top=0x40; /* 0x40==64 */
1467     }
1468 
1469     /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1470     if(mbcsData->ucm->states.maxCharLength==1) {
1471         for(i=0; i<stage1Top; ++i) {
1472             mbcsData->stage1[i]+=(uint16_t)stage1Top;
1473         }
1474 
1475         /* stage2Top/Length have counted 16-bit results, now we need to count bytes */
1476         /* also round up to a multiple of 4 bytes */
1477         stage2Length=(stage2Length*2+1)&~1;
1478 
1479         /* stage3Top has counted 16-bit results, now we need to count bytes */
1480         mbcsData->stage3Top*=2;
1481 
1482         if(mbcsData->utf8Friendly) {
1483             header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
1484         }
1485     } else {
1486         for(i=0; i<stage1Top; ++i) {
1487             mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1488         }
1489 
1490         /* stage2Top/Length have counted 32-bit results, now we need to count bytes */
1491         stage2Length*=4;
1492         /* leave stage2Start counting 32-bit units */
1493 
1494         if(mbcsData->utf8Friendly) {
1495             stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
1496             header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
1497         }
1498 
1499         /* stage3Top has already counted bytes */
1500     }
1501 
1502     /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
1503     mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
1504 
1505     /* fill the header */
1506     if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
1507         header.version[0]=5;
1508         if(header.options&MBCS_OPT_NO_FROM_U) {
1509             headerLength=10;  /* include fullStage2Length */
1510         } else {
1511             headerLength=MBCS_HEADER_V5_MIN_LENGTH;  /* 9 */
1512         }
1513     } else {
1514         header.version[0]=4;
1515         headerLength=MBCS_HEADER_V4_LENGTH;  /* 8 */
1516     }
1517     header.version[1]=4;
1518     /* header.version[2] set above for utf8Friendly data */
1519 
1520     header.options|=(uint32_t)headerLength;
1521 
1522     header.countStates=mbcsData->ucm->states.countStates;
1523     header.countToUFallbacks=mbcsData->countToUFallbacks;
1524 
1525     header.offsetToUCodeUnits=
1526         headerLength*4+
1527         mbcsData->ucm->states.countStates*1024+
1528         mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
1529     header.offsetFromUTable=
1530         header.offsetToUCodeUnits+
1531         mbcsData->ucm->states.countToUCodeUnits*2;
1532     header.offsetFromUBytes=
1533         header.offsetFromUTable+
1534         stage1Top*2+
1535         stage2Length;
1536     header.fromUBytesLength=mbcsData->stage3Top;
1537 
1538     top=header.offsetFromUBytes+stageUTF8Length*2;
1539     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
1540         top+=header.fromUBytesLength;
1541     }
1542 
1543     header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
1544 
1545     if(tableType&TABLE_EXT) {
1546         if(top>0xffffff) {
1547             fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
1548             return 0;
1549         }
1550 
1551         header.flags|=top<<8;
1552     }
1553 
1554     /* write the MBCS data */
1555     udata_writeBlock(pData, &header, headerLength*4);
1556     udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
1557     udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
1558     udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
1559     udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
1560     if(mbcsData->ucm->states.maxCharLength==1) {
1561         udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
1562     } else {
1563         udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
1564     }
1565     if(!(header.options&MBCS_OPT_NO_FROM_U)) {
1566         udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
1567     }
1568 
1569     if(stageUTF8Length>0) {
1570         udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
1571     }
1572 
1573     /* return the number of bytes that should have been written */
1574     return top;
1575 }
1576 U_CDECL_END
1577