1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2000-2008, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: genmbcs.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000jul10 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __GENMBCS_H__ 20 #define __GENMBCS_H__ 21 22 #include "makeconv.h" 23 24 enum { 25 /* 26 * TODO: Consider using ucnvmbcs.h constants. 27 * However, not all values need to be exactly the same, for example 28 * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX 29 * may be higher in makeconv than in the runtime code because that 30 * affects only a small number of .cnv files [if any] but all 31 * runtime UConverterSharedData objects. 32 */ 33 MBCS_STAGE_2_SHIFT=4, 34 MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ 35 MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ 36 MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */ 37 MBCS_STAGE_1_SHIFT=10, 38 MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 16 for one entry per 1k code points on the BMP */ 39 MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ 40 MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ 41 MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, 42 MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, 43 44 MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ 45 MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ 46 47 MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ 48 MBCS_STAGE_3_BLOCK_MASK=0xf, 49 MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ 50 51 MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ 52 MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ 53 MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */ 54 55 /* 56 * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. 57 * Possible values are 0x01ff..0xffff, in steps of 0x100. 58 * 59 * Unlike for MBCS, this constant only affects the stage 3 block allocation size; 60 * there is no additional stage 1/2 table stored in the .cnv file. 61 * The max value should be at least 0x7ff to cover 2-byte UTF-8. 62 * 0xfff also covers a number other small scripts which have legacy charsets 63 * (like Thai). 64 * Higher values up to 0x1fff are harmless and potentially useful because 65 * that covers small-script blocks which usually have either dense mappings 66 * or no mappings at all. 67 * Starting at U+2000, there are mostly symbols and format characters 68 * with a low density of SBCS mappings, which would result in more wasted 69 * stage 3 entries with the larger block size. 70 */ 71 SBCS_UTF8_MAX=0x1fff, 72 73 /* 74 * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. 75 * Possible values are 0x01ff..0xffff, in steps of 0x100. 76 * 77 * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table 78 * with extreme input data. The function checks for this overflow. 79 * 80 * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. 81 * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. 82 * Larger values cause slightly larger MBCS .cnv files. 83 */ 84 MBCS_UTF8_MAX=0xd7ff, 85 MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ 86 87 MBCS_UTF8_STAGE_SHIFT=6, 88 MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ 89 MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, 90 91 /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ 92 MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ 93 94 MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */ 95 MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */ 96 97 /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ 98 MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, 99 100 MBCS_MAX_FALLBACK_COUNT=8192 101 }; 102 103 U_CFUNC NewConverter * 104 MBCSOpen(UCMFile *ucm); 105 106 struct MBCSData; 107 typedef struct MBCSData MBCSData; 108 109 /* 110 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() 111 * for creating an extension-only file. 112 * Assume maxCharLength>1. 113 */ 114 U_CFUNC const MBCSData * 115 MBCSGetDummy(void); 116 117 /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */ 118 U_CFUNC UBool 119 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, 120 const uint8_t *bytes, int32_t length, 121 UChar32 c, int8_t flag); 122 123 U_CFUNC NewConverter * 124 CnvExtOpen(UCMFile *ucm); 125 126 #endif /* __GENMBCS_H__ */ 127