1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucm.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jun20 14 * created by: Markus W. Scherer 15 * 16 * Definitions for the .ucm file parser and handler module ucm.c. 17 */ 18 19 #ifndef __UCM_H__ 20 #define __UCM_H__ 21 22 #include "unicode/utypes.h" 23 #include "ucnvmbcs.h" 24 #include "ucnv_ext.h" 25 #include "filestrm.h" 26 #include <stdio.h> 27 28 #if !UCONFIG_NO_CONVERSION 29 30 U_CDECL_BEGIN 31 32 /* constants for UCMapping.moveFlag */ 33 enum { 34 UCM_MOVE_TO_EXT=1, 35 UCM_REMOVE_MAPPING=2 36 }; 37 38 /* 39 * Per-mapping data structure 40 * 41 * u if uLen==1: Unicode code point 42 * else index to uLen code points 43 * b if bLen<=4: up to 4 bytes 44 * else index to bLen bytes 45 * uLen number of code points 46 * bLen number of words containing left-justified bytes 47 * bIsMultipleChars indicates that the bytes contain more than one sequence 48 * according to the state table 49 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 50 * or "good one-way" mapping (4). 51 * Same values as in the source file after | 52 */ 53 typedef struct UCMapping { 54 UChar32 u; 55 union { 56 uint32_t idx; 57 uint8_t bytes[4]; 58 } b; 59 int8_t uLen, bLen, f, moveFlag; 60 } UCMapping; 61 62 /* constants for UCMTable.flagsType */ 63 enum { 64 UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 65 UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 66 UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 67 UCM_FLAGS_MIXED /* both implicit and explicit */ 68 }; 69 70 typedef struct UCMTable { 71 UCMapping *mappings; 72 int32_t mappingsCapacity, mappingsLength; 73 74 UChar32 *codePoints; 75 int32_t codePointsCapacity, codePointsLength; 76 77 uint8_t *bytes; 78 int32_t bytesCapacity, bytesLength; 79 80 /* index map for mapping by bytes first */ 81 int32_t *reverseMap; 82 83 uint8_t unicodeMask; 84 int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 85 UBool isSorted; 86 } UCMTable; 87 88 enum { 89 MBCS_STATE_FLAG_DIRECT=1, 90 MBCS_STATE_FLAG_SURROGATES, 91 92 MBCS_STATE_FLAG_READY=16 93 }; 94 95 typedef struct UCMStates { 96 int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 97 uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 98 stateOffsetSum[MBCS_MAX_STATE_COUNT]; 99 100 int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 101 int8_t conversionType, outputType; 102 } UCMStates; 103 104 typedef struct UCMFile { 105 UCMTable *base, *ext; 106 UCMStates states; 107 108 char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 109 } UCMFile; 110 111 /* simple accesses ---------------------------------------------------------- */ 112 113 #define UCM_GET_CODE_POINTS(t, m) \ 114 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 115 116 #define UCM_GET_BYTES(t, m) \ 117 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) 118 119 /* APIs --------------------------------------------------------------------- */ 120 121 U_CAPI UCMFile * U_EXPORT2 122 ucm_open(void); 123 124 U_CAPI void U_EXPORT2 125 ucm_close(UCMFile *ucm); 126 127 U_CAPI UBool U_EXPORT2 128 ucm_parseHeaderLine(UCMFile *ucm, 129 char *line, char **pKey, char **pValue); 130 131 /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 132 U_CAPI int32_t U_EXPORT2 133 ucm_mappingType(UCMStates *baseStates, 134 UCMapping *m, 135 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 136 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 137 138 /* add a mapping to the base or extension table as appropriate */ 139 U_CAPI UBool U_EXPORT2 140 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 141 UCMapping *m, 142 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 143 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 144 145 U_CAPI UBool U_EXPORT2 146 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 147 148 149 U_CAPI UCMTable * U_EXPORT2 150 ucm_openTable(void); 151 152 U_CAPI void U_EXPORT2 153 ucm_closeTable(UCMTable *table); 154 155 U_CAPI void U_EXPORT2 156 ucm_resetTable(UCMTable *table); 157 158 U_CAPI void U_EXPORT2 159 ucm_sortTable(UCMTable *t); 160 161 /* 162 * Remove mappings with their move flag set from the base table 163 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 164 */ 165 U_CAPI void U_EXPORT2 166 ucm_moveMappings(UCMTable *base, UCMTable *ext); 167 168 /** 169 * Read a table from a .ucm file, from after the CHARMAP line to 170 * including the END CHARMAP line. 171 */ 172 U_CAPI void U_EXPORT2 173 ucm_readTable(UCMFile *ucm, FileStream* convFile, 174 UBool forBase, UCMStates *baseStates, 175 UErrorCode *pErrorCode); 176 177 /** 178 * Check the validity of mappings against a base table's states; 179 * necessary for extension-only tables that were read before their base tables. 180 */ 181 U_CAPI UBool U_EXPORT2 182 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 183 184 /** 185 * Check a base table against an extension table. 186 * Set the moveTarget!=NULL if it is possible to move mappings from the base. 187 * This is the case where base and extension tables are parsed from a single file 188 * (moveTarget==ext) 189 * or when delta file mappings are subtracted from a base table. 190 * 191 * When a base table cannot be modified because a delta file is parsed in makeconv, 192 * then set moveTarget=NULL. 193 * 194 * if(intersectBase) then mappings that exist in the base table but not in 195 * the extension table are moved to moveTarget instead of showing an error. 196 * 197 * Special mode: 198 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 199 * not moved out of the base unless their Unicode input requires it. 200 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 201 * 202 * For both tables in the same file, the extension table is automatically 203 * built. 204 * For separate files, the extension file can use a complete mapping table (.ucm file), 205 * so that common mappings need not be stripped out manually. 206 * 207 * 208 * Sort both tables, and then for each mapping direction: 209 * 210 * If intersectBase is TRUE and the base table contains a mapping 211 * that does not exist in the extension table, then this mapping is moved 212 * to moveTarget. 213 * 214 * - otherwise - 215 * 216 * If the base table contains a mapping for which the input sequence is 217 * the same as the extension input, then 218 * - if the output is the same: remove the extension mapping 219 * - else: error 220 * 221 * If the base table contains a mapping for which the input sequence is 222 * a prefix of the extension input, then 223 * - if moveTarget!=NULL: move the base mapping to the moveTarget table 224 * - else: error 225 * 226 * @return FALSE in case of an irreparable error 227 */ 228 U_CAPI UBool U_EXPORT2 229 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 230 UCMTable *moveTarget, UBool intersectBase); 231 232 U_CAPI void U_EXPORT2 233 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 234 235 U_CAPI void U_EXPORT2 236 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 237 238 239 U_CAPI void U_EXPORT2 240 ucm_addState(UCMStates *states, const char *s); 241 242 U_CAPI void U_EXPORT2 243 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); 244 245 U_CAPI int32_t U_EXPORT2 246 ucm_countChars(UCMStates *states, 247 const uint8_t *bytes, int32_t length); 248 249 250 U_CAPI int8_t U_EXPORT2 251 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 252 253 U_CAPI UBool U_EXPORT2 254 ucm_parseMappingLine(UCMapping *m, 255 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 256 uint8_t bytes[UCNV_EXT_MAX_BYTES], 257 const char *line); 258 259 U_CAPI void U_EXPORT2 260 ucm_addMapping(UCMTable *table, 261 UCMapping *m, 262 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 263 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 264 265 /* very makeconv-specific functions ----------------------------------------- */ 266 267 /* finalize and optimize states after the toUnicode mappings are processed */ 268 U_CAPI void U_EXPORT2 269 ucm_optimizeStates(UCMStates *states, 270 uint16_t **pUnicodeCodeUnits, 271 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 272 UBool verbose); 273 274 /* moved here because it is used inside ucmstate.c */ 275 U_CAPI int32_t U_EXPORT2 276 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 277 uint32_t offset); 278 279 /* very rptp2ucm-specific functions ----------------------------------------- */ 280 281 /* 282 * Input: Separate tables with mappings from/to Unicode, 283 * subchar and subchar1 (0 if none). 284 * All mappings must have flag 0. 285 * 286 * Output: fromUTable will contain the union of mappings with the correct 287 * precision flags, and be sorted. 288 */ 289 U_CAPI void U_EXPORT2 290 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 291 const uint8_t *subchar, int32_t subcharLength, 292 uint8_t subchar1); 293 294 U_CAPI UBool U_EXPORT2 295 ucm_separateMappings(UCMFile *ucm, UBool isSISO); 296 297 U_CDECL_END 298 299 #endif 300 301 #endif 302 303