1 /* 2 ******************************************************************************* 3 * Copyright (C) 2003-2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucm.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2003jun20 12 * created by: Markus W. Scherer 13 * 14 * Definitions for the .ucm file parser and handler module ucm.c. 15 */ 16 17 #ifndef __UCM_H__ 18 #define __UCM_H__ 19 20 #include "unicode/utypes.h" 21 #include "ucnvmbcs.h" 22 #include "ucnv_ext.h" 23 #include "filestrm.h" 24 #include <stdio.h> 25 26 #if !UCONFIG_NO_CONVERSION 27 28 U_CDECL_BEGIN 29 30 /* constants for UCMapping.moveFlag */ 31 enum { 32 UCM_MOVE_TO_EXT=1, 33 UCM_REMOVE_MAPPING=2 34 }; 35 36 /* 37 * Per-mapping data structure 38 * 39 * u if uLen==1: Unicode code point 40 * else index to uLen code points 41 * b if bLen<=4: up to 4 bytes 42 * else index to bLen bytes 43 * uLen number of code points 44 * bLen number of words containing left-justified bytes 45 * bIsMultipleChars indicates that the bytes contain more than one sequence 46 * according to the state table 47 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 48 * same values as in the source file after | 49 */ 50 typedef struct UCMapping { 51 UChar32 u; 52 union { 53 uint32_t index; 54 uint8_t bytes[4]; 55 } b; 56 int8_t uLen, bLen, f, moveFlag; 57 } UCMapping; 58 59 /* constants for UCMTable.flagsType */ 60 enum { 61 UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 62 UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 63 UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 64 UCM_FLAGS_MIXED /* both implicit and explicit */ 65 }; 66 67 typedef struct UCMTable { 68 UCMapping *mappings; 69 int32_t mappingsCapacity, mappingsLength; 70 71 UChar32 *codePoints; 72 int32_t codePointsCapacity, codePointsLength; 73 74 uint8_t *bytes; 75 int32_t bytesCapacity, bytesLength; 76 77 /* index map for mapping by bytes first */ 78 int32_t *reverseMap; 79 80 uint8_t unicodeMask; 81 int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 82 UBool isSorted; 83 } UCMTable; 84 85 enum { 86 MBCS_STATE_FLAG_DIRECT=1, 87 MBCS_STATE_FLAG_SURROGATES, 88 89 MBCS_STATE_FLAG_READY=16 90 }; 91 92 typedef struct UCMStates { 93 int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 94 uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 95 stateOffsetSum[MBCS_MAX_STATE_COUNT]; 96 97 int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 98 int8_t conversionType, outputType; 99 } UCMStates; 100 101 typedef struct UCMFile { 102 UCMTable *base, *ext; 103 UCMStates states; 104 105 char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 106 } UCMFile; 107 108 /* simple accesses ---------------------------------------------------------- */ 109 110 #define UCM_GET_CODE_POINTS(t, m) \ 111 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 112 113 #define UCM_GET_BYTES(t, m) \ 114 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.index) 115 116 /* APIs --------------------------------------------------------------------- */ 117 118 U_CAPI UCMFile * U_EXPORT2 119 ucm_open(void); 120 121 U_CAPI void U_EXPORT2 122 ucm_close(UCMFile *ucm); 123 124 U_CAPI UBool U_EXPORT2 125 ucm_parseHeaderLine(UCMFile *ucm, 126 char *line, char **pKey, char **pValue); 127 128 /* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 129 U_CAPI int32_t U_EXPORT2 130 ucm_mappingType(UCMStates *baseStates, 131 UCMapping *m, 132 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 133 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 134 135 /* add a mapping to the base or extension table as appropriate */ 136 U_CAPI UBool U_EXPORT2 137 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 138 UCMapping *m, 139 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 140 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 141 142 U_CAPI UBool U_EXPORT2 143 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 144 145 146 U_CAPI UCMTable * U_EXPORT2 147 ucm_openTable(void); 148 149 U_CAPI void U_EXPORT2 150 ucm_closeTable(UCMTable *table); 151 152 U_CAPI void U_EXPORT2 153 ucm_resetTable(UCMTable *table); 154 155 U_CAPI void U_EXPORT2 156 ucm_sortTable(UCMTable *t); 157 158 /* 159 * Remove mappings with their move flag set from the base table 160 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 161 */ 162 U_CAPI void U_EXPORT2 163 ucm_moveMappings(UCMTable *base, UCMTable *ext); 164 165 /** 166 * Read a table from a .ucm file, from after the CHARMAP line to 167 * including the END CHARMAP line. 168 */ 169 U_CAPI void U_EXPORT2 170 ucm_readTable(UCMFile *ucm, FileStream* convFile, 171 UBool forBase, UCMStates *baseStates, 172 UErrorCode *pErrorCode); 173 174 /** 175 * Check the validity of mappings against a base table's states; 176 * necessary for extension-only tables that were read before their base tables. 177 */ 178 U_CAPI UBool U_EXPORT2 179 ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 180 181 /** 182 * Check a base table against an extension table. 183 * Set the moveTarget!=NULL if it is possible to move mappings from the base. 184 * This is the case where base and extension tables are parsed from a single file 185 * (moveTarget==ext) 186 * or when delta file mappings are subtracted from a base table. 187 * 188 * When a base table cannot be modified because a delta file is parsed in makeconv, 189 * then set moveTarget=NULL. 190 * 191 * if(intersectBase) then mappings that exist in the base table but not in 192 * the extension table are moved to moveTarget instead of showing an error. 193 * 194 * Special mode: 195 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 196 * not moved out of the base unless their Unicode input requires it. 197 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 198 * 199 * For both tables in the same file, the extension table is automatically 200 * built. 201 * For separate files, the extension file can use a complete mapping table (.ucm file), 202 * so that common mappings need not be stripped out manually. 203 * 204 * 205 * Sort both tables, and then for each mapping direction: 206 * 207 * If intersectBase is TRUE and the base table contains a mapping 208 * that does not exist in the extension table, then this mapping is moved 209 * to moveTarget. 210 * 211 * - otherwise - 212 * 213 * If the base table contains a mapping for which the input sequence is 214 * the same as the extension input, then 215 * - if the output is the same: remove the extension mapping 216 * - else: error 217 * 218 * If the base table contains a mapping for which the input sequence is 219 * a prefix of the extension input, then 220 * - if moveTarget!=NULL: move the base mapping to the moveTarget table 221 * - else: error 222 * 223 * @return FALSE in case of an irreparable error 224 */ 225 U_CAPI UBool U_EXPORT2 226 ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 227 UCMTable *moveTarget, UBool intersectBase); 228 229 U_CAPI void U_EXPORT2 230 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 231 232 U_CAPI void U_EXPORT2 233 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 234 235 236 U_CAPI void U_EXPORT2 237 ucm_addState(UCMStates *states, const char *s); 238 239 U_CAPI void U_EXPORT2 240 ucm_processStates(UCMStates *states); 241 242 U_CAPI int32_t U_EXPORT2 243 ucm_countChars(UCMStates *states, 244 const uint8_t *bytes, int32_t length); 245 246 247 U_CAPI int8_t U_EXPORT2 248 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 249 250 U_CAPI UBool U_EXPORT2 251 ucm_parseMappingLine(UCMapping *m, 252 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 253 uint8_t bytes[UCNV_EXT_MAX_BYTES], 254 const char *line); 255 256 U_CAPI void U_EXPORT2 257 ucm_addMapping(UCMTable *table, 258 UCMapping *m, 259 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 260 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 261 262 /* very makeconv-specific functions ----------------------------------------- */ 263 264 /* finalize and optimize states after the toUnicode mappings are processed */ 265 U_CAPI void U_EXPORT2 266 ucm_optimizeStates(UCMStates *states, 267 uint16_t **pUnicodeCodeUnits, 268 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 269 UBool verbose); 270 271 /* moved here because it is used inside ucmstate.c */ 272 U_CAPI int32_t U_EXPORT2 273 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 274 uint32_t offset); 275 276 /* very rptp2ucm-specific functions ----------------------------------------- */ 277 278 /* 279 * Input: Separate tables with mappings from/to Unicode, 280 * subchar and subchar1 (0 if none). 281 * All mappings must have flag 0. 282 * 283 * Output: fromUTable will contain the union of mappings with the correct 284 * precision flags, and be sorted. 285 */ 286 U_CAPI void U_EXPORT2 287 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 288 const uint8_t *subchar, int32_t subcharLength, 289 uint8_t subchar1); 290 291 U_CAPI UBool U_EXPORT2 292 ucm_separateMappings(UCMFile *ucm, UBool isSISO); 293 294 U_CDECL_END 295 296 #endif 297 298 #endif 299 300