1 /* 2 ********************************************************************** 3 * Copyright (C) 1999-2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * ucnv_cnv.h: 8 * Definitions for converter implementations. 9 * 10 * Modification History: 11 * 12 * Date Name Description 13 * 05/09/00 helena Added implementation to handle fallback mappings. 14 * 06/29/2000 helena Major rewrite of the callback APIs. 15 */ 16 17 #ifndef UCNV_CNV_H 18 #define UCNV_CNV_H 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION 23 24 #include "unicode/ucnv.h" 25 #include "unicode/ucnv_err.h" 26 #include "unicode/uset.h" 27 #include "uset_imp.h" 28 29 U_CDECL_BEGIN 30 31 /* this is used in fromUnicode DBCS tables as an "unassigned" marker */ 32 #define missingCharMarker 0xFFFF 33 34 /* 35 * #define missingUCharMarker 0xfffe 36 * 37 * commented out because there are actually two values used in toUnicode tables: 38 * U+fffe "unassigned" 39 * U+ffff "illegal" 40 */ 41 42 /** Forward declaration, see ucnv_bld.h */ 43 struct UConverterSharedData; 44 typedef struct UConverterSharedData UConverterSharedData; 45 46 /* function types for UConverterImpl ---------------------------------------- */ 47 48 /* struct with arguments for UConverterLoad and ucnv_load() */ 49 typedef struct { 50 int32_t size; /* sizeof(UConverterLoadArgs) */ 51 int32_t nestedLoads; /* count nested ucnv_load() calls */ 52 int32_t reserved; /* reserved - for good alignment of the pointers */ 53 uint32_t options; 54 const char *pkg, *name; 55 } UConverterLoadArgs; 56 57 typedef void (*UConverterLoad) (UConverterSharedData *sharedData, 58 UConverterLoadArgs *pArgs, 59 const uint8_t *raw, UErrorCode *pErrorCode); 60 typedef void (*UConverterUnload) (UConverterSharedData *sharedData); 61 62 typedef void (*UConverterOpen) (UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *pErrorCode); 63 typedef void (*UConverterClose) (UConverter *cnv); 64 65 typedef enum UConverterResetChoice { 66 UCNV_RESET_BOTH, 67 UCNV_RESET_TO_UNICODE, 68 UCNV_RESET_FROM_UNICODE 69 } UConverterResetChoice; 70 71 typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice); 72 73 /* 74 * Converter implementation function(s) for ucnv_toUnicode(). 75 * If the toUnicodeWithOffsets function pointer is NULL, 76 * then the toUnicode function will be used and the offsets will be set to -1. 77 * 78 * Must maintain state across buffers. Use toUBytes[toULength] for partial input 79 * sequences; it will be checked in ucnv.c at the end of the input stream 80 * to detect truncated input. 81 * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND. 82 * 83 * The toUnicodeWithOffsets must write exactly as many offset values as target 84 * units. Write offset values of -1 for when the source index corresponding to 85 * the output unit is not known (e.g., the character started in an earlier buffer). 86 * The pArgs->offsets pointer need not be moved forward. 87 * 88 * At function return, either one of the following conditions must be true: 89 * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit 90 * - another error code with toUBytes[toULength] set to the offending input 91 * - no error, and the source is consumed: source==sourceLimit 92 * 93 * The ucnv.c code will handle the end of the input (reset) 94 * (reset, and truncation detection) and callbacks. 95 */ 96 typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *); 97 98 /* 99 * Same rules as for UConverterToUnicode. 100 * A lead surrogate is kept in fromUChar32 across buffers, and if an error 101 * occurs, then the offending input code point must be put into fromUChar32 102 * as well. 103 */ 104 typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *); 105 106 /* 107 * Converter implementation function for ucnv_convertEx(), for direct conversion 108 * between two charsets without pivoting through UTF-16. 109 * The rules are the same as for UConverterToUnicode and UConverterFromUnicode. 110 * In addition, 111 * - The toUnicode side must behave and keep state exactly like the 112 * UConverterToUnicode implementation for the same source charset. 113 * - A U_USING_DEFAULT_WARNING can be set to request to temporarily fall back 114 * to pivoting. When this function is called, the conversion framework makes 115 * sure that this warning is not set on input. 116 * - Continuing a partial match and flushing the toUnicode replay buffer 117 * are handled by pivoting, using the toUnicode and fromUnicode functions. 118 */ 119 typedef void (*UConverterConvert) (UConverterFromUnicodeArgs *pFromUArgs, 120 UConverterToUnicodeArgs *pToUArgs, 121 UErrorCode *pErrorCode); 122 123 /* 124 * Converter implementation function for ucnv_getNextUChar(). 125 * If the function pointer is NULL, then the toUnicode function will be used. 126 * 127 * Will be called at a character boundary (toULength==0). 128 * May return with 129 * - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input 130 * (the return value will be ignored) 131 * - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!) 132 * with toUBytes[toULength] set to the offending input 133 * (the return value will be ignored) 134 * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer, 135 * to indicate that the ucnv.c code shall call the toUnicode function instead 136 * - return a real code point result 137 * 138 * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed. 139 * 140 * The ucnv.c code will handle the end of the input (reset) 141 * (except for truncation detection!) and callbacks. 142 */ 143 typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *); 144 145 typedef void (*UConverterGetStarters)(const UConverter* converter, 146 UBool starters[256], 147 UErrorCode *pErrorCode); 148 149 /* If this function pointer is null or if the function returns null 150 * the name field in static data struct should be returned by 151 * ucnv_getName() API function 152 */ 153 typedef const char * (*UConverterGetName) (const UConverter *cnv); 154 155 /** 156 * Write the codepage substitution character. 157 * If this function is not set, then ucnv_cbFromUWriteSub() writes 158 * the substitution character from UConverter. 159 * For stateful converters, it is typically necessary to handle this 160 * specificially for the converter in order to properly maintain the state. 161 */ 162 typedef void (*UConverterWriteSub) (UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode); 163 164 /** 165 * For converter-specific safeClone processing 166 * If this function is not set, then ucnv_safeClone assumes that the converter has no private data that changes 167 * after the converter is done opening. 168 * If this function is set, then it is called just after a memcpy() of 169 * converter data to the new, empty converter, and is expected to set up 170 * the initial state of the converter. It is not expected to increment the 171 * reference counts of the standard data types such as the shared data. 172 */ 173 typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv, 174 void *stackBuffer, 175 int32_t *pBufferSize, 176 UErrorCode *status); 177 178 /** 179 * Filters for some ucnv_getUnicodeSet() implementation code. 180 */ 181 typedef enum UConverterSetFilter { 182 UCNV_SET_FILTER_NONE, 183 UCNV_SET_FILTER_DBCS_ONLY, 184 UCNV_SET_FILTER_2022_CN, 185 UCNV_SET_FILTER_SJIS, 186 UCNV_SET_FILTER_GR94DBCS, 187 UCNV_SET_FILTER_HZ, 188 UCNV_SET_FILTER_COUNT 189 } UConverterSetFilter; 190 191 /** 192 * Fills the set of Unicode code points that can be converted by an ICU converter. 193 * The API function ucnv_getUnicodeSet() clears the USet before calling 194 * the converter's getUnicodeSet() implementation; the converter should only 195 * add the appropriate code points to allow recursive use. 196 * For example, the ISO-2022-JP converter will call each subconverter's 197 * getUnicodeSet() implementation to consecutively add code points to 198 * the same USet, which will result in a union of the sets of all subconverters. 199 * 200 * For more documentation, see ucnv_getUnicodeSet() in ucnv.h. 201 */ 202 typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv, 203 const USetAdder *sa, 204 UConverterUnicodeSet which, 205 UErrorCode *pErrorCode); 206 207 UBool CONVERSION_U_SUCCESS (UErrorCode err); 208 209 /** 210 * UConverterImpl contains all the data and functions for a converter type. 211 * Its function pointers work much like a C++ vtable. 212 * Many converter types need to define only a subset of the functions; 213 * when a function pointer is NULL, then a default action will be performed. 214 * 215 * Every converter type must implement toUnicode, fromUnicode, and getNextUChar, 216 * otherwise the converter may crash. 217 * Every converter type that has variable-length codepage sequences should 218 * also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for 219 * correct offset handling. 220 * All other functions may or may not be implemented - it depends only on 221 * whether the converter type needs them. 222 * 223 * When open() fails, then close() will be called, if present. 224 */ 225 struct UConverterImpl { 226 UConverterType type; 227 228 UConverterLoad load; 229 UConverterUnload unload; 230 231 UConverterOpen open; 232 UConverterClose close; 233 UConverterReset reset; 234 235 UConverterToUnicode toUnicode; 236 UConverterToUnicode toUnicodeWithOffsets; 237 UConverterFromUnicode fromUnicode; 238 UConverterFromUnicode fromUnicodeWithOffsets; 239 UConverterGetNextUChar getNextUChar; 240 241 UConverterGetStarters getStarters; 242 UConverterGetName getName; 243 UConverterWriteSub writeSub; 244 UConverterSafeClone safeClone; 245 UConverterGetUnicodeSet getUnicodeSet; 246 247 UConverterConvert toUTF8; 248 UConverterConvert fromUTF8; 249 }; 250 251 extern const UConverterSharedData 252 _MBCSData, _Latin1Data, 253 _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData, 254 _ISO2022Data, 255 _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, 256 _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, 257 _HZData,_ISCIIData, _SCSUData, _ASCIIData, 258 _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData; 259 260 U_CDECL_END 261 262 /** Always use fallbacks from codepage to Unicode */ 263 #define TO_U_USE_FALLBACK(useFallback) TRUE 264 #define UCNV_TO_U_USE_FALLBACK(cnv) TRUE 265 266 /** Use fallbacks from Unicode to codepage when cnv->useFallback or for private-use code points */ 267 #define IS_PRIVATE_USE(c) ((uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000) 268 #define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || IS_PRIVATE_USE(c)) 269 #define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c) 270 271 /** 272 * Magic number for ucnv_getNextUChar(), returned by a 273 * getNextUChar() implementation to indicate to use the converter's toUnicode() 274 * instead of the native function. 275 * @internal 276 */ 277 #define UCNV_GET_NEXT_UCHAR_USE_TO_U -9 278 279 U_CFUNC void 280 ucnv_getCompleteUnicodeSet(const UConverter *cnv, 281 const USetAdder *sa, 282 UConverterUnicodeSet which, 283 UErrorCode *pErrorCode); 284 285 U_CFUNC void 286 ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv, 287 const USetAdder *sa, 288 UConverterUnicodeSet which, 289 UErrorCode *pErrorCode); 290 291 U_CFUNC void 292 ucnv_fromUWriteBytes(UConverter *cnv, 293 const char *bytes, int32_t length, 294 char **target, const char *targetLimit, 295 int32_t **offsets, 296 int32_t sourceIndex, 297 UErrorCode *pErrorCode); 298 U_CFUNC void 299 ucnv_toUWriteUChars(UConverter *cnv, 300 const UChar *uchars, int32_t length, 301 UChar **target, const UChar *targetLimit, 302 int32_t **offsets, 303 int32_t sourceIndex, 304 UErrorCode *pErrorCode); 305 306 U_CFUNC void 307 ucnv_toUWriteCodePoint(UConverter *cnv, 308 UChar32 c, 309 UChar **target, const UChar *targetLimit, 310 int32_t **offsets, 311 int32_t sourceIndex, 312 UErrorCode *pErrorCode); 313 314 #endif 315 316 #endif /* UCNV_CNV */ 317