1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2009, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * 9 * 10 * ucnv_err.h: 11 */ 12 13 /** 14 * \file 15 * \brief C UConverter predefined error callbacks 16 * 17 * <h2>Error Behaviour Functions</h2> 18 * Defines some error behaviour functions called by ucnv_{from,to}Unicode 19 * These are provided as part of ICU and many are stable, but they 20 * can also be considered only as an example of what can be done with 21 * callbacks. You may of course write your own. 22 * 23 * If you want to write your own, you may also find the functions from 24 * ucnv_cb.h useful when writing your own callbacks. 25 * 26 * These functions, although public, should NEVER be called directly. 27 * They should be used as parameters to the ucnv_setFromUCallback 28 * and ucnv_setToUCallback functions, to set the behaviour of a converter 29 * when it encounters ILLEGAL/UNMAPPED/INVALID sequences. 30 * 31 * usage example: 'STOP' doesn't need any context, but newContext 32 * could be set to something other than 'NULL' if needed. The available 33 * contexts in this header can modify the default behavior of the callback. 34 * 35 * \code 36 * UErrorCode err = U_ZERO_ERROR; 37 * UConverter *myConverter = ucnv_open("ibm-949", &err); 38 * const void *oldContext; 39 * UConverterFromUCallback oldAction; 40 * 41 * 42 * if (U_SUCCESS(err)) 43 * { 44 * ucnv_setFromUCallBack(myConverter, 45 * UCNV_FROM_U_CALLBACK_STOP, 46 * NULL, 47 * &oldAction, 48 * &oldContext, 49 * &status); 50 * } 51 * \endcode 52 * 53 * The code above tells "myConverter" to stop when it encounters an 54 * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from 55 * Unicode -> Codepage. The behavior from Codepage to Unicode is not changed, 56 * and ucnv_setToUCallBack would need to be called in order to change 57 * that behavior too. 58 * 59 * Here is an example with a context: 60 * 61 * \code 62 * UErrorCode err = U_ZERO_ERROR; 63 * UConverter *myConverter = ucnv_open("ibm-949", &err); 64 * const void *oldContext; 65 * UConverterFromUCallback oldAction; 66 * 67 * 68 * if (U_SUCCESS(err)) 69 * { 70 * ucnv_setToUCallBack(myConverter, 71 * UCNV_TO_U_CALLBACK_SUBSTITUTE, 72 * UCNV_SUB_STOP_ON_ILLEGAL, 73 * &oldAction, 74 * &oldContext, 75 * &status); 76 * } 77 * \endcode 78 * 79 * The code above tells "myConverter" to stop when it encounters an 80 * ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from 81 * Codepage -> Unicode. Any unmapped and legal characters will be 82 * substituted to be the default substitution character. 83 */ 84 85 #ifndef UCNV_ERR_H 86 #define UCNV_ERR_H 87 88 #include "unicode/utypes.h" 89 90 #if !UCONFIG_NO_CONVERSION 91 92 /** Forward declaring the UConverter structure. @stable ICU 2.0 */ 93 struct UConverter; 94 95 /** @stable ICU 2.0 */ 96 typedef struct UConverter UConverter; 97 98 /** 99 * FROM_U, TO_U context options for sub callback 100 * @stable ICU 2.0 101 */ 102 #define UCNV_SUB_STOP_ON_ILLEGAL "i" 103 104 /** 105 * FROM_U, TO_U context options for skip callback 106 * @stable ICU 2.0 107 */ 108 #define UCNV_SKIP_STOP_ON_ILLEGAL "i" 109 110 /** 111 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) 112 * @stable ICU 2.0 113 */ 114 #define UCNV_ESCAPE_ICU NULL 115 /** 116 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) 117 * @stable ICU 2.0 118 */ 119 #define UCNV_ESCAPE_JAVA "J" 120 /** 121 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) 122 * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX) 123 * @stable ICU 2.0 124 */ 125 #define UCNV_ESCAPE_C "C" 126 /** 127 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 128 * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 129 * @stable ICU 2.0 130 */ 131 #define UCNV_ESCAPE_XML_DEC "D" 132 /** 133 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 134 * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 135 * @stable ICU 2.0 136 */ 137 #define UCNV_ESCAPE_XML_HEX "X" 138 /** 139 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) 140 * @stable ICU 2.0 141 */ 142 #define UCNV_ESCAPE_UNICODE "U" 143 144 /** 145 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is, 146 * a backslash, 1..6 hex digits, and a space) 147 * @stable ICU 4.0 148 */ 149 #define UCNV_ESCAPE_CSS2 "S" 150 151 /** 152 * The process condition code to be used with the callbacks. 153 * Codes which are greater than UCNV_IRREGULAR should be 154 * passed on to any chained callbacks. 155 * @stable ICU 2.0 156 */ 157 typedef enum { 158 UCNV_UNASSIGNED = 0, /**< The code point is unassigned. 159 The error code U_INVALID_CHAR_FOUND will be set. */ 160 UCNV_ILLEGAL = 1, /**< The code point is illegal. For example, 161 \\x81\\x2E is illegal in SJIS because \\x2E 162 is not a valid trail byte for the \\x81 163 lead byte. 164 Also, starting with Unicode 3.0.1, non-shortest byte sequences 165 in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061) 166 are also illegal, not just irregular. 167 The error code U_ILLEGAL_CHAR_FOUND will be set. */ 168 UCNV_IRREGULAR = 2, /**< The codepoint is not a regular sequence in 169 the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF 170 are irregular UTF-8 byte sequences for single surrogate 171 code points. 172 The error code U_INVALID_CHAR_FOUND will be set. */ 173 UCNV_RESET = 3, /**< The callback is called with this reason when a 174 'reset' has occured. Callback should reset all 175 state. */ 176 UCNV_CLOSE = 4, /**< Called when the converter is closed. The 177 callback should release any allocated memory.*/ 178 UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the 179 converter. the pointer available as the 180 'context' is an alias to the original converters' 181 context pointer. If the context must be owned 182 by the new converter, the callback must clone 183 the data and call ucnv_setFromUCallback 184 (or setToUCallback) with the correct pointer. 185 @stable ICU 2.2 186 */ 187 } UConverterCallbackReason; 188 189 190 /** 191 * The structure for the fromUnicode callback function parameter. 192 * @stable ICU 2.0 193 */ 194 typedef struct { 195 uint16_t size; /**< The size of this struct. @stable ICU 2.0 */ 196 UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ 197 UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ 198 const UChar *source; /**< Pointer to the source source buffer. @stable ICU 2.0 */ 199 const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ 200 char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ 201 const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ 202 int32_t *offsets; /**< Pointer to the buffer that recieves the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ 203 } UConverterFromUnicodeArgs; 204 205 206 /** 207 * The structure for the toUnicode callback function parameter. 208 * @stable ICU 2.0 209 */ 210 typedef struct { 211 uint16_t size; /**< The size of this struct @stable ICU 2.0 */ 212 UBool flush; /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0 */ 213 UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */ 214 const char *source; /**< Pointer to the source source buffer. @stable ICU 2.0 */ 215 const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */ 216 UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */ 217 const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */ 218 int32_t *offsets; /**< Pointer to the buffer that recieves the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */ 219 } UConverterToUnicodeArgs; 220 221 222 /** 223 * DO NOT CALL THIS FUNCTION DIRECTLY! 224 * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE, 225 * returning the error code back to the caller immediately. 226 * 227 * @param context Pointer to the callback's private data 228 * @param fromUArgs Information about the conversion in progress 229 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence 230 * @param length Size (in bytes) of the concerned codepage sequence 231 * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. 232 * @param reason Defines the reason the callback was invoked 233 * @param err This should always be set to a failure status prior to calling. 234 * @stable ICU 2.0 235 */ 236 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP ( 237 const void *context, 238 UConverterFromUnicodeArgs *fromUArgs, 239 const UChar* codeUnits, 240 int32_t length, 241 UChar32 codePoint, 242 UConverterCallbackReason reason, 243 UErrorCode * err); 244 245 246 247 /** 248 * DO NOT CALL THIS FUNCTION DIRECTLY! 249 * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE, 250 * returning the error code back to the caller immediately. 251 * 252 * @param context Pointer to the callback's private data 253 * @param toUArgs Information about the conversion in progress 254 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence 255 * @param length Size (in bytes) of the concerned codepage sequence 256 * @param reason Defines the reason the callback was invoked 257 * @param err This should always be set to a failure status prior to calling. 258 * @stable ICU 2.0 259 */ 260 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP ( 261 const void *context, 262 UConverterToUnicodeArgs *toUArgs, 263 const char* codeUnits, 264 int32_t length, 265 UConverterCallbackReason reason, 266 UErrorCode * err); 267 268 /** 269 * DO NOT CALL THIS FUNCTION DIRECTLY! 270 * This From Unicode callback skips any ILLEGAL_SEQUENCE, or 271 * skips only UNASSINGED_SEQUENCE depending on the context parameter 272 * simply ignoring those characters. 273 * 274 * @param context The function currently recognizes the callback options: 275 * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, 276 * returning the error code back to the caller immediately. 277 * NULL: Skips any ILLEGAL_SEQUENCE 278 * @param fromUArgs Information about the conversion in progress 279 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence 280 * @param length Size (in bytes) of the concerned codepage sequence 281 * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. 282 * @param reason Defines the reason the callback was invoked 283 * @param err Return value will be set to success if the callback was handled, 284 * otherwise this value will be set to a failure status. 285 * @stable ICU 2.0 286 */ 287 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP ( 288 const void *context, 289 UConverterFromUnicodeArgs *fromUArgs, 290 const UChar* codeUnits, 291 int32_t length, 292 UChar32 codePoint, 293 UConverterCallbackReason reason, 294 UErrorCode * err); 295 296 /** 297 * DO NOT CALL THIS FUNCTION DIRECTLY! 298 * This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or 299 * UNASSIGNED_SEQUENCE depending on context parameter, with the 300 * current substitution string for the converter. This is the default 301 * callback. 302 * 303 * @param context The function currently recognizes the callback options: 304 * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, 305 * returning the error code back to the caller immediately. 306 * NULL: Substitutes any ILLEGAL_SEQUENCE 307 * @param fromUArgs Information about the conversion in progress 308 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence 309 * @param length Size (in bytes) of the concerned codepage sequence 310 * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. 311 * @param reason Defines the reason the callback was invoked 312 * @param err Return value will be set to success if the callback was handled, 313 * otherwise this value will be set to a failure status. 314 * @see ucnv_setSubstChars 315 * @stable ICU 2.0 316 */ 317 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( 318 const void *context, 319 UConverterFromUnicodeArgs *fromUArgs, 320 const UChar* codeUnits, 321 int32_t length, 322 UChar32 codePoint, 323 UConverterCallbackReason reason, 324 UErrorCode * err); 325 326 /** 327 * DO NOT CALL THIS FUNCTION DIRECTLY! 328 * This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the 329 * hexadecimal representation of the illegal codepoints 330 * 331 * @param context The function currently recognizes the callback options: 332 * <ul> 333 * <li>UCNV_ESCAPE_ICU: Substitues the ILLEGAL SEQUENCE with the hexadecimal 334 * representation in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE"). 335 * In the Event the converter doesn't support the characters {%,U}[A-F][0-9], 336 * it will substitute the illegal sequence with the substitution characters. 337 * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as 338 * %UD84D%UDC56</li> 339 * <li>UCNV_ESCAPE_JAVA: Substitues the ILLEGAL SEQUENCE with the hexadecimal 340 * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). 341 * In the Event the converter doesn't support the characters {\,u}[A-F][0-9], 342 * it will substitute the illegal sequence with the substitution characters. 343 * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as 344 * \\uD84D\\uDC56</li> 345 * <li>UCNV_ESCAPE_C: Substitues the ILLEGAL SEQUENCE with the hexadecimal 346 * representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE"). 347 * In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9], 348 * it will substitute the illegal sequence with the substitution characters. 349 * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as 350 * \\U00023456</li> 351 * <li>UCNV_ESCAPE_XML_DEC: Substitues the ILLEGAL SEQUENCE with the decimal 352 * representation in the format \htmlonly&#DDDDDDDD;, e.g. "&#65534;&#172;&#51454;")\endhtmlonly. 353 * In the Event the converter doesn't support the characters {&,#}[0-9], 354 * it will substitute the illegal sequence with the substitution characters. 355 * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as 356 * &#144470; and Zero padding is ignored.</li> 357 * <li>UCNV_ESCAPE_XML_HEX:Substitues the ILLEGAL SEQUENCE with the decimal 358 * representation in the format \htmlonly&#xXXXX; e.g. "&#xFFFE;&#x00AC;&#xC8FE;")\endhtmlonly. 359 * In the Event the converter doesn't support the characters {&,#,x}[0-9], 360 * it will substitute the illegal sequence with the substitution characters. 361 * Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as 362 * \htmlonly&#x23456;\endhtmlonly</li> 363 * </ul> 364 * @param fromUArgs Information about the conversion in progress 365 * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence 366 * @param length Size (in bytes) of the concerned codepage sequence 367 * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. 368 * @param reason Defines the reason the callback was invoked 369 * @param err Return value will be set to success if the callback was handled, 370 * otherwise this value will be set to a failure status. 371 * @stable ICU 2.0 372 */ 373 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE ( 374 const void *context, 375 UConverterFromUnicodeArgs *fromUArgs, 376 const UChar* codeUnits, 377 int32_t length, 378 UChar32 codePoint, 379 UConverterCallbackReason reason, 380 UErrorCode * err); 381 382 383 /** 384 * DO NOT CALL THIS FUNCTION DIRECTLY! 385 * This To Unicode callback skips any ILLEGAL_SEQUENCE, or 386 * skips only UNASSINGED_SEQUENCE depending on the context parameter 387 * simply ignoring those characters. 388 * 389 * @param context The function currently recognizes the callback options: 390 * UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, 391 * returning the error code back to the caller immediately. 392 * NULL: Skips any ILLEGAL_SEQUENCE 393 * @param toUArgs Information about the conversion in progress 394 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence 395 * @param length Size (in bytes) of the concerned codepage sequence 396 * @param reason Defines the reason the callback was invoked 397 * @param err Return value will be set to success if the callback was handled, 398 * otherwise this value will be set to a failure status. 399 * @stable ICU 2.0 400 */ 401 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP ( 402 const void *context, 403 UConverterToUnicodeArgs *toUArgs, 404 const char* codeUnits, 405 int32_t length, 406 UConverterCallbackReason reason, 407 UErrorCode * err); 408 409 /** 410 * DO NOT CALL THIS FUNCTION DIRECTLY! 411 * This To Unicode callback will Substitute the ILLEGAL SEQUENCE,or 412 * UNASSIGNED_SEQUENCE depending on context parameter, with the 413 * Unicode substitution character, U+FFFD. 414 * 415 * @param context The function currently recognizes the callback options: 416 * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, 417 * returning the error code back to the caller immediately. 418 * NULL: Substitutes any ILLEGAL_SEQUENCE 419 * @param toUArgs Information about the conversion in progress 420 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence 421 * @param length Size (in bytes) of the concerned codepage sequence 422 * @param reason Defines the reason the callback was invoked 423 * @param err Return value will be set to success if the callback was handled, 424 * otherwise this value will be set to a failure status. 425 * @stable ICU 2.0 426 */ 427 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE ( 428 const void *context, 429 UConverterToUnicodeArgs *toUArgs, 430 const char* codeUnits, 431 int32_t length, 432 UConverterCallbackReason reason, 433 UErrorCode * err); 434 435 /** 436 * DO NOT CALL THIS FUNCTION DIRECTLY! 437 * This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the 438 * hexadecimal representation of the illegal bytes 439 * (in the format %XNN, e.g. "%XFF%X0A%XC8%X03"). 440 * 441 * @param context This function currently recognizes the callback options: 442 * UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC, 443 * UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE. 444 * @param toUArgs Information about the conversion in progress 445 * @param codeUnits Points to 'length' bytes of the concerned codepage sequence 446 * @param length Size (in bytes) of the concerned codepage sequence 447 * @param reason Defines the reason the callback was invoked 448 * @param err Return value will be set to success if the callback was handled, 449 * otherwise this value will be set to a failure status. 450 * @stable ICU 2.0 451 */ 452 453 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE ( 454 const void *context, 455 UConverterToUnicodeArgs *toUArgs, 456 const char* codeUnits, 457 int32_t length, 458 UConverterCallbackReason reason, 459 UErrorCode * err); 460 461 #endif 462 463 #endif 464 465 /*UCNV_ERR_H*/ 466