Add code support for ICU.

When LIBXML_ICU_ENABLED is defined, encodings that have no built-in
handler are converted with ICU.  A uconv_t pairs a converter for the
requested encoding with a UTF-8 converter, and xmlUconvWrapper() chains
the two through ucnv_convertEx().  The built-in ISO-8859-x handlers are
compiled only when neither iconv nor ICU is enabled.

In xmlByteConsumed() the ICU branch closes its own #ifdef before the
shared "could not find a converter" else branch, so the function body
stays complete when ICU is disabled.

diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
index b86a547..0f41df9 100644
--- a/third_party/libxml/encoding.c
+++ b/third_party/libxml/encoding.c
@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
 static int xmlCharEncodingAliasesNb = 0;
 static int xmlCharEncodingAliasesMax = 0;
 
-#ifdef LIBXML_ICONV_ENABLED
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
 #if 0
 #define DEBUG_ENCODING  /* Define this to get encoding traces */
 #endif
@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
                     NULL, 0, val, NULL, NULL, 0, 0, msg, val);
 }
 
+#ifdef LIBXML_ICU_ENABLED
+static uconv_t*
+openIcuConverter(const char* name, int toUnicode)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
+  if (conv == NULL)
+    return NULL;
+
+  conv->uconv = ucnv_open(name, &status);
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  if (toUnicode) {
+    ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
+                        NULL, NULL, NULL, &status);
+  }
+  else {
+    ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
+                          NULL, NULL, NULL, &status);
+  }
+  if (U_FAILURE(status))
+    goto error;
+
+  status = U_ZERO_ERROR;
+  conv->utf8 = ucnv_open("UTF-8", &status);
+  if (U_SUCCESS(status))
+    return conv;
+
+error:
+  if (conv->uconv)
+    ucnv_close(conv->uconv);
+  xmlFree(conv);
+  return NULL;
+}
+
+static void
+closeIcuConverter(uconv_t *conv)
+{
+  if (conv != NULL) {
+    ucnv_close(conv->uconv);
+    ucnv_close(conv->utf8);
+    xmlFree(conv);
+  }
+}
+#endif /* LIBXML_ICU_ENABLED */
+
 /************************************************************************
  *                                                                      *
  *              Conversions To/From UTF8 encoding                       *
@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
 #ifdef LIBXML_ICONV_ENABLED
     handler->iconv_in = NULL;
     handler->iconv_out = NULL;
-#endif /* LIBXML_ICONV_ENABLED */
+#endif
+#ifdef LIBXML_ICU_ENABLED
+    handler->uconv_in = NULL;
+    handler->uconv_out = NULL;
+#endif
 
     /*
      * registers and returns the handler.
@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
 #endif /* LIBXML_OUTPUT_ENABLED */
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
     xmlRegisterCharEncodingHandlersISO8859x ();
 #endif
@@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
     xmlCharEncodingHandlerPtr enc;
     iconv_t icv_in, icv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    xmlCharEncodingHandlerPtr enc;
+    uconv_t *ucv_in, *ucv_out;
+#endif /* LIBXML_ICU_ENABLED */
     char upper[100];
     int i;
 
@@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
                     "iconv : problems with filters for '%s'\n", name);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    /* check whether icu can handle this */
+    ucv_in = openIcuConverter(name, 1);
+    ucv_out = openIcuConverter(name, 0);
+    if (ucv_in != NULL && ucv_out != NULL) {
+        enc = (xmlCharEncodingHandlerPtr)
+              xmlMalloc(sizeof(xmlCharEncodingHandler));
+        if (enc == NULL) {
+            closeIcuConverter(ucv_in);
+            closeIcuConverter(ucv_out);
+            return(NULL);
+        }
+        enc->name = xmlMemStrdup(name);
+        enc->input = NULL;
+        enc->output = NULL;
+        enc->uconv_in = ucv_in;
+        enc->uconv_out = ucv_out;
+#ifdef DEBUG_ENCODING
+        xmlGenericError(xmlGenericErrorContext,
+                "Found ICU converter handler for encoding %s\n", name);
+#endif
+        return enc;
+    } else if (ucv_in != NULL || ucv_out != NULL) {
+        closeIcuConverter(ucv_in);
+        closeIcuConverter(ucv_out);
+        xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
+                    "ICU converter : problems with filters for '%s'\n", name);
+    }
+#endif /* LIBXML_ICU_ENABLED */
 
 #ifdef DEBUG_ENCODING
     xmlGenericError(xmlGenericErrorContext,
@@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
 
 /************************************************************************
  *                                                                      *
+ *              ICU based generic conversion functions                  *
+ *                                                                      *
+ ************************************************************************/
+
+#ifdef LIBXML_ICU_ENABLED
+/**
+ * xmlUconvWrapper:
+ * @cd: ICU uconverter data structure
+ * @toUnicode : non-zero if toUnicode. 0 otherwise.
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to the input bytes (source encoding if @toUnicode, else UTF-8)
+ * @inlen:  the length of @in
+ *
+ * Returns 0 if success, or
+ *     -1 by lack of space, or
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ *     -3 if the last byte can't form a single output char.
+ *
+ * The value of @inlen after return is the number of octets consumed
+ *     (it is left untouched when one of the arguments is NULL).
+ * The value of @outlen after return is the number of octets written.
+ */
+static int
+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
+                const unsigned char *in, int *inlen) {
+    const char *ucv_in = (const char *) in;
+    char *ucv_out = (char *) out;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
+        if (outlen != NULL) *outlen = 0;
+        return(-1);
+    }
+
+    /*
+     * TODO(jungshik)
+     * 1. is ucnv_convert(To|From)Algorithmic better?
+     * 2. had we better use an explicit pivot buffer?
+     * 3. error returned comes from 'fromUnicode' only even
+     *    when toUnicode is true !
+     */
+    if (toUnicode) {
+        /* encoding => UTF-16 => UTF-8 */
+        ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    } else {
+        /* UTF-8 => UTF-16 => encoding */
+        ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
+                       0, TRUE, &err);
+    }
+    *inlen = ucv_in - (const char*) in;
+    *outlen = ucv_out - (char *) out;
+    if (U_SUCCESS(err))
+        return 0;
+    if (err == U_BUFFER_OVERFLOW_ERROR)
+        return -1;
+    if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
+        return -2;
+    /* if (err == U_TRUNCATED_CHAR_FOUND) */
+    return -3;
+}
+#endif /* LIBXML_ICU_ENABLED */
+
+/************************************************************************
+ *                                                                      *
  *              The real API used by libxml for on-the-fly conversion   *
  *                                                                      *
  ************************************************************************/
@@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
         if (ret == -1) ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        out->content[out->use] = 0;
+        if (ret == -1) ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
     switch (ret) {
         case 0:
@@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
             ret = -3;
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_in != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        out->content[out->use] = 0;
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
     switch (ret) {
         case 0:
 #ifdef DEBUG_ENCODING
@@ -2015,6 +2190,15 @@ retry:
             out->content[out->use] = 0;
         }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+        else if (handler->uconv_out != NULL) {
+            ret = xmlUconvWrapper(handler->uconv_out, 0,
+                                  &out->content[out->use],
+                                  &written, NULL, &toconv);
+            out->use += written;
+            out->content[out->use] = 0;
+        }
+#endif /* LIBXML_ICU_ENABLED */
 #ifdef DEBUG_ENCODING
         xmlGenericError(xmlGenericErrorContext,
                 "initialized encoder\n");
@@ -2061,6 +2245,26 @@ retry:
         }
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (handler->uconv_out != NULL) {
+        ret = xmlUconvWrapper(handler->uconv_out, 0,
+                              &out->content[out->use],
+                              &written, in->content, &toconv);
+        xmlBufferShrink(in, toconv);
+        out->use += written;
+        writtentot += written;
+        out->content[out->use] = 0;
+        if (ret == -1) {
+            if (written > 0) {
+                /*
+                 * Can be a limitation of the ICU converter
+                 */
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICU_ENABLED */
     else {
         xmlEncodingErr(XML_I18N_NO_OUTPUT,
                        "xmlCharEncOutFunc: no output function !\n", NULL);
@@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
         xmlFree(handler);
     }
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
+        if (handler->name != NULL)
+            xmlFree(handler->name);
+        handler->name = NULL;
+        if (handler->uconv_out != NULL) {
+            closeIcuConverter(handler->uconv_out);
+            handler->uconv_out = NULL;
+        }
+        if (handler->uconv_in != NULL) {
+            closeIcuConverter(handler->uconv_in);
+            handler->uconv_in = NULL;
+        }
+        xmlFree(handler);
+    }
+#endif
 #ifdef DEBUG_ENCODING
     if (ret)
         xmlGenericError(xmlGenericErrorContext,
@@ -2248,6 +2468,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
                 cur += toconv;
             } while (ret == -2);
 #endif
+#ifdef LIBXML_ICU_ENABLED
+        } else if (handler->uconv_out != NULL) {
+            do {
+                toconv = in->end - cur;
+                written = 32000;
+                ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
+                                      &written, cur, &toconv);
+                if (ret < 0) {
+                    if (written > 0)
+                        ret = -2;
+                    else
+                        return(-1);
+                }
+                unused += written;
+                cur += toconv;
+            } while (ret == -2);
+#endif
         } else {
             /* could not find a converter */
             return(-1);
@@ -2259,8 +2496,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
     }
     return(in->consumed + (in->cur - in->base));
 }
 
-#ifndef LIBXML_ICONV_ENABLED
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
 #ifdef LIBXML_ISO8859X_ENABLED
 
 /**
diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
index c74b25f..b5f8b48 100644
--- a/third_party/libxml/include/libxml/encoding.h
+++ b/third_party/libxml/include/libxml/encoding.h
@@ -26,6 +26,24 @@
 
 #ifdef LIBXML_ICONV_ENABLED
 #include <iconv.h>
+#else
+#ifdef LIBXML_ICU_ENABLED
+#include <unicode/ucnv.h>
+#if 0
+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
+ * One particular case is Qt4 conflicting on UChar32.
+ */
+#include <stdint.h>
+struct UConverter;
+typedef struct UConverter UConverter;
+#ifdef _MSC_VER
+typedef wchar_t UChar;
+#else
+typedef uint16_t UChar;
+#endif
+#endif
+#endif
 #endif
 #ifdef __cplusplus
 extern "C" {
@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
  * Block defining the handlers for non UTF-8 encodings.
  * If iconv is supported, there are two extra fields.
  */
+#ifdef LIBXML_ICU_ENABLED
+struct _uconv_t {
+  UConverter *uconv; /* for conversion between an encoding and UTF-16 */
+  UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
+};
+typedef struct _uconv_t uconv_t;
+#endif
 
 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
     iconv_t                    iconv_in;
     iconv_t                    iconv_out;
 #endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    uconv_t                    *uconv_in;
+    uconv_t                    *uconv_out;
+#endif /* LIBXML_ICU_ENABLED */
 };
 
 #ifdef __cplusplus
diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
index dd79c42..3580b63 100644
--- a/third_party/libxml/include/libxml/parser.h
+++ b/third_party/libxml/include/libxml/parser.h
@@ -1222,6 +1222,7 @@ typedef enum {
     XML_WITH_DEBUG_MEM = 29,
     XML_WITH_DEBUG_RUN = 30,
     XML_WITH_ZLIB = 31,
+    XML_WITH_ICU = 32,
     XML_WITH_NONE = 99999 /* just to be sure of allocation size */
 } xmlFeature;
 
diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
index 4739f3a..de310ab 100644
--- a/third_party/libxml/include/libxml/xmlversion.h.in
+++ b/third_party/libxml/include/libxml/xmlversion.h.in
@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
 #endif
 
 /**
+ * LIBXML_ICU_ENABLED:
+ *
+ * Whether icu support is available
+ */
+#if @WITH_ICU@
+#define LIBXML_ICU_ENABLED
+#endif
+
+/**
  * LIBXML_ISO8859X_ENABLED:
  *
  * Whether ISO-8859-* support is made available in case iconv is not
diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
index 85e7599..3ba2a06 100644
--- a/third_party/libxml/parser.c
+++ b/third_party/libxml/parser.c
@@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
 #else
             return(0);
 #endif
+        case XML_WITH_ICU:
+#ifdef LIBXML_ICU_ENABLED
+            return(1);
+#else
+            return(0);
+#endif
         default:
             break;
     }