• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1Add code support for ICU.
2
3diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
4index b86a547..0f41df9 100644
5--- a/third_party/libxml/encoding.c
6+++ b/third_party/libxml/encoding.c
7@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
8 static int xmlCharEncodingAliasesNb = 0;
9 static int xmlCharEncodingAliasesMax = 0;
10
11-#ifdef LIBXML_ICONV_ENABLED
12+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
13 #if 0
14 #define DEBUG_ENCODING  /* Define this to get encoding traces */
15 #endif
16@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
17                     NULL, 0, val, NULL, NULL, 0, 0, msg, val);
18 }
19
20+#ifdef LIBXML_ICU_ENABLED
21+static uconv_t*  /* returns a new converter pair, or NULL if allocation or any ICU call fails */
22+openIcuConverter(const char* name, int toUnicode)  /* name: encoding to open; toUnicode: which STOP callback to install */
23+{
24+  UErrorCode status = U_ZERO_ERROR;
25+  uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
26+  if (conv == NULL)
27+    return NULL;
28+
29+  conv->uconv = ucnv_open(name, &status);  /* converter for the requested encoding */
30+  if (U_FAILURE(status))
31+    goto error;
32+
33+  status = U_ZERO_ERROR;
34+  if (toUnicode) {  /* install the STOP callback for the direction this converter will be used in */
35+    ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
36+                        NULL, NULL, NULL, &status);
37+  }
38+  else {
39+    ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
40+                        NULL, NULL, NULL, &status);
41+  }
42+  if (U_FAILURE(status))
43+    goto error;
44+
45+  status = U_ZERO_ERROR;
46+  conv->utf8 = ucnv_open("UTF-8", &status);  /* second converter, used for the UTF-8 side */
47+  if (U_SUCCESS(status))
48+    return conv;  /* both converters are open; release with closeIcuConverter() */
49+
50+error:  /* no usable utf8 converter exists on any path reaching here, so only uconv is closed */
51+  if (conv->uconv)
52+    ucnv_close(conv->uconv);
53+  xmlFree(conv);
54+  return NULL;
55+}
56+
57+static void
58+closeIcuConverter(uconv_t *conv)  /* closes both ICU converters and frees conv; a NULL conv is ignored */
59+{
60+  if (conv != NULL) {
61+    ucnv_close(conv->uconv);
62+    ucnv_close(conv->utf8);
63+    xmlFree(conv);  /* the uconv_t itself was allocated with xmlMalloc in openIcuConverter() */
64+  }
65+}
66+#endif /* LIBXML_ICU_ENABLED */
67+
68 /************************************************************************
69  *									*
70  *		Conversions To/From UTF8 encoding			*
71@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
72 #ifdef LIBXML_ICONV_ENABLED
73     handler->iconv_in = NULL;
74     handler->iconv_out = NULL;
75-#endif /* LIBXML_ICONV_ENABLED */
76+#endif
77+#ifdef LIBXML_ICU_ENABLED
78+    handler->uconv_in = NULL;
79+    handler->uconv_out = NULL;
80+#endif
81
82     /*
83      * registers and returns the handler.
84@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
85     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
86     xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
87 #endif /* LIBXML_OUTPUT_ENABLED */
88-#ifndef LIBXML_ICONV_ENABLED
89+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
90 #ifdef LIBXML_ISO8859X_ENABLED
91     xmlRegisterCharEncodingHandlersISO8859x ();
92 #endif
93@@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
94     xmlCharEncodingHandlerPtr enc;
95     iconv_t icv_in, icv_out;
96 #endif /* LIBXML_ICONV_ENABLED */
97+#ifdef LIBXML_ICU_ENABLED
98+    xmlCharEncodingHandlerPtr enc;
99+    uconv_t *ucv_in, *ucv_out;
100+#endif /* LIBXML_ICU_ENABLED */
101     char upper[100];
102     int i;
103
104@@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
105 		    "iconv : problems with filters for '%s'\n", name);
106     }
107 #endif /* LIBXML_ICONV_ENABLED */
108+#ifdef LIBXML_ICU_ENABLED
109+    /* check whether icu can handle this */
110+    ucv_in = openIcuConverter(name, 1);
111+    ucv_out = openIcuConverter(name, 0);
112+    if (ucv_in != NULL && ucv_out != NULL) {
113+	    enc = (xmlCharEncodingHandlerPtr)
114+	          xmlMalloc(sizeof(xmlCharEncodingHandler));
115+	    if (enc == NULL) {
116+                closeIcuConverter(ucv_in);
117+                closeIcuConverter(ucv_out);
118+		return(NULL);
119+	    }
120+	    enc->name = xmlMemStrdup(name);
121+	    enc->input = NULL;
122+	    enc->output = NULL;
123+	    enc->uconv_in = ucv_in;
124+	    enc->uconv_out = ucv_out;
125+#ifdef DEBUG_ENCODING
126+            xmlGenericError(xmlGenericErrorContext,
127+		    "Found ICU converter handler for encoding %s\n", name);
128+#endif
129+	    return enc;
130+    } else if (ucv_in != NULL || ucv_out != NULL) {
131+            closeIcuConverter(ucv_in);
132+            closeIcuConverter(ucv_out);
133+	    xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
134+		    "ICU converter : problems with filters for '%s'\n", name);
135+    }
136+#endif /* LIBXML_ICU_ENABLED */
137
138 #ifdef DEBUG_ENCODING
139     xmlGenericError(xmlGenericErrorContext,
140@@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
141
142 /************************************************************************
143  *									*
144+ *		ICU based generic conversion functions	         	*
145+ *									*
146+ ************************************************************************/
147+
148+#ifdef LIBXML_ICU_ENABLED
149+/**
150+ * xmlUconvWrapper:
151+ * @cd: ICU converter pair (the encoding's converter plus a UTF-8 converter)
152+ * @toUnicode: non-zero to convert from the encoding to UTF-8, 0 for UTF-8 to the encoding
153+ * @out:  a pointer to an array of bytes to store the result
154+ * @outlen:  the length of @out
155+ * @in:  a pointer to an array of input bytes (in the encoding, or UTF-8 when @toUnicode is 0)
156+ * @inlen:  the length of @in
157+ *
158+ * Returns 0 if success, or
159+ *     -1 by lack of space in @out, or if @out, @outlen, @in or @inlen is NULL, or
160+ *     -2 if the transcoding fails (ICU reports U_INVALID_CHAR_FOUND or
161+ *        U_ILLEGAL_CHAR_FOUND for the input), or
162+ *     -3 for any other ICU error, e.g. a truncated trailing character.
163+ *
164+ * The value of @inlen after return is the number of octets consumed
165+ *     from @in (it is left untouched when an argument is NULL).
166+ * The value of @outlen after return is the number of octets written to @out.
167+ */
168+static int
169+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
170+                const unsigned char *in, int *inlen) {
171+    const char *ucv_in = (const char *) in;  /* read cursor, advanced by ucnv_convertEx */
172+    char *ucv_out = (char *) out;  /* write cursor, advanced by ucnv_convertEx */
173+    UErrorCode err = U_ZERO_ERROR;
174+
175+    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
176+        if (outlen != NULL) *outlen = 0;  /* nothing was written */
177+        return(-1);
178+    }
179+
180+    /*
181+     * TODO(jungshik)
182+     * 1. is ucnv_convert(To|From)Algorithmic better?
183+     * 2. had we better use an explicit pivot buffer?
184+     * 3. error returned comes from 'fromUnicode' only even
185+     *    when toUnicode is true !
186+     */
187+    if (toUnicode) {
188+        /* encoding => UTF-16 => UTF-8 */
189+        ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
190+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
191+                       0, TRUE, &err);  /* no caller pivot buffer; reset = 0, flush = TRUE */
192+    } else {
193+        /* UTF-8 => UTF-16 => encoding */
194+        ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
195+                       &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
196+                       0, TRUE, &err);  /* no caller pivot buffer; reset = 0, flush = TRUE */
197+    }
198+    *inlen = ucv_in - (const char*) in;  /* octets consumed from in */
199+    *outlen = ucv_out - (char *) out;  /* octets written to out */
200+    if (U_SUCCESS(err))
201+        return 0;
202+    if (err == U_BUFFER_OVERFLOW_ERROR)
203+        return -1;
204+    if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
205+        return -2;
206+    /* every other error, e.g. U_TRUNCATED_CHAR_FOUND */
207+    return -3;
208+}
209+#endif /* LIBXML_ICU_ENABLED */
210+
211+/************************************************************************
212+ *									*
213  *		The real API used by libxml for on-the-fly conversion	*
214  *									*
215  ************************************************************************/
216@@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
217 	if (ret == -1) ret = -3;
218     }
219 #endif /* LIBXML_ICONV_ENABLED */
220+#ifdef LIBXML_ICU_ENABLED
221+    else if (handler->uconv_in != NULL) {
222+	ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
223+	                      &written, in->content, &toconv);
224+	xmlBufferShrink(in, toconv);
225+	out->use += written;
226+	out->content[out->use] = 0;
227+	if (ret == -1) ret = -3;
228+    }
229+#endif /* LIBXML_ICU_ENABLED */
230 #ifdef DEBUG_ENCODING
231     switch (ret) {
232         case 0:
233@@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
234             ret = -3;
235     }
236 #endif /* LIBXML_ICONV_ENABLED */
237+#ifdef LIBXML_ICU_ENABLED
238+    else if (handler->uconv_in != NULL) {
239+        ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
240+                              &written, in->content, &toconv);
241+        xmlBufferShrink(in, toconv);
242+        out->use += written;
243+        out->content[out->use] = 0;
244+        if (ret == -1)
245+            ret = -3;
246+    }
247+#endif /* LIBXML_ICU_ENABLED */
248     switch (ret) {
249         case 0:
250 #ifdef DEBUG_ENCODING
251@@ -2015,6 +2190,15 @@ retry:
252 	    out->content[out->use] = 0;
253 	}
254 #endif /* LIBXML_ICONV_ENABLED */
255+#ifdef LIBXML_ICU_ENABLED
256+	else if (handler->uconv_out != NULL) {
257+	    ret = xmlUconvWrapper(handler->uconv_out, 0,
258+                              &out->content[out->use],
259+ 				              &written, NULL, &toconv);
260+	    out->use += written;
261+	    out->content[out->use] = 0;
262+	}
263+#endif /* LIBXML_ICU_ENABLED */
264 #ifdef DEBUG_ENCODING
265 	xmlGenericError(xmlGenericErrorContext,
266 		"initialized encoder\n");
267@@ -2061,6 +2245,26 @@ retry:
268 	}
269     }
270 #endif /* LIBXML_ICONV_ENABLED */
271+#ifdef LIBXML_ICU_ENABLED
272+    else if (handler->uconv_out != NULL) {
273+	ret = xmlUconvWrapper(handler->uconv_out, 0,
274+                              &out->content[out->use],
275+	                      &written, in->content, &toconv);
276+	xmlBufferShrink(in, toconv);
277+	out->use += written;
278+	writtentot += written;
279+	out->content[out->use] = 0;
280+	if (ret == -1) {
281+	    if (written > 0) {
282+		/*
283+		 * Can be a limitation of iconv
284+		 */
285+		goto retry;
286+	    }
287+	    ret = -3;
288+	}
289+    }
290+#endif /* LIBXML_ICU_ENABLED */
291     else {
292 	xmlEncodingErr(XML_I18N_NO_OUTPUT,
293 		       "xmlCharEncOutFunc: no output function !\n", NULL);
294@@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
295 	xmlFree(handler);
296     }
297 #endif /* LIBXML_ICONV_ENABLED */
298+#ifdef LIBXML_ICU_ENABLED
299+    if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
300+	if (handler->name != NULL)
301+	    xmlFree(handler->name);
302+	handler->name = NULL;
303+	if (handler->uconv_out != NULL) {
304+	    closeIcuConverter(handler->uconv_out);
305+	    handler->uconv_out = NULL;
306+	}
307+	if (handler->uconv_in != NULL) {
308+	    closeIcuConverter(handler->uconv_in);
309+	    handler->uconv_in = NULL;
310+	}
311+	xmlFree(handler);
312+    }
313+#endif
314 #ifdef DEBUG_ENCODING
315     if (ret)
316         xmlGenericError(xmlGenericErrorContext,
317@@ -2248,6 +2468,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
318 		    cur += toconv;
319 		} while (ret == -2);
320 #endif
321+#ifdef LIBXML_ICU_ENABLED
322+	    } else if (handler->uconv_out != NULL) {
323+	        do {
324+		    toconv = in->end - cur;
325+		    written = 32000;
326+		    ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
327+	                      &written, cur, &toconv);
328+		    if (ret < 0) {
329+		        if (written > 0)
330+			    ret = -2;
331+			else
332+			    return(-1);
333+		    }
334+		    unused += written;
335+		    cur += toconv;
336+		} while (ret == -2);
337             } else {
338 	        /* could not find a converter */
339 	        return(-1);
340@@ -2259,8 +2495,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
341     }
342     return(in->consumed + (in->cur - in->base));
343 }
344+#endif
345
346-#ifndef LIBXML_ICONV_ENABLED
347+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
348 #ifdef LIBXML_ISO8859X_ENABLED
349
350 /**
351diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
352index c74b25f..b5f8b48 100644
353--- a/third_party/libxml/include/libxml/encoding.h
354+++ b/third_party/libxml/include/libxml/encoding.h
355@@ -26,6 +26,24 @@
356
357 #ifdef LIBXML_ICONV_ENABLED
358 #include <iconv.h>
359+#else
360+#ifdef LIBXML_ICU_ENABLED
361+#include <unicode/ucnv.h>
362+#if 0
363+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
364+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
365+ * One particular case is Qt4 conflicting on UChar32.
366+ */
367+#include <stdint.h>
368+struct UConverter;
369+typedef struct UConverter UConverter;
370+#ifdef _MSC_VER
371+typedef wchar_t UChar;
372+#else
373+typedef uint16_t UChar;
374+#endif
375+#endif
376+#endif
377 #endif
378 #ifdef __cplusplus
379 extern "C" {
380@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
381  * Block defining the handlers for non UTF-8 encodings.
382  * If iconv is supported, there are two extra fields.
383  */
384+#ifdef LIBXML_ICU_ENABLED
385+struct _uconv_t {  /* pair of ICU converters; encoding.c chains them to transcode through UTF-16 */
386+  UConverter *uconv; /* converts between the document encoding and UTF-16 */
387+  UConverter *utf8; /* converts between UTF-8 and UTF-16 */
388+};
389+typedef struct _uconv_t uconv_t;
390+#endif
391
392 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
393 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
394@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
395     iconv_t                    iconv_in;
396     iconv_t                    iconv_out;
397 #endif /* LIBXML_ICONV_ENABLED */
398+#ifdef LIBXML_ICU_ENABLED
399+    uconv_t                    *uconv_in;
400+    uconv_t                    *uconv_out;
401+#endif /* LIBXML_ICU_ENABLED */
402 };
403
404 #ifdef __cplusplus
405diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
406index dd79c42..3580b63 100644
407--- a/third_party/libxml/include/libxml/parser.h
408+++ b/third_party/libxml/include/libxml/parser.h
409@@ -1222,6 +1222,7 @@ typedef enum {
410     XML_WITH_DEBUG_MEM = 29,
411     XML_WITH_DEBUG_RUN = 30,
412     XML_WITH_ZLIB = 31,
413+    XML_WITH_ICU = 32,
414     XML_WITH_NONE = 99999 /* just to be sure of allocation size */
415 } xmlFeature;
416
417diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
418index 4739f3a..de310ab 100644
419--- a/third_party/libxml/include/libxml/xmlversion.h.in
420+++ b/third_party/libxml/include/libxml/xmlversion.h.in
421@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
422 #endif
423
424 /**
425+ * LIBXML_ICU_ENABLED:
426+ *
427+ * Whether ICU support is available
428+ */
429+#if @WITH_ICU@
430+#define LIBXML_ICU_ENABLED
431+#endif
432+
433+/**
434  * LIBXML_ISO8859X_ENABLED:
435  *
436  * Whether ISO-8859-* support is made available in case iconv is not
437diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
438index 85e7599..3ba2a06 100644
439--- a/third_party/libxml/parser.c
440+++ b/third_party/libxml/parser.c
441@@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
442 #else
443             return(0);
444 #endif
445+        case XML_WITH_ICU:
446+#ifdef LIBXML_ICU_ENABLED
447+            return(1);
448+#else
449+            return(0);
450+#endif
451         default:
452 	    break;
453      }
454