1 /*
2 * icu.c: Example how to use ICU for character encoding conversion
3 *
4 * This example shows how to use ICU by installing a custom character
5 * encoding converter with xmlCtxtSetCharEncConvImpl, available
6 * since libxml2 2.14.
7 *
8 * This approach makes it possible to use ICU even if libxml2 is
9 * compiled without ICU support. It also makes sure that *only* ICU
10 * is used. Many Linux distros currently ship libxml2 with support
11 * for both ICU and iconv which makes the library's behavior hard to
12 * predict.
13 *
14 * The long-term plan is to make libxml2 only support a single
15 * conversion library internally (iconv on POSIX).
16 */
17
18 #include <stdio.h>
19 #include <libxml/parser.h>
20 #include <unicode/ucnv.h>
21
22 #define ICU_PIVOT_BUF_SIZE 1024
23
24 typedef struct {
25 UConverter *uconv; /* for conversion between an encoding and UTF-16 */
26 UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
27 UChar *pivot_source;
28 UChar *pivot_target;
29 int isInput;
30 UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
31 } myConvCtxt;
32
33 static int
icuConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)34 icuConvert(unsigned char *out, int *outlen,
35 const unsigned char *in, int *inlen, void *vctxt) {
36 myConvCtxt *cd = vctxt;
37 const char *ucv_in = (const char *) in;
38 char *ucv_out = (char *) out;
39 UConverter *target, *source;
40 UErrorCode err = U_ZERO_ERROR;
41 int ret;
42
43 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
44 if (outlen != NULL)
45 *outlen = 0;
46 return XML_ENC_ERR_INTERNAL;
47 }
48
49 /*
50 * Note that the ICU API is stateful. It can always consume a certain
51 * amount of input even if the output buffer would overflow. The
52 * remaining input must be processed by calling ucnv_convertEx with a
53 * possibly empty input buffer.
54 *
55 * ucnv_convertEx is always called with reset and flush set to 0,
56 * so we don't mess up the state. This should never generate
57 * U_TRUNCATED_CHAR_FOUND errors.
58 */
59 if (cd->isInput) {
60 source = cd->uconv;
61 target = cd->utf8;
62 } else {
63 source = cd->utf8;
64 target = cd->uconv;
65 }
66
67 ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
68 &ucv_in, ucv_in + *inlen, cd->pivot_buf,
69 &cd->pivot_source, &cd->pivot_target,
70 cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
71
72 *inlen = ucv_in - (const char*) in;
73 *outlen = ucv_out - (char *) out;
74
75 if (U_SUCCESS(err)) {
76 ret = XML_ENC_ERR_SUCCESS;
77 } else {
78 switch (err) {
79 case U_TRUNCATED_CHAR_FOUND:
80 /* Shouldn't happen without flush */
81 ret = XML_ENC_ERR_SUCCESS;
82 break;
83
84 case U_BUFFER_OVERFLOW_ERROR:
85 ret = XML_ENC_ERR_SPACE;
86 break;
87
88 case U_INVALID_CHAR_FOUND:
89 case U_ILLEGAL_CHAR_FOUND:
90 case U_ILLEGAL_ESCAPE_SEQUENCE:
91 case U_UNSUPPORTED_ESCAPE_SEQUENCE:
92 ret = XML_ENC_ERR_INPUT;
93 break;
94
95 case U_MEMORY_ALLOCATION_ERROR:
96 ret = XML_ENC_ERR_MEMORY;
97 break;
98
99 default:
100 ret = XML_ENC_ERR_INTERNAL;
101 break;
102 }
103 }
104
105 return ret;
106 }
107
108 static int
icuOpen(const char * name,int isInput,myConvCtxt ** out)109 icuOpen(const char* name, int isInput, myConvCtxt **out)
110 {
111 UErrorCode status;
112 myConvCtxt *cd;
113
114 *out = NULL;
115
116 cd = xmlMalloc(sizeof(myConvCtxt));
117 if (cd == NULL)
118 return XML_ERR_NO_MEMORY;
119
120 cd->isInput = isInput;
121 cd->pivot_source = cd->pivot_buf;
122 cd->pivot_target = cd->pivot_buf;
123
124 status = U_ZERO_ERROR;
125 cd->uconv = ucnv_open(name, &status);
126 if (U_FAILURE(status))
127 goto error;
128
129 status = U_ZERO_ERROR;
130 if (isInput) {
131 ucnv_setToUCallBack(cd->uconv, UCNV_TO_U_CALLBACK_STOP,
132 NULL, NULL, NULL, &status);
133 }
134 else {
135 ucnv_setFromUCallBack(cd->uconv, UCNV_FROM_U_CALLBACK_STOP,
136 NULL, NULL, NULL, &status);
137 }
138 if (U_FAILURE(status))
139 goto error;
140
141 status = U_ZERO_ERROR;
142 cd->utf8 = ucnv_open("UTF-8", &status);
143 if (U_FAILURE(status))
144 goto error;
145
146 *out = cd;
147 return 0;
148
149 error:
150 if (cd->uconv)
151 ucnv_close(cd->uconv);
152 xmlFree(cd);
153
154 if (status == U_FILE_ACCESS_ERROR)
155 return XML_ERR_UNSUPPORTED_ENCODING;
156 if (status == U_MEMORY_ALLOCATION_ERROR)
157 return XML_ERR_NO_MEMORY;
158 return XML_ERR_SYSTEM;
159 }
160
161 static void
icuClose(myConvCtxt * cd)162 icuClose(myConvCtxt *cd)
163 {
164 if (cd == NULL)
165 return;
166 ucnv_close(cd->uconv);
167 ucnv_close(cd->utf8);
168 xmlFree(cd);
169 }
170
171 static void
icuConvCtxtDtor(void * vctxt)172 icuConvCtxtDtor(void *vctxt) {
173 icuClose(vctxt);
174 }
175
176 static int
icuConvImpl(void * vctxt,const char * name,xmlCharEncConverter * conv)177 icuConvImpl(void *vctxt, const char *name,
178 xmlCharEncConverter *conv) {
179 myConvCtxt *inputCtxt = NULL;
180 myConvCtxt *outputCtxt = NULL;
181 int ret;
182
183 ret = icuOpen(name, 1, &inputCtxt);
184 if (ret != 0)
185 goto error;
186 ret = icuOpen(name, 0, &outputCtxt);
187 if (ret != 0)
188 goto error;
189
190 conv->input = icuConvert;
191 conv->output = icuConvert;
192 conv->ctxtDtor = icuConvCtxtDtor;
193 conv->inputCtxt = inputCtxt;
194 conv->outputCtxt = outputCtxt;
195
196 return XML_ERR_OK;
197
198 error:
199 if (inputCtxt != NULL)
200 icuClose(inputCtxt);
201 if (outputCtxt != NULL)
202 icuClose(outputCtxt);
203 return ret;
204 }
205
206 int
main(void)207 main(void) {
208 xmlParserCtxtPtr ctxt;
209 xmlDocPtr doc;
210 const char *xml;
211 xmlChar *content;
212 int ret = 0;
213
214 /*
215 * We use IBM-1051, an alias for HP Roman, as a simple example that
216 * ICU supports, but iconv (typically) doesn't.
217 *
218 * Character code 0xDE is U+00DF Latin Small Letter Sharp S.
219 */
220 xml = "<doc>\xDE</doc>";
221
222 ctxt = xmlNewParserCtxt();
223 xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL);
224 doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0);
225 xmlFreeParserCtxt(ctxt);
226
227 content = xmlNodeGetContent((xmlNodePtr) doc);
228
229 printf("content: %s\n", content);
230
231 if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) {
232 fprintf(stderr, "conversion failed\n");
233 ret = 1;
234 }
235
236 xmlFree(content);
237 xmlFreeDoc(doc);
238
239 return ret;
240 }
241
242