• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * icu.c: Example how to use ICU for character encoding conversion
 *
 * This example shows how to use ICU by installing a custom character
 * encoding converter with xmlCtxtSetCharEncConvImpl, available
 * since libxml2 2.14.
 *
 * This approach makes it possible to use ICU even if libxml2 is
 * compiled without ICU support. It also makes sure that *only* ICU
 * is used. Many Linux distros currently ship libxml2 with support
 * for both ICU and iconv which makes the library's behavior hard to
 * predict.
 *
 * The long-term plan is to make libxml2 only support a single
 * conversion library internally (iconv on POSIX).
 */
17 
18 #include <stdio.h>
19 #include <libxml/parser.h>
20 #include <unicode/ucnv.h>
21 
22 #define ICU_PIVOT_BUF_SIZE 1024
23 
24 typedef struct {
25     UConverter *uconv; /* for conversion between an encoding and UTF-16 */
26     UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
27     UChar      *pivot_source;
28     UChar      *pivot_target;
29     int        isInput;
30     UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
31 } myConvCtxt;
32 
33 static int
icuConvert(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,void * vctxt)34 icuConvert(unsigned char *out, int *outlen,
35            const unsigned char *in, int *inlen, void *vctxt) {
36     myConvCtxt *cd = vctxt;
37     const char *ucv_in = (const char *) in;
38     char *ucv_out = (char *) out;
39     UConverter *target, *source;
40     UErrorCode err = U_ZERO_ERROR;
41     int ret;
42 
43     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
44         if (outlen != NULL)
45             *outlen = 0;
46         return XML_ENC_ERR_INTERNAL;
47     }
48 
49     /*
50      * Note that the ICU API is stateful. It can always consume a certain
51      * amount of input even if the output buffer would overflow. The
52      * remaining input must be processed by calling ucnv_convertEx with a
53      * possibly empty input buffer.
54      *
55      * ucnv_convertEx is always called with reset and flush set to 0,
56      * so we don't mess up the state. This should never generate
57      * U_TRUNCATED_CHAR_FOUND errors.
58      */
59     if (cd->isInput) {
60         source = cd->uconv;
61         target = cd->utf8;
62     } else {
63         source = cd->utf8;
64         target = cd->uconv;
65     }
66 
67     ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
68                    &ucv_in, ucv_in + *inlen, cd->pivot_buf,
69                    &cd->pivot_source, &cd->pivot_target,
70                    cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err);
71 
72     *inlen = ucv_in - (const char*) in;
73     *outlen = ucv_out - (char *) out;
74 
75     if (U_SUCCESS(err)) {
76         ret = XML_ENC_ERR_SUCCESS;
77     } else {
78         switch (err) {
79             case U_TRUNCATED_CHAR_FOUND:
80                 /* Shouldn't happen without flush */
81                 ret = XML_ENC_ERR_SUCCESS;
82                 break;
83 
84             case U_BUFFER_OVERFLOW_ERROR:
85                 ret = XML_ENC_ERR_SPACE;
86                 break;
87 
88             case U_INVALID_CHAR_FOUND:
89             case U_ILLEGAL_CHAR_FOUND:
90             case U_ILLEGAL_ESCAPE_SEQUENCE:
91             case U_UNSUPPORTED_ESCAPE_SEQUENCE:
92                 ret = XML_ENC_ERR_INPUT;
93                 break;
94 
95             case U_MEMORY_ALLOCATION_ERROR:
96                 ret = XML_ENC_ERR_MEMORY;
97                 break;
98 
99             default:
100                 ret = XML_ENC_ERR_INTERNAL;
101                 break;
102         }
103     }
104 
105     return ret;
106 }
107 
108 static int
icuOpen(const char * name,int isInput,myConvCtxt ** out)109 icuOpen(const char* name, int isInput, myConvCtxt **out)
110 {
111     UErrorCode status;
112     myConvCtxt *cd;
113 
114     *out = NULL;
115 
116     cd = xmlMalloc(sizeof(myConvCtxt));
117     if (cd == NULL)
118         return XML_ERR_NO_MEMORY;
119 
120     cd->isInput = isInput;
121     cd->pivot_source = cd->pivot_buf;
122     cd->pivot_target = cd->pivot_buf;
123 
124     status = U_ZERO_ERROR;
125     cd->uconv = ucnv_open(name, &status);
126     if (U_FAILURE(status))
127         goto error;
128 
129     status = U_ZERO_ERROR;
130     if (isInput) {
131         ucnv_setToUCallBack(cd->uconv, UCNV_TO_U_CALLBACK_STOP,
132                             NULL, NULL, NULL, &status);
133     }
134     else {
135         ucnv_setFromUCallBack(cd->uconv, UCNV_FROM_U_CALLBACK_STOP,
136                               NULL, NULL, NULL, &status);
137     }
138     if (U_FAILURE(status))
139         goto error;
140 
141     status = U_ZERO_ERROR;
142     cd->utf8 = ucnv_open("UTF-8", &status);
143     if (U_FAILURE(status))
144         goto error;
145 
146     *out = cd;
147     return 0;
148 
149 error:
150     if (cd->uconv)
151         ucnv_close(cd->uconv);
152     xmlFree(cd);
153 
154     if (status == U_FILE_ACCESS_ERROR)
155         return XML_ERR_UNSUPPORTED_ENCODING;
156     if (status == U_MEMORY_ALLOCATION_ERROR)
157         return XML_ERR_NO_MEMORY;
158     return XML_ERR_SYSTEM;
159 }
160 
161 static void
icuClose(myConvCtxt * cd)162 icuClose(myConvCtxt *cd)
163 {
164     if (cd == NULL)
165         return;
166     ucnv_close(cd->uconv);
167     ucnv_close(cd->utf8);
168     xmlFree(cd);
169 }
170 
171 static void
icuConvCtxtDtor(void * vctxt)172 icuConvCtxtDtor(void *vctxt) {
173     icuClose(vctxt);
174 }
175 
176 static int
icuConvImpl(void * vctxt,const char * name,xmlCharEncConverter * conv)177 icuConvImpl(void *vctxt, const char *name,
178             xmlCharEncConverter *conv) {
179     myConvCtxt *inputCtxt = NULL;
180     myConvCtxt *outputCtxt = NULL;
181     int ret;
182 
183     ret = icuOpen(name, 1, &inputCtxt);
184     if (ret != 0)
185         goto error;
186     ret = icuOpen(name, 0, &outputCtxt);
187     if (ret != 0)
188         goto error;
189 
190     conv->input = icuConvert;
191     conv->output = icuConvert;
192     conv->ctxtDtor = icuConvCtxtDtor;
193     conv->inputCtxt = inputCtxt;
194     conv->outputCtxt = outputCtxt;
195 
196     return XML_ERR_OK;
197 
198 error:
199     if (inputCtxt != NULL)
200         icuClose(inputCtxt);
201     if (outputCtxt != NULL)
202         icuClose(outputCtxt);
203     return ret;
204 }
205 
206 int
main(void)207 main(void) {
208     xmlParserCtxtPtr ctxt;
209     xmlDocPtr doc;
210     const char *xml;
211     xmlChar *content;
212     int ret = 0;
213 
214     /*
215      * We use IBM-1051, an alias for HP Roman, as a simple example that
216      * ICU supports, but iconv (typically) doesn't.
217      *
218      * Character code 0xDE is U+00DF Latin Small Letter Sharp S.
219      */
220     xml = "<doc>\xDE</doc>";
221 
222     ctxt = xmlNewParserCtxt();
223     xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL);
224     doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0);
225     xmlFreeParserCtxt(ctxt);
226 
227     content = xmlNodeGetContent((xmlNodePtr) doc);
228 
229     printf("content: %s\n", content);
230 
231     if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) {
232         fprintf(stderr, "conversion failed\n");
233         ret = 1;
234     }
235 
236     xmlFree(content);
237     xmlFreeDoc(doc);
238 
239     return ret;
240 }
241 
242