1 /* 2 * Summary: interface for the encoding conversion functions 3 * Description: interface for the encoding conversion functions needed for 4 * XML basic encoding and iconv() support. 5 * 6 * Related specs are 7 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies 8 * [ISO-10646] UTF-8 and UTF-16 in Annexes 9 * [ISO-8859-1] ISO Latin-1 characters codes. 10 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- 11 * Worldwide Character Encoding -- Version 1.0", Addison- 12 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is 13 * described in Unicode Technical Report #4. 14 * [US-ASCII] Coded Character Set--7-bit American Standard Code for 15 * Information Interchange, ANSI X3.4-1986. 16 * 17 * Copy: See Copyright for the status of this software. 18 * 19 * Author: Daniel Veillard 20 */ 21 22 #ifndef __XML_CHAR_ENCODING_H__ 23 #define __XML_CHAR_ENCODING_H__ 24 25 #include <libxml/xmlversion.h> 26 27 #ifdef LIBXML_ICONV_ENABLED 28 #include <iconv.h> 29 #endif 30 #ifdef LIBXML_ICU_ENABLED 31 #include <unicode/ucnv.h> 32 #endif 33 #ifdef __cplusplus 34 extern "C" { 35 #endif 36 37 /* 38 * xmlCharEncoding: 39 * 40 * Predefined values for some standard encodings. 41 * Libxml does not do beforehand translation on UTF8 and ISOLatinX. 42 * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default. 43 * 44 * Anything else would have to be translated to UTF8 before being 45 * given to the parser itself. The BOM for UTF16 and the encoding 46 * declaration are looked at and a converter is looked for at that 47 * point. If not found the parser stops here as asked by the XML REC. A 48 * converter can be registered by the user using xmlRegisterCharEncodingHandler 49 * but the current form doesn't allow stateful transcoding (a serious 50 * problem agreed !). If iconv has been found it will be used 51 * automatically and allow stateful transcoding, the simplest is then 52 * to be sure to enable iconv and to provide iconv libs for the encoding 53 * support needed. 54 * 55 * Note that the generic "UTF-16" is not a predefined value. Instead, only 56 * the specific UTF-16LE and UTF-16BE are present. 57 */ 58 typedef enum { 59 XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ 60 XML_CHAR_ENCODING_NONE= 0, /* No char encoding detected */ 61 XML_CHAR_ENCODING_UTF8= 1, /* UTF-8 */ 62 XML_CHAR_ENCODING_UTF16LE= 2, /* UTF-16 little endian */ 63 XML_CHAR_ENCODING_UTF16BE= 3, /* UTF-16 big endian */ 64 XML_CHAR_ENCODING_UCS4LE= 4, /* UCS-4 little endian */ 65 XML_CHAR_ENCODING_UCS4BE= 5, /* UCS-4 big endian */ 66 XML_CHAR_ENCODING_EBCDIC= 6, /* EBCDIC uh! */ 67 XML_CHAR_ENCODING_UCS4_2143=7, /* UCS-4 unusual ordering */ 68 XML_CHAR_ENCODING_UCS4_3412=8, /* UCS-4 unusual ordering */ 69 XML_CHAR_ENCODING_UCS2= 9, /* UCS-2 */ 70 XML_CHAR_ENCODING_8859_1= 10,/* ISO-8859-1 ISO Latin 1 */ 71 XML_CHAR_ENCODING_8859_2= 11,/* ISO-8859-2 ISO Latin 2 */ 72 XML_CHAR_ENCODING_8859_3= 12,/* ISO-8859-3 */ 73 XML_CHAR_ENCODING_8859_4= 13,/* ISO-8859-4 */ 74 XML_CHAR_ENCODING_8859_5= 14,/* ISO-8859-5 */ 75 XML_CHAR_ENCODING_8859_6= 15,/* ISO-8859-6 */ 76 XML_CHAR_ENCODING_8859_7= 16,/* ISO-8859-7 */ 77 XML_CHAR_ENCODING_8859_8= 17,/* ISO-8859-8 */ 78 XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ 79 XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ 80 XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ 81 XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ 82 XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ 83 } xmlCharEncoding; 84 85 /** 86 * xmlCharEncodingInputFunc: 87 * @out: a pointer to an array of bytes to store the UTF-8 result 88 * @outlen: the length of @out 89 * @in: a pointer to an array of chars in the original encoding 90 * @inlen: the length of @in 91 * 92 * Take a block of chars in the original encoding and try to convert 93 * it to an UTF-8 block of chars out. 94 * 95 * Returns the number of bytes written, -1 if lack of space, or -2 96 * if the transcoding failed. 97 * The value of @inlen after return is the number of octets consumed 98 * if the return value is positive, else unpredictiable. 99 * The value of @outlen after return is the number of octets consumed. 100 */ 101 typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen, 102 const unsigned char *in, int *inlen); 103 104 105 /** 106 * xmlCharEncodingOutputFunc: 107 * @out: a pointer to an array of bytes to store the result 108 * @outlen: the length of @out 109 * @in: a pointer to an array of UTF-8 chars 110 * @inlen: the length of @in 111 * 112 * Take a block of UTF-8 chars in and try to convert it to another 113 * encoding. 114 * Note: a first call designed to produce heading info is called with 115 * in = NULL. If stateful this should also initialize the encoder state. 116 * 117 * Returns the number of bytes written, -1 if lack of space, or -2 118 * if the transcoding failed. 119 * The value of @inlen after return is the number of octets consumed 120 * if the return value is positive, else unpredictiable. 121 * The value of @outlen after return is the number of octets produced. 122 */ 123 typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, 124 const unsigned char *in, int *inlen); 125 126 127 /* 128 * Block defining the handlers for non UTF-8 encodings. 129 * If iconv is supported, there are two extra fields. 130 */ 131 #ifdef LIBXML_ICU_ENABLED 132 /* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */ 133 #define ICU_PIVOT_BUF_SIZE 1024 134 struct _uconv_t { 135 UConverter *uconv; /* for conversion between an encoding and UTF-16 */ 136 UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ 137 UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; 138 UChar *pivot_source; 139 UChar *pivot_target; 140 }; 141 typedef struct _uconv_t uconv_t; 142 #endif 143 144 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; 145 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; 146 struct _xmlCharEncodingHandler { 147 char *name; 148 xmlCharEncodingInputFunc input; 149 xmlCharEncodingOutputFunc output; 150 #ifdef LIBXML_ICONV_ENABLED 151 iconv_t iconv_in; 152 iconv_t iconv_out; 153 #endif /* LIBXML_ICONV_ENABLED */ 154 #ifdef LIBXML_ICU_ENABLED 155 uconv_t *uconv_in; 156 uconv_t *uconv_out; 157 #endif /* LIBXML_ICU_ENABLED */ 158 }; 159 160 #ifdef __cplusplus 161 } 162 #endif 163 #include <libxml/tree.h> 164 #ifdef __cplusplus 165 extern "C" { 166 #endif 167 168 /* 169 * Interfaces for encoding handlers. 170 */ 171 XMLPUBFUN void XMLCALL 172 xmlInitCharEncodingHandlers (void); 173 XMLPUBFUN void XMLCALL 174 xmlCleanupCharEncodingHandlers (void); 175 XMLPUBFUN void XMLCALL 176 xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); 177 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 178 xmlGetCharEncodingHandler (xmlCharEncoding enc); 179 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 180 xmlFindCharEncodingHandler (const char *name); 181 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL 182 xmlNewCharEncodingHandler (const char *name, 183 xmlCharEncodingInputFunc input, 184 xmlCharEncodingOutputFunc output); 185 186 /* 187 * Interfaces for encoding names and aliases. 188 */ 189 XMLPUBFUN int XMLCALL 190 xmlAddEncodingAlias (const char *name, 191 const char *alias); 192 XMLPUBFUN int XMLCALL 193 xmlDelEncodingAlias (const char *alias); 194 XMLPUBFUN const char * XMLCALL 195 xmlGetEncodingAlias (const char *alias); 196 XMLPUBFUN void XMLCALL 197 xmlCleanupEncodingAliases (void); 198 XMLPUBFUN xmlCharEncoding XMLCALL 199 xmlParseCharEncoding (const char *name); 200 XMLPUBFUN const char * XMLCALL 201 xmlGetCharEncodingName (xmlCharEncoding enc); 202 203 /* 204 * Interfaces directly used by the parsers. 205 */ 206 XMLPUBFUN xmlCharEncoding XMLCALL 207 xmlDetectCharEncoding (const unsigned char *in, 208 int len); 209 210 XMLPUBFUN int XMLCALL 211 xmlCharEncOutFunc (xmlCharEncodingHandler *handler, 212 xmlBufferPtr out, 213 xmlBufferPtr in); 214 215 XMLPUBFUN int XMLCALL 216 xmlCharEncInFunc (xmlCharEncodingHandler *handler, 217 xmlBufferPtr out, 218 xmlBufferPtr in); 219 XMLPUBFUN int XMLCALL 220 xmlCharEncFirstLine (xmlCharEncodingHandler *handler, 221 xmlBufferPtr out, 222 xmlBufferPtr in); 223 XMLPUBFUN int XMLCALL 224 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); 225 226 /* 227 * Export a few useful functions 228 */ 229 #ifdef LIBXML_OUTPUT_ENABLED 230 XMLPUBFUN int XMLCALL 231 UTF8Toisolat1 (unsigned char *out, 232 int *outlen, 233 const unsigned char *in, 234 int *inlen); 235 #endif /* LIBXML_OUTPUT_ENABLED */ 236 XMLPUBFUN int XMLCALL 237 isolat1ToUTF8 (unsigned char *out, 238 int *outlen, 239 const unsigned char *in, 240 int *inlen); 241 #ifdef __cplusplus 242 } 243 #endif 244 245 #endif /* __XML_CHAR_ENCODING_H__ */ 246