• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5  * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
20  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "config.h"
30 #include "TextCodecGtk.h"
31 
32 #include "CString.h"
33 #include "PlatformString.h"
34 #include <wtf/Assertions.h>
35 #include <wtf/HashMap.h>
36 #include <wtf/gtk/GOwnPtr.h>
37 #include "Logging.h"
38 
39 using std::min;
40 
41 namespace WebCore {
42 
// TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380).
// That's why we need to avoid generating extra BOMs in the conversion result.
// This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib.
46 
47 #if (G_BYTE_ORDER == G_BIG_ENDIAN)
48     const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16BE";
49 #else
50     const gchar* WebCore::TextCodecGtk::m_internalEncodingName = "UTF-16LE";
51 #endif
52 
53 
54 // We're specifying the list of text codecs and their aliases here.
55 // For each codec the first entry is the canonical name, remaining ones are used as aliases.
56 // Each alias list must be terminated by a 0.
57 
58 // Unicode
59 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_UTF_8            = { "UTF-8", 0 };
60 
61 // Western
62 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_1       = { "ISO-8859-1", "CP819", "IBM819", "ISO-IR-100", "ISO8859-1", "ISO_8859-1", "ISO_8859-1:1987",  "L1", "LATIN1", "CSISOLATIN1", 0 };
63 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACROMAN         = { "MACROMAN", "MAC", "MACINTOSH", "CSMACINTOSH", 0 };
64 
65 // Japanese
66 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_SHIFT_JIS        = { "Shift_JIS", "MS_KANJI", "SHIFT-JIS", "SJIS", "CSSHIFTJIS", 0 };
67     TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_JP       = { "EUC-JP", "EUC_JP", "EUCJP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE", "CSEUCPKDFMTJAPANESE", 0 };
68 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_JP      = { "ISO-2022-JP", 0 };
69 
70 // Traditional Chinese
71 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5             = { "BIG5", "BIG-5", "BIG-FIVE", "BIG5", "BIGFIVE", "CN-BIG5", "CSBIG5", 0 };
72 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_BIG5_HKSCS       = { "BIG5-HKSCS", "BIG5-HKSCS:2004", "BIG5HKSCS", 0 };
73 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP950            = { "CP950", 0 };
74 
75 // Korean
76 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_2022_KR      = { "ISO-2022-KR", "CSISO2022KR", 0 };
77 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP949            = { "CP949", "UHC", 0 };
78 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_KR           = { "EUC-KR", "CSEUCKR", 0 };
79 
80 // Arabic
81 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_6       = { "ISO-8859-6", "ARABIC", "ASMO-708", "ECMA-114", "ISO-IR-127", "ISO8859-6", "ISO_8859-6", "ISO_8859-6:1987", "CSISOLATINARABIC", 0 };
82 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1256           = { "windows-1256", "CP1256", "MS-ARAB", 0 }; // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
83 
84 // Hebrew
85 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_8       = { "ISO-8859-8", "HEBREW", "ISO-8859-8", "ISO-IR-138", "ISO8859-8", "ISO_8859-8", "ISO_8859-8:1988", "CSISOLATINHEBREW", 0 };
86 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1255           = { "windows-1255", "CP1255", "MS-HEBR", 0 }; // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
87 
88 // Greek
89 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_7       = { "ISO-8859-7", "ECMA-118", "ELOT_928", "GREEK", "GREEK8", "ISO-IR-126", "ISO8859-7", "ISO_8859-7", "ISO_8859-7:1987", "ISO_8859-7:2003", "CSI", 0 };
90 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP869            = { "CP869", "869", "CP-GR", "IBM869", "CSIBM869", 0 };
91 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1253     = { "WINDOWS-1253", 0 };
92 
93 // Cyrillic
94 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_5       = { "ISO-8859-5", "CYRILLIC", "ISO-IR-144", "ISO8859-5", "ISO_8859-5", "ISO_8859-5:1988", "CSISOLATINCYRILLIC", 0 };
95 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_R           = { "KOI8-R", "CSKOI8R", 0 };
96 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP866            = { "CP866", "866", "IBM866", "CSIBM866", 0 };
97 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_KOI8_U           = { "KOI8-U", 0 };
98 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_WINDOWS_1251     = { "windows-1251", "CP1251", 0 }; // CP1251 added to pass /fast/encoding/charset-cp1251.html
99 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCYRILLIC      = { "mac-cyrillic", "MACCYRILLIC", "x-mac-cyrillic", 0 };
100 
101 // Thai
102 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP874            = { "CP874", "WINDOWS-874", 0 };
103 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_TIS_620          = { "TIS-620", 0 };
104 
105 // Simplified Chinese
106 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GBK              = { "GBK", 0 };
107 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_HZ               = { "HZ", "HZ-GB-2312", 0 };
108 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_GB18030          = { "GB18030", 0 };
109 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_EUC_CN           = { "EUC-CN", "EUCCN", "GB2312", "CN-GB", "CSGB2312", "EUC_CN", 0 };
110 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_2312_80          = { "GB_2312-80", "CHINESE", "csISO58GB231280", "GB2312.1980-0", "ISO-IR-58" };
111 
112 // Central European
113 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_2       = { "ISO-8859-2", "ISO-IR-101", "ISO8859-2", "ISO_8859-2", "ISO_8859-2:1987", "L2", "LATIN2", "CSISOLATIN2", 0 };
114 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1250           = { "CP1250", "MS-EE", "WINDOWS-1250", 0 };
115 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_MACCENTRALEUROPE = { "MAC-CENTRALEUROPE", 0 };
116 
117 // Vietnamese
118 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1258           = { "CP1258", "WINDOWS-1258", 0 };
119 
120 // Turkish
121 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1254           = { "CP1254", "MS-TURK", "WINDOWS-1254", 0 };
122 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_9       = { "ISO-8859-9", "ISO-IR-148", "ISO8859-9", "ISO_8859-9", "ISO_8859-9:1989", "L5", "LATIN5", "CSISOLATIN5", 0 };
123 
124 // Baltic
125 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_CP1257           = { "CP1257", "WINBALTRIM", "WINDOWS-1257", 0 };
126 TextCodecGtk::codecAliasList TextCodecGtk::m_codecAliases_ISO_8859_4       = { "ISO-8859-4", "ISO-IR-110", "ISO8859-4", "ISO_8859-4", "ISO_8859-4:1988", "L4", "LATIN4", "CSISOLATIN4", 0 };
127 
// Alias tables of the "base" codec set. registerEncodingNames() and
// registerCodecs() iterate this array when called with extended == false.
// Each entry is the address of one codecAliasList defined above.
gconstpointer const TextCodecGtk::m_iconvBaseCodecList[] = {
    // Unicode
    &m_codecAliases_UTF_8,

    // Western
    &m_codecAliases_ISO_8859_1
};
135 
// Alias tables of the "extended" codec set. registerEncodingNames() and
// registerCodecs() iterate this array when called with extended == true.
// Each entry is the address of one codecAliasList defined above.
gconstpointer const TextCodecGtk::m_iconvExtendedCodecList[] =
{
    // Western
    &m_codecAliases_MACROMAN,

    // Japanese
    &m_codecAliases_SHIFT_JIS,
    &m_codecAliases_EUC_JP,
    &m_codecAliases_ISO_2022_JP,

    // Traditional Chinese
    &m_codecAliases_BIG5,
    &m_codecAliases_BIG5_HKSCS,
    &m_codecAliases_CP950,

    // Korean
    &m_codecAliases_ISO_2022_KR,
    &m_codecAliases_CP949,
    &m_codecAliases_EUC_KR,

    // Arabic
    &m_codecAliases_ISO_8859_6,
    &m_codecAliases_CP1256,

    // Hebrew
    &m_codecAliases_ISO_8859_8,
    &m_codecAliases_CP1255,

    // Greek
    &m_codecAliases_ISO_8859_7,
    &m_codecAliases_CP869,
    &m_codecAliases_WINDOWS_1253,

    // Cyrillic
    &m_codecAliases_ISO_8859_5,
    &m_codecAliases_KOI8_R,
    &m_codecAliases_CP866,
    &m_codecAliases_KOI8_U,
    &m_codecAliases_WINDOWS_1251,
    &m_codecAliases_MACCYRILLIC,

    // Thai
    &m_codecAliases_CP874,
    &m_codecAliases_TIS_620,

    // Simplified Chinese
    &m_codecAliases_GBK,
    &m_codecAliases_HZ,
    &m_codecAliases_GB18030,
    &m_codecAliases_EUC_CN,
    &m_codecAliases_2312_80,

    // Central European
    &m_codecAliases_ISO_8859_2,
    &m_codecAliases_CP1250,
    &m_codecAliases_MACCENTRALEUROPE,

    // Vietnamese
    &m_codecAliases_CP1258,

    // Turkish
    &m_codecAliases_CP1254,
    &m_codecAliases_ISO_8859_9,

    // Baltic
    &m_codecAliases_CP1257,
    &m_codecAliases_ISO_8859_4
};
204 
205 
// Buffer size constant, in bytes.
// NOTE(review): nothing in the visible part of this file references it - confirm whether it is still needed.
const size_t ConversionBufferSize = 16384;
207 
208 
// Factory callback passed to the codec registrar in registerCodecs(): creates a
// TextCodecGtk for the given encoding. The second argument is ignored.
static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
{
    return new TextCodecGtk(encoding);
}
213 
isEncodingAvailable(const gchar * encName)214 gboolean TextCodecGtk::isEncodingAvailable(const gchar* encName)
215 {
216     GIConv tester;
217     // test decoding
218     tester = g_iconv_open(m_internalEncodingName, encName);
219     if (tester == reinterpret_cast<GIConv>(-1)) {
220         return false;
221     } else {
222         g_iconv_close(tester);
223         // test encoding
224         tester = g_iconv_open(encName, m_internalEncodingName);
225         if (tester == reinterpret_cast<GIConv>(-1)) {
226             return false;
227         } else {
228             g_iconv_close(tester);
229             return true;
230         }
231     }
232 }
233 
registerEncodingNames(EncodingNameRegistrar registrar,bool extended)234 void TextCodecGtk::registerEncodingNames(EncodingNameRegistrar registrar, bool extended)
235 {
236     const void* const* encodingList;
237     unsigned int listLength = 0;
238     if (extended) {
239         encodingList = m_iconvExtendedCodecList;
240         listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer);
241     } else {
242         encodingList = m_iconvBaseCodecList;
243         listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer);
244     }
245 
246     for (unsigned int i = 0; i < listLength; ++i) {
247         codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]);
248 
249         // Our convention is, the first entry in codecAliases is the canonical name,
250         // see above in the list of declarations.
251         // Probe GLib for this one first. If it's not available, we skip the whole group of aliases.
252 
253         int codecCount = 0;
254         const char *canonicalName;
255         canonicalName = (*codecAliases)[codecCount];
256 
257         if(!isEncodingAvailable(canonicalName)) {
258             LOG(TextConversion, "Canonical encoding %s not available, skipping.", canonicalName);
259             continue;
260         }
261         registrar(canonicalName, canonicalName);
262 
263         const char *currentAlias;
264         while ((currentAlias = (*codecAliases)[++codecCount])) {
265             if (isEncodingAvailable(currentAlias)) {
266                 LOG(TextConversion, "Registering encoding name alias %s to canonical %s", currentAlias, canonicalName);
267                 registrar(currentAlias, canonicalName);
268             }
269         }
270 
271     }
272 }
273 
registerCodecs(TextCodecRegistrar registrar,bool extended)274 void TextCodecGtk::registerCodecs(TextCodecRegistrar registrar, bool extended)
275 {
276     const void* const* encodingList;
277     unsigned int listLength = 0;
278     if (extended) {
279         encodingList = m_iconvExtendedCodecList;
280         listLength = sizeof(m_iconvExtendedCodecList)/sizeof(gpointer);
281     } else {
282         encodingList = m_iconvBaseCodecList;
283         listLength = sizeof(m_iconvBaseCodecList)/sizeof(gpointer);
284     }
285 
286     for (unsigned int i = 0; i < listLength; ++i) {
287         codecAliasList *codecAliases = static_cast<codecAliasList*>(encodingList[i]);
288         // by convention, the first "alias" should be the canonical name, see the definition of the alias lists
289         const gchar *codecName = (*codecAliases)[0];
290         if (isEncodingAvailable(codecName))
291             registrar(codecName, newTextCodecGtk, 0);
292     }
293 }
294 
// Registers the encoding names and aliases of the base codec set.
void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
{
    registerEncodingNames(registrar, false);
}
299 
// Registers codec factories for the base codec set.
void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
{
    registerCodecs(registrar, false);
}
304 
// Registers the encoding names and aliases of the extended codec set.
void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
{
    registerEncodingNames(registrar, true);
}
309 
// Registers codec factories for the extended codec set.
void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
{
    registerCodecs(registrar, true);
}
314 
// Stores the target encoding and starts with no buffered bytes. Both iconv
// handles start out as (GIConv)-1, the "not opened" marker; decode() and
// encode() open them on first use.
TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
    : m_encoding(encoding)
    , m_numBufferedBytes(0)
    , m_iconvDecoder(reinterpret_cast<GIConv>(-1))
    , m_iconvEncoder(reinterpret_cast<GIConv>(-1))
{
}
322 
~TextCodecGtk()323 TextCodecGtk::~TextCodecGtk()
324 {
325     if (m_iconvDecoder != reinterpret_cast<GIConv>(-1)) {
326         g_iconv_close(m_iconvDecoder);
327         m_iconvDecoder = reinterpret_cast<GIConv>(-1);
328     }
329     if (m_iconvEncoder != reinterpret_cast<GIConv>(-1)) {
330         g_iconv_close(m_iconvEncoder);
331         m_iconvEncoder = reinterpret_cast<GIConv>(-1);
332     }
333 }
334 
// Opens the converter from m_encoding to the internal UTF-16 encoding and
// stores it in m_iconvDecoder. Must only be called while no decoder is open.
// On failure g_iconv_open() yields (GIConv)-1, which callers have to check.
void TextCodecGtk::createIConvDecoder() const
{
    ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1));

    m_iconvDecoder = g_iconv_open(m_internalEncodingName, m_encoding.name());
}
341 
createIConvEncoder() const342 void TextCodecGtk::createIConvEncoder() const
343 {
344     ASSERT(m_iconvDecoder == reinterpret_cast<GIConv>(-1));
345 
346     m_iconvEncoder = g_iconv_open(m_encoding.name(), m_internalEncodingName);
347 }
348 
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)349 String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
350 {
351     // Get a converter for the passed-in encoding.
352     if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) {
353         createIConvDecoder();
354         ASSERT(m_iconvDecoder != reinterpret_cast<GIConv>(-1));
355         if (m_iconvDecoder == reinterpret_cast<GIConv>(-1)) {
356             LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
357             return String();
358         }
359     }
360 
361     size_t countWritten, countRead, conversionLength;
362     const char* conversionBytes;
363     char* prefixedBytes = 0;
364 
365     if (m_numBufferedBytes) {
366         conversionLength = length + m_numBufferedBytes;
367         prefixedBytes = static_cast<char*>(fastMalloc(conversionLength));
368         memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
369         memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
370 
371         conversionBytes = prefixedBytes;
372 
373         // all buffered bytes are consumed now
374         m_numBufferedBytes = 0;
375     } else {
376         // no previously buffered partial data,
377         // just convert the data that was passed in
378         conversionBytes = bytes;
379         conversionLength = length;
380     }
381 
382     GOwnPtr<GError> err;
383     GOwnPtr<UChar> buffer;
384 
385     buffer.outPtr() = reinterpret_cast<UChar*>(g_convert_with_iconv(conversionBytes, conversionLength, m_iconvDecoder, &countRead, &countWritten, &err.outPtr()));
386 
387 
388     if (err) {
389         LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message);
390         m_numBufferedBytes = 0; // reset state for subsequent calls to decode
391         fastFree(prefixedBytes);
392         sawError = true;
393         return String();
394     }
395 
396     // Partial input at the end of the string may not result in an error being raised.
397     // From the gnome library documentation on g_convert_with_iconv:
398     // "Even if the conversion was successful, this may be less than len if there were partial characters at the end of the input."
399     // That's why we need to compare conversionLength against countRead
400 
401     m_numBufferedBytes = conversionLength - countRead;
402     if (m_numBufferedBytes > 0) {
403         if (flush) {
404             LOG_ERROR("Partial bytes at end of input while flush requested.");
405             m_numBufferedBytes = 0; // reset state for subsequent calls to decode
406             fastFree(prefixedBytes);
407             sawError = true;
408             return String();
409         }
410         memcpy(m_bufferedBytes, conversionBytes + countRead, m_numBufferedBytes);
411     }
412 
413     fastFree(prefixedBytes);
414 
415     Vector<UChar> result;
416 
417     result.append(buffer.get(), countWritten / sizeof(UChar));
418 
419     return String::adopt(result);
420 }
421 
encode(const UChar * characters,size_t length,UnencodableHandling handling)422 CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
423 {
424     if (!length)
425         return "";
426 
427     if (m_iconvEncoder == reinterpret_cast<GIConv>(-1))
428         createIConvEncoder();
429     if (m_iconvEncoder == reinterpret_cast<GIConv>(-1))
430         return CString();
431 
432     size_t count;
433 
434     GOwnPtr<GError> err;
435     GOwnPtr<char> buffer;
436 
437     buffer.outPtr() = g_convert_with_iconv(reinterpret_cast<const char*>(characters), length * sizeof(UChar), m_iconvEncoder, 0, &count, &err.outPtr());
438     if (err) {
439         LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", err->code, err->message);
440         return CString();
441     }
442 
443     return CString(buffer.get(), count);
444 }
445 
446 } // namespace WebCore
447