• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5  * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
20  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "config.h"
30 #include "TextCodecGtk.h"
31 
32 #include <gio/gio.h>
33 #include "GOwnPtr.h"
34 #include "Logging.h"
35 #include "PlatformString.h"
36 #include <wtf/Assertions.h>
37 #include <wtf/HashMap.h>
38 #include <wtf/text/CString.h>
39 
40 using std::min;
41 
42 namespace WebCore {
43 
44 // TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380).
45 // That's why we need to avoid generating extra BOM's for the conversion result.
46 // This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib.
47 
48 #if (G_BYTE_ORDER == G_BIG_ENDIAN)
49 static const gchar* internalEncodingName = "UTF-16BE";
50 #else
51 static const gchar* internalEncodingName = "UTF-16LE";
52 #endif
53 
54 
55 const size_t ConversionBufferSize = 16384;
56 
57 
newTextCodecGtk(const TextEncoding & encoding,const void *)58 static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
59 {
60     return new TextCodecGtk(encoding);
61 }
62 
isEncodingAvailable(const gchar * encodingName)63 static bool isEncodingAvailable(const gchar* encodingName)
64 {
65     GIConv tester;
66     // test decoding
67     tester = g_iconv_open(internalEncodingName, encodingName);
68     if (tester == reinterpret_cast<GIConv>(-1)) {
69         return false;
70     } else {
71         g_iconv_close(tester);
72         // test encoding
73         tester = g_iconv_open(encodingName, internalEncodingName);
74         if (tester == reinterpret_cast<GIConv>(-1)) {
75             return false;
76         } else {
77             g_iconv_close(tester);
78             return true;
79         }
80     }
81 }
82 
registerEncodingNameIfAvailable(EncodingNameRegistrar registrar,const char * canonicalName)83 static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName)
84 {
85     if (isEncodingAvailable(canonicalName)) {
86         registrar(canonicalName, canonicalName);
87         return true;
88     }
89 
90     return false;
91 }
92 
registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar,const char * canonicalName,const char * aliasName)93 static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName)
94 {
95     if (isEncodingAvailable(aliasName))
96         registrar(aliasName, canonicalName);
97 }
98 
registerCodecIfAvailable(TextCodecRegistrar registrar,const char * codecName)99 static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName)
100 {
101     if (isEncodingAvailable(codecName))
102         registrar(codecName, newTextCodecGtk, 0);
103 }
104 
registerBaseEncodingNames(EncodingNameRegistrar registrar)105 void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
106 {
107     // Unicode
108     registerEncodingNameIfAvailable(registrar, "UTF-8");
109     registerEncodingNameIfAvailable(registrar, "UTF-32");
110     registerEncodingNameIfAvailable(registrar, "UTF-32BE");
111     registerEncodingNameIfAvailable(registrar, "UTF-32LE");
112 
113     // Western
114     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) {
115         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819");
116         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819");
117         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100");
118         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1");
119         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1");
120         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987");
121         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1");
122         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1");
123         registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1");
124     }
125 }
126 
registerBaseCodecs(TextCodecRegistrar registrar)127 void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
128 {
129     // Unicode
130     registerCodecIfAvailable(registrar, "UTF-8");
131     registerCodecIfAvailable(registrar, "UTF-32");
132     registerCodecIfAvailable(registrar, "UTF-32BE");
133     registerCodecIfAvailable(registrar, "UTF-32LE");
134 
135     // Western
136     registerCodecIfAvailable(registrar, "ISO-8859-1");
137 }
138 
registerExtendedEncodingNames(EncodingNameRegistrar registrar)139 void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
140 {
141     // Western
142     if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) {
143         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC");
144         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH");
145         registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH");
146     }
147 
148     // Japanese
149     if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) {
150         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI");
151         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS");
152         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS");
153         registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS");
154     }
155     if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) {
156         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP");
157         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP");
158         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE");
159         registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE");
160     }
161     registerEncodingNameIfAvailable(registrar, "ISO-2022-JP");
162 
163     // Traditional Chinese
164     if (registerEncodingNameIfAvailable(registrar, "BIG5")) {
165         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5");
166         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE");
167         registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE");
168         registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5");
169         registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5");
170     }
171     if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) {
172         registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004");
173         registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS");
174     }
175     registerEncodingNameIfAvailable(registrar, "CP950");
176 
177     // Korean
178     if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR"))
179         registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR");
180     if (registerEncodingNameIfAvailable(registrar, "CP949"))
181         registerEncodingAliasIfAvailable(registrar, "CP949", "UHC");
182     if (registerEncodingNameIfAvailable(registrar, "EUC-KR"))
183         registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR");
184 
185     // Arabic
186     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) {
187         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC");
188         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708");
189         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114");
190         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127");
191         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6");
192         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6");
193         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987");
194         registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC");
195     }
196     // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
197     if (registerEncodingNameIfAvailable(registrar, "windows-1256")) {
198         registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256");
199         registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB");
200     }
201 
202     // Hebrew
203     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) {
204         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW");
205         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8");
206         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138");
207         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8");
208         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8");
209         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988");
210         registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW");
211     }
212     // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
213     if (registerEncodingNameIfAvailable(registrar, "windows-1255")) {
214         registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255");
215         registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR");
216     }
217 
218     // Greek
219     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) {
220         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118");
221         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928");
222         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK");
223         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8");
224         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126");
225         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7");
226         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7");
227         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987");
228         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003");
229         registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI");
230     }
231     if (registerEncodingNameIfAvailable(registrar, "CP869")) {
232         registerEncodingAliasIfAvailable(registrar, "CP869", "869");
233         registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR");
234         registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869");
235         registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869");
236     }
237     registerEncodingNameIfAvailable(registrar, "WINDOWS-1253");
238 
239     // Cyrillic
240     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) {
241         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC");
242         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144");
243         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5");
244         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5");
245         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988");
246         registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC");
247     }
248     if (registerEncodingNameIfAvailable(registrar, "KOI8-R"))
249         registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R");
250     if (registerEncodingNameIfAvailable(registrar, "CP866")) {
251         registerEncodingAliasIfAvailable(registrar, "CP866", "866");
252         registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866");
253         registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866");
254     }
255     registerEncodingNameIfAvailable(registrar, "KOI8-U");
256     // CP1251 added to pass /fast/encoding/charset-cp1251.html
257     if (registerEncodingNameIfAvailable(registrar, "windows-1251"))
258         registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251");
259     if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) {
260         registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC");
261         registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic");
262     }
263 
264     // Thai
265     if (registerEncodingNameIfAvailable(registrar, "CP874"))
266         registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874");
267     registerEncodingNameIfAvailable(registrar, "TIS-620");
268 
269     // Simplified Chinese
270     registerEncodingNameIfAvailable(registrar, "GBK");
271     if (registerEncodingNameIfAvailable(registrar, "HZ"))
272         registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312");
273     registerEncodingNameIfAvailable(registrar, "GB18030");
274     if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) {
275         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN");
276         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312");
277         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB");
278         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312");
279         registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN");
280     }
281     if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) {
282         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE");
283         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280");
284         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0");
285         registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58");
286     }
287 
288     // Central European
289     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) {
290         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101");
291         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2");
292         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2");
293         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987");
294         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2");
295         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2");
296         registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2");
297     }
298     if (registerEncodingNameIfAvailable(registrar, "CP1250")) {
299         registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE");
300         registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250");
301     }
302     registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE");
303 
304     // Vietnamese
305     if (registerEncodingNameIfAvailable(registrar, "CP1258"))
306         registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258");
307 
308     // Turkish
309     if (registerEncodingNameIfAvailable(registrar, "CP1254")) {
310         registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK");
311         registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254");
312     }
313     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) {
314         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148");
315         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9");
316         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9");
317         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989");
318         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5");
319         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5");
320         registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5");
321     }
322 
323     // Baltic
324     if (registerEncodingNameIfAvailable(registrar, "CP1257")) {
325         registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM");
326         registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257");
327     }
328     if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) {
329         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110");
330         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4");
331         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4");
332         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988");
333         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4");
334         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4");
335         registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4");
336     }
337 }
338 
registerExtendedCodecs(TextCodecRegistrar registrar)339 void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
340 {
341     // Western
342     registerCodecIfAvailable(registrar, "MACROMAN");
343 
344     // Japanese
345     registerCodecIfAvailable(registrar, "Shift_JIS");
346     registerCodecIfAvailable(registrar, "EUC-JP");
347     registerCodecIfAvailable(registrar, "ISO-2022-JP");
348 
349     // Traditional Chinese
350     registerCodecIfAvailable(registrar, "BIG5");
351     registerCodecIfAvailable(registrar, "BIG5-HKSCS");
352     registerCodecIfAvailable(registrar, "CP950");
353 
354     // Korean
355     registerCodecIfAvailable(registrar, "ISO-2022-KR");
356     registerCodecIfAvailable(registrar, "CP949");
357     registerCodecIfAvailable(registrar, "EUC-KR");
358 
359     // Arabic
360     registerCodecIfAvailable(registrar, "ISO-8859-6");
361     // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
362     registerCodecIfAvailable(registrar, "windows-1256");
363 
364     // Hebrew
365     registerCodecIfAvailable(registrar, "ISO-8859-8");
366     // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
367     registerCodecIfAvailable(registrar, "windows-1255");
368 
369     // Greek
370     registerCodecIfAvailable(registrar, "ISO-8859-7");
371     registerCodecIfAvailable(registrar, "CP869");
372     registerCodecIfAvailable(registrar, "WINDOWS-1253");
373 
374     // Cyrillic
375     registerCodecIfAvailable(registrar, "ISO-8859-5");
376     registerCodecIfAvailable(registrar, "KOI8-R");
377     registerCodecIfAvailable(registrar, "CP866");
378     registerCodecIfAvailable(registrar, "KOI8-U");
379     // CP1251 added to pass /fast/encoding/charset-cp1251.html
380     registerCodecIfAvailable(registrar, "windows-1251");
381     registerCodecIfAvailable(registrar, "mac-cyrillic");
382 
383     // Thai
384     registerCodecIfAvailable(registrar, "CP874");
385     registerCodecIfAvailable(registrar, "TIS-620");
386 
387     // Simplified Chinese
388     registerCodecIfAvailable(registrar, "GBK");
389     registerCodecIfAvailable(registrar, "HZ");
390     registerCodecIfAvailable(registrar, "GB18030");
391     registerCodecIfAvailable(registrar, "EUC-CN");
392     registerCodecIfAvailable(registrar, "GB_2312-80");
393 
394     // Central European
395     registerCodecIfAvailable(registrar, "ISO-8859-2");
396     registerCodecIfAvailable(registrar, "CP1250");
397     registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE");
398 
399     // Vietnamese
400     registerCodecIfAvailable(registrar, "CP1258");
401 
402     // Turkish
403     registerCodecIfAvailable(registrar, "CP1254");
404     registerCodecIfAvailable(registrar, "ISO-8859-9");
405 
406     // Baltic
407     registerCodecIfAvailable(registrar, "CP1257");
408     registerCodecIfAvailable(registrar, "ISO-8859-4");
409 }
410 
TextCodecGtk(const TextEncoding & encoding)411 TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
412     : m_encoding(encoding)
413     , m_numBufferedBytes(0)
414 {
415 }
416 
~TextCodecGtk()417 TextCodecGtk::~TextCodecGtk()
418 {
419 }
420 
createIConvDecoder() const421 void TextCodecGtk::createIConvDecoder() const
422 {
423     ASSERT(!m_iconvDecoder);
424 
425     m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0));
426 }
427 
createIConvEncoder() const428 void TextCodecGtk::createIConvEncoder() const
429 {
430     ASSERT(!m_iconvEncoder);
431 
432     m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0));
433 }
434 
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)435 String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
436 {
437     // Get a converter for the passed-in encoding.
438     if (!m_iconvDecoder)
439         createIConvDecoder();
440     if (!m_iconvDecoder) {
441         LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
442         return String();
443     }
444 
445     Vector<UChar> result;
446 
447     gsize bytesRead = 0;
448     gsize bytesWritten = 0;
449     const gchar* input = bytes;
450     gsize inputLength = length;
451     gchar buffer[ConversionBufferSize];
452     int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS;
453     if (flush)
454         flags |= G_CONVERTER_FLUSH;
455 
456     bool bufferWasFull = false;
457     char* prefixedBytes = 0;
458 
459     if (m_numBufferedBytes) {
460         inputLength = length + m_numBufferedBytes;
461         prefixedBytes = static_cast<char*>(fastMalloc(inputLength));
462         memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
463         memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
464 
465         input = prefixedBytes;
466 
467         // all buffered bytes are consumed now
468         m_numBufferedBytes = 0;
469     }
470 
471     do {
472         GOwnPtr<GError> error;
473         GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()),
474                                                    input, inputLength,
475                                                    buffer, sizeof(buffer),
476                                                    static_cast<GConverterFlags>(flags),
477                                                    &bytesRead, &bytesWritten,
478                                                    &error.outPtr());
479         input += bytesRead;
480         inputLength -= bytesRead;
481 
482         if (res == G_CONVERTER_ERROR) {
483             if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
484                 // There is not enough input to fully determine what the conversion should produce,
485                 // save it to a buffer to prepend it to the next input.
486                 memcpy(m_bufferedBytes, input, inputLength);
487                 m_numBufferedBytes = inputLength;
488                 inputLength = 0;
489             } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE))
490                 bufferWasFull = true;
491             else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
492                 if (stopOnError)
493                     sawError = true;
494                 if (inputLength) {
495                     // Ignore invalid character.
496                     input += 1;
497                     inputLength -= 1;
498                 }
499             } else {
500                 sawError = true;
501                 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
502                 m_numBufferedBytes = 0; // Reset state for subsequent calls to decode.
503                 fastFree(prefixedBytes);
504                 return String();
505             }
506         }
507 
508         result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar));
509     } while ((inputLength || bufferWasFull) && !sawError);
510 
511     fastFree(prefixedBytes);
512 
513     return String::adopt(result);
514 }
515 
encode(const UChar * characters,size_t length,UnencodableHandling handling)516 CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
517 {
518     if (!length)
519         return "";
520 
521     if (!m_iconvEncoder)
522         createIConvEncoder();
523     if (!m_iconvEncoder) {
524         LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
525         return CString();
526     }
527 
528     gsize bytesRead = 0;
529     gsize bytesWritten = 0;
530     const gchar* input = reinterpret_cast<const char*>(characters);
531     gsize inputLength = length * sizeof(UChar);
532     gchar buffer[ConversionBufferSize];
533     Vector<char> result;
534     GOwnPtr<GError> error;
535 
536     size_t size = 0;
537     do {
538         g_converter_convert(G_CONVERTER(m_iconvEncoder.get()),
539                             input, inputLength,
540                             buffer, sizeof(buffer),
541                             G_CONVERTER_INPUT_AT_END,
542                             &bytesRead, &bytesWritten,
543                             &error.outPtr());
544         input += bytesRead;
545         inputLength -= bytesRead;
546         if (bytesWritten > 0) {
547             result.grow(size + bytesWritten);
548             memcpy(result.data() + size, buffer, bytesWritten);
549             size += bytesWritten;
550         }
551 
552         if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
553             UChar codePoint = reinterpret_cast<const UChar*>(input)[0];
554             UnencodableReplacementArray replacement;
555             int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
556 
557             // Consume the invalid character.
558             input += sizeof(UChar);
559             inputLength -= sizeof(UChar);
560 
561             // Append replacement string to result buffer.
562             result.grow(size + replacementLength);
563             memcpy(result.data() + size, replacement, replacementLength);
564             size += replacementLength;
565 
566             error.clear();
567         }
568     } while (inputLength && !error.get());
569 
570     if (error) {
571         LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
572         return CString();
573     }
574 
575     return CString(result.data(), size);
576 }
577 
578 } // namespace WebCore
579