1 /*
2 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5 * Copyright (C) 2009 Dominik Röttsches <dominik.roettsches@access-company.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
20 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30 #include "TextCodecGtk.h"
31
32 #include <gio/gio.h>
33 #include "GOwnPtr.h"
34 #include "Logging.h"
35 #include "PlatformString.h"
36 #include <wtf/Assertions.h>
37 #include <wtf/HashMap.h>
38 #include <wtf/text/CString.h>
39
40 using std::min;
41
42 namespace WebCore {
43
44 // TextCodec's appendOmittingBOM() is gone (http://trac.webkit.org/changeset/33380).
45 // That's why we need to avoid generating extra BOM's for the conversion result.
46 // This can be achieved by specifying the UTF-16 codecs' endianness explicitly when initializing GLib.
47
48 #if (G_BYTE_ORDER == G_BIG_ENDIAN)
49 static const gchar* internalEncodingName = "UTF-16BE";
50 #else
51 static const gchar* internalEncodingName = "UTF-16LE";
52 #endif
53
54
55 const size_t ConversionBufferSize = 16384;
56
57
newTextCodecGtk(const TextEncoding & encoding,const void *)58 static PassOwnPtr<TextCodec> newTextCodecGtk(const TextEncoding& encoding, const void*)
59 {
60 return new TextCodecGtk(encoding);
61 }
62
isEncodingAvailable(const gchar * encodingName)63 static bool isEncodingAvailable(const gchar* encodingName)
64 {
65 GIConv tester;
66 // test decoding
67 tester = g_iconv_open(internalEncodingName, encodingName);
68 if (tester == reinterpret_cast<GIConv>(-1)) {
69 return false;
70 } else {
71 g_iconv_close(tester);
72 // test encoding
73 tester = g_iconv_open(encodingName, internalEncodingName);
74 if (tester == reinterpret_cast<GIConv>(-1)) {
75 return false;
76 } else {
77 g_iconv_close(tester);
78 return true;
79 }
80 }
81 }
82
registerEncodingNameIfAvailable(EncodingNameRegistrar registrar,const char * canonicalName)83 static bool registerEncodingNameIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName)
84 {
85 if (isEncodingAvailable(canonicalName)) {
86 registrar(canonicalName, canonicalName);
87 return true;
88 }
89
90 return false;
91 }
92
registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar,const char * canonicalName,const char * aliasName)93 static void registerEncodingAliasIfAvailable(EncodingNameRegistrar registrar, const char* canonicalName, const char* aliasName)
94 {
95 if (isEncodingAvailable(aliasName))
96 registrar(aliasName, canonicalName);
97 }
98
registerCodecIfAvailable(TextCodecRegistrar registrar,const char * codecName)99 static void registerCodecIfAvailable(TextCodecRegistrar registrar, const char* codecName)
100 {
101 if (isEncodingAvailable(codecName))
102 registrar(codecName, newTextCodecGtk, 0);
103 }
104
registerBaseEncodingNames(EncodingNameRegistrar registrar)105 void TextCodecGtk::registerBaseEncodingNames(EncodingNameRegistrar registrar)
106 {
107 // Unicode
108 registerEncodingNameIfAvailable(registrar, "UTF-8");
109 registerEncodingNameIfAvailable(registrar, "UTF-32");
110 registerEncodingNameIfAvailable(registrar, "UTF-32BE");
111 registerEncodingNameIfAvailable(registrar, "UTF-32LE");
112
113 // Western
114 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-1")) {
115 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CP819");
116 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "IBM819");
117 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO-IR-100");
118 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO8859-1");
119 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1");
120 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "ISO_8859-1:1987");
121 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "L1");
122 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "LATIN1");
123 registerEncodingAliasIfAvailable(registrar, "ISO-8859-1", "CSISOLATIN1");
124 }
125 }
126
registerBaseCodecs(TextCodecRegistrar registrar)127 void TextCodecGtk::registerBaseCodecs(TextCodecRegistrar registrar)
128 {
129 // Unicode
130 registerCodecIfAvailable(registrar, "UTF-8");
131 registerCodecIfAvailable(registrar, "UTF-32");
132 registerCodecIfAvailable(registrar, "UTF-32BE");
133 registerCodecIfAvailable(registrar, "UTF-32LE");
134
135 // Western
136 registerCodecIfAvailable(registrar, "ISO-8859-1");
137 }
138
registerExtendedEncodingNames(EncodingNameRegistrar registrar)139 void TextCodecGtk::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
140 {
141 // Western
142 if (registerEncodingNameIfAvailable(registrar, "MACROMAN")) {
143 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MAC");
144 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "MACINTOSH");
145 registerEncodingAliasIfAvailable(registrar, "MACROMAN", "CSMACINTOSH");
146 }
147
148 // Japanese
149 if (registerEncodingNameIfAvailable(registrar, "Shift_JIS")) {
150 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "MS_KANJI");
151 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SHIFT-JIS");
152 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "SJIS");
153 registerEncodingAliasIfAvailable(registrar, "Shift_JIS", "CSSHIFTJIS");
154 }
155 if (registerEncodingNameIfAvailable(registrar, "EUC-JP")) {
156 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUC_JP");
157 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EUCJP");
158 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "EXTENDED_UNIX_CODE_PACKED_FORMAT_FOR_JAPANESE");
159 registerEncodingAliasIfAvailable(registrar, "EUC-JP", "CSEUCPKDFMTJAPANESE");
160 }
161 registerEncodingNameIfAvailable(registrar, "ISO-2022-JP");
162
163 // Traditional Chinese
164 if (registerEncodingNameIfAvailable(registrar, "BIG5")) {
165 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-5");
166 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIG-FIVE");
167 registerEncodingAliasIfAvailable(registrar, "BIG5", "BIGFIVE");
168 registerEncodingAliasIfAvailable(registrar, "BIG5", "CN-BIG5");
169 registerEncodingAliasIfAvailable(registrar, "BIG5", "CSBIG5");
170 }
171 if (registerEncodingNameIfAvailable(registrar, "BIG5-HKSCS")) {
172 registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5-HKSCS:2004");
173 registerEncodingAliasIfAvailable(registrar, "BIG5-HKSCS", "BIG5HKSCS");
174 }
175 registerEncodingNameIfAvailable(registrar, "CP950");
176
177 // Korean
178 if (registerEncodingNameIfAvailable(registrar, "ISO-2022-KR"))
179 registerEncodingAliasIfAvailable(registrar, "ISO-2022-KR", "CSISO2022KR");
180 if (registerEncodingNameIfAvailable(registrar, "CP949"))
181 registerEncodingAliasIfAvailable(registrar, "CP949", "UHC");
182 if (registerEncodingNameIfAvailable(registrar, "EUC-KR"))
183 registerEncodingAliasIfAvailable(registrar, "EUC-KR", "CSEUCKR");
184
185 // Arabic
186 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-6")) {
187 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ARABIC");
188 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ASMO-708");
189 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ECMA-114");
190 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO-IR-127");
191 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO8859-6");
192 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6");
193 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "ISO_8859-6:1987");
194 registerEncodingAliasIfAvailable(registrar, "ISO-8859-6", "CSISOLATINARABIC");
195 }
196 // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
197 if (registerEncodingNameIfAvailable(registrar, "windows-1256")) {
198 registerEncodingAliasIfAvailable(registrar, "windows-1256", "CP1256");
199 registerEncodingAliasIfAvailable(registrar, "windows-1256", "MS-ARAB");
200 }
201
202 // Hebrew
203 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-8")) {
204 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "HEBREW");
205 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-8859-8");
206 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO-IR-138");
207 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO8859-8");
208 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8");
209 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "ISO_8859-8:1988");
210 registerEncodingAliasIfAvailable(registrar, "ISO-8859-8", "CSISOLATINHEBREW");
211 }
212 // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
213 if (registerEncodingNameIfAvailable(registrar, "windows-1255")) {
214 registerEncodingAliasIfAvailable(registrar, "windows-1255", "CP1255");
215 registerEncodingAliasIfAvailable(registrar, "windows-1255", "MS-HEBR");
216 }
217
218 // Greek
219 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-7")) {
220 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ECMA-118");
221 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ELOT_928");
222 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK");
223 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "GREEK8");
224 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO-IR-126");
225 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO8859-7");
226 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7");
227 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:1987");
228 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "ISO_8859-7:2003");
229 registerEncodingAliasIfAvailable(registrar, "ISO-8859-7", "CSI");
230 }
231 if (registerEncodingNameIfAvailable(registrar, "CP869")) {
232 registerEncodingAliasIfAvailable(registrar, "CP869", "869");
233 registerEncodingAliasIfAvailable(registrar, "CP869", "CP-GR");
234 registerEncodingAliasIfAvailable(registrar, "CP869", "IBM869");
235 registerEncodingAliasIfAvailable(registrar, "CP869", "CSIBM869");
236 }
237 registerEncodingNameIfAvailable(registrar, "WINDOWS-1253");
238
239 // Cyrillic
240 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-5")) {
241 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CYRILLIC");
242 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO-IR-144");
243 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO8859-5");
244 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5");
245 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "ISO_8859-5:1988");
246 registerEncodingAliasIfAvailable(registrar, "ISO-8859-5", "CSISOLATINCYRILLIC");
247 }
248 if (registerEncodingNameIfAvailable(registrar, "KOI8-R"))
249 registerEncodingAliasIfAvailable(registrar, "KOI8-R", "CSKOI8R");
250 if (registerEncodingNameIfAvailable(registrar, "CP866")) {
251 registerEncodingAliasIfAvailable(registrar, "CP866", "866");
252 registerEncodingAliasIfAvailable(registrar, "CP866", "IBM866");
253 registerEncodingAliasIfAvailable(registrar, "CP866", "CSIBM866");
254 }
255 registerEncodingNameIfAvailable(registrar, "KOI8-U");
256 // CP1251 added to pass /fast/encoding/charset-cp1251.html
257 if (registerEncodingNameIfAvailable(registrar, "windows-1251"))
258 registerEncodingAliasIfAvailable(registrar, "windows-1251", "CP1251");
259 if (registerEncodingNameIfAvailable(registrar, "mac-cyrillic")) {
260 registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "MACCYRILLIC");
261 registerEncodingAliasIfAvailable(registrar, "mac-cyrillic", "x-mac-cyrillic");
262 }
263
264 // Thai
265 if (registerEncodingNameIfAvailable(registrar, "CP874"))
266 registerEncodingAliasIfAvailable(registrar, "CP874", "WINDOWS-874");
267 registerEncodingNameIfAvailable(registrar, "TIS-620");
268
269 // Simplified Chinese
270 registerEncodingNameIfAvailable(registrar, "GBK");
271 if (registerEncodingNameIfAvailable(registrar, "HZ"))
272 registerEncodingAliasIfAvailable(registrar, "HZ", "HZ-GB-2312");
273 registerEncodingNameIfAvailable(registrar, "GB18030");
274 if (registerEncodingNameIfAvailable(registrar, "EUC-CN")) {
275 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUCCN");
276 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "GB2312");
277 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CN-GB");
278 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "CSGB2312");
279 registerEncodingAliasIfAvailable(registrar, "EUC-CN", "EUC_CN");
280 }
281 if (registerEncodingNameIfAvailable(registrar, "GB_2312-80")) {
282 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "CHINESE");
283 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "csISO58GB231280");
284 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "GB2312.1980-0");
285 registerEncodingAliasIfAvailable(registrar, "GB_2312-80", "ISO-IR-58");
286 }
287
288 // Central European
289 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-2")) {
290 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO-IR-101");
291 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO8859-2");
292 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2");
293 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "ISO_8859-2:1987");
294 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "L2");
295 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "LATIN2");
296 registerEncodingAliasIfAvailable(registrar, "ISO-8859-2", "CSISOLATIN2");
297 }
298 if (registerEncodingNameIfAvailable(registrar, "CP1250")) {
299 registerEncodingAliasIfAvailable(registrar, "CP1250", "MS-EE");
300 registerEncodingAliasIfAvailable(registrar, "CP1250", "WINDOWS-1250");
301 }
302 registerEncodingNameIfAvailable(registrar, "MAC-CENTRALEUROPE");
303
304 // Vietnamese
305 if (registerEncodingNameIfAvailable(registrar, "CP1258"))
306 registerEncodingAliasIfAvailable(registrar, "CP1258", "WINDOWS-1258");
307
308 // Turkish
309 if (registerEncodingNameIfAvailable(registrar, "CP1254")) {
310 registerEncodingAliasIfAvailable(registrar, "CP1254", "MS-TURK");
311 registerEncodingAliasIfAvailable(registrar, "CP1254", "WINDOWS-1254");
312 }
313 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-9")) {
314 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO-IR-148");
315 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO8859-9");
316 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9");
317 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "ISO_8859-9:1989");
318 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "L5");
319 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "LATIN5");
320 registerEncodingAliasIfAvailable(registrar, "ISO-8859-9", "CSISOLATIN5");
321 }
322
323 // Baltic
324 if (registerEncodingNameIfAvailable(registrar, "CP1257")) {
325 registerEncodingAliasIfAvailable(registrar, "CP1257", "WINBALTRIM");
326 registerEncodingAliasIfAvailable(registrar, "CP1257", "WINDOWS-1257");
327 }
328 if (registerEncodingNameIfAvailable(registrar, "ISO-8859-4")) {
329 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO-IR-110");
330 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO8859-4");
331 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4");
332 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "ISO_8859-4:1988");
333 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "L4");
334 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "LATIN4");
335 registerEncodingAliasIfAvailable(registrar, "ISO-8859-4", "CSISOLATIN4");
336 }
337 }
338
registerExtendedCodecs(TextCodecRegistrar registrar)339 void TextCodecGtk::registerExtendedCodecs(TextCodecRegistrar registrar)
340 {
341 // Western
342 registerCodecIfAvailable(registrar, "MACROMAN");
343
344 // Japanese
345 registerCodecIfAvailable(registrar, "Shift_JIS");
346 registerCodecIfAvailable(registrar, "EUC-JP");
347 registerCodecIfAvailable(registrar, "ISO-2022-JP");
348
349 // Traditional Chinese
350 registerCodecIfAvailable(registrar, "BIG5");
351 registerCodecIfAvailable(registrar, "BIG5-HKSCS");
352 registerCodecIfAvailable(registrar, "CP950");
353
354 // Korean
355 registerCodecIfAvailable(registrar, "ISO-2022-KR");
356 registerCodecIfAvailable(registrar, "CP949");
357 registerCodecIfAvailable(registrar, "EUC-KR");
358
359 // Arabic
360 registerCodecIfAvailable(registrar, "ISO-8859-6");
361 // rearranged, windows-1256 now declared the canonical name and put to lowercase to fix /fast/encoding/ahram-org-eg.html test case
362 registerCodecIfAvailable(registrar, "windows-1256");
363
364 // Hebrew
365 registerCodecIfAvailable(registrar, "ISO-8859-8");
366 // rearranged, moved windows-1255 as canonical and lowercased, fixing /fast/encoding/meta-charset.html
367 registerCodecIfAvailable(registrar, "windows-1255");
368
369 // Greek
370 registerCodecIfAvailable(registrar, "ISO-8859-7");
371 registerCodecIfAvailable(registrar, "CP869");
372 registerCodecIfAvailable(registrar, "WINDOWS-1253");
373
374 // Cyrillic
375 registerCodecIfAvailable(registrar, "ISO-8859-5");
376 registerCodecIfAvailable(registrar, "KOI8-R");
377 registerCodecIfAvailable(registrar, "CP866");
378 registerCodecIfAvailable(registrar, "KOI8-U");
379 // CP1251 added to pass /fast/encoding/charset-cp1251.html
380 registerCodecIfAvailable(registrar, "windows-1251");
381 registerCodecIfAvailable(registrar, "mac-cyrillic");
382
383 // Thai
384 registerCodecIfAvailable(registrar, "CP874");
385 registerCodecIfAvailable(registrar, "TIS-620");
386
387 // Simplified Chinese
388 registerCodecIfAvailable(registrar, "GBK");
389 registerCodecIfAvailable(registrar, "HZ");
390 registerCodecIfAvailable(registrar, "GB18030");
391 registerCodecIfAvailable(registrar, "EUC-CN");
392 registerCodecIfAvailable(registrar, "GB_2312-80");
393
394 // Central European
395 registerCodecIfAvailable(registrar, "ISO-8859-2");
396 registerCodecIfAvailable(registrar, "CP1250");
397 registerCodecIfAvailable(registrar, "MAC-CENTRALEUROPE");
398
399 // Vietnamese
400 registerCodecIfAvailable(registrar, "CP1258");
401
402 // Turkish
403 registerCodecIfAvailable(registrar, "CP1254");
404 registerCodecIfAvailable(registrar, "ISO-8859-9");
405
406 // Baltic
407 registerCodecIfAvailable(registrar, "CP1257");
408 registerCodecIfAvailable(registrar, "ISO-8859-4");
409 }
410
TextCodecGtk(const TextEncoding & encoding)411 TextCodecGtk::TextCodecGtk(const TextEncoding& encoding)
412 : m_encoding(encoding)
413 , m_numBufferedBytes(0)
414 {
415 }
416
~TextCodecGtk()417 TextCodecGtk::~TextCodecGtk()
418 {
419 }
420
createIConvDecoder() const421 void TextCodecGtk::createIConvDecoder() const
422 {
423 ASSERT(!m_iconvDecoder);
424
425 m_iconvDecoder = adoptGRef(g_charset_converter_new(internalEncodingName, m_encoding.name(), 0));
426 }
427
createIConvEncoder() const428 void TextCodecGtk::createIConvEncoder() const
429 {
430 ASSERT(!m_iconvEncoder);
431
432 m_iconvEncoder = adoptGRef(g_charset_converter_new(m_encoding.name(), internalEncodingName, 0));
433 }
434
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)435 String TextCodecGtk::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
436 {
437 // Get a converter for the passed-in encoding.
438 if (!m_iconvDecoder)
439 createIConvDecoder();
440 if (!m_iconvDecoder) {
441 LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
442 return String();
443 }
444
445 Vector<UChar> result;
446
447 gsize bytesRead = 0;
448 gsize bytesWritten = 0;
449 const gchar* input = bytes;
450 gsize inputLength = length;
451 gchar buffer[ConversionBufferSize];
452 int flags = !length ? G_CONVERTER_INPUT_AT_END : G_CONVERTER_NO_FLAGS;
453 if (flush)
454 flags |= G_CONVERTER_FLUSH;
455
456 bool bufferWasFull = false;
457 char* prefixedBytes = 0;
458
459 if (m_numBufferedBytes) {
460 inputLength = length + m_numBufferedBytes;
461 prefixedBytes = static_cast<char*>(fastMalloc(inputLength));
462 memcpy(prefixedBytes, m_bufferedBytes, m_numBufferedBytes);
463 memcpy(prefixedBytes + m_numBufferedBytes, bytes, length);
464
465 input = prefixedBytes;
466
467 // all buffered bytes are consumed now
468 m_numBufferedBytes = 0;
469 }
470
471 do {
472 GOwnPtr<GError> error;
473 GConverterResult res = g_converter_convert(G_CONVERTER(m_iconvDecoder.get()),
474 input, inputLength,
475 buffer, sizeof(buffer),
476 static_cast<GConverterFlags>(flags),
477 &bytesRead, &bytesWritten,
478 &error.outPtr());
479 input += bytesRead;
480 inputLength -= bytesRead;
481
482 if (res == G_CONVERTER_ERROR) {
483 if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
484 // There is not enough input to fully determine what the conversion should produce,
485 // save it to a buffer to prepend it to the next input.
486 memcpy(m_bufferedBytes, input, inputLength);
487 m_numBufferedBytes = inputLength;
488 inputLength = 0;
489 } else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_NO_SPACE))
490 bufferWasFull = true;
491 else if (g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
492 if (stopOnError)
493 sawError = true;
494 if (inputLength) {
495 // Ignore invalid character.
496 input += 1;
497 inputLength -= 1;
498 }
499 } else {
500 sawError = true;
501 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
502 m_numBufferedBytes = 0; // Reset state for subsequent calls to decode.
503 fastFree(prefixedBytes);
504 return String();
505 }
506 }
507
508 result.append(reinterpret_cast<UChar*>(buffer), bytesWritten / sizeof(UChar));
509 } while ((inputLength || bufferWasFull) && !sawError);
510
511 fastFree(prefixedBytes);
512
513 return String::adopt(result);
514 }
515
encode(const UChar * characters,size_t length,UnencodableHandling handling)516 CString TextCodecGtk::encode(const UChar* characters, size_t length, UnencodableHandling handling)
517 {
518 if (!length)
519 return "";
520
521 if (!m_iconvEncoder)
522 createIConvEncoder();
523 if (!m_iconvEncoder) {
524 LOG_ERROR("Error creating IConv encoder even though encoding was in table.");
525 return CString();
526 }
527
528 gsize bytesRead = 0;
529 gsize bytesWritten = 0;
530 const gchar* input = reinterpret_cast<const char*>(characters);
531 gsize inputLength = length * sizeof(UChar);
532 gchar buffer[ConversionBufferSize];
533 Vector<char> result;
534 GOwnPtr<GError> error;
535
536 size_t size = 0;
537 do {
538 g_converter_convert(G_CONVERTER(m_iconvEncoder.get()),
539 input, inputLength,
540 buffer, sizeof(buffer),
541 G_CONVERTER_INPUT_AT_END,
542 &bytesRead, &bytesWritten,
543 &error.outPtr());
544 input += bytesRead;
545 inputLength -= bytesRead;
546 if (bytesWritten > 0) {
547 result.grow(size + bytesWritten);
548 memcpy(result.data() + size, buffer, bytesWritten);
549 size += bytesWritten;
550 }
551
552 if (error && g_error_matches(error.get(), G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) {
553 UChar codePoint = reinterpret_cast<const UChar*>(input)[0];
554 UnencodableReplacementArray replacement;
555 int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
556
557 // Consume the invalid character.
558 input += sizeof(UChar);
559 inputLength -= sizeof(UChar);
560
561 // Append replacement string to result buffer.
562 result.grow(size + replacementLength);
563 memcpy(result.data() + size, replacement, replacementLength);
564 size += replacementLength;
565
566 error.clear();
567 }
568 } while (inputLength && !error.get());
569
570 if (error) {
571 LOG_ERROR("GIConv conversion error, Code %d: \"%s\"", error->code, error->message);
572 return CString();
573 }
574
575 return CString(result.data(), size);
576 }
577
578 } // namespace WebCore
579