1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20 */
21
22
23 #include "config.h"
24 #include "TextResourceDecoder.h"
25
26 #include "DOMImplementation.h"
27 #include "HTMLMetaCharsetParser.h"
28 #include "HTMLNames.h"
29 #include "TextCodec.h"
30 #include "TextEncoding.h"
31 #include "TextEncodingDetector.h"
32 #include "TextEncodingRegistry.h"
33 #include <wtf/ASCIICType.h>
34 #include <wtf/StringExtras.h>
35
36 using namespace WTF;
37
38 namespace WebCore {
39
40 using namespace HTMLNames;
41
42 // You might think we should put these find functions elsewhere, perhaps with the
43 // similar functions that operate on UChar, but arguably only the decoder has
44 // a reason to process strings of char rather than UChar.
45
// Returns the index of the first occurrence of the NUL-terminated string |target|
// within the first |subjectLength| bytes of |subject|, or -1 if there is none.
// An empty |target| matches at index 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t needleLength = strlen(target);
    if (needleLength > subjectLength)
        return -1;

    // Last index at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - needleLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        size_t matched = 0;
        while (matched < needleLength && subject[start + matched] == target[matched])
            ++matched;
        if (matched == needleLength)
            return static_cast<int>(start);
    }
    return -1;
}
64
findTextEncoding(const char * encodingName,int length)65 static TextEncoding findTextEncoding(const char* encodingName, int length)
66 {
67 Vector<char, 64> buffer(length + 1);
68 memcpy(buffer.data(), encodingName, length);
69 buffer[length] = '\0';
70 return buffer.data();
71 }
72
// Helper used to guess which Japanese encoding a byte sequence uses.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
    static enum Type judge(const char* str, int length);
    static const int ESC = 0x1b;
    // Per-byte flags: bit 0 (value 1) is tested by ISkanji(), bit 1 (value 2) by ISkana().
    static const unsigned char sjisMap[256];
    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 0 set.
    static int ISkanji(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 1 : 0;
    }
    // Non-zero when |code| is below 0x100 and its sjisMap entry has bit 1 set.
    static int ISkana(int code)
    {
        return code < 0x100 ? sjisMap[code & 0xff] & 2 : 0;
    }
};

// One row per high nibble; the comment gives the byte value of the first entry in the row.
const unsigned char KanjiCode::sjisMap[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
111
112 /*
113 * EUC-JP is
114 * [0xa1 - 0xfe][0xa1 - 0xfe]
115 * 0x8e[0xa1 - 0xfe](SS2)
116 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
117 *
118 * Shift_Jis is
119 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
120 *
121 * Shift_Jis Hankaku Kana is
122 * [0xa1 - 0xdf]
123 */
124
125 /*
126 * KanjiCode::judge() is based on judge_jcode() from jvim
127 * http://hp.vector.co.jp/authors/VA003457/vim/
128 *
129 * Special Thanks to Kenichi Tsuchida
130 */
131
judge(const char * str,int size)132 enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
133 {
134 enum Type code;
135 int i;
136 int bfr = false; /* Kana Moji */
137 int bfk = 0; /* EUC Kana */
138 int sjis = 0;
139 int euc = 0;
140
141 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);
142
143 code = ASCII;
144
145 i = 0;
146 while (i < size) {
147 if (ptr[i] == ESC && (size - i >= 3)) {
148 if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
149 || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
150 code = JIS;
151 goto breakBreak;
152 } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
153 || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
154 code = JIS;
155 goto breakBreak;
156 } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
157 code = JIS;
158 i += 3;
159 } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
160 code = JIS;
161 i += 3;
162 } else {
163 i++;
164 }
165 bfr = false;
166 bfk = 0;
167 } else {
168 if (ptr[i] < 0x20) {
169 bfr = false;
170 bfk = 0;
171 /* ?? check kudokuten ?? && ?? hiragana ?? */
172 if ((i >= 2) && (ptr[i - 2] == 0x81)
173 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
174 code = SJIS;
175 sjis += 100; /* kudokuten */
176 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
177 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
178 code = EUC;
179 euc += 100; /* kudokuten */
180 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
181 sjis += 40; /* hiragana */
182 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
183 euc += 40; /* hiragana */
184 }
185 } else {
186 /* ?? check hiragana or katana ?? */
187 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
188 sjis++; /* hiragana */
189 } else if ((size - i > 1) && (ptr[i] == 0x83)
190 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
191 sjis++; /* katakana */
192 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
193 euc++; /* hiragana */
194 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
195 euc++; /* katakana */
196 }
197 if (bfr) {
198 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
199 code = SJIS;
200 goto breakBreak;
201 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
202 code = SJIS;
203 goto breakBreak;
204 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
205 code = EUC;
206 goto breakBreak;
207 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
208 code = EUC;
209 goto breakBreak;
210 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
211 code = SJIS;
212 goto breakBreak;
213 } else if (ptr[i] <= 0x7f) {
214 code = SJIS;
215 goto breakBreak;
216 } else {
217 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
218 euc++; /* sjis hankaku kana kigo */
219 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
220 ; /* sjis hankaku kana */
221 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
222 euc++;
223 } else if (0x8e == ptr[i]) {
224 euc++;
225 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
226 sjis++;
227 }
228 bfr = false;
229 bfk = 0;
230 }
231 } else if (0x8e == ptr[i]) {
232 if (size - i <= 1) {
233 ;
234 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
235 /* EUC KANA or SJIS KANJI */
236 if (bfk == 1) {
237 euc += 100;
238 }
239 bfk++;
240 i++;
241 } else {
242 /* SJIS only */
243 code = SJIS;
244 goto breakBreak;
245 }
246 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
247 /* SJIS only */
248 code = SJIS;
249 if ((size - i >= 1)
250 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
251 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
252 goto breakBreak;
253 }
254 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
255 /* EUC only */
256 code = EUC;
257 if ((size - i >= 1)
258 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
259 goto breakBreak;
260 }
261 } else if (ptr[i] <= 0x7f) {
262 ;
263 } else {
264 bfr = true;
265 bfk = 0;
266 }
267 }
268 i++;
269 }
270 }
271 if (code == ASCII) {
272 if (sjis > euc) {
273 code = SJIS;
274 } else if (sjis < euc) {
275 code = EUC;
276 }
277 }
278 breakBreak:
279 return (code);
280 }
281
determineContentType(const String & mimeType)282 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
283 {
284 if (equalIgnoringCase(mimeType, "text/css"))
285 return CSS;
286 if (equalIgnoringCase(mimeType, "text/html"))
287 return HTML;
288 if (DOMImplementation::isXMLMIMEType(mimeType))
289 return XML;
290 return PlainText;
291 }
292
defaultEncoding(ContentType contentType,const TextEncoding & specifiedDefaultEncoding)293 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
294 {
295 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
296 // for text/xml. This matches Firefox.
297 if (contentType == XML)
298 return UTF8Encoding();
299 if (!specifiedDefaultEncoding.isValid())
300 return Latin1Encoding();
301 return specifiedDefaultEncoding;
302 }
303
TextResourceDecoder(const String & mimeType,const TextEncoding & specifiedDefaultEncoding,bool usesEncodingDetector)304 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
305 : m_contentType(determineContentType(mimeType))
306 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
307 , m_source(DefaultEncoding)
308 , m_hintEncoding(0)
309 , m_checkedForBOM(false)
310 , m_checkedForCSSCharset(false)
311 , m_checkedForHeadCharset(false)
312 , m_useLenientXMLDecoding(false)
313 , m_sawError(false)
314 , m_usesEncodingDetector(usesEncodingDetector)
315 {
316 }
317
~TextResourceDecoder()318 TextResourceDecoder::~TextResourceDecoder()
319 {
320 }
321
setEncoding(const TextEncoding & encoding,EncodingSource source)322 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
323 {
324 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
325 if (!encoding.isValid())
326 return;
327
328 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
329 // treat x-user-defined as windows-1252 (bug 18270)
330 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
331 m_encoding = "windows-1252";
332 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
333 m_encoding = encoding.closestByteBasedEquivalent();
334 else
335 m_encoding = encoding;
336
337 m_codec.clear();
338 m_source = source;
339 }
340
341 // Returns the position of the encoding string.
findXMLEncoding(const char * str,int len,int & encodingLength)342 static int findXMLEncoding(const char* str, int len, int& encodingLength)
343 {
344 int pos = find(str, len, "encoding");
345 if (pos == -1)
346 return -1;
347 pos += 8;
348
349 // Skip spaces and stray control characters.
350 while (pos < len && str[pos] <= ' ')
351 ++pos;
352
353 // Skip equals sign.
354 if (pos >= len || str[pos] != '=')
355 return -1;
356 ++pos;
357
358 // Skip spaces and stray control characters.
359 while (pos < len && str[pos] <= ' ')
360 ++pos;
361
362 // Skip quotation mark.
363 if (pos >= len)
364 return - 1;
365 char quoteMark = str[pos];
366 if (quoteMark != '"' && quoteMark != '\'')
367 return -1;
368 ++pos;
369
370 // Find the trailing quotation mark.
371 int end = pos;
372 while (end < len && str[end] != quoteMark)
373 ++end;
374 if (end >= len)
375 return -1;
376
377 encodingLength = end - pos;
378 return pos;
379 }
380
// Advances |pos| past any run of tabs and spaces, never going beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| stopped before |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != '\t' && *pos != ' ')
            break;
    }
    return pos != dataEnd;
}
388
checkForBOM(const char * data,size_t len)389 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
390 {
391 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
392 // We let it override even a user-chosen encoding.
393 ASSERT(!m_checkedForBOM);
394
395 size_t lengthOfBOM = 0;
396
397 size_t bufferLength = m_buffer.size();
398
399 size_t buf1Len = bufferLength;
400 size_t buf2Len = len;
401 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
402 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
403 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
404 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
405 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
406 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
407
408 // Check for the BOM.
409 if (c1 == 0xFF && c2 == 0xFE) {
410 if (c3 != 0 || c4 != 0) {
411 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
412 lengthOfBOM = 2;
413 } else {
414 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
415 lengthOfBOM = 4;
416 }
417 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
418 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
419 lengthOfBOM = 3;
420 } else if (c1 == 0xFE && c2 == 0xFF) {
421 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
422 lengthOfBOM = 2;
423 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
424 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
425 lengthOfBOM = 4;
426 }
427
428 if (lengthOfBOM || bufferLength + len >= 4)
429 m_checkedForBOM = true;
430
431 return lengthOfBOM;
432 }
433
checkForCSSCharset(const char * data,size_t len,bool & movedDataToBuffer)434 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
435 {
436 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
437 m_checkedForCSSCharset = true;
438 return true;
439 }
440
441 size_t oldSize = m_buffer.size();
442 m_buffer.grow(oldSize + len);
443 memcpy(m_buffer.data() + oldSize, data, len);
444
445 movedDataToBuffer = true;
446
447 if (m_buffer.size() > 8) { // strlen("@charset") == 8
448 const char* dataStart = m_buffer.data();
449 const char* dataEnd = dataStart + m_buffer.size();
450
451 if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
452 dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
453
454 dataStart += 8;
455 const char* pos = dataStart;
456 if (!skipWhitespace(pos, dataEnd))
457 return false;
458
459 if (*pos == '"' || *pos == '\'') {
460 char quotationMark = *pos;
461 ++pos;
462 dataStart = pos;
463
464 while (pos < dataEnd && *pos != quotationMark)
465 ++pos;
466 if (pos == dataEnd)
467 return false;
468
469 int encodingNameLength = pos - dataStart;
470
471 ++pos;
472 if (!skipWhitespace(pos, dataEnd))
473 return false;
474
475 if (*pos == ';')
476 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
477 }
478 }
479 m_checkedForCSSCharset = true;
480 return true;
481 }
482 return false;
483 }
484
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// |ptr| is expected to point just past the "<!--" that opened the comment. It is advanced past
// the comment terminator when one is found before |pEnd|; otherwise it is left wherever the
// scan stopped (at most two bytes before |pEnd|).
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    if (ptr == pEnd)
        return;

    const char* cursor = ptr;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    for (; pEnd - cursor > 2; ++cursor) {
        if (cursor[0] != '-' || cursor[1] != '-')
            continue;

        // This is the real end of comment, "-->".
        if (cursor[2] == '>') {
            ptr = cursor + 3;
            return;
        }

        // This is the incorrect end of comment that other browsers allow, "--!>".
        if (pEnd - cursor > 3 && cursor[2] == '!' && cursor[3] == '>') {
            ptr = cursor + 4;
            return;
        }
    }
    ptr = cursor;
}
514
checkForHeadCharset(const char * data,size_t len,bool & movedDataToBuffer)515 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
516 {
517 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
518 m_checkedForHeadCharset = true;
519 return true;
520 }
521
522 // This is not completely efficient, since the function might go
523 // through the HTML head several times.
524
525 size_t oldSize = m_buffer.size();
526 m_buffer.grow(oldSize + len);
527 memcpy(m_buffer.data() + oldSize, data, len);
528
529 movedDataToBuffer = true;
530
531 // Continue with checking for an HTML meta tag if we were already doing so.
532 if (m_charsetParser)
533 return checkForMetaCharset(data, len);
534
535 const char* ptr = m_buffer.data();
536 const char* pEnd = ptr + m_buffer.size();
537
538 // Is there enough data available to check for XML declaration?
539 if (m_buffer.size() < 8)
540 return false;
541
542 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
543 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
544 if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
545 const char* xmlDeclarationEnd = ptr;
546 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
547 ++xmlDeclarationEnd;
548 if (xmlDeclarationEnd == pEnd)
549 return false;
550 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
551 int len = 0;
552 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
553 if (pos != -1)
554 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
555 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
556 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
557 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
558 return true;
559 } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
560 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
561 return true;
562 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
563 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
564 return true;
565 } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
566 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
567 return true;
568 }
569
570 // The HTTP-EQUIV meta has no effect on XHTML.
571 if (m_contentType == XML)
572 return true;
573
574 m_charsetParser = HTMLMetaCharsetParser::create();
575 return checkForMetaCharset(data, len);
576 }
577
checkForMetaCharset(const char * data,size_t length)578 bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
579 {
580 if (!m_charsetParser->checkForMetaCharset(data, length))
581 return false;
582
583 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
584 m_charsetParser.clear();
585 m_checkedForHeadCharset = true;
586 return true;
587 }
588
detectJapaneseEncoding(const char * data,size_t len)589 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
590 {
591 switch (KanjiCode::judge(data, len)) {
592 case KanjiCode::JIS:
593 setEncoding("ISO-2022-JP", AutoDetectedEncoding);
594 break;
595 case KanjiCode::EUC:
596 setEncoding("EUC-JP", AutoDetectedEncoding);
597 break;
598 case KanjiCode::SJIS:
599 setEncoding("Shift_JIS", AutoDetectedEncoding);
600 break;
601 case KanjiCode::ASCII:
602 case KanjiCode::UTF16:
603 case KanjiCode::UTF8:
604 break;
605 }
606 }
607
608 // We use the encoding detector in two cases:
609 // 1. Encoding detector is turned ON and no other encoding source is
610 // available (that is, it's DefaultEncoding).
611 // 2. Encoding detector is turned ON and the encoding is set to
612 // the encoding of the parent frame, which is also auto-detected.
613 // Note that condition #2 is NOT satisfied unless parent-child frame
614 // relationship is compliant to the same-origin policy. If they're from
615 // different domains, |m_source| would not be set to EncodingFromParentFrame
616 // in the first place.
shouldAutoDetect() const617 bool TextResourceDecoder::shouldAutoDetect() const
618 {
619 // Just checking m_hintEncoding suffices here because it's only set
620 // in setHintEncoding when the source is AutoDetectedEncoding.
621 return m_usesEncodingDetector
622 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
623 }
624
decode(const char * data,size_t len)625 String TextResourceDecoder::decode(const char* data, size_t len)
626 {
627 size_t lengthOfBOM = 0;
628 if (!m_checkedForBOM)
629 lengthOfBOM = checkForBOM(data, len);
630
631 bool movedDataToBuffer = false;
632
633 if (m_contentType == CSS && !m_checkedForCSSCharset)
634 if (!checkForCSSCharset(data, len, movedDataToBuffer))
635 return "";
636
637 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
638 if (!checkForHeadCharset(data, len, movedDataToBuffer))
639 return "";
640
641 // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
642 if (shouldAutoDetect()) {
643 if (m_encoding.isJapanese())
644 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
645 else {
646 TextEncoding detectedEncoding;
647 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
648 setEncoding(detectedEncoding, AutoDetectedEncoding);
649 }
650 }
651
652 ASSERT(m_encoding.isValid());
653
654 if (!m_codec)
655 m_codec = newTextCodec(m_encoding);
656
657 if (m_buffer.isEmpty())
658 return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
659
660 if (!movedDataToBuffer) {
661 size_t oldSize = m_buffer.size();
662 m_buffer.grow(oldSize + len);
663 memcpy(m_buffer.data() + oldSize, data, len);
664 }
665
666 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
667 m_buffer.clear();
668 return result;
669 }
670
flush()671 String TextResourceDecoder::flush()
672 {
673 // If we can not identify the encoding even after a document is completely
674 // loaded, we need to detect the encoding if other conditions for
675 // autodetection is satisfied.
676 if (m_buffer.size() && shouldAutoDetect()
677 && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
678 TextEncoding detectedEncoding;
679 if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
680 m_hintEncoding, &detectedEncoding))
681 setEncoding(detectedEncoding, AutoDetectedEncoding);
682 }
683
684 if (!m_codec)
685 m_codec = newTextCodec(m_encoding);
686
687 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
688 m_buffer.clear();
689 m_codec.clear();
690 m_checkedForBOM = false; // Skip BOM again when re-decoding.
691 return result;
692 }
693
694 }
695