1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20 */
21
22
23 #include "config.h"
24 #include "TextResourceDecoder.h"
25
26 #include "DOMImplementation.h"
27 #include "HTMLNames.h"
28 #include "TextCodec.h"
29 #include "TextEncoding.h"
30 #include "TextEncodingDetector.h"
31 #include "TextEncodingRegistry.h"
32 #include <wtf/ASCIICType.h>
33 #include <wtf/StringExtras.h>
34
35 using namespace WTF;
36
37 namespace WebCore {
38
39 using namespace HTMLNames;
40
41 // You might think we should put these find functions elsewhere, perhaps with the
42 // similar functions that operate on UChar, but arguably only the decoder has
43 // a reason to process strings of char rather than UChar.
44
// Returns the offset of the first occurrence of the NUL-terminated |target|
// within the first |subjectLength| bytes of |subject|, or -1 if it does not occur.
// |subject| does not need to be NUL-terminated. An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (targetLength > subjectLength)
        return -1;

    // Last offset at which |target| still fits inside |subject|.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        if (!memcmp(subject + start, target, targetLength))
            return static_cast<int>(start);
    }
    return -1;
}
63
findIgnoringCase(const char * subject,size_t subjectLength,const char * target)64 static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
65 {
66 size_t targetLength = strlen(target);
67 if (targetLength > subjectLength)
68 return -1;
69 #ifndef NDEBUG
70 for (size_t i = 0; i < targetLength; ++i)
71 ASSERT(isASCIILower(target[i]));
72 #endif
73 for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
74 bool match = true;
75 for (size_t j = 0; j < targetLength; ++j) {
76 if (toASCIILower(subject[i + j]) != target[j]) {
77 match = false;
78 break;
79 }
80 }
81 if (match)
82 return i;
83 }
84 return -1;
85 }
86
findTextEncoding(const char * encodingName,int length)87 static TextEncoding findTextEncoding(const char* encodingName, int length)
88 {
89 Vector<char, 64> buffer(length + 1);
90 memcpy(buffer.data(), encodingName, length);
91 buffer[length] = '\0';
92 return buffer.data();
93 }
94
// Heuristic classifier that guesses which Japanese encoding a byte sequence uses.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Examines the first |length| bytes of |str| and returns the guessed encoding.
    // Returns ASCII when nothing points to JIS, EUC or SJIS.
    static enum Type judge(const char* str, int length);

    static const int ESC = 0x1b;

    // Per-byte flags: bit 0 (value 1) is set for 0x81-0x9f and 0xe0-0xfc,
    // bit 1 (value 2) is set for 0xa1-0xdf.
    static const unsigned char sjisMap[256];

    // Non-zero if |code| is a byte value flagged with bit 0 in sjisMap.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }

    // Non-zero if |code| is a byte value flagged with bit 1 in sjisMap.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xa0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xb0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xe0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0  // 0xf0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_Jis is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_Jis Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Walks the buffer once. ISO-2022-JP escape sequences and byte pairs that are
// only valid in one encoding decide the result immediately; otherwise the
// |sjis| and |euc| scores collected along the way break the tie at the end.
// Never reads at or beyond str[size].
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code = ASCII;
    bool bfr = false; /* Kana Moji */
    int bfk = 0; /* EUC Kana */
    int sjis = 0;
    int euc = 0;

    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    int i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                return JIS;
            }
            if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                return JIS;
            }
            if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* ?? check kudokuten ?? && ?? hiragana ?? */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                    && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100; /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                    && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100; /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40; /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40; /* hiragana */
                }
            } else {
                /* ?? check hiragana or katakana ?? */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++; /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                    && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++; /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++; /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++; /* katakana */
                }
                if (bfr) {
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        return SJIS;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        return SJIS;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        return SJIS;
                    } else if (ptr[i] <= 0x7f) {
                        return SJIS;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++; /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ; /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        return SJIS;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    // Only look at the trail byte when one exists; a lead byte at the
                    // very end of the buffer must not trigger a read of ptr[size].
                    if ((size - i > 1)
                        && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        return SJIS;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    // Same bounds rule as above for the second byte.
                    if ((size - i > 1)
                        && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        return EUC;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }

    // Nothing decisive was seen: let the accumulated scores decide, and stay
    // with ASCII on a tie.
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
    return code;
}
303
determineContentType(const String & mimeType)304 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
305 {
306 if (equalIgnoringCase(mimeType, "text/css"))
307 return CSS;
308 if (equalIgnoringCase(mimeType, "text/html"))
309 return HTML;
310 if (DOMImplementation::isXMLMIMEType(mimeType))
311 return XML;
312 return PlainText;
313 }
314
defaultEncoding(ContentType contentType,const TextEncoding & specifiedDefaultEncoding)315 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
316 {
317 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
318 // for text/xml. This matches Firefox.
319 if (contentType == XML)
320 return UTF8Encoding();
321 if (!specifiedDefaultEncoding.isValid())
322 return Latin1Encoding();
323 return specifiedDefaultEncoding;
324 }
325
TextResourceDecoder(const String & mimeType,const TextEncoding & specifiedDefaultEncoding,bool usesEncodingDetector)326 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
327 : m_contentType(determineContentType(mimeType))
328 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
329 , m_source(DefaultEncoding)
330 , m_hintEncoding(0)
331 , m_checkedForBOM(false)
332 , m_checkedForCSSCharset(false)
333 , m_checkedForHeadCharset(false)
334 , m_useLenientXMLDecoding(false)
335 , m_sawError(false)
336 , m_usesEncodingDetector(usesEncodingDetector)
337 {
338 }
339
~TextResourceDecoder()340 TextResourceDecoder::~TextResourceDecoder()
341 {
342 }
343
setEncoding(const TextEncoding & encoding,EncodingSource source)344 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
345 {
346 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
347 if (!encoding.isValid())
348 return;
349
350 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
351 // treat x-user-defined as windows-1252 (bug 18270)
352 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
353 m_encoding = "windows-1252";
354 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
355 m_encoding = encoding.closestByteBasedEquivalent();
356 else
357 m_encoding = encoding;
358
359 m_codec.clear();
360 m_source = source;
361 }
362
363 // Returns the position of the encoding string.
findXMLEncoding(const char * str,int len,int & encodingLength)364 static int findXMLEncoding(const char* str, int len, int& encodingLength)
365 {
366 int pos = find(str, len, "encoding");
367 if (pos == -1)
368 return -1;
369 pos += 8;
370
371 // Skip spaces and stray control characters.
372 while (pos < len && str[pos] <= ' ')
373 ++pos;
374
375 // Skip equals sign.
376 if (pos >= len || str[pos] != '=')
377 return -1;
378 ++pos;
379
380 // Skip spaces and stray control characters.
381 while (pos < len && str[pos] <= ' ')
382 ++pos;
383
384 // Skip quotation mark.
385 if (pos >= len)
386 return - 1;
387 char quoteMark = str[pos];
388 if (quoteMark != '"' && quoteMark != '\'')
389 return -1;
390 ++pos;
391
392 // Find the trailing quotation mark.
393 int end = pos;
394 while (end < len && str[end] != quoteMark)
395 ++end;
396 if (end >= len)
397 return -1;
398
399 encodingLength = end - pos;
400 return pos;
401 }
402
// Advances |pos| past any run of spaces and tabs, never moving beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| did not end up at |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != ' ' && *pos != '\t')
            break;
    }
    return pos != dataEnd;
}
410
checkForBOM(const char * data,size_t len)411 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
412 {
413 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
414 // We let it override even a user-chosen encoding.
415 ASSERT(!m_checkedForBOM);
416
417 size_t lengthOfBOM = 0;
418
419 size_t bufferLength = m_buffer.size();
420
421 size_t buf1Len = bufferLength;
422 size_t buf2Len = len;
423 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
424 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
425 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
426 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
427 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
428 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
429
430 // Check for the BOM.
431 if (c1 == 0xFF && c2 == 0xFE) {
432 if (c3 != 0 || c4 != 0) {
433 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
434 lengthOfBOM = 2;
435 } else {
436 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
437 lengthOfBOM = 4;
438 }
439 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
440 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
441 lengthOfBOM = 3;
442 } else if (c1 == 0xFE && c2 == 0xFF) {
443 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
444 lengthOfBOM = 2;
445 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
446 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
447 lengthOfBOM = 4;
448 }
449
450 if (lengthOfBOM || bufferLength + len >= 4)
451 m_checkedForBOM = true;
452
453 return lengthOfBOM;
454 }
455
checkForCSSCharset(const char * data,size_t len,bool & movedDataToBuffer)456 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
457 {
458 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
459 m_checkedForCSSCharset = true;
460 return true;
461 }
462
463 size_t oldSize = m_buffer.size();
464 m_buffer.grow(oldSize + len);
465 memcpy(m_buffer.data() + oldSize, data, len);
466
467 movedDataToBuffer = true;
468
469 if (m_buffer.size() > 8) { // strlen("@charset") == 8
470 const char* dataStart = m_buffer.data();
471 const char* dataEnd = dataStart + m_buffer.size();
472
473 if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
474 dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
475
476 dataStart += 8;
477 const char* pos = dataStart;
478 if (!skipWhitespace(pos, dataEnd))
479 return false;
480
481 if (*pos == '"' || *pos == '\'') {
482 char quotationMark = *pos;
483 ++pos;
484 dataStart = pos;
485
486 while (pos < dataEnd && *pos != quotationMark)
487 ++pos;
488 if (pos == dataEnd)
489 return false;
490
491 int encodingNameLength = pos - dataStart + 1;
492
493 ++pos;
494 if (!skipWhitespace(pos, dataEnd))
495 return false;
496
497 if (*pos == ';')
498 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
499 }
500 }
501 m_checkedForCSSCharset = true;
502 return true;
503 }
504 return false;
505 }
506
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// |ptr| is expected to point just past the opening "<!--". On return it points
// just past the comment terminator ("-->", the lenient "--!>", or the single
// '>' of "<!-->"). If no terminator is found, |ptr| is left where the scan
// stopped, at most two bytes before |pEnd|.
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    if (ptr == pEnd)
        return;

    const char* cursor = ptr;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    for (; cursor + 2 < pEnd; ++cursor) {
        if (*cursor != '-' || cursor[1] != '-')
            continue;
        // This is the real end of comment, "-->".
        if (cursor[2] == '>') {
            ptr = cursor + 3;
            return;
        }
        // This is the incorrect end of comment that other browsers allow, "--!>".
        if (cursor[2] == '!' && cursor + 3 < pEnd && cursor[3] == '>') {
            ptr = cursor + 4;
            return;
        }
    }
    ptr = cursor;
}
536
537 const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.
538
checkForHeadCharset(const char * data,size_t len,bool & movedDataToBuffer)539 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
540 {
541 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
542 m_checkedForHeadCharset = true;
543 return true;
544 }
545
546 // This is not completely efficient, since the function might go
547 // through the HTML head several times.
548
549 size_t oldSize = m_buffer.size();
550 m_buffer.grow(oldSize + len);
551 memcpy(m_buffer.data() + oldSize, data, len);
552
553 movedDataToBuffer = true;
554
555 const char* ptr = m_buffer.data();
556 const char* pEnd = ptr + m_buffer.size();
557
558 // Is there enough data available to check for XML declaration?
559 if (m_buffer.size() < 8)
560 return false;
561
562 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
563 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
564 if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
565 const char* xmlDeclarationEnd = ptr;
566 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
567 ++xmlDeclarationEnd;
568 if (xmlDeclarationEnd == pEnd)
569 return false;
570 // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
571 int len;
572 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
573 if (pos != -1)
574 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
575 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
576 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
577 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
578 return true;
579 } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
580 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
581 return true;
582 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
583 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
584 return true;
585 } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
586 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
587 return true;
588 }
589
590 // we still don't have an encoding, and are in the head
591 // the following tags are allowed in <head>:
592 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
593
594 // We stop scanning when a tag that is not permitted in <head>
595 // is seen, rather when </head> is seen, because that more closely
596 // matches behavior in other browsers; more details in
597 // <http://bugs.webkit.org/show_bug.cgi?id=3590>.
598
599 // Additionally, we ignore things that looks like tags in <title>, <script> and <noscript>; see
600 // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
601 // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.
602
603 // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
604 // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input.
605
606 AtomicStringImpl* enclosingTagName = 0;
607 bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered.
608
609 // the HTTP-EQUIV meta has no effect on XHTML
610 if (m_contentType == XML)
611 return true;
612
613 while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
614 if (*ptr == '<') {
615 bool end = false;
616 ptr++;
617
618 // Handle comments.
619 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
620 ptr += 3;
621 skipComment(ptr, pEnd);
622 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
623 // Some pages that test bandwidth from within the browser do it by having
624 // huge comments and measuring the time they take to load. Repeatedly scanning
625 // these comments can take a lot of CPU time.
626 m_checkedForHeadCharset = true;
627 return true;
628 }
629 continue;
630 }
631
632 if (*ptr == '/') {
633 ++ptr;
634 end = true;
635 }
636
637 // Grab the tag name, but mostly ignore namespaces.
638 bool sawNamespace = false;
639 char tagBuffer[20];
640 int len = 0;
641 while (len < 19) {
642 if (ptr == pEnd)
643 return false;
644 char c = *ptr;
645 if (c == ':') {
646 len = 0;
647 sawNamespace = true;
648 ptr++;
649 continue;
650 }
651 if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
652 ;
653 else if (c >= 'A' && c <= 'Z')
654 c += 'a' - 'A';
655 else
656 break;
657 tagBuffer[len++] = c;
658 ptr++;
659 }
660 tagBuffer[len] = 0;
661 AtomicString tag(tagBuffer);
662
663 if (enclosingTagName) {
664 if (end && tag.impl() == enclosingTagName)
665 enclosingTagName = 0;
666 } else {
667 if (tag == titleTag)
668 enclosingTagName = titleTag.localName().impl();
669 else if (tag == scriptTag)
670 enclosingTagName = scriptTag.localName().impl();
671 else if (tag == noscriptTag)
672 enclosingTagName = noscriptTag.localName().impl();
673 }
674
675 // Find where the opening tag ends.
676 const char* tagContentStart = ptr;
677 if (!end) {
678 while (ptr != pEnd && *ptr != '>') {
679 if (*ptr == '\'' || *ptr == '"') {
680 char quoteMark = *ptr;
681 ++ptr;
682 while (ptr != pEnd && *ptr != quoteMark)
683 ++ptr;
684 if (ptr == pEnd)
685 return false;
686 }
687 ++ptr;
688 }
689 if (ptr == pEnd)
690 return false;
691 ++ptr;
692 }
693
694 if (!end && tag == metaTag && !sawNamespace) {
695 const char* str = tagContentStart;
696 int length = ptr - tagContentStart;
697 int pos = 0;
698 while (pos < length) {
699 int charsetPos = findIgnoringCase(str + pos, length - pos, "charset");
700 if (charsetPos == -1)
701 break;
702 pos += charsetPos + 7;
703 // skip whitespace
704 while (pos < length && str[pos] <= ' ')
705 pos++;
706 if (pos == length)
707 break;
708 if (str[pos++] != '=')
709 continue;
710 while ((pos < length) &&
711 (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
712 pos++;
713
714 // end ?
715 if (pos == length)
716 break;
717 int end = pos;
718 while (end < length &&
719 str[end] != ' ' && str[end] != '"' && str[end] != '\'' &&
720 str[end] != ';' && str[end] != '>')
721 end++;
722 setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag);
723 if (m_source == EncodingFromMetaTag)
724 return true;
725
726 if (end >= length || str[end] == '/' || str[end] == '>')
727 break;
728
729 pos = end + 1;
730 }
731 } else {
732 if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag
733 && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag
734 && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) {
735 inHeadSection = false;
736 }
737
738 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
739 m_checkedForHeadCharset = true;
740 return true;
741 }
742 }
743 } else
744 ++ptr;
745 }
746 return false;
747 }
748
detectJapaneseEncoding(const char * data,size_t len)749 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
750 {
751 switch (KanjiCode::judge(data, len)) {
752 case KanjiCode::JIS:
753 setEncoding("ISO-2022-JP", AutoDetectedEncoding);
754 break;
755 case KanjiCode::EUC:
756 setEncoding("EUC-JP", AutoDetectedEncoding);
757 break;
758 case KanjiCode::SJIS:
759 setEncoding("Shift_JIS", AutoDetectedEncoding);
760 break;
761 case KanjiCode::ASCII:
762 case KanjiCode::UTF16:
763 case KanjiCode::UTF8:
764 break;
765 }
766 }
767
768 // We use the encoding detector in two cases:
769 // 1. Encoding detector is turned ON and no other encoding source is
770 // available (that is, it's DefaultEncoding).
771 // 2. Encoding detector is turned ON and the encoding is set to
772 // the encoding of the parent frame, which is also auto-detected.
773 // Note that condition #2 is NOT satisfied unless parent-child frame
774 // relationship is compliant to the same-origin policy. If they're from
775 // different domains, |m_source| would not be set to EncodingFromParentFrame
776 // in the first place.
shouldAutoDetect() const777 bool TextResourceDecoder::shouldAutoDetect() const
778 {
779 // Just checking m_hintEncoding suffices here because it's only set
780 // in setHintEncoding when the source is AutoDetectedEncoding.
781 return m_usesEncodingDetector
782 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
783 }
784
decode(const char * data,size_t len)785 String TextResourceDecoder::decode(const char* data, size_t len)
786 {
787 size_t lengthOfBOM = 0;
788 if (!m_checkedForBOM)
789 lengthOfBOM = checkForBOM(data, len);
790
791 bool movedDataToBuffer = false;
792
793 if (m_contentType == CSS && !m_checkedForCSSCharset)
794 if (!checkForCSSCharset(data, len, movedDataToBuffer))
795 return "";
796
797 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
798 if (!checkForHeadCharset(data, len, movedDataToBuffer))
799 return "";
800
801 // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
802 if (shouldAutoDetect()) {
803 if (m_encoding.isJapanese())
804 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
805 else {
806 TextEncoding detectedEncoding;
807 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
808 setEncoding(detectedEncoding, AutoDetectedEncoding);
809 }
810 }
811
812 ASSERT(m_encoding.isValid());
813
814 if (!m_codec)
815 m_codec.set(newTextCodec(m_encoding).release());
816
817 if (m_buffer.isEmpty())
818 return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
819
820 if (!movedDataToBuffer) {
821 size_t oldSize = m_buffer.size();
822 m_buffer.grow(oldSize + len);
823 memcpy(m_buffer.data() + oldSize, data, len);
824 }
825
826 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
827 m_buffer.clear();
828 return result;
829 }
830
flush()831 String TextResourceDecoder::flush()
832 {
833 // If we can not identify the encoding even after a document is completely
834 // loaded, we need to detect the encoding if other conditions for
835 // autodetection is satisfied.
836 if (m_buffer.size() && shouldAutoDetect()
837 && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
838 TextEncoding detectedEncoding;
839 if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
840 m_hintEncoding, &detectedEncoding))
841 setEncoding(detectedEncoding, AutoDetectedEncoding);
842 }
843
844 if (!m_codec)
845 m_codec.set(newTextCodec(m_encoding).release());
846
847 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
848 m_buffer.clear();
849 m_codec.clear();
850 m_checkedForBOM = false; // Skip BOM again when re-decoding.
851 return result;
852 }
853
854 }
855