• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4     Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 
6     This library is free software; you can redistribute it and/or
7     modify it under the terms of the GNU Library General Public
8     License as published by the Free Software Foundation; either
9     version 2 of the License, or (at your option) any later version.
10 
11     This library is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14     Library General Public License for more details.
15 
16     You should have received a copy of the GNU Library General Public License
17     along with this library; see the file COPYING.LIB.  If not, write to
18     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19     Boston, MA 02110-1301, USA.
20 */
21 
22 
23 #include "config.h"
24 #include "TextResourceDecoder.h"
25 
26 #include "DOMImplementation.h"
27 #include "HTMLNames.h"
28 #include "TextCodec.h"
29 #include "TextEncoding.h"
30 #include "TextEncodingDetector.h"
31 #include "TextEncodingRegistry.h"
32 #include <wtf/ASCIICType.h>
33 #include <wtf/StringExtras.h>
34 
35 using namespace WTF;
36 
37 namespace WebCore {
38 
39 using namespace HTMLNames;
40 
41 // You might think we should put these find functions elsewhere, perhaps with the
42 // similar functions that operate on UChar, but arguably only the decoder has
43 // a reason to process strings of char rather than UChar.
44 
// Locates the first occurrence of the NUL-terminated |target| inside the first
// |subjectLength| bytes of |subject|. |subject| does not need to be NUL-terminated.
// Returns the zero-based offset of the match, or -1 if |target| does not occur.
// An empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (subjectLength < targetLength)
        return -1;

    // Last offset at which |target| still fits entirely inside |subject|.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        if (!memcmp(subject + start, target, targetLength))
            return start;
    }
    return -1;
}
63 
findIgnoringCase(const char * subject,size_t subjectLength,const char * target)64 static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
65 {
66     size_t targetLength = strlen(target);
67     if (targetLength > subjectLength)
68         return -1;
69 #ifndef NDEBUG
70     for (size_t i = 0; i < targetLength; ++i)
71         ASSERT(isASCIILower(target[i]));
72 #endif
73     for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
74         bool match = true;
75         for (size_t j = 0; j < targetLength; ++j) {
76             if (toASCIILower(subject[i + j]) != target[j]) {
77                 match = false;
78                 break;
79             }
80         }
81         if (match)
82             return i;
83     }
84     return -1;
85 }
86 
findTextEncoding(const char * encodingName,int length)87 static TextEncoding findTextEncoding(const char* encodingName, int length)
88 {
89     Vector<char, 64> buffer(length + 1);
90     memcpy(buffer.data(), encodingName, length);
91     buffer[length] = '\0';
92     return buffer.data();
93 }
94 
// Heuristic classifier that guesses which Japanese encoding family a byte
// sequence is written in. All members are static; the class is a namespace
// for judge() and its lookup table.
class KanjiCode {
public:
    // Possible verdicts of judge(). UTF16 and UTF8 are part of the enum but
    // judge() as written never returns them.
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Inspects the first |length| bytes of |str| and returns the guessed encoding family.
    static enum Type judge(const char* str, int length);

    // The escape byte that introduces ISO-2022-JP designation sequences.
    static const int ESC = 0x1b;

    // Per-byte classification flags for Shift_JIS:
    //   bit 0 (value 1) - set for 0x81-0x9f and 0xe0-0xfc (tested by ISkanji)
    //   bit 1 (value 2) - set for 0xa1-0xdf (tested by ISkana)
    static const unsigned char sjisMap[256];

    // Returns non-zero (1) if |code| is flagged as a Shift_JIS kanji byte in sjisMap.
    // Values of 0x100 and above are never flagged.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }

    // Returns non-zero (2) if |code| is flagged as a Shift_JIS half-width kana byte in sjisMap.
    // Values of 0x100 and above are never flagged.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xa0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xb0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xe0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0  // 0xf0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_Jis is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_Jis Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Scans |size| bytes starting at |str| and guesses the Japanese encoding family.
//
// The scan stops at the first byte pattern that is treated as decisive (an
// ISO-2022-JP escape sequence, or a byte pair that fits only one of Shift_JIS
// and EUC-JP). Otherwise it accumulates two scores, |sjis| and |euc|, from
// byte pairs that look like kana or punctuation in the respective encoding,
// and the larger score wins. Returns ASCII when nothing tips the balance.
//
// |str| does not need to be NUL-terminated; no byte at or beyond |size| is read.
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code;
    int i;
    bool bfr = false;           /* previous byte was an undecided high byte (original note: Kana Moji) */
    int bfk = 0;                /* number of consecutive 0x8e + [0xa1-0xdf] pairs seen (EUC Kana) */
    int sjis = 0;               /* evidence score for Shift_JIS */
    int euc = 0;                /* evidence score for EUC-JP */

    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    code = ASCII;

    i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            /* Three-byte escape sequence: ESC followed by two designator bytes. */
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
            || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                code = JIS;
                goto breakBreak;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                    || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                code = JIS;
                goto breakBreak;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                /* Recorded as JIS, but scanning continues past the sequence. */
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                /* Control byte: look back at the two bytes that preceded it. */
                bfr = false;
                bfk = 0;
                /* check for kudokuten (punctuation) and hiragana right before the control byte */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* check for hiragana or katakana starting at this byte */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                         && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }
                if (bfr) {
                    /* The previous byte was an undecided high byte; judge the pair (ptr[i - 1], ptr[i]). */
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if (ptr[i] <= 0x7f) {
                        code = SJIS;
                        goto breakBreak;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        code = SJIS;
                        goto breakBreak;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    /* Peek at the trail byte only if one exists. The verdict is already
                       SJIS either way, so a lone lead byte at the end still yields SJIS. */
                    if ((size - i > 1)
                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        goto breakBreak;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    /* Same bounds rule as above: never read past the last byte. */
                    if ((size - i > 1)
                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        goto breakBreak;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    /* Undecided high byte: defer the decision to the next iteration. */
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }
    /* Nothing decisive was seen: let the accumulated scores pick a side. */
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
breakBreak:
    return code;
}
303 
determineContentType(const String & mimeType)304 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
305 {
306     if (equalIgnoringCase(mimeType, "text/css"))
307         return CSS;
308     if (equalIgnoringCase(mimeType, "text/html"))
309         return HTML;
310     if (DOMImplementation::isXMLMIMEType(mimeType))
311         return XML;
312     return PlainText;
313 }
314 
defaultEncoding(ContentType contentType,const TextEncoding & specifiedDefaultEncoding)315 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
316 {
317     // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
318     // for text/xml. This matches Firefox.
319     if (contentType == XML)
320         return UTF8Encoding();
321     if (!specifiedDefaultEncoding.isValid())
322         return Latin1Encoding();
323     return specifiedDefaultEncoding;
324 }
325 
// Creates a decoder for a resource with the given MIME type.
//   mimeType                 - classified via determineContentType().
//   specifiedDefaultEncoding - starting encoding; defaultEncoding() may override or replace it.
//   usesEncodingDetector     - stored and consulted by shouldAutoDetect().
// The encoding source starts as DefaultEncoding and all "checked"/error flags start false.
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    // NOTE(review): reads m_contentType, so this relies on m_contentType being
    // declared before m_encoding in the class - confirm against the header.
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForHeadCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
339 
// Nothing to release explicitly; the body is intentionally empty.
TextResourceDecoder::~TextResourceDecoder()
{
}
343 
setEncoding(const TextEncoding & encoding,EncodingSource source)344 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
345 {
346     // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
347     if (!encoding.isValid())
348         return;
349 
350     // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
351     // treat x-user-defined as windows-1252 (bug 18270)
352     if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
353         m_encoding = "windows-1252";
354     else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
355         m_encoding = encoding.closestByteBasedEquivalent();
356     else
357         m_encoding = encoding;
358 
359     m_codec.clear();
360     m_source = source;
361 }
362 
363 // Returns the position of the encoding string.
findXMLEncoding(const char * str,int len,int & encodingLength)364 static int findXMLEncoding(const char* str, int len, int& encodingLength)
365 {
366     int pos = find(str, len, "encoding");
367     if (pos == -1)
368         return -1;
369     pos += 8;
370 
371     // Skip spaces and stray control characters.
372     while (pos < len && str[pos] <= ' ')
373         ++pos;
374 
375     // Skip equals sign.
376     if (pos >= len || str[pos] != '=')
377         return -1;
378     ++pos;
379 
380     // Skip spaces and stray control characters.
381     while (pos < len && str[pos] <= ' ')
382         ++pos;
383 
384     // Skip quotation mark.
385     if (pos >= len)
386         return - 1;
387     char quoteMark = str[pos];
388     if (quoteMark != '"' && quoteMark != '\'')
389         return -1;
390     ++pos;
391 
392     // Find the trailing quotation mark.
393     int end = pos;
394     while (end < len && str[end] != quoteMark)
395         ++end;
396     if (end >= len)
397         return -1;
398 
399     encodingLength = end - pos;
400     return pos;
401 }
402 
// Advances |pos| past any run of spaces and tabs, never moving beyond |dataEnd|.
// Returns true if there is more to parse, i.e. |pos| stopped short of |dataEnd|.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        if (*pos != ' ' && *pos != '\t')
            break;
    }
    return pos != dataEnd;
}
410 
checkForBOM(const char * data,size_t len)411 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
412 {
413     // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
414     // We let it override even a user-chosen encoding.
415     ASSERT(!m_checkedForBOM);
416 
417     size_t lengthOfBOM = 0;
418 
419     size_t bufferLength = m_buffer.size();
420 
421     size_t buf1Len = bufferLength;
422     size_t buf2Len = len;
423     const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
424     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
425     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
426     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
427     unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
428     unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
429 
430     // Check for the BOM.
431     if (c1 == 0xFF && c2 == 0xFE) {
432         if (c3 != 0 || c4 != 0) {
433             setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
434             lengthOfBOM = 2;
435         } else {
436             setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
437             lengthOfBOM = 4;
438         }
439     } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
440         setEncoding(UTF8Encoding(), AutoDetectedEncoding);
441         lengthOfBOM = 3;
442     } else if (c1 == 0xFE && c2 == 0xFF) {
443         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
444         lengthOfBOM = 2;
445     } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
446         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
447         lengthOfBOM = 4;
448     }
449 
450     if (lengthOfBOM || bufferLength + len >= 4)
451         m_checkedForBOM = true;
452 
453     return lengthOfBOM;
454 }
455 
checkForCSSCharset(const char * data,size_t len,bool & movedDataToBuffer)456 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
457 {
458     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
459         m_checkedForCSSCharset = true;
460         return true;
461     }
462 
463     size_t oldSize = m_buffer.size();
464     m_buffer.grow(oldSize + len);
465     memcpy(m_buffer.data() + oldSize, data, len);
466 
467     movedDataToBuffer = true;
468 
469     if (m_buffer.size() > 8) { // strlen("@charset") == 8
470         const char* dataStart = m_buffer.data();
471         const char* dataEnd = dataStart + m_buffer.size();
472 
473         if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
474             dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
475 
476             dataStart += 8;
477             const char* pos = dataStart;
478             if (!skipWhitespace(pos, dataEnd))
479                 return false;
480 
481             if (*pos == '"' || *pos == '\'') {
482                 char quotationMark = *pos;
483                 ++pos;
484                 dataStart = pos;
485 
486                 while (pos < dataEnd && *pos != quotationMark)
487                     ++pos;
488                 if (pos == dataEnd)
489                     return false;
490 
491                 int encodingNameLength = pos - dataStart + 1;
492 
493                 ++pos;
494                 if (!skipWhitespace(pos, dataEnd))
495                     return false;
496 
497                 if (*pos == ';')
498                     setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
499             }
500         }
501         m_checkedForCSSCharset = true;
502         return true;
503     }
504     return false;
505 }
506 
// Other browsers allow comments in the head section, so we need to as well,
// and tags inside comments must not be looked at.
//
// |ptr| is expected to point just past a "<!--" opener. It is advanced past
// the end of the comment: "-->", the lenient "--!>", or a lone ">" right at
// the start (so that "<!-->" works). If no terminator is found, |ptr| is left
// where the search stopped, within the last two bytes before |pEnd|.
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    if (ptr == pEnd)
        return;

    const char* cursor = ptr;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    for (; cursor + 2 < pEnd; ++cursor) {
        // Both accepted terminators begin with "--".
        if (*cursor != '-' || cursor[1] != '-')
            continue;

        // The real end of comment, "-->".
        if (cursor[2] == '>') {
            cursor += 3;
            break;
        }
        // The incorrect end of comment that other browsers allow, "--!>".
        if (cursor + 3 < pEnd && cursor[2] == '!' && cursor[3] == '>') {
            cursor += 4;
            break;
        }
    }
    ptr = cursor;
}
536 
// At least this many input bytes are scanned for a meta charset, even if the
// <head> section ended earlier.
const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.

// Scans the start of an HTML or XML document for an in-band encoding
// declaration: an XML declaration with an encoding pseudo-attribute, a
// UTF-16/UTF-32 pattern of "<?x" / "<?", or a <meta ... charset=...> tag.
//
// If the encoding was already determined by something other than the default
// or the parent frame, the check is marked done immediately and |data| is not
// buffered. Otherwise |data| is appended to m_buffer, |movedDataToBuffer| is
// set to true, and the whole buffer is rescanned from the beginning.
//
// Returns true when the caller may go ahead and decode, and false when more
// input is needed before a decision can be made.
bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
{
    if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
        m_checkedForHeadCharset = true;
        return true;
    }

    // This is not completely efficient, since the function might go
    // through the HTML head several times.

    size_t oldSize = m_buffer.size();
    m_buffer.grow(oldSize + len);
    memcpy(m_buffer.data() + oldSize, data, len);

    movedDataToBuffer = true;

    const char* ptr = m_buffer.data();
    const char* pEnd = ptr + m_buffer.size();

    // Is there enough data available to check for XML declaration?
    // (The checks below index up to ptr[7].)
    if (m_buffer.size() < 8)
        return false;

    // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
    // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
    if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
        // Wait until the whole declaration (up to its closing '>') has arrived.
        const char* xmlDeclarationEnd = ptr;
        while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
            ++xmlDeclarationEnd;
        if (xmlDeclarationEnd == pEnd)
            return false;
        // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
        // NOTE: this local |len| shadows the |len| parameter; it is only read when findXMLEncoding() succeeds.
        int len;
        int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
        if (pos != -1)
            setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
        // "<?x" with each character followed by a zero byte.
        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
        // "<?x" with each character preceded by a zero byte.
        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
        // "<?" with each character followed by three zero bytes.
        setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
        return true;
    } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
        // "<?" with each character preceded by three zero bytes.
        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
        return true;
    }

    // we still don't have an encoding, and are in the head
    // the following tags are allowed in <head>:
    // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE

    // We stop scanning when a tag that is not permitted in <head>
    // is seen, rather when </head> is seen, because that more closely
    // matches behavior in other browsers; more details in
    // <http://bugs.webkit.org/show_bug.cgi?id=3590>.

    // Additionally, we ignore things that looks like tags in <title>, <script> and <noscript>; see
    // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165>
    // and <http://bugs.webkit.org/show_bug.cgi?id=12389>.

    // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>,
    // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input.

    // Non-null while scanning inside <title>, <script> or <noscript>; holds that tag's name.
    AtomicStringImpl* enclosingTagName = 0;
    bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered.

    // the HTTP-EQUIV meta has no effect on XHTML
    if (m_contentType == XML)
        return true;

    while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
        if (*ptr == '<') {
            // True when this is a closing tag ("</name").
            bool end = false;
            ptr++;

            // Handle comments.
            if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                ptr += 3;
                skipComment(ptr, pEnd);
                if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
                    // Some pages that test bandwidth from within the browser do it by having
                    // huge comments and measuring the time they take to load. Repeatedly scanning
                    // these comments can take a lot of CPU time.
                    m_checkedForHeadCharset = true;
                    return true;
                }
                continue;
            }

            if (*ptr == '/') {
                ++ptr;
                end = true;
            }

            // Grab the tag name, but mostly ignore namespaces.
            // The name is lower-cased and truncated to 19 characters; a ':' restarts
            // the name so that only the part after the namespace prefix is kept.
            bool sawNamespace = false;
            char tagBuffer[20];
            int len = 0;
            while (len < 19) {
                if (ptr == pEnd)
                    return false;
                char c = *ptr;
                if (c == ':') {
                    len = 0;
                    sawNamespace = true;
                    ptr++;
                    continue;
                }
                if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
                    ;
                else if (c >= 'A' && c <= 'Z')
                    c += 'a' - 'A';
                else
                    break;
                tagBuffer[len++] = c;
                ptr++;
            }
            tagBuffer[len] = 0;
            AtomicString tag(tagBuffer);

            // Track entry into and exit from <title>, <script> and <noscript>.
            if (enclosingTagName) {
                if (end && tag.impl() == enclosingTagName)
                    enclosingTagName = 0;
            } else {
                if (tag == titleTag)
                    enclosingTagName = titleTag.localName().impl();
                else if (tag == scriptTag)
                    enclosingTagName = scriptTag.localName().impl();
                else if (tag == noscriptTag)
                    enclosingTagName = noscriptTag.localName().impl();
            }

            // Find where the opening tag ends.
            // Quoted attribute values are skipped as a unit, so a '>' inside quotes does not end the tag.
            const char* tagContentStart = ptr;
            if (!end) {
                while (ptr != pEnd && *ptr != '>') {
                    if (*ptr == '\'' || *ptr == '"') {
                        char quoteMark = *ptr;
                        ++ptr;
                        while (ptr != pEnd && *ptr != quoteMark)
                            ++ptr;
                        if (ptr == pEnd)
                            return false;
                    }
                    ++ptr;
                }
                if (ptr == pEnd)
                    return false;
                ++ptr;
            }

            if (!end && tag == metaTag && !sawNamespace) {
                // Search the tag's attribute text for "charset" followed by '=' and a value.
                const char* str = tagContentStart;
                int length = ptr - tagContentStart;
                int pos = 0;
                while (pos < length) {
                    int charsetPos = findIgnoringCase(str + pos, length - pos, "charset");
                    if (charsetPos == -1)
                        break;
                    pos += charsetPos + 7;
                    // skip whitespace
                    while (pos < length && str[pos] <= ' ')
                        pos++;
                    if (pos == length)
                        break;
                    // "charset" not followed by '=': keep looking for another occurrence.
                    if (str[pos++] != '=')
                        continue;
                    // Skip whitespace, extra '=' and opening quotes before the value.
                    while ((pos < length) &&
                            (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\''))
                        pos++;

                    // end ?
                    if (pos == length)
                        break;
                    // The value ends at a space, a quote, ';' or '>'. This |end| shadows the bool above.
                    int end = pos;
                    while (end < length &&
                           str[end] != ' ' && str[end] != '"' && str[end] != '\'' &&
                           str[end] != ';' && str[end] != '>')
                        end++;
                    setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag);
                    // setEncoding() leaves m_source untouched for an invalid encoding name,
                    // so this only succeeds if the name was accepted.
                    if (m_source == EncodingFromMetaTag)
                        return true;

                    if (end >= length || str[end] == '/' || str[end] == '>')
                        break;

                    pos = end + 1;
                }
            } else {
                // Any tag other than the ones listed here (and other than an opening
                // <html> or <head>) marks the end of the head section.
                if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag
                    && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag
                    && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) {
                    inHeadSection = false;
                }

                // Give up once the head is over and enough bytes have been looked at.
                if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) {
                    m_checkedForHeadCharset = true;
                    return true;
                }
            }
        } else
            ++ptr;
    }
    // Ran out of buffered input without reaching a decision.
    return false;
}
748 
detectJapaneseEncoding(const char * data,size_t len)749 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
750 {
751     switch (KanjiCode::judge(data, len)) {
752         case KanjiCode::JIS:
753             setEncoding("ISO-2022-JP", AutoDetectedEncoding);
754             break;
755         case KanjiCode::EUC:
756             setEncoding("EUC-JP", AutoDetectedEncoding);
757             break;
758         case KanjiCode::SJIS:
759             setEncoding("Shift_JIS", AutoDetectedEncoding);
760             break;
761         case KanjiCode::ASCII:
762         case KanjiCode::UTF16:
763         case KanjiCode::UTF8:
764             break;
765     }
766 }
767 
768 // We use the encoding detector in two cases:
769 //   1. Encoding detector is turned ON and no other encoding source is
770 //      available (that is, it's DefaultEncoding).
771 //   2. Encoding detector is turned ON and the encoding is set to
772 //      the encoding of the parent frame, which is also auto-detected.
773 //   Note that condition #2 is NOT satisfied unless parent-child frame
774 //   relationship is compliant to the same-origin policy. If they're from
775 //   different domains, |m_source| would not be set to EncodingFromParentFrame
776 //   in the first place.
shouldAutoDetect() const777 bool TextResourceDecoder::shouldAutoDetect() const
778 {
779     // Just checking m_hintEncoding suffices here because it's only set
780     // in setHintEncoding when the source is AutoDetectedEncoding.
781     return m_usesEncodingDetector
782         && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
783 }
784 
decode(const char * data,size_t len)785 String TextResourceDecoder::decode(const char* data, size_t len)
786 {
787     size_t lengthOfBOM = 0;
788     if (!m_checkedForBOM)
789         lengthOfBOM = checkForBOM(data, len);
790 
791     bool movedDataToBuffer = false;
792 
793     if (m_contentType == CSS && !m_checkedForCSSCharset)
794         if (!checkForCSSCharset(data, len, movedDataToBuffer))
795             return "";
796 
797     if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
798         if (!checkForHeadCharset(data, len, movedDataToBuffer))
799             return "";
800 
801     // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
802     if (shouldAutoDetect()) {
803         if (m_encoding.isJapanese())
804             detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
805         else {
806             TextEncoding detectedEncoding;
807             if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
808                 setEncoding(detectedEncoding, AutoDetectedEncoding);
809         }
810     }
811 
812     ASSERT(m_encoding.isValid());
813 
814     if (!m_codec)
815         m_codec.set(newTextCodec(m_encoding).release());
816 
817     if (m_buffer.isEmpty())
818         return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
819 
820     if (!movedDataToBuffer) {
821         size_t oldSize = m_buffer.size();
822         m_buffer.grow(oldSize + len);
823         memcpy(m_buffer.data() + oldSize, data, len);
824     }
825 
826     String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
827     m_buffer.clear();
828     return result;
829 }
830 
flush()831 String TextResourceDecoder::flush()
832 {
833    // If we can not identify the encoding even after a document is completely
834    // loaded, we need to detect the encoding if other conditions for
835    // autodetection is satisfied.
836     if (m_buffer.size() && shouldAutoDetect()
837         && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
838          TextEncoding detectedEncoding;
839          if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
840                                 m_hintEncoding, &detectedEncoding))
841              setEncoding(detectedEncoding, AutoDetectedEncoding);
842     }
843 
844     if (!m_codec)
845         m_codec.set(newTextCodec(m_encoding).release());
846 
847     String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
848     m_buffer.clear();
849     m_codec.clear();
850     m_checkedForBOM = false; // Skip BOM again when re-decoding.
851     return result;
852 }
853 
854 }
855