1 /*
2 * Copyright (C) 2004, 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "TextDecoder.h"
28
29 #include "TextEncodingRegistry.h"
30
// FIXME: Would be nice to also handle the BOM for UTF-7 (UTF-8, UTF-16 and UTF-32 BOMs are detected in checkForBOM).
32
33 namespace WebCore {
34
TextDecoder(const TextEncoding & encoding)35 TextDecoder::TextDecoder(const TextEncoding& encoding)
36 : m_encoding(encoding)
37 , m_checkedForBOM(false)
38 , m_numBufferedBytes(0)
39 {
40 }
41
reset(const TextEncoding & encoding)42 void TextDecoder::reset(const TextEncoding& encoding)
43 {
44 m_encoding = encoding;
45 m_codec.clear();
46 m_checkedForBOM = false;
47 m_numBufferedBytes = 0;
48 }
49
checkForBOM(const char * data,size_t length,bool flush,bool stopOnError,bool & sawError)50 String TextDecoder::checkForBOM(const char* data, size_t length, bool flush, bool stopOnError, bool& sawError)
51 {
52 ASSERT(!m_checkedForBOM);
53
54 // Check to see if we found a BOM.
55 size_t numBufferedBytes = m_numBufferedBytes;
56 size_t buf1Len = numBufferedBytes;
57 size_t buf2Len = length;
58 const unsigned char* buf1 = m_bufferedBytes;
59 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
60 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
61 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
62 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
63 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
64
65 const TextEncoding* encodingConsideringBOM = &m_encoding;
66 bool foundBOM = true;
67 size_t lengthOfBOM = 0;
68 if (c1 == 0xFF && c2 == 0xFE) {
69 if (c3 != 0 || c4 != 0) {
70 encodingConsideringBOM = &UTF16LittleEndianEncoding();
71 lengthOfBOM = 2;
72 } else if (numBufferedBytes + length > sizeof(m_bufferedBytes)) {
73 encodingConsideringBOM = &UTF32LittleEndianEncoding();
74 lengthOfBOM = 4;
75 } else
76 foundBOM = false;
77 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
78 encodingConsideringBOM = &UTF8Encoding();
79 lengthOfBOM = 3;
80 } else if (c1 == 0xFE && c2 == 0xFF) {
81 encodingConsideringBOM = &UTF16BigEndianEncoding();
82 lengthOfBOM = 2;
83 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
84 encodingConsideringBOM = &UTF32BigEndianEncoding();
85 lengthOfBOM = 4;
86 } else
87 foundBOM = false;
88
89 if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
90 // Continue to look for the BOM.
91 memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
92 m_numBufferedBytes += length;
93 return "";
94 }
95
96 // Done checking for BOM.
97 m_codec.set(newTextCodec(*encodingConsideringBOM).release());
98 if (!m_codec)
99 return String();
100 m_checkedForBOM = true;
101
102 // Skip the BOM.
103 if (foundBOM) {
104 ASSERT(numBufferedBytes < lengthOfBOM);
105 size_t numUnbufferedBOMBytes = lengthOfBOM - numBufferedBytes;
106 ASSERT(numUnbufferedBOMBytes <= length);
107
108 data += numUnbufferedBOMBytes;
109 length -= numUnbufferedBOMBytes;
110 numBufferedBytes = 0;
111 m_numBufferedBytes = 0;
112 }
113
114 // Handle case where we have some buffered bytes to deal with.
115 if (numBufferedBytes) {
116 char bufferedBytes[sizeof(m_bufferedBytes)];
117 memcpy(bufferedBytes, m_bufferedBytes, numBufferedBytes);
118 m_numBufferedBytes = 0;
119
120 String bufferedResult = m_codec->decode(bufferedBytes, numBufferedBytes, false, stopOnError, sawError);
121 if (stopOnError && sawError)
122 return bufferedResult;
123 return bufferedResult + m_codec->decode(data, length, flush, stopOnError, sawError);
124 }
125
126 return m_codec->decode(data, length, flush, stopOnError, sawError);
127 }
128
129 } // namespace WebCore
130