1 /*
2 * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextCodecMac.h"
29
30 #include "CharsetData.h"
31 #include "PlatformString.h"
32 #include "ThreadGlobalData.h"
33 #include <wtf/Assertions.h>
34 #include <wtf/PassOwnPtr.h>
35 #include <wtf/RetainPtr.h>
36 #include <wtf/Threading.h>
37 #include <wtf/text/CString.h>
38 #include <wtf/unicode/CharacterNames.h>
39
40 using namespace std;
41
42 namespace WebCore {
43
44 // We need to keep this because ICU doesn't support some of the encodings that we need:
45 // <http://bugs.webkit.org/show_bug.cgi?id=4195>.
46
// Number of elements in the temporary UniChar buffer that the String-returning
// decode() fills on each conversion pass.
const size_t ConversionBufferSize = 16 * 1024;
48
cachedConverterTEC()49 static TECConverterWrapper& cachedConverterTEC()
50 {
51 return threadGlobalData().cachedConverterTEC();
52 }
53
registerEncodingNames(EncodingNameRegistrar registrar)54 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
55 {
56 TECTextEncodingID lastEncoding = invalidEncoding;
57 const char* lastName = 0;
58
59 for (size_t i = 0; CharsetTable[i].name; ++i) {
60 if (CharsetTable[i].encoding != lastEncoding) {
61 lastEncoding = CharsetTable[i].encoding;
62 lastName = CharsetTable[i].name;
63 }
64 registrar(CharsetTable[i].name, lastName);
65 }
66 }
67
newTextCodecMac(const TextEncoding &,const void * additionalData)68 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
69 {
70 return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData));
71 }
72
registerCodecs(TextCodecRegistrar registrar)73 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
74 {
75 TECTextEncodingID lastEncoding = invalidEncoding;
76
77 for (size_t i = 0; CharsetTable[i].name; ++i)
78 if (CharsetTable[i].encoding != lastEncoding) {
79 registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
80 lastEncoding = CharsetTable[i].encoding;
81 }
82 }
83
TextCodecMac(TECTextEncodingID encoding)84 TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
85 : m_encoding(encoding)
86 , m_numBufferedBytes(0)
87 , m_converterTEC(0)
88 {
89 }
90
~TextCodecMac()91 TextCodecMac::~TextCodecMac()
92 {
93 releaseTECConverter();
94 }
95
releaseTECConverter() const96 void TextCodecMac::releaseTECConverter() const
97 {
98 if (m_converterTEC) {
99 TECConverterWrapper& cachedConverter = cachedConverterTEC();
100 if (cachedConverter.converter)
101 TECDisposeConverter(cachedConverter.converter);
102 cachedConverter.converter = m_converterTEC;
103 cachedConverter.encoding = m_encoding;
104 m_converterTEC = 0;
105 }
106 }
107
createTECConverter() const108 OSStatus TextCodecMac::createTECConverter() const
109 {
110 TECConverterWrapper& cachedConverter = cachedConverterTEC();
111
112 bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
113 cachedConverter.encoding = invalidEncoding;
114
115 if (cachedEncodingEqual && cachedConverter.converter) {
116 m_converterTEC = cachedConverter.converter;
117 cachedConverter.converter = 0;
118
119 TECClearConverterContextInfo(m_converterTEC);
120 } else {
121 OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
122 CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
123 if (status)
124 return status;
125
126 TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
127 }
128
129 return noErr;
130 }
131
decode(const unsigned char * inputBuffer,int inputBufferLength,int & inputLength,void * outputBuffer,int outputBufferLength,int & outputLength)132 OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
133 void *outputBuffer, int outputBufferLength, int& outputLength)
134 {
135 OSStatus status;
136 unsigned long bytesRead = 0;
137 unsigned long bytesWritten = 0;
138
139 if (m_numBufferedBytes != 0) {
140 // Finish converting a partial character that's in our buffer.
141
142 // First, fill the partial character buffer with as many bytes as are available.
143 ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
144 const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
145 const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
146 ASSERT(bytesToPutInBuffer != 0);
147 memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
148
149 // Now, do a conversion on the buffer.
150 status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
151 reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
152 ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
153
154 if (status == kTECPartialCharErr && bytesRead == 0) {
155 // Handle the case where the partial character was not converted.
156 if (bytesToPutInBuffer >= spaceInBuffer) {
157 LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
158 m_numBufferedBytes = 0;
159 status = kTECUnmappableElementErr; // should never happen, but use this error code
160 } else {
161 // Tell the caller we read all the source bytes and keep them in the buffer.
162 m_numBufferedBytes += bytesToPutInBuffer;
163 bytesRead = bytesToPutInBuffer;
164 status = noErr;
165 }
166 } else {
167 // We are done with the partial character buffer.
168 // Also, we have read some of the bytes from the main buffer.
169 if (bytesRead > m_numBufferedBytes) {
170 bytesRead -= m_numBufferedBytes;
171 } else {
172 LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
173 bytesRead = 0;
174 }
175 m_numBufferedBytes = 0;
176 if (status == kTECPartialCharErr) {
177 // While there may be a partial character problem in the small buffer,
178 // we have to try again and not get confused and think there is a partial
179 // character problem in the large buffer.
180 status = noErr;
181 }
182 }
183 } else {
184 status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
185 static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
186 ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
187 }
188
189 // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
190 if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
191 status = kTECOutputBufferFullStatus;
192
193 inputLength = bytesRead;
194 outputLength = bytesWritten;
195 return status;
196 }
197
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)198 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
199 {
200 // Get a converter for the passed-in encoding.
201 if (!m_converterTEC && createTECConverter() != noErr)
202 return String();
203
204 Vector<UChar> result;
205
206 const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
207 int sourceLength = length;
208 bool bufferWasFull = false;
209 UniChar buffer[ConversionBufferSize];
210
211 while ((sourceLength || bufferWasFull) && !sawError) {
212 int bytesRead = 0;
213 int bytesWritten = 0;
214 OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
215 ASSERT(bytesRead <= sourceLength);
216 sourcePointer += bytesRead;
217 sourceLength -= bytesRead;
218
219 switch (status) {
220 case noErr:
221 case kTECOutputBufferFullStatus:
222 break;
223 case kTextMalformedInputErr:
224 case kTextUndefinedElementErr:
225 // FIXME: Put FFFD character into the output string in this case?
226 TECClearConverterContextInfo(m_converterTEC);
227 if (stopOnError) {
228 sawError = true;
229 break;
230 }
231 if (sourceLength) {
232 sourcePointer += 1;
233 sourceLength -= 1;
234 }
235 break;
236 case kTECPartialCharErr: {
237 // Put the partial character into the buffer.
238 ASSERT(m_numBufferedBytes == 0);
239 const int bufferSize = sizeof(m_numBufferedBytes);
240 if (sourceLength < bufferSize) {
241 memcpy(m_bufferedBytes, sourcePointer, sourceLength);
242 m_numBufferedBytes = sourceLength;
243 } else {
244 LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
245 }
246 sourceLength = 0;
247 break;
248 }
249 default:
250 sawError = true;
251 return String();
252 }
253
254 ASSERT(!(bytesWritten % sizeof(UChar)));
255 result.append(buffer, bytesWritten / sizeof(UChar));
256
257 bufferWasFull = status == kTECOutputBufferFullStatus;
258 }
259
260 if (flush) {
261 unsigned long bytesWritten = 0;
262 TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
263 ASSERT(!(bytesWritten % sizeof(UChar)));
264 result.append(buffer, bytesWritten / sizeof(UChar));
265 }
266
267 String resultString = String::adopt(result);
268
269 // <rdar://problem/3225472>
270 // Simplified Chinese pages use the code A3A0 to mean "full-width space".
271 // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
272 // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
273 if (m_encoding == kCFStringEncodingGB_18030_2000)
274 resultString.replace(0xE5E5, ideographicSpace);
275
276 return resultString;
277 }
278
encode(const UChar * characters,size_t length,UnencodableHandling handling)279 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
280 {
281 // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
282
283 // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
284 // Encoding will change the yen sign back into a backslash.
285 String copy(characters, length);
286 copy.replace('\\', m_backslashAsCurrencySymbol);
287 RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString());
288
289 CFIndex startPos = 0;
290 CFIndex charactersLeft = CFStringGetLength(cfs.get());
291 Vector<char> result;
292 size_t size = 0;
293 UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
294 while (charactersLeft > 0) {
295 CFRange range = CFRangeMake(startPos, charactersLeft);
296 CFIndex bufferLength;
297 CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
298
299 result.grow(size + bufferLength);
300 unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
301 CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
302 size += bufferLength;
303
304 if (charactersConverted != charactersLeft) {
305 unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
306 ++charactersConverted;
307 if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
308 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
309 if ((low & 0xFC00) == 0xDC00) { // is low surrogate
310 badChar <<= 10;
311 badChar += low;
312 badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
313 ++charactersConverted;
314 }
315 }
316 UnencodableReplacementArray entity;
317 int entityLength = getUnencodableReplacement(badChar, handling, entity);
318 result.grow(size + entityLength);
319 memcpy(result.data() + size, entity, entityLength);
320 size += entityLength;
321 }
322
323 startPos += charactersConverted;
324 charactersLeft -= charactersConverted;
325 }
326 return CString(result.data(), size);
327 }
328
329 } // namespace WebCore
330