• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextCodecMac.h"
29 
30 #include "CString.h"
31 #include "CharacterNames.h"
32 #include "CharsetData.h"
33 #include "PlatformString.h"
34 #include "ThreadGlobalData.h"
35 #include <wtf/Assertions.h>
36 #include <wtf/PassOwnPtr.h>
37 #include <wtf/Threading.h>
38 
39 using namespace std;
40 
41 namespace WebCore {
42 
43 // We need to keep this because ICU doesn't support some of the encodings that we need:
44 // <http://bugs.webkit.org/show_bug.cgi?id=4195>.
45 
// Element count of the on-stack scratch buffer that the String-returning decode() fills on each pass.
const size_t ConversionBufferSize = 16 * 1024;
47 
cachedConverterTEC()48 static TECConverterWrapper& cachedConverterTEC()
49 {
50     return threadGlobalData().cachedConverterTEC();
51 }
52 
registerEncodingNames(EncodingNameRegistrar registrar)53 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
54 {
55     TECTextEncodingID lastEncoding = invalidEncoding;
56     const char* lastName = 0;
57 
58     for (size_t i = 0; CharsetTable[i].name; ++i) {
59         if (CharsetTable[i].encoding != lastEncoding) {
60             lastEncoding = CharsetTable[i].encoding;
61             lastName = CharsetTable[i].name;
62         }
63         registrar(CharsetTable[i].name, lastName);
64     }
65 }
66 
newTextCodecMac(const TextEncoding &,const void * additionalData)67 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
68 {
69     return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData));
70 }
71 
registerCodecs(TextCodecRegistrar registrar)72 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
73 {
74     TECTextEncodingID lastEncoding = invalidEncoding;
75 
76     for (size_t i = 0; CharsetTable[i].name; ++i)
77         if (CharsetTable[i].encoding != lastEncoding) {
78             registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
79             lastEncoding = CharsetTable[i].encoding;
80         }
81 }
82 
TextCodecMac(TECTextEncodingID encoding)83 TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
84     : m_encoding(encoding)
85     , m_numBufferedBytes(0)
86     , m_converterTEC(0)
87 {
88 }
89 
~TextCodecMac()90 TextCodecMac::~TextCodecMac()
91 {
92     releaseTECConverter();
93 }
94 
releaseTECConverter() const95 void TextCodecMac::releaseTECConverter() const
96 {
97     if (m_converterTEC) {
98         TECConverterWrapper& cachedConverter = cachedConverterTEC();
99         if (cachedConverter.converter)
100             TECDisposeConverter(cachedConverter.converter);
101         cachedConverter.converter = m_converterTEC;
102         cachedConverter.encoding = m_encoding;
103         m_converterTEC = 0;
104     }
105 }
106 
createTECConverter() const107 OSStatus TextCodecMac::createTECConverter() const
108 {
109     TECConverterWrapper& cachedConverter = cachedConverterTEC();
110 
111     bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
112     cachedConverter.encoding = invalidEncoding;
113 
114     if (cachedEncodingEqual && cachedConverter.converter) {
115         m_converterTEC = cachedConverter.converter;
116         cachedConverter.converter = 0;
117 
118         TECClearConverterContextInfo(m_converterTEC);
119     } else {
120         OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
121             CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
122         if (status)
123             return status;
124 
125         TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
126     }
127 
128     return noErr;
129 }
130 
decode(const unsigned char * inputBuffer,int inputBufferLength,int & inputLength,void * outputBuffer,int outputBufferLength,int & outputLength)131 OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
132     void *outputBuffer, int outputBufferLength, int& outputLength)
133 {
134     OSStatus status;
135     unsigned long bytesRead = 0;
136     unsigned long bytesWritten = 0;
137 
138     if (m_numBufferedBytes != 0) {
139         // Finish converting a partial character that's in our buffer.
140 
141         // First, fill the partial character buffer with as many bytes as are available.
142         ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
143         const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
144         const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
145         ASSERT(bytesToPutInBuffer != 0);
146         memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
147 
148         // Now, do a conversion on the buffer.
149         status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
150             reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
151         ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
152 
153         if (status == kTECPartialCharErr && bytesRead == 0) {
154             // Handle the case where the partial character was not converted.
155             if (bytesToPutInBuffer >= spaceInBuffer) {
156                 LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
157                 m_numBufferedBytes = 0;
158                 status = kTECUnmappableElementErr; // should never happen, but use this error code
159             } else {
160                 // Tell the caller we read all the source bytes and keep them in the buffer.
161                 m_numBufferedBytes += bytesToPutInBuffer;
162                 bytesRead = bytesToPutInBuffer;
163                 status = noErr;
164             }
165         } else {
166             // We are done with the partial character buffer.
167             // Also, we have read some of the bytes from the main buffer.
168             if (bytesRead > m_numBufferedBytes) {
169                 bytesRead -= m_numBufferedBytes;
170             } else {
171                 LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
172                 bytesRead = 0;
173             }
174             m_numBufferedBytes = 0;
175             if (status == kTECPartialCharErr) {
176                 // While there may be a partial character problem in the small buffer,
177                 // we have to try again and not get confused and think there is a partial
178                 // character problem in the large buffer.
179                 status = noErr;
180             }
181         }
182     } else {
183         status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
184             static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
185         ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
186     }
187 
188     // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
189     if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
190         status = kTECOutputBufferFullStatus;
191 
192     inputLength = bytesRead;
193     outputLength = bytesWritten;
194     return status;
195 }
196 
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)197 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
198 {
199     // Get a converter for the passed-in encoding.
200     if (!m_converterTEC && createTECConverter() != noErr)
201         return String();
202 
203     Vector<UChar> result;
204 
205     const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
206     int sourceLength = length;
207     bool bufferWasFull = false;
208     UniChar buffer[ConversionBufferSize];
209 
210     while ((sourceLength || bufferWasFull) && !sawError) {
211         int bytesRead = 0;
212         int bytesWritten = 0;
213         OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
214         ASSERT(bytesRead <= sourceLength);
215         sourcePointer += bytesRead;
216         sourceLength -= bytesRead;
217 
218         switch (status) {
219             case noErr:
220             case kTECOutputBufferFullStatus:
221                 break;
222             case kTextMalformedInputErr:
223             case kTextUndefinedElementErr:
224                 // FIXME: Put FFFD character into the output string in this case?
225                 TECClearConverterContextInfo(m_converterTEC);
226                 if (stopOnError) {
227                     sawError = true;
228                     break;
229                 }
230                 if (sourceLength) {
231                     sourcePointer += 1;
232                     sourceLength -= 1;
233                 }
234                 break;
235             case kTECPartialCharErr: {
236                 // Put the partial character into the buffer.
237                 ASSERT(m_numBufferedBytes == 0);
238                 const int bufferSize = sizeof(m_numBufferedBytes);
239                 if (sourceLength < bufferSize) {
240                     memcpy(m_bufferedBytes, sourcePointer, sourceLength);
241                     m_numBufferedBytes = sourceLength;
242                 } else {
243                     LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
244                 }
245                 sourceLength = 0;
246                 break;
247             }
248             default:
249                 sawError = true;
250                 return String();
251         }
252 
253         ASSERT(!(bytesWritten % sizeof(UChar)));
254         result.append(buffer, bytesWritten / sizeof(UChar));
255 
256         bufferWasFull = status == kTECOutputBufferFullStatus;
257     }
258 
259     if (flush) {
260         unsigned long bytesWritten = 0;
261         TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
262         ASSERT(!(bytesWritten % sizeof(UChar)));
263         result.append(buffer, bytesWritten / sizeof(UChar));
264     }
265 
266     String resultString = String::adopt(result);
267 
268     // <rdar://problem/3225472>
269     // Simplified Chinese pages use the code A3A0 to mean "full-width space".
270     // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
271     // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
272     if (m_encoding == kCFStringEncodingGB_18030_2000)
273         resultString.replace(0xE5E5, ideographicSpace);
274 
275     return resultString;
276 }
277 
encode(const UChar * characters,size_t length,UnencodableHandling handling)278 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
279 {
280     // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
281 
282     // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
283     // Encoding will change the yen sign back into a backslash.
284     String copy(characters, length);
285     copy.replace('\\', m_backslashAsCurrencySymbol);
286     RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString());
287 
288     CFIndex startPos = 0;
289     CFIndex charactersLeft = CFStringGetLength(cfs.get());
290     Vector<char> result;
291     size_t size = 0;
292     UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
293     while (charactersLeft > 0) {
294         CFRange range = CFRangeMake(startPos, charactersLeft);
295         CFIndex bufferLength;
296         CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
297 
298         result.grow(size + bufferLength);
299         unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
300         CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
301         size += bufferLength;
302 
303         if (charactersConverted != charactersLeft) {
304             unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
305             ++charactersConverted;
306             if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
307                 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
308                 if ((low & 0xFC00) == 0xDC00) { // is low surrogate
309                     badChar <<= 10;
310                     badChar += low;
311                     badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
312                     ++charactersConverted;
313                 }
314             }
315             UnencodableReplacementArray entity;
316             int entityLength = getUnencodableReplacement(badChar, handling, entity);
317             result.grow(size + entityLength);
318             memcpy(result.data() + size, entity, entityLength);
319             size += entityLength;
320         }
321 
322         startPos += charactersConverted;
323         charactersLeft -= charactersConverted;
324     }
325     return CString(result.data(), size);
326 }
327 
328 } // namespace WebCore
329