1 /*
2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29 #include "TextEncoding.h"
30
31 #include "PlatformString.h"
32 #include "TextCodec.h"
33 #include "TextEncodingRegistry.h"
34 #if USE(ICU_UNICODE)
35 #include <unicode/unorm.h>
36 #elif USE(QT4_UNICODE)
37 #include <QString>
38 #elif USE(GLIB_UNICODE)
39 #include <glib.h>
40 #include "GOwnPtr.h"
41 #endif
42 #include <wtf/text/CString.h>
43 #include <wtf/OwnPtr.h>
44 #include <wtf/StdLibExtras.h>
45
46 namespace WebCore {
47
UTF7Encoding()48 static const TextEncoding& UTF7Encoding()
49 {
50 static TextEncoding globalUTF7Encoding("UTF-7");
51 return globalUTF7Encoding;
52 }
53
TextEncoding(const char * name)54 TextEncoding::TextEncoding(const char* name)
55 : m_name(atomicCanonicalTextEncodingName(name))
56 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
57 {
58 }
59
TextEncoding(const String & name)60 TextEncoding::TextEncoding(const String& name)
61 : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
62 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
63 {
64 }
65
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const66 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
67 {
68 if (!m_name)
69 return String();
70
71 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
72 }
73
encode(const UChar * characters,size_t length,UnencodableHandling handling) const74 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
75 {
76 if (!m_name)
77 return CString();
78
79 if (!length)
80 return "";
81
82 #if USE(ICU_UNICODE)
83 // FIXME: What's the right place to do normalization?
84 // It's a little strange to do it inside the encode function.
85 // Perhaps normalization should be an explicit step done before calling encode.
86
87 const UChar* source = characters;
88 size_t sourceLength = length;
89
90 Vector<UChar> normalizedCharacters;
91
92 UErrorCode err = U_ZERO_ERROR;
93 if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
94 // First try using the length of the original string, since normalization to NFC rarely increases length.
95 normalizedCharacters.grow(sourceLength);
96 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
97 if (err == U_BUFFER_OVERFLOW_ERROR) {
98 err = U_ZERO_ERROR;
99 normalizedCharacters.resize(normalizedLength);
100 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
101 }
102 ASSERT(U_SUCCESS(err));
103
104 source = normalizedCharacters.data();
105 sourceLength = normalizedLength;
106 }
107 return newTextCodec(*this)->encode(source, sourceLength, handling);
108 #elif USE(QT4_UNICODE)
109 QString str(reinterpret_cast<const QChar*>(characters), length);
110 str = str.normalized(QString::NormalizationForm_C);
111 return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
112 #elif USE(GLIB_UNICODE)
113 GOwnPtr<char> UTF8Source;
114 UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
115 if (!UTF8Source) {
116 // If conversion to UTF-8 failed, try with the string without normalization
117 return newTextCodec(*this)->encode(characters, length, handling);
118 }
119
120 GOwnPtr<char> UTF8Normalized;
121 UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
122
123 long UTF16Length;
124 GOwnPtr<UChar> UTF16Normalized;
125 UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
126
127 return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
128 #elif OS(WINCE)
129 // normalization will be done by Windows CE API
130 OwnPtr<TextCodec> textCodec = newTextCodec(*this);
131 return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
132 #elif USE(BREWMP_UNICODE)
133 // FIXME: not sure if Brew MP normalizes the input string automatically
134 OwnPtr<TextCodec> textCodec = newTextCodec(*this);
135 return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
136 #endif
137 }
138
domName() const139 const char* TextEncoding::domName() const
140 {
141 if (noExtendedTextEncodingNameUsed())
142 return m_name;
143
144 // We treat EUC-KR as windows-949 (its superset), but need to expose
145 // the name 'EUC-KR' because the name 'windows-949' is not recognized by
146 // most Korean web servers even though they do use the encoding
147 // 'windows-949' with the name 'EUC-KR'.
148 // FIXME: This is not thread-safe. At the moment, this function is
149 // only accessed in a single thread, but eventually has to be made
150 // thread-safe along with usesVisualOrdering().
151 static const char* const a = atomicCanonicalTextEncodingName("windows-949");
152 if (m_name == a)
153 return "EUC-KR";
154 return m_name;
155 }
156
usesVisualOrdering() const157 bool TextEncoding::usesVisualOrdering() const
158 {
159 if (noExtendedTextEncodingNameUsed())
160 return false;
161
162 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
163 return m_name == a;
164 }
165
isJapanese() const166 bool TextEncoding::isJapanese() const
167 {
168 return isJapaneseEncoding(m_name);
169 }
170
backslashAsCurrencySymbol() const171 UChar TextEncoding::backslashAsCurrencySymbol() const
172 {
173 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
174 }
175
isNonByteBasedEncoding() const176 bool TextEncoding::isNonByteBasedEncoding() const
177 {
178 if (noExtendedTextEncodingNameUsed()) {
179 return *this == UTF16LittleEndianEncoding()
180 || *this == UTF16BigEndianEncoding();
181 }
182
183 return *this == UTF16LittleEndianEncoding()
184 || *this == UTF16BigEndianEncoding()
185 || *this == UTF32BigEndianEncoding()
186 || *this == UTF32LittleEndianEncoding();
187 }
188
isUTF7Encoding() const189 bool TextEncoding::isUTF7Encoding() const
190 {
191 if (noExtendedTextEncodingNameUsed())
192 return false;
193
194 return *this == UTF7Encoding();
195 }
196
closestByteBasedEquivalent() const197 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
198 {
199 if (isNonByteBasedEncoding())
200 return UTF8Encoding();
201 return *this;
202 }
203
204 // HTML5 specifies that UTF-8 be used in form submission when a form is
205 // is a part of a document in UTF-16 probably because UTF-16 is not a
206 // byte-based encoding and can contain 0x00. By extension, the same
207 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
208 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const209 const TextEncoding& TextEncoding::encodingForFormSubmission() const
210 {
211 if (isNonByteBasedEncoding() || isUTF7Encoding())
212 return UTF8Encoding();
213 return *this;
214 }
215
ASCIIEncoding()216 const TextEncoding& ASCIIEncoding()
217 {
218 static TextEncoding globalASCIIEncoding("ASCII");
219 return globalASCIIEncoding;
220 }
221
Latin1Encoding()222 const TextEncoding& Latin1Encoding()
223 {
224 static TextEncoding globalLatin1Encoding("latin1");
225 return globalLatin1Encoding;
226 }
227
UTF16BigEndianEncoding()228 const TextEncoding& UTF16BigEndianEncoding()
229 {
230 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
231 return globalUTF16BigEndianEncoding;
232 }
233
UTF16LittleEndianEncoding()234 const TextEncoding& UTF16LittleEndianEncoding()
235 {
236 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
237 return globalUTF16LittleEndianEncoding;
238 }
239
UTF32BigEndianEncoding()240 const TextEncoding& UTF32BigEndianEncoding()
241 {
242 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
243 return globalUTF32BigEndianEncoding;
244 }
245
UTF32LittleEndianEncoding()246 const TextEncoding& UTF32LittleEndianEncoding()
247 {
248 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
249 return globalUTF32LittleEndianEncoding;
250 }
251
UTF8Encoding()252 const TextEncoding& UTF8Encoding()
253 {
254 static TextEncoding globalUTF8Encoding("UTF-8");
255 ASSERT(globalUTF8Encoding.isValid());
256 return globalUTF8Encoding;
257 }
258
WindowsLatin1Encoding()259 const TextEncoding& WindowsLatin1Encoding()
260 {
261 static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
262 return globalWindowsLatin1Encoding;
263 }
264
265 } // namespace WebCore
266