• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 #include "TextEncoding.h"
30 
31 #include "CString.h"
32 #include "PlatformString.h"
33 #include "TextCodec.h"
34 #include "TextEncodingRegistry.h"
35 #if USE(ICU_UNICODE)
36 #include <unicode/unorm.h>
37 #elif USE(QT4_UNICODE)
38 #include <QString>
39 #elif USE(GLIB_UNICODE)
40 #include <glib.h>
41 #include <wtf/gtk/GOwnPtr.h>
42 #endif
43 #include <wtf/HashSet.h>
44 #include <wtf/OwnPtr.h>
45 #include <wtf/StdLibExtras.h>
46 
47 namespace WebCore {
48 
addEncodingName(HashSet<const char * > & set,const char * name)49 static void addEncodingName(HashSet<const char*>& set, const char* name)
50 {
51     const char* atomicName = atomicCanonicalTextEncodingName(name);
52     if (atomicName)
53         set.add(atomicName);
54 }
55 
UTF7Encoding()56 static const TextEncoding& UTF7Encoding()
57 {
58     static TextEncoding globalUTF7Encoding("UTF-7");
59     return globalUTF7Encoding;
60 }
61 
TextEncoding(const char * name)62 TextEncoding::TextEncoding(const char* name)
63     : m_name(atomicCanonicalTextEncodingName(name))
64     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
65 {
66 }
67 
TextEncoding(const String & name)68 TextEncoding::TextEncoding(const String& name)
69     : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
70     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
71 {
72 }
73 
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const74 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
75 {
76     if (!m_name)
77         return String();
78 
79     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
80 }
81 
encode(const UChar * characters,size_t length,UnencodableHandling handling) const82 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
83 {
84     if (!m_name)
85         return CString();
86 
87     if (!length)
88         return "";
89 
90 #if USE(ICU_UNICODE)
91     // FIXME: What's the right place to do normalization?
92     // It's a little strange to do it inside the encode function.
93     // Perhaps normalization should be an explicit step done before calling encode.
94 
95     const UChar* source = characters;
96     size_t sourceLength = length;
97 
98     Vector<UChar> normalizedCharacters;
99 
100     UErrorCode err = U_ZERO_ERROR;
101     if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
102         // First try using the length of the original string, since normalization to NFC rarely increases length.
103         normalizedCharacters.grow(sourceLength);
104         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
105         if (err == U_BUFFER_OVERFLOW_ERROR) {
106             err = U_ZERO_ERROR;
107             normalizedCharacters.resize(normalizedLength);
108             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
109         }
110         ASSERT(U_SUCCESS(err));
111 
112         source = normalizedCharacters.data();
113         sourceLength = normalizedLength;
114     }
115     return newTextCodec(*this)->encode(source, sourceLength, handling);
116 #elif USE(QT4_UNICODE)
117     QString str(reinterpret_cast<const QChar*>(characters), length);
118     str = str.normalized(QString::NormalizationForm_C);
119     return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
120 #elif USE(GLIB_UNICODE)
121     GOwnPtr<char> UTF8Source;
122     UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
123 
124     GOwnPtr<char> UTF8Normalized;
125     UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
126 
127     long UTF16Length;
128     GOwnPtr<UChar> UTF16Normalized;
129     UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
130 
131     return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
132 #elif OS(WINCE)
133     // normalization will be done by Windows CE API
134     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
135     return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
136 #endif
137 }
138 
domName() const139 const char* TextEncoding::domName() const
140 {
141     if (noExtendedTextEncodingNameUsed())
142         return m_name;
143 
144     // We treat EUC-KR as windows-949 (its superset), but need to expose
145     // the name 'EUC-KR' because the name 'windows-949' is not recognized by
146     // most Korean web servers even though they do use the encoding
147     // 'windows-949' with the name 'EUC-KR'.
148     // FIXME: This is not thread-safe. At the moment, this function is
149     // only accessed in a single thread, but eventually has to be made
150     // thread-safe along with usesVisualOrdering().
151     static const char* const a = atomicCanonicalTextEncodingName("windows-949");
152     if (m_name == a)
153         return "EUC-KR";
154     return m_name;
155 }
156 
usesVisualOrdering() const157 bool TextEncoding::usesVisualOrdering() const
158 {
159     if (noExtendedTextEncodingNameUsed())
160         return false;
161 
162     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
163     return m_name == a;
164 }
165 
isJapanese() const166 bool TextEncoding::isJapanese() const
167 {
168     if (noExtendedTextEncodingNameUsed())
169         return false;
170 
171     DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
172     if (set.isEmpty()) {
173         addEncodingName(set, "x-mac-japanese");
174         addEncodingName(set, "cp932");
175         addEncodingName(set, "JIS_X0201");
176         addEncodingName(set, "JIS_X0208-1983");
177         addEncodingName(set, "JIS_X0208-1990");
178         addEncodingName(set, "JIS_X0212-1990");
179         addEncodingName(set, "JIS_C6226-1978");
180         addEncodingName(set, "Shift_JIS_X0213-2000");
181         addEncodingName(set, "ISO-2022-JP");
182         addEncodingName(set, "ISO-2022-JP-2");
183         addEncodingName(set, "ISO-2022-JP-1");
184         addEncodingName(set, "ISO-2022-JP-3");
185         addEncodingName(set, "EUC-JP");
186         addEncodingName(set, "Shift_JIS");
187     }
188     return m_name && set.contains(m_name);
189 }
190 
backslashAsCurrencySymbol() const191 UChar TextEncoding::backslashAsCurrencySymbol() const
192 {
193     if (noExtendedTextEncodingNameUsed())
194         return '\\';
195 
196     // The text encodings below treat backslash as a currency symbol.
197     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
198     static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
199     static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
200     return (m_name == a || m_name == b) ? 0x00A5 : '\\';
201 }
202 
isNonByteBasedEncoding() const203 bool TextEncoding::isNonByteBasedEncoding() const
204 {
205     if (noExtendedTextEncodingNameUsed()) {
206         return *this == UTF16LittleEndianEncoding()
207             || *this == UTF16BigEndianEncoding();
208     }
209 
210     return *this == UTF16LittleEndianEncoding()
211         || *this == UTF16BigEndianEncoding()
212         || *this == UTF32BigEndianEncoding()
213         || *this == UTF32LittleEndianEncoding();
214 }
215 
isUTF7Encoding() const216 bool TextEncoding::isUTF7Encoding() const
217 {
218     if (noExtendedTextEncodingNameUsed())
219         return false;
220 
221     return *this == UTF7Encoding();
222 }
223 
closestByteBasedEquivalent() const224 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
225 {
226     if (isNonByteBasedEncoding())
227         return UTF8Encoding();
228     return *this;
229 }
230 
231 // HTML5 specifies that UTF-8 be used in form submission when a form is
232 // is a part of a document in UTF-16 probably because UTF-16 is not a
233 // byte-based encoding and can contain 0x00. By extension, the same
234 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
235 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const236 const TextEncoding& TextEncoding::encodingForFormSubmission() const
237 {
238     if (isNonByteBasedEncoding() || isUTF7Encoding())
239         return UTF8Encoding();
240     return *this;
241 }
242 
ASCIIEncoding()243 const TextEncoding& ASCIIEncoding()
244 {
245     static TextEncoding globalASCIIEncoding("ASCII");
246     return globalASCIIEncoding;
247 }
248 
Latin1Encoding()249 const TextEncoding& Latin1Encoding()
250 {
251     static TextEncoding globalLatin1Encoding("Latin-1");
252     return globalLatin1Encoding;
253 }
254 
UTF16BigEndianEncoding()255 const TextEncoding& UTF16BigEndianEncoding()
256 {
257     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
258     return globalUTF16BigEndianEncoding;
259 }
260 
UTF16LittleEndianEncoding()261 const TextEncoding& UTF16LittleEndianEncoding()
262 {
263     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
264     return globalUTF16LittleEndianEncoding;
265 }
266 
UTF32BigEndianEncoding()267 const TextEncoding& UTF32BigEndianEncoding()
268 {
269     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
270     return globalUTF32BigEndianEncoding;
271 }
272 
UTF32LittleEndianEncoding()273 const TextEncoding& UTF32LittleEndianEncoding()
274 {
275     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
276     return globalUTF32LittleEndianEncoding;
277 }
278 
UTF8Encoding()279 const TextEncoding& UTF8Encoding()
280 {
281     static TextEncoding globalUTF8Encoding("UTF-8");
282     ASSERT(globalUTF8Encoding.isValid());
283     return globalUTF8Encoding;
284 }
285 
WindowsLatin1Encoding()286 const TextEncoding& WindowsLatin1Encoding()
287 {
288     static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
289     return globalWindowsLatin1Encoding;
290 }
291 
292 } // namespace WebCore
293