• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 #include "TextEncoding.h"
30 
31 #include "CString.h"
32 #include "PlatformString.h"
33 #include "TextCodec.h"
34 #include "TextEncodingRegistry.h"
35 #if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
36 #include <unicode/unorm.h>
37 #elif USE(QT4_UNICODE)
38 #include <QString>
39 #endif
40 #include <wtf/HashSet.h>
41 #include <wtf/OwnPtr.h>
42 #include <wtf/StdLibExtras.h>
43 
44 namespace WebCore {
45 
addEncodingName(HashSet<const char * > & set,const char * name)46 static void addEncodingName(HashSet<const char*>& set, const char* name)
47 {
48     const char* atomicName = atomicCanonicalTextEncodingName(name);
49     if (atomicName)
50         set.add(atomicName);
51 }
52 
UTF7Encoding()53 static const TextEncoding& UTF7Encoding()
54 {
55     static TextEncoding globalUTF7Encoding("UTF-7");
56     return globalUTF7Encoding;
57 }
58 
TextEncoding(const char * name)59 TextEncoding::TextEncoding(const char* name)
60     : m_name(atomicCanonicalTextEncodingName(name))
61     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
62 {
63 }
64 
TextEncoding(const String & name)65 TextEncoding::TextEncoding(const String& name)
66     : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
67     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
68 {
69 }
70 
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const71 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
72 {
73     if (!m_name)
74         return String();
75 
76     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
77 }
78 
encode(const UChar * characters,size_t length,UnencodableHandling handling) const79 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
80 {
81     if (!m_name)
82         return CString();
83 
84     if (!length)
85         return "";
86 
87 #if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
88     // FIXME: What's the right place to do normalization?
89     // It's a little strange to do it inside the encode function.
90     // Perhaps normalization should be an explicit step done before calling encode.
91 
92     const UChar* source = characters;
93     size_t sourceLength = length;
94 
95     Vector<UChar> normalizedCharacters;
96 
97     UErrorCode err = U_ZERO_ERROR;
98     if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
99         // First try using the length of the original string, since normalization to NFC rarely increases length.
100         normalizedCharacters.grow(sourceLength);
101         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
102         if (err == U_BUFFER_OVERFLOW_ERROR) {
103             err = U_ZERO_ERROR;
104             normalizedCharacters.resize(normalizedLength);
105             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
106         }
107         ASSERT(U_SUCCESS(err));
108 
109         source = normalizedCharacters.data();
110         sourceLength = normalizedLength;
111     }
112     return newTextCodec(*this)->encode(source, sourceLength, handling);
113 #elif USE(QT4_UNICODE)
114     QString str(reinterpret_cast<const QChar*>(characters), length);
115     str = str.normalized(QString::NormalizationForm_C);
116     return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
117 #elif PLATFORM(WINCE)
118     // normalization will be done by Windows CE API
119     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
120     return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
121 #endif
122 }
123 
domName() const124 const char* TextEncoding::domName() const
125 {
126     if (noExtendedTextEncodingNameUsed())
127         return m_name;
128 
129     // We treat EUC-KR as windows-949 (its superset), but need to expose
130     // the name 'EUC-KR' because the name 'windows-949' is not recognized by
131     // most Korean web servers even though they do use the encoding
132     // 'windows-949' with the name 'EUC-KR'.
133     // FIXME: This is not thread-safe. At the moment, this function is
134     // only accessed in a single thread, but eventually has to be made
135     // thread-safe along with usesVisualOrdering().
136     static const char* const a = atomicCanonicalTextEncodingName("windows-949");
137     if (m_name == a)
138         return "EUC-KR";
139     return m_name;
140 }
141 
usesVisualOrdering() const142 bool TextEncoding::usesVisualOrdering() const
143 {
144     if (noExtendedTextEncodingNameUsed())
145         return false;
146 
147     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
148     return m_name == a;
149 }
150 
isJapanese() const151 bool TextEncoding::isJapanese() const
152 {
153     if (noExtendedTextEncodingNameUsed())
154         return false;
155 
156     DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
157     if (set.isEmpty()) {
158         addEncodingName(set, "x-mac-japanese");
159         addEncodingName(set, "cp932");
160         addEncodingName(set, "JIS_X0201");
161         addEncodingName(set, "JIS_X0208-1983");
162         addEncodingName(set, "JIS_X0208-1990");
163         addEncodingName(set, "JIS_X0212-1990");
164         addEncodingName(set, "JIS_C6226-1978");
165         addEncodingName(set, "Shift_JIS_X0213-2000");
166         addEncodingName(set, "ISO-2022-JP");
167         addEncodingName(set, "ISO-2022-JP-2");
168         addEncodingName(set, "ISO-2022-JP-1");
169         addEncodingName(set, "ISO-2022-JP-3");
170         addEncodingName(set, "EUC-JP");
171         addEncodingName(set, "Shift_JIS");
172     }
173     return m_name && set.contains(m_name);
174 }
175 
backslashAsCurrencySymbol() const176 UChar TextEncoding::backslashAsCurrencySymbol() const
177 {
178     if (noExtendedTextEncodingNameUsed())
179         return '\\';
180 
181     // The text encodings below treat backslash as a currency symbol.
182     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
183     static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
184     static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
185     return (m_name == a || m_name == b) ? 0x00A5 : '\\';
186 }
187 
isNonByteBasedEncoding() const188 bool TextEncoding::isNonByteBasedEncoding() const
189 {
190     if (noExtendedTextEncodingNameUsed()) {
191         return *this == UTF16LittleEndianEncoding()
192             || *this == UTF16BigEndianEncoding();
193     }
194 
195     return *this == UTF16LittleEndianEncoding()
196         || *this == UTF16BigEndianEncoding()
197         || *this == UTF32BigEndianEncoding()
198         || *this == UTF32LittleEndianEncoding();
199 }
200 
isUTF7Encoding() const201 bool TextEncoding::isUTF7Encoding() const
202 {
203     if (noExtendedTextEncodingNameUsed())
204         return false;
205 
206     return *this == UTF7Encoding();
207 }
208 
closestByteBasedEquivalent() const209 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
210 {
211     if (isNonByteBasedEncoding())
212         return UTF8Encoding();
213     return *this;
214 }
215 
216 // HTML5 specifies that UTF-8 be used in form submission when a form is
217 // is a part of a document in UTF-16 probably because UTF-16 is not a
218 // byte-based encoding and can contain 0x00. By extension, the same
219 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
220 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const221 const TextEncoding& TextEncoding::encodingForFormSubmission() const
222 {
223     if (isNonByteBasedEncoding() || isUTF7Encoding())
224         return UTF8Encoding();
225     return *this;
226 }
227 
ASCIIEncoding()228 const TextEncoding& ASCIIEncoding()
229 {
230     static TextEncoding globalASCIIEncoding("ASCII");
231     return globalASCIIEncoding;
232 }
233 
Latin1Encoding()234 const TextEncoding& Latin1Encoding()
235 {
236     static TextEncoding globalLatin1Encoding("Latin-1");
237     return globalLatin1Encoding;
238 }
239 
UTF16BigEndianEncoding()240 const TextEncoding& UTF16BigEndianEncoding()
241 {
242     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
243     return globalUTF16BigEndianEncoding;
244 }
245 
UTF16LittleEndianEncoding()246 const TextEncoding& UTF16LittleEndianEncoding()
247 {
248     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
249     return globalUTF16LittleEndianEncoding;
250 }
251 
UTF32BigEndianEncoding()252 const TextEncoding& UTF32BigEndianEncoding()
253 {
254     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
255     return globalUTF32BigEndianEncoding;
256 }
257 
UTF32LittleEndianEncoding()258 const TextEncoding& UTF32LittleEndianEncoding()
259 {
260     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
261     return globalUTF32LittleEndianEncoding;
262 }
263 
UTF8Encoding()264 const TextEncoding& UTF8Encoding()
265 {
266     static TextEncoding globalUTF8Encoding("UTF-8");
267     return globalUTF8Encoding;
268 }
269 
WindowsLatin1Encoding()270 const TextEncoding& WindowsLatin1Encoding()
271 {
272     static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
273     return globalWindowsLatin1Encoding;
274 }
275 
276 } // namespace WebCore
277