1 /*
2 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextEncoding.h"
29
30 #include "CString.h"
31 #include "PlatformString.h"
32 #include "TextCodec.h"
33 #include "TextDecoder.h"
34 #include "TextEncodingRegistry.h"
35 #if USE(ICU_UNICODE)
36 #include <unicode/unorm.h>
37 #elif USE(QT4_UNICODE)
38 #include <QString>
39 #endif
40 #include <wtf/HashSet.h>
41 #include <wtf/OwnPtr.h>
42 #include <wtf/StdLibExtras.h>
43
44 namespace WebCore {
45
addEncodingName(HashSet<const char * > & set,const char * name)46 static void addEncodingName(HashSet<const char*>& set, const char* name)
47 {
48 const char* atomicName = atomicCanonicalTextEncodingName(name);
49 if (atomicName)
50 set.add(atomicName);
51 }
52
UTF7Encoding()53 static const TextEncoding& UTF7Encoding()
54 {
55 static TextEncoding globalUTF7Encoding("UTF-7");
56 return globalUTF7Encoding;
57 }
58
TextEncoding(const char * name)59 TextEncoding::TextEncoding(const char* name)
60 : m_name(atomicCanonicalTextEncodingName(name))
61 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
62 {
63 }
64
TextEncoding(const String & name)65 TextEncoding::TextEncoding(const String& name)
66 : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
67 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
68 {
69 }
70
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const71 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
72 {
73 if (!m_name)
74 return String();
75
76 return TextDecoder(*this).decode(data, length, true, stopOnError, sawError);
77 }
78
encode(const UChar * characters,size_t length,UnencodableHandling handling) const79 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
80 {
81 if (!m_name)
82 return CString();
83
84 if (!length)
85 return "";
86
87 #if USE(ICU_UNICODE)
88 // FIXME: What's the right place to do normalization?
89 // It's a little strange to do it inside the encode function.
90 // Perhaps normalization should be an explicit step done before calling encode.
91
92 const UChar* source = characters;
93 size_t sourceLength = length;
94
95 Vector<UChar> normalizedCharacters;
96
97 UErrorCode err = U_ZERO_ERROR;
98 if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
99 // First try using the length of the original string, since normalization to NFC rarely increases length.
100 normalizedCharacters.grow(sourceLength);
101 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
102 if (err == U_BUFFER_OVERFLOW_ERROR) {
103 err = U_ZERO_ERROR;
104 normalizedCharacters.resize(normalizedLength);
105 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
106 }
107 ASSERT(U_SUCCESS(err));
108
109 source = normalizedCharacters.data();
110 sourceLength = normalizedLength;
111 }
112 return newTextCodec(*this)->encode(source, sourceLength, handling);
113 #elif USE(QT4_UNICODE)
114 QString str(reinterpret_cast<const QChar*>(characters), length);
115 str = str.normalized(QString::NormalizationForm_C);
116 return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
117 #endif
118 }
119
usesVisualOrdering() const120 bool TextEncoding::usesVisualOrdering() const
121 {
122 if (noExtendedTextEncodingNameUsed())
123 return false;
124
125 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
126 return m_name == a;
127 }
128
isJapanese() const129 bool TextEncoding::isJapanese() const
130 {
131 if (noExtendedTextEncodingNameUsed())
132 return false;
133
134 DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
135 if (set.isEmpty()) {
136 addEncodingName(set, "x-mac-japanese");
137 addEncodingName(set, "cp932");
138 addEncodingName(set, "JIS_X0201");
139 addEncodingName(set, "JIS_X0208-1983");
140 addEncodingName(set, "JIS_X0208-1990");
141 addEncodingName(set, "JIS_X0212-1990");
142 addEncodingName(set, "JIS_C6226-1978");
143 addEncodingName(set, "Shift_JIS_X0213-2000");
144 addEncodingName(set, "ISO-2022-JP");
145 addEncodingName(set, "ISO-2022-JP-2");
146 addEncodingName(set, "ISO-2022-JP-1");
147 addEncodingName(set, "ISO-2022-JP-3");
148 addEncodingName(set, "EUC-JP");
149 addEncodingName(set, "Shift_JIS");
150 }
151 return m_name && set.contains(m_name);
152 }
153
backslashAsCurrencySymbol() const154 UChar TextEncoding::backslashAsCurrencySymbol() const
155 {
156 if (noExtendedTextEncodingNameUsed())
157 return '\\';
158
159 // The text encodings below treat backslash as a currency symbol.
160 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
161 static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
162 static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
163 return (m_name == a || m_name == b) ? 0x00A5 : '\\';
164 }
165
isNonByteBasedEncoding() const166 bool TextEncoding::isNonByteBasedEncoding() const
167 {
168 return *this == UTF16LittleEndianEncoding()
169 || *this == UTF16BigEndianEncoding()
170 || *this == UTF32BigEndianEncoding()
171 || *this == UTF32LittleEndianEncoding();
172 }
173
closestByteBasedEquivalent() const174 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
175 {
176 if (isNonByteBasedEncoding())
177 return UTF8Encoding();
178 return *this;
179 }
180
181 // HTML5 specifies that UTF-8 be used in form submission when a form is
182 // is a part of a document in UTF-16 probably because UTF-16 is not a
183 // byte-based encoding and can contain 0x00. By extension, the same
184 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
185 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const186 const TextEncoding& TextEncoding::encodingForFormSubmission() const
187 {
188 if (isNonByteBasedEncoding() || *this == UTF7Encoding())
189 return UTF8Encoding();
190 return *this;
191 }
192
ASCIIEncoding()193 const TextEncoding& ASCIIEncoding()
194 {
195 static TextEncoding globalASCIIEncoding("ASCII");
196 return globalASCIIEncoding;
197 }
198
Latin1Encoding()199 const TextEncoding& Latin1Encoding()
200 {
201 static TextEncoding globalLatin1Encoding("Latin-1");
202 return globalLatin1Encoding;
203 }
204
UTF16BigEndianEncoding()205 const TextEncoding& UTF16BigEndianEncoding()
206 {
207 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
208 return globalUTF16BigEndianEncoding;
209 }
210
UTF16LittleEndianEncoding()211 const TextEncoding& UTF16LittleEndianEncoding()
212 {
213 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
214 return globalUTF16LittleEndianEncoding;
215 }
216
UTF32BigEndianEncoding()217 const TextEncoding& UTF32BigEndianEncoding()
218 {
219 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
220 return globalUTF32BigEndianEncoding;
221 }
222
UTF32LittleEndianEncoding()223 const TextEncoding& UTF32LittleEndianEncoding()
224 {
225 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
226 return globalUTF32LittleEndianEncoding;
227 }
228
UTF8Encoding()229 const TextEncoding& UTF8Encoding()
230 {
231 static TextEncoding globalUTF8Encoding("UTF-8");
232 return globalUTF8Encoding;
233 }
234
WindowsLatin1Encoding()235 const TextEncoding& WindowsLatin1Encoding()
236 {
237 static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
238 return globalWindowsLatin1Encoding;
239 }
240
241 } // namespace WebCore
242