1 /*
2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29 #include "wtf/text/TextEncoding.h"
30
31 #include "wtf/text/TextEncodingRegistry.h"
32 #include <unicode/unorm.h>
33 #include "wtf/OwnPtr.h"
34 #include "wtf/StdLibExtras.h"
35 #include "wtf/text/CString.h"
36 #include "wtf/text/WTFString.h"
37
38 namespace WTF {
39
UTF7Encoding()40 static const TextEncoding& UTF7Encoding()
41 {
42 static TextEncoding globalUTF7Encoding("UTF-7");
43 return globalUTF7Encoding;
44 }
45
TextEncoding(const char * name)46 TextEncoding::TextEncoding(const char* name)
47 : m_name(atomicCanonicalTextEncodingName(name))
48 {
49 // Aliases are valid, but not "replacement" itself.
50 if (m_name && isReplacementEncoding(name))
51 m_name = 0;
52 }
53
TextEncoding(const String & name)54 TextEncoding::TextEncoding(const String& name)
55 : m_name(atomicCanonicalTextEncodingName(name))
56 {
57 // Aliases are valid, but not "replacement" itself.
58 if (m_name && isReplacementEncoding(name))
59 m_name = 0;
60 }
61
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const62 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
63 {
64 if (!m_name)
65 return String();
66
67 return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
68 }
69
encode(const String & string,UnencodableHandling handling) const70 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
71 {
72 if (!m_name)
73 return CString();
74
75 if (string.isEmpty())
76 return "";
77
78 OwnPtr<TextCodec> textCodec = newTextCodec(*this);
79 CString encodedString;
80 if (string.is8Bit())
81 encodedString = textCodec->encode(string.characters8(), string.length(), handling);
82 else
83 encodedString = textCodec->encode(string.characters16(), string.length(), handling);
84 return encodedString;
85 }
86
normalizeAndEncode(const String & string,UnencodableHandling handling) const87 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
88 {
89 if (!m_name)
90 return CString();
91
92 if (string.isEmpty())
93 return "";
94
95 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
96 // unaffected by NFC. This is effectively the same as saying that all
97 // Latin-1 text is already normalized to NFC.
98 // Source: http://unicode.org/reports/tr15/
99 if (string.is8Bit())
100 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
101
102 const UChar* source = string.characters16();
103 size_t length = string.length();
104
105 Vector<UChar> normalizedCharacters;
106
107 UErrorCode err = U_ZERO_ERROR;
108 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
109 // First try using the length of the original string, since normalization to NFC rarely increases length.
110 normalizedCharacters.grow(length);
111 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
112 if (err == U_BUFFER_OVERFLOW_ERROR) {
113 err = U_ZERO_ERROR;
114 normalizedCharacters.resize(normalizedLength);
115 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
116 }
117 ASSERT(U_SUCCESS(err));
118
119 source = normalizedCharacters.data();
120 length = normalizedLength;
121 }
122
123 return newTextCodec(*this)->encode(source, length, handling);
124 }
125
usesVisualOrdering() const126 bool TextEncoding::usesVisualOrdering() const
127 {
128 if (noExtendedTextEncodingNameUsed())
129 return false;
130
131 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
132 return m_name == a;
133 }
134
isNonByteBasedEncoding() const135 bool TextEncoding::isNonByteBasedEncoding() const
136 {
137 if (noExtendedTextEncodingNameUsed()) {
138 return *this == UTF16LittleEndianEncoding()
139 || *this == UTF16BigEndianEncoding();
140 }
141
142 return *this == UTF16LittleEndianEncoding()
143 || *this == UTF16BigEndianEncoding()
144 || *this == UTF32BigEndianEncoding()
145 || *this == UTF32LittleEndianEncoding();
146 }
147
isUTF7Encoding() const148 bool TextEncoding::isUTF7Encoding() const
149 {
150 if (noExtendedTextEncodingNameUsed())
151 return false;
152
153 return *this == UTF7Encoding();
154 }
155
closestByteBasedEquivalent() const156 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
157 {
158 if (isNonByteBasedEncoding())
159 return UTF8Encoding();
160 return *this;
161 }
162
163 // HTML5 specifies that UTF-8 be used in form submission when a form is
164 // is a part of a document in UTF-16 probably because UTF-16 is not a
165 // byte-based encoding and can contain 0x00. By extension, the same
166 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
167 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const168 const TextEncoding& TextEncoding::encodingForFormSubmission() const
169 {
170 if (isNonByteBasedEncoding() || isUTF7Encoding())
171 return UTF8Encoding();
172 return *this;
173 }
174
ASCIIEncoding()175 const TextEncoding& ASCIIEncoding()
176 {
177 static TextEncoding globalASCIIEncoding("ASCII");
178 return globalASCIIEncoding;
179 }
180
Latin1Encoding()181 const TextEncoding& Latin1Encoding()
182 {
183 static TextEncoding globalLatin1Encoding("latin1");
184 return globalLatin1Encoding;
185 }
186
UTF16BigEndianEncoding()187 const TextEncoding& UTF16BigEndianEncoding()
188 {
189 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
190 return globalUTF16BigEndianEncoding;
191 }
192
UTF16LittleEndianEncoding()193 const TextEncoding& UTF16LittleEndianEncoding()
194 {
195 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
196 return globalUTF16LittleEndianEncoding;
197 }
198
UTF32BigEndianEncoding()199 const TextEncoding& UTF32BigEndianEncoding()
200 {
201 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
202 return globalUTF32BigEndianEncoding;
203 }
204
UTF32LittleEndianEncoding()205 const TextEncoding& UTF32LittleEndianEncoding()
206 {
207 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
208 return globalUTF32LittleEndianEncoding;
209 }
210
UTF8Encoding()211 const TextEncoding& UTF8Encoding()
212 {
213 static TextEncoding globalUTF8Encoding("UTF-8");
214 ASSERT(globalUTF8Encoding.isValid());
215 return globalUTF8Encoding;
216 }
217
WindowsLatin1Encoding()218 const TextEncoding& WindowsLatin1Encoding()
219 {
220 static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
221 return globalWindowsLatin1Encoding;
222 }
223
224 } // namespace WTF
225