• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 #include "wtf/text/TextEncoding.h"
30 
31 #include "wtf/text/TextEncodingRegistry.h"
32 #include <unicode/unorm.h>
33 #include "wtf/OwnPtr.h"
34 #include "wtf/StdLibExtras.h"
35 #include "wtf/text/CString.h"
36 #include "wtf/text/WTFString.h"
37 
38 namespace WTF {
39 
UTF7Encoding()40 static const TextEncoding& UTF7Encoding()
41 {
42     static TextEncoding globalUTF7Encoding("UTF-7");
43     return globalUTF7Encoding;
44 }
45 
TextEncoding(const char * name)46 TextEncoding::TextEncoding(const char* name)
47     : m_name(atomicCanonicalTextEncodingName(name))
48     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
49 {
50 }
51 
TextEncoding(const String & name)52 TextEncoding::TextEncoding(const String& name)
53     : m_name(atomicCanonicalTextEncodingName(name))
54     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
55 {
56 }
57 
decode(const char * data,size_t length,bool stopOnError,bool & sawError) const58 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
59 {
60     if (!m_name)
61         return String();
62 
63     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
64 }
65 
encode(const String & string,UnencodableHandling handling) const66 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
67 {
68     if (!m_name)
69         return CString();
70 
71     if (string.isEmpty())
72         return "";
73 
74     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
75     CString encodedString;
76     if (string.is8Bit())
77         encodedString = textCodec->encode(string.characters8(), string.length(), handling);
78     else
79         encodedString = textCodec->encode(string.characters16(), string.length(), handling);
80     return encodedString;
81 }
82 
normalizeAndEncode(const String & string,UnencodableHandling handling) const83 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
84 {
85     if (!m_name)
86         return CString();
87 
88     if (string.isEmpty())
89         return "";
90 
91     // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
92     // unaffected by NFC. This is effectively the same as saying that all
93     // Latin-1 text is already normalized to NFC.
94     // Source: http://unicode.org/reports/tr15/
95     if (string.is8Bit())
96         return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
97 
98     const UChar* source = string.characters16();
99     size_t length = string.length();
100 
101     Vector<UChar> normalizedCharacters;
102 
103     UErrorCode err = U_ZERO_ERROR;
104     if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
105         // First try using the length of the original string, since normalization to NFC rarely increases length.
106         normalizedCharacters.grow(length);
107         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
108         if (err == U_BUFFER_OVERFLOW_ERROR) {
109             err = U_ZERO_ERROR;
110             normalizedCharacters.resize(normalizedLength);
111             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
112         }
113         ASSERT(U_SUCCESS(err));
114 
115         source = normalizedCharacters.data();
116         length = normalizedLength;
117     }
118 
119     return newTextCodec(*this)->encode(source, length, handling);
120 }
121 
usesVisualOrdering() const122 bool TextEncoding::usesVisualOrdering() const
123 {
124     if (noExtendedTextEncodingNameUsed())
125         return false;
126 
127     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
128     return m_name == a;
129 }
130 
backslashAsCurrencySymbol() const131 UChar TextEncoding::backslashAsCurrencySymbol() const
132 {
133     return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
134 }
135 
isNonByteBasedEncoding() const136 bool TextEncoding::isNonByteBasedEncoding() const
137 {
138     if (noExtendedTextEncodingNameUsed()) {
139         return *this == UTF16LittleEndianEncoding()
140             || *this == UTF16BigEndianEncoding();
141     }
142 
143     return *this == UTF16LittleEndianEncoding()
144         || *this == UTF16BigEndianEncoding()
145         || *this == UTF32BigEndianEncoding()
146         || *this == UTF32LittleEndianEncoding();
147 }
148 
isUTF7Encoding() const149 bool TextEncoding::isUTF7Encoding() const
150 {
151     if (noExtendedTextEncodingNameUsed())
152         return false;
153 
154     return *this == UTF7Encoding();
155 }
156 
closestByteBasedEquivalent() const157 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
158 {
159     if (isNonByteBasedEncoding())
160         return UTF8Encoding();
161     return *this;
162 }
163 
164 // HTML5 specifies that UTF-8 be used in form submission when a form is
165 // is a part of a document in UTF-16 probably because UTF-16 is not a
166 // byte-based encoding and can contain 0x00. By extension, the same
167 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
168 // but it's fraught with problems and we'd rather steer clear of it.
encodingForFormSubmission() const169 const TextEncoding& TextEncoding::encodingForFormSubmission() const
170 {
171     if (isNonByteBasedEncoding() || isUTF7Encoding())
172         return UTF8Encoding();
173     return *this;
174 }
175 
ASCIIEncoding()176 const TextEncoding& ASCIIEncoding()
177 {
178     static TextEncoding globalASCIIEncoding("ASCII");
179     return globalASCIIEncoding;
180 }
181 
Latin1Encoding()182 const TextEncoding& Latin1Encoding()
183 {
184     static TextEncoding globalLatin1Encoding("latin1");
185     return globalLatin1Encoding;
186 }
187 
UTF16BigEndianEncoding()188 const TextEncoding& UTF16BigEndianEncoding()
189 {
190     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
191     return globalUTF16BigEndianEncoding;
192 }
193 
UTF16LittleEndianEncoding()194 const TextEncoding& UTF16LittleEndianEncoding()
195 {
196     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
197     return globalUTF16LittleEndianEncoding;
198 }
199 
UTF32BigEndianEncoding()200 const TextEncoding& UTF32BigEndianEncoding()
201 {
202     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
203     return globalUTF32BigEndianEncoding;
204 }
205 
UTF32LittleEndianEncoding()206 const TextEncoding& UTF32LittleEndianEncoding()
207 {
208     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
209     return globalUTF32LittleEndianEncoding;
210 }
211 
UTF8Encoding()212 const TextEncoding& UTF8Encoding()
213 {
214     static TextEncoding globalUTF8Encoding("UTF-8");
215     ASSERT(globalUTF8Encoding.isValid());
216     return globalUTF8Encoding;
217 }
218 
WindowsLatin1Encoding()219 const TextEncoding& WindowsLatin1Encoding()
220 {
221     static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
222     return globalWindowsLatin1Encoding;
223 }
224 
225 } // namespace WTF
226