1 /*
2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "wtf/text/TextEncodingRegistry.h"
29
30 #include "wtf/ASCIICType.h"
31 #include "wtf/CurrentTime.h"
32 #include "wtf/HashMap.h"
33 #include "wtf/HashSet.h"
34 #include "wtf/MainThread.h"
35 #include "wtf/StdLibExtras.h"
36 #include "wtf/StringExtras.h"
37 #include "wtf/ThreadingPrimitives.h"
38 #include "wtf/text/CString.h"
39 #include "wtf/text/TextCodecICU.h"
40 #include "wtf/text/TextCodecLatin1.h"
41 #include "wtf/text/TextCodecReplacement.h"
42 #include "wtf/text/TextCodecUTF16.h"
43 #include "wtf/text/TextCodecUTF8.h"
44 #include "wtf/text/TextCodecUserDefined.h"
45 #include "wtf/text/TextEncoding.h"
46
47 namespace WTF {
48
// Upper bound, in characters and not counting the terminating NUL, on the
// length of any encoding name or alias handled by this registry. Registration
// asserts against it and wide-string lookups size their scratch buffer from it.
const size_t maxEncodingNameLength = 63;
50
51 // Hash for all-ASCII strings that does case folding.
52 struct TextEncodingNameHash {
equalWTF::TextEncodingNameHash53 static bool equal(const char* s1, const char* s2)
54 {
55 char c1;
56 char c2;
57 do {
58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
59 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
60 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
61 c1 = toASCIILower(*s1++);
62 c2 = toASCIILower(*s2++);
63 if (c1 != c2)
64 return false;
65 #else
66 c1 = *s1++;
67 c2 = *s2++;
68 if (toASCIILower(c1) != toASCIILower(c2))
69 return false;
70 #endif
71 } while (c1 && c2);
72 return !c1 && !c2;
73 }
74
75 // This algorithm is the one-at-a-time hash from:
76 // http://burtleburtle.net/bob/hash/hashfaq.html
77 // http://burtleburtle.net/bob/hash/doobs.html
hashWTF::TextEncodingNameHash78 static unsigned hash(const char* s)
79 {
80 unsigned h = WTF::stringHashingStartValue;
81 for (;;) {
82 char c = *s++;
83 if (!c) {
84 h += (h << 3);
85 h ^= (h >> 11);
86 h += (h << 15);
87 return h;
88 }
89 h += toASCIILower(c);
90 h += (h << 10);
91 h ^= (h >> 6);
92 }
93 }
94
95 static const bool safeToCompareToEmptyOrDeleted = false;
96 };
97
98 struct TextCodecFactory {
99 NewTextCodecFunction function;
100 const void* additionalData;
TextCodecFactoryWTF::TextCodecFactory101 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
102 };
103
104 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
105 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
106
encodingRegistryMutex()107 static Mutex& encodingRegistryMutex()
108 {
109 // We don't have to use AtomicallyInitializedStatic here because
110 // this function is called on the main thread for any page before
111 // it is used in worker threads.
112 DEFINE_STATIC_LOCAL(Mutex, mutex, ());
113 return mutex;
114 }
115
116 static TextEncodingNameMap* textEncodingNameMap;
117 static TextCodecMap* textCodecMap;
118 static bool didExtendTextCodecMaps;
119
120 static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
121
122 #if ERROR_DISABLED
123
checkExistingName(const char *,const char *)124 static inline void checkExistingName(const char*, const char*) { }
125
126 #else
127
checkExistingName(const char * alias,const char * atomicName)128 static void checkExistingName(const char* alias, const char* atomicName)
129 {
130 const char* oldAtomicName = textEncodingNameMap->get(alias);
131 if (!oldAtomicName)
132 return;
133 if (oldAtomicName == atomicName)
134 return;
135 // Keep the warning silent about one case where we know this will happen.
136 if (strcmp(alias, "ISO-8859-8-I") == 0
137 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
138 && strcasecmp(atomicName, "iso-8859-8") == 0)
139 return;
140 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
141 }
142
143 #endif
144
// Returns true for aliases that must never be added to the name map.
// alias must be a non-null, NUL-terminated string.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends
    // (such as "ISO_2022,locale=ja,version=0" in ICU): any comma disqualifies.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this
    // name - and having it caused a compatibility problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
158
addToTextEncodingNameMap(const char * alias,const char * name)159 static void addToTextEncodingNameMap(const char* alias, const char* name)
160 {
161 ASSERT(strlen(alias) <= maxEncodingNameLength);
162 if (isUndesiredAlias(alias))
163 return;
164 const char* atomicName = textEncodingNameMap->get(name);
165 ASSERT(strcmp(alias, name) == 0 || atomicName);
166 if (!atomicName)
167 atomicName = name;
168 checkExistingName(alias, atomicName);
169 textEncodingNameMap->add(alias, atomicName);
170 }
171
addToTextCodecMap(const char * name,NewTextCodecFunction function,const void * additionalData)172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
173 {
174 const char* atomicName = textEncodingNameMap->get(name);
175 ASSERT(atomicName);
176 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
177 }
178
pruneBlacklistedCodecs()179 static void pruneBlacklistedCodecs()
180 {
181 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
182 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
183 if (!atomicName)
184 continue;
185
186 Vector<const char*> names;
187 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
188 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
189 for (; it != end; ++it) {
190 if (it->value == atomicName)
191 names.append(it->key);
192 }
193
194 textEncodingNameMap->removeAll(names);
195
196 textCodecMap->remove(atomicName);
197 }
198 }
199
buildBaseTextCodecMaps()200 static void buildBaseTextCodecMaps()
201 {
202 ASSERT(isMainThread());
203 ASSERT(!textCodecMap);
204 ASSERT(!textEncodingNameMap);
205
206 textCodecMap = new TextCodecMap;
207 textEncodingNameMap = new TextEncodingNameMap;
208
209 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
210 TextCodecLatin1::registerCodecs(addToTextCodecMap);
211
212 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
213 TextCodecUTF8::registerCodecs(addToTextCodecMap);
214
215 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
216 TextCodecUTF16::registerCodecs(addToTextCodecMap);
217
218 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
219 TextCodecUserDefined::registerCodecs(addToTextCodecMap);
220 }
221
// Returns true when alias is non-null and spells "replacement", compared
// without regard to case.
bool isReplacementEncoding(const char* alias)
{
    if (!alias)
        return false;
    return strcasecmp(alias, "replacement") == 0;
}
226
isReplacementEncoding(const String & alias)227 bool isReplacementEncoding(const String& alias)
228 {
229 return alias == "replacement";
230 }
231
extendTextCodecMaps()232 static void extendTextCodecMaps()
233 {
234 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
235 TextCodecReplacement::registerCodecs(addToTextCodecMap);
236
237 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
238 TextCodecICU::registerCodecs(addToTextCodecMap);
239
240 pruneBlacklistedCodecs();
241 }
242
newTextCodec(const TextEncoding & encoding)243 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
244 {
245 MutexLocker lock(encodingRegistryMutex());
246
247 ASSERT(textCodecMap);
248 TextCodecFactory factory = textCodecMap->get(encoding.name());
249 ASSERT(factory.function);
250 return factory.function(encoding, factory.additionalData);
251 }
252
atomicCanonicalTextEncodingName(const char * name)253 const char* atomicCanonicalTextEncodingName(const char* name)
254 {
255 if (!name || !name[0])
256 return 0;
257 if (!textEncodingNameMap)
258 buildBaseTextCodecMaps();
259
260 MutexLocker lock(encodingRegistryMutex());
261
262 if (const char* atomicName = textEncodingNameMap->get(name))
263 return atomicName;
264 if (didExtendTextCodecMaps)
265 return 0;
266 extendTextCodecMaps();
267 didExtendTextCodecMaps = true;
268 return textEncodingNameMap->get(name);
269 }
270
271 template <typename CharacterType>
atomicCanonicalTextEncodingName(const CharacterType * characters,size_t length)272 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
273 {
274 char buffer[maxEncodingNameLength + 1];
275 size_t j = 0;
276 for (size_t i = 0; i < length; ++i) {
277 CharacterType c = characters[i];
278 if (j == maxEncodingNameLength)
279 return 0;
280 buffer[j++] = c;
281 }
282 buffer[j] = 0;
283 return atomicCanonicalTextEncodingName(buffer);
284 }
285
atomicCanonicalTextEncodingName(const String & alias)286 const char* atomicCanonicalTextEncodingName(const String& alias)
287 {
288 if (!alias.length())
289 return 0;
290
291 if (alias.is8Bit())
292 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
293
294 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
295 }
296
noExtendedTextEncodingNameUsed()297 bool noExtendedTextEncodingNameUsed()
298 {
299 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
300 return !didExtendTextCodecMaps;
301 }
302
303 #ifndef NDEBUG
dumpTextEncodingNameMap()304 void dumpTextEncodingNameMap()
305 {
306 unsigned size = textEncodingNameMap->size();
307 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
308
309 MutexLocker lock(encodingRegistryMutex());
310
311 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
312 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
313 for (; it != end; ++it)
314 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
315 }
316 #endif
317
318 } // namespace WTF
319