1 /*
2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextEncodingRegistry.h"
29
30 #include "TextCodecLatin1.h"
31 #include "TextCodecUserDefined.h"
32 #include "TextCodecUTF16.h"
33 #include "TextCodecUTF8.h"
34 #include "TextEncoding.h"
35 #include <wtf/ASCIICType.h>
36 #include <wtf/HashMap.h>
37 #include <wtf/HashSet.h>
38 #include <wtf/StdLibExtras.h>
39 #include <wtf/StringExtras.h>
40 #include <wtf/Threading.h>
41
42 #if USE(ICU_UNICODE)
43 #include "TextCodecICU.h"
44 #endif
45 #if PLATFORM(MAC)
46 #include "TextCodecMac.h"
47 #endif
48 #if PLATFORM(QT)
49 #include "qt/TextCodecQt.h"
50 #endif
51 #if USE(GLIB_UNICODE)
52 #include "gtk/TextCodecGtk.h"
53 #endif
54 #if USE(BREWMP_UNICODE)
55 #include "brew/TextCodecBrew.h"
56 #endif
57 #if OS(WINCE) && !PLATFORM(QT)
58 #include "TextCodecWinCE.h"
59 #endif
60
61 #include <wtf/CurrentTime.h>
62 #include <wtf/text/CString.h>
63
64 using namespace WTF;
65
66 namespace WebCore {
67
68 const size_t maxEncodingNameLength = 63;
69
70 // Hash for all-ASCII strings that does case folding.
71 struct TextEncodingNameHash {
equalWebCore::TextEncodingNameHash72 static bool equal(const char* s1, const char* s2)
73 {
74 char c1;
75 char c2;
76 do {
77 c1 = *s1++;
78 c2 = *s2++;
79 if (toASCIILower(c1) != toASCIILower(c2))
80 return false;
81 } while (c1 && c2);
82 return !c1 && !c2;
83 }
84
85 // This algorithm is the one-at-a-time hash from:
86 // http://burtleburtle.net/bob/hash/hashfaq.html
87 // http://burtleburtle.net/bob/hash/doobs.html
hashWebCore::TextEncodingNameHash88 static unsigned hash(const char* s)
89 {
90 unsigned h = WTF::stringHashingStartValue;
91 for (;;) {
92 char c = *s++;
93 if (!c) {
94 h += (h << 3);
95 h ^= (h >> 11);
96 h += (h << 15);
97 return h;
98 }
99 h += toASCIILower(c);
100 h += (h << 10);
101 h ^= (h >> 6);
102 }
103 }
104
105 static const bool safeToCompareToEmptyOrDeleted = false;
106 };
107
108 struct TextCodecFactory {
109 NewTextCodecFunction function;
110 const void* additionalData;
TextCodecFactoryWebCore::TextCodecFactory111 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
112 };
113
114 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
115 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
116
encodingRegistryMutex()117 static Mutex& encodingRegistryMutex()
118 {
119 // We don't have to use AtomicallyInitializedStatic here because
120 // this function is called on the main thread for any page before
121 // it is used in worker threads.
122 DEFINE_STATIC_LOCAL(Mutex, mutex, ());
123 return mutex;
124 }
125
126 static TextEncodingNameMap* textEncodingNameMap;
127 static TextCodecMap* textCodecMap;
128 static bool didExtendTextCodecMaps;
129 static HashSet<const char*>* japaneseEncodings;
130 static HashSet<const char*>* nonBackslashEncodings;
131
132 static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
133
134 #if ERROR_DISABLED
135
checkExistingName(const char *,const char *)136 static inline void checkExistingName(const char*, const char*) { }
137
138 #else
139
checkExistingName(const char * alias,const char * atomicName)140 static void checkExistingName(const char* alias, const char* atomicName)
141 {
142 const char* oldAtomicName = textEncodingNameMap->get(alias);
143 if (!oldAtomicName)
144 return;
145 if (oldAtomicName == atomicName)
146 return;
147 // Keep the warning silent about one case where we know this will happen.
148 if (strcmp(alias, "ISO-8859-8-I") == 0
149 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
150 && strcasecmp(atomicName, "iso-8859-8") == 0)
151 return;
152 LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
153 }
154
155 #endif
156
// Returns true for aliases that must never be added to the name map.
static bool isUndesiredAlias(const char* alias)
{
    // Some back-ends support aliases with version numbers (such as
    // "ISO_2022,locale=ja,version=0" in ICU); anything containing a comma is rejected.
    if (strchr(alias, ','))
        return true;
    // 8859_1 is known to (at least) ICU, but other browsers don't support this name -
    // and having it caused a compatibility problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
170
addToTextEncodingNameMap(const char * alias,const char * name)171 static void addToTextEncodingNameMap(const char* alias, const char* name)
172 {
173 ASSERT(strlen(alias) <= maxEncodingNameLength);
174 if (isUndesiredAlias(alias))
175 return;
176 const char* atomicName = textEncodingNameMap->get(name);
177 ASSERT(strcmp(alias, name) == 0 || atomicName);
178 if (!atomicName)
179 atomicName = name;
180 checkExistingName(alias, atomicName);
181 textEncodingNameMap->add(alias, atomicName);
182 }
183
addToTextCodecMap(const char * name,NewTextCodecFunction function,const void * additionalData)184 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
185 {
186 const char* atomicName = textEncodingNameMap->get(name);
187 ASSERT(atomicName);
188 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
189 }
190
pruneBlacklistedCodecs()191 static void pruneBlacklistedCodecs()
192 {
193 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
194 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
195 if (!atomicName)
196 continue;
197
198 Vector<const char*> names;
199 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
200 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
201 for (; it != end; ++it) {
202 if (it->second == atomicName)
203 names.append(it->first);
204 }
205
206 size_t length = names.size();
207 for (size_t j = 0; j < length; ++j)
208 textEncodingNameMap->remove(names[j]);
209
210 textCodecMap->remove(atomicName);
211 }
212 }
213
buildBaseTextCodecMaps()214 static void buildBaseTextCodecMaps()
215 {
216 ASSERT(isMainThread());
217 ASSERT(!textCodecMap);
218 ASSERT(!textEncodingNameMap);
219
220 textCodecMap = new TextCodecMap;
221 textEncodingNameMap = new TextEncodingNameMap;
222
223 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
224 TextCodecLatin1::registerCodecs(addToTextCodecMap);
225
226 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
227 TextCodecUTF8::registerCodecs(addToTextCodecMap);
228
229 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
230 TextCodecUTF16::registerCodecs(addToTextCodecMap);
231
232 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
233 TextCodecUserDefined::registerCodecs(addToTextCodecMap);
234
235 #if USE(GLIB_UNICODE)
236 // FIXME: This is not needed. The code above covers all the base codecs.
237 TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
238 TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
239 #endif
240 }
241
addEncodingName(HashSet<const char * > * set,const char * name)242 static void addEncodingName(HashSet<const char*>* set, const char* name)
243 {
244 // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
245 const char* atomicName = textEncodingNameMap->get(name);
246 if (atomicName)
247 set->add(atomicName);
248 }
249
buildQuirksSets()250 static void buildQuirksSets()
251 {
252 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
253 // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
254
255 ASSERT(!japaneseEncodings);
256 ASSERT(!nonBackslashEncodings);
257
258 japaneseEncodings = new HashSet<const char*>;
259 addEncodingName(japaneseEncodings, "EUC-JP");
260 addEncodingName(japaneseEncodings, "ISO-2022-JP");
261 addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
262 addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
263 addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
264 addEncodingName(japaneseEncodings, "JIS_C6226-1978");
265 addEncodingName(japaneseEncodings, "JIS_X0201");
266 addEncodingName(japaneseEncodings, "JIS_X0208-1983");
267 addEncodingName(japaneseEncodings, "JIS_X0208-1990");
268 addEncodingName(japaneseEncodings, "JIS_X0212-1990");
269 addEncodingName(japaneseEncodings, "Shift_JIS");
270 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
271 addEncodingName(japaneseEncodings, "cp932");
272 addEncodingName(japaneseEncodings, "x-mac-japanese");
273
274 nonBackslashEncodings = new HashSet<const char*>;
275 // The text encodings below treat backslash as a currency symbol for IE compatibility.
276 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
277 addEncodingName(nonBackslashEncodings, "x-mac-japanese");
278 addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
279 addEncodingName(nonBackslashEncodings, "EUC-JP");
280 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
281 addEncodingName(nonBackslashEncodings, "Shift_JIS");
282 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
283 }
284
isJapaneseEncoding(const char * canonicalEncodingName)285 bool isJapaneseEncoding(const char* canonicalEncodingName)
286 {
287 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
288 }
289
shouldShowBackslashAsCurrencySymbolIn(const char * canonicalEncodingName)290 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
291 {
292 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
293 }
294
extendTextCodecMaps()295 static void extendTextCodecMaps()
296 {
297 #if USE(ICU_UNICODE)
298 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
299 TextCodecICU::registerCodecs(addToTextCodecMap);
300 #endif
301
302 #if USE(QT4_UNICODE)
303 TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
304 TextCodecQt::registerCodecs(addToTextCodecMap);
305 #endif
306
307 #if PLATFORM(MAC)
308 TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
309 TextCodecMac::registerCodecs(addToTextCodecMap);
310 #endif
311
312 #if USE(GLIB_UNICODE)
313 TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
314 TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
315 #endif
316
317 #if OS(WINCE) && !PLATFORM(QT)
318 TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap);
319 TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap);
320 #endif
321
322 pruneBlacklistedCodecs();
323 buildQuirksSets();
324 }
325
newTextCodec(const TextEncoding & encoding)326 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
327 {
328 MutexLocker lock(encodingRegistryMutex());
329
330 ASSERT(textCodecMap);
331 TextCodecFactory factory = textCodecMap->get(encoding.name());
332 ASSERT(factory.function);
333 return factory.function(encoding, factory.additionalData);
334 }
335
atomicCanonicalTextEncodingName(const char * name)336 const char* atomicCanonicalTextEncodingName(const char* name)
337 {
338 if (!name || !name[0])
339 return 0;
340 if (!textEncodingNameMap)
341 buildBaseTextCodecMaps();
342
343 MutexLocker lock(encodingRegistryMutex());
344
345 if (const char* atomicName = textEncodingNameMap->get(name))
346 return atomicName;
347 if (didExtendTextCodecMaps)
348 return 0;
349 extendTextCodecMaps();
350 didExtendTextCodecMaps = true;
351 return textEncodingNameMap->get(name);
352 }
353
atomicCanonicalTextEncodingName(const UChar * characters,size_t length)354 const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length)
355 {
356 char buffer[maxEncodingNameLength + 1];
357 size_t j = 0;
358 for (size_t i = 0; i < length; ++i) {
359 UChar c = characters[i];
360 if (j == maxEncodingNameLength)
361 return 0;
362 buffer[j++] = c;
363 }
364 buffer[j] = 0;
365 return atomicCanonicalTextEncodingName(buffer);
366 }
367
noExtendedTextEncodingNameUsed()368 bool noExtendedTextEncodingNameUsed()
369 {
370 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
371 return !didExtendTextCodecMaps;
372 }
373
374 #ifndef NDEBUG
dumpTextEncodingNameMap()375 void dumpTextEncodingNameMap()
376 {
377 unsigned size = textEncodingNameMap->size();
378 fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
379
380 MutexLocker lock(encodingRegistryMutex());
381
382 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
383 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
384 for (; it != end; ++it)
385 fprintf(stderr, "'%s' => '%s'\n", it->first, it->second);
386 }
387 #endif
388
389 } // namespace WebCore
390