1 /*
2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "platform/text/TextEncodingDetector.h"
33
34 #include "wtf/text/TextEncoding.h"
35 #include <unicode/ucnv.h>
36 #include <unicode/ucsdet.h>
37
38 namespace blink {
39
detectTextEncoding(const char * data,size_t length,const char * hintEncodingName,WTF::TextEncoding * detectedEncoding)40 bool detectTextEncoding(const char* data, size_t length,
41 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
42 {
43 *detectedEncoding = WTF::TextEncoding();
44 int matchesCount = 0;
45 UErrorCode status = U_ZERO_ERROR;
46 UCharsetDetector* detector = ucsdet_open(&status);
47 if (U_FAILURE(status))
48 return false;
49 ucsdet_enableInputFilter(detector, true);
50 ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
51 if (U_FAILURE(status))
52 return false;
53
54 // FIXME: A few things we can do other than improving
55 // the ICU detector itself.
56 // 1. Use ucsdet_detectAll and pick the most likely one given
57 // "the context" (parent-encoding, referrer encoding, etc).
58 // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
59 // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
60 // encoding with a highest confidence among the detector-specific
61 // limited set of candidate encodings.
62 // Below is a partial implementation of the first part of what's outlined
63 // above.
64 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
65 if (U_FAILURE(status)) {
66 ucsdet_close(detector);
67 return false;
68 }
69
70 const char* encoding = 0;
71 if (hintEncodingName) {
72 WTF::TextEncoding hintEncoding(hintEncodingName);
73 // 10 is the minimum confidence value consistent with the codepoint
74 // allocation in a given encoding. The size of a chunk passed to
75 // us varies even for the same html file (apparently depending on
76 // the network load). When we're given a rather short chunk, we
77 // don't have a sufficiently reliable signal other than the fact that
78 // the chunk is consistent with a set of encodings. So, instead of
79 // setting an arbitrary threshold, we have to scan all the encodings
80 // consistent with the data.
81 const int32_t kThresold = 10;
82 for (int i = 0; i < matchesCount; ++i) {
83 int32_t confidence = ucsdet_getConfidence(matches[i], &status);
84 if (U_FAILURE(status)) {
85 status = U_ZERO_ERROR;
86 continue;
87 }
88 if (confidence < kThresold)
89 break;
90 const char* matchEncoding = ucsdet_getName(matches[i], &status);
91 if (U_FAILURE(status)) {
92 status = U_ZERO_ERROR;
93 continue;
94 }
95 if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
96 encoding = hintEncodingName;
97 break;
98 }
99 }
100 }
101 // If no match is found so far, just pick the top match.
102 // This can happen, say, when a parent frame in EUC-JP refers to
103 // a child frame in Shift_JIS and both frames do NOT specify the encoding
104 // making us resort to auto-detection (when it IS turned on).
105 if (!encoding && matchesCount > 0)
106 encoding = ucsdet_getName(matches[0], &status);
107 if (U_SUCCESS(status)) {
108 *detectedEncoding = WTF::TextEncoding(encoding);
109 ucsdet_close(detector);
110 return true;
111 }
112 ucsdet_close(detector);
113 return false;
114 }
115
116 }
117