1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "TextCodecUTF8.h"
28
29 #include "TextCodecASCIIFastPath.h"
30 #include <wtf/text/CString.h>
31 #include <wtf/text/StringBuffer.h>
32 #include <wtf/unicode/CharacterNames.h>
33
34 using namespace WTF::Unicode;
35 using namespace std;
36
37 namespace WebCore {
38
// Sentinel returned by the sequence decoder for ill-formed input. It is
// negative, so it cannot compare equal to any decoded code point.
static const int nonCharacter = -1;
40
create(const TextEncoding &,const void *)41 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
42 {
43 return adoptPtr(new TextCodecUTF8);
44 }
45
registerEncodingNames(EncodingNameRegistrar registrar)46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
47 {
48 registrar("UTF-8", "UTF-8");
49
50 // Additional aliases that originally were present in the encoding
51 // table in WebKit on Macintosh, and subsequently added by
52 // TextCodecICU. Perhaps we can prove some are not used on the web
53 // and remove them.
54 registrar("unicode11utf8", "UTF-8");
55 registrar("unicode20utf8", "UTF-8");
56 registrar("utf8", "UTF-8");
57 registrar("x-unicode20utf8", "UTF-8");
58 }
59
registerCodecs(TextCodecRegistrar registrar)60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
61 {
62 registrar("UTF-8", create, 0);
63 }
64
// Maps the first byte of a sequence to the number of bytes the sequence is
// expected to occupy, or 0 when the byte cannot start a non-ASCII sequence.
//
//   0x00-0x7F  -> 0  (ASCII; not a non-ASCII sequence)
//   0x80-0xDF  -> 2  (0x80-0xC1 are reported as 2 here and rejected later by
//                     decodeNonASCIISequence)
//   0xE0-0xEF  -> 3
//   0xF0-0xF4  -> 4
//   0xF5-0xFF  -> 0  (never valid in UTF-8)
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    if (firstByte < 0x80)
        return 0;
    if (firstByte < 0xE0)
        return 2;
    if (firstByte < 0xF0)
        return 3;
    if (firstByte < 0xF5)
        return 4;
    return 0;
}
87
decodeNonASCIISequence(const uint8_t * sequence,unsigned length)88 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
89 {
90 ASSERT(!isASCII(sequence[0]));
91 if (length == 2) {
92 ASSERT(sequence[0] <= 0xDF);
93 if (sequence[0] < 0xC2)
94 return nonCharacter;
95 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
96 return nonCharacter;
97 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
98 }
99 if (length == 3) {
100 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
101 switch (sequence[0]) {
102 case 0xE0:
103 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
104 return nonCharacter;
105 break;
106 case 0xED:
107 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
108 return nonCharacter;
109 break;
110 default:
111 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
112 return nonCharacter;
113 }
114 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
115 return nonCharacter;
116 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
117 }
118 ASSERT(length == 4);
119 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
120 switch (sequence[0]) {
121 case 0xF0:
122 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
123 return nonCharacter;
124 break;
125 case 0xF4:
126 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
127 return nonCharacter;
128 break;
129 default:
130 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
131 return nonCharacter;
132 }
133 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
134 return nonCharacter;
135 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
136 return nonCharacter;
137 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
138 }
139
appendCharacter(UChar * destination,int character)140 static inline UChar* appendCharacter(UChar* destination, int character)
141 {
142 ASSERT(character != nonCharacter);
143 ASSERT(!U_IS_SURROGATE(character));
144 if (U_IS_BMP(character))
145 *destination++ = character;
146 else {
147 *destination++ = U16_LEAD(character);
148 *destination++ = U16_TRAIL(character);
149 }
150 return destination;
151 }
152
consumePartialSequenceByte()153 void TextCodecUTF8::consumePartialSequenceByte()
154 {
155 --m_partialSequenceSize;
156 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
157 }
158
handleError(UChar * & destination,bool stopOnError,bool & sawError)159 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
160 {
161 sawError = true;
162 if (stopOnError)
163 return;
164 // Each error generates a replacement character and consumes one byte.
165 *destination++ = replacementCharacter;
166 consumePartialSequenceByte();
167 }
168
handlePartialSequence(UChar * & destination,const uint8_t * & source,const uint8_t * end,bool flush,bool stopOnError,bool & sawError)169 void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
170 {
171 ASSERT(m_partialSequenceSize);
172 do {
173 if (isASCII(m_partialSequence[0])) {
174 *destination++ = m_partialSequence[0];
175 consumePartialSequenceByte();
176 continue;
177 }
178 int count = nonASCIISequenceLength(m_partialSequence[0]);
179 if (!count) {
180 handleError(destination, stopOnError, sawError);
181 if (stopOnError)
182 return;
183 continue;
184 }
185 if (count > m_partialSequenceSize) {
186 if (count - m_partialSequenceSize > end - source) {
187 if (!flush) {
188 // The new data is not enough to complete the sequence, so
189 // add it to the existing partial sequence.
190 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
191 m_partialSequenceSize += end - source;
192 return;
193 }
194 // An incomplete partial sequence at the end is an error.
195 handleError(destination, stopOnError, sawError);
196 if (stopOnError)
197 return;
198 continue;
199 }
200 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
201 source += count - m_partialSequenceSize;
202 m_partialSequenceSize = count;
203 }
204 int character = decodeNonASCIISequence(m_partialSequence, count);
205 if (character == nonCharacter) {
206 handleError(destination, stopOnError, sawError);
207 if (stopOnError)
208 return;
209 continue;
210 }
211 m_partialSequenceSize -= count;
212 destination = appendCharacter(destination, character);
213 } while (m_partialSequenceSize);
214 }
215
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String. Bytes left pending by a previous call (m_partialSequence) are
// decoded first.
//
// flush        When true, the outer loop repeats while partial bytes remain,
//              so a trailing incomplete sequence is reported as an error
//              instead of being kept for a later call.
// stopOnError  When true, decoding stops at the first ill-formed sequence and
//              the text decoded so far is returned.
// sawError     Set to true when an ill-formed sequence is found; this
//              function never resets it to false.
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    // Upper bound for the word-at-a-time ASCII loop below.
    // NOTE(review): presumably |end| rounded down to a machine-word boundary;
    // confirm against alignToMachineWord.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    UChar* destination = buffer.characters();

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes still pending means handlePartialSequence either stopped
            // on an error or is waiting for more input; nothing more to do.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy a whole machine word per iteration for as long as
                    // every byte in the word is ASCII.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop may have stopped on a non-ASCII byte;
                    // restart the outer loop to take the multi-byte path.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence runs past the end of this input: stash the
                    // bytes seen so far in the partial-sequence buffer.
                    ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination = appendCharacter(destination, character);
        }
        // When flushing, go around again so that bytes just stashed in the
        // partial-sequence buffer are processed by handlePartialSequence.
    } while (flush && m_partialSequenceSize);

    // Trim the buffer to the number of code units actually written.
    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);
}
295
encode(const UChar * characters,size_t length,UnencodableHandling)296 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
297 {
298 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
299 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
300 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
301 if (length > numeric_limits<size_t>::max() / 3)
302 CRASH();
303 Vector<uint8_t> bytes(length * 3);
304
305 size_t i = 0;
306 size_t bytesWritten = 0;
307 while (i < length) {
308 UChar32 character;
309 U16_NEXT(characters, i, length, character);
310 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
311 }
312
313 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
314 }
315
316 } // namespace WebCore
317