1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27 #include "wtf/text/TextCodecUTF8.h"
28
29 #include "wtf/text/TextCodecASCIIFastPath.h"
30 #include "wtf/text/CString.h"
31 #include "wtf/text/StringBuffer.h"
32 #include "wtf/unicode/CharacterNames.h"
33
34 using namespace WTF;
35 using namespace WTF::Unicode;
36 using namespace std;
37
38 namespace WTF {
39
// Sentinel returned by decodeNonASCIISequence() when the bytes do not form a
// valid UTF-8 sequence. It is negative, so it cannot be confused with a
// decoded code point.
const int nonCharacter = -1;
41
create(const TextEncoding &,const void *)42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
43 {
44 return adoptPtr(new TextCodecUTF8);
45 }
46
registerEncodingNames(EncodingNameRegistrar registrar)47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
48 {
49 registrar("UTF-8", "UTF-8");
50
51 // Additional aliases that originally were present in the encoding
52 // table in WebKit on Macintosh, and subsequently added by
53 // TextCodecICU. Perhaps we can prove some are not used on the web
54 // and remove them.
55 registrar("unicode11utf8", "UTF-8");
56 registrar("unicode20utf8", "UTF-8");
57 registrar("utf8", "UTF-8");
58 registrar("x-unicode20utf8", "UTF-8");
59
60 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
61 // and Firefox (24), but not in ICU 4.6.
62 registrar("unicode-1-1-utf-8", "UTF-8");
63 }
64
registerCodecs(TextCodecRegistrar registrar)65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
66 {
67 registrar("UTF-8", create, 0);
68 }
69
// Returns the number of bytes in the UTF-8 sequence introduced by |firstByte|,
// or 0 when |firstByte| cannot start a multi-byte sequence (ASCII bytes and
// 0xF5-0xFF).
//
// Note that 0x80-0xC1 are reported as two-byte leads even though they can
// never start a valid sequence; decodeNonASCIISequence() rejects any two-byte
// sequence whose first byte is below 0xC2.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    static const uint8_t sequenceLengthByLeadByte[256] = {
        // 0x00 - 0x7F: ASCII.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0x80 - 0xDF: handled as two-byte sequences.
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0 - 0xEF: three-byte sequences.
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0 - 0xF4: four-byte sequences; 0xF5 - 0xFF: never valid.
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    return sequenceLengthByLeadByte[firstByte];
}
92
decodeNonASCIISequence(const uint8_t * sequence,unsigned length)93 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
94 {
95 ASSERT(!isASCII(sequence[0]));
96 if (length == 2) {
97 ASSERT(sequence[0] <= 0xDF);
98 if (sequence[0] < 0xC2)
99 return nonCharacter;
100 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
101 return nonCharacter;
102 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
103 }
104 if (length == 3) {
105 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
106 switch (sequence[0]) {
107 case 0xE0:
108 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
109 return nonCharacter;
110 break;
111 case 0xED:
112 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
113 return nonCharacter;
114 break;
115 default:
116 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
117 return nonCharacter;
118 }
119 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
120 return nonCharacter;
121 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
122 }
123 ASSERT(length == 4);
124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
125 switch (sequence[0]) {
126 case 0xF0:
127 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
128 return nonCharacter;
129 break;
130 case 0xF4:
131 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
132 return nonCharacter;
133 break;
134 default:
135 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
136 return nonCharacter;
137 }
138 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
139 return nonCharacter;
140 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
141 return nonCharacter;
142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
143 }
144
appendCharacter(UChar * destination,int character)145 static inline UChar* appendCharacter(UChar* destination, int character)
146 {
147 ASSERT(character != nonCharacter);
148 ASSERT(!U_IS_SURROGATE(character));
149 if (U_IS_BMP(character))
150 *destination++ = character;
151 else {
152 *destination++ = U16_LEAD(character);
153 *destination++ = U16_TRAIL(character);
154 }
155 return destination;
156 }
157
consumePartialSequenceByte()158 void TextCodecUTF8::consumePartialSequenceByte()
159 {
160 --m_partialSequenceSize;
161 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
162 }
163
handleError(UChar * & destination,bool stopOnError,bool & sawError)164 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
165 {
166 sawError = true;
167 if (stopOnError)
168 return;
169 // Each error generates a replacement character and consumes one byte.
170 *destination++ = replacementCharacter;
171 consumePartialSequenceByte();
172 }
173
174 template <>
handlePartialSequence(LChar * & destination,const uint8_t * & source,const uint8_t * end,bool flush,bool,bool &)175 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
176 {
177 ASSERT(m_partialSequenceSize);
178 do {
179 if (isASCII(m_partialSequence[0])) {
180 *destination++ = m_partialSequence[0];
181 consumePartialSequenceByte();
182 continue;
183 }
184 int count = nonASCIISequenceLength(m_partialSequence[0]);
185 if (!count)
186 return true;
187
188 if (count > m_partialSequenceSize) {
189 if (count - m_partialSequenceSize > end - source) {
190 if (!flush) {
191 // The new data is not enough to complete the sequence, so
192 // add it to the existing partial sequence.
193 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
194 m_partialSequenceSize += end - source;
195 return false;
196 }
197 // An incomplete partial sequence at the end is an error, but it will create
198 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
199 // the error.
200 return true;
201 }
202 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
203 source += count - m_partialSequenceSize;
204 m_partialSequenceSize = count;
205 }
206 int character = decodeNonASCIISequence(m_partialSequence, count);
207 if ((character == nonCharacter) || (character > 0xff))
208 return true;
209
210 m_partialSequenceSize -= count;
211 *destination++ = character;
212 } while (m_partialSequenceSize);
213
214 return false;
215 }
216
217 template <>
handlePartialSequence(UChar * & destination,const uint8_t * & source,const uint8_t * end,bool flush,bool stopOnError,bool & sawError)218 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
219 {
220 ASSERT(m_partialSequenceSize);
221 do {
222 if (isASCII(m_partialSequence[0])) {
223 *destination++ = m_partialSequence[0];
224 consumePartialSequenceByte();
225 continue;
226 }
227 int count = nonASCIISequenceLength(m_partialSequence[0]);
228 if (!count) {
229 handleError(destination, stopOnError, sawError);
230 if (stopOnError)
231 return false;
232 continue;
233 }
234 if (count > m_partialSequenceSize) {
235 if (count - m_partialSequenceSize > end - source) {
236 if (!flush) {
237 // The new data is not enough to complete the sequence, so
238 // add it to the existing partial sequence.
239 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
240 m_partialSequenceSize += end - source;
241 return false;
242 }
243 // An incomplete partial sequence at the end is an error.
244 handleError(destination, stopOnError, sawError);
245 if (stopOnError)
246 return false;
247 continue;
248 }
249 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
250 source += count - m_partialSequenceSize;
251 m_partialSequenceSize = count;
252 }
253 int character = decodeNonASCIISequence(m_partialSequence, count);
254 if (character == nonCharacter) {
255 handleError(destination, stopOnError, sawError);
256 if (stopOnError)
257 return false;
258 continue;
259 }
260
261 m_partialSequenceSize -= count;
262 destination = appendCharacter(destination, character);
263 } while (m_partialSequenceSize);
264
265 return false;
266 }
267
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String.
//
// Decoding starts in an 8-bit (LChar) buffer. As soon as a code point above
// 0xFF is seen, or an error has to be replaced by replacementCharacter,
// control jumps to upConvertTo16Bit: the characters produced so far are copied
// into a 16-bit (UChar) buffer and decoding continues there from the same
// source position.
//
// A multi-byte sequence cut off by the end of the input is saved in
// m_partialSequence and completed on a later call. When |flush| is set, such
// a leftover is instead run through handlePartialSequence() before returning.
//
// |sawError| is set to true when invalid input is found; it is never reset
// here. With |stopOnError| set, decoding stops at the first error and the
// characters decoded so far are returned.
String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    // Upper bound for the word-at-a-time ASCII loops below.
    // NOTE(review): presumably |end| rounded down to a machine-word boundary - confirm in TextCodecASCIIFastPath.h.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // 8-bit phase. The loop body repeats only when flushing while a partial
    // sequence is still buffered.
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                // The buffered sequence cannot be represented in 8 bits.
                // Keep the advanced source position and let the 16-bit phase
                // process the buffered bytes.
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Still waiting for more bytes to complete the sequence.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy whole machine words for as long as every byte in them is ASCII.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop stopped on a non-ASCII byte; decode it on the next iteration.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence is cut off by the end of the input: stash
                    // the bytes seen so far for the next call (or for the
                    // flush pass of the enclosing loop).
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // The replacement character does not fit in 8 bits; the
                // 16-bit phase decodes these bytes again and emits it.
                goto upConvertTo16Bit;
            }
            // |source| is left untouched so the 16-bit phase decodes this sequence again.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    // Everything fitted in 8 bits: trim the buffer to what was written.
    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

upConvertTo16Bit:
    // 16-bit phase: same structure as above, but errors are replaced in
    // place and any code point can be stored.
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            // The return value is ignored; the UChar specialization always returns false.
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Either more input is needed or decoding stopped on an error.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // Incomplete sequence at the end of the input: keep it for later.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
427
428 template<typename CharType>
encodeCommon(const CharType * characters,size_t length)429 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
430 {
431 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
432 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
433 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
434 if (length > numeric_limits<size_t>::max() / 3)
435 CRASH();
436 Vector<uint8_t> bytes(length * 3);
437
438 size_t i = 0;
439 size_t bytesWritten = 0;
440 while (i < length) {
441 UChar32 character;
442 U16_NEXT(characters, i, length, character);
443 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
444 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
445 if (0xD800 <= character && character <= 0xDFFF)
446 character = replacementCharacter;
447 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
448 }
449
450 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
451 }
452
encode(const UChar * characters,size_t length,UnencodableHandling)453 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
454 {
455 return encodeCommon(characters, length);
456 }
457
encode(const LChar * characters,size_t length,UnencodableHandling)458 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
459 {
460 return encodeCommon(characters, length);
461 }
462
463 } // namespace WTF
464