• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "config.h"
27 #include "wtf/text/TextCodecUTF8.h"
28 
29 #include "wtf/text/TextCodecASCIIFastPath.h"
30 #include "wtf/text/CString.h"
31 #include "wtf/text/StringBuffer.h"
32 #include "wtf/unicode/CharacterNames.h"
33 
34 using namespace WTF;
35 using namespace WTF::Unicode;
36 using namespace std;
37 
38 namespace WTF {
39 
40 const int nonCharacter = -1;
41 
create(const TextEncoding &,const void *)42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
43 {
44     return adoptPtr(new TextCodecUTF8);
45 }
46 
registerEncodingNames(EncodingNameRegistrar registrar)47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
48 {
49     registrar("UTF-8", "UTF-8");
50 
51     // Additional aliases that originally were present in the encoding
52     // table in WebKit on Macintosh, and subsequently added by
53     // TextCodecICU. Perhaps we can prove some are not used on the web
54     // and remove them.
55     registrar("unicode11utf8", "UTF-8");
56     registrar("unicode20utf8", "UTF-8");
57     registrar("utf8", "UTF-8");
58     registrar("x-unicode20utf8", "UTF-8");
59 
60     // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
61     // and Firefox (24), but not in ICU 4.6.
62     registrar("unicode-1-1-utf-8", "UTF-8");
63 }
64 
registerCodecs(TextCodecRegistrar registrar)65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
66 {
67     registrar("UTF-8", create, 0);
68 }
69 
// Returns the number of bytes in the sequence introduced by |firstByte|,
// or 0 when no sequence length is associated with that byte:
//   0x00..0x7F -> 0 (ASCII; not a multi-byte lead)
//   0x80..0xDF -> 2 (note this includes 0x80..0xC1, which
//                    decodeNonASCIISequence() subsequently rejects)
//   0xE0..0xEF -> 3
//   0xF0..0xF4 -> 4
//   0xF5..0xFF -> 0 (never valid as a first byte)
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    if (firstByte < 0x80)
        return 0;
    if (firstByte < 0xE0)
        return 2;
    if (firstByte < 0xF0)
        return 3;
    if (firstByte <= 0xF4)
        return 4;
    return 0;
}
92 
decodeNonASCIISequence(const uint8_t * sequence,unsigned length)93 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
94 {
95     ASSERT(!isASCII(sequence[0]));
96     if (length == 2) {
97         ASSERT(sequence[0] <= 0xDF);
98         if (sequence[0] < 0xC2)
99             return nonCharacter;
100         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
101             return nonCharacter;
102         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
103     }
104     if (length == 3) {
105         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
106         switch (sequence[0]) {
107         case 0xE0:
108             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
109                 return nonCharacter;
110             break;
111         case 0xED:
112             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
113                 return nonCharacter;
114             break;
115         default:
116             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
117                 return nonCharacter;
118         }
119         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
120             return nonCharacter;
121         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
122     }
123     ASSERT(length == 4);
124     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
125     switch (sequence[0]) {
126     case 0xF0:
127         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
128             return nonCharacter;
129         break;
130     case 0xF4:
131         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
132             return nonCharacter;
133         break;
134     default:
135         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
136             return nonCharacter;
137     }
138     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
139         return nonCharacter;
140     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
141         return nonCharacter;
142     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
143 }
144 
appendCharacter(UChar * destination,int character)145 static inline UChar* appendCharacter(UChar* destination, int character)
146 {
147     ASSERT(character != nonCharacter);
148     ASSERT(!U_IS_SURROGATE(character));
149     if (U_IS_BMP(character))
150         *destination++ = character;
151     else {
152         *destination++ = U16_LEAD(character);
153         *destination++ = U16_TRAIL(character);
154     }
155     return destination;
156 }
157 
consumePartialSequenceByte()158 void TextCodecUTF8::consumePartialSequenceByte()
159 {
160     --m_partialSequenceSize;
161     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
162 }
163 
handleError(UChar * & destination,bool stopOnError,bool & sawError)164 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
165 {
166     sawError = true;
167     if (stopOnError)
168         return;
169     // Each error generates a replacement character and consumes one byte.
170     *destination++ = replacementCharacter;
171     consumePartialSequenceByte();
172 }
173 
174 template <>
handlePartialSequence(LChar * & destination,const uint8_t * & source,const uint8_t * end,bool flush,bool,bool &)175 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
176 {
177     ASSERT(m_partialSequenceSize);
178     do {
179         if (isASCII(m_partialSequence[0])) {
180             *destination++ = m_partialSequence[0];
181             consumePartialSequenceByte();
182             continue;
183         }
184         int count = nonASCIISequenceLength(m_partialSequence[0]);
185         if (!count)
186             return true;
187 
188         if (count > m_partialSequenceSize) {
189             if (count - m_partialSequenceSize > end - source) {
190                 if (!flush) {
191                     // The new data is not enough to complete the sequence, so
192                     // add it to the existing partial sequence.
193                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
194                     m_partialSequenceSize += end - source;
195                     return false;
196                 }
197                 // An incomplete partial sequence at the end is an error, but it will create
198                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
199                 // the error.
200                 return true;
201             }
202             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
203             source += count - m_partialSequenceSize;
204             m_partialSequenceSize = count;
205         }
206         int character = decodeNonASCIISequence(m_partialSequence, count);
207         if ((character == nonCharacter) || (character > 0xff))
208             return true;
209 
210         m_partialSequenceSize -= count;
211         *destination++ = character;
212     } while (m_partialSequenceSize);
213 
214     return false;
215 }
216 
217 template <>
handlePartialSequence(UChar * & destination,const uint8_t * & source,const uint8_t * end,bool flush,bool stopOnError,bool & sawError)218 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
219 {
220     ASSERT(m_partialSequenceSize);
221     do {
222         if (isASCII(m_partialSequence[0])) {
223             *destination++ = m_partialSequence[0];
224             consumePartialSequenceByte();
225             continue;
226         }
227         int count = nonASCIISequenceLength(m_partialSequence[0]);
228         if (!count) {
229             handleError(destination, stopOnError, sawError);
230             if (stopOnError)
231                 return false;
232             continue;
233         }
234         if (count > m_partialSequenceSize) {
235             if (count - m_partialSequenceSize > end - source) {
236                 if (!flush) {
237                     // The new data is not enough to complete the sequence, so
238                     // add it to the existing partial sequence.
239                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
240                     m_partialSequenceSize += end - source;
241                     return false;
242                 }
243                 // An incomplete partial sequence at the end is an error.
244                 handleError(destination, stopOnError, sawError);
245                 if (stopOnError)
246                     return false;
247                 continue;
248             }
249             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
250             source += count - m_partialSequenceSize;
251             m_partialSequenceSize = count;
252         }
253         int character = decodeNonASCIISequence(m_partialSequence, count);
254         if (character == nonCharacter) {
255             handleError(destination, stopOnError, sawError);
256             if (stopOnError)
257                 return false;
258             continue;
259         }
260 
261         m_partialSequenceSize -= count;
262         destination = appendCharacter(destination, character);
263     } while (m_partialSequenceSize);
264 
265     return false;
266 }
267 
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String. Bytes left over in m_partialSequence from an earlier call are
// processed first.
//
// Decoding starts in an 8-bit (LChar) buffer. As soon as something cannot be
// stored in an LChar (a character above 0xff, or an error that needs a
// replacement character), control jumps to upConvertTo16Bit, which copies the
// output produced so far into a 16-bit buffer and continues from the same
// input position.
//
// |flush|       when true, a sequence left incomplete at the end of the input
//               is run through handlePartialSequence() instead of being kept
//               for a later call.
// |stopOnError| when true, decoding stops at the first invalid sequence and
//               whatever was decoded before it is returned.
// |sawError|    set to true when an invalid sequence is found; this function
//               never sets it to false.
String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // First pass: optimistic 8-bit decoding.
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            // A true return means the buffered sequence cannot be emitted as 8-bit
            // output; the 16-bit pass takes over and processes the buffered bytes again.
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // The buffered sequence is still pending, so nothing more can be decoded now.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy one machine word at a time for as long as isAllASCII accepts the word.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop stopped on a non-ASCII byte: go round the outer loop
                    // again so it is handled by the multi-byte code below.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence extends past the end of this input: stash the bytes
                    // that are available and treat the input as fully consumed.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // |source| is deliberately not advanced: the 16-bit pass decodes this
                // byte again and emits the replacement character for it.
                goto upConvertTo16Bit;
            }
            // A valid character that does not fit in an LChar; |source| is likewise
            // left in place for the 16-bit pass.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
        // When flushing, go round again so a sequence stashed above is handed to
        // handlePartialSequence().
    } while (flush && m_partialSequenceSize);

    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

    // Second pass: reached only by goto from the 8-bit pass. |source| is the first
    // input byte not yet consumed and |destination| is one past the last LChar written.
upConvertTo16Bit:
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            // The return value is not used here; progress is judged from
            // m_partialSequenceSize below.
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // The buffered sequence is still pending, so nothing more can be decoded now.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy one machine word at a time for as long as isAllASCII accepts the word.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop stopped on a non-ASCII byte: go round the outer loop
                    // again so it is handled by the multi-byte code below.
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence extends past the end of this input: stash the bytes
                    // that are available and treat the input as fully consumed.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
        // When flushing, go round again so a sequence stashed above is handed to
        // handlePartialSequence().
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
427 
428 template<typename CharType>
encodeCommon(const CharType * characters,size_t length)429 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
430 {
431     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
432     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
433     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
434     if (length > numeric_limits<size_t>::max() / 3)
435         CRASH();
436     Vector<uint8_t> bytes(length * 3);
437 
438     size_t i = 0;
439     size_t bytesWritten = 0;
440     while (i < length) {
441         UChar32 character;
442         U16_NEXT(characters, i, length, character);
443         // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
444         // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
445         if (0xD800 <= character && character <= 0xDFFF)
446             character = replacementCharacter;
447         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
448     }
449 
450     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
451 }
452 
encode(const UChar * characters,size_t length,UnencodableHandling)453 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
454 {
455     return encodeCommon(characters, length);
456 }
457 
encode(const LChar * characters,size_t length,UnencodableHandling)458 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
459 {
460     return encodeCommon(characters, length);
461 }
462 
463 } // namespace WTF
464