• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
3  * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "wtf/unicode/UTF8.h"
29 
30 #include "wtf/ASCIICType.h"
31 #include "wtf/StringHasher.h"
32 #include "wtf/unicode/CharacterNames.h"
33 
34 namespace WTF {
35 namespace Unicode {
36 
// Returns the total number of bytes (2, 3 or 4) announced by the lead byte
// b0 of a multi-byte UTF-8 sequence. Returns 0 when b0 cannot start such a
// sequence: any byte below 0xC0 (ASCII and continuation bytes) and any byte
// from 0xF8 upwards. Only the lead byte's bit pattern is inspected; the
// result says nothing about whether the following bytes are valid.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned value so the comparisons do not depend on whether
    // plain char is signed.
    const unsigned lead = static_cast<unsigned char>(b0);

    if (lead < 0xC0)
        return 0; // Top two bits are not both set.
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0;
}
49 
inlineUTF8SequenceLength(char b0)50 inline int inlineUTF8SequenceLength(char b0)
51 {
52     return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
53 }
54 
UTF8SequenceLength(char b0)55 int UTF8SequenceLength(char b0)
56 {
57     return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
58 }
59 
decodeUTF8Sequence(const char * sequence)60 int decodeUTF8Sequence(const char* sequence)
61 {
62     // Handle 0-byte sequences (never valid).
63     const unsigned char b0 = sequence[0];
64     const int length = inlineUTF8SequenceLength(b0);
65     if (length == 0)
66         return -1;
67 
68     // Handle 1-byte sequences (plain ASCII).
69     const unsigned char b1 = sequence[1];
70     if (length == 1) {
71         if (b1)
72             return -1;
73         return b0;
74     }
75 
76     // Handle 2-byte sequences.
77     if ((b1 & 0xC0) != 0x80)
78         return -1;
79     const unsigned char b2 = sequence[2];
80     if (length == 2) {
81         if (b2)
82             return -1;
83         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
84         if (c < 0x80)
85             return -1;
86         return c;
87     }
88 
89     // Handle 3-byte sequences.
90     if ((b2 & 0xC0) != 0x80)
91         return -1;
92     const unsigned char b3 = sequence[3];
93     if (length == 3) {
94         if (b3)
95             return -1;
96         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
97         if (c < 0x800)
98             return -1;
99         // UTF-16 surrogates should never appear in UTF-8 data.
100         if (c >= 0xD800 && c <= 0xDFFF)
101             return -1;
102         return c;
103     }
104 
105     // Handle 4-byte sequences.
106     if ((b3 & 0xC0) != 0x80)
107         return -1;
108     const unsigned char b4 = sequence[4];
109     if (length == 4) {
110         if (b4)
111             return -1;
112         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
113         if (c < 0x10000 || c > 0x10FFFF)
114             return -1;
115         return c;
116     }
117 
118     return -1;
119 }
120 
// Tag bits for the first byte of an encoded sequence, indexed by the total
// number of bytes in that sequence. After a code point has been split into
// 6-bit groups, the encoders in this file OR firstByteMark[n] into the lead
// byte of an n-byte sequence. There is one entry per UTF-8 sequence type
// (one-byte sequence, two-byte sequence, and so on). Remember that sequences
// for *legal* UTF-8 are 4 or fewer bytes in total.
static const unsigned char firstByteMark[7] = {
    0x00,
    0x00, // 1 byte: plain ASCII, no tag bits
    0xC0, // 2 bytes: 110xxxxx
    0xE0, // 3 bytes: 1110xxxx
    0xF0, // 4 bytes: 11110xxx
    0xF8,
    0xFC
};
127 
convertLatin1ToUTF8(const LChar ** sourceStart,const LChar * sourceEnd,char ** targetStart,char * targetEnd)128 ConversionResult convertLatin1ToUTF8(
129                                      const LChar** sourceStart, const LChar* sourceEnd,
130                                      char** targetStart, char* targetEnd)
131 {
132     ConversionResult result = conversionOK;
133     const LChar* source = *sourceStart;
134     char* target = *targetStart;
135     while (source < sourceEnd) {
136         UChar32 ch;
137         unsigned short bytesToWrite = 0;
138         const UChar32 byteMask = 0xBF;
139         const UChar32 byteMark = 0x80;
140         const LChar* oldSource = source; // In case we have to back up because of target overflow.
141         ch = static_cast<unsigned short>(*source++);
142 
143         // Figure out how many bytes the result will require
144         if (ch < (UChar32)0x80)
145             bytesToWrite = 1;
146         else
147             bytesToWrite = 2;
148 
149         target += bytesToWrite;
150         if (target > targetEnd) {
151             source = oldSource; // Back up source pointer!
152             target -= bytesToWrite;
153             result = targetExhausted;
154             break;
155         }
156         switch (bytesToWrite) { // note: everything falls through.
157         case 2:
158             *--target = (char)((ch | byteMark) & byteMask);
159             ch >>= 6;
160         case 1:
161             *--target =  (char)(ch | firstByteMark[bytesToWrite]);
162         }
163         target += bytesToWrite;
164     }
165     *sourceStart = source;
166     *targetStart = target;
167     return result;
168 }
169 
convertUTF16ToUTF8(const UChar ** sourceStart,const UChar * sourceEnd,char ** targetStart,char * targetEnd,bool strict)170 ConversionResult convertUTF16ToUTF8(
171     const UChar** sourceStart, const UChar* sourceEnd,
172     char** targetStart, char* targetEnd, bool strict)
173 {
174     ConversionResult result = conversionOK;
175     const UChar* source = *sourceStart;
176     char* target = *targetStart;
177     while (source < sourceEnd) {
178         UChar32 ch;
179         unsigned short bytesToWrite = 0;
180         const UChar32 byteMask = 0xBF;
181         const UChar32 byteMark = 0x80;
182         const UChar* oldSource = source; // In case we have to back up because of target overflow.
183         ch = static_cast<unsigned short>(*source++);
184         // If we have a surrogate pair, convert to UChar32 first.
185         if (ch >= 0xD800 && ch <= 0xDBFF) {
186             // If the 16 bits following the high surrogate are in the source buffer...
187             if (source < sourceEnd) {
188                 UChar32 ch2 = static_cast<unsigned short>(*source);
189                 // If it's a low surrogate, convert to UChar32.
190                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
191                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
192                     ++source;
193                 } else if (strict) { // it's an unpaired high surrogate
194                     --source; // return to the illegal value itself
195                     result = sourceIllegal;
196                     break;
197                 }
198             } else { // We don't have the 16 bits following the high surrogate.
199                 --source; // return to the high surrogate
200                 result = sourceExhausted;
201                 break;
202             }
203         } else if (strict) {
204             // UTF-16 surrogate values are illegal in UTF-32
205             if (ch >= 0xDC00 && ch <= 0xDFFF) {
206                 --source; // return to the illegal value itself
207                 result = sourceIllegal;
208                 break;
209             }
210         }
211         // Figure out how many bytes the result will require
212         if (ch < (UChar32)0x80) {
213             bytesToWrite = 1;
214         } else if (ch < (UChar32)0x800) {
215             bytesToWrite = 2;
216         } else if (ch < (UChar32)0x10000) {
217             bytesToWrite = 3;
218         } else if (ch < (UChar32)0x110000) {
219             bytesToWrite = 4;
220         } else {
221             bytesToWrite = 3;
222             ch = replacementCharacter;
223         }
224 
225         target += bytesToWrite;
226         if (target > targetEnd) {
227             source = oldSource; // Back up source pointer!
228             target -= bytesToWrite;
229             result = targetExhausted;
230             break;
231         }
232         switch (bytesToWrite) { // note: everything falls through.
233             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
234             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
235             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
236             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
237         }
238         target += bytesToWrite;
239     }
240     *sourceStart = source;
241     *targetStart = target;
242     return result;
243 }
244 
// Reports whether the `length` bytes starting at `source` form a single
// well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// Any length outside 1..4 yields false; the Unicode definition of UTF-8
// goes up to 4-byte sequences. Exactly `length` bytes are read.
//
// Rules enforced:
//   - the third and fourth bytes must lie in 0x80..0xBF;
//   - the second byte must lie in 0x80..0xBF, narrowed for four lead bytes:
//       0xE0 -> 0xA0..0xBF, 0xED -> 0x80..0x9F,
//       0xF0 -> 0x90..0xBF, 0xF4 -> 0x80..0x8F;
//   - the lead byte must not lie in 0x80..0xC1 and must not exceed 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Bytes after the second one are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte is always bounded on BOTH sides. Checking only the
        // upper bound for 0xED and 0xF4 would let sequences such as
        // ED 41 80 through as "legal".
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0;
            break;
        case 0xED:
            highest = 0x9F;
            break;
        case 0xF0:
            lowest = 0x90;
            break;
        case 0xF4:
            highest = 0x8F;
            break;
        default:
            break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;
    return true;
}
274 
275 // Magic values subtracted from a buffer value during UTF8 conversion.
276 // This table contains as many values as there might be trailing bytes
277 // in a UTF-8 sequence.
278 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
279 
readUTF8Sequence(const char * & sequence,unsigned length)280 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
281 {
282     UChar32 character = 0;
283 
284     // The cases all fall through.
285     switch (length) {
286         case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
287         case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
288         case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
289         case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
290         case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
291         case 1: character += static_cast<unsigned char>(*sequence++);
292     }
293 
294     return character - offsetsFromUTF8[length - 1];
295 }
296 
convertUTF8ToUTF16(const char ** sourceStart,const char * sourceEnd,UChar ** targetStart,UChar * targetEnd,bool * sourceAllASCII,bool strict)297 ConversionResult convertUTF8ToUTF16(
298     const char** sourceStart, const char* sourceEnd,
299     UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
300 {
301     ConversionResult result = conversionOK;
302     const char* source = *sourceStart;
303     UChar* target = *targetStart;
304     UChar orAllData = 0;
305     while (source < sourceEnd) {
306         int utf8SequenceLength = inlineUTF8SequenceLength(*source);
307         if (sourceEnd - source < utf8SequenceLength)  {
308             result = sourceExhausted;
309             break;
310         }
311         // Do this check whether lenient or strict
312         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
313             result = sourceIllegal;
314             break;
315         }
316 
317         UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
318 
319         if (target >= targetEnd) {
320             source -= utf8SequenceLength; // Back up source pointer!
321             result = targetExhausted;
322             break;
323         }
324 
325         if (U_IS_BMP(character)) {
326             // UTF-16 surrogate values are illegal in UTF-32
327             if (U_IS_SURROGATE(character)) {
328                 if (strict) {
329                     source -= utf8SequenceLength; // return to the illegal value itself
330                     result = sourceIllegal;
331                     break;
332                 } else {
333                     *target++ = replacementCharacter;
334                     orAllData |= replacementCharacter;
335                 }
336             } else {
337                 *target++ = character; // normal case
338                 orAllData |= character;
339             }
340         } else if (U_IS_SUPPLEMENTARY(character)) {
341             // target is a character in range 0xFFFF - 0x10FFFF
342             if (target + 1 >= targetEnd) {
343                 source -= utf8SequenceLength; // Back up source pointer!
344                 result = targetExhausted;
345                 break;
346             }
347             *target++ = U16_LEAD(character);
348             *target++ = U16_TRAIL(character);
349             orAllData = 0xffff;
350         } else {
351             if (strict) {
352                 source -= utf8SequenceLength; // return to the start
353                 result = sourceIllegal;
354                 break; // Bail out; shouldn't continue
355             } else {
356                 *target++ = replacementCharacter;
357                 orAllData |= replacementCharacter;
358             }
359         }
360     }
361     *sourceStart = source;
362     *targetStart = target;
363 
364     if (sourceAllASCII)
365         *sourceAllASCII = !(orAllData & ~0x7f);
366 
367     return result;
368 }
369 
calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char * data,const char * dataEnd,unsigned & dataLength,unsigned & utf16Length)370 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
371 {
372     if (!data)
373         return 0;
374 
375     StringHasher stringHasher;
376     dataLength = 0;
377     utf16Length = 0;
378 
379     while (data < dataEnd || (!dataEnd && *data)) {
380         if (isASCII(*data)) {
381             stringHasher.addCharacter(*data++);
382             dataLength++;
383             utf16Length++;
384             continue;
385         }
386 
387         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
388         dataLength += utf8SequenceLength;
389 
390         if (!dataEnd) {
391             for (int i = 1; i < utf8SequenceLength; ++i) {
392                 if (!data[i])
393                     return 0;
394             }
395         } else if (dataEnd - data < utf8SequenceLength)
396             return 0;
397 
398         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
399             return 0;
400 
401         UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
402         ASSERT(!isASCII(character));
403 
404         if (U_IS_BMP(character)) {
405             // UTF-16 surrogate values are illegal in UTF-32
406             if (U_IS_SURROGATE(character))
407                 return 0;
408             stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
409             utf16Length++;
410         } else if (U_IS_SUPPLEMENTARY(character)) {
411             stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
412                                        static_cast<UChar>(U16_TRAIL(character)));
413             utf16Length += 2;
414         } else
415             return 0;
416     }
417 
418     return stringHasher.hashWithTop8BitsMasked();
419 }
420 
421 template<typename CharType>
equalWithUTF8Internal(const CharType * a,const CharType * aEnd,const char * b,const char * bEnd)422 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
423 {
424     while (b < bEnd) {
425         if (isASCII(*b)) {
426             if (*a++ != *b++)
427                 return false;
428             continue;
429         }
430 
431         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
432 
433         if (bEnd - b < utf8SequenceLength)
434             return false;
435 
436         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
437             return 0;
438 
439         UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
440         ASSERT(!isASCII(character));
441 
442         if (U_IS_BMP(character)) {
443             // UTF-16 surrogate values are illegal in UTF-32
444             if (U_IS_SURROGATE(character))
445                 return false;
446             if (*a++ != character)
447                 return false;
448         } else if (U_IS_SUPPLEMENTARY(character)) {
449             if (*a++ != U16_LEAD(character))
450                 return false;
451             if (*a++ != U16_TRAIL(character))
452                 return false;
453         } else
454             return false;
455     }
456 
457     return a == aEnd;
458 }
459 
equalUTF16WithUTF8(const UChar * a,const UChar * aEnd,const char * b,const char * bEnd)460 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
461 {
462     return equalWithUTF8Internal(a, aEnd, b, bEnd);
463 }
464 
equalLatin1WithUTF8(const LChar * a,const LChar * aEnd,const char * b,const char * bEnd)465 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
466 {
467     return equalWithUTF8Internal(a, aEnd, b, bEnd);
468 }
469 
470 } // namespace Unicode
471 } // namespace WTF
472