• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "UTF8.h"
27 
28 namespace WTF {
29 namespace Unicode {
30 
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence
// introduced by the lead byte b0, or 0 when b0 cannot start a multi-byte
// sequence (ASCII bytes, continuation bytes 0x80-0xBF, and 0xF8-0xFF).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned value so the bit patterns become simple ranges.
    const unsigned char lead = static_cast<unsigned char>(b0);

    if (lead < 0xC0) // Top two bits are not both set.
        return 0;
    if (lead < 0xE0) // 110xxxxx
        return 2;
    if (lead < 0xF0) // 1110xxxx
        return 3;
    if (lead < 0xF8) // 11110xxx
        return 4;
    return 0; // 11111xxx never starts a sequence.
}
43 
inlineUTF8SequenceLength(char b0)44 inline int inlineUTF8SequenceLength(char b0)
45 {
46     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
47 }
48 
UTF8SequenceLength(char b0)49 int UTF8SequenceLength(char b0)
50 {
51     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
52 }
53 
decodeUTF8Sequence(const char * sequence)54 int decodeUTF8Sequence(const char* sequence)
55 {
56     // Handle 0-byte sequences (never valid).
57     const unsigned char b0 = sequence[0];
58     const int length = inlineUTF8SequenceLength(b0);
59     if (length == 0)
60         return -1;
61 
62     // Handle 1-byte sequences (plain ASCII).
63     const unsigned char b1 = sequence[1];
64     if (length == 1) {
65         if (b1)
66             return -1;
67         return b0;
68     }
69 
70     // Handle 2-byte sequences.
71     if ((b1 & 0xC0) != 0x80)
72         return -1;
73     const unsigned char b2 = sequence[2];
74     if (length == 2) {
75         if (b2)
76             return -1;
77         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
78         if (c < 0x80)
79             return -1;
80         return c;
81     }
82 
83     // Handle 3-byte sequences.
84     if ((b2 & 0xC0) != 0x80)
85         return -1;
86     const unsigned char b3 = sequence[3];
87     if (length == 3) {
88         if (b3)
89             return -1;
90         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
91         if (c < 0x800)
92             return -1;
93         // UTF-16 surrogates should never appear in UTF-8 data.
94         if (c >= 0xD800 && c <= 0xDFFF)
95             return -1;
96         return c;
97     }
98 
99     // Handle 4-byte sequences.
100     if ((b3 & 0xC0) != 0x80)
101         return -1;
102     const unsigned char b4 = sequence[4];
103     if (length == 4) {
104         if (b4)
105             return -1;
106         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
107         if (c < 0x10000 || c > 0x10FFFF)
108             return -1;
109         return c;
110     }
111 
112     return -1;
113 }
114 
// Marker bits OR-ed into the first byte of an encoded sequence, indexed by
// the total number of bytes in that sequence. There is one entry per UTF-8
// sequence type (one-byte sequence, two-byte sequence, and so on); index 0
// is a placeholder. Remember that sequences for *legal* UTF-8 are 4 or
// fewer bytes in total.
static const unsigned char firstByteMark[7] = {
    0x00, // 0 bytes (unused)
    0x00, // 1 byte:  0xxxxxxx
    0xC0, // 2 bytes: 110xxxxx
    0xE0, // 3 bytes: 1110xxxx
    0xF0, // 4 bytes: 11110xxx
    0xF8, // 5 bytes: not legal UTF-8
    0xFC  // 6 bytes: not legal UTF-8
};
121 
convertUTF16ToUTF8(const UChar ** sourceStart,const UChar * sourceEnd,char ** targetStart,char * targetEnd,bool strict)122 ConversionResult convertUTF16ToUTF8(
123     const UChar** sourceStart, const UChar* sourceEnd,
124     char** targetStart, char* targetEnd, bool strict)
125 {
126     ConversionResult result = conversionOK;
127     const UChar* source = *sourceStart;
128     char* target = *targetStart;
129     while (source < sourceEnd) {
130         UChar32 ch;
131         unsigned short bytesToWrite = 0;
132         const UChar32 byteMask = 0xBF;
133         const UChar32 byteMark = 0x80;
134         const UChar* oldSource = source; // In case we have to back up because of target overflow.
135         ch = static_cast<unsigned short>(*source++);
136         // If we have a surrogate pair, convert to UChar32 first.
137         if (ch >= 0xD800 && ch <= 0xDBFF) {
138             // If the 16 bits following the high surrogate are in the source buffer...
139             if (source < sourceEnd) {
140                 UChar32 ch2 = static_cast<unsigned short>(*source);
141                 // If it's a low surrogate, convert to UChar32.
142                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
143                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
144                     ++source;
145                 } else if (strict) { // it's an unpaired high surrogate
146                     --source; // return to the illegal value itself
147                     result = sourceIllegal;
148                     break;
149                 }
150             } else { // We don't have the 16 bits following the high surrogate.
151                 --source; // return to the high surrogate
152                 result = sourceExhausted;
153                 break;
154             }
155         } else if (strict) {
156             // UTF-16 surrogate values are illegal in UTF-32
157             if (ch >= 0xDC00 && ch <= 0xDFFF) {
158                 --source; // return to the illegal value itself
159                 result = sourceIllegal;
160                 break;
161             }
162         }
163         // Figure out how many bytes the result will require
164         if (ch < (UChar32)0x80) {
165             bytesToWrite = 1;
166         } else if (ch < (UChar32)0x800) {
167             bytesToWrite = 2;
168         } else if (ch < (UChar32)0x10000) {
169             bytesToWrite = 3;
170         } else if (ch < (UChar32)0x110000) {
171             bytesToWrite = 4;
172         } else {
173             bytesToWrite = 3;
174             ch = 0xFFFD;
175         }
176 
177         target += bytesToWrite;
178         if (target > targetEnd) {
179             source = oldSource; // Back up source pointer!
180             target -= bytesToWrite;
181             result = targetExhausted;
182             break;
183         }
184         switch (bytesToWrite) { // note: everything falls through.
185             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
186             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
187             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
188             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
189         }
190         target += bytesToWrite;
191     }
192     *sourceStart = source;
193     *targetStart = target;
194     return result;
195 }
196 
// Checks whether the `length` bytes starting at `source` form a well-formed
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that the lead byte actually announces `length`
// bytes. If presented with a length outside 1..4 this returns false, since
// the Unicode definition of UTF-8 only goes up to 4-byte sequences.
//
// Rejected inputs include: continuation bytes outside 0x80-0xBF, lead bytes
// in 0x80-0xC1 (stray continuation bytes and overlong 2-byte forms), overlong
// 3- and 4-byte forms, encoded UTF-16 surrogates (0xED 0xA0..0xBF), and
// anything above U+10FFFF.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    // The allowed range of the second byte depends on the lead byte. Both
    // bounds are always enforced, so that e.g. 0xED 0x41 is not mistaken for
    // a legal prefix just because 0x41 is below the 0x9F upper bound.
    if (length >= 2) {
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
            case 0xE0: lowest = 0xA0; break;  // Exclude overlong 3-byte forms.
            case 0xED: highest = 0x9F; break; // Exclude UTF-16 surrogates.
            case 0xF0: lowest = 0x90; break;  // Exclude overlong 4-byte forms.
            case 0xF4: highest = 0x8F; break; // Exclude values above U+10FFFF.
            default: break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // 0x80-0xBF are continuation bytes and 0xC0/0xC1 can only begin overlong
    // encodings, so none of them is a valid first byte.
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    return lead <= 0xF4;
}
226 
227 // Magic values subtracted from a buffer value during UTF8 conversion.
228 // This table contains as many values as there might be trailing bytes
229 // in a UTF-8 sequence.
230 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
231             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
232 
convertUTF8ToUTF16(const char ** sourceStart,const char * sourceEnd,UChar ** targetStart,UChar * targetEnd,bool strict)233 ConversionResult convertUTF8ToUTF16(
234     const char** sourceStart, const char* sourceEnd,
235     UChar** targetStart, UChar* targetEnd, bool strict)
236 {
237     ConversionResult result = conversionOK;
238     const char* source = *sourceStart;
239     UChar* target = *targetStart;
240     while (source < sourceEnd) {
241         UChar32 ch = 0;
242         int extraBytesToRead = UTF8SequenceLength(*source) - 1;
243         if (source + extraBytesToRead >= sourceEnd) {
244             result = sourceExhausted;
245             break;
246         }
247         // Do this check whether lenient or strict
248         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
249             result = sourceIllegal;
250             break;
251         }
252         // The cases all fall through.
253         switch (extraBytesToRead) {
254             case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
255             case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
256             case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
257             case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
258             case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
259             case 0: ch += static_cast<unsigned char>(*source++);
260         }
261         ch -= offsetsFromUTF8[extraBytesToRead];
262 
263         if (target >= targetEnd) {
264             source -= (extraBytesToRead + 1); // Back up source pointer!
265             result = targetExhausted; break;
266         }
267         if (ch <= 0xFFFF) {
268             // UTF-16 surrogate values are illegal in UTF-32
269             if (ch >= 0xD800 && ch <= 0xDFFF) {
270                 if (strict) {
271                     source -= (extraBytesToRead + 1); // return to the illegal value itself
272                     result = sourceIllegal;
273                     break;
274                 } else
275                     *target++ = 0xFFFD;
276             } else
277                 *target++ = (UChar)ch; // normal case
278         } else if (ch > 0x10FFFF) {
279             if (strict) {
280                 result = sourceIllegal;
281                 source -= (extraBytesToRead + 1); // return to the start
282                 break; // Bail out; shouldn't continue
283             } else
284                 *target++ = 0xFFFD;
285         } else {
286             // target is a character in range 0xFFFF - 0x10FFFF
287             if (target + 1 >= targetEnd) {
288                 source -= (extraBytesToRead + 1); // Back up source pointer!
289                 result = targetExhausted;
290                 break;
291             }
292             ch -= 0x0010000UL;
293             *target++ = (UChar)((ch >> 10) + 0xD800);
294             *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
295         }
296     }
297     *sourceStart = source;
298     *targetStart = target;
299     return result;
300 }
301 
302 }
303 }
304