• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 Google LLC.
2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3 
4 #include "include/private/SkTFitsIn.h"
5 #include "src/utils/SkUTF.h"
6 
7 #include <climits>
8 
/** Shifts `value` left by `shift` bits, doing the shift on the unsigned representation so that
 *  bits moving into (or past) the sign bit do not go through a signed shift.
 */
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
12 
/** True when the low bit of `x` is clear, i.e. `x` is a multiple of 2. */
template <typename T>
static constexpr bool is_align2(T x) {
    return (x & 1) == 0;
}
14 
/** True when the two low bits of `x` are clear, i.e. `x` is a multiple of 4. */
template <typename T>
static constexpr bool is_align4(T x) {
    return (x & 3) == 0;
}
16 
/** True when `c` is a UTF-16 high (leading) surrogate, i.e. lies in [0xD800, 0xDBFF]. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
18 
/** True when `c` is a UTF-16 low (trailing) surrogate, i.e. lies in [0xDC00, 0xDFFF]. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
20 
/** Classifies a single byte of a UTF-8 stream.
 *
 *  @param c  the byte to classify.
 *  @returns   -1  iff invalid UTF8 byte (0xC0, 0xC1, 0xF5..0xFF),
 *              0  iff UTF8 continuation byte (0x80..0xBF),
 *              1  iff ASCII byte (0x00..0x7F),
 *              2  iff leading byte of 2-byte sequence (0xC2..0xDF),
 *              3  iff leading byte of 3-byte sequence (0xE0..0xEF), and
 *              4  iff leading byte of 4-byte sequence (0xF0..0xF4).
 *      I.e.: if return value > 0, then gives length of sequence.
 */
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c >= 0xF5 || (c & 0xFE) == 0xC0) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    // Only 0xC2..0xF4 remain. Plain range checks are used instead of a packed shift table so
    // that no signed value is shifted into, or out of, the sign bit.
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
/** True when `type` (a result of utf8_byte_type()) denotes a byte that may start a sequence,
 *  i.e. it is a positive sequence length rather than 0 (continuation) or -1 (invalid).
 */
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
43 
/** True when `c` is a UTF-8 continuation byte, i.e. has the bit pattern 10xxxxxx (0x80..0xBF). */
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
45 
46 ////////////////////////////////////////////////////////////////////////////////
47 
CountUTF8(const char * utf8,size_t byteLength)48 int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
49     if (!utf8) {
50         return -1;
51     }
52     int count = 0;
53     const char* stop = utf8 + byteLength;
54     while (utf8 < stop) {
55         int type = utf8_byte_type(*(const uint8_t*)utf8);
56         if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
57             return -1;  // Sequence extends beyond end.
58         }
59         while(type-- > 1) {
60             ++utf8;
61             if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
62                 return -1;
63             }
64         }
65         ++utf8;
66         ++count;
67     }
68     return count;
69 }
70 
CountUTF16(const uint16_t * utf16,size_t byteLength)71 int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
72     if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
73         return -1;
74     }
75     const uint16_t* src = (const uint16_t*)utf16;
76     const uint16_t* stop = src + (byteLength >> 1);
77     int count = 0;
78     while (src < stop) {
79         unsigned c = *src++;
80         if (utf16_is_low_surrogate(c)) {
81             return -1;
82         }
83         if (utf16_is_high_surrogate(c)) {
84             if (src >= stop) {
85                 return -1;
86             }
87             c = *src++;
88             if (!utf16_is_low_surrogate(c)) {
89                 return -1;
90             }
91         }
92         count += 1;
93     }
94     return count;
95 }
96 
CountUTF32(const int32_t * utf32,size_t byteLength)97 int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
98     if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || !SkTFitsIn<int>(byteLength >> 2)) {
99         return -1;
100     }
101     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
102     const uint32_t* ptr = (const uint32_t*)utf32;
103     const uint32_t* stop = ptr + (byteLength >> 2);
104     while (ptr < stop) {
105         if (*ptr & kInvalidUnicharMask) {
106             return -1;
107         }
108         ptr += 1;
109     }
110     return (int)(byteLength >> 2);
111 }
112 
113 template <typename T>
next_fail(const T ** ptr,const T * end)114 static SkUnichar next_fail(const T** ptr, const T* end) {
115     *ptr = end;
116     return -1;
117 }
118 
NextUTF8(const char ** ptr,const char * end)119 SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
120     if (!ptr || !end ) {
121         return -1;
122     }
123     const uint8_t*  p = (const uint8_t*)*ptr;
124     if (!p || p >= (const uint8_t*)end) {
125         return next_fail(ptr, end);
126     }
127     int             c = *p;
128     int             hic = c << 24;
129 
130     if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
131         return next_fail(ptr, end);
132     }
133     if (hic < 0) {
134         uint32_t mask = (uint32_t)~0x3F;
135         hic = left_shift(hic, 1);
136         do {
137             ++p;
138             if (p >= (const uint8_t*)end) {
139                 return next_fail(ptr, end);
140             }
141             // check before reading off end of array.
142             uint8_t nextByte = *p;
143             if (!utf8_byte_is_continuation(nextByte)) {
144                 return next_fail(ptr, end);
145             }
146             c = (c << 6) | (nextByte & 0x3F);
147             mask <<= 5;
148         } while ((hic = left_shift(hic, 1)) < 0);
149         c &= ~mask;
150     }
151     *ptr = (char*)p + 1;
152     return c;
153 }
154 
NextUTF16(const uint16_t ** ptr,const uint16_t * end)155 SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
156     if (!ptr || !end ) {
157         return -1;
158     }
159     const uint16_t* src = *ptr;
160     if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
161         return next_fail(ptr, end);
162     }
163     uint16_t c = *src++;
164     SkUnichar result = c;
165     if (utf16_is_low_surrogate(c)) {
166         return next_fail(ptr, end);  // srcPtr should never point at low surrogate.
167     }
168     if (utf16_is_high_surrogate(c)) {
169         if (src + 1 > end) {
170             return next_fail(ptr, end);  // Truncated string.
171         }
172         uint16_t low = *src++;
173         if (!utf16_is_low_surrogate(low)) {
174             return next_fail(ptr, end);
175         }
176         /*
177         [paraphrased from wikipedia]
178         Take the high surrogate and subtract 0xD800, then multiply by 0x400.
179         Take the low surrogate and subtract 0xDC00.  Add these two results
180         together, and finally add 0x10000 to get the final decoded codepoint.
181 
182         unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
183         unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
184         unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
185         unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
186         */
187         result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
188     }
189     *ptr = src;
190     return result;
191 }
192 
NextUTF32(const int32_t ** ptr,const int32_t * end)193 SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
194     if (!ptr || !end ) {
195         return -1;
196     }
197     const int32_t* s = *ptr;
198     if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
199         return next_fail(ptr, end);
200     }
201     int32_t value = *s;
202     const uint32_t kInvalidUnicharMask = 0xFF000000;    // unichar fits in 24 bits
203     if (value & kInvalidUnicharMask) {
204         return next_fail(ptr, end);
205     }
206     *ptr = s + 1;
207     return value;
208 }
209 
ToUTF8(SkUnichar uni,char utf8[SkUTF::kMaxBytesInUTF8Sequence])210 size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
211     if ((uint32_t)uni > 0x10FFFF) {
212         return 0;
213     }
214     if (uni <= 127) {
215         if (utf8) {
216             *utf8 = (char)uni;
217         }
218         return 1;
219     }
220     char    tmp[4];
221     char*   p = tmp;
222     size_t  count = 1;
223     while (uni > 0x7F >> count) {
224         *p++ = (char)(0x80 | (uni & 0x3F));
225         uni >>= 6;
226         count += 1;
227     }
228     if (utf8) {
229         p = tmp;
230         utf8 += count;
231         while (p < tmp + count - 1) {
232             *--utf8 = *p++;
233         }
234         *--utf8 = (char)(~(0xFF >> count) | uni);
235     }
236     return count;
237 }
238 
ToUTF16(SkUnichar uni,uint16_t utf16[2])239 size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
240     if ((uint32_t)uni > 0x10FFFF) {
241         return 0;
242     }
243     int extra = (uni > 0xFFFF);
244     if (utf16) {
245         if (extra) {
246             utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
247             utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
248         } else {
249             utf16[0] = (uint16_t)uni;
250         }
251     }
252     return 1 + extra;
253 }
254 
UTF8ToUTF16(uint16_t dst[],int dstCapacity,const char src[],size_t srcByteLength)255 int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) {
256     if (!dst) {
257         dstCapacity = 0;
258     }
259 
260     int dstLength = 0;
261     uint16_t* endDst = dst + dstCapacity;
262     const char* endSrc = src + srcByteLength;
263     while (src < endSrc) {
264         SkUnichar uni = NextUTF8(&src, endSrc);
265         if (uni < 0) {
266             return -1;
267         }
268 
269         uint16_t utf16[2];
270         size_t count = ToUTF16(uni, utf16);
271         if (count == 0) {
272             return -1;
273         }
274         dstLength += count;
275 
276         if (dst) {
277             uint16_t* elems = utf16;
278             while (dst < endDst && count > 0) {
279                 *dst++ = *elems++;
280                 count -= 1;
281             }
282         }
283     }
284     return dstLength;
285 }
286 
UTF16ToUTF8(char dst[],int dstCapacity,const uint16_t src[],size_t srcLength)287 int SkUTF::UTF16ToUTF8(char dst[], int dstCapacity, const uint16_t src[], size_t srcLength) {
288     if (!dst) {
289         dstCapacity = 0;
290     }
291 
292     int dstLength = 0;
293     const char* endDst = dst + dstCapacity;
294     const uint16_t* endSrc = src + srcLength;
295     while (src < endSrc) {
296         SkUnichar uni = NextUTF16(&src, endSrc);
297         if (uni < 0) {
298             return -1;
299         }
300 
301         char utf8[SkUTF::kMaxBytesInUTF8Sequence];
302         size_t count = ToUTF8(uni, utf8);
303         if (count == 0) {
304             return -1;
305         }
306         dstLength += count;
307 
308         if (dst) {
309             const char* elems = utf8;
310             while (dst < endDst && count > 0) {
311                 *dst++ = *elems++;
312                 count -= 1;
313             }
314         }
315     }
316     return dstLength;
317 }
318