1 /* 2 * Copyright © 2011,2012,2014 Google, Inc. 3 * 4 * This is part of HarfBuzz, a text shaping library. 5 * 6 * Permission is hereby granted, without written agreement and without 7 * license or royalty fees, to use, copy, modify, and distribute this 8 * software and its documentation for any purpose, provided that the 9 * above copyright notice and the following two paragraphs appear in 10 * all copies of this software. 11 * 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 16 * DAMAGE. 17 * 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 * 24 * Google Author(s): Behdad Esfahbod 25 */ 26 27 #ifndef HB_UTF_HH 28 #define HB_UTF_HH 29 30 #include "hb.hh" 31 32 #include "hb-open-type.hh" 33 34 35 struct hb_utf8_t 36 { 37 typedef uint8_t codepoint_t; 38 39 static const codepoint_t * nexthb_utf8_t40 next (const codepoint_t *text, 41 const codepoint_t *end, 42 hb_codepoint_t *unicode, 43 hb_codepoint_t replacement) 44 { 45 /* Written to only accept well-formed sequences. 46 * Based on ideas from ICU's U8_NEXT. 47 * Generates one "replacement" for each ill-formed byte. */ 48 49 hb_codepoint_t c = *text++; 50 51 if (c > 0x7Fu) 52 { 53 if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ 54 { 55 unsigned int t1; 56 if (likely (text < end && 57 (t1 = text[0] - 0x80u) <= 0x3Fu)) 58 { 59 c = ((c&0x1Fu)<<6) | t1; 60 text++; 61 } 62 else 63 goto error; 64 } 65 else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ 66 { 67 unsigned int t1, t2; 68 if (likely (1 < end - text && 69 (t1 = text[0] - 0x80u) <= 0x3Fu && 70 (t2 = text[1] - 0x80u) <= 0x3Fu)) 71 { 72 c = ((c&0xFu)<<12) | (t1<<6) | t2; 73 if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 74 goto error; 75 text += 2; 76 } 77 else 78 goto error; 79 } 80 else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ 81 { 82 unsigned int t1, t2, t3; 83 if (likely (2 < end - text && 84 (t1 = text[0] - 0x80u) <= 0x3Fu && 85 (t2 = text[1] - 0x80u) <= 0x3Fu && 86 (t3 = text[2] - 0x80u) <= 0x3Fu)) 87 { 88 c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; 89 if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) 90 goto error; 91 text += 3; 92 } 93 else 94 goto error; 95 } 96 else 97 goto error; 98 } 99 100 *unicode = c; 101 return text; 102 103 error: 104 *unicode = replacement; 105 return text; 106 } 107 108 static const codepoint_t * prevhb_utf8_t109 prev (const codepoint_t *text, 110 const codepoint_t *start, 111 hb_codepoint_t *unicode, 112 hb_codepoint_t replacement) 113 { 114 const codepoint_t *end = text--; 115 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) 116 text--; 117 118 if (likely (next (text, end, unicode, replacement) == end)) 119 return text; 120 121 *unicode = replacement; 122 return end - 1; 123 } 124 125 static unsigned int strlenhb_utf8_t126 strlen (const codepoint_t *text) 127 { return ::strlen ((const char *) text); } 128 129 static unsigned int encode_lenhb_utf8_t130 encode_len (hb_codepoint_t unicode) 131 { 132 if (unicode < 0x0080u) return 1; 133 if (unicode < 0x0800u) return 2; 134 if (unicode < 0x10000u) return 3; 135 if (unicode < 0x110000u) return 4; 136 return 3; 137 } 138 139 static codepoint_t * encodehb_utf8_t140 encode (codepoint_t *text, 141 const codepoint_t *end, 142 hb_codepoint_t unicode) 143 { 144 if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 145 unicode = 0xFFFDu; 146 if (unicode < 0x0080u) 147 *text++ = unicode; 148 else if (unicode < 0x0800u) 149 { 150 if (end - text >= 2) 151 { 152 *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); 153 *text++ = 0x80u + (0x3Fu & (unicode )); 154 } 155 } 156 else if (unicode < 0x10000u) 157 { 158 if (end - text >= 3) 159 { 160 *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); 161 *text++ = 0x80u + (0x3Fu & (unicode >> 6)); 162 *text++ = 0x80u + (0x3Fu & (unicode )); 163 } 164 } 165 else 166 { 167 if (end - text >= 4) 168 { 169 *text++ = 0xF0u + (0x07u & (unicode >> 18)); 170 *text++ = 0x80u + (0x3Fu & (unicode >> 12)); 171 *text++ = 0x80u + (0x3Fu & (unicode >> 6)); 172 *text++ = 0x80u + (0x3Fu & (unicode )); 173 } 174 } 175 return text; 176 } 177 }; 178 179 180 template <typename TCodepoint> 181 struct hb_utf16_xe_t 182 { 183 static_assert (sizeof (TCodepoint) == 2, ""); 184 typedef TCodepoint codepoint_t; 185 186 static const codepoint_t * nexthb_utf16_xe_t187 next (const codepoint_t *text, 188 const codepoint_t *end, 189 hb_codepoint_t *unicode, 190 hb_codepoint_t replacement) 191 { 192 hb_codepoint_t c = *text++; 193 194 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 195 { 196 *unicode = c; 197 return text; 198 } 199 200 if (likely (c <= 0xDBFFu && text < end)) 201 { 202 /* High-surrogate in c */ 203 hb_codepoint_t l = *text; 204 if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) 205 { 206 /* Low-surrogate in l */ 207 *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); 208 text++; 209 return text; 210 } 211 } 212 213 /* Lonely / out-of-order surrogate. */ 214 *unicode = replacement; 215 return text; 216 } 217 218 static const codepoint_t * prevhb_utf16_xe_t219 prev (const codepoint_t *text, 220 const codepoint_t *start, 221 hb_codepoint_t *unicode, 222 hb_codepoint_t replacement) 223 { 224 hb_codepoint_t c = *--text; 225 226 if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) 227 { 228 *unicode = c; 229 return text; 230 } 231 232 if (likely (c >= 0xDC00u && start < text)) 233 { 234 /* Low-surrogate in c */ 235 hb_codepoint_t h = text[-1]; 236 if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) 237 { 238 /* High-surrogate in h */ 239 *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); 240 text--; 241 return text; 242 } 243 } 244 245 /* Lonely / out-of-order surrogate. */ 246 *unicode = replacement; 247 return text; 248 } 249 250 251 static unsigned int strlenhb_utf16_xe_t252 strlen (const codepoint_t *text) 253 { 254 unsigned int l = 0; 255 while (*text++) l++; 256 return l; 257 } 258 259 static unsigned int encode_lenhb_utf16_xe_t260 encode_len (hb_codepoint_t unicode) 261 { 262 return unicode < 0x10000 ? 1 : 2; 263 } 264 265 static codepoint_t * encodehb_utf16_xe_t266 encode (codepoint_t *text, 267 const codepoint_t *end, 268 hb_codepoint_t unicode) 269 { 270 if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 271 unicode = 0xFFFDu; 272 if (unicode < 0x10000u) 273 *text++ = unicode; 274 else if (end - text >= 2) 275 { 276 unicode -= 0x10000u; 277 *text++ = 0xD800u + (unicode >> 10); 278 *text++ = 0xDC00u + (unicode & 0x03FFu); 279 } 280 return text; 281 } 282 }; 283 284 typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; 285 typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; 286 287 288 template <typename TCodepoint, bool validate=true> 289 struct hb_utf32_xe_t 290 { 291 static_assert (sizeof (TCodepoint) == 4, ""); 292 typedef TCodepoint codepoint_t; 293 294 static const TCodepoint * nexthb_utf32_xe_t295 next (const TCodepoint *text, 296 const TCodepoint *end HB_UNUSED, 297 hb_codepoint_t *unicode, 298 hb_codepoint_t replacement) 299 { 300 hb_codepoint_t c = *unicode = *text++; 301 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 302 *unicode = replacement; 303 return text; 304 } 305 306 static const TCodepoint * prevhb_utf32_xe_t307 prev (const TCodepoint *text, 308 const TCodepoint *start HB_UNUSED, 309 hb_codepoint_t *unicode, 310 hb_codepoint_t replacement) 311 { 312 hb_codepoint_t c = *unicode = *--text; 313 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 314 *unicode = replacement; 315 return text; 316 } 317 318 static unsigned int strlenhb_utf32_xe_t319 strlen (const TCodepoint *text) 320 { 321 unsigned int l = 0; 322 while (*text++) l++; 323 return l; 324 } 325 326 static unsigned int encode_lenhb_utf32_xe_t327 encode_len (hb_codepoint_t unicode HB_UNUSED) 328 { 329 return 1; 330 } 331 332 static codepoint_t * encodehb_utf32_xe_t333 encode (codepoint_t *text, 334 const codepoint_t *end HB_UNUSED, 335 hb_codepoint_t unicode) 336 { 337 if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) 338 unicode = 0xFFFDu; 339 *text++ = unicode; 340 return text; 341 } 342 }; 343 344 typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; 345 typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; 346 347 348 struct hb_latin1_t 349 { 350 typedef uint8_t codepoint_t; 351 352 static const codepoint_t * nexthb_latin1_t353 next (const codepoint_t *text, 354 const codepoint_t *end HB_UNUSED, 355 hb_codepoint_t *unicode, 356 hb_codepoint_t replacement HB_UNUSED) 357 { 358 *unicode = *text++; 359 return text; 360 } 361 362 static const codepoint_t * prevhb_latin1_t363 prev (const codepoint_t *text, 364 const codepoint_t *start HB_UNUSED, 365 hb_codepoint_t *unicode, 366 hb_codepoint_t replacement HB_UNUSED) 367 { 368 *unicode = *--text; 369 return text; 370 } 371 372 static unsigned int strlenhb_latin1_t373 strlen (const codepoint_t *text) 374 { 375 unsigned int l = 0; 376 while (*text++) l++; 377 return l; 378 } 379 380 static unsigned int encode_lenhb_latin1_t381 encode_len (hb_codepoint_t unicode HB_UNUSED) 382 { 383 return 1; 384 } 385 386 static codepoint_t * encodehb_latin1_t387 encode (codepoint_t *text, 388 const codepoint_t *end HB_UNUSED, 389 hb_codepoint_t unicode) 390 { 391 if (unlikely (unicode >= 0x0100u)) 392 unicode = '?'; 393 *text++ = unicode; 394 return text; 395 } 396 }; 397 398 399 struct hb_ascii_t 400 { 401 typedef uint8_t codepoint_t; 402 403 static const codepoint_t * nexthb_ascii_t404 next (const codepoint_t *text, 405 const codepoint_t *end HB_UNUSED, 406 hb_codepoint_t *unicode, 407 hb_codepoint_t replacement HB_UNUSED) 408 { 409 *unicode = *text++; 410 if (*unicode >= 0x0080u) 411 *unicode = replacement; 412 return text; 413 } 414 415 static const codepoint_t * prevhb_ascii_t416 prev (const codepoint_t *text, 417 const codepoint_t *start HB_UNUSED, 418 hb_codepoint_t *unicode, 419 hb_codepoint_t replacement) 420 { 421 *unicode = *--text; 422 if (*unicode >= 0x0080u) 423 *unicode = replacement; 424 return text; 425 } 426 427 static unsigned int strlenhb_ascii_t428 strlen (const codepoint_t *text) 429 { 430 unsigned int l = 0; 431 while (*text++) l++; 432 return l; 433 } 434 435 static unsigned int encode_lenhb_ascii_t436 encode_len (hb_codepoint_t unicode HB_UNUSED) 437 { 438 return 1; 439 } 440 441 static codepoint_t * encodehb_ascii_t442 encode (codepoint_t *text, 443 const codepoint_t *end HB_UNUSED, 444 hb_codepoint_t unicode) 445 { 446 if (unlikely (unicode >= 0x0080u)) 447 unicode = '?'; 448 *text++ = unicode; 449 return text; 450 } 451 }; 452 453 #endif /* HB_UTF_HH */ 454