1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: utf8.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 1999sep13 16 * created by: Markus W. Scherer 17 */ 18 19 /** 20 * \file 21 * \brief C API: 8-bit Unicode handling macros 22 * 23 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. 24 * 25 * For more information see utf.h and the ICU User Guide Strings chapter 26 * (https://unicode-org.github.io/icu/userguide/strings). 27 * 28 * <em>Usage:</em> 29 * ICU coding guidelines for if() statements should be followed when using these macros. 30 * Compound statements (curly braces {}) must be used for if-else-while... 31 * bodies and all macro statements should be terminated with semicolon. 32 */ 33 34 #ifndef __UTF8_H__ 35 #define __UTF8_H__ 36 37 #include <stdbool.h> 38 #include "unicode/umachine.h" 39 #ifndef __UTF_H__ 40 # include "unicode/utf.h" 41 #endif 42 43 /* internal definitions ----------------------------------------------------- */ 44 45 /** 46 * Counts the trail bytes for a UTF-8 lead byte. 47 * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. 48 * leadByte might be evaluated multiple times. 49 * 50 * This is internal since it is not meant to be called directly by external clients; 51 * however it is called by public macros in this file and thus must remain stable. 52 * 53 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 54 * @internal 55 */ 56 #define U8_COUNT_TRAIL_BYTES(leadByte) \ 57 (U8_IS_LEAD(leadByte) ? \ 58 ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) 59 60 /** 61 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. 62 * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. 63 * leadByte might be evaluated multiple times. 64 * 65 * This is internal since it is not meant to be called directly by external clients; 66 * however it is called by public macros in this file and thus must remain stable. 67 * 68 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 69 * @internal 70 */ 71 #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ 72 (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) 73 74 /** 75 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 76 * 77 * This is internal since it is not meant to be called directly by external clients; 78 * however it is called by public macros in this file and thus must remain stable. 79 * @internal 80 */ 81 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 82 83 /** 84 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. 85 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. 86 * Lead byte E0..EF bits 3..0 are used as byte index, 87 * first trail byte bits 7..5 are used as bit index into that byte. 88 * @see U8_IS_VALID_LEAD3_AND_T1 89 * @internal 90 */ 91 #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" 92 93 /** 94 * Internal 3-byte UTF-8 validity check. 95 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. 96 * @internal 97 */ 98 #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) 99 100 /** 101 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. 102 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. 103 * First trail byte bits 7..4 are used as byte index, 104 * lead byte F0..F4 bits 2..0 are used as bit index into that byte. 105 * @see U8_IS_VALID_LEAD4_AND_T1 106 * @internal 107 */ 108 #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" 109 110 /** 111 * Internal 4-byte UTF-8 validity check. 112 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. 113 * @internal 114 */ 115 #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) 116 117 118 119 120 121 122 123 124 125 /* single-code point definitions -------------------------------------------- */ 126 127 /** 128 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 129 * @param c 8-bit code unit (byte) 130 * @return true or false 131 * @stable ICU 2.4 132 */ 133 #define U8_IS_SINGLE(c) (((c)&0x80)==0) 134 135 /** 136 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) 137 * @param c 8-bit code unit (byte) 138 * @return true or false 139 * @stable ICU 2.4 140 */ 141 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) 142 // 0x32=0xf4-0xc2 143 144 /** 145 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) 146 * @param c 8-bit code unit (byte) 147 * @return true or false 148 * @stable ICU 2.4 149 */ 150 #define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) 151 152 /** 153 * How many code units (bytes) are used for the UTF-8 encoding 154 * of this Unicode code point? 155 * @param c 32-bit code point 156 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 157 * @stable ICU 2.4 158 */ 159 #define U8_LENGTH(c) \ 160 ((uint32_t)(c)<=0x7f ? 1 : \ 161 ((uint32_t)(c)<=0x7ff ? 2 : \ 162 ((uint32_t)(c)<=0xd7ff ? 3 : \ 163 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 164 ((uint32_t)(c)<=0xffff ? 3 : 4)\ 165 ) \ 166 ) \ 167 ) \ 168 ) 169 170 /** 171 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 172 * @return 4 173 * @stable ICU 2.4 174 */ 175 #define U8_MAX_LENGTH 4 176 177 /** 178 * Get a code point from a string at a random-access offset, 179 * without changing the offset. 180 * The offset may point to either the lead byte or one of the trail bytes 181 * for a code point, in which case the macro will read all of the bytes 182 * for the code point. 183 * The result is undefined if the offset points to an illegal UTF-8 184 * byte sequence. 185 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 186 * 187 * @param s const uint8_t * string 188 * @param i string offset 189 * @param c output UChar32 variable 190 * @see U8_GET 191 * @stable ICU 2.4 192 */ 193 #define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 194 int32_t _u8_get_unsafe_index=(int32_t)(i); \ 195 U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ 196 U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ 197 } UPRV_BLOCK_MACRO_END 198 199 /** 200 * Get a code point from a string at a random-access offset, 201 * without changing the offset. 202 * The offset may point to either the lead byte or one of the trail bytes 203 * for a code point, in which case the macro will read all of the bytes 204 * for the code point. 205 * 206 * The length can be negative for a NUL-terminated string. 207 * 208 * If the offset points to an illegal UTF-8 byte sequence, then 209 * c is set to a negative value. 210 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 211 * 212 * @param s const uint8_t * string 213 * @param start int32_t starting string offset 214 * @param i int32_t string offset, must be start<=i<length 215 * @param length int32_t string length 216 * @param c output UChar32 variable, set to <0 in case of an error 217 * @see U8_GET_UNSAFE 218 * @stable ICU 2.4 219 */ 220 #define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ 221 int32_t _u8_get_index=(i); \ 222 U8_SET_CP_START(s, start, _u8_get_index); \ 223 U8_NEXT(s, _u8_get_index, length, c); \ 224 } UPRV_BLOCK_MACRO_END 225 226 /** 227 * Get a code point from a string at a random-access offset, 228 * without changing the offset. 229 * The offset may point to either the lead byte or one of the trail bytes 230 * for a code point, in which case the macro will read all of the bytes 231 * for the code point. 232 * 233 * The length can be negative for a NUL-terminated string. 234 * 235 * If the offset points to an illegal UTF-8 byte sequence, then 236 * c is set to U+FFFD. 237 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. 238 * 239 * This macro does not distinguish between a real U+FFFD in the text 240 * and U+FFFD returned for an ill-formed sequence. 241 * Use U8_GET() if that distinction is important. 242 * 243 * @param s const uint8_t * string 244 * @param start int32_t starting string offset 245 * @param i int32_t string offset, must be start<=i<length 246 * @param length int32_t string length 247 * @param c output UChar32 variable, set to U+FFFD in case of an error 248 * @see U8_GET 249 * @stable ICU 51 250 */ 251 #define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ 252 int32_t _u8_get_index=(i); \ 253 U8_SET_CP_START(s, start, _u8_get_index); \ 254 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ 255 } UPRV_BLOCK_MACRO_END 256 257 /* definitions with forward iteration --------------------------------------- */ 258 259 /** 260 * Get a code point from a string at a code point boundary offset, 261 * and advance the offset to the next code point boundary. 262 * (Post-incrementing forward iteration.) 263 * "Unsafe" macro, assumes well-formed UTF-8. 264 * 265 * The offset may point to the lead byte of a multi-byte sequence, 266 * in which case the macro will read the whole sequence. 267 * The result is undefined if the offset points to a trail byte 268 * or an illegal UTF-8 sequence. 269 * 270 * @param s const uint8_t * string 271 * @param i string offset 272 * @param c output UChar32 variable 273 * @see U8_NEXT 274 * @stable ICU 2.4 275 */ 276 #define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 277 (c)=(uint8_t)(s)[(i)++]; \ 278 if(!U8_IS_SINGLE(c)) { \ 279 if((c)<0xe0) { \ 280 (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ 281 } else if((c)<0xf0) { \ 282 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 283 (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ 284 (i)+=2; \ 285 } else { \ 286 (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ 287 (i)+=3; \ 288 } \ 289 } \ 290 } UPRV_BLOCK_MACRO_END 291 292 /** 293 * Get a code point from a string at a code point boundary offset, 294 * and advance the offset to the next code point boundary. 295 * (Post-incrementing forward iteration.) 296 * "Safe" macro, checks for illegal sequences and for string boundaries. 297 * 298 * The length can be negative for a NUL-terminated string. 299 * 300 * The offset may point to the lead byte of a multi-byte sequence, 301 * in which case the macro will read the whole sequence. 302 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 303 * c is set to a negative value. 304 * 305 * @param s const uint8_t * string 306 * @param i int32_t string offset, must be i<length 307 * @param length int32_t string length 308 * @param c output UChar32 variable, set to <0 in case of an error 309 * @see U8_NEXT_UNSAFE 310 * @stable ICU 2.4 311 */ 312 #define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL) 313 314 /** 315 * Get a code point from a string at a code point boundary offset, 316 * and advance the offset to the next code point boundary. 317 * (Post-incrementing forward iteration.) 318 * "Safe" macro, checks for illegal sequences and for string boundaries. 319 * 320 * The length can be negative for a NUL-terminated string. 321 * 322 * The offset may point to the lead byte of a multi-byte sequence, 323 * in which case the macro will read the whole sequence. 324 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 325 * c is set to U+FFFD. 326 * 327 * This macro does not distinguish between a real U+FFFD in the text 328 * and U+FFFD returned for an ill-formed sequence. 329 * Use U8_NEXT() if that distinction is important. 330 * 331 * @param s const uint8_t * string 332 * @param i int32_t string offset, must be i<length 333 * @param length int32_t string length 334 * @param c output UChar32 variable, set to U+FFFD in case of an error 335 * @see U8_NEXT 336 * @stable ICU 51 337 */ 338 #define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd) 339 340 /** @internal */ 341 #define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \ 342 (c)=(uint8_t)(s)[(i)++]; \ 343 if(!U8_IS_SINGLE(c)) { \ 344 uint8_t __t = 0; \ 345 if((i)!=(length) && \ 346 /* fetch/validate/assemble all but last trail byte */ \ 347 ((c)>=0xe0 ? \ 348 ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \ 349 U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ 350 (__t&=0x3f, 1) \ 351 : /* U+10000..U+10FFFF */ \ 352 ((c)-=0xf0)<=4 && \ 353 U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ 354 ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ 355 (__t=(s)[i]-0x80)<=0x3f) && \ 356 /* valid second-to-last trail byte */ \ 357 ((c)=((c)<<6)|__t, ++(i)!=(length)) \ 358 : /* U+0080..U+07FF */ \ 359 (c)>=0xc2 && ((c)&=0x1f, 1)) && \ 360 /* last trail byte */ \ 361 (__t=(s)[i]-0x80)<=0x3f && \ 362 ((c)=((c)<<6)|__t, ++(i), 1)) { \ 363 } else { \ 364 (c)=(sub); /* ill-formed*/ \ 365 } \ 366 } \ 367 } UPRV_BLOCK_MACRO_END 368 369 /** 370 * Append a code point to a string, overwriting 1 to 4 bytes. 371 * The offset points to the current end of the string contents 372 * and is advanced (post-increment). 373 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 374 * Otherwise, the result is undefined. 375 * 376 * @param s const uint8_t * string buffer 377 * @param i string offset 378 * @param c code point to append 379 * @see U8_APPEND 380 * @stable ICU 2.4 381 */ 382 #define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 383 uint32_t __uc=(c); \ 384 if(__uc<=0x7f) { \ 385 (s)[(i)++]=(uint8_t)__uc; \ 386 } else { \ 387 if(__uc<=0x7ff) { \ 388 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ 389 } else { \ 390 if(__uc<=0xffff) { \ 391 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ 392 } else { \ 393 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ 394 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ 395 } \ 396 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 397 } \ 398 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 399 } \ 400 } UPRV_BLOCK_MACRO_END 401 402 /** 403 * Append a code point to a string, overwriting 1 to 4 bytes. 404 * The offset points to the current end of the string contents 405 * and is advanced (post-increment). 406 * "Safe" macro, checks for a valid code point. 407 * If a non-ASCII code point is written, checks for sufficient space in the string. 408 * If the code point is not valid or trail bytes do not fit, 409 * then isError is set to true. 410 * 411 * @param s const uint8_t * string buffer 412 * @param i int32_t string offset, must be i<capacity 413 * @param capacity int32_t size of the string buffer 414 * @param c UChar32 code point to append 415 * @param isError output UBool set to true if an error occurs, otherwise not modified 416 * @see U8_APPEND_UNSAFE 417 * @stable ICU 2.4 418 */ 419 #define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \ 420 uint32_t __uc=(c); \ 421 if(__uc<=0x7f) { \ 422 (s)[(i)++]=(uint8_t)__uc; \ 423 } else if(__uc<=0x7ff && (i)+1<(capacity)) { \ 424 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ 425 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 426 } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \ 427 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ 428 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 429 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 430 } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \ 431 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ 432 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ 433 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 434 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 435 } else { \ 436 (isError)=true; \ 437 } \ 438 } UPRV_BLOCK_MACRO_END 439 440 /** 441 * Advance the string offset from one code point boundary to the next. 442 * (Post-incrementing iteration.) 443 * "Unsafe" macro, assumes well-formed UTF-8. 444 * 445 * @param s const uint8_t * string 446 * @param i string offset 447 * @see U8_FWD_1 448 * @stable ICU 2.4 449 */ 450 #define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 451 (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ 452 } UPRV_BLOCK_MACRO_END 453 454 /** 455 * Advance the string offset from one code point boundary to the next. 456 * (Post-incrementing iteration.) 457 * "Safe" macro, checks for illegal sequences and for string boundaries. 458 * 459 * The length can be negative for a NUL-terminated string. 460 * 461 * @param s const uint8_t * string 462 * @param i int32_t string offset, must be i<length 463 * @param length int32_t string length 464 * @see U8_FWD_1_UNSAFE 465 * @stable ICU 2.4 466 */ 467 #define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \ 468 uint8_t __b=(s)[(i)++]; \ 469 if(U8_IS_LEAD(__b) && (i)!=(length)) { \ 470 uint8_t __t1=(s)[i]; \ 471 if((0xe0<=__b && __b<0xf0)) { \ 472 if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \ 473 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ 474 ++(i); \ 475 } \ 476 } else if(__b<0xe0) { \ 477 if(U8_IS_TRAIL(__t1)) { \ 478 ++(i); \ 479 } \ 480 } else /* c>=0xf0 */ { \ 481 if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ 482 ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ 483 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ 484 ++(i); \ 485 } \ 486 } \ 487 } \ 488 } UPRV_BLOCK_MACRO_END 489 490 /** 491 * Advance the string offset from one code point boundary to the n-th next one, 492 * i.e., move forward by n code points. 493 * (Post-incrementing iteration.) 494 * "Unsafe" macro, assumes well-formed UTF-8. 495 * 496 * @param s const uint8_t * string 497 * @param i string offset 498 * @param n number of code points to skip 499 * @see U8_FWD_N 500 * @stable ICU 2.4 501 */ 502 #define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 503 int32_t __N=(n); \ 504 while(__N>0) { \ 505 U8_FWD_1_UNSAFE(s, i); \ 506 --__N; \ 507 } \ 508 } UPRV_BLOCK_MACRO_END 509 510 /** 511 * Advance the string offset from one code point boundary to the n-th next one, 512 * i.e., move forward by n code points. 513 * (Post-incrementing iteration.) 514 * "Safe" macro, checks for illegal sequences and for string boundaries. 515 * 516 * The length can be negative for a NUL-terminated string. 517 * 518 * @param s const uint8_t * string 519 * @param i int32_t string offset, must be i<length 520 * @param length int32_t string length 521 * @param n number of code points to skip 522 * @see U8_FWD_N_UNSAFE 523 * @stable ICU 2.4 524 */ 525 #define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \ 526 int32_t __N=(n); \ 527 while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 528 U8_FWD_1(s, i, length); \ 529 --__N; \ 530 } \ 531 } UPRV_BLOCK_MACRO_END 532 533 /** 534 * Adjust a random-access offset to a code point boundary 535 * at the start of a code point. 536 * If the offset points to a UTF-8 trail byte, 537 * then the offset is moved backward to the corresponding lead byte. 538 * Otherwise, it is not modified. 539 * "Unsafe" macro, assumes well-formed UTF-8. 540 * 541 * @param s const uint8_t * string 542 * @param i string offset 543 * @see U8_SET_CP_START 544 * @stable ICU 2.4 545 */ 546 #define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 547 while(U8_IS_TRAIL((s)[i])) { --(i); } \ 548 } UPRV_BLOCK_MACRO_END 549 550 /** 551 * Adjust a random-access offset to a code point boundary 552 * at the start of a code point. 553 * If the offset points to a UTF-8 trail byte, 554 * then the offset is moved backward to the corresponding lead byte. 555 * Otherwise, it is not modified. 556 * 557 * "Safe" macro, checks for illegal sequences and for string boundaries. 558 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. 559 * 560 * @param s const uint8_t * string 561 * @param start int32_t starting string offset (usually 0) 562 * @param i int32_t string offset, must be start<=i 563 * @see U8_SET_CP_START_UNSAFE 564 * @see U8_TRUNCATE_IF_INCOMPLETE 565 * @stable ICU 2.4 566 */ 567 #define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ 568 if(U8_IS_TRAIL((s)[(i)])) { \ 569 (i)=utf8_back1SafeBody(s, start, (i)); \ 570 } \ 571 } UPRV_BLOCK_MACRO_END 572 573 /** 574 * If the string ends with a UTF-8 byte sequence that is valid so far 575 * but incomplete, then reduce the length of the string to end before 576 * the lead byte of that incomplete sequence. 577 * For example, if the string ends with E1 80, the length is reduced by 2. 578 * 579 * In all other cases (the string ends with a complete sequence, or it is not 580 * possible for any further trail byte to extend the trailing sequence) 581 * the length remains unchanged. 582 * 583 * Useful for processing text split across multiple buffers 584 * (save the incomplete sequence for later) 585 * and for optimizing iteration 586 * (check for string length only once per character). 587 * 588 * "Safe" macro, checks for illegal sequences and for string boundaries. 589 * Unlike U8_SET_CP_START(), this macro never reads s[length]. 590 * 591 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) 592 * 593 * @param s const uint8_t * string 594 * @param start int32_t starting string offset (usually 0) 595 * @param length int32_t string length (usually start<=length) 596 * @see U8_SET_CP_START 597 * @stable ICU 61 598 */ 599 #define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \ 600 if((length)>(start)) { \ 601 uint8_t __b1=s[(length)-1]; \ 602 if(U8_IS_SINGLE(__b1)) { \ 603 /* common ASCII character */ \ 604 } else if(U8_IS_LEAD(__b1)) { \ 605 --(length); \ 606 } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ 607 uint8_t __b2=s[(length)-2]; \ 608 if(0xe0<=__b2 && __b2<=0xf4) { \ 609 if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ 610 U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ 611 (length)-=2; \ 612 } \ 613 } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ 614 uint8_t __b3=s[(length)-3]; \ 615 if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ 616 (length)-=3; \ 617 } \ 618 } \ 619 } \ 620 } \ 621 } UPRV_BLOCK_MACRO_END 622 623 /* definitions with backward iteration -------------------------------------- */ 624 625 /** 626 * Move the string offset from one code point boundary to the previous one 627 * and get the code point between them. 628 * (Pre-decrementing backward iteration.) 629 * "Unsafe" macro, assumes well-formed UTF-8. 630 * 631 * The input offset may be the same as the string length. 632 * If the offset is behind a multi-byte sequence, then the macro will read 633 * the whole sequence. 634 * If the offset is behind a lead byte, then that itself 635 * will be returned as the code point. 636 * The result is undefined if the offset is behind an illegal UTF-8 sequence. 637 * 638 * @param s const uint8_t * string 639 * @param i string offset 640 * @param c output UChar32 variable 641 * @see U8_PREV 642 * @stable ICU 2.4 643 */ 644 #define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 645 (c)=(uint8_t)(s)[--(i)]; \ 646 if(U8_IS_TRAIL(c)) { \ 647 uint8_t __b, __count=1, __shift=6; \ 648 \ 649 /* c is a trail byte */ \ 650 (c)&=0x3f; \ 651 for(;;) { \ 652 __b=(s)[--(i)]; \ 653 if(__b>=0xc0) { \ 654 U8_MASK_LEAD_BYTE(__b, __count); \ 655 (c)|=(UChar32)__b<<__shift; \ 656 break; \ 657 } else { \ 658 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 659 ++__count; \ 660 __shift+=6; \ 661 } \ 662 } \ 663 } \ 664 } UPRV_BLOCK_MACRO_END 665 666 /** 667 * Move the string offset from one code point boundary to the previous one 668 * and get the code point between them. 669 * (Pre-decrementing backward iteration.) 670 * "Safe" macro, checks for illegal sequences and for string boundaries. 671 * 672 * The input offset may be the same as the string length. 673 * If the offset is behind a multi-byte sequence, then the macro will read 674 * the whole sequence. 675 * If the offset is behind a lead byte, then that itself 676 * will be returned as the code point. 677 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. 678 * 679 * @param s const uint8_t * string 680 * @param start int32_t starting string offset (usually 0) 681 * @param i int32_t string offset, must be start<i 682 * @param c output UChar32 variable, set to <0 in case of an error 683 * @see U8_PREV_UNSAFE 684 * @stable ICU 2.4 685 */ 686 #define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 687 (c)=(uint8_t)(s)[--(i)]; \ 688 if(!U8_IS_SINGLE(c)) { \ 689 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ 690 } \ 691 } UPRV_BLOCK_MACRO_END 692 693 /** 694 * Move the string offset from one code point boundary to the previous one 695 * and get the code point between them. 696 * (Pre-decrementing backward iteration.) 697 * "Safe" macro, checks for illegal sequences and for string boundaries. 698 * 699 * The input offset may be the same as the string length. 700 * If the offset is behind a multi-byte sequence, then the macro will read 701 * the whole sequence. 702 * If the offset is behind a lead byte, then that itself 703 * will be returned as the code point. 704 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. 705 * 706 * This macro does not distinguish between a real U+FFFD in the text 707 * and U+FFFD returned for an ill-formed sequence. 708 * Use U8_PREV() if that distinction is important. 709 * 710 * @param s const uint8_t * string 711 * @param start int32_t starting string offset (usually 0) 712 * @param i int32_t string offset, must be start<i 713 * @param c output UChar32 variable, set to U+FFFD in case of an error 714 * @see U8_PREV 715 * @stable ICU 51 716 */ 717 #define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 718 (c)=(uint8_t)(s)[--(i)]; \ 719 if(!U8_IS_SINGLE(c)) { \ 720 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ 721 } \ 722 } UPRV_BLOCK_MACRO_END 723 724 /** 725 * Move the string offset from one code point boundary to the previous one. 726 * (Pre-decrementing backward iteration.) 727 * The input offset may be the same as the string length. 728 * "Unsafe" macro, assumes well-formed UTF-8. 729 * 730 * @param s const uint8_t * string 731 * @param i string offset 732 * @see U8_BACK_1 733 * @stable ICU 2.4 734 */ 735 #define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 736 while(U8_IS_TRAIL((s)[--(i)])) {} \ 737 } UPRV_BLOCK_MACRO_END 738 739 /** 740 * Move the string offset from one code point boundary to the previous one. 741 * (Pre-decrementing backward iteration.) 742 * The input offset may be the same as the string length. 743 * "Safe" macro, checks for illegal sequences and for string boundaries. 744 * 745 * @param s const uint8_t * string 746 * @param start int32_t starting string offset (usually 0) 747 * @param i int32_t string offset, must be start<i 748 * @see U8_BACK_1_UNSAFE 749 * @stable ICU 2.4 750 */ 751 #define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ 752 if(U8_IS_TRAIL((s)[--(i)])) { \ 753 (i)=utf8_back1SafeBody(s, start, (i)); \ 754 } \ 755 } UPRV_BLOCK_MACRO_END 756 757 /** 758 * Move the string offset from one code point boundary to the n-th one before it, 759 * i.e., move backward by n code points. 760 * (Pre-decrementing backward iteration.) 761 * The input offset may be the same as the string length. 762 * "Unsafe" macro, assumes well-formed UTF-8. 763 * 764 * @param s const uint8_t * string 765 * @param i string offset 766 * @param n number of code points to skip 767 * @see U8_BACK_N 768 * @stable ICU 2.4 769 */ 770 #define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 771 int32_t __N=(n); \ 772 while(__N>0) { \ 773 U8_BACK_1_UNSAFE(s, i); \ 774 --__N; \ 775 } \ 776 } UPRV_BLOCK_MACRO_END 777 778 /** 779 * Move the string offset from one code point boundary to the n-th one before it, 780 * i.e., move backward by n code points. 781 * (Pre-decrementing backward iteration.) 782 * The input offset may be the same as the string length. 783 * "Safe" macro, checks for illegal sequences and for string boundaries. 784 * 785 * @param s const uint8_t * string 786 * @param start int32_t index of the start of the string 787 * @param i int32_t string offset, must be start<i 788 * @param n number of code points to skip 789 * @see U8_BACK_N_UNSAFE 790 * @stable ICU 2.4 791 */ 792 #define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 793 int32_t __N=(n); \ 794 while(__N>0 && (i)>(start)) { \ 795 U8_BACK_1(s, start, i); \ 796 --__N; \ 797 } \ 798 } UPRV_BLOCK_MACRO_END 799 800 /** 801 * Adjust a random-access offset to a code point boundary after a code point. 802 * If the offset is behind a partial multi-byte sequence, 803 * then the offset is incremented to behind the whole sequence. 804 * Otherwise, it is not modified. 805 * The input offset may be the same as the string length. 806 * "Unsafe" macro, assumes well-formed UTF-8. 807 * 808 * @param s const uint8_t * string 809 * @param i string offset 810 * @see U8_SET_CP_LIMIT 811 * @stable ICU 2.4 812 */ 813 #define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 814 U8_BACK_1_UNSAFE(s, i); \ 815 U8_FWD_1_UNSAFE(s, i); \ 816 } UPRV_BLOCK_MACRO_END 817 818 /** 819 * Adjust a random-access offset to a code point boundary after a code point. 820 * If the offset is behind a partial multi-byte sequence, 821 * then the offset is incremented to behind the whole sequence. 822 * Otherwise, it is not modified. 823 * The input offset may be the same as the string length. 824 * "Safe" macro, checks for illegal sequences and for string boundaries. 825 * 826 * The length can be negative for a NUL-terminated string. 827 * 828 * @param s const uint8_t * string 829 * @param start int32_t starting string offset (usually 0) 830 * @param i int32_t string offset, must be start<=i<=length 831 * @param length int32_t string length 832 * @see U8_SET_CP_LIMIT_UNSAFE 833 * @stable ICU 2.4 834 */ 835 #define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ 836 if((start)<(i) && ((i)<(length) || (length)<0)) { \ 837 U8_BACK_1(s, start, i); \ 838 U8_FWD_1(s, i, length); \ 839 } \ 840 } UPRV_BLOCK_MACRO_END 841 842 #endif 843