• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  utf8.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999sep13
16 *   created by: Markus W. Scherer
17 */
18 
19 /**
20  * \file
21  * \brief C API: 8-bit Unicode handling macros
22  *
23  * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
24  *
25  * For more information see utf.h and the ICU User Guide Strings chapter
26  * (https://unicode-org.github.io/icu/userguide/strings).
27  *
28  * <em>Usage:</em>
29  * ICU coding guidelines for if() statements should be followed when using these macros.
30  * Compound statements (curly braces {}) must be used  for if-else-while...
31  * bodies and all macro statements should be terminated with semicolon.
32  */
33 
34 #ifndef __UTF8_H__
35 #define __UTF8_H__
36 
37 #include <stdbool.h>
38 #include "unicode/umachine.h"
39 #ifndef __UTF_H__
40 #   include "unicode/utf.h"
41 #endif
42 
43 /* internal definitions ----------------------------------------------------- */
44 
/**
 * Number of trail bytes that follow a given UTF-8 lead byte.
 * Evaluates to 0 for every byte that is not a lead byte
 * (0..0xc1 and 0xf5..0xff), otherwise to 1, 2 or 3.
 * leadByte might be evaluated multiple times.
 *
 * Internal: not meant to be called directly by external clients, but it is
 * used by public macros in this file and therefore must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
        1+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
59 
/**
 * Number of trail bytes that follow the lead byte of a valid UTF-8 sequence.
 * Evaluates to 0 for 0..0xc1; the result is undefined (not meaningful)
 * for 0xf5..0xff because no range check is made on the upper end.
 * leadByte might be evaluated multiple times.
 *
 * Internal: not meant to be called directly by external clients, but it is
 * used by public macros in this file and therefore must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>=0xf0)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xc2))
73 
/**
 * Strip the length-marker bits from a UTF-8 lead byte in place, keeping only
 * the low (6-countTrailBytes) bits that contribute to the code point value.
 * leadByte must be a modifiable lvalue; the macro assigns to it.
 *
 * Internal: not meant to be called directly by external clients, but it is
 * used by public macros in this file and therefore must remain stable.
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))
82 
/**
 * Internal lookup table (bit vector) behind the 3-byte UTF-8 validity check
 * U8_IS_VALID_LEAD3_AND_T1.
 * One bit per (lead byte, first trail byte) pair; a set bit means the pair
 * can start a valid sequence.
 * The table byte is selected by bits 3..0 of the lead byte E0..EF,
 * the bit within that byte by bits 7..5 of the first trail byte.
 * Entry 0 (lead E0) only sets bit 5 and entry 0xD (lead ED) only sets bit 4;
 * all other entries set bits 4 and 5.
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
92 
/**
 * Internal 3-byte UTF-8 validity check.
 * Evaluates to a non-zero value (the matching table bit, not necessarily 1)
 * if lead byte E0..EF followed by first trail byte 00..FF can start a valid
 * sequence, and to 0 otherwise.
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
99 
/**
 * Internal lookup table (bit vector) behind the 4-byte UTF-8 validity check
 * U8_IS_VALID_LEAD4_AND_T1.
 * One bit per (lead byte, first trail byte) pair; a set bit means the pair
 * can start a valid sequence.
 * The table byte is selected by bits 7..4 of the first trail byte,
 * the bit within that byte by bits 2..0 of the lead byte F0..F4.
 * Only entries 8..0xB (trail bytes 80..BF) are non-zero.
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
109 
/**
 * Internal 4-byte UTF-8 validity check.
 * Evaluates to a non-zero value (the matching table bit, not necessarily 1)
 * if lead byte F0..F4 followed by first trail byte 00..FF can start a valid
 * sequence, and to 0 otherwise.
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
116 
117 
118 
119 
120 
121 
122 
123 
124 
125 /* single-code point definitions -------------------------------------------- */
126 
/**
 * Is this code unit (byte) a complete code point on its own,
 * i.e. a US-ASCII byte 0..0x7f (high bit clear)?
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))
134 
/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 *
 * Implemented as a single unsigned range check: after subtracting 0xc2 and
 * truncating to 8 bits, lead bytes map to 0..0x32 (0x32 = 0xf4-0xc2) and
 * every other byte value maps above that.
 *
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<0x33)
143 
/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 *
 * Reinterpreted as a signed 8-bit value, the trail bytes are exactly the
 * values -0x80..-0x41, so a single signed comparison suffices.
 *
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) ((int8_t)(c)<=-0x41)
151 
/**
 * Number of code units (bytes) needed to encode this Unicode code point
 * in UTF-8.
 * The argument is evaluated multiple times.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
     (uint32_t)(c)<=0x7ff ? 2 : \
     (uint32_t)(c)<=0xd7ff ? 3 : \
     ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff) ? 0 : \
     (uint32_t)(c)<=0xffff ? 3 : 4)
169 
/**
 * Upper bound on the number of UTF-8 code units (bytes) that encode a single
 * Unicode code point (U+0000..U+10ffff).
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
176 
/**
 * Random-access read of the code point that contains offset i.
 * The caller's offset is not modified: the macro works on a private copy.
 * i may address the lead byte or any trail byte of the code point; the copy
 * is first moved back to the lead byte and then the whole sequence is read.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
} UPRV_BLOCK_MACRO_END
198 
/**
 * Random-access read of the code point that contains offset i,
 * with error checking.
 * The caller's offset is not modified: the macro works on a private copy.
 * i may address the lead byte or any trail byte of the code point; the copy
 * is first moved back to the start of the sequence and then the whole
 * sequence is read.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
225 
/**
 * Random-access read of the code point that contains offset i,
 * substituting U+FFFD for ill-formed input.
 * The caller's offset is not modified: the macro works on a private copy.
 * i may address the lead byte or any trail byte of the code point; the copy
 * is first moved back to the start of the sequence and then the whole
 * sequence is read.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * A real U+FFFD in the text cannot be told apart from the U+FFFD produced
 * for an ill-formed sequence. Use U8_GET() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * @stable ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT_OR_FFFD(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
256 
257 /* definitions with forward iteration --------------------------------------- */
258 
/**
 * Read the code point that starts at offset i and move i to the next
 * code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        if((c)>=0xf0) { \
            /* four bytes: 3 payload bits in the lead byte, 6 in each trail byte */ \
            (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } else if((c)>=0xe0) { \
            /* three bytes: the lead byte is not masked with 0xf because the */ \
            /* cast to (UChar) drops the bits above the 16-bit result */ \
            (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            /* two bytes */ \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
291 
/**
 * Read the code point that starts at offset i and move i to the next
 * code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value (U_SENTINEL is passed as the substitute).
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) \
    U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
313 
/**
 * Read the code point that starts at offset i and move i to the next
 * code point boundary, substituting U+FFFD for ill-formed input.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * A real U+FFFD in the text cannot be told apart from the U+FFFD produced
 * for an ill-formed sequence. Use U8_NEXT() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * @stable ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) \
    U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
339 
/**
 * Shared implementation of U8_NEXT and U8_NEXT_OR_FFFD.
 *
 * Reads the byte at s[i], post-increments i, and stores the byte in c.
 * For a non-ASCII byte the rest of the sequence is fetched, validated and
 * assembled inside one short-circuit condition: each trail byte is only read
 * after the previous checks passed and after i was compared against length.
 * If any check fails, c is set to sub; i then stays behind the lead byte and
 * any trail bytes that had already been accepted.
 *
 * The boundary checks use (i)!=(length), not (i)<(length).
 *
 * @param s string, read as bytes
 * @param i string offset (modified)
 * @param length string length
 * @param c output variable, receives the code point or sub
 * @param sub value stored in c for an ill-formed sequence
 * @internal
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        /* __t holds the trail byte currently being examined */ \
        uint8_t __t = 0; \
        if((i)!=(length) && \
            /* fetch/validate/assemble all but last trail byte */ \
            ((c)>=0xe0 ? \
                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    /* c is reduced to the low 4 bits of the lead byte */ \
                    U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
                    (__t&=0x3f, 1) \
                :  /* U+10000..U+10FFFF */ \
                    /* c becomes lead-0xf0; must be 0..4 (F0..F4) */ \
                    ((c)-=0xf0)<=4 && \
                    U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
                    /* uint8_t subtraction: only 0x80..0xBF maps to 0..0x3f */ \
                    (__t=(s)[i]-0x80)<=0x3f) && \
                /* valid second-to-last trail byte */ \
                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
            :  /* U+0080..U+07FF */ \
                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
            /* last trail byte */ \
            (__t=(s)[i]-0x80)<=0x3f && \
            ((c)=((c)<<6)|__t, ++(i), 1)) { \
            /* well-formed: c and i were already updated inside the condition */ \
        } else { \
            (c)=(sub);  /* ill-formed */ \
        } \
    } \
} UPRV_BLOCK_MACRO_END
368 
/**
 * Write the UTF-8 encoding of a code point into a string buffer,
 * overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced past the bytes written (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __cp=(c); \
    if(__cp<=0x7f) { \
        /* ASCII: one byte */ \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<=0x7ff) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)(0xc0|(__cp>>6)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp<=0xffff) { \
        /* three bytes */ \
        (s)[(i)++]=(uint8_t)(0xe0|(__cp>>12)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)(0xf0|(__cp>>18)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>12)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } \
} UPRV_BLOCK_MACRO_END
401 
/**
 * Write the UTF-8 encoding of a code point into a string buffer,
 * overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced past the bytes written (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid (surrogate or above U+10FFFF) or the
 * trail bytes do not fit, then isError is set to true and nothing is written.
 * An ASCII code point is written without a capacity check (i<capacity is
 * a precondition).
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append
 * @param isError output UBool set to true if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __cp=(c); \
    if(__cp<=0x7f) { \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)(0xc0|(__cp>>6)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp<=0xffff && !(0xd800<=__cp && __cp<=0xdfff) && (i)+2<(capacity)) { \
        /* BMP code point that is not a surrogate */ \
        (s)[(i)++]=(uint8_t)(0xe0|(__cp>>12)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp>=0x10000 && __cp<=0x10ffff && (i)+3<(capacity)) { \
        /* supplementary code point */ \
        (s)[(i)++]=(uint8_t)(0xf0|(__cp>>18)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>12)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else { \
        /* invalid code point, or not enough room for the whole sequence */ \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END
439 
/**
 * Move the string offset forward from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8: the byte at the offset alone
 * determines how far the offset moves.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i])+1; \
} UPRV_BLOCK_MACRO_END
453 
/**
 * Move the string offset forward from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset always moves past the byte it points to; it moves past each
 * following trail byte only while the sequence is still valid and the
 * offset has not reached length.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    uint8_t __lead=(s)[(i)++]; \
    if(U8_IS_LEAD(__lead) && (i)!=(length)) { \
        uint8_t __trail1=(s)[i]; \
        if(__lead<0xe0) { \
            /* 2-byte sequence: one plain trail byte */ \
            if(U8_IS_TRAIL(__trail1)) { \
                ++(i); \
            } \
        } else if(__lead<0xf0) { \
            /* 3-byte sequence: restricted first trail byte, then one more */ \
            if(U8_IS_VALID_LEAD3_AND_T1(__lead, __trail1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } else { \
            /* 4-byte sequence: restricted first trail byte, then two more */ \
            if(U8_IS_VALID_LEAD4_AND_T1(__lead, __trail1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
489 
/**
 * Move the string offset forward by n code points, i.e. from one code point
 * boundary to the n-th next one.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Nothing happens if n is zero or negative.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining; \
    for(__remaining=(n); __remaining>0; --__remaining) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
509 
/**
 * Move the string offset forward by n code points, i.e. from one code point
 * boundary to the n-th next one.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Stops early when the offset reaches length or, for a negative length,
 * when the byte at the offset is NUL.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining; \
    for(__remaining=(n); \
            __remaining>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); \
            --__remaining) { \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
532 
/**
 * Snap a random-access offset back to the start of the code point
 * that contains it.
 * If the offset points to a UTF-8 trail byte, it is decremented until it no
 * longer points to a trail byte (the corresponding lead byte in well-formed
 * text). Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END
549 
/**
 * Snap a random-access offset back to the start of the code point
 * that contains it.
 * If the offset points to a UTF-8 trail byte, the offset is replaced by the
 * result of utf8_back1SafeBody(), which moves it backward to the
 * corresponding lead byte.
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U8_IS_TRAIL((s)[i])) { \
        /* only trail bytes need the out-of-line backward scan */ \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
572 
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 * At most the last three bytes before length are examined, and none
 * before start.
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * @param s const uint8_t * string; may be any pointer expression
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length); modified in place
 * @see U8_SET_CP_START
 * @stable ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        /* (s) is parenthesized so that arguments like p+1 subscript correctly */ \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lone lead byte at the end always starts an incomplete sequence */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by one valid first trail byte */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* two trail bytes: only a 4-byte lead can still be incomplete */ \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
622 
623 /* definitions with backward iteration -------------------------------------- */
624 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is the last trail byte: keep its 6 payload bits */ \
        (c)&=0x3f; \
        /* walk backward over further trail bytes, 6 more bits each */ \
        while((__b=(s)[--(i)])<0xc0) { \
            (c)|=(UChar32)(__b&0x3f)<<__shift; \
            ++__count; \
            __shift+=6; \
        } \
        /* __b is now the lead byte; __count is the number of trail bytes */ \
        U8_MASK_LEAD_BYTE(__b, __count); \
        (c)|=(UChar32)__b<<__shift; \
    } \
} UPRV_BLOCK_MACRO_END
665 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * ASCII bytes are handled inline; every other byte is handed to
 * utf8_prevCharSafeBody() with strict argument -1.
 *
 * @param s const uint8_t * string; may be any pointer expression
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* (s) is parenthesized so the cast applies to the whole argument */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END
692 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * ASCII bytes are handled inline; every other byte is handed to
 * utf8_prevCharSafeBody() with strict argument -3.
 *
 * @param s const uint8_t * string; may be any pointer expression
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * @stable ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* (s) is parenthesized so the cast applies to the whole argument */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END
723 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8: the offset is decremented at
 * least once and then for as long as it points to a trail byte, with no
 * lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
} UPRV_BLOCK_MACRO_END
738 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset is decremented once; if it then points to a trail byte, it is
 * replaced by the result of utf8_back1SafeBody().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    if(U8_IS_TRAIL((s)[i])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
756 
/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th one before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Nothing happens if n is zero or negative.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining; \
    for(__remaining=(n); __remaining>0; --__remaining) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
777 
/**
 * Move the string offset backward by n code points, i.e. from one code point
 * boundary to the n-th one before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Stops early when the offset reaches start.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining; \
    for(__remaining=(n); __remaining>0 && (i)>(start); --__remaining) { \
        U8_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
799 
/**
 * Snap a random-access offset forward to the boundary after the code point
 * it is in.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Implemented by stepping back one code point and then forward one
 * code point.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* back to the start of the preceding code point ... */ \
    U8_BACK_1_UNSAFE(s, i); \
    /* ... then forward over all of it */ \
    U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
817 
/**
 * Snap a random-access offset forward to the boundary after the code point
 * it is in.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset is left untouched when it is at start, or when it has reached
 * a non-negative length.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((i)>(start) && ((i)<(length) || (length)<0)) { \
        /* step back one code point, then forward over all of it */ \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
841 
842 #endif
843