• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  utf8.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999sep13
16 *   created by: Markus W. Scherer
17 */
18 
/**
 * @addtogroup icu4c ICU4C
 * @{
 * \file
 * \brief C API: 8-bit Unicode handling macros
 *
 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (https://unicode-org.github.io/icu/userguide/strings).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
 */
35 
36 #ifndef __UTF8_H__
37 #define __UTF8_H__
38 
39 #include <stdbool.h>
40 #include "unicode/umachine.h"
41 #ifndef __UTF_H__
42 #   include "unicode/utf.h"
43 #endif
44 
45 /* internal definitions ----------------------------------------------------- */
46 
/**
 * Number of trail bytes that follow a UTF-8 lead byte.
 * Evaluates to 1..3 for the lead bytes accepted by U8_IS_LEAD (0xc2..0xf4)
 * and to 0 for every other byte value (0..0xc1 and 0xf5..0xff).
 * leadByte might be evaluated multiple times.
 *
 * Internal: not meant to be called directly by external clients, but it is
 * intended for use by public macros and therefore must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (U8_IS_LEAD(leadByte) ? \
        1+((uint8_t)(leadByte)>0xdf)+((uint8_t)(leadByte)>0xef) : 0)
61 
/**
 * Number of trail bytes that follow the lead byte of a valid UTF-8 sequence.
 * Evaluates to 0 for 0..0xc1, 1 for 0xc2..0xdf, 2 for 0xe0..0xef and
 * 3 for 0xf0 and above. No upper-bound check is made, so the result
 * is not meaningful (undefined) for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * Internal: not meant to be called directly by external clients, but public
 * macros in this file expand to it, so it must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>0xc1)+((uint8_t)(leadByte)>0xdf)+((uint8_t)(leadByte)>0xef))
75 
/**
 * Strip the length-marker bits from a UTF-8 lead byte, in place, so that only
 * the low bits that contribute to the code point value remain.
 * The mask is (1<<(6-countTrailBytes))-1, e.g. 0x1f for one trail byte,
 * 0x0f for two and 0x07 for three.
 * leadByte is assigned to and must therefore be a modifiable lvalue.
 *
 * Internal: not meant to be called directly by external clients, but public
 * macros in this file expand to it, so it must remain stable.
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))
84 
/**
 * Internal lookup table (bit vector) behind U8_IS_VALID_LEAD3_AND_T1,
 * the validity check for the start of a 3-byte UTF-8 sequence.
 * One bit per (lead byte, first trail byte) pair says whether the pair can
 * start a valid sequence:
 * - the table byte is selected by bits 3..0 of the lead byte E0..EF,
 * - the bit within that byte is selected by bits 7..5 of the first trail byte.
 *
 * Entry 0 (lead E0) is 0x20: only trail bytes A0..BF are accepted.
 * Entry 13 (lead ED) is 0x10: only trail bytes 80..9F are accepted.
 * All other entries are 0x30: trail bytes 80..BF are accepted.
 * @see U8_IS_VALID_LEAD3_AND_T1
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
94 
/**
 * Internal validity check for the first two bytes of a 3-byte UTF-8 sequence.
 * Looks up the U8_LEAD3_T1_BITS entry for the low four bits of the lead byte
 * and tests the bit selected by the top three bits of the first trail byte.
 * The result is non-zero (the matching table bit, not necessarily 1) if
 * lead byte E0..EF and first trail byte 00..FF start a valid sequence,
 * otherwise 0.
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
101 
/**
 * Internal lookup table (bit vector) behind U8_IS_VALID_LEAD4_AND_T1,
 * the validity check for the start of a 4-byte UTF-8 sequence.
 * One bit per (lead byte, first trail byte) pair says whether the pair can
 * start a valid sequence:
 * - the table byte is selected by bits 7..4 of the first trail byte,
 * - the bit within that byte is selected by bits 2..0 of the lead byte F0..F4.
 *
 * Entry 8 (trail 80..8F) is 0x1E: accepted after lead bytes F1..F4.
 * Entries 9..11 (trail 90..BF) are 0x0F: accepted after lead bytes F0..F3.
 * All other entries are 0: not a valid first trail byte.
 * @see U8_IS_VALID_LEAD4_AND_T1
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
111 
/**
 * Internal validity check for the first two bytes of a 4-byte UTF-8 sequence.
 * Looks up the U8_LEAD4_T1_BITS entry for the high four bits of the first
 * trail byte and tests the bit selected by the low three bits of the lead byte.
 * The result is non-zero (the matching table bit, not necessarily 1) if
 * lead byte F0..F4 and first trail byte 00..FF start a valid sequence,
 * otherwise 0.
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
118 
119 
120 
121 
122 
123 
124 
125 
126 
127 /* single-code point definitions -------------------------------------------- */
128 
/**
 * Is this code unit (byte) a complete code point on its own,
 * i.e. a US-ASCII value 0..0x7f? The test is that bit 7 is clear.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))
136 
/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 * Implemented as a single unsigned range check: after subtracting 0xc2 and
 * truncating to 8 bits, lead bytes map to 0..(0xf4-0xc2) and every other
 * byte value wraps around to something larger.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=(0xf4-0xc2))
145 
/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 * Implemented as a signed comparison: reinterpreted as int8_t, the trail
 * bytes are exactly the values below -0x40.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_IS_TRAIL(c) ((int8_t)(c)<=-0x41)
153 
/**
 * Number of code units (bytes) needed to encode this Unicode code point
 * in UTF-8.
 * c is compared as uint32_t, so negative values count as out of range.
 * c might be evaluated multiple times.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate (U+D800..U+DFFF) or not a Unicode
 *         code point (above U+10FFFF)
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<0x80 ? 1 : \
     (uint32_t)(c)<0x800 ? 2 : \
     (uint32_t)(c)<0xd800 ? 3 : \
     (uint32_t)(c)<0xe000 ? 0 : \
     (uint32_t)(c)<0x10000 ? 3 : \
     (uint32_t)(c)<0x110000 ? 4 : 0)
171 
/**
 * Upper bound on the number of UTF-8 code units (bytes) used to encode
 * a single Unicode code point (U+0000..U+10ffff).
 * @return 4
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_MAX_LENGTH 4
178 
/**
 * Read the code point that contains a random-access offset,
 * leaving the offset itself untouched.
 * The offset may address the lead byte or any trail byte of the code point;
 * a private copy of the offset is first moved back to the lead byte
 * (U8_SET_CP_START_UNSAFE) and the whole sequence is then decoded
 * (U8_NEXT_UNSAFE).
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a copy so that the caller's offset is not modified */ \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
} UPRV_BLOCK_MACRO_END
200 
/**
 * Read the code point that contains a random-access offset,
 * leaving the offset itself untouched.
 * The offset may address the lead byte or any trail byte of the code point;
 * a private copy of the offset is first moved back to the start of the
 * code point (U8_SET_CP_START) and the sequence is then decoded (U8_NEXT).
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a copy so that the caller's offset is not modified */ \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
227 
/**
 * Read the code point that contains a random-access offset,
 * leaving the offset itself untouched.
 * The offset may address the lead byte or any trail byte of the code point;
 * a private copy of the offset is first moved back to the start of the
 * code point (U8_SET_CP_START) and the sequence is then decoded
 * (U8_NEXT_OR_FFFD).
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * A genuine U+FFFD in the text cannot be told apart from the U+FFFD
 * produced for an ill-formed sequence.
 * Use U8_GET() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * \xrefitem stable "Stable" "Stable List" ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a copy so that the caller's offset is not modified */ \
    int32_t _u8_get_fffd_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_fffd_pos); \
    U8_NEXT_OR_FFFD(s, _u8_get_fffd_pos, length, c); \
} UPRV_BLOCK_MACRO_END
258 
259 /* definitions with forward iteration --------------------------------------- */
260 
/**
 * Decode the code point that starts at a code point boundary offset
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        /* Keep the data bits of the lead byte, then fold in all trail bytes */ \
        /* but the last one; the last one is shared by every length. */ \
        if((c)>=0xf0) { \
            (c)&=7; \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)>=0xe0) { \
            (c)&=0xf; \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        } else { \
            (c)&=0x1f; \
        } \
        (c)=((c)<<6)|((s)[(i)++]&0x3f); \
    } \
} UPRV_BLOCK_MACRO_END
293 
/**
 * Decode the code point that starts at a code point boundary offset
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Expands to U8_INTERNAL_NEXT_OR_SUB with U_SENTINEL as the error value.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_NEXT(s, i, length, c) \
    U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
315 
/**
 * Decode the code point that starts at a code point boundary offset
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Expands to U8_INTERNAL_NEXT_OR_SUB with 0xfffd as the error value.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * A genuine U+FFFD in the text cannot be told apart from the U+FFFD
 * produced for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * \xrefitem stable "Stable" "Stable List" ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) \
    U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
341 
/**
 * Shared implementation of U8_NEXT and U8_NEXT_OR_FFFD.
 * Reads the byte at offset i (post-incrementing i). An ASCII byte is
 * returned as is. Otherwise the following trail bytes are fetched, validated
 * and assembled one at a time; the offset is only advanced past bytes that
 * were accepted as part of the sequence, and it is never read at i==length.
 * If the sequence is ill-formed or cut off by the string limit,
 * c is set to sub.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length (negative for a NUL-terminated string)
 * @param c output UChar32 variable
 * @param sub value assigned to c for an ill-formed sequence
 * \xrefitem internal "Internal"  "Internal List"  Do not use. This API is for internal use only.
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t _u8_next_trail=0; \
        bool _u8_next_ok=false; \
        if((i)!=(length)) { \
            if((c)<0xe0) { \
                /* U+0080..U+07FF: only the last trail byte remains */ \
                if((c)>=0xc2) { \
                    (c)&=0x1f; \
                    _u8_next_ok=true; \
                } \
            } else { \
                /* fetch/validate/assemble all but the last trail byte */ \
                if((c)<0xf0) { \
                    /* U+0800..U+FFFF except surrogates */ \
                    (c)&=0xf; \
                    _u8_next_trail=(s)[i]; \
                    if(U8_LEAD3_T1_BITS[(c)]&(1<<(_u8_next_trail>>5))) { \
                        _u8_next_trail&=0x3f; \
                        _u8_next_ok=true; \
                    } \
                } else { \
                    /* U+10000..U+10FFFF */ \
                    (c)-=0xf0; \
                    if((c)<=4) { \
                        _u8_next_trail=(s)[i]; \
                        if(U8_LEAD4_T1_BITS[_u8_next_trail>>4]&(1<<(c))) { \
                            (c)=((c)<<6)|(_u8_next_trail&0x3f); \
                            if(++(i)!=(length)) { \
                                /* a trail byte minus 0x80 is 0..0x3f */ \
                                _u8_next_trail=(s)[i]-0x80; \
                                _u8_next_ok=(_u8_next_trail<=0x3f); \
                            } \
                        } \
                    } \
                } \
                if(_u8_next_ok) { \
                    /* valid second-to-last trail byte */ \
                    (c)=((c)<<6)|_u8_next_trail; \
                    _u8_next_ok=(++(i)!=(length)); \
                } \
            } \
            if(_u8_next_ok) { \
                /* last trail byte */ \
                _u8_next_trail=(s)[i]-0x80; \
                if(_u8_next_trail<=0x3f) { \
                    (c)=((c)<<6)|_u8_next_trail; \
                    ++(i); \
                } else { \
                    _u8_next_ok=false; \
                } \
            } \
        } \
        if(!_u8_next_ok) { \
            (c)=(sub);  /* ill-formed */ \
        } \
    } \
} UPRV_BLOCK_MACRO_END
370 
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t _u8_app_cp=(c); \
    if(_u8_app_cp<=0x7f) { \
        /* one byte: ASCII */ \
        (s)[(i)++]=(uint8_t)_u8_app_cp; \
    } else if(_u8_app_cp<=0x7ff) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_app_cp>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((_u8_app_cp&0x3f)|0x80); \
    } else if(_u8_app_cp<=0xffff) { \
        /* three bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_app_cp>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((_u8_app_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_app_cp&0x3f)|0x80); \
    } else { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)((_u8_app_cp>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((_u8_app_cp>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((_u8_app_cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((_u8_app_cp&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END
403 
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * (An ASCII code point is written without a capacity check; the caller
 * guarantees i<capacity.)
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to true and neither the buffer nor the offset is changed.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append
 * @param isError output UBool set to true if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t _u8_app_cp=(c); \
    if(_u8_app_cp<=0x7f) { \
        (s)[(i)++]=(uint8_t)_u8_app_cp; \
    } else { \
        /* encoded length; 0 marks a surrogate or an out-of-range value */ \
        int _u8_app_len= \
            _u8_app_cp<=0x7ff ? 2 : \
            (_u8_app_cp<=0xd7ff || (0xe000<=_u8_app_cp && _u8_app_cp<=0xffff)) ? 3 : \
            (0xffff<_u8_app_cp && _u8_app_cp<=0x10ffff) ? 4 : 0; \
        if(_u8_app_len==0 || !((i)+(_u8_app_len-1)<(capacity))) { \
            (isError)=true; \
        } else { \
            /* lead byte plus all trail bytes except the last one */ \
            if(_u8_app_len==4) { \
                (s)[(i)++]=(uint8_t)((_u8_app_cp>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)(((_u8_app_cp>>12)&0x3f)|0x80); \
                (s)[(i)++]=(uint8_t)(((_u8_app_cp>>6)&0x3f)|0x80); \
            } else if(_u8_app_len==3) { \
                (s)[(i)++]=(uint8_t)((_u8_app_cp>>12)|0xe0); \
                (s)[(i)++]=(uint8_t)(((_u8_app_cp>>6)&0x3f)|0x80); \
            } else { \
                (s)[(i)++]=(uint8_t)((_u8_app_cp>>6)|0xc0); \
            } \
            /* last trail byte */ \
            (s)[(i)++]=(uint8_t)((_u8_app_cp&0x3f)|0x80); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
441 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 * The step is one byte plus the trail byte count derived from the byte
 * at the current offset (U8_COUNT_TRAIL_BYTES_UNSAFE).
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i])+1; \
} UPRV_BLOCK_MACRO_END
455 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset always moves past the byte it points to; it then moves past
 * each following trail byte only while that byte continues a valid
 * sequence and the offset has not reached the length.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    uint8_t _u8_fwd_lead=(s)[(i)++]; \
    if(U8_IS_LEAD(_u8_fwd_lead) && (i)!=(length)) { \
        uint8_t _u8_fwd_t1=(s)[i]; \
        if(_u8_fwd_lead<0xe0) { \
            /* two-byte sequence: one trail byte */ \
            if(U8_IS_TRAIL(_u8_fwd_t1)) { \
                ++(i); \
            } \
        } else if(_u8_fwd_lead<0xf0) { \
            /* three-byte sequence: restricted first trail byte, then one more */ \
            if(U8_IS_VALID_LEAD3_AND_T1(_u8_fwd_lead, _u8_fwd_t1)) { \
                ++(i); \
                if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                    ++(i); \
                } \
            } \
        } else { \
            /* four-byte sequence: restricted first trail byte, then two more */ \
            if(U8_IS_VALID_LEAD4_AND_T1(_u8_fwd_lead, _u8_fwd_t1)) { \
                ++(i); \
                if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                    ++(i); \
                    if((i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                        ++(i); \
                    } \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
491 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Nothing happens if n is zero or negative.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_fwd_left; \
    for(_u8_fwd_left=(n); _u8_fwd_left>0; --_u8_fwd_left) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
511 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Stops early when the offset reaches the length or, for a negative
 * length, when the byte at the offset is NUL.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_fwd_left; \
    for(_u8_fwd_left=(n); \
            _u8_fwd_left>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); \
            --_u8_fwd_left) { \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
534 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * While the offset points to a UTF-8 trail byte it is moved backward,
 * which ends at the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8; there is no check against
 * the start of the string.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END
551 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte;
 * that search is delegated to utf8_back1SafeBody().
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* lead bytes and single bytes are already boundaries */ \
    if(U8_IS_TRAIL((s)[i])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
574 
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * The s argument is fully parenthesized in the expansion, so pointer
 * expressions such as buf+offset are accepted.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length);
 *               must be a modifiable lvalue, it is decremented in place
 * @see U8_SET_CP_START
 * \xrefitem stable "Stable" "Stable List" ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lead byte without any trail byte is always incomplete */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead plus first trail byte: still missing bytes */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                /* two trail bytes: incomplete only after a 4-byte lead */ \
                uint8_t __b3=(s)[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
624 
625 /* definitions with backward iteration -------------------------------------- */
626 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t _u8_prev_count=1, _u8_prev_shift=6; \
        uint8_t _u8_prev_b; \
\
        /* c is the last trail byte: it supplies the lowest six bits */ \
        (c)&=0x3f; \
        _u8_prev_b=(s)[--(i)]; \
        /* collect further trail bytes, six bits at a time, until a byte >=0xc0 */ \
        while(_u8_prev_b<0xc0) { \
            (c)|=(UChar32)(_u8_prev_b&0x3f)<<_u8_prev_shift; \
            ++_u8_prev_count; \
            _u8_prev_shift+=6; \
            _u8_prev_b=(s)[--(i)]; \
        } \
        /* lead byte: drop its length marker bits and add what is left on top */ \
        U8_MASK_LEAD_BYTE(_u8_prev_b, _u8_prev_count); \
        (c)|=(UChar32)_u8_prev_b<<_u8_prev_shift; \
    } \
} UPRV_BLOCK_MACRO_END
667 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * An ASCII byte is returned directly; any other byte is handed to
 * utf8_prevCharSafeBody() together with the address of the offset.
 * The s argument is fully parenthesized before the cast, so pointer
 * expressions (for example a conditional expression) are cast as a whole.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i; must be a modifiable lvalue
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* last argument -1: per the contract above, errors yield a negative c */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END
694 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * An ASCII byte is returned directly; any other byte is handed to
 * utf8_prevCharSafeBody() together with the address of the offset.
 * The s argument is fully parenthesized before the cast, so pointer
 * expressions (for example a conditional expression) are cast as a whole.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i; must be a modifiable lvalue
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * \xrefitem stable "Stable" "Stable List" ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* last argument -3: per the contract above, errors yield U+FFFD */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END
725 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * The offset is decremented at least once and then for as long as it
 * points to a trail byte; there is no check against the start of the string.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
} UPRV_BLOCK_MACRO_END
740 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset is decremented by one; if it then points to a trail byte,
 * the rest of the backward search is delegated to utf8_back1SafeBody().
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    --(i); \
    if(U8_IS_TRAIL((s)[i])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
758 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Nothing happens if n is zero or negative.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_back_left; \
    for(_u8_back_left=(n); _u8_back_left>0; --_u8_back_left) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
779 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Stops early when the offset reaches start.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_back_left; \
    for(_u8_back_left=(n); _u8_back_left>0 && (i)>(start); --_u8_back_left) { \
        U8_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
801 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Works by stepping back to the start of the preceding code point
 * and then forward over that whole code point.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* back to the lead byte of the code point before the offset ... */ \
    U8_BACK_1_UNSAFE(s, i); \
    /* ... then forward past all of its bytes */ \
    U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
819 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Nothing is done when the offset is at start or, for a non-negative
 * length, at or beyond the length.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * \xrefitem stable "Stable" "Stable List" ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((i)>(start) && ((length)<0 || (i)<(length))) { \
        /* step back over the preceding code point, then forward over all of it */ \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
843 
844 #endif
845 
846 /** @} */ // addtogroup
847