/*
*******************************************************************************
*
*   Copyright (C) 1999-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*/

/**
 * \file
 * \brief C API: 16-bit Unicode handling macros
 *
 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
 * utf16.h is included by utf.h after unicode/umachine.h
 * and some common definitions.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://icu-project.org/userguide/strings.html).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies and all macro statements should be terminated with a semicolon.
 */

34 #ifndef __UTF16_H__
35 #define __UTF16_H__
36 
37 /* utf.h must be included first. */
38 #ifndef __UTF_H__
39 #   include "unicode/utf.h"
40 #endif
41 
/* single-code point definitions -------------------------------------------- */

/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 * The expansion is fully parenthesized so that it composes safely
 * inside larger expressions.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))

/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * All bits above the low 10 are compared, so values outside
 * the 16-bit range are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)

/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * All bits above the low 10 are compared, so values outside
 * the 16-bit range are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)

/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * Forwards to U_IS_SURROGATE.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a lead surrogate?
 * Only bit 0x400 is tested, so the result is meaningless
 * unless c is already known to be a surrogate.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)

/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a trail surrogate?
 * Only bit 0x400 is tested, so the result is meaningless
 * unless c is already known to be a surrogate.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)

/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * Equals (0xd800<<10)+0xdc00-0x10000, i.e. the amount by which
 * (lead<<10)+trail exceeds the encoded supplementary code point.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)


/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The expansion is wrapped in parentheses so that the cast cannot be
 * mis-parsed when the macro follows an operator such as sizeof.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The expansion is wrapped in parentheses so that the cast cannot be
 * mis-parsed when the macro follows an operator such as sizeof.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))

/**
 * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * The comparison is done on the value converted to uint32_t.
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)

/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* lead at i: the trail is the next unit */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            /* trail at i: the lead is the previous unit */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
}

/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* combine only if a trail follows inside [start, length) */ \
            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            /* combine only if a lead precedes inside [start, length) */ \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
}

/* definitions with forward iteration --------------------------------------- */

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        /* the next unit is consumed without being checked */ \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}

/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        /* consume the next unit only if it exists and is a trail */ \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}

/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        /* lead surrogate, then trail surrogate */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}

/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to TRUE.
 * Space for a single code unit is not checked; the caller must ensure i<capacity.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 * A lead surrogate is assumed to be followed by its trail surrogate,
 * which is skipped without being read.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * A second unit is skipped only if it lies before length and is a trail
 * surrogate following a lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 * n is evaluated once; nothing happens if n<=0.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * n is evaluated once; iteration stops early when the offset reaches length.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[(i)])) { \
        --(i); \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * The unit before the offset is read only when i>start.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        /* the preceding unit is consumed without being checked */ \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        /* step back again only if a lead precedes at or after start */ \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * A trail surrogate is assumed to be preceded by its lead surrogate,
 * which is skipped without being read.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * A second unit is skipped only if it lies at or after start and is a lead
 * surrogate preceding a trail surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * n is evaluated once; nothing happens if n<=0.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * n is evaluated once; iteration stops early when the offset reaches start.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * The unit at offset i-1 is always read.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * The string is read only when start<i<length.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<=length
 * @param length string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
}

614 #endif
615