• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  utf16.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999sep09
14 *   created by: Markus W. Scherer
15 */
16 
17 /**
18  * \file
19  * \brief C API: 16-bit Unicode handling macros
20  *
21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
22  *
23  * For more information see utf.h and the ICU User Guide Strings chapter
24  * (http://userguide.icu-project.org/strings).
25  *
26  * <em>Usage:</em>
27  * ICU coding guidelines for if() statements should be followed when using these macros.
28  * Compound statements (curly braces {}) must be used  for if-else-while...
29  * bodies and all macro statements should be terminated with semicolon.
30  */
31 
32 #ifndef __UTF16_H__
33 #define __UTF16_H__
34 
35 #include "unicode/umachine.h"
36 #ifndef __UTF_H__
37 #   include "unicode/utf.h"
38 #endif
39 
40 /* single-code point definitions -------------------------------------------- */
41 
/**
 * Tests whether a single 16-bit code unit is a complete code point by itself,
 * i.e. it is a BMP value that is not a surrogate.
 * Evaluates to the logical negation of U_IS_SURROGATE(c).
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) \
    (!U_IS_SURROGATE(c))
49 
/**
 * Tests whether a code unit is a lead (high) surrogate, U+d800..U+dbff.
 * The mask clears the low 10 bits; what remains must equal 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) \
    (0xd800==((c)&0xfffffc00))
57 
/**
 * Tests whether a code unit is a trail (low) surrogate, U+dc00..U+dfff.
 * The mask clears the low 10 bits; what remains must equal 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) \
    (0xdc00==((c)&0xfffffc00))
65 
/**
 * Tests whether a code unit is any surrogate, lead or trail (U+d800..U+dfff).
 * This is a UTF-16-named alias that forwards to U_IS_SURROGATE(c).
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) \
    U_IS_SURROGATE(c)
73 
/**
 * Given that c is already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tests whether it is a lead surrogate.
 * Only bit 0x400 is inspected: it is clear for lead surrogates.
 * The result is meaningless for values that are not surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))
82 
/**
 * Given that c is already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tests whether it is a trail surrogate.
 * Only bit 0x400 is inspected: it is set for trail surrogates.
 * The result is meaningless for values that are not surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))
91 
/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * It is the combined bias of the first lead surrogate (0xd800, shifted left
 * by 10) and the first trail surrogate (0xdc00), reduced by 0x10000, so that
 * subtracting it from (lead<<10)+trail yields the supplementary code point.
 * @internal
 */
#define U16_SURROGATE_OFFSET \
    ((0xd800<<10)-0x10000+0xdc00)
97 
/**
 * Combines a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) that the pair encodes.
 * No validation is performed: the result is undefined if the arguments are
 * not a lead surrogate and a trail surrogate, respectively.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((UChar32)(trail)+((UChar32)(lead)<<10)-U16_SURROGATE_OFFSET)
111 
112 
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * 0xd7c0 is 0xd800-(0x10000>>10), so the shift and add map
 * U+10000 to 0xd800 without a separate subtraction.
 * The whole expansion is parenthesized so the cast cannot be split off by
 * an adjacent operator (for example sizeof) at the point of use.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
121 
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The low 10 bits of the code point are combined with 0xdc00.
 * The whole expansion is parenthesized so the cast cannot be split off by
 * an adjacent operator (for example sizeof) at the point of use.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
130 
/**
 * Number of 16-bit code units needed to encode a code point in UTF-16:
 * 1 for values up to 0xffff, 2 for anything larger.
 * The argument is compared as uint32_t, so negative values yield 2.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)
139 
/**
 * Upper bound on the number of 16-bit code units that a single Unicode
 * code point (U+0000..U+10ffff) occupies in UTF-16.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2
146 
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well (at i+1 or i-1, without any
 * bounds check).
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) do { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* i is on the lead unit: the trail unit follows */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            /* i is on the trail unit: the lead unit precedes */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
} while(0)
174 
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) do { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* only look at i+1 if it is still inside the string */ \
            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            /* only look at i-1 if it is not before the start */ \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
} while(0)
210 
211 /* definitions with forward iteration --------------------------------------- */
212 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well (without any bounds check).
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        /* consume the trail unit that is assumed to follow */ \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} while(0)
238 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        /* peek at the next unit; consume it only if it completes the pair */ \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
} while(0)
269 
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * Values up to 0xffff are stored as a single unit; larger values are stored
 * as a lead/trail surrogate pair.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) do { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        /* same arithmetic as U16_LEAD / U16_TRAIL */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
} while(0)
291 
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to TRUE.
 *
 * The capacity is checked only on the surrogate-pair path; a single unit is
 * stored without a check, which is why i<capacity is a precondition.
 * On error nothing is written and the offset is not advanced.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) do { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
} while(0)
319 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset always moves past one unit; if that unit was a lead surrogate
 * it moves past one more, without inspecting the second unit.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) do { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
} while(0)
335 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset always moves past one unit. It moves past a second unit only
 * if the first was a lead surrogate, the string has not ended, and the next
 * unit is a trail surrogate.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) do { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
} while(0)
352 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * n is evaluated once into a local counter; nothing happens if n<=0.
 * There is no end-of-string check, so the caller must ensure that at least
 * n code points follow the offset.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) do { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
} while(0)
372 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * n is evaluated once into a local counter; nothing happens if n<=0.
 * Iteration stops early once the offset reaches length.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) do { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
} while(0)
393 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit at the offset is examined; any trail surrogate is taken to
 * be the second half of a pair.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) do { \
    if(U16_IS_TRAIL((s)[(i)])) { \
        --(i); \
    } \
} while(0)
412 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The unit at the offset is read first, so (s)[i] must be readable.
 * The preceding unit is read only when i>start.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) do { \
    if(U16_IS_TRAIL((s)[(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} while(0)
432 
433 /* definitions with backward iteration -------------------------------------- */
434 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well (without any bounds check).
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) do { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        /* step back once more onto the lead unit assumed to precede */ \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
} while(0)
461 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) do { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        /* peek at the previous unit; consume it only if it is the lead */ \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
} while(0)
493 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset always moves back one unit; if that unit is a trail surrogate
 * it moves back one more, without inspecting the earlier unit.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) do { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
} while(0)
510 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset always moves back one unit. It moves back a second unit only
 * if the first was a trail surrogate, the start has not been reached, and
 * the preceding unit is a lead surrogate.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) do { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} while(0)
528 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * n is evaluated once into a local counter; nothing happens if n<=0.
 * There is no start-of-string check, so the caller must ensure that at
 * least n code points precede the offset.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) do { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} while(0)
549 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * n is evaluated once into a local counter; nothing happens if n<=0.
 * Iteration stops early once the offset reaches start.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) do { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
} while(0)
571 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * Only the unit before the offset, (s)[i-1], is examined, so the offset
 * must be greater than the start of the string.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) do { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
} while(0)
590 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * Both bounds are tested before any unit is read: (s)[i-1] is read only
 * when start<i, and (s)[i] only when i<length.
 *
 * The body is wrapped in do { } while(0) so that the macro call plus its
 * terminating semicolon forms exactly one statement, including when it is
 * the unbraced body of an if that has an else.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<=length
 * @param length string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) do { \
    if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
} while(0)
611 
612 #endif
613