• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1998-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File ustring.h
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   12/07/98    bertrand    Creation.
15 ******************************************************************************
16 */
17 
18 #ifndef USTRING_H
19 #define USTRING_H
20 
21 #include "unicode/utypes.h"
22 
23 /**
24  * \def UBRK_TYPEDEF_UBREAK_ITERATOR
25  * @internal
26  */
27 
28 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
29 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
30 /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
31     typedef struct UBreakIterator UBreakIterator;
32 #endif
33 
34 /**
35  * \file
36  * \brief C API: Unicode string handling functions
37  *
38  * These C API functions provide general Unicode string handling.
39  *
40  * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
41  * functions. (For example, they do not check for bad arguments like NULL string pointers.)
42  * In some cases, only the thread-safe variant of such a function is implemented here
43  * (see u_strtok_r()).
44  *
45  * Other functions provide more Unicode-specific functionality like locale-specific
46  * upper/lower-casing and string comparison in code point order.
47  *
48  * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
49  * UTF-16 encodes each Unicode code point with either one or two UChar code units.
50  * (This is the default form of Unicode, and a forward-compatible extension of the original,
51  * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
52  * in 1996.)
53  *
54  * Some APIs accept a 32-bit UChar32 value for a single code point.
55  *
56  * ICU also handles 16-bit Unicode text with unpaired surrogates.
57  * Such text is not well-formed UTF-16.
58  * Code-point-related functions treat unpaired surrogates as surrogate code points,
59  * i.e., as separate units.
60  *
61  * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
62  * it is much more efficient even for random access because the code unit values
63  * for single-unit characters vs. lead units vs. trail units are completely disjoint.
64  * This means that it is easy to determine character (code point) boundaries from
65  * random offsets in the string.
66  *
67  * Unicode (UTF-16) string processing is optimized for the single-unit case.
68  * Although it is important to support supplementary characters
69  * (which use pairs of lead/trail code units called "surrogates"),
70  * their occurrence is rare. Almost all characters in modern use require only
71  * a single UChar code unit (i.e., their code point values are <=0xffff).
72  *
73  * For more details see the User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings/).
74  * For a discussion of the handling of unpaired surrogates see also
75  * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
76  */
77 
78 /**
79  * \defgroup ustring_ustrlen String Length
80  * \ingroup ustring_strlen
81  */
82 /*@{*/
83 /**
84  * Determine the length of an array of UChar.
85  *
86  * @param s The array of UChars, NULL (U+0000) terminated.
87  * @return The number of UChars in <code>s</code>, minus the terminator.
88  * @stable ICU 2.0
89  */
90 U_CAPI int32_t U_EXPORT2
91 u_strlen(const UChar *s);
92 /*@}*/
93 
94 /**
95  * Count Unicode code points in the length UChar code units of the string.
96  * A code point may occupy either one or two UChar code units.
97  * Counting code points involves reading all code units.
98  *
99  * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
100  *
101  * @param s The input string.
102  * @param length The number of UChar code units to be checked, or -1 to count all
103  *               code points before the first NUL (U+0000).
104  * @return The number of code points in the specified code units.
105  * @stable ICU 2.0
106  */
107 U_CAPI int32_t U_EXPORT2
108 u_countChar32(const UChar *s, int32_t length);
109 
110 /**
111  * Check if the string contains more Unicode code points than a certain number.
112  * This is more efficient than counting all code points in the entire string
113  * and comparing that number with a threshold.
114  * This function may not need to scan the string at all if the length is known
115  * (not -1 for NUL-termination) and falls within a certain range, and
116  * never needs to count more than 'number+1' code points.
117  * Logically equivalent to (u_countChar32(s, length)>number).
118  * A Unicode code point may occupy either one or two UChar code units.
119  *
120  * @param s The input string.
121  * @param length The length of the string, or -1 if it is NUL-terminated.
122  * @param number The number of code points in the string is compared against
123  *               the 'number' parameter.
124  * @return Boolean value for whether the string contains more Unicode code points
125  *         than 'number'. Same as (u_countChar32(s, length)>number).
126  * @stable ICU 2.4
127  */
128 U_CAPI UBool U_EXPORT2
129 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
130 
131 /**
132  * Concatenate two ustrings.  Appends a copy of <code>src</code>,
133  * including the null terminator, to <code>dst</code>. The initial copied
134  * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
135  *
136  * @param dst The destination string.
137  * @param src The source string.
138  * @return A pointer to <code>dst</code>.
139  * @stable ICU 2.0
140  */
141 U_CAPI UChar* U_EXPORT2
142 u_strcat(UChar     *dst,
143     const UChar     *src);
144 
145 /**
146  * Concatenate two ustrings.
147  * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
148  * Adds a terminating NUL.
149  * If src is too long, then only <code>n-1</code> characters will be copied
150  * before the terminating NUL.
151  * If <code>n&lt;=0</code> then dst is not modified.
152  *
153  * @param dst The destination string.
154  * @param src The source string (can be NULL/invalid if n<=0).
155  * @param n The maximum number of characters to append; no-op if <=0.
156  * @return A pointer to <code>dst</code>.
157  * @stable ICU 2.0
158  */
159 U_CAPI UChar* U_EXPORT2
160 u_strncat(UChar     *dst,
161      const UChar     *src,
162      int32_t     n);
163 
164 /**
165  * Find the first occurrence of a substring in a string.
166  * The substring is found at code point boundaries.
167  * That means that if the substring begins with
168  * a trail surrogate or ends with a lead surrogate,
169  * then it is found only if these surrogates stand alone in the text.
170  * Otherwise, the substring edge units would be matched against
171  * halves of surrogate pairs.
172  *
173  * @param s The string to search (NUL-terminated).
174  * @param substring The substring to find (NUL-terminated).
175  * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
176  *         or <code>s</code> itself if the <code>substring</code> is empty,
177  *         or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
178  * @stable ICU 2.0
179  *
180  * @see u_strrstr
181  * @see u_strFindFirst
182  * @see u_strFindLast
183  */
184 U_CAPI UChar * U_EXPORT2
185 u_strstr(const UChar *s, const UChar *substring);
186 
187 /**
188  * Find the first occurrence of a substring in a string.
189  * The substring is found at code point boundaries.
190  * That means that if the substring begins with
191  * a trail surrogate or ends with a lead surrogate,
192  * then it is found only if these surrogates stand alone in the text.
193  * Otherwise, the substring edge units would be matched against
194  * halves of surrogate pairs.
195  *
196  * @param s The string to search.
197  * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
198  * @param substring The substring to find (NUL-terminated if subLength is -1).
199  * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
200  * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
201  *         or <code>s</code> itself if the <code>substring</code> is empty,
202  *         or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
203  * @stable ICU 2.4
204  *
205  * @see u_strstr
206  * @see u_strFindLast
207  */
208 U_CAPI UChar * U_EXPORT2
209 u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
210 
211 /**
212  * Find the first occurrence of a BMP code point in a string.
213  * A surrogate code point is found only if its match in the text is not
214  * part of a surrogate pair.
215  * A NUL character is found at the string terminator.
216  *
217  * @param s The string to search (NUL-terminated).
218  * @param c The BMP code point to find.
219  * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
220  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
221  * @stable ICU 2.0
222  *
223  * @see u_strchr32
224  * @see u_memchr
225  * @see u_strstr
226  * @see u_strFindFirst
227  */
228 U_CAPI UChar * U_EXPORT2
229 u_strchr(const UChar *s, UChar c);
230 
231 /**
232  * Find the first occurrence of a code point in a string.
233  * A surrogate code point is found only if its match in the text is not
234  * part of a surrogate pair.
235  * A NUL character is found at the string terminator.
236  *
237  * @param s The string to search (NUL-terminated).
238  * @param c The code point to find.
239  * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
240  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
241  * @stable ICU 2.0
242  *
243  * @see u_strchr
244  * @see u_memchr32
245  * @see u_strstr
246  * @see u_strFindFirst
247  */
248 U_CAPI UChar * U_EXPORT2
249 u_strchr32(const UChar *s, UChar32 c);
250 
251 /**
252  * Find the last occurrence of a substring in a string.
253  * The substring is found at code point boundaries.
254  * That means that if the substring begins with
255  * a trail surrogate or ends with a lead surrogate,
256  * then it is found only if these surrogates stand alone in the text.
257  * Otherwise, the substring edge units would be matched against
258  * halves of surrogate pairs.
259  *
260  * @param s The string to search (NUL-terminated).
261  * @param substring The substring to find (NUL-terminated).
262  * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
263  *         or <code>s</code> itself if the <code>substring</code> is empty,
264  *         or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
265  * @stable ICU 2.4
266  *
267  * @see u_strstr
268  * @see u_strFindFirst
269  * @see u_strFindLast
270  */
271 U_CAPI UChar * U_EXPORT2
272 u_strrstr(const UChar *s, const UChar *substring);
273 
274 /**
275  * Find the last occurrence of a substring in a string.
276  * The substring is found at code point boundaries.
277  * That means that if the substring begins with
278  * a trail surrogate or ends with a lead surrogate,
279  * then it is found only if these surrogates stand alone in the text.
280  * Otherwise, the substring edge units would be matched against
281  * halves of surrogate pairs.
282  *
283  * @param s The string to search.
284  * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
285  * @param substring The substring to find (NUL-terminated if subLength is -1).
286  * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
287  * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
288  *         or <code>s</code> itself if the <code>substring</code> is empty,
289  *         or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
290  * @stable ICU 2.4
291  *
292  * @see u_strstr
293  * @see u_strFindFirst
294  */
295 U_CAPI UChar * U_EXPORT2
296 u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
297 
298 /**
299  * Find the last occurrence of a BMP code point in a string.
300  * A surrogate code point is found only if its match in the text is not
301  * part of a surrogate pair.
302  * A NUL character is found at the string terminator.
303  *
304  * @param s The string to search (NUL-terminated).
305  * @param c The BMP code point to find.
306  * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
307  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
308  * @stable ICU 2.4
309  *
310  * @see u_strrchr32
311  * @see u_memrchr
312  * @see u_strrstr
313  * @see u_strFindLast
314  */
315 U_CAPI UChar * U_EXPORT2
316 u_strrchr(const UChar *s, UChar c);
317 
318 /**
319  * Find the last occurrence of a code point in a string.
320  * A surrogate code point is found only if its match in the text is not
321  * part of a surrogate pair.
322  * A NUL character is found at the string terminator.
323  *
324  * @param s The string to search (NUL-terminated).
325  * @param c The code point to find.
326  * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
327  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
328  * @stable ICU 2.4
329  *
330  * @see u_strrchr
331  * @see u_memrchr32
332  * @see u_strrstr
333  * @see u_strFindLast
334  */
335 U_CAPI UChar * U_EXPORT2
336 u_strrchr32(const UChar *s, UChar32 c);
337 
338 /**
339  * Locates the first occurrence in the string <code>string</code> of any of the characters
340  * in the string <code>matchSet</code>.
341  * Works just like C's strpbrk but with Unicode.
342  *
343  * @param string The string in which to search, NUL-terminated.
344  * @param matchSet A NUL-terminated string defining a set of code points
345  *                 for which to search in the text string.
346  * @return A pointer to the  character in <code>string</code> that matches one of the
347  *         characters in <code>matchSet</code>, or NULL if no such character is found.
348  * @stable ICU 2.0
349  */
350 U_CAPI UChar * U_EXPORT2
351 u_strpbrk(const UChar *string, const UChar *matchSet);
352 
353 /**
354  * Returns the number of consecutive characters in <code>string</code>,
355  * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
356  * Works just like C's strcspn but with Unicode.
357  *
358  * @param string The string in which to search, NUL-terminated.
359  * @param matchSet A NUL-terminated string defining a set of code points
360  *                 for which to search in the text string.
361  * @return The number of initial characters in <code>string</code> that do not
362  *         occur in <code>matchSet</code>.
363  * @see u_strspn
364  * @stable ICU 2.0
365  */
366 U_CAPI int32_t U_EXPORT2
367 u_strcspn(const UChar *string, const UChar *matchSet);
368 
369 /**
370  * Returns the number of consecutive characters in <code>string</code>,
371  * beginning with the first, that occur somewhere in <code>matchSet</code>.
372  * Works just like C's strspn but with Unicode.
373  *
374  * @param string The string in which to search, NUL-terminated.
375  * @param matchSet A NUL-terminated string defining a set of code points
376  *                 for which to search in the text string.
377  * @return The number of initial characters in <code>string</code> that do
378  *         occur in <code>matchSet</code>.
379  * @see u_strcspn
380  * @stable ICU 2.0
381  */
382 U_CAPI int32_t U_EXPORT2
383 u_strspn(const UChar *string, const UChar *matchSet);
384 
385 /**
386  * The string tokenizer API allows an application to break a string into
387  * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
388  * original string) is maintained in the caller's saveState. In the first call, the
389  * argument src is a pointer to the string. In subsequent calls to
390  * return successive tokens of that string, src must be specified as
391  * NULL. The value saveState is set by this function to maintain the
392  * function's position within the string, and on each subsequent call
393  * you must give this argument the same variable. This function does
394  * handle surrogate pairs. This function is similar to strtok_r() from
395  * the POSIX Threads Extension (1003.1c-1995).
396  *
397  * @param src String containing token(s). This string will be modified.
398  *            After the first call to u_strtok_r(), this argument must
399  *            be NULL to get to the next token.
400  * @param delim Set of delimiter characters (Unicode code points).
401  * @param saveState The current pointer within the original string,
402  *              which is set by this function. The saveState
403  *              parameter should be the address of a local variable of type
404  *              UChar *. (i.e. define "UChar *myLocalSaveState" and use
405  *              &myLocalSaveState for this parameter).
406  * @return A pointer to the next token found in src, or NULL
407  *         when there are no more tokens.
408  * @stable ICU 2.0
409  */
410 U_CAPI UChar * U_EXPORT2
411 u_strtok_r(UChar    *src,
412      const UChar    *delim,
413            UChar   **saveState);
414 
415 /**
416  * Compare two Unicode strings for bitwise equality (code unit order).
417  *
418  * @param s1 A string to compare.
419  * @param s2 A string to compare.
420  * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
421  * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
422  * value if <code>s1</code> is bitwise greater than <code>s2</code>.
423  * @stable ICU 2.0
424  */
425 U_CAPI int32_t  U_EXPORT2
426 u_strcmp(const UChar     *s1,
427          const UChar     *s2);
428 
429 /**
430  * Compare two Unicode strings in code point order.
431  * See u_strCompare for details.
432  *
433  * @param s1 A string to compare.
434  * @param s2 A string to compare.
435  * @return a negative/zero/positive integer corresponding to whether
436  * the first string is less than/equal to/greater than the second one
437  * in code point order
438  * @stable ICU 2.0
439  */
440 U_CAPI int32_t U_EXPORT2
441 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
442 
443 /**
444  * Compare two Unicode strings (binary order).
445  *
446  * The comparison can be done in code unit order or in code point order.
447  * They differ only in UTF-16 when
448  * comparing supplementary code points (U+10000..U+10ffff)
449  * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
450  * In code unit order, high BMP code points sort after supplementary code points
451  * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
452  *
453  * This function works with strings of different explicitly specified lengths
454  * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
455  * NUL-terminated strings are possible with length arguments of -1.
456  *
457  * @param s1 First source string.
458  * @param length1 Length of first source string, or -1 if NUL-terminated.
459  *
460  * @param s2 Second source string.
461  * @param length2 Length of second source string, or -1 if NUL-terminated.
462  *
463  * @param codePointOrder Choose between code unit order (false)
464  *                       and code point order (true).
465  *
466  * @return <0 or 0 or >0 as usual for string comparisons
467  *
468  * @stable ICU 2.2
469  */
470 U_CAPI int32_t U_EXPORT2
471 u_strCompare(const UChar *s1, int32_t length1,
472              const UChar *s2, int32_t length2,
473              UBool codePointOrder);
474 
475 
476 /**
477  * Compare two strings case-insensitively using full case folding.
478  * This is equivalent to
479  *   u_strCompare(u_strFoldCase(s1, options),
480  *                u_strFoldCase(s2, options),
481  *                (options&U_COMPARE_CODE_POINT_ORDER)!=0).
482  *
483  * The comparison can be done in UTF-16 code unit order or in code point order.
484  * They differ only when comparing supplementary code points (U+10000..U+10ffff)
485  * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
486  * In code unit order, high BMP code points sort after supplementary code points
487  * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
488  *
489  * This function works with strings of different explicitly specified lengths
490  * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
491  * NUL-terminated strings are possible with length arguments of -1.
492  *
493  * @param s1 First source string.
494  * @param length1 Length of first source string, or -1 if NUL-terminated.
495  *
496  * @param s2 Second source string.
497  * @param length2 Length of second source string, or -1 if NUL-terminated.
498  *
499  * @param options A bit set of options:
500  *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
501  *     Comparison in code unit order with default case folding.
502  *
503  *   - U_COMPARE_CODE_POINT_ORDER
504  *     Set to choose code point order instead of code unit order
505  *     (see u_strCompare for details).
506  *
507  *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
508  *
509  * @param pErrorCode Must be a valid pointer to an error code value,
510  *                  which must not indicate a failure before the function call.
511  *
512  * @return <0 or 0 or >0 as usual for string comparisons
513  *
514  * @stable ICU 2.2
515  */
516 U_CAPI int32_t U_EXPORT2
517 u_strCaseCompare(const UChar *s1, int32_t length1,
518                  const UChar *s2, int32_t length2,
519                  uint32_t options,
520                  UErrorCode *pErrorCode);
521 
522 /**
523  * Compare two ustrings for bitwise equality.
524  * Compares at most <code>n</code> characters.
525  *
526  * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
527  * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
528  * @param n The maximum number of characters to compare; always returns 0 if n<=0.
529  * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
530  * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
531  * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
532  * @stable ICU 2.0
533  */
534 U_CAPI int32_t U_EXPORT2
535 u_strncmp(const UChar     *ucs1,
536      const UChar     *ucs2,
537      int32_t     n);
538 
539 /**
540  * Compare two Unicode strings in code point order.
541  * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
542  * For details, see u_strCompare().
543  *
544  * @param s1 A string to compare.
545  * @param s2 A string to compare.
546  * @param n The maximum number of characters to compare.
547  * @return a negative/zero/positive integer corresponding to whether
548  * the first string is less than/equal to/greater than the second one
549  * in code point order
550  * @stable ICU 2.0
551  */
552 U_CAPI int32_t U_EXPORT2
553 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
554 
555 /**
556  * Compare two strings case-insensitively using full case folding.
557  * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
558  *
559  * @param s1 A string to compare.
560  * @param s2 A string to compare.
561  * @param options A bit set of options:
562  *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
563  *     Comparison in code unit order with default case folding.
564  *
565  *   - U_COMPARE_CODE_POINT_ORDER
566  *     Set to choose code point order instead of code unit order
567  *     (see u_strCompare for details).
568  *
569  *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
570  *
571  * @return A negative, zero, or positive integer indicating the comparison result.
572  * @stable ICU 2.0
573  */
574 U_CAPI int32_t U_EXPORT2
575 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
576 
577 /**
578  * Compare two strings case-insensitively using full case folding.
579  * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
580  * u_strFoldCase(s2, at most n, options)).
581  *
582  * @param s1 A string to compare.
583  * @param s2 A string to compare.
584  * @param n The maximum number of characters in each string to case-fold and then compare.
585  * @param options A bit set of options:
586  *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
587  *     Comparison in code unit order with default case folding.
588  *
589  *   - U_COMPARE_CODE_POINT_ORDER
590  *     Set to choose code point order instead of code unit order
591  *     (see u_strCompare for details).
592  *
593  *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
594  *
595  * @return A negative, zero, or positive integer indicating the comparison result.
596  * @stable ICU 2.0
597  */
598 U_CAPI int32_t U_EXPORT2
599 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
600 
601 /**
602  * Compare two strings case-insensitively using full case folding.
603  * This is equivalent to u_strcmp(u_strFoldCase(s1, n, options),
604  * u_strFoldCase(s2, n, options)).
605  *
606  * @param s1 A string to compare.
607  * @param s2 A string to compare.
608  * @param length The number of characters in each string to case-fold and then compare.
609  * @param options A bit set of options:
610  *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
611  *     Comparison in code unit order with default case folding.
612  *
613  *   - U_COMPARE_CODE_POINT_ORDER
614  *     Set to choose code point order instead of code unit order
615  *     (see u_strCompare for details).
616  *
617  *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
618  *
619  * @return A negative, zero, or positive integer indicating the comparison result.
620  * @stable ICU 2.0
621  */
622 U_CAPI int32_t U_EXPORT2
623 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
624 
625 /**
626  * Copy a ustring. Adds a null terminator.
627  *
628  * @param dst The destination string.
629  * @param src The source string.
630  * @return A pointer to <code>dst</code>.
631  * @stable ICU 2.0
632  */
633 U_CAPI UChar* U_EXPORT2
634 u_strcpy(UChar     *dst,
635     const UChar     *src);
636 
637 /**
638  * Copy a ustring.
639  * Copies at most <code>n</code> characters.  The result will be null terminated
640  * if the length of <code>src</code> is less than <code>n</code>.
641  *
642  * @param dst The destination string.
643  * @param src The source string (can be NULL/invalid if n<=0).
644  * @param n The maximum number of characters to copy; no-op if <=0.
645  * @return A pointer to <code>dst</code>.
646  * @stable ICU 2.0
647  */
648 U_CAPI UChar* U_EXPORT2
649 u_strncpy(UChar     *dst,
650      const UChar     *src,
651      int32_t     n);
652 
653 
654 /**
655  * Synonym for memcpy(), but with UChars only.
656  * @param dest The destination string
657  * @param src The source string (can be NULL/invalid if count<=0)
658  * @param count The number of characters to copy; no-op if <=0
659  * @return A pointer to <code>dest</code>
660  * @stable ICU 2.0
661  */
662 U_CAPI UChar* U_EXPORT2
663 u_memcpy(UChar *dest, const UChar *src, int32_t count);
664 
665 /**
666  * Synonym for memmove(), but with UChars only.
667  * @param dest The destination string
668  * @param src The source string (can be NULL/invalid if count<=0)
669  * @param count The number of characters to move; no-op if <=0
670  * @return A pointer to <code>dest</code>
671  * @stable ICU 2.0
672  */
673 U_CAPI UChar* U_EXPORT2
674 u_memmove(UChar *dest, const UChar *src, int32_t count);
675 
676 /**
677  * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
678  *
679  * @param dest The destination string.
680  * @param c The character to initialize the string.
681  * @param count The maximum number of characters to set.
682  * @return A pointer to <code>dest</code>.
683  * @stable ICU 2.0
684  */
685 U_CAPI UChar* U_EXPORT2
686 u_memset(UChar *dest, UChar c, int32_t count);
687 
688 /**
689  * Compare the first <code>count</code> UChars of each buffer.
690  *
691  * @param buf1 The first string to compare.
692  * @param buf2 The second string to compare.
693  * @param count The maximum number of UChars to compare.
694  * @return When buf1 < buf2, a negative number is returned.
695  *      When buf1 == buf2, 0 is returned.
696  *      When buf1 > buf2, a positive number is returned.
697  * @stable ICU 2.0
698  */
699 U_CAPI int32_t U_EXPORT2
700 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
701 
702 /**
703  * Compare two Unicode strings in code point order.
704  * This is different in UTF-16 from u_memcmp() if supplementary characters are present.
705  * For details, see u_strCompare().
706  *
707  * @param s1 A string to compare.
708  * @param s2 A string to compare.
709  * @param count The maximum number of characters to compare.
710  * @return a negative/zero/positive integer corresponding to whether
711  * the first string is less than/equal to/greater than the second one
712  * in code point order
713  * @stable ICU 2.0
714  */
715 U_CAPI int32_t U_EXPORT2
716 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
717 
718 /**
719  * Find the first occurrence of a BMP code point in a string.
720  * A surrogate code point is found only if its match in the text is not
721  * part of a surrogate pair.
722  * A NUL character is found at the string terminator.
723  *
724  * @param s The string to search (contains <code>count</code> UChars).
725  * @param c The BMP code point to find.
726  * @param count The length of the string.
727  * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
728  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
729  * @stable ICU 2.0
730  *
731  * @see u_strchr
732  * @see u_memchr32
733  * @see u_strFindFirst
734  */
735 U_CAPI UChar* U_EXPORT2
736 u_memchr(const UChar *s, UChar c, int32_t count);
737 
738 /**
739  * Find the first occurrence of a code point in a string.
740  * A surrogate code point is found only if its match in the text is not
741  * part of a surrogate pair.
742  * A NUL character is found at the string terminator.
743  *
744  * @param s The string to search (contains <code>count</code> UChars).
745  * @param c The code point to find.
746  * @param count The length of the string.
747  * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
748  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
749  * @stable ICU 2.0
750  *
751  * @see u_strchr32
752  * @see u_memchr
753  * @see u_strFindFirst
754  */
755 U_CAPI UChar* U_EXPORT2
756 u_memchr32(const UChar *s, UChar32 c, int32_t count);
757 
758 /**
759  * Find the last occurrence of a BMP code point in a string of known length.
760  * A surrogate code point is found only if its match in the text is not
761  * part of a surrogate pair.
762  * A NUL character is found at the string terminator.
763  *
764  * @param s The string to search (contains <code>count</code> UChars).
765  * @param c The BMP code point to find (a single UChar).
766  * @param count The length of the string, as a number of UChars.
767  * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
768  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
769  * @stable ICU 2.4
770  *
771  * @see u_strrchr
772  * @see u_memrchr32
773  * @see u_strFindLast
774  */
775 U_CAPI UChar* U_EXPORT2
776 u_memrchr(const UChar *s, UChar c, int32_t count);
777 
778 /**
779  * Find the last occurrence of a code point in a string of known length.
780  * A surrogate code point is found only if its match in the text is not
781  * part of a surrogate pair.
782  * A NUL character is found at the string terminator.
783  *
784  * @param s The string to search (contains <code>count</code> UChars).
785  * @param c The code point to find (a UChar32, unlike the BMP-only u_memrchr).
786  * @param count The length of the string, as a number of UChars.
787  * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
788  *         or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
789  * @stable ICU 2.4
790  *
791  * @see u_strrchr32
792  * @see u_memrchr
793  * @see u_strFindLast
794  */
795 U_CAPI UChar* U_EXPORT2
796 u_memrchr32(const UChar *s, UChar32 c, int32_t count);
797 
798 /**
799  * Unicode String literals in C.
800  * We need one macro to declare a variable for the string
801  * and to statically preinitialize it if possible,
802  * and a second macro to dynamically initialize such a string variable if necessary.
803  *
804  * The macros are defined for maximum performance.
805  * They work only for strings that contain "invariant characters", i.e.,
806  * only Latin letters, digits, and some punctuation.
807  * See utypes.h for details.
808  *
809  * A pair of macros for a single string must be used with the same
810  * parameters.
811  * The string parameter must be a C string literal.
812  * The length of the string, not including the terminating
813  * `NUL`, must be specified as a constant.
814  * The U_STRING_DECL macro should be invoked exactly once for one
815  * such string variable before it is used.
816  *
817  * Usage:
818  *
819  *     U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
820  *     U_STRING_DECL(ustringVar2, "jumps 5%", 8);
821  *     static UBool didInit=false;
822  *
823  *     int32_t function() {
824  *         if(!didInit) {
825  *             U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
826  *             U_STRING_INIT(ustringVar2, "jumps 5%", 8);
827  *             didInit=true;
828  *         }
829  *         return u_strcmp(ustringVar1, ustringVar2);
830  *     }
831  *
832  * Note that the macros will NOT consistently work if their string argument is itself a `#define`.
833  * The following will not work on all platforms (the `L ## cs` form pastes the macro name, not its expansion); don't use it.
834  *
835  *     #define GLUCK "Mr. Gluck"
836  *     U_STRING_DECL(var, GLUCK, 9)
837  *     U_STRING_INIT(var, GLUCK, 9)
838  *
839  * Instead, use the string literal "Mr. Gluck" as the argument to both macro
840  * calls.
841  *
842  * Both macros parenthesize `length`, so any integer constant expression may be passed for it.
843  * @stable ICU 2.0
844  */
845 #if defined(U_DECLARE_UTF16)
846 #   define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
847     /**@stable ICU 2.0 */
848 #   define U_STRING_INIT(var, cs, length)
849 #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
850 #   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
851     /**@stable ICU 2.0 */
852 #   define U_STRING_INIT(var, cs, length)
853 #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
854 #   define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs
855     /**@stable ICU 2.0 */
856 #   define U_STRING_INIT(var, cs, length)
857 #else
858 #   define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
859     /**@stable ICU 2.0; runtime fallback: copies length+1 units (including the NUL) into var. */
860 #   define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1)
861 #endif
862 
863 
864 /**
865  * Convert the characters in a string to uppercase.
866  * Casing is locale-dependent and context-sensitive.
867  * The result may be longer or shorter than the original.
868  * The source string and the destination buffer are allowed to overlap.
869  *
870  * @param dest      A buffer for the result string. The result will be zero-terminated if
871  *                  the buffer is large enough.
872  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
873  *                  dest may be NULL and the function will only return the length of the result
874  *                  without writing any of the result string.
875  * @param src       The original string.
876  * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
877  * @param locale    The locale to consider; "" selects the root locale and NULL the default locale.
878  * @param pErrorCode Must be a valid pointer to an error code value,
879  *                  which must not indicate a failure before the function call.
880  * @return The length of the result string. It may be greater than destCapacity; in that case,
881  *         only some of the result was written to the destination buffer.
882  * @stable ICU 2.0
883  */
884 U_CAPI int32_t U_EXPORT2
885 u_strToUpper(UChar *dest, int32_t destCapacity,
886              const UChar *src, int32_t srcLength,
887              const char *locale,
888              UErrorCode *pErrorCode);
889 
890 /**
891  * Convert the characters in a string to lowercase.
892  * Casing is locale-dependent and context-sensitive.
893  * The result may be longer or shorter than the original.
894  * The source string and the destination buffer are allowed to overlap.
895  *
896  * @param dest      A buffer for the result string. The result will be zero-terminated if
897  *                  the buffer is large enough.
898  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
899  *                  dest may be NULL and the function will only return the length of the result
900  *                  without writing any of the result string.
901  * @param src       The original string.
902  * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
903  * @param locale    The locale to consider; "" selects the root locale and NULL the default locale.
904  * @param pErrorCode Must be a valid pointer to an error code value,
905  *                  which must not indicate a failure before the function call.
906  * @return The length of the result string. It may be greater than destCapacity; in that case,
907  *         only some of the result was written to the destination buffer.
908  * @stable ICU 2.0
909  */
910 U_CAPI int32_t U_EXPORT2
911 u_strToLower(UChar *dest, int32_t destCapacity,
912              const UChar *src, int32_t srcLength,
913              const char *locale,
914              UErrorCode *pErrorCode);
915 
916 #if !UCONFIG_NO_BREAK_ITERATION
917 
918 /**
919  * Titlecase a string. Declared only when !UCONFIG_NO_BREAK_ITERATION.
920  * Casing is locale-dependent and context-sensitive.
921  * Titlecasing uses a break iterator to find the first characters of words
922  * that are to be titlecased. It titlecases those characters and lowercases
923  * all others.
924  *
925  * The titlecase break iterator can be provided to customize for arbitrary
926  * styles, using rules and dictionaries beyond the standard iterators.
927  * It may be more efficient to always provide an iterator, which avoids
928  * opening and closing one for each string.
929  * The standard titlecase iterator for the root locale implements the
930  * algorithm of Unicode TR 21.
931  *
932  * This function uses only the setText(), first() and next() methods of the
933  * provided break iterator.
934  *
935  * The result may be longer or shorter than the original.
936  * The source string and the destination buffer are allowed to overlap.
937  *
938  * @param dest      A buffer for the result string. The result will be zero-terminated if
939  *                  the buffer is large enough.
940  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
941  *                  dest may be NULL and the function will only return the length of the result
942  *                  without writing any of the result string.
943  * @param src       The original string.
944  * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
945  * @param titleIter A break iterator to find the first characters of words
946  *                  that are to be titlecased.
947  *                  If none is provided (NULL), then a standard titlecase
948  *                  break iterator is opened.
949  * @param locale    The locale to consider; "" selects the root locale and NULL the default locale.
950  * @param pErrorCode Must be a valid pointer to an error code value,
951  *                  which must not indicate a failure before the function call.
952  * @return The length of the result string. It may be greater than destCapacity; in that case,
953  *         only some of the result was written to the destination buffer.
954  * @stable ICU 2.1
955  */
956 U_CAPI int32_t U_EXPORT2
957 u_strToTitle(UChar *dest, int32_t destCapacity,
958              const UChar *src, int32_t srcLength,
959              UBreakIterator *titleIter,
960              const char *locale,
961              UErrorCode *pErrorCode);
962 
963 #endif
964 
965 /**
966  * Case-fold the characters in a string.
967  *
968  * Case-folding is locale-independent and not context-sensitive,
969  * but there is an option for whether to include or exclude mappings for dotted I
970  * and dotless i that are marked with 'T' in CaseFolding.txt.
971  *
972  * The result may be longer or shorter than the original.
973  * The source string and the destination buffer are allowed to overlap.
974  *
975  * @param dest      A buffer for the result string. The result will be zero-terminated if
976  *                  the buffer is large enough.
977  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
978  *                  dest may be NULL and the function will only return the length of the result
979  *                  without writing any of the result string.
980  * @param src       The original string.
981  * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
982  * @param options   Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I.
983  * @param pErrorCode Must be a valid pointer to an error code value,
984  *                  which must not indicate a failure before the function call.
985  * @return The length of the result string. It may be greater than destCapacity; in that case,
986  *         only some of the result was written to the destination buffer.
987  * @stable ICU 2.0
988  */
989 U_CAPI int32_t U_EXPORT2
990 u_strFoldCase(UChar *dest, int32_t destCapacity,
991               const UChar *src, int32_t srcLength,
992               uint32_t options,
993               UErrorCode *pErrorCode);
994 
995 
996 /**
997  * Convert a UTF-16 string to UTF-8.
998  * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
999  *
1000  * @param dest          A buffer for the result string. The result will be zero-terminated if
1001  *                      the buffer is large enough.
1002  * @param destCapacity  The size of the buffer (number of chars). If it is 0, then
1003  *                      dest may be NULL and the function will only return the length of the
1004  *                      result without writing any of the result string (pre-flighting).
1005  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1006  *                      pDestLength!=NULL then *pDestLength is always set to the
1007  *                      number of output units corresponding to the transformation of
1008  *                      all the input units, even in case of a buffer overflow.
1009  * @param src           The original source string.
1010  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1011  * @param pErrorCode    Must be a valid pointer to an error code value,
1012  *                      which must not indicate a failure before the function call.
1013  * @return The pointer to the destination buffer.
1014  * @stable ICU 2.0
1015  * @see u_strToUTF8WithSub
1016  * @see u_strFromUTF8
1017  */
1018 U_CAPI char* U_EXPORT2
1019 u_strToUTF8(char *dest,
1020             int32_t destCapacity,
1021             int32_t *pDestLength,
1022             const UChar *src,
1023             int32_t srcLength,
1024             UErrorCode *pErrorCode);
1025 
1026 /**
1027  * Convert a UTF-8 string to UTF-16.
1028  * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1029  *
1030  * @param dest          A buffer for the result string. The result will be zero-terminated if
1031  *                      the buffer is large enough.
1032  * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
1033  *                      dest may be NULL and the function will only return the length of the
1034  *                      result without writing any of the result string (pre-flighting).
1035  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1036  *                      pDestLength!=NULL then *pDestLength is always set to the
1037  *                      number of output units corresponding to the transformation of
1038  *                      all the input units, even in case of a buffer overflow.
1039  * @param src           The original source string.
1040  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1041  * @param pErrorCode    Must be a valid pointer to an error code value,
1042  *                      which must not indicate a failure before the function call.
1043  * @return The pointer to the destination buffer.
1044  * @stable ICU 2.0
1045  * @see u_strFromUTF8WithSub
1046  * @see u_strFromUTF8Lenient
1047  */
1048 U_CAPI UChar* U_EXPORT2
1049 u_strFromUTF8(UChar *dest,
1050               int32_t destCapacity,
1051               int32_t *pDestLength,
1052               const char *src,
1053               int32_t srcLength,
1054               UErrorCode *pErrorCode);
1055 
1056 /**
1057  * Convert a UTF-16 string to UTF-8, optionally substituting for illegal input.
1058  *
1059  * Same as u_strToUTF8() except for the additional subchar which is output for
1060  * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1061  * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
1062  *
1063  * @param dest          A buffer for the result string. The result will be zero-terminated if
1064  *                      the buffer is large enough.
1065  * @param destCapacity  The size of the buffer (number of chars). If it is 0, then
1066  *                      dest may be NULL and the function will only return the length of the
1067  *                      result without writing any of the result string (pre-flighting).
1068  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1069  *                      pDestLength!=NULL then *pDestLength is always set to the
1070  *                      number of output units corresponding to the transformation of
1071  *                      all the input units, even in case of a buffer overflow.
1072  * @param src           The original source string.
1073  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1074  * @param subchar       The substitution character to use in place of an illegal input sequence,
1075  *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1076  *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
1077  *                      except for surrogate code points (U+D800..U+DFFF).
1078  *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1079  * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1080  *                      Set to 0 if no substitutions occur or subchar<0.
1081  *                      pNumSubstitutions can be NULL.
1082  * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
1083  *                      pass the U_SUCCESS() test, or else the function returns
1084  *                      immediately. Check for U_FAILURE() on output or use with
1085  *                      function chaining. (See User Guide for details.)
1086  * @return The pointer to the destination buffer.
1087  * @see u_strToUTF8
1088  * @see u_strFromUTF8WithSub
1089  * @stable ICU 3.6
1090  */
1091 U_CAPI char* U_EXPORT2
1092 u_strToUTF8WithSub(char *dest,
1093             int32_t destCapacity,
1094             int32_t *pDestLength,
1095             const UChar *src,
1096             int32_t srcLength,
1097             UChar32 subchar, int32_t *pNumSubstitutions,
1098             UErrorCode *pErrorCode);
1099 
1100 /**
1101  * Convert a UTF-8 string to UTF-16, optionally substituting for illegal input.
1102  *
1103  * Same as u_strFromUTF8() except for the additional subchar which is output for
1104  * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1105  * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
1106  *
1107  * @param dest          A buffer for the result string. The result will be zero-terminated if
1108  *                      the buffer is large enough.
1109  * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
1110  *                      dest may be NULL and the function will only return the length of the
1111  *                      result without writing any of the result string (pre-flighting).
1112  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1113  *                      pDestLength!=NULL then *pDestLength is always set to the
1114  *                      number of output units corresponding to the transformation of
1115  *                      all the input units, even in case of a buffer overflow.
1116  * @param src           The original source string.
1117  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1118  * @param subchar       The substitution character to use in place of an illegal input sequence,
1119  *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1120  *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
1121  *                      except for surrogate code points (U+D800..U+DFFF).
1122  *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1123  * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1124  *                      Set to 0 if no substitutions occur or subchar<0.
1125  *                      pNumSubstitutions can be NULL.
1126  * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
1127  *                      pass the U_SUCCESS() test, or else the function returns
1128  *                      immediately. Check for U_FAILURE() on output or use with
1129  *                      function chaining. (See User Guide for details.)
1130  * @return The pointer to the destination buffer.
1131  * @see u_strFromUTF8
1132  * @see u_strFromUTF8Lenient
1133  * @see u_strToUTF8WithSub
1134  * @stable ICU 3.6
1135  */
1136 U_CAPI UChar* U_EXPORT2
1137 u_strFromUTF8WithSub(UChar *dest,
1138               int32_t destCapacity,
1139               int32_t *pDestLength,
1140               const char *src,
1141               int32_t srcLength,
1142               UChar32 subchar, int32_t *pNumSubstitutions,
1143               UErrorCode *pErrorCode);
1144 
1145 /**
1146  * Convert a UTF-8 string to UTF-16 without strict validation.
1147  *
1148  * Same as u_strFromUTF8() except that this function is designed to be very fast,
1149  * which it achieves by being lenient about malformed UTF-8 sequences.
1150  * This function is intended for use in environments where UTF-8 text is
1151  * expected to be well-formed.
1152  *
1153  * Its semantics are:
1154  * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
1155  * - The function will not read beyond the input string, nor write beyond
1156  *   the destCapacity.
1157  * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
1158  *   be well-formed UTF-16.
1159  *   The function will resynchronize to valid code point boundaries
1160  *   within a small number of code points after an illegal sequence.
1161  * - Non-shortest forms are not detected and will result in "spoofing" output.
1162  *
1163  * For further performance improvement, if srcLength is given (>=0),
1164  * then destCapacity>=srcLength is required.
1165  *
1166  * There is no inverse u_strToUTF8Lenient() function because there is practically
1167  * no performance gain from not checking that a UTF-16 string is well-formed.
1168  *
1169  * @param dest          A buffer for the result string. The result will be zero-terminated if
1170  *                      the buffer is large enough.
1171  * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
1172  *                      dest may be NULL and the function will only return the length of the
1173  *                      result without writing any of the result string (pre-flighting).
1174  *                      Unlike for other ICU functions, if srcLength>=0 then
1175  *                      destCapacity>=srcLength is required.
1176  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1177  *                      pDestLength!=NULL then *pDestLength is always set to the
1178  *                      number of output units corresponding to the transformation of
1179  *                      all the input units, even in case of a buffer overflow.
1180  *                      Unlike for other ICU functions, if srcLength>=0 but
1181  *                      destCapacity<srcLength, then *pDestLength will be set to srcLength
1182  *                      (and U_BUFFER_OVERFLOW_ERROR will be set)
1183  *                      regardless of the actual result length.
1184  * @param src           The original source string.
1185  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1186  * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
1187  *                      pass the U_SUCCESS() test, or else the function returns
1188  *                      immediately. Check for U_FAILURE() on output or use with
1189  *                      function chaining. (See User Guide for details.)
1190  * @return The pointer to the destination buffer.
1191  * @see u_strFromUTF8
1192  * @see u_strFromUTF8WithSub
1193  * @see u_strToUTF8WithSub
1194  * @stable ICU 3.6
1195  */
1196 U_CAPI UChar * U_EXPORT2
1197 u_strFromUTF8Lenient(UChar *dest,
1198                      int32_t destCapacity,
1199                      int32_t *pDestLength,
1200                      const char *src,
1201                      int32_t srcLength,
1202                      UErrorCode *pErrorCode);
1203 
1204 /**
1205  * Convert a UTF-16 string to UTF-32.
1206  * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1207  *
1208  * @param dest          A buffer for the result string. The result will be zero-terminated if
1209  *                      the buffer is large enough.
1210  * @param destCapacity  The size of the buffer (number of UChar32s). If it is 0, then
1211  *                      dest may be NULL and the function will only return the length of the
1212  *                      result without writing any of the result string (pre-flighting).
1213  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1214  *                      pDestLength!=NULL then *pDestLength is always set to the
1215  *                      number of output units corresponding to the transformation of
1216  *                      all the input units, even in case of a buffer overflow.
1217  * @param src           The original source string.
1218  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1219  * @param pErrorCode    Must be a valid pointer to an error code value,
1220  *                      which must not indicate a failure before the function call.
1221  * @return The pointer to the destination buffer.
1222  * @see u_strToUTF32WithSub
1223  * @see u_strFromUTF32
1224  * @stable ICU 2.0
1225  */
1226 U_CAPI UChar32* U_EXPORT2
1227 u_strToUTF32(UChar32 *dest,
1228              int32_t  destCapacity,
1229              int32_t  *pDestLength,
1230              const UChar *src,
1231              int32_t  srcLength,
1232              UErrorCode *pErrorCode);
1233 
1234 /**
1235  * Convert a UTF-32 string to UTF-16.
1236  * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1237  *
1238  * @param dest          A buffer for the result string. The result will be zero-terminated if
1239  *                      the buffer is large enough.
1240  * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
1241  *                      dest may be NULL and the function will only return the length of the
1242  *                      result without writing any of the result string (pre-flighting).
1243  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1244  *                      pDestLength!=NULL then *pDestLength is always set to the
1245  *                      number of output units corresponding to the transformation of
1246  *                      all the input units, even in case of a buffer overflow.
1247  * @param src           The original source string.
1248  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1249  * @param pErrorCode    Must be a valid pointer to an error code value,
1250  *                      which must not indicate a failure before the function call.
1251  * @return The pointer to the destination buffer.
1252  * @see u_strFromUTF32WithSub
1253  * @see u_strToUTF32
1254  * @stable ICU 2.0
1255  */
1256 U_CAPI UChar* U_EXPORT2
1257 u_strFromUTF32(UChar   *dest,
1258                int32_t destCapacity,
1259                int32_t *pDestLength,
1260                const UChar32 *src,
1261                int32_t srcLength,
1262                UErrorCode *pErrorCode);
1263 
1264 /**
1265  * Convert a UTF-16 string to UTF-32, optionally substituting for illegal input.
1266  *
1267  * Same as u_strToUTF32() except for the additional subchar which is output for
1268  * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1269  * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
1270  *
1271  * @param dest          A buffer for the result string. The result will be zero-terminated if
1272  *                      the buffer is large enough.
1273  * @param destCapacity  The size of the buffer (number of UChar32s). If it is 0, then
1274  *                      dest may be NULL and the function will only return the length of the
1275  *                      result without writing any of the result string (pre-flighting).
1276  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1277  *                      pDestLength!=NULL then *pDestLength is always set to the
1278  *                      number of output units corresponding to the transformation of
1279  *                      all the input units, even in case of a buffer overflow.
1280  * @param src           The original source string.
1281  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1282  * @param subchar       The substitution character to use in place of an illegal input sequence,
1283  *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1284  *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
1285  *                      except for surrogate code points (U+D800..U+DFFF).
1286  *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1287  * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1288  *                      Set to 0 if no substitutions occur or subchar<0.
1289  *                      pNumSubstitutions can be NULL.
1290  * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
1291  *                      pass the U_SUCCESS() test, or else the function returns
1292  *                      immediately. Check for U_FAILURE() on output or use with
1293  *                      function chaining. (See User Guide for details.)
1294  * @return The pointer to the destination buffer.
1295  * @see u_strToUTF32
1296  * @see u_strFromUTF32WithSub
1297  * @stable ICU 4.2
1298  */
1299 U_CAPI UChar32* U_EXPORT2
1300 u_strToUTF32WithSub(UChar32 *dest,
1301              int32_t destCapacity,
1302              int32_t *pDestLength,
1303              const UChar *src,
1304              int32_t srcLength,
1305              UChar32 subchar, int32_t *pNumSubstitutions,
1306              UErrorCode *pErrorCode);
1307 
1308 /**
1309  * Convert a UTF-32 string to UTF-16, optionally substituting for illegal input.
1310  *
1311  * Same as u_strFromUTF32() except for the additional subchar which is output for
1312  * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1313  * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
1314  *
1315  * @param dest          A buffer for the result string. The result will be zero-terminated if
1316  *                      the buffer is large enough.
1317  * @param destCapacity  The size of the buffer (number of UChars). If it is 0, then
1318  *                      dest may be NULL and the function will only return the length of the
1319  *                      result without writing any of the result string (pre-flighting).
1320  * @param pDestLength   A pointer to receive the number of units written to the destination. If
1321  *                      pDestLength!=NULL then *pDestLength is always set to the
1322  *                      number of output units corresponding to the transformation of
1323  *                      all the input units, even in case of a buffer overflow.
1324  * @param src           The original source string.
1325  * @param srcLength     The length of the original string. If -1, then src must be zero-terminated.
1326  * @param subchar       The substitution character to use in place of an illegal input sequence,
1327  *                      or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1328  *                      A substitution character can be any valid Unicode code point (up to U+10FFFF)
1329  *                      except for surrogate code points (U+D800..U+DFFF).
1330  *                      The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1331  * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1332  *                      Set to 0 if no substitutions occur or subchar<0.
1333  *                      pNumSubstitutions can be NULL.
1334  * @param pErrorCode    Pointer to a standard ICU error code. Its input value must
1335  *                      pass the U_SUCCESS() test, or else the function returns
1336  *                      immediately. Check for U_FAILURE() on output or use with
1337  *                      function chaining. (See User Guide for details.)
1338  * @return The pointer to the destination buffer.
1339  * @see u_strFromUTF32
1340  * @see u_strToUTF32WithSub
1341  * @stable ICU 4.2
1342  */
1343 U_CAPI UChar* U_EXPORT2
1344 u_strFromUTF32WithSub(UChar *dest,
1345                int32_t destCapacity,
1346                int32_t *pDestLength,
1347                const UChar32 *src,
1348                int32_t srcLength,
1349                UChar32 subchar, int32_t *pNumSubstitutions,
1350                UErrorCode *pErrorCode);
1351 
1352 
1353 #endif
1354