• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2005-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucasemap.h
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005may06
16 *   created by: Markus W. Scherer
17 *
18 *   Case mapping service object and functions using it.
19 */
20 
21 #ifndef __UCASEMAP_H__
22 #define __UCASEMAP_H__
23 
24 #include "unicode/utypes.h"
25 #include "unicode/ustring.h"
26 #include "unicode/localpointer.h"
27 
28 /**
29  * \file
30  * \brief C API: Unicode case mapping functions using a UCaseMap service object.
31  *
32  * The service object takes care of memory allocations, data loading, and setup
33  * for the attributes, as usual.
34  *
35  * Currently, the functionality provided here does not overlap with uchar.h
36  * and ustring.h, except for ucasemap_toTitle().
37  *
38  * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
39  */
40 
41 /**
42  * UCaseMap is an opaque service object for newer ICU case mapping functions.
43  * Older functions did not use a service object.
44  * @stable ICU 3.4
45  */
46 struct UCaseMap;
47 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */
48 
49 /**
50  * Open a UCaseMap service object for a locale and a set of options.
51  * The locale ID and options are preprocessed so that functions using the
52  * service object need not process them in each call.
53  *
54  * @param locale ICU locale ID, used for language-dependent
55  *               upper-/lower-/title-casing according to the Unicode standard.
56  *               Usual semantics: ""=root, NULL=default locale, etc.
57  * @param options Options bit set, used for case folding, string comparisons and titlecasing.
58  *                Same flags as for u_foldCase(), u_strFoldCase(),
59  *                u_strCaseCompare(), etc., plus the U_TITLECASE_... option bits.
60  *                Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
61  * @param pErrorCode Must be a valid pointer to an error code value,
62  *                   which must not indicate a failure before the function call.
63  * @return Pointer to a UCaseMap service object, if successful; close it with ucasemap_close().
64  *
65  * @see U_FOLD_CASE_DEFAULT
66  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
67  * @see U_TITLECASE_NO_LOWERCASE
68  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
69  * @stable ICU 3.4
70  */
71 U_STABLE UCaseMap * U_EXPORT2
72 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
73 
74 /**
75  * Close a UCaseMap service object and release any break iterator it has adopted.
76  * @param csm Object to be closed.
77  * @stable ICU 3.4
78  */
79 U_STABLE void U_EXPORT2
80 ucasemap_close(UCaseMap *csm);
81 
82 #if U_SHOW_CPLUSPLUS_API
83 
84 U_NAMESPACE_BEGIN
85 
86 /**
87  * \class LocalUCaseMapPointer
88  * "Smart pointer" class, closes a UCaseMap via ucasemap_close().
89  * For most methods see the LocalPointerBase base class.
90  *
91  * @see LocalPointerBase
92  * @see LocalPointer
93  * @stable ICU 4.4
94  */
95 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
96 
97 U_NAMESPACE_END
98 
99 #endif
100 
101 /**
102  * Get the locale ID that is used for language-dependent case mappings.
103  * @param csm UCaseMap service object.
104  * @return locale ID
105  * @stable ICU 3.4
106  */
107 U_STABLE const char * U_EXPORT2
108 ucasemap_getLocale(const UCaseMap *csm);
109 
110 /**
111  * Get the options bit set that is used for case folding, string comparisons and titlecasing.
112  * @param csm UCaseMap service object.
113  * @return options bit set
114  * @stable ICU 3.4
115  */
116 U_STABLE uint32_t U_EXPORT2
117 ucasemap_getOptions(const UCaseMap *csm);
118 
119 /**
120  * Set the locale ID that is used for language-dependent case mappings.
121  *
122  * @param csm UCaseMap service object.
123  * @param locale Locale ID, see ucasemap_open().
124  * @param pErrorCode Must be a valid pointer to an error code value,
125  *                   which must not indicate a failure before the function call.
126  *
127  * @see ucasemap_open
128  * @stable ICU 3.4
129  */
130 U_STABLE void U_EXPORT2
131 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
132 
133 /**
134  * Set the options bit set that is used for case folding, string comparisons and titlecasing.
135  *
136  * @param csm UCaseMap service object.
137  * @param options Options bit set, see ucasemap_open().
138  * @param pErrorCode Must be a valid pointer to an error code value,
139  *                   which must not indicate a failure before the function call.
140  *
141  * @see ucasemap_open
142  * @stable ICU 3.4
143  */
144 U_STABLE void U_EXPORT2
145 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
146 
147 /**
148  * Do not lowercase non-initial parts of words when titlecasing.
149  * Option bit for titlecasing APIs that take an options bit set.
150  *
151  * By default, titlecasing will titlecase the first cased character
152  * of a word and lowercase all other characters.
153  * With this option, the other characters will not be modified.
154  *
155  * @see ucasemap_setOptions
156  * @see ucasemap_toTitle
157  * @see ucasemap_utf8ToTitle
158  * @see UnicodeString::toTitle
159  * @stable ICU 3.8
160  */
161 #define U_TITLECASE_NO_LOWERCASE 0x100
162 
163 /**
164  * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
165  * titlecase exactly the characters at breaks from the iterator.
166  * Option bit for titlecasing APIs that take an options bit set.
167  *
168  * By default, titlecasing will take each break iterator index,
169  * adjust it by looking for the next cased character, and titlecase that one.
170  * Other characters are lowercased.
171  *
172  * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
173  *
174  * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
175  * #29, "Text Boundaries." Between each pair of word boundaries, find the first
176  * cased character F. If F exists, map F to default_title(F); then map each
177  * subsequent character C to default_lower(C).
178  *
179  * @see ucasemap_setOptions
180  * @see ucasemap_toTitle
181  * @see ucasemap_utf8ToTitle
182  * @see UnicodeString::toTitle
183  * @see U_TITLECASE_NO_LOWERCASE
184  * @stable ICU 3.8
185  */
186 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
187 
188 #if !UCONFIG_NO_BREAK_ITERATION
189 
190 /**
191  * Get the break iterator that is used for titlecasing.
192  * Do not modify the returned break iterator.
193  * @param csm UCaseMap service object.
194  * @return titlecasing break iterator
195  * @stable ICU 3.8
196  */
197 U_STABLE const UBreakIterator * U_EXPORT2
198 ucasemap_getBreakIterator(const UCaseMap *csm);
199 
200 /**
201  * Set the break iterator that is used for titlecasing.
202  * The UCaseMap service object releases a previously set break iterator
203  * and "adopts" this new one, taking ownership of it.
204  * It will be released in a subsequent call to ucasemap_setBreakIterator()
205  * or ucasemap_close().
206  *
207  * Break iterator operations are not thread-safe. Therefore, titlecasing
208  * functions use non-const UCaseMap objects. It is not possible to titlecase
209  * strings concurrently using the same UCaseMap.
210  *
211  * @param csm UCaseMap service object.
212  * @param iterToAdopt Break iterator to be adopted for titlecasing.
213  * @param pErrorCode Must be a valid pointer to an error code value,
214  *                   which must not indicate a failure before the function call.
215  *
216  * @see ucasemap_toTitle
217  * @see ucasemap_utf8ToTitle
218  * @stable ICU 3.8
219  */
220 U_STABLE void U_EXPORT2
221 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
222 
223 /**
224  * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
225  * except that it takes ucasemap_setOptions() into account and has performance
226  * advantages from being able to use a UCaseMap object for multiple case mapping
227  * operations, saving setup time.
228  *
229  * Casing is locale-dependent and context-sensitive.
230  * Titlecasing uses a break iterator to find the first characters of words
231  * that are to be titlecased. It titlecases those characters and lowercases
232  * all others. (This can be modified with ucasemap_setOptions().)
233  *
234  * Note: This function takes a non-const UCaseMap pointer because it will
235  * open a default break iterator if no break iterator was set yet,
236  * and effectively call ucasemap_setBreakIterator();
237  * also because the break iterator is stateful and will be modified during
238  * the iteration.
239  *
240  * The titlecase break iterator can be provided to customize for arbitrary
241  * styles, using rules and dictionaries beyond the standard iterators.
242  * The standard titlecase iterator for the root locale implements the
243  * algorithm of Unicode TR 21.
244  *
245  * This function uses only the setUText(), first(), next() and close() methods of the
246  * provided break iterator.
247  *
248  * The result may be longer or shorter than the original.
249  * The source string and the destination buffer must not overlap.
250  *
251  * @param csm       UCaseMap service object. This pointer is non-const!
252  *                  See the note above for details.
253  * @param dest      A buffer for the result string. The result will be NUL-terminated if
254  *                  the buffer is large enough.
255  *                  The contents are undefined in case of failure.
256  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
257  *                  dest may be NULL and the function will only return the length of the result
258  *                  without writing any of the result string.
259  * @param src       The original string.
260  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
261  * @param pErrorCode Must be a valid pointer to an error code value,
262  *                  which must not indicate a failure before the function call.
263  * @return The length of the result string, if successful or in case of a buffer overflow;
264  *         in the latter case it will be greater than destCapacity.
265  *
266  * @see u_strToTitle
267  * @stable ICU 3.8
268  */
269 U_STABLE int32_t U_EXPORT2
270 ucasemap_toTitle(UCaseMap *csm,
271                  UChar *dest, int32_t destCapacity,
272                  const UChar *src, int32_t srcLength,
273                  UErrorCode *pErrorCode);
274 
275 #endif
276 
277 /**
278  * Lowercase the characters in a UTF-8 string.
279  * Casing is locale-dependent and context-sensitive.
280  * The result may be longer or shorter than the original.
281  * The source string and the destination buffer must not overlap.
282  *
283  * @param csm       UCaseMap service object.
284  * @param dest      A buffer for the result string. The result will be NUL-terminated if
285  *                  the buffer is large enough.
286  *                  The contents are undefined in case of failure.
287  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
288  *                  dest may be NULL and the function will only return the length of the result
289  *                  without writing any of the result string.
290  * @param src       The original string.
291  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
292  * @param pErrorCode Must be a valid pointer to an error code value,
293  *                  which must not indicate a failure before the function call.
294  * @return The length of the result string, if successful or in case of a buffer overflow;
295  *         in the latter case it will be greater than destCapacity.
296  *
297  * @see u_strToLower
298  * @stable ICU 3.4
299  */
300 U_STABLE int32_t U_EXPORT2
301 ucasemap_utf8ToLower(const UCaseMap *csm,
302                      char *dest, int32_t destCapacity,
303                      const char *src, int32_t srcLength,
304                      UErrorCode *pErrorCode);
305 
306 /**
307  * Uppercase the characters in a UTF-8 string.
308  * Casing is locale-dependent and context-sensitive.
309  * The result may be longer or shorter than the original.
310  * The source string and the destination buffer must not overlap.
311  *
312  * @param csm       UCaseMap service object.
313  * @param dest      A buffer for the result string. The result will be NUL-terminated if
314  *                  the buffer is large enough.
315  *                  The contents are undefined in case of failure.
316  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
317  *                  dest may be NULL and the function will only return the length of the result
318  *                  without writing any of the result string.
319  * @param src       The original string.
320  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
321  * @param pErrorCode Must be a valid pointer to an error code value,
322  *                  which must not indicate a failure before the function call.
323  * @return The length of the result string, if successful or in case of a buffer overflow;
324  *         in the latter case it will be greater than destCapacity.
325  *
326  * @see u_strToUpper
327  * @stable ICU 3.4
328  */
329 U_STABLE int32_t U_EXPORT2
330 ucasemap_utf8ToUpper(const UCaseMap *csm,
331                      char *dest, int32_t destCapacity,
332                      const char *src, int32_t srcLength,
333                      UErrorCode *pErrorCode);
334 
335 #if !UCONFIG_NO_BREAK_ITERATION
336 
337 /**
338  * Titlecase a UTF-8 string.
339  * Casing is locale-dependent and context-sensitive.
340  * Titlecasing uses a break iterator to find the first characters of words
341  * that are to be titlecased. It titlecases those characters and lowercases
342  * all others. (This can be modified with ucasemap_setOptions().)
343  *
344  * Note: This function takes a non-const UCaseMap pointer because it will
345  * open a default break iterator if no break iterator was set yet,
346  * and effectively call ucasemap_setBreakIterator();
347  * also because the break iterator is stateful and will be modified during
348  * the iteration.
349  *
350  * The titlecase break iterator can be provided to customize for arbitrary
351  * styles, using rules and dictionaries beyond the standard iterators.
352  * The standard titlecase iterator for the root locale implements the
353  * algorithm of Unicode TR 21.
354  *
355  * This function uses only the setUText(), first(), next() and close() methods of the
356  * provided break iterator.
357  *
358  * The result may be longer or shorter than the original.
359  * The source string and the destination buffer must not overlap.
360  *
361  * @param csm       UCaseMap service object. This pointer is non-const!
362  *                  See the note above for details.
363  * @param dest      A buffer for the result string. The result will be NUL-terminated if
364  *                  the buffer is large enough.
365  *                  The contents are undefined in case of failure.
366  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
367  *                  dest may be NULL and the function will only return the length of the result
368  *                  without writing any of the result string.
369  * @param src       The original string.
370  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
371  * @param pErrorCode Must be a valid pointer to an error code value,
372  *                  which must not indicate a failure before the function call.
373  * @return The length of the result string, if successful or in case of a buffer overflow;
374  *         in the latter case it will be greater than destCapacity.
375  *
376  * @see u_strToTitle
377  * @see U_TITLECASE_NO_LOWERCASE
378  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
379  * @stable ICU 3.8
380  */
381 U_STABLE int32_t U_EXPORT2
382 ucasemap_utf8ToTitle(UCaseMap *csm,
383                     char *dest, int32_t destCapacity,
384                     const char *src, int32_t srcLength,
385                     UErrorCode *pErrorCode);
386 
387 #endif
388 
389 /**
390  * Case-folds the characters in a UTF-8 string.
391  *
392  * Case-folding is locale-independent and not context-sensitive,
393  * but there is an option for whether to include or exclude mappings for dotted I
394  * and dotless i that are marked with 'T' in CaseFolding.txt.
395  *
396  * The result may be longer or shorter than the original.
397  * The source string and the destination buffer must not overlap.
398  *
399  * @param csm       UCaseMap service object.
400  * @param dest      A buffer for the result string. The result will be NUL-terminated if
401  *                  the buffer is large enough.
402  *                  The contents are undefined in case of failure.
403  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
404  *                  dest may be NULL and the function will only return the length of the result
405  *                  without writing any of the result string.
406  * @param src       The original string.
407  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
408  * @param pErrorCode Must be a valid pointer to an error code value,
409  *                  which must not indicate a failure before the function call.
410  * @return The length of the result string, if successful or in case of a buffer overflow;
411  *         in the latter case it will be greater than destCapacity.
412  *
413  * @see u_strFoldCase
414  * @see ucasemap_setOptions
415  * @see U_FOLD_CASE_DEFAULT
416  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
417  * @stable ICU 3.8
418  */
419 U_STABLE int32_t U_EXPORT2
420 ucasemap_utf8FoldCase(const UCaseMap *csm,
421                       char *dest, int32_t destCapacity,
422                       const char *src, int32_t srcLength,
423                       UErrorCode *pErrorCode);
424 
425 #endif
426