• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 1997-2005, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   Date        Name        Description
7 *   06/21/00    aliu        Creation.
8 *******************************************************************************
9 */
10 
11 #ifndef UTRANS_H
12 #define UTRANS_H
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_TRANSLITERATION
17 
18 #include "unicode/urep.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/uenum.h"
21 
22 /********************************************************************
23  * General Notes
24  ********************************************************************
25  */
26 /**
27  * \file
28  * \brief C API: Transliterator
29  *
30  * <h2> Transliteration </h2>
31  * The data structures and functions described in this header provide
32  * transliteration services.  Transliteration services are implemented
33  * as C++ classes.  The comments and documentation in this header
34  * assume the reader is familiar with the C++ headers translit.h and
35  * associated documentation.
36  *
37  * A significant but incomplete subset of the C++ transliteration
38  * services is available to C code through this header.  In order to
39  * access more complex transliteration services, refer to the C++
40  * headers and documentation.
41  *
42  * There are two sets of functions for working with transliterator IDs:
43  *
44  * An old, deprecated set uses char * IDs, which works for true and pure
45  * identifiers that these APIs were designed for,
46  * for example "Cyrillic-Latin".
47  * It does not work when the ID contains filters ("[:Script=Cyrl:]")
48  * or even a complete set of rules because then the ID string contains more
49  * than just "invariant" characters (see utypes.h).
50  *
51  * A new set of functions replaces the old ones and uses UChar * IDs,
52  * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.)
53  */
54 
55 /********************************************************************
56  * Data Structures
57  ********************************************************************/
58 
59 /**
60  * An opaque transliterator for use in C.  Obtain one from utrans_openU(),
61  * utrans_openInverse() or utrans_clone(), and release it with utrans_close()
62  * when done.  Equivalent to the C++ class Transliterator and its subclasses.
63  * @see Transliterator
64  * @stable ICU 2.0
65  */
66 typedef void* UTransliterator;
67 
68 /**
69  * Direction constant indicating the direction in a transliterator,
70  * e.g., the forward or reverse rules of a RuleBasedTransliterator.
71  * Specified when a transliterator is opened (the dir argument of
72  * utrans_openU()).  An "A-B" transliterator transliterates A to B in
73  * the forward direction, and B to A in the reverse direction.
74  * @stable ICU 2.0
75  */
76 typedef enum UTransDirection {
77 
78     /**
79      * UTRANS_FORWARD means from &lt;source&gt; to &lt;target&gt; for a
80      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
81      * opened using a rule, it means forward direction rules, e.g.,
82      * "A > B".
83      */
84     UTRANS_FORWARD,
85 
86     /**
87      * UTRANS_REVERSE means from &lt;target&gt; to &lt;source&gt; for a
88      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
89      * opened using a rule, it means reverse direction rules, e.g.,
90      * "A < B".
91      */
92     UTRANS_REVERSE
93 
94 } UTransDirection;
95 
96 /**
97  * Position structure for utrans_transIncremental() incremental
98  * transliteration.  This structure defines two substrings of the text
99  * being transliterated.  The first region, [contextStart,
100  * contextLimit), defines what characters the transliterator will read
101  * as context.  The second region, [start, limit), defines what
102  * characters will actually be transliterated.  The second region
103  * should be a subset of the first.
104  *
105  * <p>After a transliteration operation, some of the indices in this
106  * structure will be modified.  See the field descriptions for
107  * details.
108  *
109  * <p>contextStart <= start <= limit <= contextLimit
110  *
111  * <p>Note: All index values in this structure must be at code point
112  * boundaries.  That is, none of them may occur between two code units
113  * of a surrogate pair.  If any index does split a surrogate pair,
114  * results are unspecified.
115  *
116  * @stable ICU 2.0
117  */
118 typedef struct UTransPosition {
119 
120     /**
121      * Beginning index, inclusive, of the context to be considered for
122      * a transliteration operation.  The transliterator will ignore
123      * anything before this index.  INPUT/OUTPUT parameter: This parameter
124      * is updated by a transliteration operation to reflect the maximum
125      * amount of antecontext needed by a transliterator.
126      * @stable ICU 2.4
127      */
128     int32_t contextStart;
129 
130     /**
131      * Ending index, exclusive, of the context to be considered for a
132      * transliteration operation.  The transliterator will ignore
133      * anything at or after this index.  INPUT/OUTPUT parameter: This
134      * parameter is updated to reflect changes in the length of the
135      * text, but points to the same logical position in the text.
136      * @stable ICU 2.4
137      */
138     int32_t contextLimit;
139 
140     /**
141      * Beginning index, inclusive, of the text to be transliterated.
142      * INPUT/OUTPUT parameter: This parameter is advanced past
143      * characters that have already been transliterated by a
144      * transliteration operation.
145      * @stable ICU 2.4
146      */
147     int32_t start;
148 
149     /**
150      * Ending index, exclusive, of the text to be transliterated.
151      * INPUT/OUTPUT parameter: This parameter is updated to reflect
152      * changes in the length of the text, but points to the same
153      * logical position in the text.
154      * @stable ICU 2.4
155      */
156     int32_t limit;
157 
158 } UTransPosition;
159 
160 /********************************************************************
161  * General API
162  ********************************************************************/
163 
164 /**
165  * Open either a custom transliterator, given a rules string (rules non-NULL),
166  * OR
167  * a system transliterator, given only its ID (rules is NULL).
168  * Any non-NULL result from this function should later be closed with
169  * utrans_close().
170  *
171  * @param id a valid transliterator ID
172  * @param idLength the length of the ID string, or -1 if NUL-terminated
173  * @param dir the desired direction
174  * @param rules the transliterator rules.  See the C++ header rbt.h for
175  *              rules syntax. If NULL then a system transliterator matching
176  *              the ID is returned.
177  * @param rulesLength the length of the rules, or -1 if the rules
178  *                    are NUL-terminated.
179  * @param parseError a pointer to a UParseError struct to receive the details
180  *                   of any parsing errors. This parameter may be NULL if no
181  *                   parsing error details are desired.
182  * @param pErrorCode a pointer to the UErrorCode
183  * @return a transliterator pointer that may be passed to other
184  *         utrans_xxx() functions, or NULL if the open call fails.
185  * @stable ICU 2.8
186  */
187 U_STABLE UTransliterator* U_EXPORT2
188 utrans_openU(const UChar *id,
189              int32_t idLength,
190              UTransDirection dir,
191              const UChar *rules,
192              int32_t rulesLength,
193              UParseError *parseError,
194              UErrorCode *pErrorCode);
195 
196 /**
197  * Open an inverse of an existing transliterator.  For this to work,
198  * the inverse must be registered with the system.  For example, if
199  * the Transliterator "A-B" is opened, and then its inverse is opened,
200  * the result is the Transliterator "B-A", if such a transliterator is
201  * registered with the system.  Otherwise the result is NULL and a
202  * failing UErrorCode is set.  Any non-NULL result from this function
203  * should later be closed with utrans_close().
204  *
205  * @param trans the transliterator to open the inverse of.
206  * @param status a pointer to the UErrorCode; failing if no inverse is registered
207  * @return a pointer to a newly-opened transliterator that is the
208  * inverse of trans, or NULL if the open call fails.
209  * @stable ICU 2.0
210  */
211 U_STABLE UTransliterator* U_EXPORT2
212 utrans_openInverse(const UTransliterator* trans,
213                    UErrorCode* status);
214 
215 /**
216  * Create a copy of a transliterator.  The caller should later close
217  * any non-NULL result from this function with utrans_close().
218  *
219  * @param trans the transliterator to be copied.
220  * @param status a pointer to the UErrorCode
221  * @return a transliterator pointer that may be passed to other
222  * utrans_xxx() functions, or NULL if the clone call fails.
223  * @stable ICU 2.0
224  */
225 U_STABLE UTransliterator* U_EXPORT2
226 utrans_clone(const UTransliterator* trans,
227              UErrorCode* status);
228 
229 /**
230  * Close a transliterator.  Any non-NULL pointer returned by
231  * utrans_openXxx() or utrans_clone() should eventually be closed.
232  * @param trans the transliterator to be closed; not one adopted by utrans_register().
233  * @stable ICU 2.0
234  */
235 U_STABLE void U_EXPORT2
236 utrans_close(UTransliterator* trans);
237 
238 /**
239  * Return the programmatic identifier for this transliterator.
240  * If this identifier is passed to utrans_openU(), it will open
241  * a transliterator equivalent to this one, if the ID has been
242  * registered.
243  *
244  * @param trans the transliterator to return the ID of.
245  * @param resultLength pointer to an output variable receiving the length
246  *        of the ID string; can be NULL
247  * @return the NUL-terminated, read-only ID string. This pointer remains
248  * valid until utrans_close() is called on this transliterator.
249  *
250  * @stable ICU 2.8
251  */
252 U_STABLE const UChar * U_EXPORT2
253 utrans_getUnicodeID(const UTransliterator *trans,
254                     int32_t *resultLength);
255 
256 /**
257  * Register an open transliterator with the system.  When
258  * utrans_openU() is called with an ID string that is equal to that
259  * returned by utrans_getUnicodeID(adoptedTrans,...), then
260  * utrans_clone(adoptedTrans,...) is returned.
261  *
262  * <p>NOTE: After this call the system owns the adoptedTrans and will
263  * close it.  The user must not call utrans_close() on adoptedTrans.
264  *
265  * @param adoptedTrans a transliterator, typically the result of
266  * utrans_openU() with a rules string, to be registered with the system.
267  * @param status a pointer to the UErrorCode
268  * @stable ICU 2.0
269  */
270 U_STABLE void U_EXPORT2
271 utrans_register(UTransliterator* adoptedTrans,
272                 UErrorCode* status);
273 
274 /**
275  * Unregister a transliterator from the system.  After this call the
276  * system will no longer recognize the given ID when passed to
277  * utrans_openU(). If the ID is invalid then nothing is done.
278  *
279  * @param id an ID to unregister
280  * @param idLength the length of id, or -1 if id is NUL-terminated
281  * @stable ICU 2.8
282  */
283 U_STABLE void U_EXPORT2
284 utrans_unregisterID(const UChar* id, int32_t idLength);
285 
286 /**
287  * Set the filter used by a transliterator.  A filter can be used to
288  * make the transliterator pass certain characters through untouched.
289  * The filter is expressed using a UnicodeSet pattern.  If the
290  * filterPattern is NULL or the empty string, then the transliterator
291  * will be reset to use no filter.
292  *
293  * @param trans the transliterator
294  * @param filterPattern a pattern string, in the form accepted by
295  * UnicodeSet, specifying which characters to apply the
296  * transliteration to.  May be NULL or the empty string to indicate no
297  * filter.
298  * @param filterPatternLen the length of filterPattern, or -1 if
299  * filterPattern is NUL-terminated
300  * @param status a pointer to the UErrorCode
301  * @see UnicodeSet
302  * @stable ICU 2.0
303  */
304 U_STABLE void U_EXPORT2
305 utrans_setFilter(UTransliterator* trans,
306                  const UChar* filterPattern,
307                  int32_t filterPatternLen,
308                  UErrorCode* status);
309 
310 /**
311  * Return the number of system transliterators.
312  * It is recommended to use utrans_openIDs() instead.
313  *
314  * @return the number of system transliterators (exclusive upper bound for utrans_getAvailableID() indexes).
315  * @stable ICU 2.0
316  */
317 U_STABLE int32_t U_EXPORT2
318 utrans_countAvailableIDs(void);
319 
320 /**
321  * Return a UEnumeration over the IDs of the available transliterators.
322  *
323  * @param pErrorCode Pointer to the UErrorCode in/out parameter.
324  * @return UEnumeration for the available transliterators.
325  *         Close with uenum_close().
326  *
327  * @stable ICU 2.8
328  */
329 U_STABLE UEnumeration * U_EXPORT2
330 utrans_openIDs(UErrorCode *pErrorCode);
331 
332 /********************************************************************
333  * Transliteration API
334  ********************************************************************/
335 
336 /**
337  * Transliterate a segment of a UReplaceable string.  The string is
338  * passed in as a UReplaceable pointer rep and a UReplaceableCallbacks
339  * function pointer struct repFunc.  Functions in the repFunc struct
340  * will be called in order to modify the rep string.
341  *
342  * @param trans the transliterator
343  * @param rep a pointer to the string.  This will be passed to the
344  * repFunc functions.
345  * @param repFunc a set of function pointers that will be used to
346  * modify the string pointed to by rep.
347  * @param start the beginning index, inclusive; <code>0 <= start <=
348  * limit</code>.
349  * @param limit pointer to the ending index, exclusive; <code>start <=
350  * limit <= repFunc->length(rep)</code>.  Upon return, *limit will
351  * contain the new limit index.  The text previously occupying
352  * <code>[start, limit)</code> has been transliterated, possibly to a
353  * string of a different length, at <code>[start,
354  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
355  * is the value left in *limit (the function itself returns void).
356  * @param status a pointer to the UErrorCode
357  * @stable ICU 2.0
358  */
359 U_STABLE void U_EXPORT2
360 utrans_trans(const UTransliterator* trans,
361              UReplaceable* rep,
362              UReplaceableCallbacks* repFunc,
363              int32_t start,
364              int32_t* limit,
365              UErrorCode* status);
366 
367 /**
368  * Transliterate the portion of the UReplaceable text buffer that can
369  * be transliterated unambiguously.  This method is typically called
370  * after new text has been inserted, e.g. as a result of a keyboard
371  * event.  The transliterator will try to transliterate characters of
372  * <code>rep</code> between <code>pos->start</code> and
373  * <code>pos->limit</code>.  Characters before
374  * <code>pos->start</code> will not be changed.
375  *
376  * <p>Upon return, values in <code>pos</code> will be updated.
377  * <code>pos->contextStart</code> will be updated to the first
378  * character that future calls to this method will read.
379  * <code>pos->start</code> and <code>pos->limit</code> will
380  * be adjusted to delimit the range of text that future calls to
381  * this method may change.
382  *
383  * <p>Typical usage of this method begins with an initial call
384  * with <code>pos->start</code> and <code>pos->limit</code>
385  * set to indicate the portion of <code>rep</code> to be
386  * transliterated, within the context <code>[pos->contextStart,
387  * pos->contextLimit)</code>.  Thereafter, <code>pos</code> can be used
388  * without modification in future calls, provided that all changes to
389  * <code>rep</code> are made via this method.
390  *
391  * <p>This method assumes that future calls may be made that will
392  * insert new text into the buffer.  As a result, it only performs
393  * unambiguous transliterations.  After the last call to this method,
394  * there may be untransliterated text that is waiting for more input
395  * to resolve an ambiguity.  In order to perform these pending
396  * transliterations, clients should call utrans_trans() with a start
397  * of pos->start and a limit of pos->limit after the last call to this
398  * method has been made.
399  *
400  * @param trans the transliterator
401  * @param rep a pointer to the string.  This will be passed to the
402  * repFunc functions.
403  * @param repFunc a set of function pointers that will be used to
404  * modify the string pointed to by rep.
405  * @param pos a struct containing the start and limit indices of the
406  * text to be read and the text to be transliterated
407  * @param status a pointer to the UErrorCode
408  * @stable ICU 2.0
409  */
410 U_STABLE void U_EXPORT2
411 utrans_transIncremental(const UTransliterator* trans,
412                         UReplaceable* rep,
413                         UReplaceableCallbacks* repFunc,
414                         UTransPosition* pos,
415                         UErrorCode* status);
416 
417 /**
418  * Transliterate a segment of a UChar* string.  The string is passed
419  * in as a UChar* buffer.  The string is modified in place.  If the
420  * result is longer than textCapacity, it is truncated.  The actual
421  * length of the result is returned in *textLength, if textLength is
422  * non-NULL. *textLength may be greater than textCapacity, but only
423  * textCapacity UChars will be written to *text, including the NUL
424  * terminator.
425  *
426  * @param trans the transliterator
427  * @param text a pointer to a buffer containing the text to be
428  * transliterated on input and the result text on output.
429  * @param textLength a pointer to the length of the string in text.
430  * If the length is -1 then the string is assumed to be
431  * NUL-terminated.  Upon return, the new length is stored in
432  * *textLength.  If textLength is NULL then the string is assumed to
433  * be NUL-terminated.
434  * @param textCapacity the length of the text buffer, in UChars; at
435  * most this many UChars are written to text.
436  * @param start the beginning index, inclusive; <code>0 <= start <=
437  * limit</code>.
438  * @param limit pointer to the ending index, exclusive; <code>start <=
439  * limit <= *textLength</code>.  Upon return, *limit will
440  * contain the new limit index.  The text previously occupying
441  * <code>[start, limit)</code> has been transliterated, possibly to a
442  * string of a different length, at <code>[start,
443  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
444  * is the value left in *limit (the function itself returns void).
445  * @param status a pointer to the UErrorCode
446  * @stable ICU 2.0
447  */
448 U_STABLE void U_EXPORT2
449 utrans_transUChars(const UTransliterator* trans,
450                    UChar* text,
451                    int32_t* textLength,
452                    int32_t textCapacity,
453                    int32_t start,
454                    int32_t* limit,
455                    UErrorCode* status);
456 
457 /**
458  * Transliterate the portion of the UChar* text buffer that can be
459  * transliterated unambiguously.  See utrans_transIncremental().  The
460  * string is passed in as a UChar* buffer.  The string is modified in
461  * place.  If the result is longer than textCapacity, it is truncated.
462  * The actual length of the result is returned in *textLength, if
463  * textLength is non-NULL. *textLength may be greater than
464  * textCapacity, but only textCapacity UChars will be written to
465  * *text, including the NUL terminator.  See utrans_transIncremental()
466  * for usage details.
467  *
468  * @param trans the transliterator
469  * @param text a pointer to a buffer containing the text to be
470  * transliterated on input and the result text on output.
471  * @param textLength a pointer to the length of the string in text.
472  * If the length is -1 then the string is assumed to be
473  * NUL-terminated.  Upon return, the new length is stored in
474  * *textLength.  If textLength is NULL then the string is assumed to
475  * be NUL-terminated.
476  * @param textCapacity the length of the text buffer
477  * @param pos a struct containing the start and limit indices of the
478  * text to be read and the text to be transliterated
479  * @param status a pointer to the UErrorCode
480  * @see utrans_transIncremental
481  * @stable ICU 2.0
482  */
483 U_STABLE void U_EXPORT2
484 utrans_transIncrementalUChars(const UTransliterator* trans,
485                               UChar* text,
486                               int32_t* textLength,
487                               int32_t textCapacity,
488                               UTransPosition* pos,
489                               UErrorCode* status);
490 
491 /* deprecated API ----------------------------------------------------------- */
492 
493 /* see the General Notes at the top of this header for why these functions are deprecated */
494 
495 /**
496  * Deprecated, use utrans_openU() instead.
497  * Open a custom transliterator, given a custom rules string
498  * OR
499  * a system transliterator, given its ID.
500  * Any non-NULL result from this function should later be closed with
501  * utrans_close().
502  *
503  * @param id a valid ID, as returned by utrans_getAvailableID(); invariant characters only
504  * @param dir the desired direction
505  * @param rules the transliterator rules.  See the C++ header rbt.h
506  * for rules syntax. If NULL then a system transliterator matching
507  * the ID is returned.
508  * @param rulesLength the length of the rules, or -1 if the rules
509  * are NUL-terminated.
510  * @param parseError a pointer to a UParseError struct to receive the
511  * details of any parsing errors. This parameter may be NULL if no
512  * parsing error details are desired.
513  * @param status a pointer to the UErrorCode
514  * @return a transliterator pointer that may be passed to other
515  * utrans_xxx() functions, or NULL if the open call fails.
516  * @deprecated ICU 2.8 Use utrans_openU() instead, see utrans.h
517  */
518 U_DEPRECATED UTransliterator* U_EXPORT2
519 utrans_open(const char* id,
520             UTransDirection dir,
521             const UChar* rules,         /* may be NULL */
522             int32_t rulesLength,        /* -1 if NUL-terminated */
523             UParseError* parseError,    /* may be NULL */
524             UErrorCode* status);
525 
526 /**
527  * Deprecated, use utrans_getUnicodeID() instead.
528  * Return the programmatic identifier for this transliterator.
529  * If this identifier is passed to utrans_open(), it will open
530  * a transliterator equivalent to this one, if the ID has been
531  * registered.
532  * @param trans the transliterator to return the ID of.
533  * @param buf the buffer in which to receive the ID.  This may be
534  * NULL, in which case no characters are copied.
535  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
536  * NULL.
537  * @return the actual length of the ID, not including the
538  * NUL terminator.  This may be greater than bufCapacity.
539  * @deprecated ICU 2.8 Use utrans_getUnicodeID() instead, see utrans.h
540  */
541 U_DEPRECATED int32_t U_EXPORT2
542 utrans_getID(const UTransliterator* trans,
543              char* buf,
544              int32_t bufCapacity);
545 
546 /**
547  * Deprecated, use utrans_unregisterID() instead.
548  * Unregister a transliterator from the system.  After this call the
549  * system will no longer recognize the given ID when passed to
550  * utrans_open().  If the id is invalid then nothing is done.
551  *
552  * @param id a NUL-terminated ID
553  * @deprecated ICU 2.8 Use utrans_unregisterID() instead, see utrans.h
554  */
555 U_DEPRECATED void U_EXPORT2
556 utrans_unregister(const char* id);
557 
558 /**
559  * Deprecated, use utrans_openIDs() instead.
560  * Return the ID of the index-th system transliterator.  The result
561  * is placed in the given buffer.  If the given buffer is too small,
562  * the initial substring is copied to buf.  The result in buf is
563  * always NUL-terminated.
564  *
565  * @param index the number of the transliterator to return.  Must
566  * satisfy 0 <= index < utrans_countAvailableIDs().  If index is out
567  * of range then it is treated as if it were 0.
568  * @param buf the buffer in which to receive the ID.  This may be
569  * NULL, in which case no characters are copied.
570  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
571  * NULL.
572  * @return the actual length of the index-th ID, not including the
573  * NUL terminator.  This may be greater than bufCapacity.
574  * @deprecated ICU 2.8 Use utrans_openIDs() instead, see utrans.h
575  */
576 U_DEPRECATED int32_t U_EXPORT2
577 utrans_getAvailableID(int32_t index,
578                       char* buf,
579                       int32_t bufCapacity);
580 
581 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
582 
583 #endif
584