• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 1997-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   Date        Name        Description
7 *   06/21/00    aliu        Creation.
8 *******************************************************************************
9 */
10 
11 #ifndef UTRANS_H
12 #define UTRANS_H
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_TRANSLITERATION
17 
18 #include "unicode/localpointer.h"
19 #include "unicode/urep.h"
20 #include "unicode/parseerr.h"
21 #include "unicode/uenum.h"
22 
23 /********************************************************************
24  * General Notes
25  ********************************************************************
26  */
27 /**
28  * \file
29  * \brief C API: Transliterator
30  *
31  * <h2> Transliteration </h2>
32  * The data structures and functions described in this header provide
33  * transliteration services.  Transliteration services are implemented
34  * as C++ classes.  The comments and documentation in this header
35  * assume the reader is familiar with the C++ headers translit.h and
36  * associated documentation.
37  *
38  * A significant but incomplete subset of the C++ transliteration
39  * services is available to C code through this header.  In order to
40  * access more complex transliteration services, refer to the C++
41  * headers and documentation.
42  *
43  * There are two sets of functions for working with transliterator IDs:
44  *
45  * An old, deprecated set uses char * IDs, which works for true and pure
46  * identifiers that these APIs were designed for,
47  * for example "Cyrillic-Latin".
48  * It does not work when the ID contains filters ("[:Script=Cyrl:]")
49  * or even a complete set of rules because then the ID string contains more
50  * than just "invariant" characters (see utypes.h).
51  *
52  * A new set of functions replaces the old ones and uses UChar * IDs,
53  * paralleling the UnicodeString IDs in the C++ API. (New in ICU 2.8.)
54  */
55 
56 /********************************************************************
57  * Data Structures
58  ********************************************************************/
59 
60 /**
61  * An opaque transliterator for use in C.  Open with utrans_openXxx(),
62  * e.g. utrans_openU(), and close with utrans_close() when done.
63  * Equivalent to the C++ class Transliterator and its subclasses.
64  * @see Transliterator
65  * @stable ICU 2.0
66  */
67 typedef void* UTransliterator;
68 
69 /**
70  * Direction constant indicating the direction in a transliterator,
71  * e.g., the forward or reverse rules of a RuleBasedTransliterator.
72  * Specified via the dir parameter when a transliterator is opened.
73  * An "A-B" transliterator transliterates A to B when operating in the
74  * forward direction, and B to A when operating in the reverse direction.
75  * @stable ICU 2.0
76  */
77 typedef enum UTransDirection {
78 
79     /**
80      * UTRANS_FORWARD means from &lt;source&gt; to &lt;target&gt; for a
81      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
82      * opened using a rule, it means forward direction rules, e.g.,
83      * "A > B".
84      */
85     UTRANS_FORWARD,
86 
87     /**
88      * UTRANS_REVERSE means from &lt;target&gt; to &lt;source&gt; for a
89      * transliterator with ID &lt;source&gt;-&lt;target&gt;.  For a transliterator
90      * opened using a rule, it means reverse direction rules, e.g.,
91      * "A < B".
92      */
93     UTRANS_REVERSE
94 
95 } UTransDirection;
96 
97 /**
98  * Position structure for incremental transliteration with
99  * utrans_transIncremental() and utrans_transIncrementalUChars().
100  * This structure defines two substrings of the text being
101  * transliterated.  The first region, [contextStart, contextLimit),
102  * defines what characters the transliterator will read as context.
103  * The second region, [start, limit), defines what characters will
104  * actually be transliterated, and should be a subset of the first.
105  *
106  * <p>After a transliteration operation, some of the indices in this
107  * structure will be modified.  See the field descriptions for
108  * details.
109  *
110  * <p>contextStart <= start <= limit <= contextLimit
111  *
112  * <p>Note: All index values in this structure must be at code point
113  * boundaries.  That is, none of them may occur between two code units
114  * of a surrogate pair.  If any index does split a surrogate pair,
115  * results are unspecified.
116  *
117  * @stable ICU 2.0
118  */
119 typedef struct UTransPosition {
120 
121     /**
122      * Beginning index, inclusive, of the context to be considered for
123      * a transliteration operation.  The transliterator will ignore
124      * anything before this index.  INPUT/OUTPUT parameter: This parameter
125      * is updated by a transliteration operation to reflect the maximum
126      * amount of antecontext needed by a transliterator.
127      * @stable ICU 2.4
128      */
129     int32_t contextStart;
130 
131     /**
132      * Ending index, exclusive, of the context to be considered for a
133      * transliteration operation.  The transliterator will ignore
134      * anything at or after this index.  INPUT/OUTPUT parameter: This
135      * parameter is updated to reflect changes in the length of the
136      * text, but points to the same logical position in the text.
137      * @stable ICU 2.4
138      */
139     int32_t contextLimit;
140 
141     /**
142      * Beginning index, inclusive, of the text to be transliterated.
143      * INPUT/OUTPUT parameter: This parameter is advanced past
144      * characters that have already been transliterated by a
145      * transliteration operation.
146      * @stable ICU 2.4
147      */
148     int32_t start;
149 
150     /**
151      * Ending index, exclusive, of the text to be transliterated.
152      * INPUT/OUTPUT parameter: This parameter is updated to reflect
153      * changes in the length of the text, but points to the same
154      * logical position in the text.
155      * @stable ICU 2.4
156      */
157     int32_t limit;
158 
159 } UTransPosition;
160 
161 /********************************************************************
162  * General API
163  ********************************************************************/
164 
165 /**
166  * Open a custom transliterator from a rules string (rules != NULL)
167  * OR
168  * a system transliterator matching the given ID (rules == NULL).
169  * Any non-NULL result from this function should later be closed with
170  * utrans_close().
171  *
172  * @param id a valid transliterator ID
173  * @param idLength the length of the ID string, or -1 if NUL-terminated
174  * @param dir the desired direction
175  * @param rules the transliterator rules.  See the C++ header rbt.h for
176  *              rules syntax. If NULL then a system transliterator matching
177  *              the ID is returned.
178  * @param rulesLength the length of the rules, or -1 if the rules
179  *                    are NUL-terminated.
180  * @param parseError a pointer to a UParseError struct to receive the details
181  *                   of any parsing errors. This parameter may be NULL if no
182  *                   parsing error details are desired.
183  * @param pErrorCode a pointer to the UErrorCode
184  * @return a transliterator pointer that may be passed to other
185  *         utrans_xxx() functions, or NULL if the open call fails.
186  * @stable ICU 2.8
187  */
188 U_STABLE UTransliterator* U_EXPORT2
189 utrans_openU(const UChar *id,
190              int32_t idLength,
191              UTransDirection dir,
192              const UChar *rules,
193              int32_t rulesLength,
194              UParseError *parseError,
195              UErrorCode *pErrorCode);
196 
197 /**
198  * Open an inverse of an existing transliterator.  For this to work,
199  * the inverse must be registered with the system.  For example, if
200  * the Transliterator "A-B" is opened, and then its inverse is opened,
201  * the result is the Transliterator "B-A", if such a transliterator is
202  * registered with the system.  Otherwise the result is NULL and a
203  * failing UErrorCode is set.  Any non-NULL result from this function
204  * should later be closed with utrans_close().
205  *
206  * @param trans the transliterator to open the inverse of.
207  * @param status a pointer to the UErrorCode; set to a failing code if no inverse is registered
208  * @return a pointer to a newly-opened transliterator that is the
209  * inverse of trans, or NULL if the open call fails.
210  * @stable ICU 2.0
211  */
212 U_STABLE UTransliterator* U_EXPORT2
213 utrans_openInverse(const UTransliterator* trans,
214                    UErrorCode* status);
215 
216 /**
217  * Create a copy of an open transliterator.  Any non-NULL result from
218  * this function should later be closed with utrans_close().
219  *
220  * @param trans the transliterator to be copied (const input).
221  * @param status a pointer to the UErrorCode
222  * @return a transliterator pointer that may be passed to other
223  * utrans_xxx() functions, or NULL if the clone call fails.
224  * @stable ICU 2.0
225  */
226 U_STABLE UTransliterator* U_EXPORT2
227 utrans_clone(const UTransliterator* trans,
228              UErrorCode* status);
229 
230 /**
231  * Close a transliterator.  Any non-NULL pointer returned by
232  * utrans_openXxx() or utrans_clone() should eventually be closed.
233  * @param trans the transliterator to be closed (not one adopted by utrans_register()).
234  * @stable ICU 2.0
235  */
236 U_STABLE void U_EXPORT2
237 utrans_close(UTransliterator* trans);
238 
239 #if U_SHOW_CPLUSPLUS_API
240 
241 U_NAMESPACE_BEGIN
242 
243 /**
244  * \class LocalUTransliteratorPointer
245  * "Smart pointer" class, closes a UTransliterator via utrans_close().
246  * For most methods see the LocalPointerBase base class.
247  *
248  * @see LocalPointerBase
249  * @see LocalPointer
250  * @stable ICU 4.4
251  */
252 U_DEFINE_LOCAL_OPEN_POINTER(LocalUTransliteratorPointer, UTransliterator, utrans_close);
253 
254 U_NAMESPACE_END
255 
256 #endif  /* U_SHOW_CPLUSPLUS_API */
257 
258 /**
259  * Return the programmatic identifier for this transliterator.
260  * If this identifier is passed to utrans_openU(), it will open
261  * a transliterator equivalent to this one, if the ID has been
262  * registered.
263  *
264  * @param trans the transliterator to return the ID of.
265  * @param resultLength pointer to an output variable receiving the length
266  *        of the ID string; can be NULL if the length is not needed
267  * @return the NUL-terminated ID string. This pointer remains
268  * valid until utrans_close() is called on this transliterator.
269  *
270  * @stable ICU 2.8
271  */
272 U_STABLE const UChar * U_EXPORT2
273 utrans_getUnicodeID(const UTransliterator *trans,
274                     int32_t *resultLength);
275 
276 /**
277  * Register an open transliterator with the system.  When
278  * utrans_openU() is called with an ID string that is equal to that
279  * returned by utrans_getUnicodeID(adoptedTrans,...), then
280  * utrans_clone(adoptedTrans,...) is returned.
281  *
282  * <p>NOTE: After this call the system owns the adoptedTrans and will
283  * close it.  The user must not call utrans_close() on adoptedTrans.
284  *
285  * @param adoptedTrans a transliterator, typically the result of
286  * utrans_openU() with a rules string, to be registered with the system.
287  * @param status a pointer to the UErrorCode
288  * @stable ICU 2.0
289  */
290 U_STABLE void U_EXPORT2
291 utrans_register(UTransliterator* adoptedTrans,
292                 UErrorCode* status);
293 
294 /**
295  * Unregister a transliterator from the system.  After this call the
296  * system will no longer recognize the given ID when passed to
297  * utrans_openU(). If the ID is invalid then nothing is done.
298  *
299  * @param id an ID to unregister
300  * @param idLength the length of id, or -1 if id is zero-terminated
301  * @stable ICU 2.8
302  */
303 U_STABLE void U_EXPORT2
304 utrans_unregisterID(const UChar* id, int32_t idLength);
305 
306 /**
307  * Set the filter used by a transliterator.  A filter can be used to
308  * make the transliterator pass certain characters through untouched.
309  * The filter is expressed using a UnicodeSet pattern.  If the
310  * filterPattern is NULL or the empty string, then the transliterator
311  * will be reset to use no filter.
312  *
313  * @param trans the transliterator
314  * @param filterPattern a pattern string, in the form accepted by
315  * UnicodeSet, specifying which characters to apply the
316  * transliteration to (all others pass through untouched).  May be
317  * NULL or the empty string to indicate no filter.
318  * @param filterPatternLen the length of filterPattern, or -1 if
319  * filterPattern is zero-terminated
320  * @param status a pointer to the UErrorCode
321  * @see UnicodeSet
322  * @stable ICU 2.0
323  */
324 U_STABLE void U_EXPORT2
325 utrans_setFilter(UTransliterator* trans,
326                  const UChar* filterPattern,
327                  int32_t filterPatternLen,
328                  UErrorCode* status);
329 
330 /**
331  * Return the number of system transliterators (the exclusive upper bound
332  * of utrans_getAvailableID()'s index).  Prefer utrans_openIDs() instead.
333  *
334  * @return the number of system transliterators.
335  * @stable ICU 2.0
336  */
337 U_STABLE int32_t U_EXPORT2
338 utrans_countAvailableIDs(void);
339 
340 /**
341  * Return a UEnumeration over the IDs of the available transliterators.
342  *
343  * @param pErrorCode Pointer to the UErrorCode in/out parameter.
344  * @return UEnumeration for the available transliterators.
345  *         Close with uenum_close().
346  *
347  * @stable ICU 2.8
348  */
349 U_STABLE UEnumeration * U_EXPORT2
350 utrans_openIDs(UErrorCode *pErrorCode);
351 
352 /********************************************************************
353  * Transliteration API
354  ********************************************************************/
355 
356 /**
357  * Transliterate a segment of a UReplaceable string.  The string is
358  * passed in as a UReplaceable pointer rep and a UReplaceableCallbacks
359  * function pointer struct repFunc.  Functions in the repFunc struct
360  * will be called in order to modify the rep string.
361  *
362  * @param trans the transliterator
363  * @param rep a pointer to the string.  This will be passed to the
364  * repFunc functions.
365  * @param repFunc a set of function pointers that will be used to
366  * modify the string pointed to by rep.
367  * @param start the beginning index, inclusive; <code>0 <= start <=
368  * *limit</code>.
369  * @param limit pointer to the ending index, exclusive; <code>start <=
370  * *limit <= repFunc->length(rep)</code>.  Upon return, *limit will
371  * contain the new limit index.  The text previously occupying
372  * <code>[start, limit)</code> has been transliterated, possibly to a
373  * string of a different length, at <code>[start,
374  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
375  * is the value stored in *limit (the function itself returns void).
376  * @param status a pointer to the UErrorCode
377  * @stable ICU 2.0
378  */
379 U_STABLE void U_EXPORT2
380 utrans_trans(const UTransliterator* trans,
381              UReplaceable* rep,
382              UReplaceableCallbacks* repFunc,
383              int32_t start,
384              int32_t* limit,
385              UErrorCode* status);
386 
387 /**
388  * Transliterate the portion of the UReplaceable text buffer that can
389  * be transliterated unambiguously.  This method is typically called
390  * after new text has been inserted, e.g. as a result of a keyboard
391  * event.  The transliterator will try to transliterate characters of
392  * <code>rep</code> between <code>pos->start</code> and
393  * <code>pos->limit</code>.  Characters before
394  * <code>pos->start</code> will not be changed.
395  *
396  * <p>Upon return, values in <code>pos</code> will be updated.
397  * <code>pos->contextStart</code> will be advanced to the first
398  * character that future calls to this method will read.
399  * <code>pos->start</code> and <code>pos->limit</code> will
400  * be adjusted to delimit the range of text that future calls to
401  * this method may change.
402  *
403  * <p>Typical usage of this method begins with an initial call
404  * with <code>[pos->start, pos->limit)</code> set to the portion of
405  * <code>rep</code> to be transliterated and <code>[pos->contextStart,
406  * pos->contextLimit)</code> set to the surrounding readable context.
407  * Thereafter, <code>pos</code> can be used without
408  * modification in future calls, provided that all changes to
409  * <code>rep</code> are made via this method.
410  *
411  * <p>This method assumes that future calls may be made that will
412  * insert new text into the buffer.  As a result, it only performs
413  * unambiguous transliterations.  After the last call to this method,
414  * there may be untransliterated text that is waiting for more input
415  * to resolve an ambiguity.  In order to perform these pending
416  * transliterations, clients should call utrans_trans() with a start
417  * of pos->start and a limit of pos->limit after the last call to this
418  * method has been made.
419  *
420  * @param trans the transliterator
421  * @param rep a pointer to the string.  This will be passed to the
422  * repFunc functions.
423  * @param repFunc a set of function pointers that will be used to
424  * modify the string pointed to by rep.
425  * @param pos a struct containing the start and limit indices of the
426  * text to be read and the text to be transliterated
427  * @param status a pointer to the UErrorCode
428  * @stable ICU 2.0
429  */
430 U_STABLE void U_EXPORT2
431 utrans_transIncremental(const UTransliterator* trans,
432                         UReplaceable* rep,
433                         UReplaceableCallbacks* repFunc,
434                         UTransPosition* pos,
435                         UErrorCode* status);
436 
437 /**
438  * Transliterate a segment of a UChar* string.  The string is passed
439  * in as a UChar* buffer.  The string is modified in place.  If the
440  * result is longer than textCapacity, it is truncated.  The actual
441  * length of the result is returned in *textLength, if textLength is
442  * non-NULL. *textLength may be greater than textCapacity, but only
443  * textCapacity UChars will be written to *text, including the zero
444  * terminator.
445  *
446  * @param trans the transliterator
447  * @param text a pointer to a buffer containing the text to be
448  * transliterated on input and the result text on output.
449  * @param textLength a pointer to the length of the string in text.
450  * If the length is -1 then the string is assumed to be
451  * zero-terminated.  Upon return, the new length is stored in
452  * *textLength.  If textLength is NULL then the string is assumed to
453  * be zero-terminated.
454  * @param textCapacity the capacity of the text buffer, in UChars,
455  * including space for the zero terminator.
456  * @param start the beginning index, inclusive; <code>0 <= start <=
457  * *limit</code>.
458  * @param limit pointer to the ending index, exclusive; <code>start <=
459  * *limit <= </code>(length of the string in text).  Upon return, *limit
460  * will contain the new limit index.  The text previously occupying
461  * <code>[start, limit)</code> has been transliterated, possibly to a
462  * string of a different length, at <code>[start,
463  * </code><em>new-limit</em><code>)</code>, where <em>new-limit</em>
464  * is the value stored in *limit (the function itself returns void).
465  * @param status a pointer to the UErrorCode
466  * @stable ICU 2.0
467  */
468 U_STABLE void U_EXPORT2
469 utrans_transUChars(const UTransliterator* trans,
470                    UChar* text,
471                    int32_t* textLength,
472                    int32_t textCapacity,
473                    int32_t start,
474                    int32_t* limit,
475                    UErrorCode* status);
476 
477 /**
478  * Transliterate the portion of the UChar* text buffer that can be
479  * transliterated unambiguously.  See utrans_transIncremental().  The
480  * string is passed in as a UChar* buffer.  The string is modified in
481  * place.  If the result is longer than textCapacity, it is truncated.
482  * The actual length of the result is returned in *textLength, if
483  * textLength is non-NULL. *textLength may be greater than
484  * textCapacity, but only textCapacity UChars will be written to
485  * *text, including the zero terminator.  See utrans_transIncremental()
486  * for usage details.
487  *
488  * @param trans the transliterator
489  * @param text a pointer to a buffer containing the text to be
490  * transliterated on input and the result text on output.
491  * @param textLength a pointer to the length of the string in text.
492  * If the length is -1 then the string is assumed to be
493  * zero-terminated.  Upon return, the new length is stored in
494  * *textLength.  If textLength is NULL then the string is assumed to
495  * be zero-terminated.
496  * @param textCapacity the length of the text buffer
497  * @param pos a struct containing the start and limit indices of the
498  * text to be read and the text to be transliterated
499  * @param status a pointer to the UErrorCode
500  * @see utrans_transIncremental
501  * @stable ICU 2.0
502  */
503 U_STABLE void U_EXPORT2
504 utrans_transIncrementalUChars(const UTransliterator* trans,
505                               UChar* text,
506                               int32_t* textLength,
507                               int32_t textCapacity,
508                               UTransPosition* pos,
509                               UErrorCode* status);
510 
511 /* deprecated API ----------------------------------------------------------- */
512 
513 #ifndef U_HIDE_DEPRECATED_API
514 
515 /* see the \file documentation at the top of this header for why these functions are deprecated */
516 
517 /**
518  * Deprecated, use utrans_openU() instead.
519  * Open a custom transliterator, given a custom rules string
520  * OR
521  * a system transliterator, given its ID.
522  * Any non-NULL result from this function should later be closed with
523  * utrans_close().
524  *
525  * @param id a valid ID of "invariant" characters, as returned by utrans_getAvailableID()
526  * @param dir the desired direction
527  * @param rules the transliterator rules.  See the C++ header rbt.h
528  * for rules syntax. If NULL then a system transliterator matching
529  * the ID is returned.
530  * @param rulesLength the length of the rules, or -1 if the rules
531  * are zero-terminated.
532  * @param parseError a pointer to a UParseError struct to receive the
533  * details of any parsing errors. This parameter may be NULL if no
534  * parsing error details are desired.
535  * @param status a pointer to the UErrorCode
536  * @return a transliterator pointer that may be passed to other
537  * utrans_xxx() functions, or NULL if the open call fails.
538  * @deprecated ICU 2.8 Use utrans_openU() instead, see utrans.h
539  */
540 U_DEPRECATED UTransliterator* U_EXPORT2
541 utrans_open(const char* id,
542             UTransDirection dir,
543             const UChar* rules,         /* may be Null */
544             int32_t rulesLength,        /* -1 if null-terminated */
545             UParseError* parseError,    /* may be Null */
546             UErrorCode* status);
547 
548 /**
549  * Deprecated, use utrans_getUnicodeID() instead.
550  * Return the programmatic identifier for this transliterator.
551  * If this identifier is passed to utrans_open(), it will open
552  * a transliterator equivalent to this one, if the ID has been
553  * registered.
554  * @param trans the transliterator to return the ID of.
555  * @param buf the buffer in which to receive the ID.  This may be
556  * NULL, in which case no characters are copied.
557  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
558  * NULL.
559  * @return the actual length of the ID, not including
560  * zero-termination.  This may be greater than bufCapacity (buf too small).
561  * @deprecated ICU 2.8 Use utrans_getUnicodeID() instead, see utrans.h
562  */
563 U_DEPRECATED int32_t U_EXPORT2
564 utrans_getID(const UTransliterator* trans,
565              char* buf,
566              int32_t bufCapacity);
567 
568 /**
569  * Deprecated, use utrans_unregisterID() instead.
570  * Unregister a transliterator from the system.  After this call the
571  * system will no longer recognize the given ID when passed to
572  * utrans_open().  If the id is invalid then nothing is done.
573  *
574  * @param id a zero-terminated char* ID to unregister
575  * @deprecated ICU 2.8 Use utrans_unregisterID() instead, see utrans.h
576  */
577 U_DEPRECATED void U_EXPORT2
578 utrans_unregister(const char* id);
579 
580 /**
581  * Deprecated, use utrans_openIDs() instead.
582  * Return the ID of the index-th system transliterator.  The result
583  * is placed in the given buffer.  If the given buffer is too small,
584  * the initial substring is copied to buf.  The result in buf is
585  * always zero-terminated.
586  *
587  * @param index the zero-based index of the transliterator to return.  Must
588  * satisfy 0 <= index < utrans_countAvailableIDs().  If index is out
589  * of range then it is treated as if it were 0.
590  * @param buf the buffer in which to receive the ID.  This may be
591  * NULL, in which case no characters are copied.
592  * @param bufCapacity the capacity of the buffer.  Ignored if buf is
593  * NULL.
594  * @return the actual length of the index-th ID, not including
595  * zero-termination.  This may be greater than bufCapacity.
596  * @deprecated ICU 2.8 Use utrans_openIDs() instead, see utrans.h
597  */
598 U_DEPRECATED int32_t U_EXPORT2
599 utrans_getAvailableID(int32_t index,
600                       char* buf,
601                       int32_t bufCapacity);
602 
603 #endif  /* U_HIDE_DEPRECATED_API */
604 
605 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
606 
607 #endif
608