• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2000-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  ushape.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000jun29
16 *   created by: Markus W. Scherer
17 */
18 
19 #ifndef __USHAPE_H__
20 #define __USHAPE_H__
21 
22 #include "unicode/utypes.h"
23 
24 /**
25  * \file
26  * \brief C API:  Arabic shaping
27  *
28  */
29 
30 /**
31  * Shape Arabic text on a character basis.
32  *
33  * <p>This function performs basic operations for "shaping" Arabic text. It is most
34  * useful with legacy data formats and legacy display technology
35  * (simple terminals). All operations are performed on Unicode characters.</p>
36  *
37  * <p>Text-based shaping means that some character code points in the text are
38  * replaced by others depending on the context. It transforms one kind of text
39  * into another. In comparison, modern displays for Arabic text select
40  * appropriate, context-dependent font glyphs for each text element, which means
41  * that they transform text into a glyph vector.</p>
42  *
43  * <p>Text transformations are necessary when modern display technology is not
44  * available or when text needs to be transformed to or from legacy formats that
45  * use "shaped" characters. Since the Arabic script is cursive, connecting
46  * adjacent letters to each other, computers select images for each letter based
47  * on the surrounding letters. This usually results in four images per Arabic
48  * letter: initial, middle, final, and isolated forms. In Unicode, on the other
49  * hand, letters are normally stored in abstract form, and a display system is expected
50  * to select the necessary glyphs. (This makes searching and other text
51  * processing easier because the same letter has only one code.) It is possible
52  * to mimic this with text transformations because there are characters in
53  * Unicode that are rendered as letters with a specific shape
54  * (or cursive connectivity). They were included for interoperability with
55  * legacy systems and codepages, and for unsophisticated display systems.</p>
56  *
57  * <p>A second kind of text transformation is supported for Arabic digits:
58  * For compatibility with legacy codepages that only include European digits,
59  * it is possible to replace one set of digits by another, changing the
60  * character code points. These operations can be performed for either
61  * Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic
62  * digits (U+06f0...U+06f9).</p>
63  *
64  * <p>Some replacements may result in more or fewer characters (code points).
65  * By default, this means that the destination buffer may receive text with a
66  * length different from the source length. Some legacy systems rely on the
67  * length of the text to be constant. They expect extra spaces to be added
68  * or consumed either next to the affected character or at the end of the
69  * text.</p>
70  *
71  * <p>For details about the available operations, see the description of the
72  * <code>U_SHAPE_...</code> options.</p>
73  *
74  * @param source The input text.
75  *
76  * @param sourceLength The number of UChars in <code>source</code>.
77  *
78  * @param dest The destination buffer that will receive the results of the
79  *             requested operations. It may be <code>NULL</code> only if
80  *             <code>destSize</code> is 0. The source and destination must not
81  *             overlap.
82  *
83  * @param destSize The size (capacity) of the destination buffer in UChars.
84  *                 If <code>destSize</code> is 0, then no output is produced,
85  *                 but the necessary buffer size is returned ("preflighting").
86  *
87  * @param options This is a 32-bit set of flags that specify the operations
88  *                that are performed on the input text. If no error occurs,
89  *                then the result will always be written to the destination
90  *                buffer.
91  *
92  * @param pErrorCode must be a valid pointer to an error code value,
93  *        which must not indicate a failure before the function call.
94  *
95  * @return The number of UChars written to the destination buffer.
96  *         If an error occurred, then no output was written, or it may be
97  *         incomplete. If <code>U_BUFFER_OVERFLOW_ERROR</code> is set, then
98  *         the return value indicates the necessary destination buffer size.
99  * @stable ICU 2.0
100  */
101 U_CAPI int32_t U_EXPORT2
102 u_shapeArabic(const UChar *source, int32_t sourceLength,
103               UChar *dest, int32_t destSize,
104               uint32_t options,
105               UErrorCode *pErrorCode);
106 
107 /**
108  * Memory option: allow the result to have a different length than the source.
109  * Affects: LamAlef options
110  * @stable ICU 2.0
111  */
112 #define U_SHAPE_LENGTH_GROW_SHRINK              0
113 
114 /**
115  * Memory option: allow the result to have a different length than the source.
116  * Affects: LamAlef options
117  * This option is an alias for U_SHAPE_LENGTH_GROW_SHRINK
118  * @stable ICU 4.2
119  */
120 #define U_SHAPE_LAMALEF_RESIZE                  0
121 
122 /**
123  * Memory option: the result must have the same length as the source.
124  * If more room is necessary, then try to consume spaces next to modified characters.
125  * @stable ICU 2.0
126  */
127 #define U_SHAPE_LENGTH_FIXED_SPACES_NEAR        1
128 
129 /**
130  * Memory option: the result must have the same length as the source.
131  * If more room is necessary, then try to consume spaces next to modified characters.
132  * Affects: LamAlef options
133  * This option is an alias for U_SHAPE_LENGTH_FIXED_SPACES_NEAR
134  * @stable ICU 4.2
135  */
136 #define U_SHAPE_LAMALEF_NEAR                    1
137 
138 /**
139  * Memory option: the result must have the same length as the source.
140  * If more room is necessary, then try to consume spaces at the end of the text.
141  * @stable ICU 2.0
142  */
143 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_END      2
144 
145 /**
146  * Memory option: the result must have the same length as the source.
147  * If more room is necessary, then try to consume spaces at the end of the text.
148  * Affects: LamAlef options
149  * This option is an alias for U_SHAPE_LENGTH_FIXED_SPACES_AT_END
150  * @stable ICU 4.2
151  */
152 #define U_SHAPE_LAMALEF_END                     2
153 
154 /**
155  * Memory option: the result must have the same length as the source.
156  * If more room is necessary, then try to consume spaces at the beginning of the text.
157  * @stable ICU 2.0
158  */
159 #define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3
160 
161 /**
162  * Memory option: the result must have the same length as the source.
163  * If more room is necessary, then try to consume spaces at the beginning of the text.
164  * Affects: LamAlef options
165  * This option is an alias for U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
166  * @stable ICU 4.2
167  */
168 #define U_SHAPE_LAMALEF_BEGIN                    3
169 
170 
171 /**
172  * Memory option: the result must have the same length as the source.
173  * Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
174  *               If there is no space at end, use spaces at beginning of the buffer. If there
175  *               is no space at beginning of the buffer, use the nearby space (i.e. the space
176  *               after the LAMALEF character).
177  *               If no spaces are found, the error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
178  *               will be set in pErrorCode.
179  *
180  * Deshaping Mode: Behaves the same as U_SHAPE_LAMALEF_END.
181  * Affects: LamAlef options
182  * @stable ICU 4.2
183  */
184 #define U_SHAPE_LAMALEF_AUTO                     0x10000
185 
186 /** Bit mask for memory options. @stable ICU 2.0 */
187 #define U_SHAPE_LENGTH_MASK                      0x10003 /* changed from the old value 3 */
188 
189 
190 /**
191  * Bit mask for LamAlef memory options.
192  * @stable ICU 4.2
193  */
194 #define U_SHAPE_LAMALEF_MASK                     0x10003 /* updated; same value as U_SHAPE_LENGTH_MASK */
195 
196 /** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */
197 #define U_SHAPE_TEXT_DIRECTION_LOGICAL          0
198 
199 /**
200  * Direction indicator:
201  * the source is in visual RTL order,
202  * the rightmost displayed character stored first.
203  * This option is an alias for U_SHAPE_TEXT_DIRECTION_LOGICAL
204  * @stable ICU 4.2
205  */
206 #define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL       0
207 
208 /**
209  * Direction indicator:
210  * the source is in visual LTR order,
211  * the leftmost displayed character stored first.
212  * @stable ICU 2.0
213  */
214 #define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR       4
215 
216 /** Bit mask for direction indicators. @stable ICU 2.0 */
217 #define U_SHAPE_TEXT_DIRECTION_MASK             4
218 
219 
220 /** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */
221 #define U_SHAPE_LETTERS_NOOP                    0
222 
223 /** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */
224 #define U_SHAPE_LETTERS_SHAPE                   8
225 
226 /** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */
227 #define U_SHAPE_LETTERS_UNSHAPE                 0x10
228 
229 /**
230  * Letter shaping option: replace abstract letter characters by "shaped" ones.
231  * The only difference from U_SHAPE_LETTERS_SHAPE is that Tashkeel letters
232  * are always "shaped" into the isolated form instead of the medial form
233  * (selecting code points from the Arabic Presentation Forms-B block).
234  * @stable ICU 2.0
235  */
236 #define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18
237 
238 
239 /** Bit mask for letter shaping options. @stable ICU 2.0 */
240 #define U_SHAPE_LETTERS_MASK                        0x18
241 
242 
243 /** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */
244 #define U_SHAPE_DIGITS_NOOP                     0
245 
246 /**
247  * Digit shaping option:
248  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits.
249  * @stable ICU 2.0
250  */
251 #define U_SHAPE_DIGITS_EN2AN                    0x20
252 
253 /**
254  * Digit shaping option:
255  * Replace Arabic-Indic digits by European digits (U+0030...U+0039).
256  * @stable ICU 2.0
257  */
258 #define U_SHAPE_DIGITS_AN2EN                    0x40
259 
260 /**
261  * Digit shaping option:
262  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits if the most recent
263  * strongly directional character is an Arabic letter
264  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
265  * The direction of "preceding" depends on the direction indicator option.
266  * For the first characters, the preceding strongly directional character
267  * (initial state) is assumed to be not an Arabic letter
268  * (it is <code>U_LEFT_TO_RIGHT</code> [L] or <code>U_RIGHT_TO_LEFT</code> [R]).
269  * @stable ICU 2.0
270  */
271 #define U_SHAPE_DIGITS_ALEN2AN_INIT_LR          0x60
272 
273 /**
274  * Digit shaping option:
275  * Replace European digits (U+0030...U+0039) by Arabic-Indic digits if the most recent
276  * strongly directional character is an Arabic letter
277  * (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
278  * The direction of "preceding" depends on the direction indicator option.
279  * For the first characters, the preceding strongly directional character
280  * (initial state) is assumed to be an Arabic letter.
281  * @stable ICU 2.0
282  */
283 #define U_SHAPE_DIGITS_ALEN2AN_INIT_AL          0x80
284 
285 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
286 #define U_SHAPE_DIGITS_RESERVED                 0xa0
287 
288 /** Bit mask for digit shaping options. @stable ICU 2.0 */
289 #define U_SHAPE_DIGITS_MASK                     0xe0
290 
291 
292 /** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */
293 #define U_SHAPE_DIGIT_TYPE_AN                   0
294 
295 /** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */
296 #define U_SHAPE_DIGIT_TYPE_AN_EXTENDED          0x100
297 
298 /** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
299 #define U_SHAPE_DIGIT_TYPE_RESERVED             0x200
300 
301 /** Bit mask for digit type options. @stable ICU 2.0 */
302 #define U_SHAPE_DIGIT_TYPE_MASK                 0x300 /* changed from 0x3f00 to 0x300 */
303 
304 /**
305  * Tashkeel aggregation option:
306  * Replaces any combination of U+0651 with one of
307  * U+064C, U+064D, U+064E, U+064F, U+0650 by
308  * U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62, respectively.
309  * @stable ICU 3.6
310  */
311 #define U_SHAPE_AGGREGATE_TASHKEEL              0x4000
312 /** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */
313 #define U_SHAPE_AGGREGATE_TASHKEEL_NOOP         0
314 /** Bit mask for tashkeel aggregation. @stable ICU 3.6 */
315 #define U_SHAPE_AGGREGATE_TASHKEEL_MASK         0x4000
316 
317 /**
318  * Presentation form option:
319  * Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B
320  * characters with U+06xx characters, before shaping.
321  * @stable ICU 3.6
322  */
323 #define U_SHAPE_PRESERVE_PRESENTATION           0x8000
324 /** Presentation form option:
325  * Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with
326  * their unshaped correspondents in range U+06xx, before shaping.
327  * @stable ICU 3.6
328  */
329 #define U_SHAPE_PRESERVE_PRESENTATION_NOOP      0
330 /** Bit mask for preserve presentation form. @stable ICU 3.6 */
331 #define U_SHAPE_PRESERVE_PRESENTATION_MASK      0x8000
332 
333 /* Seen Tail option */
334 /**
335  * Memory option: the result must have the same length as the source.
336  * Shaping mode: The SEEN family character will expand into two characters using space near
337  *               the SEEN family character (i.e. the space after the character).
338  *               If no spaces are found, the error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
339  *               will be set in pErrorCode.
340  *
341  * De-shaping mode: Any Seen character followed by a Tail character will be
342  *                  replaced by a one-cell Seen, and a space will replace the Tail.
343  * Affects: Seen options
344  * @stable ICU 4.2
345  */
346 #define U_SHAPE_SEEN_TWOCELL_NEAR     0x200000
347 
348 /**
349  * Bit mask for Seen memory options.
350  * @stable ICU 4.2
351  */
352 #define U_SHAPE_SEEN_MASK             0x700000
353 
354 /* YehHamza option */
355 /**
356  * Memory option: the result must have the same length as the source.
357  * Shaping mode: The YEHHAMZA character will expand into two characters using space near it
358  *               (i.e. the space after the character).
359  *               If no spaces are found, the error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
360  *               will be set in pErrorCode.
361  *
362  * De-shaping mode: Any Yeh (final or isolated) character followed by a Hamza character will be
363  *                  replaced by a one-cell YehHamza, and a space will replace the Hamza.
364  * Affects: YehHamza options
365  * @stable ICU 4.2
366  */
367 #define U_SHAPE_YEHHAMZA_TWOCELL_NEAR      0x1000000
368 
369 
370 /**
371  * Bit mask for YehHamza memory options.
372  * @stable ICU 4.2
373  */
374 #define U_SHAPE_YEHHAMZA_MASK              0x3800000
375 
376 /* New Tashkeel options */
377 /**
378  * Memory option: the result must have the same length as the source.
379  * Shaping mode: Tashkeel characters will be replaced by spaces.
380  *               Spaces will be placed at the beginning of the buffer.
381  *
382  * De-shaping mode: N/A
383  * Affects: Tashkeel options
384  * @stable ICU 4.2
385  */
386 #define U_SHAPE_TASHKEEL_BEGIN                      0x40000
387 
388 /**
389  * Memory option: the result must have the same length as the source.
390  * Shaping mode: Tashkeel characters will be replaced by spaces.
391  *               Spaces will be placed at the end of the buffer.
392  *
393  * De-shaping mode: N/A
394  * Affects: Tashkeel options
395  * @stable ICU 4.2
396  */
397 #define U_SHAPE_TASHKEEL_END                        0x60000
398 
399 /**
400  * Memory option: allow the result to have a different length than the source.
401  * Shaping mode: Tashkeel characters will be removed, buffer length will shrink.
402  * De-shaping mode: N/A
403  *
404  * Affects: Tashkeel options
405  * @stable ICU 4.2
406  */
407 #define U_SHAPE_TASHKEEL_RESIZE                     0x80000
408 
409 /**
410  * Memory option: the result must have the same length as the source.
411  * Shaping mode: Each Tashkeel character will be replaced by Tatweel if it is connected to adjacent
412  *               characters (i.e. shaped on Tatweel) or replaced by a space if it is not connected.
413  *
414  * De-shaping mode: N/A
415  * Affects: Tashkeel options
416  * @stable ICU 4.2
417  */
418 #define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL         0xC0000
419 
420 /**
421  * Bit mask for Tashkeel replacement with Space or Tatweel memory options.
422  * @stable ICU 4.2
423  */
424 #define U_SHAPE_TASHKEEL_MASK                       0xE0000
425 
426 
427 /* Space location Control options */
428 /**
429  * This option affects the meaning of the BEGIN and END options. If this option is not used, the default
430  * for BEGIN and END is as follows:
431  * The Default (for all of Visual LTR, Visual RTL and Logical Text)
432  *           1. BEGIN always refers to the start address of physical memory.
433  *           2. END always refers to the end address of physical memory.
434  *
435  * If this option is used, it swaps the meaning of BEGIN and END only for Visual LTR text.
436  *
437  * The effect on the BEGIN and END Memory Options is as follows:
438  *    A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text
439  *       (corresponding to the physical memory address end for Visual LTR text; same as END in
440  *       default behavior).
441  *    B. BEGIN For Logical text: Same as BEGIN in default behavior.
442  *    C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding
443  *       to the physical memory address beginning for Visual LTR text; same as BEGIN in default behavior).
444  *    D. END For Logical text: Same as END in default behavior.
445  * Affects: All LamAlef BEGIN, END and AUTO options.
446  * @stable ICU 4.2
447  */
448 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000
449 
450 /**
451  * Bit mask for swapping BEGIN and END for Visual LTR text
452  * @stable ICU 4.2
453  */
454 #define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK      0x4000000
455 
456 /**
457  * If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73).
458  * If this option is not specified (Default), the old unofficial Unicode TAIL code point is used (i.e. 0x200B).
459  * De-shaping will not use this option as it will always search for both the new Unicode code point for the
460  * TAIL (i.e. 0xFE73) and the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the
461  * Seen-Family letter accordingly.
462  *
463  * Shaping Mode: Only shaping.
464  * De-shaping Mode: N/A.
465  * Affects: All Seen options
466  * @stable ICU 4.8
467  */
468 #define U_SHAPE_TAIL_NEW_UNICODE        0x8000000
469 
470 /**
471  * Bit mask for new Unicode Tail option
472  * @stable ICU 4.8
473  */
474 #define U_SHAPE_TAIL_TYPE_MASK          0x8000000
475 
476 #endif
477