/*
 * Copyright (C) 2006 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// Declarations for querying Unicode character properties: general category,
// directionality, decomposition type, numeric value, case and mirror mappings.
//

#ifndef ANDROID_UNICODE_H
#define ANDROID_UNICODE_H

#include <stdint.h>
#include <sys/types.h>

// Code point U+FFFD, the Unicode REPLACEMENT CHARACTER.
#define REPLACEMENT_CHAR (0xFFFD)

// this part of code is copied from umachine.h under ICU
/**
 * Define UChar32 as a type for single Unicode code points.
 * UChar32 is a signed 32-bit integer (same as int32_t).
 *
 * The Unicode code point range is 0..0x10ffff.
 * All other values (negative or >=0x110000) are illegal as Unicode code points.
 * They may be used as sentinel values to indicate "done", "error"
 * or similar non-code point conditions.
 *
 * @stable ICU 2.4
 */
typedef int32_t UChar32;

namespace android {

    // Forward declaration; not otherwise referenced in this header.
    class Encoding;

    /**
     * \class Unicode
     *
     * Helper class for getting properties of Unicode characters. Characters
     * can have one of the types listed in CharType and each character can have the
     * directionality of Direction.
     *
     * Every member function is static; the class carries no instance state.
     */
    class Unicode
    {
    public:
        /**
         * Directions specified in the Unicode standard. These directions map directly
         * to java.lang.Character.
         *
         * DIRECTIONALITY_UNDEFINED is -1; the remaining enumerators count up from 0
         * in declaration order, so the order must not be changed.
         */
        enum Direction {
            DIRECTIONALITY_UNDEFINED = -1,
            DIRECTIONALITY_LEFT_TO_RIGHT,
            DIRECTIONALITY_RIGHT_TO_LEFT,
            DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
            DIRECTIONALITY_EUROPEAN_NUMBER,
            DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
            DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
            DIRECTIONALITY_ARABIC_NUMBER,
            DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
            DIRECTIONALITY_NONSPACING_MARK,
            DIRECTIONALITY_BOUNDARY_NEUTRAL,
            DIRECTIONALITY_PARAGRAPH_SEPARATOR,
            DIRECTIONALITY_SEGMENT_SEPARATOR,
            DIRECTIONALITY_WHITESPACE,
            DIRECTIONALITY_OTHER_NEUTRALS,
            DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
            DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
            DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
            DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
            DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
        };

        /**
         * Character types as specified in the Unicode standard. These map directly to
         * java.lang.Character.
         *
         * Enumerators count up from 0 in declaration order; the placeholder at
         * value 17 exists only to keep the later values aligned with the Java
         * constants.
         */
        enum CharType {
            CHARTYPE_UNASSIGNED = 0,
            CHARTYPE_UPPERCASE_LETTER,
            CHARTYPE_LOWERCASE_LETTER,
            CHARTYPE_TITLECASE_LETTER,
            CHARTYPE_MODIFIER_LETTER,
            CHARTYPE_OTHER_LETTER,
            CHARTYPE_NON_SPACING_MARK,
            CHARTYPE_ENCLOSING_MARK,
            CHARTYPE_COMBINING_SPACING_MARK,
            CHARTYPE_DECIMAL_DIGIT_NUMBER,
            CHARTYPE_LETTER_NUMBER,
            CHARTYPE_OTHER_NUMBER,
            CHARTYPE_SPACE_SEPARATOR,
            CHARTYPE_LINE_SEPARATOR,
            CHARTYPE_PARAGRAPH_SEPARATOR,
            CHARTYPE_CONTROL,
            CHARTYPE_FORMAT,
            CHARTYPE_MISSING_VALUE_FOR_JAVA,    /* This is the mysterious missing 17 value from the java constants */
            CHARTYPE_PRIVATE_USE,
            CHARTYPE_SURROGATE,
            CHARTYPE_DASH_PUNCTUATION,
            CHARTYPE_START_PUNCTUATION,
            CHARTYPE_END_PUNCTUATION,
            CHARTYPE_CONNECTOR_PUNCTUATION,
            CHARTYPE_OTHER_PUNCTUATION,
            CHARTYPE_MATH_SYMBOL,
            CHARTYPE_CURRENCY_SYMBOL,
            CHARTYPE_MODIFIER_SYMBOL,
            CHARTYPE_OTHER_SYMBOL,
            CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
            CHARTYPE_FINAL_QUOTE_PUNCTUATION
        };

        /**
         * Decomposition types as described by the unicode standard. These values map to
         * the same values in uchar.h in ICU.
         */
        enum DecompositionType {
            DECOMPOSITION_NONE = 0,
            DECOMPOSITION_CANONICAL,
            DECOMPOSITION_COMPAT,
            DECOMPOSITION_CIRCLE,
            DECOMPOSITION_FINAL,
            DECOMPOSITION_FONT,
            DECOMPOSITION_FRACTION,
            DECOMPOSITION_INITIAL,
            DECOMPOSITION_ISOLATED,
            DECOMPOSITION_MEDIAL,
            DECOMPOSITION_NARROW,
            DECOMPOSITION_NOBREAK,
            DECOMPOSITION_SMALL,
            DECOMPOSITION_SQUARE,
            DECOMPOSITION_SUB,
            DECOMPOSITION_SUPER,
            DECOMPOSITION_VERTICAL,
            DECOMPOSITION_WIDE
        };

        /**
         * Returns the packed data for java calls
         * @param c The unicode character.
         * @return The packed data for the character.
         *
         * Copied from java.lang.Character implementation:
         * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
         * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
         *
         *                              31 types                 ---------
         *                   18 directionalities       ---------
         *                   2 mirroreds             -
         *                               -----------      56  toupper diffs
         *                   -----------                  48  tolower diffs
         *               ---                              4 totitlecase diffs
         * -------------                                 84 numeric values
         *     ---------                                 24 mirror char diffs
         */
        static uint32_t getPackedData(UChar32 c);

        /**
         * Get the Character type.
         * @param c The unicode character.
         * @return The character's type or CHARTYPE_UNASSIGNED if the character is invalid
         *         or has an unassigned class.
         */
        static CharType getType(UChar32 c);

        /**
         * Get the Character's decomposition type.
         * @param c The unicode character.
         * @return The character's decomposition type or DECOMPOSITION_NONE if there
         *         is no decomposition.
         */
        static DecompositionType getDecompositionType(UChar32 c);

        /**
         * Returns the digit value of a character or -1 if the character
         * is not within the specified radix.
         *
         * The digit value is computed for integer characters and letters
         * within the given radix. This function does not handle Roman Numerals,
         * fractions, or any other characters that may represent numbers.
         *
         * @param c The unicode character
         * @param radix The intended radix (defaults to 10).
         * @return The digit value or -1 if there is no digit value or if the value is outside the radix.
         */
        static int getDigitValue(UChar32 c, int radix = 10);

        /**
         * Return the numeric value of a character
         *
         * @param c The unicode character.
         * @return The numeric value of the character. -1 if the character has no numeric value,
         *         -2 if the character has a numeric value that is not representable by an integer.
         */
        static int getNumericValue(UChar32 c);

        /**
         * Convert the character to lowercase
         * @param c The unicode character.
         * @return The lowercase character equivalent of c. If c does not have a lowercase equivalent,
         *         the original character is returned.
         */
        static UChar32 toLower(UChar32 c);

        /**
         * Convert the character to uppercase
         * @param c The unicode character.
         * @return The uppercase character equivalent of c. If c does not have an uppercase equivalent,
         *         the original character is returned.
         */
        static UChar32 toUpper(UChar32 c);

        /**
         * Get the directionality of the character.
         * @param c The unicode character.
         * @return The direction of the character or DIRECTIONALITY_UNDEFINED.
         */
        static Direction getDirectionality(UChar32 c);

        /**
         * Check if the character is a mirrored character. This means that the character
         * has an equivalent character that is the mirror image of itself.
         * @param c The unicode character.
         * @return True iff c has a mirror equivalent.
         */
        static bool isMirrored(UChar32 c);

        /**
         * Return the mirror of the given character.
         * @param c The unicode character.
         * @return The mirror equivalent of c. If c does not have a mirror equivalent,
         *         the original character is returned.
         * @see isMirrored
         */
        static UChar32 toMirror(UChar32 c);

        /**
         * Convert the character to title case.
         * @param c The unicode character.
         * @return The titlecase equivalent of c. If c does not have a titlecase equivalent,
         *         the original character is returned.
         */
        static UChar32 toTitle(UChar32 c);

    };

} // namespace android

#endif // ANDROID_UNICODE_H