• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright (C) 2006 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  //
18  
19  #ifndef ANDROID_UNICODE_H
20  #define ANDROID_UNICODE_H
21  
22  #include <stdint.h>
23  #include <sys/types.h>
24  
25  #define REPLACEMENT_CHAR (0xFFFD)  // U+FFFD, the Unicode REPLACEMENT CHARACTER code point
26  
27  // The following typedef and its documentation are copied from umachine.h in ICU.
28  /**
29   * Define UChar32 as a type for single Unicode code points.
30   * UChar32 is a signed 32-bit integer (same as int32_t).
31   *
32   * The Unicode code point range is 0..0x10ffff.
33   * All other values (negative or >=0x110000) are illegal as Unicode code points.
34   * They may be used as sentinel values to indicate "done", "error"
35   * or similar non-code point conditions.
36   *
37   * @stable ICU 2.4
38   */
39  typedef int32_t UChar32;
40  
41  namespace android {
42  
43      class Encoding;  // Forward declaration only; no other use of Encoding appears in this header.
44      /**
45       * \class Unicode
46       *
47       * Static-only helper class for querying properties of Unicode characters.
48       * Each character is classified by one of the types listed in CharType and by
49       * one of the directionalities listed in Direction.
50       */
51      class Unicode
52      {
53      public:
54          /**
55           * Character directions specified in the Unicode standard. The values map
56           * directly to the DIRECTIONALITY_* constants of java.lang.Character.
57           */
58          enum Direction {
59              DIRECTIONALITY_UNDEFINED = -1,
60              DIRECTIONALITY_LEFT_TO_RIGHT,
61              DIRECTIONALITY_RIGHT_TO_LEFT,
62              DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
63              DIRECTIONALITY_EUROPEAN_NUMBER,
64              DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
65              DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
66              DIRECTIONALITY_ARABIC_NUMBER,
67              DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
68              DIRECTIONALITY_NONSPACING_MARK,
69              DIRECTIONALITY_BOUNDARY_NEUTRAL,
70              DIRECTIONALITY_PARAGRAPH_SEPARATOR,
71              DIRECTIONALITY_SEGMENT_SEPARATOR,
72              DIRECTIONALITY_WHITESPACE,
73              DIRECTIONALITY_OTHER_NEUTRALS,
74              DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
75              DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
76              DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
77              DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
78              DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
79          };
80  
81          /**
82           * Character types as specified in the Unicode standard. The values map
83           * directly to the character type constants of java.lang.Character.
84           */
85          enum CharType {
86              CHARTYPE_UNASSIGNED = 0,
87              CHARTYPE_UPPERCASE_LETTER,
88              CHARTYPE_LOWERCASE_LETTER,
89              CHARTYPE_TITLECASE_LETTER,
90              CHARTYPE_MODIFIER_LETTER,
91              CHARTYPE_OTHER_LETTER,
92              CHARTYPE_NON_SPACING_MARK,
93              CHARTYPE_ENCLOSING_MARK,
94              CHARTYPE_COMBINING_SPACING_MARK,
95              CHARTYPE_DECIMAL_DIGIT_NUMBER,
96              CHARTYPE_LETTER_NUMBER,
97              CHARTYPE_OTHER_NUMBER,
98              CHARTYPE_SPACE_SEPARATOR,
99              CHARTYPE_LINE_SEPARATOR,
100              CHARTYPE_PARAGRAPH_SEPARATOR,
101              CHARTYPE_CONTROL,
102              CHARTYPE_FORMAT,
103              CHARTYPE_MISSING_VALUE_FOR_JAVA,    /* Placeholder for value 17, which the java constants skip; keeps the following values aligned */
104              CHARTYPE_PRIVATE_USE,
105              CHARTYPE_SURROGATE,
106              CHARTYPE_DASH_PUNCTUATION,
107              CHARTYPE_START_PUNCTUATION,
108              CHARTYPE_END_PUNCTUATION,
109              CHARTYPE_CONNECTOR_PUNCTUATION,
110              CHARTYPE_OTHER_PUNCTUATION,
111              CHARTYPE_MATH_SYMBOL,
112              CHARTYPE_CURRENCY_SYMBOL,
113              CHARTYPE_MODIFIER_SYMBOL,
114              CHARTYPE_OTHER_SYMBOL,
115              CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
116              CHARTYPE_FINAL_QUOTE_PUNCTUATION
117          };
118  
119          /**
120           * Decomposition types as described by the Unicode standard. These values map to
121           * the same values in uchar.h in ICU.
122           */
123          enum DecompositionType {
124              DECOMPOSITION_NONE = 0,
125              DECOMPOSITION_CANONICAL,
126              DECOMPOSITION_COMPAT,
127              DECOMPOSITION_CIRCLE,
128              DECOMPOSITION_FINAL,
129              DECOMPOSITION_FONT,
130              DECOMPOSITION_FRACTION,
131              DECOMPOSITION_INITIAL,
132              DECOMPOSITION_ISOLATED,
133              DECOMPOSITION_MEDIAL,
134              DECOMPOSITION_NARROW,
135              DECOMPOSITION_NOBREAK,
136              DECOMPOSITION_SMALL,
137              DECOMPOSITION_SQUARE,
138              DECOMPOSITION_SUB,
139              DECOMPOSITION_SUPER,
140              DECOMPOSITION_VERTICAL,
141              DECOMPOSITION_WIDE
142          };
143  
144          /**
145           * Returns the packed data for java calls.
146           * @param c The Unicode character.
147           * @return The packed data for the character.
148           *
149           * Bit layout, copied from the java.lang.Character implementation:
150           * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
151           * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
152           *
153           *                              31 types                 ---------
154           *                   18 directionalities       ---------
155           *                   2 mirroreds             -
156           *                               -----------      56  toupper diffs
157           *                   -----------                  48  tolower diffs
158           *               ---                              4 totitlecase diffs
159           * -------------                                 84 numeric values
160           *     ---------                                 24 mirror char diffs
161           */
162          static uint32_t getPackedData(UChar32 c);
163  
164          /**
165           * Get the character's type.
166           * @param c The Unicode character.
167           * @return The character's type or CHARTYPE_UNASSIGNED if the character is invalid
168           *         or has an unassigned class.
169           */
170          static CharType getType(UChar32 c);
171  
172          /**
173           * Get the character's decomposition type.
174           * @param c The Unicode character.
175           * @return The character's decomposition type or DECOMPOSITION_NONE if there
176           *         is no decomposition.
177           */
178          static DecompositionType getDecompositionType(UChar32 c);
179  
180          /**
181           * Returns the digit value of a character or -1 if the character
182           * is not within the specified radix.
183           *
184           * The digit value is computed for integer characters and letters
185           * within the given radix. This function does not handle Roman numerals,
186           * fractions, or any other characters that may represent numbers.
187           *
188           * @param c The Unicode character.
189           * @param radix The intended radix (defaults to 10).
190           * @return The digit value or -1 if there is no digit value or if the value is outside the radix.
191           */
192          static int getDigitValue(UChar32 c, int radix = 10);
193  
194          /**
195           * Return the numeric value of a character.
196           *
197           * @param c The Unicode character.
198           * @return The numeric value of the character. -1 if the character has no numeric value,
199           *         -2 if the character has a numeric value that is not representable by an integer.
200           */
201          static int getNumericValue(UChar32 c);
202  
203          /**
204           * Convert the character to lowercase.
205           * @param c The Unicode character.
206           * @return The lowercase character equivalent of c. If c does not have a lowercase equivalent,
207           *         the original character is returned.
208           */
209          static UChar32 toLower(UChar32 c);
210  
211          /**
212           * Convert the character to uppercase.
213           * @param c The Unicode character.
214           * @return The uppercase character equivalent of c. If c does not have an uppercase equivalent,
215           *         the original character is returned.
216           */
217          static UChar32 toUpper(UChar32 c);
218  
219          /**
220           * Get the directionality of the character.
221           * @param c The Unicode character.
222           * @return The direction of the character or DIRECTIONALITY_UNDEFINED.
223           */
224          static Direction getDirectionality(UChar32 c);
225  
226          /**
227           * Check if the character is a mirrored character, i.e. whether there is
228           * an equivalent character that is its mirror image.
229           * @param c The Unicode character.
230           * @return True iff c has a mirror equivalent.
231           */
232          static bool isMirrored(UChar32 c);
233  
234          /**
235           * Return the mirror of the given character.
236           * @param c The Unicode character.
237           * @return The mirror equivalent of c. If c does not have a mirror equivalent,
238           *         the original character is returned.
239           * @see isMirrored
240           */
241          static UChar32 toMirror(UChar32 c);
242  
243          /**
244           * Convert the character to title case.
245           * @param c The Unicode character.
246           * @return The titlecase equivalent of c. If c does not have a titlecase equivalent,
247           *         the original character is returned.
248           */
249          static UChar32 toTitle(UChar32 c);
250  
251     };
252  
253  }
254  
255  #endif
256