/*
 * Copyright (C) 2006 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//
// Declarations for querying per-character Unicode properties
// (type, directionality, decomposition, case mapping, mirroring).
//

#ifndef ANDROID_UNICODE_H
#define ANDROID_UNICODE_H

#include <stdint.h>
#include <sys/types.h>

#define REPLACEMENT_CHAR (0xFFFD)

// this part of code is copied from umachine.h under ICU
/**
 * Define UChar32 as a type for single Unicode code points.
 * UChar32 is a signed 32-bit integer (same as int32_t).
 *
 * The Unicode code point range is 0..0x10ffff.
 * All other values (negative or >=0x110000) are illegal as Unicode code points.
 * They may be used as sentinel values to indicate "done", "error"
 * or similar non-code point conditions.
 *
 * @stable ICU 2.4
 */
typedef int32_t UChar32;

namespace android {

class Encoding;

/**
 * \class Unicode
 *
 * Helper class for getting properties of Unicode characters. Characters
 * can have one of the types listed in CharType and each character can have the
 * directionality of Direction.
 *
 * All members are static; the class only serves as a scope for the
 * enumerations and lookup functions declared below.
 */
class Unicode
{
public:
    /**
     * Directions specified in the Unicode standard. These directions map directly
     * to java.lang.Character.
     *
     * Only DIRECTIONALITY_UNDEFINED has an explicit value (-1); the remaining
     * enumerators are numbered consecutively from 0 in declaration order, so
     * the order must not be changed.
     */
    enum Direction {
        DIRECTIONALITY_UNDEFINED = -1,
        DIRECTIONALITY_LEFT_TO_RIGHT,
        DIRECTIONALITY_RIGHT_TO_LEFT,
        DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
        DIRECTIONALITY_EUROPEAN_NUMBER,
        DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR,
        DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR,
        DIRECTIONALITY_ARABIC_NUMBER,
        DIRECTIONALITY_COMMON_NUMBER_SEPARATOR,
        DIRECTIONALITY_NONSPACING_MARK,
        DIRECTIONALITY_BOUNDARY_NEUTRAL,
        DIRECTIONALITY_PARAGRAPH_SEPARATOR,
        DIRECTIONALITY_SEGMENT_SEPARATOR,
        DIRECTIONALITY_WHITESPACE,
        DIRECTIONALITY_OTHER_NEUTRALS,
        DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
        DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE,
        DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING,
        DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
        DIRECTIONALITY_POP_DIRECTIONAL_FORMAT
    };

    /**
     * Character types as specified in the Unicode standard. These map directly to
     * java.lang.Character.
     *
     * Enumerators are numbered consecutively from 0 in declaration order, so
     * the order (including the placeholder at position 17) must not be changed.
     */
    enum CharType {
        CHARTYPE_UNASSIGNED = 0,
        CHARTYPE_UPPERCASE_LETTER,
        CHARTYPE_LOWERCASE_LETTER,
        CHARTYPE_TITLECASE_LETTER,
        CHARTYPE_MODIFIER_LETTER,
        CHARTYPE_OTHER_LETTER,
        CHARTYPE_NON_SPACING_MARK,
        CHARTYPE_ENCLOSING_MARK,
        CHARTYPE_COMBINING_SPACING_MARK,
        CHARTYPE_DECIMAL_DIGIT_NUMBER,
        CHARTYPE_LETTER_NUMBER,
        CHARTYPE_OTHER_NUMBER,
        CHARTYPE_SPACE_SEPARATOR,
        CHARTYPE_LINE_SEPARATOR,
        CHARTYPE_PARAGRAPH_SEPARATOR,
        CHARTYPE_CONTROL,
        CHARTYPE_FORMAT,
        CHARTYPE_MISSING_VALUE_FOR_JAVA,    /* This is the mysterious missing 17 value from the java constants */
        CHARTYPE_PRIVATE_USE,
        CHARTYPE_SURROGATE,
        CHARTYPE_DASH_PUNCTUATION,
        CHARTYPE_START_PUNCTUATION,
        CHARTYPE_END_PUNCTUATION,
        CHARTYPE_CONNECTOR_PUNCTUATION,
        CHARTYPE_OTHER_PUNCTUATION,
        CHARTYPE_MATH_SYMBOL,
        CHARTYPE_CURRENCY_SYMBOL,
        CHARTYPE_MODIFIER_SYMBOL,
        CHARTYPE_OTHER_SYMBOL,
        CHARTYPE_INITIAL_QUOTE_PUNCTUATION,
        CHARTYPE_FINAL_QUOTE_PUNCTUATION
    };

    /**
     * Decomposition types as described by the Unicode standard. These values map to
     * the same values in uchar.h in ICU.
     *
     * Enumerators are numbered consecutively from 0 in declaration order.
     */
    enum DecompositionType {
        DECOMPOSITION_NONE = 0,
        DECOMPOSITION_CANONICAL,
        DECOMPOSITION_COMPAT,
        DECOMPOSITION_CIRCLE,
        DECOMPOSITION_FINAL,
        DECOMPOSITION_FONT,
        DECOMPOSITION_FRACTION,
        DECOMPOSITION_INITIAL,
        DECOMPOSITION_ISOLATED,
        DECOMPOSITION_MEDIAL,
        DECOMPOSITION_NARROW,
        DECOMPOSITION_NOBREAK,
        DECOMPOSITION_SMALL,
        DECOMPOSITION_SQUARE,
        DECOMPOSITION_SUB,
        DECOMPOSITION_SUPER,
        DECOMPOSITION_VERTICAL,
        DECOMPOSITION_WIDE
    };

    /**
     * Returns the packed data for java calls
     * @param c The unicode character.
     * @return The packed data for the character.
     *
     * Copied from java.lang.Character implementation:
     * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
     * F E D C B A 9 8 7 6 5 4 3 2 1 0 F E D C B A 9 8 7 6 5 4 3 2 1 0
     *
     * 31 types ---------
     * 18 directionalities ---------
     * 2 mirroreds -
     * ----------- 56 toupper diffs
     * ----------- 48 tolower diffs
     * --- 4 totitlecase diffs
     * ------------- 84 numeric values
     * --------- 24 mirror char diffs
     *
     * NOTE(review): the horizontal alignment of the field markers against the
     * bit columns above is not reliable in this copy; confirm the exact bit
     * positions against the java.lang.Character implementation before
     * depending on them.
     */
    static uint32_t getPackedData(UChar32 c);

    /**
     * Get the Character type.
     * @param c The unicode character.
     * @return The character's type or CHARTYPE_UNASSIGNED if the character is invalid
     *         or has an unassigned class.
     */
    static CharType getType(UChar32 c);

    /**
     * Get the Character's decomposition type.
     * @param c The unicode character.
     * @return The character's decomposition type or DECOMPOSITION_NONE if there
     *         is no decomposition.
     */
    static DecompositionType getDecompositionType(UChar32 c);

    /**
     * Returns the digit value of a character or -1 if the character
     * is not within the specified radix.
     *
     * The digit value is computed for integer characters and letters
     * within the given radix. This function does not handle Roman Numerals,
     * fractions, or any other characters that may represent numbers.
     *
     * @param c The unicode character
     * @param radix The intended radix (defaults to 10).
     * @return The digit value or -1 if there is no digit value or if the value is outside the radix.
     */
    static int getDigitValue(UChar32 c, int radix = 10);

    /**
     * Return the numeric value of a character
     *
     * @param c The unicode character.
     * @return The numeric value of the character.  -1 if the character has no numeric value,
     *         -2 if the character has a numeric value that is not representable by an integer.
     */
    static int getNumericValue(UChar32 c);

    /**
     * Convert the character to lowercase
     * @param c The unicode character.
     * @return The lowercase character equivalent of c.  If c does not have a lowercase equivalent,
     *         the original character is returned.
     */
    static UChar32 toLower(UChar32 c);

    /**
     * Convert the character to uppercase
     * @param c The unicode character.
     * @return The uppercase character equivalent of c.  If c does not have an uppercase equivalent,
     *         the original character is returned.
     */
    static UChar32 toUpper(UChar32 c);

    /**
     * Get the directionality of the character.
     * @param c The unicode character.
     * @return The direction of the character or DIRECTIONALITY_UNDEFINED.
     */
    static Direction getDirectionality(UChar32 c);

    /**
     * Check if the character is a mirrored character.  This means that the character
     * has an equivalent character that is the mirror image of itself.
     * @param c The unicode character.
     * @return True iff c has a mirror equivalent.
     */
    static bool isMirrored(UChar32 c);

    /**
     * Return the mirror of the given character.
     * @param c The unicode character.
     * @return The mirror equivalent of c.  If c does not have a mirror equivalent,
     *         the original character is returned.
     * @see isMirrored
     */
    static UChar32 toMirror(UChar32 c);

    /**
     * Convert the character to title case.
     * @param c The unicode character.
     * @return The titlecase equivalent of c.  If c does not have a titlecase equivalent,
     *         the original character is returned.
     */
    static UChar32 toTitle(UChar32 c);

};

} // namespace android

#endif // ANDROID_UNICODE_H