1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin; 18 19 import android.text.TextUtils; 20 21 import com.android.inputmethod.keyboard.Keyboard; // For character constants 22 23 import java.util.ArrayList; 24 import java.util.Locale; 25 26 public final class StringUtils { StringUtils()27 private StringUtils() { 28 // This utility class is not publicly instantiable. 29 } 30 codePointCount(String text)31 public static int codePointCount(String text) { 32 if (TextUtils.isEmpty(text)) return 0; 33 return text.codePointCount(0, text.length()); 34 } 35 containsInArray(String key, String[] array)36 public static boolean containsInArray(String key, String[] array) { 37 for (final String element : array) { 38 if (key.equals(element)) return true; 39 } 40 return false; 41 } 42 containsInCsv(String key, String csv)43 public static boolean containsInCsv(String key, String csv) { 44 if (TextUtils.isEmpty(csv)) return false; 45 return containsInArray(key, csv.split(",")); 46 } 47 appendToCsvIfNotExists(String key, String csv)48 public static String appendToCsvIfNotExists(String key, String csv) { 49 if (TextUtils.isEmpty(csv)) return key; 50 if (containsInCsv(key, csv)) return csv; 51 return csv + "," + key; 52 } 53 removeFromCsvIfExists(String key, String csv)54 public static String removeFromCsvIfExists(String key, String csv) { 55 if (TextUtils.isEmpty(csv)) return ""; 56 final String[] elements = csv.split(","); 57 if (!containsInArray(key, elements)) return csv; 58 final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1); 59 for (final String element : elements) { 60 if (!key.equals(element)) result.add(element); 61 } 62 return TextUtils.join(",", result); 63 } 64 65 /** 66 * Returns true if a and b are equal ignoring the case of the character. 67 * @param a first character to check 68 * @param b second character to check 69 * @return {@code true} if a and b are equal, {@code false} otherwise. 70 */ equalsIgnoreCase(char a, char b)71 public static boolean equalsIgnoreCase(char a, char b) { 72 // Some language, such as Turkish, need testing both cases. 73 return a == b 74 || Character.toLowerCase(a) == Character.toLowerCase(b) 75 || Character.toUpperCase(a) == Character.toUpperCase(b); 76 } 77 78 /** 79 * Returns true if a and b are equal ignoring the case of the characters, including if they are 80 * both null. 81 * @param a first CharSequence to check 82 * @param b second CharSequence to check 83 * @return {@code true} if a and b are equal, {@code false} otherwise. 84 */ equalsIgnoreCase(CharSequence a, CharSequence b)85 public static boolean equalsIgnoreCase(CharSequence a, CharSequence b) { 86 if (a == b) 87 return true; // including both a and b are null. 88 if (a == null || b == null) 89 return false; 90 final int length = a.length(); 91 if (length != b.length()) 92 return false; 93 for (int i = 0; i < length; i++) { 94 if (!equalsIgnoreCase(a.charAt(i), b.charAt(i))) 95 return false; 96 } 97 return true; 98 } 99 100 /** 101 * Returns true if a and b are equal ignoring the case of the characters, including if a is null 102 * and b is zero length. 103 * @param a CharSequence to check 104 * @param b character array to check 105 * @param offset start offset of array b 106 * @param length length of characters in array b 107 * @return {@code true} if a and b are equal, {@code false} otherwise. 108 * @throws IndexOutOfBoundsException 109 * if {@code offset < 0 || length < 0 || offset + length > data.length}. 110 * @throws NullPointerException if {@code b == null}. 111 */ equalsIgnoreCase(CharSequence a, char[] b, int offset, int length)112 public static boolean equalsIgnoreCase(CharSequence a, char[] b, int offset, int length) { 113 if (offset < 0 || length < 0 || length > b.length - offset) 114 throw new IndexOutOfBoundsException("array.length=" + b.length + " offset=" + offset 115 + " length=" + length); 116 if (a == null) 117 return length == 0; // including a is null and b is zero length. 118 if (a.length() != length) 119 return false; 120 for (int i = 0; i < length; i++) { 121 if (!equalsIgnoreCase(a.charAt(i), b[offset + i])) 122 return false; 123 } 124 return true; 125 } 126 127 /** 128 * Remove duplicates from an array of strings. 129 * 130 * This method will always keep the first occurrence of all strings at their position 131 * in the array, removing the subsequent ones. 132 */ removeDupes(final ArrayList<CharSequence> suggestions)133 public static void removeDupes(final ArrayList<CharSequence> suggestions) { 134 if (suggestions.size() < 2) return; 135 int i = 1; 136 // Don't cache suggestions.size(), since we may be removing items 137 while (i < suggestions.size()) { 138 final CharSequence cur = suggestions.get(i); 139 // Compare each suggestion with each previous suggestion 140 for (int j = 0; j < i; j++) { 141 CharSequence previous = suggestions.get(j); 142 if (TextUtils.equals(cur, previous)) { 143 suggestions.remove(i); 144 i--; 145 break; 146 } 147 } 148 i++; 149 } 150 } 151 toTitleCase(String s, Locale locale)152 public static String toTitleCase(String s, Locale locale) { 153 if (s.length() <= 1) { 154 // TODO: is this really correct? Shouldn't this be s.toUpperCase()? 155 return s; 156 } 157 // TODO: fix the bugs below 158 // - This does not work for Greek, because it returns upper case instead of title case. 159 // - It does not work for Serbian, because it fails to account for the "lj" character, 160 // which should be "Lj" in title case and "LJ" in upper case. 161 // - It does not work for Dutch, because it fails to account for the "ij" digraph, which 162 // are two different characters but both should be capitalized as "IJ" as if they were 163 // a single letter. 164 // - It also does not work with unicode surrogate code points. 165 return s.toUpperCase(locale).charAt(0) + s.substring(1); 166 } 167 toCodePointArray(final String string)168 public static int[] toCodePointArray(final String string) { 169 final char[] characters = string.toCharArray(); 170 final int length = characters.length; 171 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 172 if (length <= 0) { 173 return new int[0]; 174 } 175 int codePoint = Character.codePointAt(characters, 0); 176 int dsti = 0; 177 for (int srci = Character.charCount(codePoint); 178 srci < length; srci += Character.charCount(codePoint), ++dsti) { 179 codePoints[dsti] = codePoint; 180 codePoint = Character.codePointAt(characters, srci); 181 } 182 codePoints[dsti] = codePoint; 183 return codePoints; 184 } 185 186 /** 187 * Determine what caps mode should be in effect at the current offset in 188 * the text. Only the mode bits set in <var>reqModes</var> will be 189 * checked. Note that the caps mode flags here are explicitly defined 190 * to match those in {@link InputType}. 191 * 192 * This code is a straight copy of TextUtils.getCapsMode (modulo namespace and formatting 193 * issues). This will change in the future as we simplify the code for our use and fix bugs. 194 * 195 * @param cs The text that should be checked for caps modes. 196 * @param reqModes The modes to be checked: may be any combination of 197 * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and 198 * {@link TextUtils#CAP_MODE_SENTENCES}. 199 * @param locale The locale to consider for capitalization rules 200 * @param hasSpaceBefore Whether we should consider there is a space inserted at the end of cs 201 * 202 * @return Returns the actual capitalization modes that can be in effect 203 * at the current position, which is any combination of 204 * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and 205 * {@link TextUtils#CAP_MODE_SENTENCES}. 206 */ getCapsMode(final CharSequence cs, final int reqModes, final Locale locale, final boolean hasSpaceBefore)207 public static int getCapsMode(final CharSequence cs, final int reqModes, final Locale locale, 208 final boolean hasSpaceBefore) { 209 // Quick description of what we want to do: 210 // CAP_MODE_CHARACTERS is always on. 211 // CAP_MODE_WORDS is on if there is some whitespace before the cursor. 212 // CAP_MODE_SENTENCES is on if there is some whitespace before the cursor, and the end 213 // of a sentence just before that. 214 // We ignore opening parentheses and the like just before the cursor for purposes of 215 // finding whitespace for WORDS and SENTENCES modes. 216 // The end of a sentence ends with a period, question mark or exclamation mark. If it's 217 // a period, it also needs not to be an abbreviation, which means it also needs to either 218 // be immediately preceded by punctuation, or by a string of only letters with single 219 // periods interleaved. 220 221 // Step 1 : check for cap MODE_CHARACTERS. If it's looked for, it's always on. 222 if ((reqModes & (TextUtils.CAP_MODE_WORDS | TextUtils.CAP_MODE_SENTENCES)) == 0) { 223 // Here we are not looking for MODE_WORDS or MODE_SENTENCES, so since we already 224 // evaluated MODE_CHARACTERS, we can return. 225 return TextUtils.CAP_MODE_CHARACTERS & reqModes; 226 } 227 228 // Step 2 : Skip (ignore at the end of input) any opening punctuation. This includes 229 // opening parentheses, brackets, opening quotes, everything that *opens* a span of 230 // text in the linguistic sense. In RTL languages, this is still an opening sign, although 231 // it may look like a right parenthesis for example. We also include double quote and 232 // single quote since they aren't start punctuation in the unicode sense, but should still 233 // be skipped for English. TODO: does this depend on the language? 234 int i; 235 if (hasSpaceBefore) { 236 i = cs.length() + 1; 237 } else { 238 for (i = cs.length(); i > 0; i--) { 239 final char c = cs.charAt(i - 1); 240 if (c != Keyboard.CODE_DOUBLE_QUOTE && c != Keyboard.CODE_SINGLE_QUOTE 241 && Character.getType(c) != Character.START_PUNCTUATION) { 242 break; 243 } 244 } 245 } 246 247 // We are now on the character that precedes any starting punctuation, so in the most 248 // frequent case this will be whitespace or a letter, although it may occasionally be a 249 // start of line, or some symbol. 250 251 // Step 3 : Search for the start of a paragraph. From the starting point computed in step 2, 252 // we go back over any space or tab char sitting there. We find the start of a paragraph 253 // if the first char that's not a space or tab is a start of line (as in \n, start of text, 254 // or some other similar characters). 255 int j = i; 256 char prevChar = Keyboard.CODE_SPACE; 257 if (hasSpaceBefore) --j; 258 while (j > 0) { 259 prevChar = cs.charAt(j - 1); 260 if (!Character.isSpaceChar(prevChar) && prevChar != Keyboard.CODE_TAB) break; 261 j--; 262 } 263 if (j <= 0 || Character.isWhitespace(prevChar)) { 264 // There are only spacing chars between the start of the paragraph and the cursor, 265 // defined as a isWhitespace() char that is neither a isSpaceChar() nor a tab. Both 266 // MODE_WORDS and MODE_SENTENCES should be active. 267 return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS 268 | TextUtils.CAP_MODE_SENTENCES) & reqModes; 269 } 270 if (i == j) { 271 // If we don't have whitespace before index i, it means neither MODE_WORDS 272 // nor mode sentences should be on so we can return right away. 273 return TextUtils.CAP_MODE_CHARACTERS & reqModes; 274 } 275 if ((reqModes & TextUtils.CAP_MODE_SENTENCES) == 0) { 276 // Here we know we have whitespace before the cursor (if not, we returned in the above 277 // if i == j clause), so we need MODE_WORDS to be on. And we don't need to evaluate 278 // MODE_SENTENCES so we can return right away. 279 return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes; 280 } 281 // Please note that because of the reqModes & CAP_MODE_SENTENCES test a few lines above, 282 // we know that MODE_SENTENCES is being requested. 283 284 // Step 4 : Search for MODE_SENTENCES. 285 // English is a special case in that "American typography" rules, which are the most common 286 // in English, state that a sentence terminator immediately following a quotation mark 287 // should be swapped with it and de-duplicated (included in the quotation mark), 288 // e.g. <<Did he say, "let's go home?">> 289 // No other language has such a rule as far as I know, instead putting inside the quotation 290 // mark as the exact thing quoted and handling the surrounding punctuation independently, 291 // e.g. <<Did he say, "let's go home"?>> 292 // Hence, specifically for English, we treat this special case here. 293 if (Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) { 294 for (; j > 0; j--) { 295 // Here we look to go over any closing punctuation. This is because in dominant 296 // variants of English, the final period is placed within double quotes and maybe 297 // other closing punctuation signs. This is generally not true in other languages. 298 final char c = cs.charAt(j - 1); 299 if (c != Keyboard.CODE_DOUBLE_QUOTE && c != Keyboard.CODE_SINGLE_QUOTE 300 && Character.getType(c) != Character.END_PUNCTUATION) { 301 break; 302 } 303 } 304 } 305 306 if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes; 307 char c = cs.charAt(--j); 308 309 // We found the next interesting chunk of text ; next we need to determine if it's the 310 // end of a sentence. If we have a question mark or an exclamation mark, it's the end of 311 // a sentence. If it's neither, the only remaining case is the period so we get the opposite 312 // case out of the way. 313 if (c == Keyboard.CODE_QUESTION_MARK || c == Keyboard.CODE_EXCLAMATION_MARK) { 314 return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes; 315 } 316 if (c != Keyboard.CODE_PERIOD || j <= 0) { 317 return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes; 318 } 319 320 // We found out that we have a period. We need to determine if this is a full stop or 321 // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation 322 // looks like (\w\.){2,} 323 // To find out, we will have a simple state machine with the following states : 324 // START, WORD, PERIOD, ABBREVIATION 325 // On START : (just before the first period) 326 // letter => WORD 327 // whitespace => end with no caps (it was a stand-alone period) 328 // otherwise => end with caps (several periods/symbols in a row) 329 // On WORD : (within the word just before the first period) 330 // letter => WORD 331 // period => PERIOD 332 // otherwise => end with caps (it was a word with a full stop at the end) 333 // On PERIOD : (period within a potential abbreviation) 334 // letter => LETTER 335 // otherwise => end with caps (it was not an abbreviation) 336 // On LETTER : (letter within a potential abbreviation) 337 // letter => LETTER 338 // period => PERIOD 339 // otherwise => end with no caps (it was an abbreviation) 340 // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This 341 // should capitalize. 342 343 final int START = 0; 344 final int WORD = 1; 345 final int PERIOD = 2; 346 final int LETTER = 3; 347 final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS 348 | TextUtils.CAP_MODE_SENTENCES) & reqModes; 349 final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes; 350 int state = START; 351 while (j > 0) { 352 c = cs.charAt(--j); 353 switch (state) { 354 case START: 355 if (Character.isLetter(c)) { 356 state = WORD; 357 } else if (Character.isWhitespace(c)) { 358 return noCaps; 359 } else { 360 return caps; 361 } 362 break; 363 case WORD: 364 if (Character.isLetter(c)) { 365 state = WORD; 366 } else if (c == Keyboard.CODE_PERIOD) { 367 state = PERIOD; 368 } else { 369 return caps; 370 } 371 break; 372 case PERIOD: 373 if (Character.isLetter(c)) { 374 state = LETTER; 375 } else { 376 return caps; 377 } 378 break; 379 case LETTER: 380 if (Character.isLetter(c)) { 381 state = LETTER; 382 } else if (c == Keyboard.CODE_PERIOD) { 383 state = PERIOD; 384 } else { 385 return noCaps; 386 } 387 } 388 } 389 // Here we arrived at the start of the line. This should behave exactly like whitespace. 390 return (START == state || LETTER == state) ? noCaps : caps; 391 } 392 } 393