1 /* 2 * Copyright (C) 2025 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.textclassifier; 17 18 import static java.lang.String.format; 19 20 import android.icu.util.ULocale; 21 import android.os.Bundle; 22 import android.util.ArrayMap; 23 import android.view.textclassifier.TextLanguage; 24 import android.view.textclassifier.TextLinks; 25 26 import androidx.annotation.NonNull; 27 import androidx.annotation.Nullable; 28 29 import com.android.textclassifier.common.base.TcLog; 30 import com.android.textclassifier.utils.TextClassifierUtils; 31 32 import com.google.common.annotations.VisibleForTesting; 33 34 import java.io.IOException; 35 import java.util.Collections; 36 import java.util.HashSet; 37 import java.util.Map; 38 import java.util.Set; 39 import java.util.regex.Matcher; 40 import java.util.regex.Pattern; 41 42 /** 43 * Class with helper methods related to detecting OTP codes in a text. 44 */ 45 public class TextClassifierOtpHelper { 46 private static final String TAG = TextClassifierOtpHelper.class.getSimpleName(); 47 48 private static final int PATTERN_FLAGS = 49 Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.MULTILINE; 50 compileToRegex(String pattern)51 private static ThreadLocal<Matcher> compileToRegex(String pattern) { 52 return ThreadLocal.withInitial(() -> Pattern.compile(pattern, PATTERN_FLAGS).matcher("")); 53 } 54 55 private static final float TC_THRESHOLD = 0.6f; 56 57 private static final ArrayMap<String, ThreadLocal<Matcher>> EXTRA_LANG_OTP_REGEX = 58 new ArrayMap<>(); 59 60 private static final ThreadLocal<Matcher> OTP_REGEX = compileToRegex(RegExStrings.ALL_OTP); 61 62 /** 63 * A combination of common false positives. These matches are expected to be longer than (or equal 64 * in length to) otp matches 65 */ 66 private static final ThreadLocal<Matcher> FALSE_POSITIVE_REGEX = 67 compileToRegex(RegExStrings.FALSE_POSITIVE); 68 69 /** 70 * Creates a regular expression to match any of a series of individual words, case insensitive. It 71 * also verifies the position of the word, relative to the OTP match 72 */ createDictionaryRegex(String[] words)73 private static ThreadLocal<Matcher> createDictionaryRegex(String[] words) { 74 StringBuilder regex = new StringBuilder("("); 75 for (int i = 0; i < words.length; i++) { 76 regex.append(findContextWordWithCode(words[i])); 77 if (i != words.length - 1) { 78 regex.append("|"); 79 } 80 } 81 regex.append(")"); 82 return compileToRegex(regex.toString()); 83 } 84 85 /** 86 * Creates a regular expression that will find a context word, if that word occurs in the sentence 87 * preceding an OTP, or in the same sentence as an OTP (before or after). In both cases, the 88 * context word must occur within 50 characters of the suspected OTP 89 * 90 * @param contextWord The context word we expect to find around the OTP match 91 * @return A string representing a regular expression that will determine if we found a context 92 * word occurring before an otp match, or after it, but in the same sentence. 93 */ findContextWordWithCode(String contextWord)94 private static String findContextWordWithCode(String contextWord) { 95 String boundedContext = "\\b" + contextWord + "\\b"; 96 // Asserts that we find the OTP code within 50 characters after the context word, with at 97 // most one sentence punctuation between the OTP code and the context word (i.e. they are 98 // in the same sentence, or the context word is in the previous sentence) 99 String contextWordBeforeOtpInSameOrPreviousSentence = 100 String.format("(%s(?=.{1,50}%s)[^.?!]*[.?!]?[^.?!]*%s)", boundedContext, RegExStrings.ALL_OTP, RegExStrings.ALL_OTP); 101 // Asserts that we find the context word within 50 characters after the OTP code, with no 102 // sentence punctuation between the OTP code and the context word (i.e. they are in the same 103 // sentence) 104 String contextWordAfterOtpSameSentence = 105 String.format("(%s)[^.!?]{1,50}%s", RegExStrings.ALL_OTP, boundedContext); 106 return String.format( 107 "(%s|%s)", contextWordBeforeOtpInSameOrPreviousSentence, contextWordAfterOtpSameSentence); 108 } 109 110 static { 111 EXTRA_LANG_OTP_REGEX.put( ULocale.ENGLISH.toLanguageTag()112 ULocale.ENGLISH.toLanguageTag(), createDictionaryRegex(RegExStrings.ENGLISH_CONTEXT_WORDS)); 113 } 114 115 /** 116 * Checks if the text might contain an OTP, if so, adds a link to the builder with type as OTP 117 * 118 * @param text The text whose content should be checked for OTP 119 * @param tcImpl Instance of the TextClassifierImpl 120 * @param builder TextLinks builder object to whom the OTP link to be added 121 */ addOtpLink(@onNull String text, @NonNull TextClassifierImpl tcImpl, @NonNull TextLinks.Builder builder)122 public static void addOtpLink(@NonNull String text, @NonNull TextClassifierImpl tcImpl, 123 @NonNull TextLinks.Builder builder) { 124 if (!containsOtp(text, tcImpl)) { 125 return; 126 } 127 final Map<String, Float> entityScores = Collections.singletonMap(TextClassifierUtils.TYPE_OTP, 128 1f); 129 builder.addLink(0, 0, entityScores, new Bundle()); 130 } 131 132 /** 133 * Checks if a string of text might contain an OTP, based on several regular expressions, and 134 * potentially using a textClassifier to eliminate false positives 135 * 136 * @param text The text whose content should be checked 137 * @param tcImpl If non null, the provided TextClassifierImpl will be used to find the language 138 * of the text, and look for a language-specific regex for it. 139 * @return True if we believe an OTP is in the message, false otherwise. 140 */ containsOtp( @onNull String text, @NonNull TextClassifierImpl tcImpl)141 protected static boolean containsOtp( 142 @NonNull String text, 143 @NonNull TextClassifierImpl tcImpl) { 144 if (!containsOtpLikePattern(text)) { 145 return false; 146 } 147 148 ULocale language = getLanguageWithRegex(text, tcImpl); 149 if (language == null) { 150 return false; 151 } 152 return hasLanguageSpecificOtpWord(text, language.toLanguageTag()); 153 } 154 155 /** 156 * Checks if the given text contains a pattern resembling an OTP. 157 * 158 * <p>This method attempts to identify such patterns by matching against a regular expression. 159 * Avoids false positives by checking for common patterns that might be mistaken for OTPs, such 160 * as phone numbers or dates.</p> 161 * 162 * @param text The text to be checked. 163 * @return {@code true} if the text contains an OTP-like pattern, {@code false} otherwise. 164 */ 165 @VisibleForTesting containsOtpLikePattern(String text)166 protected static boolean containsOtpLikePattern(String text) { 167 Set<String> otpMatches = getAllMatches(text, OTP_REGEX.get()); 168 if (otpMatches.isEmpty()) { 169 return false; 170 } 171 Set<String> falsePositives = getAllMatches(text, FALSE_POSITIVE_REGEX.get()); 172 173 // This optional, but having this would help with performance 174 // Example: "Your OTP code is 1234 and this is sent on 01-01-2001" 175 // At this point -> otpMatches: [1234, 01-01-2001] falsePositives=[01-01-2001] 176 // It filters "01-01-2001" in advance and continues to next checks with otpMatches: [1234] 177 otpMatches.removeAll(falsePositives); 178 179 // Following is to handle text like: "Your OTP can't be shared at this point, please call 180 // (888) 888-8888" 181 // otpMatches: [888-8888] falsePositives=[(888) 888-8888] final=[] 182 for (String otpMatch : otpMatches) { 183 boolean currentOtpIsFalsePositive = false; 184 for (String falsePositive : falsePositives) { 185 if (falsePositive.contains(otpMatch)) { 186 currentOtpIsFalsePositive = true; 187 break; 188 } 189 } 190 if (!currentOtpIsFalsePositive) { 191 return true; 192 } 193 } 194 return false; 195 } 196 197 /** 198 * Checks if the given text contains a language-specific word or phrase associated with OTPs. 199 * This method uses regular expressions defined for specific languages to identify these words. 200 * 201 * @param text The text to check. 202 * @param languageTag The language tag (e.g., "en", "es", "fr") for which to check. 203 * @return {@code true} if the text contains a language-specific OTP word, {@code false} otherwise. 204 * Returns {@code false} if no language-specific regex is defined for the given tag. 205 */ 206 @VisibleForTesting hasLanguageSpecificOtpWord(@onNull String text, @NonNull String languageTag)207 protected static boolean hasLanguageSpecificOtpWord(@NonNull String text, @NonNull String languageTag) { 208 if (!EXTRA_LANG_OTP_REGEX.containsKey(languageTag)){ 209 return false; 210 } 211 Matcher languageSpecificMatcher = EXTRA_LANG_OTP_REGEX.get(languageTag).get(); 212 if (languageSpecificMatcher == null) { 213 return false; 214 } 215 languageSpecificMatcher.reset(text); 216 return languageSpecificMatcher.find(); 217 } 218 getAllMatches(String text, Matcher regex)219 private static Set<String> getAllMatches(String text, Matcher regex) { 220 Set<String> matches = new HashSet<>(); 221 regex.reset(text); 222 while (regex.find()) { 223 matches.add(regex.group()); 224 } 225 return matches; 226 } 227 228 // Tries to determine the language of the given text. Will return the language with the highest 229 // confidence score that meets the minimum threshold, and has a language-specific regex, null 230 // otherwise 231 @Nullable getLanguageWithRegex(String text, @NonNull TextClassifierImpl tcImpl)232 private static ULocale getLanguageWithRegex(String text, @NonNull TextClassifierImpl tcImpl) { 233 float highestConfidence = 0; 234 ULocale highestConfidenceLocale = null; 235 TextLanguage.Request langRequest = new TextLanguage.Request.Builder(text).build(); 236 TextLanguage lang; 237 try { 238 lang = tcImpl.detectLanguage(null, null, langRequest); 239 } catch (IOException e) { 240 TcLog.e(TAG, "Except detecting language", e); 241 return null; 242 } 243 for (int i = 0; i < lang.getLocaleHypothesisCount(); i++) { 244 ULocale locale = lang.getLocale(i); 245 float confidence = lang.getConfidenceScore(locale); 246 if (confidence >= TC_THRESHOLD 247 && confidence >= highestConfidence 248 && EXTRA_LANG_OTP_REGEX.containsKey(locale.toLanguageTag())) { 249 highestConfidence = confidence; 250 highestConfidenceLocale = locale; 251 } 252 } 253 return highestConfidenceLocale; 254 } 255 TextClassifierOtpHelper()256 private TextClassifierOtpHelper() {} 257 258 private static class RegExStrings { 259 /* 260 * A regex matching a line start, open paren, arrow, colon (not proceeded by a digit), open square 261 * bracket, equals sign, double or single quote, ideographic char, or a space that is not preceded 262 * by a number. It will not consume the start char (meaning START won't be included in the matched 263 * string) 264 */ 265 private static final String START = 266 "(^|(?<=((^|[^0-9])\\s)|[>(\"'=\\[\\p{IsIdeographic}]|[^0-9]:))"; 267 268 /* 269 * A regex matching a line end, a space that is not followed by a number, an ideographic char, or 270 * a period, close paren, close square bracket, single or double quote, exclamation point, 271 * question mark, or comma. It will not consume the end char 272 */ 273 private static final String END = "(?=\\s[^0-9]|$|\\p{IsIdeographic}|[.?!,)'\\]\"])"; 274 275 private static final String ALL_OTP; 276 277 static { 278 /* One single OTP char. A number or alphabetical char (that isn't also ideographic) */ 279 final String OTP_CHAR = "([0-9\\p{IsAlphabetic}&&[^\\p{IsIdeographic}]])"; 280 281 /* One OTP char, followed by an optional dash */ 282 final String OTP_CHAR_WITH_DASH = format("(%s-?)", OTP_CHAR); 283 284 /* 285 * Performs a lookahead to find a digit after 0 to 7 OTP_CHARs. This ensures that our potential 286 * OTP code contains at least one number 287 */ 288 final String FIND_DIGIT = format("(?=%s{0,7}\\d)", OTP_CHAR_WITH_DASH); 289 290 /* 291 * Matches between 5 and 8 otp chars, with dashes in between. Here, we are assuming an OTP code is 292 * 5-8 characters long. The last char must not be followed by a dash 293 */ 294 final String OTP_CHARS = format("(%s{4,7}%s)", OTP_CHAR_WITH_DASH, OTP_CHAR); 295 296 /* A regex matching four digit numerical codes */ 297 final String FOUR_DIGITS = "(\\d{4})"; 298 299 final String FIVE_TO_EIGHT_ALPHANUM_AT_LEAST_ONE_NUM = 300 format("(%s%s)", FIND_DIGIT, OTP_CHARS); 301 302 /* A regex matching two pairs of 3 digits (ex "123 456") */ 303 final String SIX_DIGITS_WITH_SPACE = "(\\d{3}\\s\\d{3})"; 304 305 /* 306 * Combining the regular expressions above, we get an OTP regex: 1. search for START, THEN 2. 307 * match ONE of a. alphanumeric sequence, at least one number, length 5-8, with optional dashes b. 308 * 4 numbers in a row c. pair of 3 digit codes separated by a space THEN 3. search for END Ex: 309 * "6454", " 345 678.", "[YDT-456]" 310 */ 311 ALL_OTP = 312 format( 313 "%s(%s|%s|%s)%s", 314 START, FIVE_TO_EIGHT_ALPHANUM_AT_LEAST_ONE_NUM, FOUR_DIGITS, 315 SIX_DIGITS_WITH_SPACE, END); 316 } 317 318 private static final String FALSE_POSITIVE; 319 320 static { 321 /* 322 * A Date regular expression. Looks for dates with the month, day, and year separated by dashes. 323 * Handles one and two digit months and days, and four or two-digit years. It makes the following 324 * assumptions: Dates and months will never be higher than 39 If a four digit year is used, the 325 * leading digit will be 1 or 2 326 */ 327 final String DATE_WITH_DASHES = "([0-3]?\\d-[0-3]?\\d-([12]\\d)?\\d\\d)"; 328 329 /* 330 * matches a ten digit phone number, when the area code is separated by a space or dash. Supports 331 * optional parentheses around the area code, and an optional dash or space in between the rest of 332 * the numbers. This format registers as an otp match due to the space between the area code and 333 * the rest, but shouldn't. 334 */ 335 final String PHONE_WITH_SPACE = "(\\(?\\d{3}\\)?(-|\\s)?\\d{3}(-|\\s)?\\d{4})"; 336 337 /* 338 * A combination of common false positives. These matches are expected to be longer than (or equal 339 * in length to) otp matches. 340 */ 341 FALSE_POSITIVE = format("%s(%s|%s)%s", START, DATE_WITH_DASHES, PHONE_WITH_SPACE, END); 342 } 343 344 /** 345 * A list of regular expressions representing words found in an OTP context (non case sensitive) 346 * Note: TAN is short for Transaction Authentication Number 347 */ 348 private static final String[] ENGLISH_CONTEXT_WORDS = 349 new String[] { 350 "pin", 351 "pass[-\\s]?(code|word)", 352 "TAN", 353 "otp", 354 "2fa", 355 "(two|2)[-\\s]?factor", 356 "log[-\\s]?in", 357 "auth(enticat(e|ion))?", 358 "code", 359 "secret", 360 "verif(y|ication)", 361 "one(\\s|-)?time", 362 "access", 363 "validat(e|ion)" 364 }; 365 } 366 } 367