• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2025 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.android.textclassifier;
17 
18 import static java.lang.String.format;
19 
20 import android.icu.util.ULocale;
21 import android.os.Bundle;
22 import android.util.ArrayMap;
23 import android.view.textclassifier.TextLanguage;
24 import android.view.textclassifier.TextLinks;
25 
26 import androidx.annotation.NonNull;
27 import androidx.annotation.Nullable;
28 
29 import com.android.textclassifier.common.base.TcLog;
30 import com.android.textclassifier.utils.TextClassifierUtils;
31 
32 import com.google.common.annotations.VisibleForTesting;
33 
34 import java.io.IOException;
35 import java.util.Collections;
36 import java.util.HashSet;
37 import java.util.Map;
38 import java.util.Set;
39 import java.util.regex.Matcher;
40 import java.util.regex.Pattern;
41 
42 /**
43  * Class with helper methods related to detecting OTP codes in a text.
44  */
45 public class TextClassifierOtpHelper {
46   private static final String TAG = TextClassifierOtpHelper.class.getSimpleName();
47 
48   private static final int PATTERN_FLAGS =
49       Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.MULTILINE;
50 
compileToRegex(String pattern)51   private static ThreadLocal<Matcher> compileToRegex(String pattern) {
52     return ThreadLocal.withInitial(() -> Pattern.compile(pattern, PATTERN_FLAGS).matcher(""));
53   }
54 
55   private static final float TC_THRESHOLD = 0.6f;
56 
57   private static final ArrayMap<String, ThreadLocal<Matcher>> EXTRA_LANG_OTP_REGEX =
58       new ArrayMap<>();
59 
60   private static final ThreadLocal<Matcher> OTP_REGEX = compileToRegex(RegExStrings.ALL_OTP);
61 
62   /**
63    * A combination of common false positives. These matches are expected to be longer than (or equal
64    * in length to) otp matches
65    */
66   private static final ThreadLocal<Matcher> FALSE_POSITIVE_REGEX =
67       compileToRegex(RegExStrings.FALSE_POSITIVE);
68 
69   /**
70    * Creates a regular expression to match any of a series of individual words, case insensitive. It
71    * also verifies the position of the word, relative to the OTP match
72    */
createDictionaryRegex(String[] words)73   private static ThreadLocal<Matcher> createDictionaryRegex(String[] words) {
74     StringBuilder regex = new StringBuilder("(");
75     for (int i = 0; i < words.length; i++) {
76       regex.append(findContextWordWithCode(words[i]));
77       if (i != words.length - 1) {
78         regex.append("|");
79       }
80     }
81     regex.append(")");
82     return compileToRegex(regex.toString());
83   }
84 
85   /**
86    * Creates a regular expression that will find a context word, if that word occurs in the sentence
87    * preceding an OTP, or in the same sentence as an OTP (before or after). In both cases, the
88    * context word must occur within 50 characters of the suspected OTP
89    *
90    * @param contextWord The context word we expect to find around the OTP match
91    * @return A string representing a regular expression that will determine if we found a context
92    *     word occurring before an otp match, or after it, but in the same sentence.
93    */
findContextWordWithCode(String contextWord)94   private static String findContextWordWithCode(String contextWord) {
95     String boundedContext = "\\b" + contextWord + "\\b";
96     // Asserts that we find the OTP code within 50 characters after the context word, with at
97     // most one sentence punctuation between the OTP code and the context word (i.e. they are
98     // in the same sentence, or the context word is in the previous sentence)
99     String contextWordBeforeOtpInSameOrPreviousSentence =
100         String.format("(%s(?=.{1,50}%s)[^.?!]*[.?!]?[^.?!]*%s)", boundedContext, RegExStrings.ALL_OTP, RegExStrings.ALL_OTP);
101     // Asserts that we find the context word within 50 characters after the OTP code, with no
102     // sentence punctuation between the OTP code and the context word (i.e. they are in the same
103     // sentence)
104     String contextWordAfterOtpSameSentence =
105         String.format("(%s)[^.!?]{1,50}%s", RegExStrings.ALL_OTP, boundedContext);
106     return String.format(
107         "(%s|%s)", contextWordBeforeOtpInSameOrPreviousSentence, contextWordAfterOtpSameSentence);
108   }
109 
110   static {
111     EXTRA_LANG_OTP_REGEX.put(
ULocale.ENGLISH.toLanguageTag()112         ULocale.ENGLISH.toLanguageTag(), createDictionaryRegex(RegExStrings.ENGLISH_CONTEXT_WORDS));
113   }
114 
115   /**
116    * Checks if the text might contain an OTP, if so, adds a link to the builder with type as OTP
117    *
118    * @param text    The text whose content should be checked for OTP
119    * @param tcImpl  Instance of the TextClassifierImpl
120    * @param builder TextLinks builder object to whom the OTP link to be added
121    */
addOtpLink(@onNull String text, @NonNull TextClassifierImpl tcImpl, @NonNull TextLinks.Builder builder)122   public static void addOtpLink(@NonNull String text, @NonNull TextClassifierImpl tcImpl,
123           @NonNull TextLinks.Builder builder) {
124     if (!containsOtp(text, tcImpl)) {
125       return;
126     }
127     final Map<String, Float> entityScores = Collections.singletonMap(TextClassifierUtils.TYPE_OTP,
128             1f);
129     builder.addLink(0, 0, entityScores, new Bundle());
130   }
131 
132   /**
133    * Checks if a string of text might contain an OTP, based on several regular expressions, and
134    * potentially using a textClassifier to eliminate false positives
135    *
136    * @param text   The text whose content should be checked
137    * @param tcImpl If non null, the provided TextClassifierImpl will be used to find the language
138    *               of the text, and look for a language-specific regex for it.
139    * @return True if we believe an OTP is in the message, false otherwise.
140    */
containsOtp( @onNull String text, @NonNull TextClassifierImpl tcImpl)141   protected static boolean containsOtp(
142           @NonNull String text,
143           @NonNull TextClassifierImpl tcImpl) {
144     if (!containsOtpLikePattern(text)) {
145       return false;
146     }
147 
148     ULocale language = getLanguageWithRegex(text, tcImpl);
149     if (language == null) {
150       return false;
151     }
152     return hasLanguageSpecificOtpWord(text, language.toLanguageTag());
153   }
154 
155   /**
156    * Checks if the given text contains a pattern resembling an OTP.
157    *
158    * <p>This method attempts to identify such patterns by matching against a regular expression.
159    * Avoids false positives by checking for common patterns that might be mistaken for OTPs, such
160    * as phone numbers or dates.</p>
161    *
162    * @param text The text to be checked.
163    * @return {@code true} if the text contains an OTP-like pattern, {@code false} otherwise.
164    */
165   @VisibleForTesting
containsOtpLikePattern(String text)166   protected static boolean containsOtpLikePattern(String text) {
167     Set<String> otpMatches = getAllMatches(text, OTP_REGEX.get());
168     if (otpMatches.isEmpty()) {
169       return false;
170     }
171     Set<String> falsePositives = getAllMatches(text, FALSE_POSITIVE_REGEX.get());
172 
173     // This optional, but having this would help with performance
174     // Example: "Your OTP code is 1234 and this is sent on 01-01-2001"
175     // At this point -> otpMatches: [1234, 01-01-2001] falsePositives=[01-01-2001]
176     // It filters "01-01-2001" in advance and continues to next checks with otpMatches: [1234]
177     otpMatches.removeAll(falsePositives);
178 
179     // Following is to handle text like: "Your OTP can't be shared at this point, please call
180     // (888) 888-8888"
181     // otpMatches: [888-8888] falsePositives=[(888) 888-8888] final=[]
182     for (String otpMatch : otpMatches) {
183       boolean currentOtpIsFalsePositive = false;
184       for (String falsePositive : falsePositives) {
185         if (falsePositive.contains(otpMatch)) {
186           currentOtpIsFalsePositive = true;
187           break;
188         }
189       }
190       if (!currentOtpIsFalsePositive) {
191         return true;
192       }
193     }
194     return false;
195   }
196 
197   /**
198    * Checks if the given text contains a language-specific word or phrase associated with OTPs.
199    * This method uses regular expressions defined for specific languages to identify these words.
200    *
201    * @param text The text to check.
202    * @param languageTag The language tag (e.g., "en", "es", "fr") for which to check.
203    * @return {@code true} if the text contains a language-specific OTP word, {@code false} otherwise.
204    *         Returns {@code false} if no language-specific regex is defined for the given tag.
205    */
206   @VisibleForTesting
hasLanguageSpecificOtpWord(@onNull String text, @NonNull String languageTag)207   protected static boolean hasLanguageSpecificOtpWord(@NonNull String text, @NonNull String languageTag) {
208     if (!EXTRA_LANG_OTP_REGEX.containsKey(languageTag)){
209       return false;
210     }
211     Matcher languageSpecificMatcher = EXTRA_LANG_OTP_REGEX.get(languageTag).get();
212     if (languageSpecificMatcher == null) {
213       return false;
214     }
215     languageSpecificMatcher.reset(text);
216     return languageSpecificMatcher.find();
217   }
218 
getAllMatches(String text, Matcher regex)219   private static Set<String> getAllMatches(String text, Matcher regex) {
220     Set<String> matches = new HashSet<>();
221     regex.reset(text);
222     while (regex.find()) {
223       matches.add(regex.group());
224     }
225     return matches;
226   }
227 
228   // Tries to determine the language of the given text. Will return the language with the highest
229   // confidence score that meets the minimum threshold, and has a language-specific regex, null
230   // otherwise
231   @Nullable
getLanguageWithRegex(String text, @NonNull TextClassifierImpl tcImpl)232   private static ULocale getLanguageWithRegex(String text, @NonNull TextClassifierImpl tcImpl) {
233     float highestConfidence = 0;
234     ULocale highestConfidenceLocale = null;
235     TextLanguage.Request langRequest = new TextLanguage.Request.Builder(text).build();
236     TextLanguage lang;
237     try {
238       lang = tcImpl.detectLanguage(null, null, langRequest);
239     } catch (IOException e) {
240       TcLog.e(TAG, "Except detecting language", e);
241       return null;
242     }
243     for (int i = 0; i < lang.getLocaleHypothesisCount(); i++) {
244       ULocale locale = lang.getLocale(i);
245       float confidence = lang.getConfidenceScore(locale);
246       if (confidence >= TC_THRESHOLD
247           && confidence >= highestConfidence
248           && EXTRA_LANG_OTP_REGEX.containsKey(locale.toLanguageTag())) {
249         highestConfidence = confidence;
250         highestConfidenceLocale = locale;
251       }
252     }
253     return highestConfidenceLocale;
254   }
255 
TextClassifierOtpHelper()256   private TextClassifierOtpHelper() {}
257 
258   private static class RegExStrings {
259     /*
260      * A regex matching a line start, open paren, arrow, colon (not proceeded by a digit), open square
261      * bracket, equals sign, double or single quote, ideographic char, or a space that is not preceded
262      * by a number. It will not consume the start char (meaning START won't be included in the matched
263      * string)
264      */
265     private static final String START =
266             "(^|(?<=((^|[^0-9])\\s)|[>(\"'=\\[\\p{IsIdeographic}]|[^0-9]:))";
267 
268     /*
269      * A regex matching a line end, a space that is not followed by a number, an ideographic char, or
270      * a period, close paren, close square bracket, single or double quote, exclamation point,
271      * question mark, or comma. It will not consume the end char
272      */
273     private static final String END = "(?=\\s[^0-9]|$|\\p{IsIdeographic}|[.?!,)'\\]\"])";
274 
275     private static final String ALL_OTP;
276 
277     static {
278       /* One single OTP char. A number or alphabetical char (that isn't also ideographic) */
279       final String OTP_CHAR = "([0-9\\p{IsAlphabetic}&&[^\\p{IsIdeographic}]])";
280 
281       /* One OTP char, followed by an optional dash */
282       final String OTP_CHAR_WITH_DASH = format("(%s-?)", OTP_CHAR);
283 
284       /*
285        * Performs a lookahead to find a digit after 0 to 7 OTP_CHARs. This ensures that our potential
286        * OTP code contains at least one number
287        */
288       final String FIND_DIGIT = format("(?=%s{0,7}\\d)", OTP_CHAR_WITH_DASH);
289 
290       /*
291        * Matches between 5 and 8 otp chars, with dashes in between. Here, we are assuming an OTP code is
292        * 5-8 characters long. The last char must not be followed by a dash
293        */
294       final String OTP_CHARS = format("(%s{4,7}%s)", OTP_CHAR_WITH_DASH, OTP_CHAR);
295 
296       /* A regex matching four digit numerical codes */
297       final String FOUR_DIGITS = "(\\d{4})";
298 
299       final String FIVE_TO_EIGHT_ALPHANUM_AT_LEAST_ONE_NUM =
300               format("(%s%s)", FIND_DIGIT, OTP_CHARS);
301 
302       /* A regex matching two pairs of 3 digits (ex "123 456") */
303       final String SIX_DIGITS_WITH_SPACE = "(\\d{3}\\s\\d{3})";
304 
305       /*
306        * Combining the regular expressions above, we get an OTP regex: 1. search for START, THEN 2.
307        * match ONE of a. alphanumeric sequence, at least one number, length 5-8, with optional dashes b.
308        * 4 numbers in a row c. pair of 3 digit codes separated by a space THEN 3. search for END Ex:
309        * "6454", " 345 678.", "[YDT-456]"
310        */
311       ALL_OTP =
312               format(
313                       "%s(%s|%s|%s)%s",
314                       START, FIVE_TO_EIGHT_ALPHANUM_AT_LEAST_ONE_NUM, FOUR_DIGITS,
315                       SIX_DIGITS_WITH_SPACE, END);
316     }
317 
318     private static final String FALSE_POSITIVE;
319 
320     static {
321       /*
322        * A Date regular expression. Looks for dates with the month, day, and year separated by dashes.
323        * Handles one and two digit months and days, and four or two-digit years. It makes the following
324        * assumptions: Dates and months will never be higher than 39 If a four digit year is used, the
325        * leading digit will be 1 or 2
326        */
327       final String DATE_WITH_DASHES = "([0-3]?\\d-[0-3]?\\d-([12]\\d)?\\d\\d)";
328 
329       /*
330        * matches a ten digit phone number, when the area code is separated by a space or dash. Supports
331        * optional parentheses around the area code, and an optional dash or space in between the rest of
332        * the numbers. This format registers as an otp match due to the space between the area code and
333        * the rest, but shouldn't.
334        */
335       final String PHONE_WITH_SPACE = "(\\(?\\d{3}\\)?(-|\\s)?\\d{3}(-|\\s)?\\d{4})";
336 
337       /*
338        * A combination of common false positives. These matches are expected to be longer than (or equal
339        * in length to) otp matches.
340        */
341       FALSE_POSITIVE = format("%s(%s|%s)%s", START, DATE_WITH_DASHES, PHONE_WITH_SPACE, END);
342     }
343 
344     /**
345      * A list of regular expressions representing words found in an OTP context (non case sensitive)
346      * Note: TAN is short for Transaction Authentication Number
347      */
348     private static final String[] ENGLISH_CONTEXT_WORDS =
349             new String[] {
350                     "pin",
351                     "pass[-\\s]?(code|word)",
352                     "TAN",
353                     "otp",
354                     "2fa",
355                     "(two|2)[-\\s]?factor",
356                     "log[-\\s]?in",
357                     "auth(enticat(e|ion))?",
358                     "code",
359                     "secret",
360                     "verif(y|ication)",
361                     "one(\\s|-)?time",
362                     "access",
363                     "validat(e|ion)"
364             };
365   }
366 }
367