// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.base;

import android.text.TextUtils;
import android.util.Patterns;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Provides public methods for detecting and eliding sensitive PII.
 *
 * <p>Each {@code elide*} method replaces every match of one kind of sensitive token (email, URL,
 * IP address, MAC address, console message) with a fixed placeholder string. The URL matcher is
 * deliberately conservative around text that looks like Java stack frames or class names, see
 * {@link #elideUrl(String)}.
 */
public class PiiElider {
    /** Replacement text for email addresses. */
    private static final String EMAIL_ELISION = "XXX@EMAIL.ELIDED";

    /** Replacement text for URLs and intent-style URIs. */
    private static final String URL_ELISION = "HTTP://WEBADDRESS.ELIDED";

    // Characters allowed inside a host name label (used inside a regex character class).
    private static final String GOOD_IRI_CHAR = "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    // Dotted-quad IPv4 address, each octet restricted to the 0-255 range.
    private static final String IP_ADDRESS =
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
                    + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
                    + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
                    + "|[1-9][0-9]|[0-9]))";

    // A single host name label: 1 to 63 characters, hyphens allowed only in the interior.
    private static final String IRI =
            "[" + GOOD_IRI_CHAR + "]([" + GOOD_IRI_CHAR + "-]{0,61}[" + GOOD_IRI_CHAR + "]){0,1}";

    // Same as GOOD_IRI_CHAR but without digits; used for the final (top-level) label.
    private static final String GOOD_GTLD_CHAR = "a-zA-Z\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";
    private static final String GTLD = "[" + GOOD_GTLD_CHAR + "]{2,63}";

    // One or more dot-terminated labels followed by a top-level label.
    private static final String HOST_NAME = "(" + IRI + "\\.)+" + GTLD;

    // A percent-encoded byte such as %2F.
    private static final String URI_ENCODED_CHAR = "(%[a-fA-F0-9]{2})";

    // A character permitted in the user-info (user:password@) part of a URL.
    private static final String URI_CHAR = "([a-zA-Z0-9$_.+!*'(),;?&=-]|" + URI_ENCODED_CHAR + ")";

    private static final String PATH_CHAR =
            // Either a single valid path component character or a URI-encoded character.
            "(([" + GOOD_IRI_CHAR + ";/?:@&=#~.+!*'(),_-])|" + URI_ENCODED_CHAR + ")";

    // http/https/rtsp scheme followed by optional "user[:password]@" credentials.
    private static final String URI_SCHEME =
            "((http|https|Http|Https|rtsp|Rtsp)://"
                    + "("
                    + URI_CHAR
                    + "{1,64}(:"
                    + URI_CHAR
                    + "{1,25})?@)?)";

    private static final String DOMAIN_NAME = "(" + HOST_NAME + "|" + IP_ADDRESS + ")";

    private static final String PORT = "(:\\d{1,5})";

    private static final String URL_WITH_OPTIONAL_SCHEME_AND_PORT =
            "(" + URI_SCHEME + "?" + DOMAIN_NAME + PORT + "?)";

    private static final String PATH_COMPONENT = "(" + PATH_CHAR + "+)";

    // Based on: http://www.faqs.org/rfcs/rfc2396.html#:~:text=Scheme%20Component
    private static final String INTENT_SCHEME = "[a-zA-Z][a-zA-Z0-9+.-]+://";

    // Any "scheme://" prefix followed by at least one path character.
    private static final String INTENT = "(" + INTENT_SCHEME + PATH_COMPONENT + ")";

    private static final String URL_OR_INTENT =
            "(" + URL_WITH_OPTIONAL_SCHEME_AND_PORT + "|" + INTENT + ")";

    private static final Pattern WEB_URL =
            Pattern.compile(
                    "(\\b|^)" // Always start on a word boundary or start of string.
                            + "("
                            + URL_OR_INTENT
                            + ")" // Main URL or Intent scheme/domain/root path.
                            + "(/"
                            + PATH_CHAR
                            + "*)?" // Rest of the URI path.
                            + "(\\b|$)"); // Always end on a word boundary or end of string.

    // Example variant info chromium-TrichromeChromeGoogle6432.aab
    private static final String CHROME_VARIANT_INFO = "chromium-[^\\.]+\\.aab";

    // Matches text that is most likely a stack trace line rather than free-form text.
    private static final Pattern LIKELY_EXCEPTION_LOG =
            Pattern.compile(
                    "\\sat\\s"
                            // These are all package prefixes of classes that are likely to
                            // exist on a stacktrace and are very unlikely to be a PII url.
                            + "(org\\.chromium|com\\.google|java|android|com\\.android)\\.[^ ]+.|"
                            // if a line has what looks like line number info, it's probably an
                            // exception log.
                            + "\\("
                            + CHROME_VARIANT_INFO
                            + "[^:]+:\\d+\\)|"
                            // When a class is not found it can fail to satisfy our isClass
                            // check but is still worth noting what it was.
                            + "Caused by: java\\.lang\\."
                            + "(ClassNotFoundException|NoClassDefFoundError):");

    /** Replacement text for IP addresses. */
    private static final String IP_ELISION = "1.2.3.4";

    /** Replacement text for MAC addresses. */
    private static final String MAC_ELISION = "01:23:45:67:89:AB";

    /** Replacement text for console messages. */
    private static final String CONSOLE_ELISION = "[ELIDED:CONSOLE(0)] ELIDED CONSOLE MESSAGE";

    // Six two-digit hex groups separated by one or more '-' or ':' characters.
    private static final Pattern MAC_ADDRESS =
            Pattern.compile("([0-9a-fA-F]{2}[-:]+){5}[0-9a-fA-F]{2}");

    // A bracketed "[<word>:CONSOLE...]" tag and everything after it on the same line.
    private static final Pattern CONSOLE_MSG = Pattern.compile("\\[\\w*:CONSOLE.*\\].*");

    // Prefixes which mark a URL-looking token as an application package name.
    private static final String[] APP_NAMESPACE =
            new String[] {"org.chromium.", "com.google.", "com.chrome."};

    // Prefixes which mark a URL-looking token as a platform/system package name.
    private static final String[] SYSTEM_NAMESPACE =
            new String[] {
                "android.",
                "com.android.",
                "dalvik.",
                "java.",
                "javax.",
                "org.apache.",
                "org.json.",
                "org.w3c.dom.",
                "org.xml.",
                "org.xmlpull.",
                "System."
            };

    /**
     * Elides any emails in the specified {@link String} with
     * {@link #EMAIL_ELISION}.
     *
     * @param original String potentially containing emails.
     * @return String with elided emails.
     */
    public static String elideEmail(String original) {
        return Patterns.EMAIL_ADDRESS.matcher(original).replaceAll(EMAIL_ELISION);
    }

    /**
     * Elides any URLs in the specified {@link String} with
     * {@link #URL_ELISION}.
     *
     * <p>If the input looks like an exception log line it is returned untouched. Otherwise every
     * URL-like match is replaced unless it starts with a known app or system package prefix, or
     * resolves to a loadable class (or a member of one).
     *
     * @param original String potentially containing URLs.
     * @return String with elided URLs.
     */
    public static String elideUrl(String original) {
        // Url-matching is fussy. If something looks like an exception message, just return.
        if (LIKELY_EXCEPTION_LOG.matcher(original).find()) return original;
        StringBuilder buffer = new StringBuilder(original);
        Matcher matcher = WEB_URL.matcher(buffer);
        int start = 0;
        while (matcher.find(start)) {
            start = matcher.start();
            int end = matcher.end();
            String url = buffer.substring(start, end);
            if (!likelyToBeAppNamespace(url)
                    && !likelyToBeSystemNamespace(url)
                    && !likelyToBeClassOrMethodName(url)) {
                buffer.replace(start, end, URL_ELISION);
                // The buffer length changed, so continue right after the inserted placeholder
                // and rebuild the matcher against the updated contents.
                end = start + URL_ELISION.length();
                matcher = WEB_URL.matcher(buffer);
            }
            // Resume scanning after the current match (or its replacement).
            start = end;
        }
        return buffer.toString();
    }

    /**
     * Returns whether {@code url} names a loadable class, or a member of one.
     *
     * @param url Candidate token that matched the URL pattern.
     * @return true if the token, or the token minus its last dot-separated segment, is a class.
     */
    private static boolean likelyToBeClassOrMethodName(String url) {
        if (isClassName(url)) return true;

        // Since the suspected URL could actually be a method name, check if the portion preceding
        // the last subdomain is a class name.
        int indexOfLastPeriod = url.lastIndexOf(".");
        if (indexOfLastPeriod == -1) return false;
        return isClassName(url.substring(0, indexOfLastPeriod));
    }

    /**
     * Returns whether {@code url} can be resolved as a class name by the application context's
     * class loader. The class is looked up without being initialized.
     *
     * @param url Candidate fully-qualified class name.
     * @return true if the lookup succeeds, false if it fails for any reason.
     */
    private static boolean isClassName(String url) {
        try {
            Class.forName(url, false, ContextUtils.getApplicationContext().getClassLoader());
            return true;
        } catch (Throwable e) {
            // Deliberately ignored: any failure simply means "not a class name".
            // Some examples: ClassNotFoundException, NoClassDefFoundException, VerifyError.
        }
        return false;
    }

    /**
     * Returns whether {@code url} starts with one of the {@link #APP_NAMESPACE} prefixes.
     *
     * @param url Candidate token that matched the URL pattern.
     * @return true if the token begins with an app package prefix.
     */
    private static boolean likelyToBeAppNamespace(String url) {
        for (String ns : APP_NAMESPACE) {
            if (url.startsWith(ns)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether {@code url} starts with one of the {@link #SYSTEM_NAMESPACE} prefixes.
     *
     * @param url Candidate token that matched the URL pattern.
     * @return true if the token begins with a system package prefix.
     */
    private static boolean likelyToBeSystemNamespace(String url) {
        for (String ns : SYSTEM_NAMESPACE) {
            if (url.startsWith(ns)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Elides any IP addresses in the specified {@link String} with
     * {@link #IP_ELISION}.
     *
     * @param original String potentially containing IPs.
     * @return String with elided IPs.
     */
    public static String elideIp(String original) {
        return Patterns.IP_ADDRESS.matcher(original).replaceAll(IP_ELISION);
    }

    /**
     * Elides any MAC addresses in the specified {@link String} with
     * {@link #MAC_ELISION}.
     *
     * @param original String potentially containing MACs.
     * @return String with elided MACs.
     */
    public static String elideMac(String original) {
        return MAC_ADDRESS.matcher(original).replaceAll(MAC_ELISION);
    }

    /**
     * Elides any console messages in the specified {@link String} with
     * {@link #CONSOLE_ELISION}.
     *
     * @param original String potentially containing console messages.
     * @return String with elided console messages.
     */
    public static String elideConsole(String original) {
        return CONSOLE_MSG.matcher(original).replaceAll(CONSOLE_ELISION);
    }

    /**
     * Elides any URL in the exception messages contained inside a stacktrace with
     * {@link #URL_ELISION}.
     *
     * <p>Only the first line and lines starting with "Caused by:" are sanitized; all other lines
     * are passed through unchanged. Trailing empty lines are not preserved.
     *
     * @param stacktrace Multiline stacktrace as a string.
     * @return Stacktrace with elided URLs, or an empty string if the input is null, empty, or
     *     consists only of newlines.
     */
    public static String sanitizeStacktrace(String stacktrace) {
        if (TextUtils.isEmpty(stacktrace)) {
            return "";
        }
        String[] frames = stacktrace.split("\\n");
        // String#split drops trailing empty strings, so an input made up solely of newlines
        // produces an empty array; bail out instead of indexing into it.
        if (frames.length == 0) {
            return "";
        }
        // Sanitize first stacktrace line which contains the exception message.
        frames[0] = elideUrl(frames[0]);
        for (int i = 1; i < frames.length; i++) {
            // Nested exceptions should also have their message sanitized.
            if (frames[i].startsWith("Caused by:")) {
                frames[i] = elideUrl(frames[i]);
            }
        }
        return TextUtils.join("\n", frames);
    }
}