#!/usr/bin/env python

# Copyright 2020 The Amber Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script to check files for inclusive language. The script will scan all files
and flag non-inclusive terminology which is identified.

Usage, run the script from a folder and the script will scan down through that
folder.
"""

import fnmatch
import os
import re
import sys

# Patterns that flag non-inclusive terminology. All are case-insensitive.
REGEXES = [
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    r"(?i)master",
    r"(?i)slave",
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['|\u2019]d\s",
    r"(?i)\she['|\u2019]d$",
    r"(?i)^he['|\u2019]d\s",
    r"(?i)^he['|\u2019]d$",
    r"(?i)\she['|\u2019]s\s",
    r"(?i)\she['|\u2019]s$",
    r"(?i)^he['|\u2019]s\s",
    r"(?i)^he['|\u2019]s$",
    r"(?i)\she['|\u2019]ll\s",
    r"(?i)\she['|\u2019]ll$",
    r"(?i)^he['|\u2019]ll\s",
    r"(?i)^he['|\u2019]ll$",
    r"(?i)grandfather",
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]

# Patterns that cancel a hit from REGEXES when they match at the hit.
SUPPRESSIONS = [
    r"(?i)MS_SLAVE",
    # The hyphen is placed last so it is a literal: "[ -_]" would be the
    # character range from space to underscore, which also covers digits,
    # upper-case letters and most punctuation.
    r"(?i)man[ _-]?page",
]


REGEX_LIST = [re.compile(reg) for reg in REGEXES]

SUPPRESSION_LIST = [re.compile(supp) for supp in SUPPRESSIONS]


def find(top, filename_glob, skip_glob_list):
    """Returns files in the tree rooted at top matching filename_glob but not
    in directories matching skip_glob_list.

    The file whose basename equals this script's own basename is never
    returned, so the script does not flag its own pattern table.
    """

    file_list = []
    for path, dirs, files in os.walk(top):
        # Removing entries from dirs in place stops os.walk from descending
        # into them (top-down walk).
        for glob in skip_glob_list:
            for match in fnmatch.filter(dirs, glob):
                dirs.remove(match)
        for filename in fnmatch.filter(files, filename_glob):
            if filename == os.path.basename(__file__):
                continue
            file_list.append(os.path.join(path, filename))
    return file_list


def filtered_descendants(glob):
    """Returns glob-matching filenames under the current directory, but skips
    some irrelevant paths."""
    return find('.', glob, ['third_party', 'external', 'build*', 'out*',
                            'CompilerIdCXX', '.git'])


def _is_suppressed(contents, idx):
    """Returns True if any suppression applies to the hit starting at idx."""
    for supp in SUPPRESSION_LIST:
        # match(string, pos) anchors the suppression at idx without copying
        # the tail of the file for every check.
        if supp.match(contents, idx):
            return True

        # This is a hack to handle the MS_ prefix that is needed
        # to check for. Find a better way if we get more suppressions
        # which modify the prefix of the string
        if idx >= 3 and supp.match(contents, idx - 3):
            return True
    return False


def check_match(filename, contents):
    """Check if contents contains any matching entries.

    For every pattern in REGEX_LIST the first occurrence that is not covered
    by a suppression is printed together with filename. Occurrences are
    scanned in order, so a suppressed hit early in the file (e.g. "man page")
    does not hide a later, unsuppressed one.

    Returns True if at least one unsuppressed occurrence was printed.
    """
    ret = False
    for reg in REGEX_LIST:
        for match in reg.finditer(contents):
            if _is_suppressed(contents, match.start()):
                continue
            # No matching suppression.
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            ret = True
            # One report per pattern per file is enough.
            break

    return ret


def alert_if_lang_matches(glob):
    """Prints names of all files matching non-inclusive language.

    Finds all glob-matching files under the current directory and checks if they
    contain the language pattern. Prints the names of all the files that
    match.

    Files that cannot be opened or are not valid UTF-8 are skipped.

    Returns the total number of file names printed.
    """
    verbose = False
    printed_count = 0
    for path in filtered_descendants(glob):
        try:
            with open(path, 'r', encoding='utf8') as handle:
                contents = handle.read()
        except (OSError, UnicodeError):
            # Unreadable or non-text file: best-effort scan, so skip it, but
            # do not swallow KeyboardInterrupt or genuine programming errors.
            if verbose:
                print("skipping {}".format(path))
            continue

        if check_match(path, contents):
            printed_count += 1

    return printed_count


def main():
    """Scans the current directory tree; exits 1 if any file was flagged."""
    globs = ['*']
    count = 0
    for glob in globs:
        count += alert_if_lang_matches(glob)

    sys.exit(1 if count > 0 else 0)


if __name__ == '__main__':
    main()