• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
3# Copyright 2020 The Amber Authors. All rights reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#	http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""
18Script to check files for inclusive language. The script will scan all files
19and flag non-inclusive terminology which is identified.
20
Usage: run the script from a folder; it scans recursively down through that
folder.
23"""
24
25import fnmatch
26import os
27import re
28import sys
29
# Patterns (uncompiled) for the terminology this script flags.  Every pattern
# carries the inline (?i) flag, so matching is case-insensitive.
REGEXES = [
    # Compound terms written with an optional '-' or '_' separator.
    r"(?i)black[-_]?list",
    r"(?i)white[-_]?list",
    r"(?i)gr[ea]y[-_]?list",
    r"(?i)(first class citizen)",
    r"(?i)black[-_]?hat",
    r"(?i)white[-_]?hat",
    r"(?i)gr[ea]y[-_]?hat",
    # Matched anywhere, including inside longer identifiers.
    r"(?i)master",
    r"(?i)slave",
    # Gendered words, matched only as whole words.
    r"(?i)\bhim\b",
    r"(?i)\bhis\b",
    r"(?i)\bshe\b",
    r"(?i)\bher\b",
    r"(?i)\bhers\b",
    r"(?i)\bman\b",
    r"(?i)\bwoman\b",
    # "he" and its contractions, delimited by whitespace or by the start/end
    # of the text.  The apostrophe may be ASCII (') or U+2019.  The class is
    # written ['\u2019] rather than ['|\u2019]: inside a character class '|'
    # is a literal pipe, not alternation, so the old form also matched "he|d".
    r"(?i)\she\s",
    r"(?i)\she$",
    r"(?i)^he\s",
    r"(?i)^he$",
    r"(?i)\she['\u2019]d\s",
    r"(?i)\she['\u2019]d$",
    r"(?i)^he['\u2019]d\s",
    r"(?i)^he['\u2019]d$",
    r"(?i)\she['\u2019]s\s",
    r"(?i)\she['\u2019]s$",
    r"(?i)^he['\u2019]s\s",
    r"(?i)^he['\u2019]s$",
    r"(?i)\she['\u2019]ll\s",
    r"(?i)\she['\u2019]ll$",
    r"(?i)^he['\u2019]ll\s",
    r"(?i)^he['\u2019]ll$",
    r"(?i)grandfather",
    # Remaining whole-word terms and short phrases.
    r"(?i)\bmitm\b",
    r"(?i)\bcrazy\b",
    r"(?i)\binsane\b",
    r"(?i)\bblind\sto\b",
    r"(?i)\bflying\sblind\b",
    r"(?i)\bblind\seye\b",
    r"(?i)\bcripple\b",
    r"(?i)\bcrippled\b",
    r"(?i)\bdumb\b",
    r"(?i)\bdummy\b",
    r"(?i)\bparanoid\b",
    r"(?i)\bsane\b",
    r"(?i)\bsanity\b",
    r"(?i)red[-_]?line",
]
79
# Patterns (uncompiled) for text that would otherwise be flagged but is
# accepted.  Both are case-insensitive via the inline (?i) flag.
SUPPRESSIONS = [
    # The identifier MS_SLAVE, which contains a flagged word.
    r"(?i)MS_SLAVE",
    # "man page" written with a space, '_' or '-' separator, or with none.
    # The hyphen sits last in the class so it is a literal; written as
    # "[ -_]" it formed a range from space to underscore and accepted
    # digits, uppercase letters and most punctuation as the separator.
    r"(?i)man[ _-]?page",
]
84
85
# Compile each flagged-term pattern once, at import time, keeping the order
# of REGEXES.
REGEX_LIST = [re.compile(pattern) for pattern in REGEXES]
89
# Compiled counterparts of SUPPRESSIONS, in the same order.
SUPPRESSION_LIST = list(map(re.compile, SUPPRESSIONS))
93
def find(top, filename_glob, skip_glob_list):
    """Collect paths under ``top`` whose file name matches ``filename_glob``.

    Sub-directories whose name matches any glob in ``skip_glob_list`` are
    pruned from the walk, so nothing beneath them is visited.  Files that
    share this script's own file name are left out of the result.

    Returns a list of paths, each built by joining the walked directory with
    the matching file name.
    """
    own_name = os.path.basename(__file__)
    matches = []
    for root, subdirs, names in os.walk(top):
        # Prune in place: os.walk only descends into the entries that remain
        # in the list it handed out.
        subdirs[:] = [
            entry for entry in subdirs
            if not any(fnmatch.fnmatch(entry, skip) for skip in skip_glob_list)
        ]
        matches.extend(
            os.path.join(root, name)
            for name in fnmatch.filter(names, filename_glob)
            if name != own_name
        )
    return matches
108
109
def filtered_descendants(glob):
    """List files below the current directory whose name matches ``glob``.

    Directories whose names match the skip globs below are not descended
    into.
    """
    skipped_dirs = [
        'third_party',
        'external',
        'build*',
        'out*',
        'CompilerIdCXX',
        '.git',
    ]
    return find('.', glob, skipped_dirs)
115
def _is_suppressed(contents, idx):
    """Return True if a suppression pattern matches at ``idx`` or 3 chars earlier."""
    # The second anchor is a hack for suppressions that carry a 3-character
    # prefix in front of the flagged word (the "MS_" of MS_SLAVE).  Find a
    # better way if more suppressions with a prefix are added.
    anchors = (idx, idx - 3) if idx >= 3 else (idx,)
    # Passing the anchor as ``pos`` avoids copying the tail of the file for
    # every probe, as slicing ``contents`` would.
    return any(
        supp.match(contents, anchor)
        for supp in SUPPRESSION_LIST
        for anchor in anchors
    )


def check_match(filename, contents):
    """Report flagged terminology found in ``contents``.

    For each pattern in REGEX_LIST the occurrences are examined in order and
    the first one not covered by a suppression is printed, prefixed with
    ``filename``.  At most one line is printed per pattern.  A suppressed
    occurrence no longer hides later, unsuppressed occurrences of the same
    pattern.

    Args:
        filename: Name used as the prefix of each printed line.
        contents: Full text to scan.

    Returns:
        True if at least one unsuppressed occurrence was printed, else False.
    """
    found = False
    for reg in REGEX_LIST:
        for match in reg.finditer(contents):
            if _is_suppressed(contents, match.start()):
                continue
            print("{}: found non-inclusive language: {}".format(
                filename, match.group(0)))
            found = True
            # One report per pattern is enough.
            break
    return found
144
145
def alert_if_lang_matches(glob, *, verbose=False):
    """Scan glob-matching files for non-inclusive language and count the hits.

    Every file returned by filtered_descendants(glob) is read as UTF-8 and
    passed to check_match, which prints the offending terms.

    Args:
        glob: File-name pattern selecting the files to scan.
        verbose: When True, print a "skipping <file>" line for each file that
            could not be processed.  Defaults to False (skip silently).

    Returns:
        The number of files for which check_match reported a match.
    """
    printed_count = 0
    for path in filtered_descendants(glob):
        try:
            with open(path, 'r', encoding='utf8') as handle:
                if check_match(path, handle.read()):
                    printed_count += 1
        except (OSError, UnicodeError):
            # Best effort: unreadable files and files that are not valid
            # UTF-8 (e.g. binaries) are skipped.  Unlike a bare ``except``,
            # this lets KeyboardInterrupt and SystemExit propagate.
            if verbose:
                print("skipping {}".format(path))

    return printed_count
168
169
def main():
    """Scan all files under the current directory and exit with the result.

    The process exits with status 1 when at least one file was flagged and
    with status 0 otherwise.
    """
    patterns = ['*']
    flagged = sum(alert_if_lang_matches(pattern) for pattern in patterns)

    # The boolean is handed to sys.exit as-is; True maps to exit status 1.
    sys.exit(flagged > 0)


if __name__ == '__main__':
    main()
180