1# Copyright (C) 2018 and later: Unicode, Inc. and others. 2# License & terms of use: http://www.unicode.org/copyright.html 3 4# Python 2/3 Compatibility (ICU-20299) 5# TODO(ICU-20301): Remove this. 6from __future__ import print_function 7 8from abc import abstractmethod 9from collections import defaultdict 10import re 11import sys 12 13from . import * 14from . import utils 15from .request_types import * 16 17 18# Note: for this to be a proper abstract class, it should extend abc.ABC. 19# There is no nice way to do this that works in both Python 2 and 3. 20# TODO(ICU-20301): Make this inherit from abc.ABC. 21class Filter(object): 22 @staticmethod 23 def create_from_json(json_data, io): 24 assert io != None 25 if "filterType" in json_data: 26 filter_type = json_data["filterType"] 27 else: 28 filter_type = "file-stem" 29 30 if filter_type == "file-stem": 31 return FileStemFilter(json_data) 32 elif filter_type == "language": 33 return LanguageFilter(json_data) 34 elif filter_type == "regex": 35 return RegexFilter(json_data) 36 elif filter_type == "exclude": 37 return ExclusionFilter() 38 elif filter_type == "union": 39 return UnionFilter(json_data, io) 40 elif filter_type == "locale": 41 return LocaleFilter(json_data, io) 42 else: 43 print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr) 44 return None 45 46 def filter(self, request): 47 if not request.apply_file_filter(self): 48 return [] 49 for file in request.all_input_files(): 50 assert self.match(file) 51 return [request] 52 53 @staticmethod 54 def _file_to_file_stem(file): 55 start = file.filename.rfind("/") 56 limit = file.filename.rfind(".") 57 return file.filename[start+1:limit] 58 59 @staticmethod 60 def _file_to_subdir(file): 61 limit = file.filename.rfind("/") 62 if limit == -1: 63 return None 64 return file.filename[:limit] 65 66 @abstractmethod 67 def match(self, file): 68 pass 69 70 71class InclusionFilter(Filter): 72 def match(self, file): 73 return True 74 75 76class ExclusionFilter(Filter): 77 def match(self, file): 78 return False 79 80 81class WhitelistBlacklistFilter(Filter): 82 def __init__(self, json_data): 83 if "whitelist" in json_data: 84 self.is_whitelist = True 85 self.whitelist = json_data["whitelist"] 86 else: 87 assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data) 88 self.is_whitelist = False 89 self.blacklist = json_data["blacklist"] 90 91 def match(self, file): 92 file_stem = self._file_to_file_stem(file) 93 return self._should_include(file_stem) 94 95 @abstractmethod 96 def _should_include(self, file_stem): 97 pass 98 99 100class FileStemFilter(WhitelistBlacklistFilter): 101 def _should_include(self, file_stem): 102 if self.is_whitelist: 103 return file_stem in self.whitelist 104 else: 105 return file_stem not in self.blacklist 106 107 108class LanguageFilter(WhitelistBlacklistFilter): 109 def _should_include(self, file_stem): 110 language = file_stem.split("_")[0] 111 if language == "root": 112 # Always include root.txt 113 return True 114 if self.is_whitelist: 115 return language in self.whitelist 116 else: 117 return language not in self.blacklist 118 119 120class RegexFilter(WhitelistBlacklistFilter): 121 def __init__(self, *args): 122 # TODO(ICU-20301): Change this to: super().__init__(*args) 123 super(RegexFilter, self).__init__(*args) 124 if self.is_whitelist: 125 self.whitelist = [re.compile(pat) for pat in self.whitelist] 126 else: 127 self.blacklist = [re.compile(pat) for pat in self.blacklist] 128 129 def _should_include(self, file_stem): 130 if self.is_whitelist: 131 for pattern in self.whitelist: 132 if pattern.match(file_stem): 133 return True 134 return False 135 else: 136 for pattern in self.blacklist: 137 if pattern.match(file_stem): 138 return False 139 return True 140 141 142class UnionFilter(Filter): 143 def __init__(self, json_data, io): 144 # Collect the sub-filters. 145 self.sub_filters = [] 146 for filter_json in json_data["unionOf"]: 147 self.sub_filters.append(Filter.create_from_json(filter_json, io)) 148 149 def match(self, file): 150 """Match iff any of the sub-filters match.""" 151 for filter in self.sub_filters: 152 if filter.match(file): 153 return True 154 return False 155 156 157LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$") 158LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$") 159 160class LocaleFilter(Filter): 161 def __init__(self, json_data, io): 162 self.locales_requested = list(json_data["whitelist"]) 163 self.include_children = json_data.get("includeChildren", True) 164 self.include_scripts = json_data.get("includeScripts", False) 165 166 # Load the dependency graph from disk 167 self.dependency_data_by_tree = { 168 tree: io.read_locale_deps(tree) 169 for tree in utils.ALL_TREES 170 } 171 172 def match(self, file): 173 tree = self._file_to_subdir(file) 174 assert tree is not None 175 locale = self._file_to_file_stem(file) 176 177 # A locale is *required* if it is *requested* or an ancestor of a 178 # *requested* locale. 179 if locale in self._locales_required(tree): 180 return True 181 182 # Resolve include_scripts and include_children. 183 return self._match_recursive(locale, tree) 184 185 def _match_recursive(self, locale, tree): 186 # Base case: return True if we reached a *requested* locale, 187 # or False if we ascend out of the locale tree. 188 if locale is None: 189 return False 190 if locale in self.locales_requested: 191 return True 192 193 # Check for alternative scripts. 194 # This causes sr_Latn to check sr instead of going directly to root. 195 if self.include_scripts: 196 match = LANGUAGE_SCRIPT_REGEX.match(locale) 197 if match and self._match_recursive(match.group(1), tree): 198 return True 199 200 # Check if we are a descendant of a *requested* locale. 201 if self.include_children: 202 parent = self._get_parent_locale(locale, tree) 203 if self._match_recursive(parent, tree): 204 return True 205 206 # No matches. 207 return False 208 209 def _get_parent_locale(self, locale, tree): 210 """Gets the parent locale in the given tree, according to dependency data.""" 211 dependency_data = self.dependency_data_by_tree[tree] 212 if "parents" in dependency_data and locale in dependency_data["parents"]: 213 return dependency_data["parents"][locale] 214 if "aliases" in dependency_data and locale in dependency_data["aliases"]: 215 return dependency_data["aliases"][locale] 216 if LANGUAGE_ONLY_REGEX.match(locale): 217 return "root" 218 i = locale.rfind("_") 219 if i < 0: 220 assert locale == "root", "Invalid locale: %s/%s" % (tree, locale) 221 return None 222 return locale[:i] 223 224 def _locales_required(self, tree): 225 """Returns a generator of all required locales in the given tree.""" 226 for locale in self.locales_requested: 227 while locale is not None: 228 yield locale 229 locale = self._get_parent_locale(locale, tree) 230 231 232def apply_filters(requests, config, io): 233 """Runs the filters and returns a new list of requests.""" 234 requests = _apply_file_filters(requests, config, io) 235 requests = _apply_resource_filters(requests, config, io) 236 return requests 237 238 239def _apply_file_filters(old_requests, config, io): 240 """Filters out entire files.""" 241 filters = _preprocess_file_filters(old_requests, config, io) 242 new_requests = [] 243 for request in old_requests: 244 category = request.category 245 if category in filters: 246 new_requests += filters[category].filter(request) 247 else: 248 new_requests.append(request) 249 return new_requests 250 251 252def _preprocess_file_filters(requests, config, io): 253 all_categories = set( 254 request.category 255 for request in requests 256 ) 257 all_categories.remove(None) 258 all_categories = list(sorted(all_categories)) 259 json_data = config.filters_json_data 260 filters = {} 261 default_filter_json = "exclude" if config.strategy == "additive" else "include" 262 for category in all_categories: 263 filter_json = default_filter_json 264 # Figure out the correct filter to create 265 if "featureFilters" in json_data and category in json_data["featureFilters"]: 266 filter_json = json_data["featureFilters"][category] 267 if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"): 268 filter_json = json_data["localeFilter"] 269 # Resolve the filter JSON into a filter object 270 if filter_json == "exclude": 271 filters[category] = ExclusionFilter() 272 elif filter_json == "include": 273 pass # no-op 274 else: 275 filters[category] = Filter.create_from_json(filter_json, io) 276 if "featureFilters" in json_data: 277 for category in json_data["featureFilters"]: 278 if category not in all_categories: 279 print("Warning: category %s is not known" % category, file=sys.stderr) 280 return filters 281 282 283class ResourceFilterInfo(object): 284 def __init__(self, category, strategy): 285 self.category = category 286 self.strategy = strategy 287 self.filter_tmp_dir = "filters/%s" % category 288 self.input_files = None 289 self.filter_files = None 290 self.rules_by_file = None 291 292 def apply_to_requests(self, all_requests): 293 # Call this method only once per list of requests. 294 assert self.input_files is None 295 for request in all_requests: 296 if request.category != self.category: 297 continue 298 if not isinstance(request, AbstractExecutionRequest): 299 continue 300 if request.tool != IcuTool("genrb"): 301 continue 302 if not request.input_files: 303 continue 304 self._set_files(request.input_files) 305 request.dep_targets += [self.filter_files[:]] 306 arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir 307 request.args = "%s %s" % (arg_str, request.args) 308 309 # Make sure we found the target request 310 if self.input_files is None: 311 print("WARNING: Category not found: %s" % self.category, file=sys.stderr) 312 self.input_files = [] 313 self.filter_files = [] 314 self.rules_by_file = [] 315 316 def _set_files(self, files): 317 # Note: The input files to genrb for a certain category should always 318 # be the same. For example, there are often two genrb calls: one for 319 # --writePoolBundle, and the other for --usePoolBundle. They are both 320 # expected to have the same list of input files. 321 if self.input_files is not None: 322 assert self.input_files == files 323 return 324 self.input_files = list(files) 325 self.filter_files = [ 326 TmpFile("%s/%s" % (self.filter_tmp_dir, basename)) 327 for basename in ( 328 file.filename[file.filename.rfind("/")+1:] 329 for file in files 330 ) 331 ] 332 if self.strategy == "additive": 333 self.rules_by_file = [ 334 [r"-/", r"+/%%ALIAS", r"+/%%Parent"] 335 for _ in range(len(files)) 336 ] 337 else: 338 self.rules_by_file = [ 339 [r"+/"] 340 for _ in range(len(files)) 341 ] 342 343 def add_rules(self, file_filter, rules): 344 for file, rule_list in zip(self.input_files, self.rules_by_file): 345 if file_filter.match(file): 346 rule_list += rules 347 348 def make_requests(self): 349 # Map from rule list to filter files with that rule list 350 unique_rules = defaultdict(list) 351 for filter_file, rules in zip(self.filter_files, self.rules_by_file): 352 unique_rules[tuple(rules)].append(filter_file) 353 354 new_requests = [] 355 i = 0 356 for rules, filter_files in unique_rules.items(): 357 base_filter_file = filter_files[0] 358 new_requests += [ 359 PrintFileRequest( 360 name = "%s_print_%d" % (self.category, i), 361 output_file = base_filter_file, 362 content = self._generate_resource_filter_txt(rules) 363 ) 364 ] 365 i += 1 366 for filter_file in filter_files[1:]: 367 new_requests += [ 368 CopyRequest( 369 name = "%s_copy_%d" % (self.category, i), 370 input_file = base_filter_file, 371 output_file = filter_file 372 ) 373 ] 374 i += 1 375 return new_requests 376 377 @staticmethod 378 def _generate_resource_filter_txt(rules): 379 result = "# Caution: This file is automatically generated\n\n" 380 result += "\n".join(rules) 381 return result 382 383 384def _apply_resource_filters(all_requests, config, io): 385 """Creates filters for looking within resource bundle files.""" 386 json_data = config.filters_json_data 387 if "resourceFilters" not in json_data: 388 return all_requests 389 390 collected = {} 391 for entry in json_data["resourceFilters"]: 392 if "files" in entry: 393 file_filter = Filter.create_from_json(entry["files"], io) 394 else: 395 file_filter = InclusionFilter() 396 for category in entry["categories"]: 397 # not defaultdict because we need to pass arguments to the constructor 398 if category not in collected: 399 filter_info = ResourceFilterInfo(category, config.strategy) 400 filter_info.apply_to_requests(all_requests) 401 collected[category] = filter_info 402 else: 403 filter_info = collected[category] 404 filter_info.add_rules(file_filter, entry["rules"]) 405 406 # Add the filter generation requests to the beginning so that by default 407 # they are made before genrb gets run (order is required by windirect) 408 new_requests = [] 409 for filter_info in collected.values(): 410 new_requests += filter_info.make_requests() 411 new_requests += all_requests 412 return new_requests 413