# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from abc import abstractmethod
from collections import defaultdict
import re
import sys

from . import *
from . import utils
from .request_types import *


# Note: for this to be a proper abstract class, it should extend abc.ABC.
# There is no nice way to do this that works in both Python 2 and 3.
# TODO(ICU-20301): Make this inherit from abc.ABC.
class Filter(object):
    @staticmethod
    def create_from_json(json_data, io):
        assert io != None
        if "filterType" in json_data:
            filter_type = json_data["filterType"]
        else:
            filter_type = "file-stem"

        if filter_type == "file-stem":
            return FileStemFilter(json_data)
        elif filter_type == "language":
            return LanguageFilter(json_data)
        elif filter_type == "regex":
            return RegexFilter(json_data)
        elif filter_type == "exclude":
            return ExclusionFilter()
        elif filter_type == "union":
            return UnionFilter(json_data, io)
        elif filter_type == "locale":
            return LocaleFilter(json_data, io)
        else:
            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
            return None

    def filter(self, request):
        if not request.apply_file_filter(self):
            return []
        for file in request.all_input_files():
            assert self.match(file)
        return [request]

    @staticmethod
    def _file_to_file_stem(file):
        start = file.filename.rfind("/")
        limit = file.filename.rfind(".")
        return file.filename[start+1:limit]

    @staticmethod
    def _file_to_subdir(file):
        limit = file.filename.rfind("/")
        if limit == -1:
            return None
        return file.filename[:limit]

    @abstractmethod
    def match(self, file):
        pass


class InclusionFilter(Filter):
    def match(self, file):
        return True


class ExclusionFilter(Filter):
    def match(self, file):
        return False


class IncludeExcludeFilter(Filter):
    def __init__(self, json_data):
        if "whitelist" in json_data:
            self.is_includelist = True
            self.includelist = json_data["whitelist"]
        elif "includelist" in json_data:
            self.is_includelist = True
            self.includelist = json_data["includelist"]
        elif "blacklist" in json_data:
            self.is_includelist = False
            self.excludelist = json_data["blacklist"]
        elif "excludelist" in json_data:
            self.is_includelist = False
            self.excludelist = json_data["excludelist"]
        else:
            raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))

    def match(self, file):
        file_stem = self._file_to_file_stem(file)
        return self._should_include(file_stem)

    @abstractmethod
    def _should_include(self, file_stem):
        pass


class FileStemFilter(IncludeExcludeFilter):
    def _should_include(self, file_stem):
        if self.is_includelist:
            return file_stem in self.includelist
        else:
            return file_stem not in self.excludelist


class LanguageFilter(IncludeExcludeFilter):
    def _should_include(self, file_stem):
        language = file_stem.split("_")[0]
        if language == "root":
            # Always include root.txt
            return True
        if self.is_includelist:
            return language in self.includelist
        else:
            return language not in self.excludelist

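
# Illustrative filter JSON accepted by Filter.create_from_json above (the
# values are hypothetical examples, not taken from any real configuration):
#
#     {"filterType": "language", "includelist": ["de", "ja"]}
#     {"filterType": "regex", "excludelist": ["^zh_", "^yue"]}
#     {"includelist": ["en_US", "en_GB"]}    # "filterType" defaults to "file-stem"
#     {"filterType": "union", "unionOf": [{"filterType": "exclude"}]}
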
class RegexFilter(IncludeExcludeFilter):
    def __init__(self, *args):
        # TODO(ICU-20301): Change this to: super().__init__(*args)
        super(RegexFilter, self).__init__(*args)
        if self.is_includelist:
            self.includelist = [re.compile(pat) for pat in self.includelist]
        else:
            self.excludelist = [re.compile(pat) for pat in self.excludelist]

    def _should_include(self, file_stem):
        if self.is_includelist:
            for pattern in self.includelist:
                if pattern.match(file_stem):
                    return True
            return False
        else:
            for pattern in self.excludelist:
                if pattern.match(file_stem):
                    return False
            return True


class UnionFilter(Filter):
    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        for filter in self.sub_filters:
            if filter.match(file):
                return True
        return False


LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
    def __init__(self, json_data, io):
        if "whitelist" in json_data:
            self.locales_requested = list(json_data["whitelist"])
        elif "includelist" in json_data:
            self.locales_requested = list(json_data["includelist"])
        else:
            raise AssertionError("You must have an includelist in a locale filter")
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

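    # Worked example of the ancestry climb above (hypothetical request, and
    # assuming no "parents" or "aliases" entries apply): with
    # locales_requested = ["de"] and the defaults includeChildren=True,
    # includeScripts=False, the stem "de_CH" matches because
    # _get_parent_locale steps de_CH -> de and "de" is requested, while
    # "fr_CA" climbs fr_CA -> fr -> root -> None without reaching a
    # requested locale and is rejected.
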
    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)


def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config, io)
    requests = _apply_resource_filters(requests, config, io)
    return requests


def _apply_file_filters(old_requests, config, io):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests


def _preprocess_file_filters(requests, config, io):
    all_categories = set(
        request.category
        for request in requests
    )
    all_categories.remove(None)
    all_categories = list(sorted(all_categories))
    json_data = config.filters_json_data
    filters = {}
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Figure out the correct filter to create
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            pass  # no-op
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters

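
# Illustrative "featureFilters" / "localeFilter" configuration of the shape
# read by _preprocess_file_filters above (category names and values are
# hypothetical examples):
#
#     {
#         "featureFilters": {"brkitr_rules": "exclude"},
#         "localeFilter": {"filterType": "locale", "includelist": ["en"]}
#     }
#
# Categories absent from "featureFilters" get the default filter for the
# strategy, and "localeFilter" is applied only to otherwise-included
# categories whose names end in "_tree".
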
class ResourceFilterInfo(object):
    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        self.filter_tmp_dir = "filters/%s" % category
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        # Call this method only once per list of requests.
        assert self.input_files is None
        for request in all_requests:
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []

    def _set_files(self, files):
        # Note: The input files to genrb for a certain category should always
        # be the same. For example, there are often two genrb calls: one for
        # --writePoolBundle, and the other for --usePoolBundle. They are both
        # expected to have the same list of input files.
        if self.input_files is not None:
            assert self.input_files == files
            return
        self.input_files = list(files)
        self.filter_files = [
            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
            for basename in (
                file.filename[file.filename.rfind("/")+1:]
                for file in files
            )
        ]
        if self.strategy == "additive":
            self.rules_by_file = [
                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
                for _ in range(len(files))
            ]
        else:
            self.rules_by_file = [
                [r"+/"]
                for _ in range(len(files))
            ]

    def add_rules(self, file_filter, rules):
        for file, rule_list in zip(self.input_files, self.rules_by_file):
            if file_filter.match(file):
                rule_list += rules

    def make_requests(self):
        # Map from rule list to filter files with that rule list
        unique_rules = defaultdict(list)
        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
            unique_rules[tuple(rules)].append(filter_file)

        new_requests = []
        i = 0
        for rules, filter_files in unique_rules.items():
            base_filter_file = filter_files[0]
            new_requests += [
                PrintFileRequest(
                    name = "%s_print_%d" % (self.category, i),
                    output_file = base_filter_file,
                    content = self._generate_resource_filter_txt(rules)
                )
            ]
            i += 1
            for filter_file in filter_files[1:]:
                new_requests += [
                    CopyRequest(
                        name = "%s_copy_%d" % (self.category, i),
                        input_file = base_filter_file,
                        output_file = filter_file
                    )
                ]
                i += 1
        return new_requests

    @staticmethod
    def _generate_resource_filter_txt(rules):
        result = "# Caution: This file is automatically generated\n\n"
        result += "\n".join(rules)
        return result

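
# Illustrative "resourceFilters" entry of the kind consumed by
# _apply_resource_filters below (the category, locale, and resource paths are
# hypothetical examples):
#
#     {
#         "categories": ["locales_tree"],
#         "files": {"filterType": "language", "includelist": ["ja"]},
#         "rules": ["-/units", "+/units/duration"]
#     }
#
# "files" is optional; when it is omitted, the rules are added for every
# input file in the listed categories.
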
def _apply_resource_filters(all_requests, config, io):
    """Creates filters for looking within resource bundle files."""
    json_data = config.filters_json_data
    if "resourceFilters" not in json_data:
        return all_requests

    collected = {}
    for entry in json_data["resourceFilters"]:
        if "files" in entry:
            file_filter = Filter.create_from_json(entry["files"], io)
        else:
            file_filter = InclusionFilter()
        for category in entry["categories"]:
            # not defaultdict because we need to pass arguments to the constructor
            if category not in collected:
                filter_info = ResourceFilterInfo(category, config.strategy)
                filter_info.apply_to_requests(all_requests)
                collected[category] = filter_info
            else:
                filter_info = collected[category]
            filter_info.add_rules(file_filter, entry["rules"])

    # Add the filter generation requests to the beginning so that by default
    # they are made before genrb gets run (order is required by windirect)
    new_requests = []
    for filter_info in collected.values():
        new_requests += filter_info.make_requests()
    new_requests += all_requests
    return new_requests