1# Copyright (C) 2018 and later: Unicode, Inc. and others. 2# License & terms of use: http://www.unicode.org/copyright.html 3 4# Python 2/3 Compatibility (ICU-20299) 5# TODO(ICU-20301): Remove this. 6from __future__ import print_function 7 8from abc import abstractmethod 9from collections import defaultdict 10import re 11import sys 12 13from . import * 14from . import utils 15from .request_types import * 16 17 18# Note: for this to be a proper abstract class, it should extend abc.ABC. 19# There is no nice way to do this that works in both Python 2 and 3. 20# TODO(ICU-20301): Make this inherit from abc.ABC. 21class Filter(object): 22 @staticmethod 23 def create_from_json(json_data, io): 24 assert io != None 25 if "filterType" in json_data: 26 filter_type = json_data["filterType"] 27 else: 28 filter_type = "file-stem" 29 30 if filter_type == "file-stem": 31 return FileStemFilter(json_data) 32 elif filter_type == "language": 33 return LanguageFilter(json_data) 34 elif filter_type == "regex": 35 return RegexFilter(json_data) 36 elif filter_type == "exclude": 37 return ExclusionFilter() 38 elif filter_type == "union": 39 return UnionFilter(json_data, io) 40 elif filter_type == "locale": 41 return LocaleFilter(json_data, io) 42 else: 43 print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr) 44 return None 45 46 def filter(self, request): 47 if not request.apply_file_filter(self): 48 return [] 49 for file in request.all_input_files(): 50 assert self.match(file) 51 return [request] 52 53 @staticmethod 54 def _file_to_file_stem(file): 55 start = file.filename.rfind("/") 56 limit = file.filename.rfind(".") 57 return file.filename[start+1:limit] 58 59 @staticmethod 60 def _file_to_subdir(file): 61 limit = file.filename.rfind("/") 62 if limit == -1: 63 return None 64 return file.filename[:limit] 65 66 @abstractmethod 67 def match(self, file): 68 pass 69 70 71class InclusionFilter(Filter): 72 def match(self, file): 73 return True 74 75 76class ExclusionFilter(Filter): 77 def match(self, file): 78 return False 79 80 81class IncludeExcludeFilter(Filter): 82 def __init__(self, json_data): 83 if "whitelist" in json_data: 84 self.is_includelist = True 85 self.includelist = json_data["whitelist"] 86 elif "includelist" in json_data: 87 self.is_includelist = True 88 self.includelist = json_data["includelist"] 89 elif "blacklist" in json_data: 90 self.is_includelist = False 91 self.excludelist = json_data["blacklist"] 92 elif "excludelist" in json_data: 93 self.is_includelist = False 94 self.excludelist = json_data["excludelist"] 95 else: 96 raise AssertionError("Need either includelist or excludelist: %s" % str(json_data)) 97 98 def match(self, file): 99 file_stem = self._file_to_file_stem(file) 100 return self._should_include(file_stem) 101 102 @abstractmethod 103 def _should_include(self, file_stem): 104 pass 105 106 107class FileStemFilter(IncludeExcludeFilter): 108 def _should_include(self, file_stem): 109 if self.is_includelist: 110 return file_stem in self.includelist 111 else: 112 return file_stem not in self.excludelist 113 114 115class LanguageFilter(IncludeExcludeFilter): 116 def _should_include(self, file_stem): 117 language = file_stem.split("_")[0] 118 if language == "root": 119 # Always include root.txt 120 return True 121 if self.is_includelist: 122 return language in self.includelist 123 else: 124 return language not in self.excludelist 125 126 127class RegexFilter(IncludeExcludeFilter): 128 def __init__(self, *args): 129 # TODO(ICU-20301): Change this to: super().__init__(*args) 130 super(RegexFilter, self).__init__(*args) 131 if self.is_includelist: 132 self.includelist = [re.compile(pat) for pat in self.includelist] 133 else: 134 self.excludelist = [re.compile(pat) for pat in self.excludelist] 135 136 def _should_include(self, file_stem): 137 if self.is_includelist: 138 for pattern in self.includelist: 139 if pattern.match(file_stem): 140 return True 141 return False 142 else: 143 for pattern in self.excludelist: 144 if pattern.match(file_stem): 145 return False 146 return True 147 148 149class UnionFilter(Filter): 150 def __init__(self, json_data, io): 151 # Collect the sub-filters. 152 self.sub_filters = [] 153 for filter_json in json_data["unionOf"]: 154 self.sub_filters.append(Filter.create_from_json(filter_json, io)) 155 156 def match(self, file): 157 """Match iff any of the sub-filters match.""" 158 for filter in self.sub_filters: 159 if filter.match(file): 160 return True 161 return False 162 163 164LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$") 165LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$") 166 167class LocaleFilter(Filter): 168 def __init__(self, json_data, io): 169 if "whitelist" in json_data: 170 self.locales_requested = list(json_data["whitelist"]) 171 elif "includelist" in json_data: 172 self.locales_requested = list(json_data["includelist"]) 173 else: 174 raise AssertionError("You must have an includelist in a locale filter") 175 self.include_children = json_data.get("includeChildren", True) 176 self.include_scripts = json_data.get("includeScripts", False) 177 178 # Load the dependency graph from disk 179 self.dependency_data_by_tree = { 180 tree: io.read_locale_deps(tree) 181 for tree in utils.ALL_TREES 182 } 183 184 def match(self, file): 185 tree = self._file_to_subdir(file) 186 assert tree is not None 187 locale = self._file_to_file_stem(file) 188 189 # A locale is *required* if it is *requested* or an ancestor of a 190 # *requested* locale. 191 if locale in self._locales_required(tree): 192 return True 193 194 # Resolve include_scripts and include_children. 195 return self._match_recursive(locale, tree) 196 197 def _match_recursive(self, locale, tree): 198 # Base case: return True if we reached a *requested* locale, 199 # or False if we ascend out of the locale tree. 200 if locale is None: 201 return False 202 if locale in self.locales_requested: 203 return True 204 205 # Check for alternative scripts. 206 # This causes sr_Latn to check sr instead of going directly to root. 207 if self.include_scripts: 208 match = LANGUAGE_SCRIPT_REGEX.match(locale) 209 if match and self._match_recursive(match.group(1), tree): 210 return True 211 212 # Check if we are a descendant of a *requested* locale. 213 if self.include_children: 214 parent = self._get_parent_locale(locale, tree) 215 if self._match_recursive(parent, tree): 216 return True 217 218 # No matches. 219 return False 220 221 def _get_parent_locale(self, locale, tree): 222 """Gets the parent locale in the given tree, according to dependency data.""" 223 dependency_data = self.dependency_data_by_tree[tree] 224 if "parents" in dependency_data and locale in dependency_data["parents"]: 225 return dependency_data["parents"][locale] 226 if "aliases" in dependency_data and locale in dependency_data["aliases"]: 227 return dependency_data["aliases"][locale] 228 if LANGUAGE_ONLY_REGEX.match(locale): 229 return "root" 230 i = locale.rfind("_") 231 if i < 0: 232 assert locale == "root", "Invalid locale: %s/%s" % (tree, locale) 233 return None 234 return locale[:i] 235 236 def _locales_required(self, tree): 237 """Returns a generator of all required locales in the given tree.""" 238 for locale in self.locales_requested: 239 while locale is not None: 240 yield locale 241 locale = self._get_parent_locale(locale, tree) 242 243 244def apply_filters(requests, config, io): 245 """Runs the filters and returns a new list of requests.""" 246 requests = _apply_file_filters(requests, config, io) 247 requests = _apply_resource_filters(requests, config, io) 248 return requests 249 250 251def _apply_file_filters(old_requests, config, io): 252 """Filters out entire files.""" 253 filters = _preprocess_file_filters(old_requests, config, io) 254 new_requests = [] 255 for request in old_requests: 256 category = request.category 257 if category in filters: 258 new_requests += filters[category].filter(request) 259 else: 260 new_requests.append(request) 261 return new_requests 262 263 264def _preprocess_file_filters(requests, config, io): 265 all_categories = set( 266 request.category 267 for request in requests 268 ) 269 all_categories.remove(None) 270 all_categories = list(sorted(all_categories)) 271 json_data = config.filters_json_data 272 filters = {} 273 default_filter_json = "exclude" if config.strategy == "additive" else "include" 274 for category in all_categories: 275 filter_json = default_filter_json 276 # Special default for category "brkitr_lstm" as "exclude" for now. 277 if "brkitr_lstm" == category: 278 filter_json = "exclude" 279 # Figure out the correct filter to create for now. 280 if "featureFilters" in json_data and category in json_data["featureFilters"]: 281 filter_json = json_data["featureFilters"][category] 282 if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"): 283 filter_json = json_data["localeFilter"] 284 # Resolve the filter JSON into a filter object 285 if filter_json == "exclude": 286 filters[category] = ExclusionFilter() 287 elif filter_json == "include": 288 pass # no-op 289 else: 290 filters[category] = Filter.create_from_json(filter_json, io) 291 if "featureFilters" in json_data: 292 for category in json_data["featureFilters"]: 293 if category not in all_categories: 294 print("Warning: category %s is not known" % category, file=sys.stderr) 295 return filters 296 297 298class ResourceFilterInfo(object): 299 def __init__(self, category, strategy): 300 self.category = category 301 self.strategy = strategy 302 self.filter_tmp_dir = "filters/%s" % category 303 self.input_files = None 304 self.filter_files = None 305 self.rules_by_file = None 306 307 def apply_to_requests(self, all_requests): 308 # Call this method only once per list of requests. 309 assert self.input_files is None 310 for request in all_requests: 311 if request.category != self.category: 312 continue 313 if not isinstance(request, AbstractExecutionRequest): 314 continue 315 if request.tool != IcuTool("genrb"): 316 continue 317 if not request.input_files: 318 continue 319 self._set_files(request.input_files) 320 request.dep_targets += [self.filter_files[:]] 321 arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir 322 request.args = "%s %s" % (arg_str, request.args) 323 324 # Make sure we found the target request 325 if self.input_files is None: 326 print("WARNING: Category not found: %s" % self.category, file=sys.stderr) 327 self.input_files = [] 328 self.filter_files = [] 329 self.rules_by_file = [] 330 331 def _set_files(self, files): 332 # Note: The input files to genrb for a certain category should always 333 # be the same. For example, there are often two genrb calls: one for 334 # --writePoolBundle, and the other for --usePoolBundle. They are both 335 # expected to have the same list of input files. 336 if self.input_files is not None: 337 assert self.input_files == files 338 return 339 self.input_files = list(files) 340 self.filter_files = [ 341 TmpFile("%s/%s" % (self.filter_tmp_dir, basename)) 342 for basename in ( 343 file.filename[file.filename.rfind("/")+1:] 344 for file in files 345 ) 346 ] 347 if self.strategy == "additive": 348 self.rules_by_file = [ 349 [r"-/", r"+/%%ALIAS", r"+/%%Parent"] 350 for _ in range(len(files)) 351 ] 352 else: 353 self.rules_by_file = [ 354 [r"+/"] 355 for _ in range(len(files)) 356 ] 357 358 def add_rules(self, file_filter, rules): 359 for file, rule_list in zip(self.input_files, self.rules_by_file): 360 if file_filter.match(file): 361 rule_list += rules 362 363 def make_requests(self): 364 # Map from rule list to filter files with that rule list 365 unique_rules = defaultdict(list) 366 for filter_file, rules in zip(self.filter_files, self.rules_by_file): 367 unique_rules[tuple(rules)].append(filter_file) 368 369 new_requests = [] 370 i = 0 371 for rules, filter_files in unique_rules.items(): 372 base_filter_file = filter_files[0] 373 new_requests += [ 374 PrintFileRequest( 375 name = "%s_print_%d" % (self.category, i), 376 output_file = base_filter_file, 377 content = self._generate_resource_filter_txt(rules) 378 ) 379 ] 380 i += 1 381 for filter_file in filter_files[1:]: 382 new_requests += [ 383 CopyRequest( 384 name = "%s_copy_%d" % (self.category, i), 385 input_file = base_filter_file, 386 output_file = filter_file 387 ) 388 ] 389 i += 1 390 return new_requests 391 392 @staticmethod 393 def _generate_resource_filter_txt(rules): 394 result = "# Caution: This file is automatically generated\n\n" 395 result += "\n".join(rules) 396 return result 397 398 399def _apply_resource_filters(all_requests, config, io): 400 """Creates filters for looking within resource bundle files.""" 401 json_data = config.filters_json_data 402 if "resourceFilters" not in json_data: 403 return all_requests 404 405 collected = {} 406 for entry in json_data["resourceFilters"]: 407 if "files" in entry: 408 file_filter = Filter.create_from_json(entry["files"], io) 409 else: 410 file_filter = InclusionFilter() 411 for category in entry["categories"]: 412 # not defaultdict because we need to pass arguments to the constructor 413 if category not in collected: 414 filter_info = ResourceFilterInfo(category, config.strategy) 415 filter_info.apply_to_requests(all_requests) 416 collected[category] = filter_info 417 else: 418 filter_info = collected[category] 419 filter_info.add_rules(file_filter, entry["rules"]) 420 421 # Add the filter generation requests to the beginning so that by default 422 # they are made before genrb gets run (order is required by windirect) 423 new_requests = [] 424 for filter_info in collected.values(): 425 new_requests += filter_info.make_requests() 426 new_requests += all_requests 427 return new_requests 428