1#! /usr/bin/python -B 2 3# Copyright (C) 2016 and later: Unicode, Inc. and others. 4# License & terms of use: http://www.unicode.org/copyright.html 5 6# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others. 7# All rights reserved. 8 9# 10# Script to check that ICU source files contain only valid UTF-8 encoded text, 11# and that all files except '.txt' files do not contain a Byte Order Mark (BOM). 12# 13# THIS SCRIPT DOES NOT WORK ON WINDOWS 14# It only works correctly on platforms where the native line ending is a plain \n 15# 16# usage: 17# icu-file-utf8-check.py [options] 18# 19# options: 20# -h | --help Print a usage line and exit. 21# 22# The tool operates recursively on the directory from which it is run. 23# Only files from the ICU github repository are checked. 24# No changes are made to the repository; only the working copy will be altered. 25 26from __future__ import print_function 27 28import sys 29import os 30import os.path 31import re 32import getopt 33 34 35def runCommand(cmd): 36 output_file = os.popen(cmd); 37 output_text = output_file.read(); 38 exit_status = output_file.close(); 39 if exit_status: 40 print('"', cmd, '" failed. Exiting.', file=sys.stderr) 41 sys.exit(exit_status) 42 return output_text 43 44 45def usage(): 46 print("usage: " + sys.argv[0] + " [-h | --help]") 47 48 49# 50# File check. Check source code files for UTF-8 and all except text files for not containing a BOM 51# file_name: name of a text file. 52# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java). 53# 54def check_file(file_name, is_source): 55 f = open(file_name, 'rb') 56 bytes = f.read() 57 f.close() 58 59 if is_source: 60 try: 61 bytes.decode("UTF-8") 62 except UnicodeDecodeError: 63 print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name) 64 65 if bytes[0] == 0xef: 66 if not (file_name.endswith(".txt") or file_name.endswith(".sln") 67 or file_name.endswith(".targets") 68 or ".vcxproj" in file_name): 69 print("Warning: file %s contains a UTF-8 BOM: " % file_name) 70 71 return 72 73def main(argv): 74 try: 75 opts, args = getopt.getopt(argv, "h", ("help")) 76 except getopt.GetoptError: 77 print("unrecognized option: " + argv[0]) 78 usage() 79 sys.exit(2) 80 for opt, arg in opts: 81 if opt in ("-h", "--help"): 82 usage() 83 sys.exit() 84 if args: 85 print("unexpected command line argument") 86 usage() 87 sys.exit() 88 89 output = runCommand("git ls-files "); 90 file_list = output.splitlines() 91 92 source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))") 93 94 for f in file_list: 95 if os.path.isdir(f): 96 print("Skipping dir " + f) 97 continue 98 if not os.path.isfile(f): 99 print("Repository file not in working copy: " + f) 100 continue; 101 102 source_file = source_file_re.match(f) 103 check_file(f, source_file) 104 105if __name__ == "__main__": 106 main(sys.argv[1:]) 107