#! /usr/bin/python -B

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
# All rights reserved.

#
#  Script to check that ICU source files contain only valid UTF-8 encoded text,
#  and that files do not start with a UTF-8 Byte Order Mark (BOM). Files named
#  '*.txt', '*.sln', '*.targets' or containing '.vcxproj' are allowed a BOM.
#
#  THIS SCRIPT DOES NOT WORK ON WINDOWS
#     It only works correctly on platforms where the native line ending is a plain \n
#
#  usage:
#     icu-file-utf8-check.py  [options]
#
#  options:
#     -h | --help    Print a usage line and exit.
#
#  The tool operates recursively on the directory from which it is run.
#  Only files from the ICU github repository are checked.
#  No changes are made to the repository or to the working copy; files are only read.

import sys
import os
import os.path
import re
import getopt
import subprocess


# The three bytes that make up a UTF-8 encoded Byte Order Mark.
UTF8_BOM = b"\xef\xbb\xbf"

# File name suffixes for which a leading BOM is tolerated.
BOM_ALLOWED_SUFFIXES = (".txt", ".sln", ".targets")


def runCommand(cmd):
    """Run a shell command and return everything it wrote to stdout.

    cmd: the command line, as a single string handed to the shell.

    If the command finishes with a non-zero status, a message is written to
    stderr and the script exits with a non-zero status.
    """
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                            universal_newlines=True)
    output_text = proc.communicate()[0]
    status = proc.returncode
    if status:
        sys.stderr.write('" %s " failed. Exiting.\n' % cmd)
        # A negative return code means the child was killed by a signal;
        # map it to a plain failure so the exit status is never wrapped to 0.
        sys.exit(status if status > 0 else 1)
    return output_text


def usage():
    """Print a one-line usage summary to stdout."""
    print("usage: " + sys.argv[0] + " [-h | --help]")


#
#  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
#    file_name:        name of a text file.
#    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
#
def check_file(file_name, is_source):
    """Report encoding problems in one file by printing to stdout.

    file_name: path of the file to examine.
    is_source: truthy when the file is source code; only then is the content
               required to decode as UTF-8.

    Returns None. Problems are reported as printed messages, not exceptions.
    """
    # Read raw bytes: the check is about the on-disk encoding, so no text
    # decoding or newline translation may happen while reading.
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # startswith() is safe on an empty file and matches the whole BOM, not
    # merely a first byte that happens to be 0xEF.
    if data.startswith(UTF8_BOM):
        bom_allowed = (file_name.endswith(BOM_ALLOWED_SUFFIXES)
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)

    return


def main(argv):
    """Parse the command line, then check every file listed by 'git ls-files'.

    argv: the command line arguments, without the program name.

    Exits with status 2 on a command line usage error.
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print("unrecognized option: " + argv[0])
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # A usage error must not look like success to the caller.
        sys.exit(2)

    output = runCommand("git ls-files ")
    file_list = output.splitlines()

    source_file_re = re.compile(r".*((?:\.c$)|(?:\.cpp$)|(?:\.h$)|(?:\.java$))")

    for f in file_list:
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        source_file = source_file_re.match(f)
        check_file(f, source_file)


if __name__ == "__main__":
    main(sys.argv[1:])