#! /usr/bin/python -B

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
# All rights reserved.

#
#  Script to check that ICU source files contain only valid UTF-8 encoded text,
#  that files other than '.txt', '.sln', '.targets' and '.vcxproj' files do not
#  start with a UTF-8 Byte Order Mark (BOM), and that files do not contain
#  \r line endings.
#
#  THIS SCRIPT DOES NOT WORK ON WINDOWS
#  It only works correctly on platforms where the native line ending is a plain \n
#
#  usage:
#     icu-file-utf8-check.py  [options]
#
#  options:
#     -h | --help    Print a usage line and exit.
#
#  The tool operates recursively on the directory from which it is run.
#  Only files from the ICU github repository are checked.
#  No changes are made to the repository or to the working copy.
#  The script checks all source files and returns a non-zero exit code if any of
#  the checked files fails one of the checks above.

from __future__ import print_function

import sys
import os
import os.path
import re
import getopt


# List of directories to check for UTF-8 and BOM. Currently covers
# all of icu/. Modify as needed.
icu_directories_to_be_scanned = ["."]

# Files that are allowed to contain \r line endings. If this list
# grows too long consider a file instead.
ignore_cr_in_files = [
    "vendor/double-conversion/upstream/msvc/testrunner.cmd"
    ]

# The three bytes of a UTF-8 encoded Byte Order Mark (U+FEFF).
_UTF8_BOM = b"\xef\xbb\xbf"

# Matches the names of source code files (.c, .cpp, .h, .java).
_SOURCE_FILE_RE = re.compile(r".*((?:\.c$)|(?:\.cpp$)|(?:\.h$)|(?:\.java$))")


def runCommand(cmd):
    """Run a shell command and capture its standard output.

    cmd: the command line, passed to the shell unchanged.

    Returns a tuple (output_text, exit_status). exit_status is the value
    returned by closing the os.popen() pipe: None if the command exited
    with status zero, a non-None status otherwise.
    """
    pipe = os.popen(cmd)
    try:
        output_text = pipe.read()
    finally:
        # Always reap the child, even if reading its output failed.
        exit_status = pipe.close()

    return output_text, exit_status


def usage():
    """Print a one-line usage message."""
    print("usage: " + sys.argv[0] + " [-h | --help]")


def _shell_quote(arg):
    """Return arg quoted so that a POSIX shell reads it as one literal word."""
    # Close the quote, emit an escaped single quote, and reopen the quote.
    return "'" + arg.replace("'", "'\"'\"'") + "'"


def check_file(file_name, is_source):
    """Check one file for valid UTF-8 content and for a leading UTF-8 BOM.

    file_name: name of the file to check.
    is_source: true value if the file is a source code file
               (.c, .cpp, .h, .java); only those are required to decode
               as UTF-8.

    Files ending in .txt, .sln or .targets, and files with ".vcxproj" in
    their name, are allowed to start with a BOM.

    Returns 0 if the file passes, 1 otherwise. A message is printed for
    each problem found.
    """
    rc = 0
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
            rc = 1

    # Compare the whole three-byte mark rather than only the first byte:
    # this is safe for empty files, behaves the same on Python 2 and 3, and
    # does not misreport text that merely starts with a U+F000..U+FFFF character.
    if data.startswith(_UTF8_BOM):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
            rc = 1

    return rc


def _contains_cr(file_name):
    """Return True if grep reports a \\r character in the (non-binary) file."""
    # -P: Perl regex, so \r means carriage return; -I: ignore binary files;
    # -l: print only the file name. The name is quoted and preceded by "--"
    # so that spaces, shell metacharacters or a leading dash cannot alter
    # the command.
    _, status = runCommand('grep -rPIl "\\r" -- ' + _shell_quote(file_name))
    # grep exits with status zero (reported as None) when it found a match.
    return status is None


def main(argv):
    """Check every file that git tracks under the configured directories.

    argv: command line arguments, without the program name.

    Exits the process with status 0 if all files pass, 1 if any check
    fails or the file list cannot be obtained, and 2 on a command line
    error.
    """
    exit_status = 0

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print("unrecognized option: " + argv[0])
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        sys.exit(2)

    for directory in icu_directories_to_be_scanned:
        print('Scanning ' + directory)
        cmd = "git ls-files " + _shell_quote(directory)
        output, rc = runCommand(cmd)
        if rc:
            # Without a file list nothing can be verified, so do not
            # fall through and report success.
            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
            sys.exit(1)

        for f in output.splitlines():
            if os.path.isdir(f):
                print("Skipping dir " + f)
                continue
            if not os.path.isfile(f):
                print("Repository file not in working copy: " + f)
                continue

            if check_file(f, _SOURCE_FILE_RE.match(f)) != 0:
                exit_status = 1

            # Lastly, check the line endings of the file.
            if _contains_cr(f) and f not in ignore_cr_in_files:
                print("File ", f, " has \\r line ending")
                exit_status = 1

    print(exit_status)
    sys.exit(exit_status)


if __name__ == "__main__":
    main(sys.argv[1:])