• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/python -B
2
3# Copyright (C) 2016 and later: Unicode, Inc. and others.
4# License & terms of use: http://www.unicode.org/copyright.html
5
6# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
7# All rights reserved.
8
9#
10#  Script to check that ICU source files contain only valid UTF-8 encoded text,
11#  and that all files except '.txt' files do not contain a Byte Order Mark (BOM).
12#
13#  THIS SCRIPT DOES NOT WORK ON WINDOWS
14#     It only works correctly on platforms where the native line ending is a plain \n
15#
16#  usage:
17#     icu-file-utf8-check.py  [options]
18#
19#  options:
20#     -h | --help    Print a usage line and exit.
21#
22#  The tool operates recursively on the directory from which it is run.
23#  Only files from the ICU github repository are checked.
#  No changes are made to the repository or to the working copy; files are only read.
25#  The script checks all source files and returns a non-zero exit code if any of
26#  the checked files contain a non-UTF-8 character.
27
28from __future__ import print_function
29
30import sys
31import os
32import os.path
33import re
34import getopt
35
36
# Directories that are checked for UTF-8 validity and for BOMs.
# The single entry "." currently covers all of icu/. Modify as needed.
icu_directories_to_be_scanned = [
    ".",
]

# Files that are permitted to contain \r line endings. If this list
# grows too long, consider keeping the names in a separate file instead.
ignore_cr_in_files = [
    "vendor/double-conversion/upstream/msvc/testrunner.cmd",
]
46
def runCommand(cmd):
    """Run a shell command through os.popen() and capture its standard output.

    cmd:  the command line to execute.

    Returns a (text, status) tuple: everything the command wrote to its
    standard output, and the value returned by closing the pipe (os.popen()
    documents that value as None when the command exited with status 0).
    """
    pipe = os.popen(cmd)
    # Tuple elements are evaluated left to right, so the output is fully
    # read before the pipe is closed and the exit status collected.
    return pipe.read(), pipe.close()
53
54
def usage():
    """Print a one-line usage summary, naming the script as it was invoked."""
    pieces = ["usage: ", sys.argv[0], " [-h | --help]"]
    print("".join(pieces))
57
58
59#
60#  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
61#    file_name:        name of a text file.
62#    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
63#
def check_file(file_name, is_source):
    """Check one file for invalid UTF-8 (source files only) and for a UTF-8 BOM.

    file_name:  path of the file to check.
    is_source:  truthy if the file is a source code file (.c, .cpp, .h, .java);
                only such files are required to decode as UTF-8.

    Returns 0 if the file passes both checks, 1 if a problem was reported
    (problems are printed to standard output).
    """
    # Files with these suffixes are allowed to start with a BOM.
    bom_allowed_suffixes = (".txt", ".sln", ".targets")

    rc = 0
    # The context manager closes the file even if read() raises.
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
            rc = 1

    # Compare against the complete three-byte BOM instead of indexing byte 0:
    # this is safe for empty files, does not misreport other characters whose
    # encoding happens to start with 0xEF, and behaves the same on Python 2
    # (where indexing a byte string yields a one-character string, not an int).
    if data.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith(bom_allowed_suffixes)
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
            rc = 1

    return rc
84
def main(argv):
    """Parse the command line, then check every git-tracked file.

    argv:  command line arguments, excluding the program name.

    For each directory in icu_directories_to_be_scanned, the files reported
    by 'git ls-files' are passed to check_file() and then searched with grep
    for carriage returns. This function always finishes through sys.exit():
    status 0 if every file passed (or help was requested), 1 if any file
    failed a check or 'git ls-files' failed, 2 for a command line error.
    """
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 one.
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    exit_status = 0

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Print getopt's own message: it names the option that was actually
        # rejected, which is not necessarily the first argument.
        print(err)
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        sys.exit(2)

    source_file_re = re.compile(r".*((?:\.c$)|(?:\.cpp$)|(?:\.h$)|(?:\.java$))")

    for directory in icu_directories_to_be_scanned:
        print('Scanning ' + directory)
        cmd = "git ls-files " + shell_quote(directory)
        output, rc = runCommand(cmd)
        if rc:
            # Without a reliable file list nothing meaningful can be checked,
            # so actually stop here, as the message says.
            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
            sys.exit(1)

        for f in output.splitlines():
            if os.path.isdir(f):
                print("Skipping dir " + f)
                continue
            if not os.path.isfile(f):
                print("Repository file not in working copy: " + f)
                continue

            source_file = source_file_re.match(f)
            if check_file(f, source_file) != 0:
                exit_status = 1

            # Lastly, check the line endings of the file.
            # grep -l exits with status 0 when it reports the file (a match
            # was found), and runCommand() returns None for a zero exit
            # status; any other value means no carriage return was found.
            # The file name is shell-quoted, and "--" ends option parsing, so
            # names with spaces, metacharacters or a leading '-' are safe.
            output, rc = runCommand("grep -rPIl \"\\r\" -- " + shell_quote(f))
            if rc is None and f not in ignore_cr_in_files:
                print("File ", f, " has \\r line ending")
                exit_status = 1

    print(exit_status)
    sys.exit(exit_status)
138
# Script entry point: hand everything after the program name to main(),
# which terminates the process with the appropriate exit status.
if __name__ == "__main__":
    script_args = sys.argv[1:]
    main(script_args)
141