• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/python -B
2
3# Copyright (C) 2016 and later: Unicode, Inc. and others.
4# License & terms of use: http://www.unicode.org/copyright.html
5
6# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
7# All rights reserved.
8
9#
#  Script to check that ICU source code files (.c, .cpp, .h, .java) contain only
#  valid UTF-8 encoded text, and to warn about any file that starts with a UTF-8
#  Byte Order Mark (BOM), except '.txt', '.sln', '.targets' and '.vcxproj*' files.
12#
13#  THIS SCRIPT DOES NOT WORK ON WINDOWS
14#     It only works correctly on platforms where the native line ending is a plain \n
15#
16#  usage:
17#     icu-file-utf8-check.py  [options]
18#
19#  options:
20#     -h | --help    Print a usage line and exit.
21#
22#  The tool operates recursively on the directory from which it is run.
23#  Only files from the ICU github repository are checked.
#  The script only reads files; neither the repository nor the working copy is altered.
25
26import sys
27import os
28import os.path
29import re
30import getopt
31
32
def runCommand(cmd):
    """Run a shell command and return everything it wrote to stdout.

    cmd:  the command line; it is handed to the shell through os.popen().

    If the command reports failure, a message is written to stderr and the
    script exits with the command's exit code (or 1 when no exit code can
    be extracted from the status).
    """
    pipe = os.popen(cmd)
    output_text = pipe.read()
    exit_status = pipe.close()
    if exit_status:
        sys.stderr.write('" %s " failed.  Exiting.\n' % cmd)
        # On POSIX close() returns a wait()-style status with the exit code
        # in the high byte.  Passing that value straight to sys.exit() would
        # be truncated to its low byte, turning e.g. 256 (exit code 1) into
        # a successful exit status of 0.
        exit_code = (exit_status >> 8) & 0xff
        sys.exit(exit_code or 1)
    return output_text
41
42
def usage():
    """Print a one-line usage summary for this script to stdout."""
    # Single-argument parenthesised form: valid and identical in behavior
    # under both Python 2 and Python 3.
    print("usage: " + sys.argv[0] + " [-h | --help]")
45
46
47#
48#  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
49#    file_name:        name of a text file.
50#    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
51#
def check_file(file_name, is_source):
    """Report encoding problems found in a single file.

    file_name:  path of the file to examine.
    is_source:  true value if the file is a source code file; only such
                files are required to be valid UTF-8.

    Prints an error to stdout when a source file is not valid UTF-8, and a
    warning when a file starts with a UTF-8 BOM, unless its name ends with
    .txt, .sln or .targets, or contains ".vcxproj".  Returns None.
    """
    # Read raw bytes: the checks below are about the on-disk encoding, so
    # nothing may decode or translate the content on the way in.
    with open(file_name, 'rb') as f:
        contents = f.read()

    if is_source:
        try:
            contents.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # Compare against the complete three-byte BOM.  Looking only at a leading
    # 0xEF byte would also flag files that merely begin with a character in
    # U+F000..U+FFFF, and indexing the first byte fails on empty files.
    utf8_bom = b'\xef\xbb\xbf'
    if contents.startswith(utf8_bom):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)

    return
70
def main(argv):
    """Parse the command line, then check every file git knows about.

    argv:  command line arguments, without the program name.

    Exits with status 2 on a command line error and with status 0 after
    printing the usage line for -h / --help.  Otherwise lists the files with
    "git ls-files" (relative to the current directory) and runs check_file()
    on each one that exists as a regular file.
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print("unrecognized option: " + argv[0])
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # A usage error must not report success to the calling shell.
        sys.exit(2)

    file_list = runCommand("git ls-files ").splitlines()

    # File name suffixes that mark a file as source code, which must be UTF-8.
    source_suffixes = (".c", ".cpp", ".h", ".java")

    for f in file_list:
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        check_file(f, f.endswith(source_suffixes))
102
# Script entry point: run the check only when executed directly, not on import.
if __name__ == '__main__':
    command_line_options = sys.argv[1:]
    main(command_line_options)
105