• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ***********************************************************************
5  *
6  * Copyright (C) 2006-2012, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *
9  ***********************************************************************
10  *
11  * BIG5Tool
12  *
13  *    This tool produces the character usage frequency statistics for the Big5
14  *    Chinese charset, for use by the ICU charset detectors.
15  *
16  *    usage:  java BIG5Tool [-d] [-sjis] [directory path]
17  *
18  *        -d:   Produce the data in a form to be exported to the ICU implementation
19  *              Default is to produce an informative dump.
20  *
21  *        -sjis Do Shift_JIS.  The structure of sjis is very similar to Big5.
22  *
23  *        directory path
24  *              Source directory for the text files to be analyzed.
25  *              All files in the specified directory must be in the Big5 encoding
25  *              (or in Shift_JIS when -sjis is given).
26  *
27  */
28 
29 package com.ibm.icu.dev.tool.charsetdet.mbcs;
30 
31 import java.io.File;
32 import java.io.FileInputStream;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.HashMap;
36 import java.util.List;
37 
38 
//
//  BIG5Tool scans every ordinary file of a directory as Big5 (or, with -sjis,
//  Shift_JIS) encoded text and reports how often each double-byte character occurs.
//
//    Default (verbose) mode prints per-file counts followed by a frequency table.
//    With -d, only the most frequent characters are printed, formatted as a
//    Java array initializer.
//
public class BIG5Tool {

    // The file buffer and file data length need to be out in class member variables
    //  so that the code lifted from charset detection for scanning the multi-byte chars
    //  can see them conveniently.  The buffer is enlarged on demand by readFile()
    //  when a file does not fit.
    byte []    buf = new byte[1000000];
    int        fileSize;

    boolean    option_d = false;    // data option.  Produce exportable data
    boolean    option_v = true;     // verbose informational output.
    boolean    sjis     = false;    // True if input text files are Shift_JIS encoded.


    //
    //  Command line entry point.  Delegates to an instance so that the
    //  option flags and the file buffer can live in instance fields.
    //
    public static void main(String[] args) {
        BIG5Tool tool = new BIG5Tool();
        tool.Main(args);
    }


    //
    //  Parse the command line, then process the requested directory.
    //
    //    args:  any of "-d", "-sjis", and at most one directory path.
    //           The current directory is used when no path is given.
    //
    //  Exits the JVM with status -1 on an unknown option, on a second
    //  directory argument, or when the path is not a directory.
    //
    void Main(String[] args) {
        //
        //   Command Line Option Handling
        //
        String dirName = null;
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-d")) {
                // The data dump replaces the informative dump.
                option_d = true;
                option_v = false;
                continue;
            }
            if (arg.equals("-sjis")) {
                sjis = true;
                continue;
            }
            if (arg.startsWith("-")) {
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            }
            if (dirName == null) {
                dirName = arg;
            } else {
                // Only one directory is accepted.  Report the offending (extra)
                //   argument, not the directory that was already accepted.
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            }
        }
        if (dirName == null) {
            dirName = ".";
        }

        //
        //  Verify that the specified directory exists.
        //
        File dir = new File(dirName);
        if (!dir.isDirectory()) {
            System.err.println("\"" + dirName + "\" is not a directory");
            System.exit(-1);
        }
        processDir(dir);
    }

    //
    // Collect statistics from all ordinary files in a specified directory,
    //   then print them in the form(s) selected by option_v / option_d.
    //
    void processDir(File dir) {
        // Keyed by character code; the value carries the running occurrence count.
        HashMap<Integer, ChEl> stats = new HashMap<Integer, ChEl>(10000);
        long totalMbcsChars = 0;

        System.out.println(dir.getName());
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            System.err.println("Unable to list the contents of \"" + dir.getPath() + "\"");
            files = new File[0];
        }
        for (int i = 0; i < files.length; i++) {
            totalMbcsChars += processFile(files[i], stats);
        }

        //
        //  We've processed through all of the files.
        //     sort and dump out the frequency statistics.
        //
        ChEl[] encounteredChars = stats.values().toArray(new ChEl[stats.size()]);
        Arrays.sort(encounteredChars);      // natural order of ChEl: most frequent first
        if (option_v) {
            dumpStatistics(encounteredChars, totalMbcsChars);
        }
        if (option_d) {
            dumpExportData(encounteredChars, totalMbcsChars);
        }
    }

    //
    //  Scan one file and add the occurrences of its double-byte characters to stats.
    //    Directory entries that are not ordinary files are skipped.
    //    Any exception is reported on stderr and ends the processing of this
    //    file only; the remaining files are still processed.
    //
    //  Returns the number of double-byte characters that were counted.
    //
    private int processFile(File file, HashMap<Integer, ChEl> stats) {
        int             fileMbcsChars = 0;
        FileInputStream is            = null;
        try {
            if (!file.isFile()) {
                return 0;
            }
            is = new FileInputStream(file);
            fileSize = readFile(is);
            if (option_v) {
                System.out.println(file.getPath());
                System.out.println("  " + fileSize + " bytes.");
            }

            iteratedChar ichar     = new iteratedChar();
            int          fileChars = 0;
            int          errs      = 0;
            while (nextChar(ichar)) {
                if (ichar.error) {
                    errs++;
                    continue;
                }
                fileChars++;
                if (ichar.charValue <= 255) {
                    // Don't keep occurrence statistics for the single byte range
                    continue;
                }
                fileMbcsChars++;

                //
                //  Frequency of occurrence statistics are accumulated in a map.
                //
                Integer key = Integer.valueOf(ichar.charValue);
                ChEl    el  = stats.get(key);
                if (el == null) {
                    el = new ChEl(ichar.charValue, 0);
                    stats.put(key, el);
                }
                el.occurences++;
            }
            if (option_v) {
                System.out.println("  " + fileChars     + " Chars");
                System.out.println("  " + fileMbcsChars + " mbcs Chars");
                System.out.println("  " + errs          + " errors");
                System.out.println("\n");
            }
        } catch (Exception e) {
            System.err.println("Exception:" + e);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done about a failed close of an input file.
                }
            }
        }
        return fileMbcsChars;
    }

    //
    //  Read the complete contents of a stream into buf and return the number of
    //    bytes read (0 for an empty file).
    //  A single read() call is not guaranteed to deliver the whole file, so keep
    //    reading until end of stream; buf is doubled whenever it fills up so that
    //    large files are not silently truncated.
    //
    private int readFile(FileInputStream in) throws java.io.IOException {
        int total = 0;
        while (true) {
            if (total == buf.length) {
                buf = Arrays.copyOf(buf, buf.length * 2);
            }
            int n = in.read(buf, total, buf.length - total);
            if (n < 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    //
    //  Print the informative frequency table: one line per character with its
    //    rank, code (hex), occurrence count and cumulative percentage.
    //    sortedByFrequency must be in most-frequent-first order.
    //
    private void dumpStatistics(ChEl[] sortedByFrequency, long totalMbcsChars) {
        System.out.println("# <char code> <occurences>  <Cumulative %>");
        long cumulativeChars = 0;       // long: count*100 can exceed the int range
        for (int i = 0; i < sortedByFrequency.length; i++) {
            ChEl c = sortedByFrequency[i];
            cumulativeChars += c.occurences;
            // totalMbcsChars cannot be zero here; every table entry was counted in it.
            long cumulativePercent = cumulativeChars * 100 / totalMbcsChars;
            System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
                    + c.occurences + "         " + cumulativePercent);
        }
    }

    //
    //   Output the list of characters formatted for pasting into a
    //     Java source code array initializer.
    //     The most frequent characters are taken until they cover 50% of all
    //     double-byte characters seen, up to a maximum of 100 characters.
    //     Resort into order based on the character code value, not
    //      on frequency of occurrence.
    //     An empty initializer is printed when no characters were collected.
    //
    private void dumpExportData(ChEl[] sortedByFrequency, long totalMbcsChars) {
        int   limit = Math.min(100, sortedByFrequency.length);
        int[] codes = new int[limit];
        int   count = 0;
        long  cumulativeChars   = 0;
        long  cumulativePercent = 0;
        for (int i = 0; i < limit && cumulativePercent < 50; i++) {
            ChEl c = sortedByFrequency[i];
            cumulativeChars += c.occurences;
            cumulativePercent = cumulativeChars * 100 / totalMbcsChars;
            codes[count++] = c.charCode;
        }
        int[] sortedCodes = Arrays.copyOf(codes, count);
        Arrays.sort(sortedCodes);

        System.out.print("          {");
        for (int i = 0; i < sortedCodes.length; i++) {
            if (i != 0) {
                System.out.print(", ");
                if (i % 10 == 0) {
                    // Ten values per output line.
                    System.out.print("\n           ");
                }
            }
            System.out.print("0x" + Integer.toHexString(sortedCodes[i]));
        }
        System.out.println("};");
    }

    //
    //  This is a little class containing a
    //    multi-byte character value and an occurrence count for that char.
    //  Instances of this class are kept in the collection that accumulates statistics
    //
    //  WARNING:  this class's natural ordering (from Comparable) and equals()
    //            are inconsistent:  equals() looks at the char code only,
    //            compareTo() looks at the occurrence count only.
    //
    //  The raw Comparable type is kept on purpose so that the
    //    compareTo(Object) signature stays unchanged.
    //
    static class ChEl implements Comparable {
        int charCode;
        int occurences;

        ChEl(int c, int o) {
            charCode = c;
            occurences = o;
        }

        // Equality is based on the charCode only, not on the occurrence count,
        //   so that an element can be looked up by its char code.
        //   Null and objects of any other type are never equal to a ChEl.
        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof ChEl)) {
                return false;
            }
            return ((ChEl) other).charCode == this.charCode;
        }

        // Hashcode needs to be compatible with equals
        @Override
        public int hashCode() {
            return charCode;
        }

        // We want to be able to sort the results by frequency of occurrence
        //   Compare backwards.  We want most frequent chars first.
        //   Throws ClassCastException when other is not a ChEl.
        public int compareTo(Object other) {
            ChEl o = (ChEl) other;
            if (this.occurences > o.occurences) {
                return -1;
            }
            return (this.occurences == o.occurences) ? 0 : 1;
        }

    }

    //
    // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
    //              It holds the scanning position within buf and the most recently
    //              extracted character.
    //
    class iteratedChar {
        int             charValue = 0;             // The char value is a value from the encoding:
                                                   //   the single byte, or (first byte << 8) | second byte.
        int             index     = 0;             // Buffer index at which the current char starts.
        int             nextIndex = 0;             // Buffer index of the next byte to be read.
        boolean         error     = false;         // True if the current char is malformed.
        boolean         done      = false;         // True once the end of the data has been reached.

        void reset() {
            charValue = 0;
            index     = -1;
            nextIndex = 0;
            error     = false;
            done      = false;
        }

        // Return the next byte of the file data as an unsigned value (0..255),
        //   or -1, with done set, when all fileSize bytes have been consumed.
        int nextByte() {
            if (nextIndex >= fileSize) {
                done = true;
                return -1;
            }
            return buf[nextIndex++] & 0x00ff;
        }
    }


    //
    //  Extract the next character from buf into "it".
    //
    //    A first byte <= 0x80 is a single byte char.  In sjis mode the first bytes
    //    0xa0..0xdf and 0xfd..0xff are single byte chars too.  Any other first byte
    //    is combined with the byte that follows it into a two byte char value.
    //    it.error is set (and a message is printed) when the second byte is
    //    below 0x40, is 0x7f or 0xff, or, in sjis mode, is 0xfd or above.
    //
    //    When the data ends after a first byte, the missing second byte is read
    //    as -1; the error message is still printed, but false is returned.
    //
    //  Returns false once the end of the data has been reached, true otherwise.
    //
    boolean nextChar(iteratedChar it) {
        it.index = it.nextIndex;
        it.error = false;

        int firstByte = it.nextByte();
        it.charValue = firstByte;
        if (firstByte < 0) {
            // Ran off the end of the input data
            it.done = true;
            return false;
        }

        boolean singleByte = firstByte <= 0x0080
                || (sjis && firstByte >= 0x00a0 && firstByte <  0x00e0)
                || (sjis && firstByte >= 0x00fd && firstByte <= 0x00ff);
        if (singleByte) {
            return !it.done;
        }

        int secondByte = it.nextByte();
        it.charValue = (it.charValue << 8) | secondByte;

        if (secondByte <  0x40
                || secondByte == 0x007f
                || secondByte == 0x00ff
                || (sjis && secondByte >= 0x00fd)) {
            it.error = true;
            System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
        }

        return !it.done;
    }

}
340