// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ***********************************************************************
 *
 * Copyright (C) 2006-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 ***********************************************************************
 *
 * BIG5Tool
 *
 *    This tool produces the character usage frequency statistics for the Big5
 *    Chinese charset, for use by the ICU charset detectors.
 *
 *    usage:  java BIG5Tool [-d] [-sjis] [directory path]
 *
 *        -d:   Produce the data in a form to be exported to the ICU implementation
 *              Default is to produce an informative dump.
 *
 *        -sjis Do Shift_JIS.  The structure of sjis is very similar to Big5.
 *
 *        directory path
 *              Source directory for the text files to be analyzed.
 *              All files in the specified directory must be in the Big5 encoding
 *              (or in Shift_JIS when -sjis is given).
 *
 */

package com.ibm.icu.dev.tool.charsetdet.mbcs;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;


public class BIG5Tool {

    // The file buffer and file data length need to be out in class member variables
    //  so that the code lifted from charSet detection for scanning the multi-byte chars
    //  can see them conveniently.
    // Only the first buf.length bytes of each file are analyzed.
    byte []    buf = new byte[1000000];
    int        fileSize;

    boolean    option_d = false;    // data option.  Produce exportable data
    boolean    option_v = true;     // verbose informational output.
    boolean    sjis     = false;    // True if input text files are Shift_JIS encoded.


    /**
     * Command line entry point; creates a tool instance and hands it the arguments.
     *
     * @param args command line arguments, see the usage note at the top of this file
     */
    public static void main(String[] args) {
        BIG5Tool tool = new BIG5Tool();
        tool.Main(args);
    }


    /**
     * Parses the command line, checks that the source directory exists and
     * then collects and prints the statistics for it.
     * Exits the JVM with status -1 on an unrecognized option, on more than one
     * directory argument, or when the argument is not a directory.
     *
     * @param args command line arguments: [-d] [-sjis] [directory path]
     */
    void Main(String[] args) {
        //
        //   Command Line Option Handling
        //
        String dirName = null;
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-d")) {
                // Exportable data and the verbose dump are mutually exclusive.
                option_d = true;
                option_v = false;
            } else if (arg.equals("-sjis")) {
                sjis = true;
            } else if (arg.startsWith("-")) {
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            } else if (dirName == null) {
                dirName = arg;
            } else {
                // A second directory argument: report the offending argument itself,
                //  not the directory that was already accepted.
                System.err.println("Unrecognized option: " + arg);
                System.exit(-1);
            }
        }
        if (dirName == null) {
            dirName = ".";
        }

        //
        //  Verify that the specified directory exists.
        //
        File dir = new File(dirName);
        if (!dir.isDirectory()) {
            System.err.println("\"" + dirName + "\" is not a directory");
            System.exit(-1);
        }
        processDir(dir);
    }

    /**
     * Collects statistics from all ordinary files in a specified directory
     * (sub-directories are not descended into) and prints the result, either
     * as a verbose frequency table or, with -d, as a source code array initializer.
     * A file that cannot be processed is reported on stderr and skipped.
     *
     * @param dir the directory holding the text files to analyze
     */
    void processDir(File dir) {
        int      totalMbcsChars = 0;
        HashMap  m = new HashMap(10000);   // ChEl -> same ChEl, keyed by char code

        System.out.println(dir.getName());
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory can not be read.
            System.err.println("\"" + dir.getPath() + "\" could not be listed");
            return;
        }
        for (int i = 0; i < files.length; i++) {
            try {
                if (files[i].isFile()) {
                    totalMbcsChars += processFile(files[i], m);
                }
            }
            catch (Exception e) {
                // Best effort: report the problem and carry on with the next file.
                System.err.println("Exception:" + e);
            }
        }

        //
        //  We've processed through all of the files.
        //     sort and dump out the frequency statistics.
        //
        Object [] encounteredChars = m.values().toArray();
        Arrays.sort(encounteredChars);     // most frequent first, see ChEl.compareTo()
        if (option_v) {
            printFrequencyTable(encounteredChars, totalMbcsChars);
        }
        if (option_d) {
            printExportArray(encounteredChars, totalMbcsChars);
        }
    }

    /**
     * Reads one file into buf, scans it and adds the occurrence counts of its
     * multi-byte characters to the supplied map. Sets fileSize as a side effect.
     *
     * @param file   the file to analyze
     * @param counts map of ChEl to ChEl accumulating the occurrence counts
     * @return the number of valid multi-byte characters found in the file
     * @throws IOException if the file can not be opened or read
     */
    private int processFile(File file, HashMap counts) throws IOException {
        fileSize = readFile(file);
        if (option_v) {
            System.out.println(file.getPath());
            System.out.println(" " + fileSize + " bytes.");
        }
        iteratedChar ichar = new iteratedChar();
        int fileChars     = 0;
        int fileMbcsChars = 0;
        int errs          = 0;

        while (nextChar(ichar)) {
            if (ichar.error) {
                errs++;
                continue;
            }
            fileChars++;
            if (ichar.charValue <= 255) {
                // Don't keep occurrence statistics for the single byte range
                continue;
            }
            fileMbcsChars++;

            //
            //  Frequency of occurrence statistics are accumulated in a map.
            //  The element serves as its own key; lookup is by char code only.
            //
            ChEl keyEl = new ChEl(ichar.charValue, 0);
            ChEl valEl = (ChEl)counts.get(keyEl);
            if (valEl == null) {
                counts.put(keyEl, keyEl);
                valEl = keyEl;
            }
            valEl.occurences++;
        }
        if (option_v) {
            System.out.println(" " + fileChars + " Chars");
            System.out.println(" " + fileMbcsChars + " mbcs Chars");
            System.out.println(" " + errs + " errors");
            System.out.println("\n");
        }
        return fileMbcsChars;
    }

    /**
     * Fills buf from the start of a file. A single read() call may deliver
     * fewer bytes than are available, so reading continues until end of file
     * or until the buffer is full.
     *
     * @param file the file to read
     * @return the number of bytes placed in buf, 0 for an empty file
     * @throws IOException if the file can not be opened or read
     */
    private int readFile(File file) throws IOException {
        FileInputStream is = new FileInputStream(file);
        try {
            int length = 0;
            while (length < buf.length) {
                int n = is.read(buf, length, buf.length - length);
                if (n < 0) {
                    break;
                }
                length += n;
            }
            if (file.length() > buf.length) {
                System.err.println("Warning: only the first " + buf.length
                        + " bytes of " + file.getPath() + " are analyzed");
            }
            return length;
        }
        finally {
            try {
                is.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close of an input stream.
            }
        }
    }

    /**
     * Computes part/total as a whole percentage. The multiplication is done
     * in long arithmetic so that large character counts do not overflow.
     */
    private static int percent(long part, int total) {
        return (int)(part * 100 / total);
    }

    /**
     * Prints the verbose table: rank, char code (hex), occurrence count and
     * cumulative percentage, one line per character, most frequent first.
     *
     * @param sortedByFrequency ChEl elements, most frequent first
     * @param totalMbcsChars    total number of multi-byte characters counted
     */
    private void printFrequencyTable(Object[] sortedByFrequency, int totalMbcsChars) {
        long cumulativeChars = 0;
        System.out.println("# <char code> <occurences> <Cumulative %>");
        for (int i = 0; i < sortedByFrequency.length; i++) {
            ChEl c = (ChEl)sortedByFrequency[i];
            cumulativeChars += c.occurences;
            int cumulativePercent = percent(cumulativeChars, totalMbcsChars);
            System.out.println(i + " " + Integer.toHexString(c.charCode) + " "
                    + c.occurences + " " + cumulativePercent);
        }
    }

    /**
     * Outputs the list of the most frequent characters formatted for pasting
     * into a Java source code array initializer. Characters are taken in order
     * of frequency until they cover 50% of all occurrences, 100 characters
     * have been taken, or there are no more characters; they are then resorted
     * into order based on the character code value.
     *
     * @param sortedByFrequency ChEl elements, most frequent first
     * @param totalMbcsChars    total number of multi-byte characters counted
     */
    private void printExportArray(Object[] sortedByFrequency, int totalMbcsChars) {
        List charList = new ArrayList();
        long cumulativeChars   = 0;
        int  cumulativePercent = 0;

        // The length check matters when there are no multi-byte chars at all.
        for (int i = 0;
             i < sortedByFrequency.length && i < 100 && cumulativePercent < 50;
             i++) {
            ChEl c = (ChEl)sortedByFrequency[i];
            cumulativeChars += c.occurences;
            cumulativePercent = percent(cumulativeChars, totalMbcsChars);
            charList.add(Integer.valueOf(c.charCode));
        }
        Object [] sortedChars = charList.toArray();
        Arrays.sort(sortedChars);

        System.out.print(" {");
        for (int i = 0; i < sortedChars.length; i++) {
            if (i != 0) {
                System.out.print(", ");
                if (i % 10 == 0) {
                    // Ten values per output line.
                    System.out.print("\n ");
                }
            }
            int cp = ((Integer)sortedChars[i]).intValue();
            System.out.print("0x" + Integer.toHexString(cp));
        }
        System.out.println("};");
    }

    //
    //  This is a little class containing a
    //    multi-byte character value and an occurrence count for that char.
    //  Instances of this class are kept in the collection that accumulates statistics
    //
    //  WARNING:  this class's natural ordering (from Comparable) and equals()
    //            are inconsistent.

    static class ChEl implements Comparable {
        int charCode;
        int occurences;

        ChEl(int c, int o) {
            charCode = c;
            occurences = o;
        }

        // Equals needs to work with a map, with the charCode as the key.
        //   For insertion/lookup, we care about the char code only, not the occurrence count.
        //   Anything that is not a ChEl (including null) is simply not equal.
        public boolean equals(Object other) {
            return other instanceof ChEl && ((ChEl)other).charCode == this.charCode;
        }

        // Hashcode needs to be compatible with equals
        //   We're using this in a hashMap!
        public int hashCode() {
            return charCode;
        }

        // We want to be able to sort the results by frequency of occurrence
        //   Compare backwards.  We want most frequent chars first.
        public int compareTo(Object other) {
            ChEl o = (ChEl)other;
            if (this.occurences > o.occurences) {
                return -1;
            }
            return (this.occurences == o.occurences) ? 0 : 1;
        }

    }

    //
    // iteratedChar is copied and slightly hacked from the similar class in CharsetRecog_mbcs
    //              It holds the state of an iteration over the bytes in buf;
    //              nextChar() pulls out one logical char at a time.
    //
    class iteratedChar {
        int             charValue = 0;             // The char value is a value from the encoding.
                                                   //   Its meaning is not well defined, other than
                                                   //   different encodings
        int             index     = 0;             // Offset in buf of the start of the current char
        int             nextIndex = 0;             // Offset in buf of the next byte to be read
        boolean         error     = false;         // The current char is not a valid sequence
        boolean         done      = false;         // The end of the input data has been reached

        void reset() {
            charValue = 0;
            index     = -1;
            nextIndex = 0;
            error     = false;
            done      = false;
        }

        // Returns the next byte of the file data as a value 0..255,
        //   or -1 (and sets done) when the data is exhausted.
        int nextByte() {
            if (nextIndex >= fileSize) {
                done = true;
                return -1;
            }
            int byteValue = (int)buf[nextIndex++] & 0x00ff;
            return byteValue;
        }
    }


    /**
     * Advances the iterator to the next logical character of the data in buf,
     * using the lead/trail byte rules of Big5, or of Shift_JIS when sjis is set.
     * On return it.charValue holds the one or two byte value of the character
     * and it.error tells whether the byte sequence was invalid; invalid
     * sequences are also reported on stdout. A lead byte that is the very last
     * byte of the data is returned as an error character (with it.done already
     * set), so that the caller gets a chance to count it.
     *
     * @param it the iteration state; updated in place
     * @return true if a character (valid or in error) was produced,
     *         false when the end of the data had already been reached
     */
    boolean nextChar(iteratedChar it) {
        it.index = it.nextIndex;
        it.error = false;

        int firstByte = it.charValue = it.nextByte();
        if (firstByte < 0) {
            // Ran off the end of the input data
            it.done = true;
            return false;
        }
        if (firstByte <= 0x0080 ||
                (sjis && firstByte >= 0x00a0 && firstByte <  0x00e0) ||
                (sjis && firstByte >= 0x00fd && firstByte <= 0x00ff)) {
            // single byte char
            return true;
        }

        int secondByte = it.nextByte();
        if (secondByte < 0) {
            // Lead byte without a trail byte at the end of the data.
            it.error = true;
            System.out.println("Error " + Integer.toHexString(firstByte) + " <end of data>");
            return true;
        }
        it.charValue = (firstByte << 8) | secondByte;

        if (secondByte <  0x40   ||
            secondByte == 0x007f ||
            secondByte == 0x00ff ||
            (sjis && secondByte >= 0x00fd)) {
            it.error = true;
        }

        if (it.error) {
            System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
        }
        return true;
    }

}