1 package org.unicode.cldr.draft; 2 3 import java.io.BufferedWriter; 4 import java.io.File; 5 import java.io.FileWriter; 6 import java.io.IOException; 7 import java.util.ArrayList; 8 import java.util.List; 9 import java.util.Scanner; 10 import java.util.regex.MatchResult; 11 12 import com.ibm.icu.text.UCharacterIterator; 13 14 /** 15 * Compresses list of Unicode character ranges given as starting and ending char 16 * into a Base88 string. 17 * 18 * Compression usage: 19 * String encodedStr = base88EncodeList(List<Interval>); 20 * 21 * Decompression usage: 22 * List<Interval> decodedStrList = base88DecodeList(encodedStr); 23 * 24 * Interval has two integers - first, last - to represent the range. 25 */ 26 27 public class CharacterListCompressor { 28 29 public static class Interval { 30 int first; 31 int last; 32 Interval(int first, int last)33 public Interval(int first, int last) { 34 this.first = first; 35 this.last = last; 36 } 37 toString()38 public String toString() { 39 return "«" + first + "-" + last + "»"; 40 } 41 first()42 public int first() { 43 return first; 44 } 45 last()46 public int last() { 47 return last; 48 } 49 } 50 51 // 52 // Pairs to Base88 methods 53 // 54 unicode2Base88(int code)55 public static List<Integer> unicode2Base88(int code) { 56 57 List<Integer> list = new ArrayList<Integer>(); 58 int rem = code % 88; 59 list.add(rem); 60 code = code / 88; 61 62 if (code != 0) { 63 rem = code % 88; 64 list.add(rem); 65 code = code / 88; 66 } 67 if (code != 0) { 68 rem = code % 88; 69 list.add(rem); 70 code = code / 88; 71 72 rem = code % 88; 73 list.add(rem); 74 code = code / 88; 75 } 76 return list; 77 } 78 byteCount4Base88(int code)79 public static int byteCount4Base88(int code) { 80 int count = 0; 81 code = code / 88; 82 83 if (code != 0) { 84 count = 1; 85 code = code / 88; 86 } 87 if (code != 0) { 88 count = 2; 89 } 90 return count; 91 } 92 compressPair2Base88(List<Integer> pair)93 public static List<Integer> compressPair2Base88(List<Integer> pair) { 94 int value = pair.get(0); 95 int type = pair.get(1); 96 int code = value * 8 + type * 3; 97 code += byteCount4Base88(code); 98 99 return unicode2Base88(code); 100 } 101 decodeBase88ToValueTypePairs(String str)102 public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) { 103 List<Integer> list = str2list(str); 104 105 int metawindowsize = 8; 106 List<List<Integer>> result = new ArrayList<List<Integer>>(); 107 int i = 0; 108 109 while (i < list.size()) { 110 int c = list.get(i); 111 int meta = c % metawindowsize; 112 int type = meta / 3; 113 int leng = (meta % 3) + 1; 114 115 if (leng == 3) { 116 leng++; 117 } 118 119 int value = getCodeFromListAt(list, i, leng) / metawindowsize; 120 addPair(result, value, type); 121 i += leng; 122 } 123 return result; 124 } 125 getCodeFromListAt(List<Integer> list, int start, int leng)126 public static int getCodeFromListAt(List<Integer> list, int start, int leng) { 127 int result = 0; 128 for (int i = 0; i < leng; i++) { 129 int c = list.get(start + i); 130 result += c * Math.pow(88, i); 131 } 132 return result; 133 } 134 addPair(List<List<Integer>> pairs, int value, int type)135 public static void addPair(List<List<Integer>> pairs, int value, int type) { 136 List<Integer> pair = new ArrayList<Integer>(); 137 pair.add(value); 138 pair.add(type); 139 pairs.add(pair); 140 } 141 encodeValueTypePairs2Base88(List<List<Integer>> pairs)142 public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) { 143 List<Integer> result = new ArrayList<Integer>(); 144 for (int i = 0; i < pairs.size(); i++) { 145 List<Integer> pair = pairs.get(i); 146 result.addAll(compressPair2Base88(pair)); 147 } 148 return list2str(result); 149 } 150 151 public final static String ascii = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~"; 152 list2str(List<Integer> list)153 public static String list2str(List<Integer> list) { 154 StringBuilder str = new StringBuilder(); 155 for (int i = 0; i < list.size(); i++) { 156 int code = list.get(i); 157 158 str.append(ascii.charAt(code)); 159 } 160 return str.toString(); 161 } 162 str2list(String str)163 public static List<Integer> str2list(String str) { 164 List<Integer> list = new ArrayList<Integer>(); 165 for (int i = 0; i < str.length(); i++) { 166 char ch = str.charAt(i); 167 int code = ascii.indexOf(ch); 168 169 list.add(code); 170 } 171 return list; 172 } 173 base88EncodeList(List<Interval> intervalList)174 public static String base88EncodeList(List<Interval> intervalList) { 175 List<List<Integer>> pairs = getValueTypePairsFromStrRangeList(intervalList); 176 String encoded = encodeValueTypePairs2Base88(pairs); 177 178 return encoded; 179 } 180 base88DecodeList(String base88String)181 public static List<Interval> base88DecodeList(String base88String) { 182 List<List<Integer>> pairs = decodeBase88ToValueTypePairs(base88String); 183 List<Interval> decoded = getStrRangeListFromValueTypePairs(pairs); 184 185 return decoded; 186 } 187 188 // end of compression methods 189 190 // Value Type pairs -- Str Range List getValueTypePairsFromStrRangeList(List<Interval> ilist)191 public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) { 192 List<List<Integer>> result = new ArrayList<List<Integer>>(); 193 int lastCode = 0; 194 195 for (int i = 0; i < ilist.size(); i++) { 196 int value = 0; 197 int first = ilist.get(i).first; 198 int last = ilist.get(i).last; 199 200 if (lastCode < first) { 201 addPair(result, first - lastCode - 1, 0); 202 } else if (lastCode > first) { 203 addPair(result, lastCode - first - 1, 1); 204 } else if (lastCode == first) { 205 System.out.println("I am not expecting two contiguous chars to be the same"); 206 } 207 lastCode = first; 208 209 if (first < last) { 210 value = last - first - 1; 211 212 // range is big and spit it 213 int rangesize = 0x3c8; // 968 = 88 * 88 / 8 214 while (value >= rangesize) { 215 216 addPair(result, rangesize - 1, 2); // rangesize chars - 0..(rangesize - 1) 217 value -= rangesize; // rangesize chars are already added above 218 lastCode += rangesize; 219 } 220 addPair(result, value, 2); 221 lastCode = last; 222 } 223 } 224 return result; 225 } 226 getStrRangeListFromValueTypePairs(List<List<Integer>> pairs)227 public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) { 228 ArrayList<Interval> result = new ArrayList<Interval>(); 229 230 int lastCode = 0; 231 for (int i = 0; i < pairs.size(); i++) { 232 List<Integer> pair = pairs.get(i); 233 234 int value = pair.get(0); 235 int type = pair.get(1); 236 237 if (type == 0) { 238 lastCode += value + 1; 239 addInterval(result, lastCode, lastCode); 240 } else if (type == 1) { 241 lastCode -= value + 1; 242 addInterval(result, lastCode, lastCode); 243 } else if (type == 2) { 244 int first = lastCode + 1; 245 int last = first + value; 246 addInterval(result, first, last); 247 lastCode += value + 1; 248 } 249 } 250 return result; 251 } 252 addInterval(List<Interval> list, int first, int last)253 public static void addInterval(List<Interval> list, int first, int last) { 254 Interval i = new Interval(first, last); 255 list.add(i); 256 } 257 258 // Str Range List -- Range Str 259 getStrRangeListFromRangeStr(String str)260 public static List<Interval> getStrRangeListFromRangeStr(String str) { 261 ArrayList<Interval> result = new ArrayList<Interval>(); 262 final UCharacterIterator it = UCharacterIterator.getInstance(str); 263 264 int first; 265 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 266 int last = it.nextCodePoint(); 267 addInterval(result, first, last); 268 } 269 return result; 270 } 271 272 // 273 // To String methods 274 // strRangeList2string(List<Interval> ilist)275 public static String strRangeList2string(List<Interval> ilist) { 276 277 StringBuilder sbuild = new StringBuilder(); 278 for (int i = 0; i < ilist.size(); i++) { 279 int first = ilist.get(i).first; 280 int last = ilist.get(i).last; 281 282 for (int j = first; j <= last; j++) { 283 sbuild.appendCodePoint(j); 284 } 285 } 286 return sbuild.toString(); 287 } 288 rangeString2string(String rstr)289 public static String rangeString2string(String rstr) { 290 291 StringBuilder sbuild = new StringBuilder(); 292 final UCharacterIterator it = UCharacterIterator.getInstance(rstr); 293 294 int first; 295 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 296 int last = it.nextCodePoint(); 297 298 for (int j = first; j <= last; j++) { 299 sbuild.appendCodePoint(j); 300 } 301 } 302 return sbuild.toString(); 303 } 304 305 // 306 // String comparison methods 307 // 308 isStringsEqual(String s1, String s2)309 public static boolean isStringsEqual(String s1, String s2) { 310 311 final UCharacterIterator it1 = UCharacterIterator.getInstance(s1); 312 final UCharacterIterator it2 = UCharacterIterator.getInstance(s2); 313 int c1 = 0; 314 int c2 = 0; 315 int count = 0; 316 while (c1 == c2 && c1 != UCharacterIterator.DONE) { 317 count++; 318 c1 = it1.nextCodePoint(); 319 c2 = it2.nextCodePoint(); 320 321 System.out.print("Comparing c1 = c2 = "); 322 System.out.print(c1); 323 System.out.print((char) c1); 324 System.out.print(" ; count = "); 325 System.out.println(count); 326 } 327 System.out.print(count); 328 System.out.println(" characters compared"); 329 330 if (c1 != c2) { 331 System.out.print("Mismatch at c1 = "); 332 System.out.print(c1); 333 System.out.print(" c2 = "); 334 System.out.println(c2); 335 return false; 336 } 337 return true; 338 } 339 340 // Main 341 main(String[] args)342 public static void main(String[] args) { 343 344 StringBuilder strBuild = new StringBuilder(); 345 try { 346 Scanner sc = new Scanner(new File("/home/cibu/CharData.java"), "UTF-8"); 347 while (sc.hasNext()) { 348 if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) { 349 MatchResult match = sc.match(); 350 String str = match.group(1); 351 str = str.replaceAll("\\\\(.)", "$1"); 352 System.out.println(str); 353 strBuild.append(str); 354 } else { 355 sc.next(); 356 } 357 } 358 sc.close(); 359 } catch (IOException ex) { 360 ex.printStackTrace(); 361 } 362 363 String str = strBuild.toString(); 364 365 if (str.length() == 0) { 366 str = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001"; 367 } 368 369 List<Interval> ilist = getStrRangeListFromRangeStr(str); 370 371 String encodedStr = base88EncodeList(ilist); 372 List<Interval> decodedStrRangeList = base88DecodeList(encodedStr); 373 374 String str1 = rangeString2string(str); 375 String str2 = strRangeList2string(decodedStrRangeList); 376 isStringsEqual(str1, str2); 377 378 try { 379 BufferedWriter out = new BufferedWriter(new FileWriter("/tmp/compressed.txt")); 380 out.write(encodedStr); 381 out.close(); 382 } catch (IOException ex) { 383 ex.printStackTrace(); 384 } 385 } 386 } 387