1 package org.unicode.cldr.draft; 2 3 import java.io.BufferedWriter; 4 import java.io.File; 5 import java.io.FileWriter; 6 import java.io.IOException; 7 import java.util.ArrayList; 8 import java.util.List; 9 import java.util.Scanner; 10 import java.util.regex.MatchResult; 11 12 import com.ibm.icu.text.UCharacterIterator; 13 14 /** 15 * Compresses list of Unicode character ranges given as starting and ending char 16 * into a Base88 string. 17 * 18 * Compression usage: 19 * String encodedStr = base88EncodeList(List<Interval>); 20 * 21 * Decompression usage: 22 * List<Interval> decodedStrList = base88DecodeList(encodedStr); 23 * 24 * Interval has two integers - first, last - to represent the range. 25 */ 26 27 public class CharacterListCompressor { 28 29 public static class Interval { 30 int first; 31 int last; 32 Interval(int first, int last)33 public Interval(int first, int last) { 34 this.first = first; 35 this.last = last; 36 } 37 38 @Override toString()39 public String toString() { 40 return "«" + first + "-" + last + "»"; 41 } 42 first()43 public int first() { 44 return first; 45 } 46 last()47 public int last() { 48 return last; 49 } 50 } 51 52 // 53 // Pairs to Base88 methods 54 // 55 unicode2Base88(int code)56 public static List<Integer> unicode2Base88(int code) { 57 58 List<Integer> list = new ArrayList<>(); 59 int rem = code % 88; 60 list.add(rem); 61 code = code / 88; 62 63 if (code != 0) { 64 rem = code % 88; 65 list.add(rem); 66 code = code / 88; 67 } 68 if (code != 0) { 69 rem = code % 88; 70 list.add(rem); 71 code = code / 88; 72 73 rem = code % 88; 74 list.add(rem); 75 code = code / 88; 76 } 77 return list; 78 } 79 byteCount4Base88(int code)80 public static int byteCount4Base88(int code) { 81 int count = 0; 82 code = code / 88; 83 84 if (code != 0) { 85 count = 1; 86 code = code / 88; 87 } 88 if (code != 0) { 89 count = 2; 90 } 91 return count; 92 } 93 compressPair2Base88(List<Integer> pair)94 public static List<Integer> compressPair2Base88(List<Integer> pair) { 95 int value = pair.get(0); 96 int type = pair.get(1); 97 int code = value * 8 + type * 3; 98 code += byteCount4Base88(code); 99 100 return unicode2Base88(code); 101 } 102 decodeBase88ToValueTypePairs(String str)103 public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) { 104 List<Integer> list = str2list(str); 105 106 int metawindowsize = 8; 107 List<List<Integer>> result = new ArrayList<>(); 108 int i = 0; 109 110 while (i < list.size()) { 111 int c = list.get(i); 112 int meta = c % metawindowsize; 113 int type = meta / 3; 114 int leng = (meta % 3) + 1; 115 116 if (leng == 3) { 117 leng++; 118 } 119 120 int value = getCodeFromListAt(list, i, leng) / metawindowsize; 121 addPair(result, value, type); 122 i += leng; 123 } 124 return result; 125 } 126 getCodeFromListAt(List<Integer> list, int start, int leng)127 public static int getCodeFromListAt(List<Integer> list, int start, int leng) { 128 int result = 0; 129 for (int i = 0; i < leng; i++) { 130 int c = list.get(start + i); 131 result += c * Math.pow(88, i); 132 } 133 return result; 134 } 135 addPair(List<List<Integer>> pairs, int value, int type)136 public static void addPair(List<List<Integer>> pairs, int value, int type) { 137 List<Integer> pair = new ArrayList<>(); 138 pair.add(value); 139 pair.add(type); 140 pairs.add(pair); 141 } 142 encodeValueTypePairs2Base88(List<List<Integer>> pairs)143 public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) { 144 List<Integer> result = new ArrayList<>(); 145 for (int i = 0; i < pairs.size(); i++) { 146 List<Integer> pair = pairs.get(i); 147 result.addAll(compressPair2Base88(pair)); 148 } 149 return list2str(result); 150 } 151 152 public final static String ascii = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~"; 153 list2str(List<Integer> list)154 public static String list2str(List<Integer> list) { 155 StringBuilder str = new StringBuilder(); 156 for (int i = 0; i < list.size(); i++) { 157 int code = list.get(i); 158 159 str.append(ascii.charAt(code)); 160 } 161 return str.toString(); 162 } 163 str2list(String str)164 public static List<Integer> str2list(String str) { 165 List<Integer> list = new ArrayList<>(); 166 for (int i = 0; i < str.length(); i++) { 167 char ch = str.charAt(i); 168 int code = ascii.indexOf(ch); 169 170 list.add(code); 171 } 172 return list; 173 } 174 base88EncodeList(List<Interval> intervalList)175 public static String base88EncodeList(List<Interval> intervalList) { 176 List<List<Integer>> pairs = getValueTypePairsFromStrRangeList(intervalList); 177 String encoded = encodeValueTypePairs2Base88(pairs); 178 179 return encoded; 180 } 181 base88DecodeList(String base88String)182 public static List<Interval> base88DecodeList(String base88String) { 183 List<List<Integer>> pairs = decodeBase88ToValueTypePairs(base88String); 184 List<Interval> decoded = getStrRangeListFromValueTypePairs(pairs); 185 186 return decoded; 187 } 188 189 // end of compression methods 190 191 // Value Type pairs -- Str Range List getValueTypePairsFromStrRangeList(List<Interval> ilist)192 public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) { 193 List<List<Integer>> result = new ArrayList<>(); 194 int lastCode = 0; 195 196 for (int i = 0; i < ilist.size(); i++) { 197 int value = 0; 198 int first = ilist.get(i).first; 199 int last = ilist.get(i).last; 200 201 if (lastCode < first) { 202 addPair(result, first - lastCode - 1, 0); 203 } else if (lastCode > first) { 204 addPair(result, lastCode - first - 1, 1); 205 } else if (lastCode == first) { 206 System.out.println("I am not expecting two contiguous chars to be the same"); 207 } 208 lastCode = first; 209 210 if (first < last) { 211 value = last - first - 1; 212 213 // range is big and spit it 214 int rangesize = 0x3c8; // 968 = 88 * 88 / 8 215 while (value >= rangesize) { 216 217 addPair(result, rangesize - 1, 2); // rangesize chars - 0..(rangesize - 1) 218 value -= rangesize; // rangesize chars are already added above 219 lastCode += rangesize; 220 } 221 addPair(result, value, 2); 222 lastCode = last; 223 } 224 } 225 return result; 226 } 227 getStrRangeListFromValueTypePairs(List<List<Integer>> pairs)228 public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) { 229 ArrayList<Interval> result = new ArrayList<>(); 230 231 int lastCode = 0; 232 for (int i = 0; i < pairs.size(); i++) { 233 List<Integer> pair = pairs.get(i); 234 235 int value = pair.get(0); 236 int type = pair.get(1); 237 238 if (type == 0) { 239 lastCode += value + 1; 240 addInterval(result, lastCode, lastCode); 241 } else if (type == 1) { 242 lastCode -= value + 1; 243 addInterval(result, lastCode, lastCode); 244 } else if (type == 2) { 245 int first = lastCode + 1; 246 int last = first + value; 247 addInterval(result, first, last); 248 lastCode += value + 1; 249 } 250 } 251 return result; 252 } 253 addInterval(List<Interval> list, int first, int last)254 public static void addInterval(List<Interval> list, int first, int last) { 255 Interval i = new Interval(first, last); 256 list.add(i); 257 } 258 259 // Str Range List -- Range Str 260 getStrRangeListFromRangeStr(String str)261 public static List<Interval> getStrRangeListFromRangeStr(String str) { 262 ArrayList<Interval> result = new ArrayList<>(); 263 final UCharacterIterator it = UCharacterIterator.getInstance(str); 264 265 int first; 266 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 267 int last = it.nextCodePoint(); 268 addInterval(result, first, last); 269 } 270 return result; 271 } 272 273 // 274 // To String methods 275 // strRangeList2string(List<Interval> ilist)276 public static String strRangeList2string(List<Interval> ilist) { 277 278 StringBuilder sbuild = new StringBuilder(); 279 for (int i = 0; i < ilist.size(); i++) { 280 int first = ilist.get(i).first; 281 int last = ilist.get(i).last; 282 283 for (int j = first; j <= last; j++) { 284 sbuild.appendCodePoint(j); 285 } 286 } 287 return sbuild.toString(); 288 } 289 rangeString2string(String rstr)290 public static String rangeString2string(String rstr) { 291 292 StringBuilder sbuild = new StringBuilder(); 293 final UCharacterIterator it = UCharacterIterator.getInstance(rstr); 294 295 int first; 296 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 297 int last = it.nextCodePoint(); 298 299 for (int j = first; j <= last; j++) { 300 sbuild.appendCodePoint(j); 301 } 302 } 303 return sbuild.toString(); 304 } 305 306 // 307 // String comparison methods 308 // 309 isStringsEqual(String s1, String s2)310 public static boolean isStringsEqual(String s1, String s2) { 311 312 final UCharacterIterator it1 = UCharacterIterator.getInstance(s1); 313 final UCharacterIterator it2 = UCharacterIterator.getInstance(s2); 314 int c1 = 0; 315 int c2 = 0; 316 int count = 0; 317 while (c1 == c2 && c1 != UCharacterIterator.DONE) { 318 count++; 319 c1 = it1.nextCodePoint(); 320 c2 = it2.nextCodePoint(); 321 322 System.out.print("Comparing c1 = c2 = "); 323 System.out.print(c1); 324 System.out.print((char) c1); 325 System.out.print(" ; count = "); 326 System.out.println(count); 327 } 328 System.out.print(count); 329 System.out.println(" characters compared"); 330 331 if (c1 != c2) { 332 System.out.print("Mismatch at c1 = "); 333 System.out.print(c1); 334 System.out.print(" c2 = "); 335 System.out.println(c2); 336 return false; 337 } 338 return true; 339 } 340 341 // Main 342 main(String[] args)343 public static void main(String[] args) { 344 345 StringBuilder strBuild = new StringBuilder(); 346 try { 347 Scanner sc = new Scanner(new File("/home/cibu/CharData.java"), "UTF-8"); 348 while (sc.hasNext()) { 349 if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) { 350 MatchResult match = sc.match(); 351 String str = match.group(1); 352 str = str.replaceAll("\\\\(.)", "$1"); 353 System.out.println(str); 354 strBuild.append(str); 355 } else { 356 sc.next(); 357 } 358 } 359 sc.close(); 360 } catch (IOException ex) { 361 ex.printStackTrace(); 362 } 363 364 String str = strBuild.toString(); 365 366 if (str.length() == 0) { 367 str = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001"; 368 } 369 370 List<Interval> ilist = getStrRangeListFromRangeStr(str); 371 372 String encodedStr = base88EncodeList(ilist); 373 List<Interval> decodedStrRangeList = base88DecodeList(encodedStr); 374 375 String str1 = rangeString2string(str); 376 String str2 = strRangeList2string(decodedStrRangeList); 377 isStringsEqual(str1, str2); 378 379 try { 380 BufferedWriter out = new BufferedWriter(new FileWriter("/tmp/compressed.txt")); 381 out.write(encodedStr); 382 out.close(); 383 } catch (IOException ex) { 384 ex.printStackTrace(); 385 } 386 } 387 } 388