1 package org.unicode.cldr.draft; 2 3 import com.ibm.icu.text.UCharacterIterator; 4 import java.io.BufferedWriter; 5 import java.io.File; 6 import java.io.FileWriter; 7 import java.io.IOException; 8 import java.util.ArrayList; 9 import java.util.List; 10 import java.util.Scanner; 11 import java.util.regex.MatchResult; 12 13 /** 14 * Compresses list of Unicode character ranges given as starting and ending char into a Base88 15 * string. 16 * 17 * <p>Compression usage: String encodedStr = base88EncodeList(List<Interval>); 18 * 19 * <p>Decompression usage: List<Interval> decodedStrList = base88DecodeList(encodedStr); 20 * 21 * <p>Interval has two integers - first, last - to represent the range. 22 */ 23 public class CharacterListCompressor { 24 25 public static class Interval { 26 int first; 27 int last; 28 Interval(int first, int last)29 public Interval(int first, int last) { 30 this.first = first; 31 this.last = last; 32 } 33 34 @Override toString()35 public String toString() { 36 return "«" + first + "-" + last + "»"; 37 } 38 first()39 public int first() { 40 return first; 41 } 42 last()43 public int last() { 44 return last; 45 } 46 } 47 48 // 49 // Pairs to Base88 methods 50 // 51 unicode2Base88(int code)52 public static List<Integer> unicode2Base88(int code) { 53 54 List<Integer> list = new ArrayList<>(); 55 int rem = code % 88; 56 list.add(rem); 57 code = code / 88; 58 59 if (code != 0) { 60 rem = code % 88; 61 list.add(rem); 62 code = code / 88; 63 } 64 if (code != 0) { 65 rem = code % 88; 66 list.add(rem); 67 code = code / 88; 68 69 rem = code % 88; 70 list.add(rem); 71 code = code / 88; 72 } 73 return list; 74 } 75 byteCount4Base88(int code)76 public static int byteCount4Base88(int code) { 77 int count = 0; 78 code = code / 88; 79 80 if (code != 0) { 81 count = 1; 82 code = code / 88; 83 } 84 if (code != 0) { 85 count = 2; 86 } 87 return count; 88 } 89 compressPair2Base88(List<Integer> pair)90 public static List<Integer> compressPair2Base88(List<Integer> pair) { 91 int value = pair.get(0); 92 int type = pair.get(1); 93 int code = value * 8 + type * 3; 94 code += byteCount4Base88(code); 95 96 return unicode2Base88(code); 97 } 98 decodeBase88ToValueTypePairs(String str)99 public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) { 100 List<Integer> list = str2list(str); 101 102 int metawindowsize = 8; 103 List<List<Integer>> result = new ArrayList<>(); 104 int i = 0; 105 106 while (i < list.size()) { 107 int c = list.get(i); 108 int meta = c % metawindowsize; 109 int type = meta / 3; 110 int leng = (meta % 3) + 1; 111 112 if (leng == 3) { 113 leng++; 114 } 115 116 int value = getCodeFromListAt(list, i, leng) / metawindowsize; 117 addPair(result, value, type); 118 i += leng; 119 } 120 return result; 121 } 122 getCodeFromListAt(List<Integer> list, int start, int leng)123 public static int getCodeFromListAt(List<Integer> list, int start, int leng) { 124 int result = 0; 125 for (int i = 0; i < leng; i++) { 126 int c = list.get(start + i); 127 result += c * Math.pow(88, i); // TODO: implict narrowing of double to int. 128 } 129 return result; 130 } 131 addPair(List<List<Integer>> pairs, int value, int type)132 public static void addPair(List<List<Integer>> pairs, int value, int type) { 133 List<Integer> pair = new ArrayList<>(); 134 pair.add(value); 135 pair.add(type); 136 pairs.add(pair); 137 } 138 encodeValueTypePairs2Base88(List<List<Integer>> pairs)139 public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) { 140 List<Integer> result = new ArrayList<>(); 141 for (int i = 0; i < pairs.size(); i++) { 142 List<Integer> pair = pairs.get(i); 143 result.addAll(compressPair2Base88(pair)); 144 } 145 return list2str(result); 146 } 147 148 public static final String ascii = 149 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~"; 150 list2str(List<Integer> list)151 public static String list2str(List<Integer> list) { 152 StringBuilder str = new StringBuilder(); 153 for (int i = 0; i < list.size(); i++) { 154 int code = list.get(i); 155 156 str.append(ascii.charAt(code)); 157 } 158 return str.toString(); 159 } 160 str2list(String str)161 public static List<Integer> str2list(String str) { 162 List<Integer> list = new ArrayList<>(); 163 for (int i = 0; i < str.length(); i++) { 164 char ch = str.charAt(i); 165 int code = ascii.indexOf(ch); 166 167 list.add(code); 168 } 169 return list; 170 } 171 base88EncodeList(List<Interval> intervalList)172 public static String base88EncodeList(List<Interval> intervalList) { 173 List<List<Integer>> pairs = getValueTypePairsFromStrRangeList(intervalList); 174 String encoded = encodeValueTypePairs2Base88(pairs); 175 176 return encoded; 177 } 178 base88DecodeList(String base88String)179 public static List<Interval> base88DecodeList(String base88String) { 180 List<List<Integer>> pairs = decodeBase88ToValueTypePairs(base88String); 181 List<Interval> decoded = getStrRangeListFromValueTypePairs(pairs); 182 183 return decoded; 184 } 185 186 // end of compression methods 187 188 // Value Type pairs -- Str Range List getValueTypePairsFromStrRangeList(List<Interval> ilist)189 public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) { 190 List<List<Integer>> result = new ArrayList<>(); 191 int lastCode = 0; 192 193 for (int i = 0; i < ilist.size(); i++) { 194 int value = 0; 195 int first = ilist.get(i).first; 196 int last = ilist.get(i).last; 197 198 if (lastCode < first) { 199 addPair(result, first - lastCode - 1, 0); 200 } else if (lastCode > first) { 201 addPair(result, lastCode - first - 1, 1); 202 } else if (lastCode == first) { 203 System.out.println("I am not expecting two contiguous chars to be the same"); 204 } 205 lastCode = first; 206 207 if (first < last) { 208 value = last - first - 1; 209 210 // range is big and spit it 211 int rangesize = 0x3c8; // 968 = 88 * 88 / 8 212 while (value >= rangesize) { 213 214 addPair(result, rangesize - 1, 2); // rangesize chars - 0..(rangesize - 1) 215 value -= rangesize; // rangesize chars are already added above 216 lastCode += rangesize; 217 } 218 addPair(result, value, 2); 219 lastCode = last; 220 } 221 } 222 return result; 223 } 224 getStrRangeListFromValueTypePairs(List<List<Integer>> pairs)225 public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) { 226 ArrayList<Interval> result = new ArrayList<>(); 227 228 int lastCode = 0; 229 for (int i = 0; i < pairs.size(); i++) { 230 List<Integer> pair = pairs.get(i); 231 232 int value = pair.get(0); 233 int type = pair.get(1); 234 235 if (type == 0) { 236 lastCode += value + 1; 237 addInterval(result, lastCode, lastCode); 238 } else if (type == 1) { 239 lastCode -= value + 1; 240 addInterval(result, lastCode, lastCode); 241 } else if (type == 2) { 242 int first = lastCode + 1; 243 int last = first + value; 244 addInterval(result, first, last); 245 lastCode += value + 1; 246 } 247 } 248 return result; 249 } 250 addInterval(List<Interval> list, int first, int last)251 public static void addInterval(List<Interval> list, int first, int last) { 252 Interval i = new Interval(first, last); 253 list.add(i); 254 } 255 256 // Str Range List -- Range Str 257 getStrRangeListFromRangeStr(String str)258 public static List<Interval> getStrRangeListFromRangeStr(String str) { 259 ArrayList<Interval> result = new ArrayList<>(); 260 final UCharacterIterator it = UCharacterIterator.getInstance(str); 261 262 int first; 263 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 264 int last = it.nextCodePoint(); 265 addInterval(result, first, last); 266 } 267 return result; 268 } 269 270 // 271 // To String methods 272 // strRangeList2string(List<Interval> ilist)273 public static String strRangeList2string(List<Interval> ilist) { 274 275 StringBuilder sbuild = new StringBuilder(); 276 for (int i = 0; i < ilist.size(); i++) { 277 int first = ilist.get(i).first; 278 int last = ilist.get(i).last; 279 280 for (int j = first; j <= last; j++) { 281 sbuild.appendCodePoint(j); 282 } 283 } 284 return sbuild.toString(); 285 } 286 rangeString2string(String rstr)287 public static String rangeString2string(String rstr) { 288 289 StringBuilder sbuild = new StringBuilder(); 290 final UCharacterIterator it = UCharacterIterator.getInstance(rstr); 291 292 int first; 293 while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) { 294 int last = it.nextCodePoint(); 295 296 for (int j = first; j <= last; j++) { 297 sbuild.appendCodePoint(j); 298 } 299 } 300 return sbuild.toString(); 301 } 302 303 // 304 // String comparison methods 305 // 306 isStringsEqual(String s1, String s2)307 public static boolean isStringsEqual(String s1, String s2) { 308 309 final UCharacterIterator it1 = UCharacterIterator.getInstance(s1); 310 final UCharacterIterator it2 = UCharacterIterator.getInstance(s2); 311 int c1 = 0; 312 int c2 = 0; 313 int count = 0; 314 while (c1 == c2 && c1 != UCharacterIterator.DONE) { 315 count++; 316 c1 = it1.nextCodePoint(); 317 c2 = it2.nextCodePoint(); 318 319 System.out.print("Comparing c1 = c2 = "); 320 System.out.print(c1); 321 System.out.print((char) c1); 322 System.out.print(" ; count = "); 323 System.out.println(count); 324 } 325 System.out.print(count); 326 System.out.println(" characters compared"); 327 328 if (c1 != c2) { 329 System.out.print("Mismatch at c1 = "); 330 System.out.print(c1); 331 System.out.print(" c2 = "); 332 System.out.println(c2); 333 return false; 334 } 335 return true; 336 } 337 338 // Main 339 main(String[] args)340 public static void main(String[] args) { 341 342 StringBuilder strBuild = new StringBuilder(); 343 try { 344 Scanner sc = new Scanner(new File("/home/cibu/CharData.java"), "UTF-8"); 345 while (sc.hasNext()) { 346 if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) { 347 MatchResult match = sc.match(); 348 String str = match.group(1); 349 str = str.replaceAll("\\\\(.)", "$1"); 350 System.out.println(str); 351 strBuild.append(str); 352 } else { 353 sc.next(); 354 } 355 } 356 sc.close(); 357 } catch (IOException ex) { 358 ex.printStackTrace(); 359 } 360 361 String str = strBuild.toString(); 362 363 if (str.length() == 0) { 364 str = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001"; 365 } 366 367 List<Interval> ilist = getStrRangeListFromRangeStr(str); 368 369 String encodedStr = base88EncodeList(ilist); 370 List<Interval> decodedStrRangeList = base88DecodeList(encodedStr); 371 372 String str1 = rangeString2string(str); 373 String str2 = strRangeList2string(decodedStrRangeList); 374 isStringsEqual(str1, str2); 375 376 try { 377 BufferedWriter out = new BufferedWriter(new FileWriter("/tmp/compressed.txt")); 378 out.write(encodedStr); 379 out.close(); 380 } catch (IOException ex) { 381 ex.printStackTrace(); 382 } 383 } 384 } 385