• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import java.io.BufferedWriter;
4 import java.io.File;
5 import java.io.FileWriter;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Scanner;
10 import java.util.regex.MatchResult;
11 
12 import com.ibm.icu.text.UCharacterIterator;
13 
14 /**
 * Compresses a list of Unicode character ranges, each given as a starting and
 * ending code point, into a Base88 string.
17  *
18  * Compression usage:
19  * String encodedStr = base88EncodeList(List<Interval>);
20  *
21  * Decompression usage:
22  * List<Interval> decodedStrList = base88DecodeList(encodedStr);
23  *
24  * Interval has two integers - first, last - to represent the range.
25  */
26 
27 public class CharacterListCompressor {
28 
29     public static class Interval {
30         int first;
31         int last;
32 
Interval(int first, int last)33         public Interval(int first, int last) {
34             this.first = first;
35             this.last = last;
36         }
37 
toString()38         public String toString() {
39             return "«" + first + "-" + last + "»";
40         }
41 
first()42         public int first() {
43             return first;
44         }
45 
last()46         public int last() {
47             return last;
48         }
49     }
50 
51     //
52     // Pairs to Base88 methods
53     //
54 
unicode2Base88(int code)55     public static List<Integer> unicode2Base88(int code) {
56 
57         List<Integer> list = new ArrayList<Integer>();
58         int rem = code % 88;
59         list.add(rem);
60         code = code / 88;
61 
62         if (code != 0) {
63             rem = code % 88;
64             list.add(rem);
65             code = code / 88;
66         }
67         if (code != 0) {
68             rem = code % 88;
69             list.add(rem);
70             code = code / 88;
71 
72             rem = code % 88;
73             list.add(rem);
74             code = code / 88;
75         }
76         return list;
77     }
78 
byteCount4Base88(int code)79     public static int byteCount4Base88(int code) {
80         int count = 0;
81         code = code / 88;
82 
83         if (code != 0) {
84             count = 1;
85             code = code / 88;
86         }
87         if (code != 0) {
88             count = 2;
89         }
90         return count;
91     }
92 
compressPair2Base88(List<Integer> pair)93     public static List<Integer> compressPair2Base88(List<Integer> pair) {
94         int value = pair.get(0);
95         int type = pair.get(1);
96         int code = value * 8 + type * 3;
97         code += byteCount4Base88(code);
98 
99         return unicode2Base88(code);
100     }
101 
decodeBase88ToValueTypePairs(String str)102     public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) {
103         List<Integer> list = str2list(str);
104 
105         int metawindowsize = 8;
106         List<List<Integer>> result = new ArrayList<List<Integer>>();
107         int i = 0;
108 
109         while (i < list.size()) {
110             int c = list.get(i);
111             int meta = c % metawindowsize;
112             int type = meta / 3;
113             int leng = (meta % 3) + 1;
114 
115             if (leng == 3) {
116                 leng++;
117             }
118 
119             int value = getCodeFromListAt(list, i, leng) / metawindowsize;
120             addPair(result, value, type);
121             i += leng;
122         }
123         return result;
124     }
125 
getCodeFromListAt(List<Integer> list, int start, int leng)126     public static int getCodeFromListAt(List<Integer> list, int start, int leng) {
127         int result = 0;
128         for (int i = 0; i < leng; i++) {
129             int c = list.get(start + i);
130             result += c * Math.pow(88, i);
131         }
132         return result;
133     }
134 
addPair(List<List<Integer>> pairs, int value, int type)135     public static void addPair(List<List<Integer>> pairs, int value, int type) {
136         List<Integer> pair = new ArrayList<Integer>();
137         pair.add(value);
138         pair.add(type);
139         pairs.add(pair);
140     }
141 
encodeValueTypePairs2Base88(List<List<Integer>> pairs)142     public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) {
143         List<Integer> result = new ArrayList<Integer>();
144         for (int i = 0; i < pairs.size(); i++) {
145             List<Integer> pair = pairs.get(i);
146             result.addAll(compressPair2Base88(pair));
147         }
148         return list2str(result);
149     }
150 
151     public final static String ascii = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~";
152 
list2str(List<Integer> list)153     public static String list2str(List<Integer> list) {
154         StringBuilder str = new StringBuilder();
155         for (int i = 0; i < list.size(); i++) {
156             int code = list.get(i);
157 
158             str.append(ascii.charAt(code));
159         }
160         return str.toString();
161     }
162 
str2list(String str)163     public static List<Integer> str2list(String str) {
164         List<Integer> list = new ArrayList<Integer>();
165         for (int i = 0; i < str.length(); i++) {
166             char ch = str.charAt(i);
167             int code = ascii.indexOf(ch);
168 
169             list.add(code);
170         }
171         return list;
172     }
173 
base88EncodeList(List<Interval> intervalList)174     public static String base88EncodeList(List<Interval> intervalList) {
175         List<List<Integer>> pairs = getValueTypePairsFromStrRangeList(intervalList);
176         String encoded = encodeValueTypePairs2Base88(pairs);
177 
178         return encoded;
179     }
180 
base88DecodeList(String base88String)181     public static List<Interval> base88DecodeList(String base88String) {
182         List<List<Integer>> pairs = decodeBase88ToValueTypePairs(base88String);
183         List<Interval> decoded = getStrRangeListFromValueTypePairs(pairs);
184 
185         return decoded;
186     }
187 
188     // end of compression methods
189 
190     // Value Type pairs -- Str Range List
getValueTypePairsFromStrRangeList(List<Interval> ilist)191     public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) {
192         List<List<Integer>> result = new ArrayList<List<Integer>>();
193         int lastCode = 0;
194 
195         for (int i = 0; i < ilist.size(); i++) {
196             int value = 0;
197             int first = ilist.get(i).first;
198             int last = ilist.get(i).last;
199 
200             if (lastCode < first) {
201                 addPair(result, first - lastCode - 1, 0);
202             } else if (lastCode > first) {
203                 addPair(result, lastCode - first - 1, 1);
204             } else if (lastCode == first) {
205                 System.out.println("I am not expecting two contiguous chars to be the same");
206             }
207             lastCode = first;
208 
209             if (first < last) {
210                 value = last - first - 1;
211 
212                 // range is big and spit it
213                 int rangesize = 0x3c8; // 968 = 88 * 88 / 8
214                 while (value >= rangesize) {
215 
216                     addPair(result, rangesize - 1, 2); // rangesize chars - 0..(rangesize - 1)
217                     value -= rangesize; // rangesize chars are already added above
218                     lastCode += rangesize;
219                 }
220                 addPair(result, value, 2);
221                 lastCode = last;
222             }
223         }
224         return result;
225     }
226 
getStrRangeListFromValueTypePairs(List<List<Integer>> pairs)227     public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) {
228         ArrayList<Interval> result = new ArrayList<Interval>();
229 
230         int lastCode = 0;
231         for (int i = 0; i < pairs.size(); i++) {
232             List<Integer> pair = pairs.get(i);
233 
234             int value = pair.get(0);
235             int type = pair.get(1);
236 
237             if (type == 0) {
238                 lastCode += value + 1;
239                 addInterval(result, lastCode, lastCode);
240             } else if (type == 1) {
241                 lastCode -= value + 1;
242                 addInterval(result, lastCode, lastCode);
243             } else if (type == 2) {
244                 int first = lastCode + 1;
245                 int last = first + value;
246                 addInterval(result, first, last);
247                 lastCode += value + 1;
248             }
249         }
250         return result;
251     }
252 
addInterval(List<Interval> list, int first, int last)253     public static void addInterval(List<Interval> list, int first, int last) {
254         Interval i = new Interval(first, last);
255         list.add(i);
256     }
257 
258     // Str Range List -- Range Str
259 
getStrRangeListFromRangeStr(String str)260     public static List<Interval> getStrRangeListFromRangeStr(String str) {
261         ArrayList<Interval> result = new ArrayList<Interval>();
262         final UCharacterIterator it = UCharacterIterator.getInstance(str);
263 
264         int first;
265         while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) {
266             int last = it.nextCodePoint();
267             addInterval(result, first, last);
268         }
269         return result;
270     }
271 
272     //
273     // To String methods
274     //
strRangeList2string(List<Interval> ilist)275     public static String strRangeList2string(List<Interval> ilist) {
276 
277         StringBuilder sbuild = new StringBuilder();
278         for (int i = 0; i < ilist.size(); i++) {
279             int first = ilist.get(i).first;
280             int last = ilist.get(i).last;
281 
282             for (int j = first; j <= last; j++) {
283                 sbuild.appendCodePoint(j);
284             }
285         }
286         return sbuild.toString();
287     }
288 
rangeString2string(String rstr)289     public static String rangeString2string(String rstr) {
290 
291         StringBuilder sbuild = new StringBuilder();
292         final UCharacterIterator it = UCharacterIterator.getInstance(rstr);
293 
294         int first;
295         while ((first = it.nextCodePoint()) != UCharacterIterator.DONE) {
296             int last = it.nextCodePoint();
297 
298             for (int j = first; j <= last; j++) {
299                 sbuild.appendCodePoint(j);
300             }
301         }
302         return sbuild.toString();
303     }
304 
305     //
306     // String comparison methods
307     //
308 
isStringsEqual(String s1, String s2)309     public static boolean isStringsEqual(String s1, String s2) {
310 
311         final UCharacterIterator it1 = UCharacterIterator.getInstance(s1);
312         final UCharacterIterator it2 = UCharacterIterator.getInstance(s2);
313         int c1 = 0;
314         int c2 = 0;
315         int count = 0;
316         while (c1 == c2 && c1 != UCharacterIterator.DONE) {
317             count++;
318             c1 = it1.nextCodePoint();
319             c2 = it2.nextCodePoint();
320 
321             System.out.print("Comparing c1 = c2 = ");
322             System.out.print(c1);
323             System.out.print((char) c1);
324             System.out.print(" ; count = ");
325             System.out.println(count);
326         }
327         System.out.print(count);
328         System.out.println(" characters compared");
329 
330         if (c1 != c2) {
331             System.out.print("Mismatch at c1 = ");
332             System.out.print(c1);
333             System.out.print(" c2 = ");
334             System.out.println(c2);
335             return false;
336         }
337         return true;
338     }
339 
340     // Main
341 
main(String[] args)342     public static void main(String[] args) {
343 
344         StringBuilder strBuild = new StringBuilder();
345         try {
346             Scanner sc = new Scanner(new File("/home/cibu/CharData.java"), "UTF-8");
347             while (sc.hasNext()) {
348                 if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) {
349                     MatchResult match = sc.match();
350                     String str = match.group(1);
351                     str = str.replaceAll("\\\\(.)", "$1");
352                     System.out.println(str);
353                     strBuild.append(str);
354                 } else {
355                     sc.next();
356                 }
357             }
358             sc.close();
359         } catch (IOException ex) {
360             ex.printStackTrace();
361         }
362 
363         String str = strBuild.toString();
364 
365         if (str.length() == 0) {
366             str = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001";
367         }
368 
369         List<Interval> ilist = getStrRangeListFromRangeStr(str);
370 
371         String encodedStr = base88EncodeList(ilist);
372         List<Interval> decodedStrRangeList = base88DecodeList(encodedStr);
373 
374         String str1 = rangeString2string(str);
375         String str2 = strRangeList2string(decodedStrRangeList);
376         isStringsEqual(str1, str2);
377 
378         try {
379             BufferedWriter out = new BufferedWriter(new FileWriter("/tmp/compressed.txt"));
380             out.write(encodedStr);
381             out.close();
382         } catch (IOException ex) {
383             ex.printStackTrace();
384         }
385     }
386 }
387