• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import com.ibm.icu.text.UCharacterIterator;
4 import java.io.BufferedWriter;
5 import java.io.File;
6 import java.io.FileWriter;
7 import java.io.IOException;
8 import java.util.ArrayList;
9 import java.util.List;
10 import java.util.Scanner;
11 import java.util.regex.MatchResult;
12 
/**
 * Compresses a list of Unicode code point ranges into a Base88 string and expands it again.
 *
 * <p>Compression usage: {@code String encodedStr = base88EncodeList(intervalList);}
 *
 * <p>Decompression usage: {@code List<Interval> decoded = base88DecodeList(encodedStr);}
 *
 * <p>An {@link Interval} holds two integers - first, last - that represent an inclusive range.
 *
 * <p>Internally the interval list is turned into (value, type) pairs:
 *
 * <ul>
 *   <li>type 0 - jump forward by {@code value + 1} from the previous code point and emit a single
 *       code point;
 *   <li>type 1 - jump backward by {@code value + 1} from the previous code point and emit a
 *       single code point;
 *   <li>type 2 - emit the {@code value + 1} code points that follow the previous code point.
 * </ul>
 *
 * <p>Each pair is packed as {@code value * 8 + type * 3 + lengthMarker} and written as 1, 2 or 4
 * Base88 digits, least significant digit first. The length marker (0, 1 or 2) lets the decoder
 * learn the digit count from the first digit alone, because 88 is a multiple of 8.
 */
public class CharacterListCompressor {

    /** Radix of the textual encoding. */
    private static final int BASE = 88;

    /** Modulus that separates the type/length metadata from the value inside a packed code. */
    private static final int META_WINDOW_SIZE = 8;

    /**
     * Largest number of code points covered by one type-2 pair (968 = 88 * 88 / 8). Keeping the
     * value below this limit keeps a type-2 code within two digits, so its length marker never
     * exceeds 1 and {@code type * 3 + marker} stays below 8.
     */
    private static final int MAX_RANGE_CHUNK = 0x3c8;

    /** End-of-input sentinel used while walking code points. */
    private static final int DONE = -1;

    /** Input file read by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/home/cibu/CharData.java";

    /** Output file written by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/tmp/compressed.txt";

    /** An inclusive range of code points. */
    public static class Interval {
        int first;
        int last;

        /**
         * @param first first code point of the range
         * @param last last code point of the range (inclusive)
         */
        public Interval(int first, int last) {
            this.first = first;
            this.last = last;
        }

        @Override
        public String toString() {
            return "«" + first + "-" + last + "»";
        }

        /** Returns the first code point of the range. */
        public int first() {
            return first;
        }

        /** Returns the last code point of the range. */
        public int last() {
            return last;
        }
    }

    //
    // Pairs to Base88 methods
    //

    /**
     * Splits a packed code into Base88 digits, least significant digit first.
     *
     * @param code the packed code
     * @return a list of 1, 2 or 4 digits; a code that needs three digits is padded to four
     */
    public static List<Integer> unicode2Base88(int code) {
        List<Integer> list = new ArrayList<>();
        list.add(code % BASE);
        code = code / BASE;

        if (code != 0) {
            list.add(code % BASE);
            code = code / BASE;
        }
        if (code != 0) {
            // Three-digit codes are always written with four digits.
            list.add(code % BASE);
            code = code / BASE;

            list.add(code % BASE);
        }
        return list;
    }

    /**
     * Returns the length marker for a packed code.
     *
     * @param code the packed code (before the marker is added)
     * @return 0 when the code fits in one digit, 1 when it fits in two, otherwise 2
     */
    public static int byteCount4Base88(int code) {
        int count = 0;
        code = code / BASE;

        if (code != 0) {
            count = 1;
            code = code / BASE;
        }
        if (code != 0) {
            count = 2;
        }
        return count;
    }

    /**
     * Packs one (value, type) pair and converts it to Base88 digits.
     *
     * @param pair a two element list: value at index 0, type at index 1
     * @return the digits of the packed pair, least significant first
     */
    public static List<Integer> compressPair2Base88(List<Integer> pair) {
        int value = pair.get(0);
        int type = pair.get(1);
        int code = value * META_WINDOW_SIZE + type * 3;
        code += byteCount4Base88(code);

        return unicode2Base88(code);
    }

    /**
     * Decodes a Base88 string into its (value, type) pairs.
     *
     * @param str the encoded string
     * @return the decoded pairs, each a two element list of value and type
     * @throws IllegalArgumentException if {@code str} contains a character outside {@link #ascii}
     *     or ends in the middle of a multi-digit code
     */
    public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) {
        List<Integer> digits = str2list(str);
        List<List<Integer>> result = new ArrayList<>();
        int i = 0;

        while (i < digits.size()) {
            // The first digit alone carries the type and the length marker.
            int meta = digits.get(i) % META_WINDOW_SIZE;
            int type = meta / 3;
            int leng = (meta % 3) + 1;

            if (leng == 3) {
                leng++; // marker 2 means four digits
            }
            if (i + leng > digits.size()) {
                throw new IllegalArgumentException(
                        "Truncated Base88 input: code at index "
                                + i
                                + " needs "
                                + leng
                                + " digits but only "
                                + (digits.size() - i)
                                + " remain");
            }

            int value = getCodeFromListAt(digits, i, leng) / META_WINDOW_SIZE;
            addPair(result, value, type);
            i += leng;
        }
        return result;
    }

    /**
     * Reassembles a packed code from consecutive Base88 digits.
     *
     * @param list the digit list
     * @param start index of the least significant digit
     * @param leng number of digits to read (the decoder passes 1, 2 or 4)
     * @return the packed code
     */
    public static int getCodeFromListAt(List<Integer> list, int start, int leng) {
        int result = 0;
        int weight = 1;
        // Pure integer arithmetic; no detour through floating point.
        for (int i = 0; i < leng; i++) {
            result += list.get(start + i) * weight;
            weight *= BASE;
        }
        return result;
    }

    /**
     * Appends a new (value, type) pair to {@code pairs}.
     *
     * @param pairs the list to append to
     * @param value the pair's value
     * @param type the pair's type
     */
    public static void addPair(List<List<Integer>> pairs, int value, int type) {
        List<Integer> pair = new ArrayList<>();
        pair.add(value);
        pair.add(type);
        pairs.add(pair);
    }

    /**
     * Encodes (value, type) pairs as a Base88 string.
     *
     * @param pairs the pairs to encode
     * @return the encoded string
     */
    public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) {
        List<Integer> result = new ArrayList<>();
        for (List<Integer> pair : pairs) {
            result.addAll(compressPair2Base88(pair));
        }
        return list2str(result);
    }

    /** Digit alphabet: the digit with value {@code n} is written as {@code ascii.charAt(n)}. */
    public static final String ascii =
            "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~";

    /**
     * Converts digit values to their characters in {@link #ascii}.
     *
     * @param list the digit values
     * @return the corresponding string
     */
    public static String list2str(List<Integer> list) {
        StringBuilder str = new StringBuilder(list.size());
        for (int code : list) {
            str.append(ascii.charAt(code));
        }
        return str.toString();
    }

    /**
     * Converts the characters of an encoded string to digit values.
     *
     * @param str the encoded string
     * @return the digit value of every character, in order
     * @throws IllegalArgumentException if a character does not occur in {@link #ascii}
     */
    public static List<Integer> str2list(String str) {
        List<Integer> list = new ArrayList<>(str.length());
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            int code = ascii.indexOf(ch);
            if (code < 0) {
                // A negative digit would give the decoder a zero-length code and stall it.
                throw new IllegalArgumentException(
                        "Character '" + ch + "' at index " + i + " is not a Base88 digit");
            }
            list.add(code);
        }
        return list;
    }

    /**
     * Compresses a list of intervals into a Base88 string.
     *
     * @param intervalList the intervals to compress
     * @return the encoded string
     */
    public static String base88EncodeList(List<Interval> intervalList) {
        List<List<Integer>> pairs = getValueTypePairsFromStrRangeList(intervalList);
        return encodeValueTypePairs2Base88(pairs);
    }

    /**
     * Expands a Base88 string into a list of intervals.
     *
     * @param base88String the encoded string
     * @return the decoded intervals
     * @throws IllegalArgumentException if the string is not well-formed Base88 input
     */
    public static List<Interval> base88DecodeList(String base88String) {
        List<List<Integer>> pairs = decodeBase88ToValueTypePairs(base88String);
        return getStrRangeListFromValueTypePairs(pairs);
    }

    // end of compression methods

    // Value Type pairs -- Str Range List

    /**
     * Converts intervals to (value, type) pairs relative to a running "previous code point" that
     * starts at 0.
     *
     * <p>An interval whose start equals the previous code point produces no jump pair; a message
     * is printed to standard output instead.
     *
     * @param ilist the intervals to convert
     * @return the resulting pairs
     */
    public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) {
        List<List<Integer>> result = new ArrayList<>();
        int lastCode = 0;

        for (Interval interval : ilist) {
            int first = interval.first;
            int last = interval.last;

            if (lastCode < first) {
                addPair(result, first - lastCode - 1, 0);
            } else if (lastCode > first) {
                addPair(result, lastCode - first - 1, 1);
            } else {
                System.out.println("I am not expecting two contiguous chars to be the same");
            }
            lastCode = first;

            if (first < last) {
                int value = last - first - 1;

                // Split a big range into chunks of MAX_RANGE_CHUNK code points.
                while (value >= MAX_RANGE_CHUNK) {
                    addPair(result, MAX_RANGE_CHUNK - 1, 2); // covers MAX_RANGE_CHUNK code points
                    value -= MAX_RANGE_CHUNK;
                    lastCode += MAX_RANGE_CHUNK;
                }
                addPair(result, value, 2);
                lastCode = last;
            }
        }
        return result;
    }

    /**
     * Converts (value, type) pairs back to intervals. Pairs with a type other than 0, 1 or 2 are
     * ignored.
     *
     * @param pairs the pairs to convert
     * @return one interval per recognised pair
     */
    public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) {
        List<Interval> result = new ArrayList<>();

        int lastCode = 0;
        for (List<Integer> pair : pairs) {
            int value = pair.get(0);
            int type = pair.get(1);

            if (type == 0) {
                lastCode += value + 1;
                addInterval(result, lastCode, lastCode);
            } else if (type == 1) {
                lastCode -= value + 1;
                addInterval(result, lastCode, lastCode);
            } else if (type == 2) {
                int first = lastCode + 1;
                int last = first + value;
                addInterval(result, first, last);
                lastCode += value + 1;
            }
        }
        return result;
    }

    /**
     * Appends a new interval to {@code list}.
     *
     * @param list the list to append to
     * @param first first code point of the interval
     * @param last last code point of the interval
     */
    public static void addInterval(List<Interval> list, int first, int last) {
        list.add(new Interval(first, last));
    }

    // Str Range List -- Range Str

    /**
     * Parses a range string, in which code points come in (first, last) pairs, into intervals.
     *
     * @param str the range string
     * @return one interval per pair of code points
     * @throws IllegalArgumentException if {@code str} holds an odd number of code points
     */
    public static List<Interval> getStrRangeListFromRangeStr(String str) {
        int[] codePoints = toRangeCodePoints(str);
        List<Interval> result = new ArrayList<>(codePoints.length / 2);

        for (int i = 0; i < codePoints.length; i += 2) {
            addInterval(result, codePoints[i], codePoints[i + 1]);
        }
        return result;
    }

    //
    // To String methods
    //

    /**
     * Expands intervals into a string that contains every code point of every interval.
     *
     * @param ilist the intervals to expand
     * @return the expanded string
     */
    public static String strRangeList2string(List<Interval> ilist) {
        StringBuilder sbuild = new StringBuilder();
        for (Interval interval : ilist) {
            for (int j = interval.first; j <= interval.last; j++) {
                sbuild.appendCodePoint(j);
            }
        }
        return sbuild.toString();
    }

    /**
     * Expands a range string into a string that contains every code point of every range.
     *
     * @param rstr the range string, holding (first, last) pairs of code points
     * @return the expanded string
     * @throws IllegalArgumentException if {@code rstr} holds an odd number of code points
     */
    public static String rangeString2string(String rstr) {
        int[] codePoints = toRangeCodePoints(rstr);
        StringBuilder sbuild = new StringBuilder();

        for (int i = 0; i < codePoints.length; i += 2) {
            for (int j = codePoints[i]; j <= codePoints[i + 1]; j++) {
                sbuild.appendCodePoint(j);
            }
        }
        return sbuild.toString();
    }

    //
    // String comparison methods
    //

    /**
     * Compares two strings code point by code point and prints a trace of the comparison to
     * standard output.
     *
     * @param s1 first string
     * @param s2 second string
     * @return {@code true} if both strings hold the same sequence of code points
     */
    public static boolean isStringsEqual(String s1, String s2) {
        int[] cps1 = toCodePoints(s1);
        int[] cps2 = toCodePoints(s2);
        int c1 = 0;
        int c2 = 0;
        int count = 0;
        while (c1 == c2 && c1 != DONE) {
            c1 = codePointAtOrDone(cps1, count);
            c2 = codePointAtOrDone(cps2, count);
            count++;

            System.out.print("Comparing c1 = c2 = ");
            System.out.print(c1);
            System.out.print(displayString(c1));
            System.out.print(" ; count = ");
            System.out.println(count);
        }
        System.out.print(count);
        System.out.println(" characters compared");

        if (c1 != c2) {
            System.out.print("Mismatch at c1 = ");
            System.out.print(c1);
            System.out.print(" c2 = ");
            System.out.println(c2);
            return false;
        }
        return true;
    }

    // Main

    /**
     * Reads range strings from a source file, compresses them, checks that decompression yields
     * the same characters and writes the compressed string to a file.
     *
     * @param args optional: {@code args[0]} is the input file and {@code args[1]} the output
     *     file; the built-in default paths are used for whichever is missing
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        StringBuilder strBuild = new StringBuilder();
        try (Scanner sc = new Scanner(new File(inputPath), "UTF-8")) {
            while (sc.hasNext()) {
                if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) {
                    MatchResult match = sc.match();
                    String str = match.group(1);
                    str = str.replaceAll("\\\\(.)", "$1");
                    System.out.println(str);
                    strBuild.append(str);
                } else {
                    sc.next();
                }
            }
        } catch (IOException ex) {
            // Best effort: without input the built-in sample below is used.
            ex.printStackTrace();
        }

        String str = strBuild.toString();

        if (str.length() == 0) {
            str = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001";
        }

        List<Interval> ilist = getStrRangeListFromRangeStr(str);

        String encodedStr = base88EncodeList(ilist);
        List<Interval> decodedStrRangeList = base88DecodeList(encodedStr);

        String str1 = rangeString2string(str);
        String str2 = strRangeList2string(decodedStrRangeList);
        isStringsEqual(str1, str2);

        try (BufferedWriter out = new BufferedWriter(new FileWriter(outputPath))) {
            out.write(encodedStr);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    // Private helpers

    /** Returns the code points of {@code str}; an unpaired surrogate counts as one code point. */
    private static int[] toCodePoints(String str) {
        int[] codePoints = new int[str.codePointCount(0, str.length())];
        int n = 0;
        for (int i = 0; i < str.length(); ) {
            int cp = str.codePointAt(i);
            codePoints[n++] = cp;
            i += Character.charCount(cp);
        }
        return codePoints;
    }

    /** Returns the code points of a range string, rejecting input without a full last pair. */
    private static int[] toRangeCodePoints(String str) {
        int[] codePoints = toCodePoints(str);
        if (codePoints.length % 2 != 0) {
            throw new IllegalArgumentException(
                    "Range string must hold (first, last) pairs but has "
                            + codePoints.length
                            + " code points");
        }
        return codePoints;
    }

    /** Returns {@code codePoints[index]}, or {@link #DONE} once {@code index} is past the end. */
    private static int codePointAtOrDone(int[] codePoints, int index) {
        return index < codePoints.length ? codePoints[index] : DONE;
    }

    /** Returns the text to print for a traced code point, keeping supplementary ones intact. */
    private static String displayString(int codePoint) {
        if (Character.isValidCodePoint(codePoint)) {
            return new String(Character.toChars(codePoint));
        }
        return String.valueOf((char) codePoint);
    }
}
385