• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import java.io.BufferedWriter;
4 import java.io.File;
5 import java.io.FileWriter;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Scanner;
10 import java.util.regex.MatchResult;
11 
12 import com.ibm.icu.text.UCharacterIterator;
13 
/**
 * Compresses a list of Unicode code point ranges, each given as a starting and
 * an ending code point, into a Base88 string, and expands such a string back
 * into ranges.
 *
 * <p>Compression usage:
 * <pre>String encodedStr = base88EncodeList(List&lt;Interval&gt;);</pre>
 *
 * <p>Decompression usage:
 * <pre>List&lt;Interval&gt; decodedStrList = base88DecodeList(encodedStr);</pre>
 *
 * <p>Interval has two integers - first, last - to represent the range.
 *
 * <p>Encoding overview, as implemented below. The interval list is first
 * turned into (value, type) pairs relative to a running "last code point"
 * that starts at 0:
 * <ul>
 *   <li>type 0: move forward by {@code value + 1} and emit that single code point;</li>
 *   <li>type 1: move backward by {@code value + 1} and emit that single code point;</li>
 *   <li>type 2: emit the {@code value + 1} code points following the last one.</li>
 * </ul>
 * Each pair becomes the number {@code value * 8 + type * 3 + lengthClass},
 * written least significant digit first in base 88 using 1, 2 or 4 digits
 * (length class 0, 1 or 2). Because 88 is a multiple of 8, the low three bits
 * of the first digit are enough for the decoder to recover type and length.
 *
 * <p>Decoding expands the same set of code points, but not necessarily the
 * same interval boundaries: a range {@code first..last} comes back as the
 * single code point {@code first} followed by one or more ranges.
 */
public class CharacterListCompressor {

    /** Radix of the textual encoding. */
    private static final int BASE = 88;

    /** Size of the tag (type + length class) stored in the low bits of a pair's code. */
    private static final int META_WINDOW = 8;

    /** Multiplier placing the pair type inside the tag; the length class is added on top. */
    private static final int TYPE_STRIDE = 3;

    private static final int TYPE_FORWARD = 0;
    private static final int TYPE_BACKWARD = 1;
    private static final int TYPE_RANGE = 2;

    /**
     * Largest number of code points one range pair may cover: 968 = 88 * 88 / 8.
     * A range pair's tag is 6 plus its length class, so only length classes 0
     * and 1 fit below 8; keeping the value under 968 keeps the code under
     * 88 * 88, i.e. within two digits.
     */
    private static final int MAX_RANGE_CHUNK = 0x3c8;

    /** Sentinel returned by the internal code point reader at end of string. */
    private static final int DONE = -1;

    private static final String DEFAULT_INPUT_PATH = "/home/cibu/CharData.java";
    private static final String DEFAULT_OUTPUT_PATH = "/tmp/compressed.txt";

    /** Range string used by {@link #main} when no data could be read from the input file. */
    private static final String FALLBACK_RANGE_STRING = "\uDBFF\uDC00\uDBFF\uDFFD\u0001\u0001";

    /** An inclusive range of code points. */
    public static class Interval {
        int first;
        int last;

        public Interval(int first, int last) {
            this.first = first;
            this.last = last;
        }

        @Override
        public String toString() {
            // U+00AB and U+00BB written as escapes so the source is encoding independent.
            return "\u00AB" + first + "-" + last + "\u00BB";
        }

        public int first() {
            return first;
        }

        public int last() {
            return last;
        }
    }

    //
    // Pairs to Base88 methods
    //

    /**
     * Splits {@code code} into base-88 digits, least significant first.
     * Exactly 1, 2 or 4 digits are produced, matching the length class
     * reported by {@link #byteCount4Base88(int)}; a three-digit form does not
     * exist, so such values are padded with a zero fourth digit.
     *
     * @param code the number to split
     * @return the digits, least significant first
     */
    public static List<Integer> unicode2Base88(int code) {
        int digitCount = digitsForLengthClass(byteCount4Base88(code));
        List<Integer> digits = new ArrayList<>();
        for (int i = 0; i < digitCount; i++) {
            digits.add(code % BASE);
            code /= BASE;
        }
        return digits;
    }

    /**
     * Returns the length class of {@code code}: 0 when it fits in one base-88
     * digit, 1 when it needs two, 2 otherwise (encoded with four digits).
     *
     * @param code the number to classify
     * @return 0, 1 or 2
     */
    public static int byteCount4Base88(int code) {
        int afterOneDigit = code / BASE;
        if (afterOneDigit == 0) {
            return 0;
        }
        return (afterOneDigit / BASE == 0) ? 1 : 2;
    }

    /** Maps a length class (0, 1, 2) to the number of digits used (1, 2, 4). */
    private static int digitsForLengthClass(int lengthClass) {
        return lengthClass == 2 ? 4 : lengthClass + 1;
    }

    /**
     * Encodes one (value, type) pair as base-88 digits.
     *
     * @param pair a two element list: value at index 0, type (0, 1 or 2) at index 1
     * @return the digits of {@code value * 8 + type * 3 + lengthClass}, least significant first
     */
    public static List<Integer> compressPair2Base88(List<Integer> pair) {
        int value = pair.get(0);
        int type = pair.get(1);
        int code = value * META_WINDOW + type * TYPE_STRIDE;
        // The length class is computed before it is added; adding at most 2 to a
        // multiple-of-8-plus-tag value never changes the number of digits needed.
        code += byteCount4Base88(code);

        return unicode2Base88(code);
    }

    /**
     * Decodes a Base88 string into the (value, type) pairs it was built from.
     *
     * @param str the encoded string
     * @return the decoded pairs, each a two element list (value, type)
     * @throws IllegalArgumentException if {@code str} contains a character that
     *         is not part of {@link #ascii}
     * @throws IndexOutOfBoundsException if the string ends in the middle of a pair
     */
    public static List<List<Integer>> decodeBase88ToValueTypePairs(String str) {
        List<Integer> digits = str2list(str);

        // An unknown character maps to -1, which would yield a pair length of 0
        // and therefore a loop that never advances. Reject it up front.
        for (int pos = 0; pos < digits.size(); pos++) {
            if (digits.get(pos) < 0) {
                throw new IllegalArgumentException(
                    "Invalid Base88 character '" + str.charAt(pos) + "' at index " + pos);
            }
        }

        List<List<Integer>> result = new ArrayList<>();
        int pos = 0;
        while (pos < digits.size()) {
            // Only the first digit is needed to read the tag, since 88 % 8 == 0.
            int meta = digits.get(pos) % META_WINDOW;
            int type = meta / TYPE_STRIDE;
            int length = digitsForLengthClass(meta % TYPE_STRIDE);

            int value = getCodeFromListAt(digits, pos, length) / META_WINDOW;
            addPair(result, value, type);
            pos += length;
        }
        return result;
    }

    /**
     * Reads {@code leng} base-88 digits, least significant first, starting at
     * {@code start} and returns the number they represent. The result is only
     * meaningful while it fits in an {@code int}; the format uses at most four
     * digits.
     *
     * @param list the digit list
     * @param start index of the least significant digit
     * @param leng number of digits to read
     * @return the combined value
     * @throws IndexOutOfBoundsException if the list has fewer than {@code start + leng} digits
     */
    public static int getCodeFromListAt(List<Integer> list, int start, int leng) {
        int result = 0;
        int weight = 1;
        for (int i = 0; i < leng; i++) {
            result += list.get(start + i) * weight;
            weight *= BASE;
        }
        return result;
    }

    /**
     * Appends the pair (value, type) to {@code pairs} as a new two element list.
     */
    public static void addPair(List<List<Integer>> pairs, int value, int type) {
        List<Integer> pair = new ArrayList<>();
        pair.add(value);
        pair.add(type);
        pairs.add(pair);
    }

    /**
     * Encodes every pair with {@link #compressPair2Base88(List)} and renders
     * the concatenated digits as a string.
     *
     * @param pairs (value, type) pairs in order
     * @return the Base88 string
     */
    public static String encodeValueTypePairs2Base88(List<List<Integer>> pairs) {
        List<Integer> digits = new ArrayList<>();
        for (List<Integer> pair : pairs) {
            digits.addAll(compressPair2Base88(pair));
        }
        return list2str(digits);
    }

    /** Digit alphabet: the character at index {@code d} represents the digit {@code d}. */
    public final static String ascii = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%()*+,-.:;<=>?@[]^_`{|}~";

    /**
     * Renders digits as characters of {@link #ascii}.
     *
     * @param list digit values, each a valid index into {@link #ascii}
     * @return one character per digit
     * @throws IndexOutOfBoundsException if a digit is not a valid index
     */
    public static String list2str(List<Integer> list) {
        StringBuilder str = new StringBuilder(list.size());
        for (int digit : list) {
            str.append(ascii.charAt(digit));
        }
        return str.toString();
    }

    /**
     * Converts each character of {@code str} to its index in {@link #ascii}.
     *
     * @param str the text to convert
     * @return one entry per character; -1 for a character that is not in the alphabet
     */
    public static List<Integer> str2list(String str) {
        List<Integer> list = new ArrayList<>(str.length());
        for (int i = 0; i < str.length(); i++) {
            list.add(ascii.indexOf(str.charAt(i)));
        }
        return list;
    }

    /**
     * Compresses a list of intervals into a Base88 string.
     *
     * @param intervalList the intervals to encode
     * @return the encoded string
     */
    public static String base88EncodeList(List<Interval> intervalList) {
        return encodeValueTypePairs2Base88(getValueTypePairsFromStrRangeList(intervalList));
    }

    /**
     * Expands a Base88 string produced by {@link #base88EncodeList(List)}.
     *
     * @param base88String the encoded string
     * @return intervals covering the encoded code points, in encoding order
     * @throws IllegalArgumentException if the string contains a character
     *         outside {@link #ascii}
     */
    public static List<Interval> base88DecodeList(String base88String) {
        return getStrRangeListFromValueTypePairs(decodeBase88ToValueTypePairs(base88String));
    }

    // end of compression methods

    // Value Type pairs -- Str Range List

    /**
     * Converts intervals into (value, type) pairs relative to a running last
     * code point that starts at 0.
     *
     * <p>For each interval the start is emitted as a forward (type 0) or
     * backward (type 1) jump; if the start equals the running last code point
     * no jump is emitted and a note is printed to {@code System.out}. When
     * {@code first < last}, the remaining {@code last - first} code points are
     * emitted as one or more range pairs (type 2).
     *
     * @param ilist the intervals to convert
     * @return the pairs in order
     */
    public static List<List<Integer>> getValueTypePairsFromStrRangeList(List<Interval> ilist) {
        List<List<Integer>> result = new ArrayList<>();
        int lastCode = 0;

        for (Interval interval : ilist) {
            int first = interval.first;
            int last = interval.last;

            if (lastCode < first) {
                addPair(result, first - lastCode - 1, TYPE_FORWARD);
            } else if (lastCode > first) {
                addPair(result, lastCode - first - 1, TYPE_BACKWARD);
            } else {
                System.out.println("I am not expecting two contiguous chars to be the same");
            }
            lastCode = first;

            if (first < last) {
                // A range pair with value v covers v + 1 code points.
                int value = last - first - 1;

                // Split ranges that are too big for a single range pair.
                while (value >= MAX_RANGE_CHUNK) {
                    addPair(result, MAX_RANGE_CHUNK - 1, TYPE_RANGE);
                    value -= MAX_RANGE_CHUNK;
                }
                addPair(result, value, TYPE_RANGE);
                lastCode = last;
            }
        }
        return result;
    }

    /**
     * Replays (value, type) pairs into intervals; the inverse of
     * {@link #getValueTypePairsFromStrRangeList(List)} up to interval
     * boundaries. Pairs whose type is not 0, 1 or 2 are ignored.
     *
     * @param pairs the pairs to replay
     * @return one interval per recognized pair
     */
    public static List<Interval> getStrRangeListFromValueTypePairs(List<List<Integer>> pairs) {
        List<Interval> result = new ArrayList<>();

        int lastCode = 0;
        for (List<Integer> pair : pairs) {
            int value = pair.get(0);
            int type = pair.get(1);

            switch (type) {
            case TYPE_FORWARD:
                lastCode += value + 1;
                addInterval(result, lastCode, lastCode);
                break;
            case TYPE_BACKWARD:
                lastCode -= value + 1;
                addInterval(result, lastCode, lastCode);
                break;
            case TYPE_RANGE:
                addInterval(result, lastCode + 1, lastCode + 1 + value);
                lastCode += value + 1;
                break;
            default:
                break;
            }
        }
        return result;
    }

    /**
     * Appends a new {@code Interval(first, last)} to {@code list}.
     */
    public static void addInterval(List<Interval> list, int first, int last) {
        list.add(new Interval(first, last));
    }

    // Str Range List -- Range Str

    /**
     * Parses a range string, in which consecutive code points form
     * (first, last) pairs, into intervals. If the string holds an odd number
     * of code points, the trailing one is treated as a one-character range.
     *
     * @param str the range string
     * @return the intervals in order of appearance
     */
    public static List<Interval> getStrRangeListFromRangeStr(String str) {
        List<Interval> result = new ArrayList<>();
        int length = str.length();
        int pos = 0;

        while (pos < length) {
            int first = str.codePointAt(pos);
            pos += Character.charCount(first);

            int last = first;
            if (pos < length) {
                last = str.codePointAt(pos);
                pos += Character.charCount(last);
            }
            addInterval(result, first, last);
        }
        return result;
    }

    //
    // To String methods
    //

    /**
     * Expands intervals into a string containing every code point of every
     * interval, in order. An interval whose {@code last} is below its
     * {@code first} contributes nothing.
     *
     * @param ilist the intervals to expand
     * @return the expanded string
     */
    public static String strRangeList2string(List<Interval> ilist) {
        StringBuilder sbuild = new StringBuilder();
        for (Interval interval : ilist) {
            for (int cp = interval.first; cp <= interval.last; cp++) {
                sbuild.appendCodePoint(cp);
            }
        }
        return sbuild.toString();
    }

    /**
     * Expands a range string directly into the string of all code points it
     * covers; parsing follows {@link #getStrRangeListFromRangeStr(String)}.
     *
     * @param rstr the range string
     * @return the expanded string
     */
    public static String rangeString2string(String rstr) {
        return strRangeList2string(getStrRangeListFromRangeStr(rstr));
    }

    //
    // String comparison methods
    //

    /** Returns the code point of {@code s} at {@code index}, or {@link #DONE} past the end. */
    private static int codePointOrDone(String s, int index) {
        return index < s.length() ? s.codePointAt(index) : DONE;
    }

    /**
     * Compares two strings code point by code point, printing a trace of the
     * comparison and any mismatch to {@code System.out}.
     *
     * @param s1 first string
     * @param s2 second string
     * @return {@code true} if both strings hold the same code point sequence
     */
    public static boolean isStringsEqual(String s1, String s2) {
        int pos1 = 0;
        int pos2 = 0;
        int c1 = 0;
        int c2 = 0;
        int count = 0;
        while (c1 == c2 && c1 != DONE) {
            count++;
            c1 = codePointOrDone(s1, pos1);
            if (c1 != DONE) {
                pos1 += Character.charCount(c1);
            }
            c2 = codePointOrDone(s2, pos2);
            if (c2 != DONE) {
                pos2 += Character.charCount(c2);
            }

            System.out.print("Comparing c1 = c2 = ");
            System.out.print(c1);
            System.out.print((char) c1);
            System.out.print(" ; count = ");
            System.out.println(count);
        }
        System.out.print(count);
        System.out.println(" characters compared");

        if (c1 != c2) {
            System.out.print("Mismatch at c1 = ");
            System.out.print(c1);
            System.out.print(" c2 = ");
            System.out.println(c2);
            return false;
        }
        return true;
    }

    // Main

    /**
     * Collects the range strings found in the file at {@code path}: every
     * match of the data-line pattern contributes its quoted group with
     * backslash escapes removed. Each collected piece is echoed to
     * {@code System.out}. If the file cannot be opened, the stack trace is
     * printed and an empty string is returned.
     */
    private static String readRangeString(String path) {
        StringBuilder collected = new StringBuilder();
        try (Scanner sc = new Scanner(new File(path), "UTF-8")) {
            while (sc.hasNext()) {
                if (sc.findInLine("\\/\\*.*,\"(.*)\"},\\s*") != null) {
                    MatchResult match = sc.match();
                    String str = match.group(1);
                    str = str.replaceAll("\\\\(.)", "$1");
                    System.out.println(str);
                    collected.append(str);
                } else {
                    sc.next();
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return collected.toString();
    }

    /**
     * Reads range data, runs an encode/decode round trip, reports whether the
     * expansion survived it, and writes the encoded string to a file.
     *
     * @param args optional: {@code args[0]} is the input file (default
     *        {@code /home/cibu/CharData.java}), {@code args[1]} the output
     *        file (default {@code /tmp/compressed.txt})
     */
    public static void main(String[] args) {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        String str = readRangeString(inputPath);
        if (str.length() == 0) {
            str = FALLBACK_RANGE_STRING;
        }

        List<Interval> ilist = getStrRangeListFromRangeStr(str);

        String encodedStr = base88EncodeList(ilist);
        List<Interval> decodedStrRangeList = base88DecodeList(encodedStr);

        String str1 = rangeString2string(str);
        String str2 = strRangeList2string(decodedStrRangeList);
        isStringsEqual(str1, str2);

        // try-with-resources closes the writer even when write() fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(outputPath))) {
            out.write(encodedStr);
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
388