• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.Iterator;
7 import java.util.List;
8 import java.util.Locale;
9 import java.util.Set;
10 import java.util.TreeSet;
11 
12 import org.unicode.cldr.util.CldrUtility;
13 import org.unicode.cldr.util.CldrUtility.LineHandler;
14 import org.unicode.cldr.util.Counter2;
15 import org.unicode.cldr.util.StandardCodes;
16 
17 import com.ibm.icu.text.ListFormat;
18 import com.ibm.icu.text.NumberFormat;
19 import com.ibm.icu.text.UnicodeSet;
20 import com.ibm.icu.util.ULocale;
21 
22 public class AddPopulationData {
23     static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
24     static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
25 
26     enum WBLine {
27         // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
28 
29         Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017, YR2018, YR2019;
get(String[] pieces)30         String get(String[] pieces) {
31             return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY;
32         }
33     }
34 
35     enum FBLine {
36         Rank, Country, Value, Year;
get(String[] pieces)37         String get(String[] pieces) {
38             return pieces[ordinal()];
39         }
40     }
41 
42     enum FBLiteracy {
43         Rank, Country, Percent;
get(String[] pieces)44         String get(String[] pieces) {
45             return pieces[ordinal()];
46         }
47     }
48 
49     private static final String GCP = "NY.GNP.MKTP.PP.CD";
50     private static final String POP = "SP.POP.TOTL";
51     private static final String EMPTY = "..";
52     private static Counter2<String> worldbank_gdp = new Counter2<>();
53     private static Counter2<String> worldbank_population = new Counter2<>();
54     private static Counter2<String> un_literacy = new Counter2<>();
55 
56     private static Counter2<String> factbook_gdp = new Counter2<>();
57     private static Counter2<String> factbook_population = new Counter2<>();
58     private static Counter2<String> factbook_literacy = new Counter2<>();
59 
60     private static CountryData other = new CountryData();
61 
62     static class CountryData {
63         private static Counter2<String> population = new Counter2<>();
64         private static Counter2<String> gdp = new Counter2<>();
65         private static Counter2<String> literacy = new Counter2<>();
66     }
67 
68     final static Set<String> missing = new TreeSet<String>();
main(String[] args)69     public static void main(String[] args) throws IOException {
70 
71         System.out.println("Code"
72             + "\t" + "Name"
73             + "\t" + "Pop"
74             + "\t" + "GDP-PPP"
75             + "\t" + "UN Literacy");
76 
77         for (String country : StandardCodes.make().getGoodCountries()) {
78             showCountryData(country);
79         }
80         Set<String> outliers = new TreeSet<>();
81         outliers.addAll(factbook_population.keySet());
82         outliers.addAll(worldbank_population.keySet());
83         outliers.addAll(factbook_gdp.keySet());
84         outliers.addAll(worldbank_gdp.keySet());
85         outliers.addAll(un_literacy.keySet());
86         for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
87             if (StandardCodes.isCountry(it.next())) {
88                 it.remove();
89             }
90         }
91         // outliers.remove("AN");
92         if (outliers.size() != 0) {
93             System.out.println("Mistakes: data for non-UN codes");
94             for (String country : outliers) {
95                 showCountryData(country);
96             }
97             throw new IllegalArgumentException("Mistakes: data for non-country codes");
98         }
99         Set<String> altNames = new TreeSet<>();
100         String oldCode = "";
101         for (String display : CountryCodeConverter.names()) {
102             String code = CountryCodeConverter.getCodeFromName(display, true, missing);
103             String icu = ULocale.getDisplayCountry("und-" + code, "en");
104             if (!display.equalsIgnoreCase(icu)) {
105                 altNames.add(code + "\t" + display + "\t" + icu);
106             }
107         }
108         oldCode = "";
109         if (SHOW_ALTERNATE_NAMES) {
110             for (String altName : altNames) {
111                 String[] pieces = altName.split("\t");
112                 String code = pieces[0];
113                 if (code.equals("ZZ")) continue;
114                 if (!code.equals(oldCode)) {
115                     oldCode = code;
116                     System.out.println();
117                 }
118                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
119                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
120                 // "</territory> <!-- " + pieces[2] + " -->");
121             }
122         }
123         if (!missing.isEmpty()) {
124             throw new RuntimeException("Could not load codes for: " +
125                 ListFormat.getInstance(Locale.getDefault()).format(missing));
126         }
127     }
128 
showCountryData(String country)129     private static void showCountryData(String country) {
130         number.setMaximumFractionDigits(0);
131         System.out.println(country
132             + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
133             + "\t" + number.format(getPopulation(country))
134             + "\t" + number.format(getGdp(country))
135             + "\t" + percent.format(getLiteracy(country) / 100));
136     }
137 
getLiteracy(String country)138     public static Double getLiteracy(String country) {
139         return firstNonZero(factbook_literacy.getCount(country),
140             un_literacy.getCount(country),
141             CountryData.literacy.getCount(country));
142     }
143 
getGdp(String country)144     public static Double getGdp(String country) {
145         return firstNonZero(factbook_gdp.getCount(country),
146             worldbank_gdp.getCount(country),
147             CountryData.gdp.getCount(country));
148     }
149 
getPopulation(String country)150     public static Double getPopulation(String country) {
151         return firstNonZero(factbook_population.getCount(country),
152             worldbank_population.getCount(country),
153             CountryData.population.getCount(country));
154     }
155 
firstNonZero(Double... items)156     private static Double firstNonZero(Double... items) {
157         for (Double item : items) {
158             if (item.doubleValue() != 0) {
159                 return item;
160             }
161         }
162         return 0.0;
163     }
164 
splitCommaSeparated(String line)165     static String[] splitCommaSeparated(String line) {
166         // items are separated by ','
167         // each item is of the form abc...
168         // or "..." (required if a comma or quote is contained)
169         // " in a field is represented by ""
170         List<String> result = new ArrayList<>();
171         StringBuilder item = new StringBuilder();
172         boolean inQuote = false;
173         for (int i = 0; i < line.length(); ++i) {
174             char ch = line.charAt(i); // don't worry about supplementaries
175             switch (ch) {
176             case '"':
177                 inQuote = !inQuote;
178                 // at start or end, that's enough
179                 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
180                 if (inQuote && item.length() != 0) {
181                     item.append('"');
182                     inQuote = true;
183                 }
184                 break;
185             case ',':
186                 if (!inQuote) {
187                     result.add(item.toString());
188                     item.setLength(0);
189                 } else {
190                     item.append(ch);
191                 }
192                 break;
193             default:
194                 item.append(ch);
195                 break;
196             }
197         }
198         result.add(item.toString());
199         return result.toArray(new String[result.size()]);
200     }
201 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)202     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
203         CldrUtility.handleFile(filename, new LineHandler() {
204             @Override
205             public boolean handle(String line) {
206                 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
207                     || line.startsWith(" This file")) {
208                     return false;
209                 }
210                 String[] pieces = line.split("\\s{2,}");
211                 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true, missing);
212                 if (code == null) {
213                     return false;
214                 }
215                 if (!StandardCodes.isCountry(code)) {
216                     if (ADD_POP) {
217                         System.out.println("Skipping factbook info for: " + code);
218                     }
219                     return false;
220                 }
221                 code = code.toUpperCase(Locale.ENGLISH);
222                 String valueString = FBLine.Value.get(pieces).trim();
223                 if (valueString.startsWith("$")) {
224                     valueString = valueString.substring(1);
225                 }
226                 valueString = valueString.replace(",", "");
227                 double value = Double.parseDouble(valueString.trim());
228                 factbookGdp.add(code, value);
229                 if (ADD_POP) {
230                     System.out.println("Factbook gdp:\t" + code + "\t" + value);
231                 }
232                 return true;
233             }
234         });
235     }
236 
237     static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
238     static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
239     static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
240 
241     static class MyLineHandler implements LineHandler {
242         CountryData countryData;
243 
MyLineHandler(CountryData countryData)244         public MyLineHandler(CountryData countryData) {
245             super();
246             this.countryData = countryData;
247         }
248 
249         @Override
handle(String line)250         public boolean handle(String line) throws ParseException {
251             if (line.startsWith("#")) return true;
252             if (line.length() == 0) {
253                 return true;
254             }
255             String[] pieces = line.split(";");
256             final String code = pieces[0].trim();
257             if (code.equals("Code")) {
258                 return false;
259             }
260             // Code;Name;Type;Data;Source
261             final String typeString = pieces[2].trim();
262             final String data = pieces[3].trim();
263             if (typeString.equals("gdp-ppp")) {
264                 if (StandardCodes.isCountry(data)) {
265                     Double otherPop = getPopulation(data);
266                     Double otherGdp = getGdp(data);
267                     Double myPop = getPopulation(code);
268                     if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
269                         otherPop = getPopulation(data);
270                         otherGdp = getPopulation(data);
271                         myPop = getPopulation(code);
272                         throw new IllegalArgumentException("Zero population");
273                     }
274                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
275                 } else {
276                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
277                 }
278             } else if (typeString.equals("population")) {
279                 if (StandardCodes.isCountry(data)) {
280                     throw new IllegalArgumentException("Population can't use other country's");
281                 }
282                 CountryData.population.add(code, number.parse(data).doubleValue());
283             } else if (typeString.equals("literacy")) {
284                 if (StandardCodes.isCountry(data)) {
285                     Double otherPop = getLiteracy(data);
286                     CountryData.literacy.add(code, otherPop);
287                 } else {
288                     CountryData.literacy.add(code, number.parse(data).doubleValue());
289                 }
290             } else {
291                 throw new IllegalArgumentException("Illegal type");
292             }
293             return true;
294         }
295     }
296 
297     static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();
298 
loadFactbookLiteracy()299     private static void loadFactbookLiteracy() throws IOException {
300         final String filename = "external/factbook_literacy.txt";
301         CldrUtility.handleFile(filename, new LineHandler() {
302             @Override
303             public boolean handle(String line) {
304                 String[] pieces = line.split("\\t");
305                 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true, missing);
306                 if (code == null) {
307                     return false;
308                 }
309                 if (!StandardCodes.isCountry(code)) {
310                     if (ADD_POP) {
311                         System.out.println("Skipping factbook literacy for: " + code);
312                     }
313                     return false;
314                 }
315                 code = code.toUpperCase(Locale.ENGLISH);
316                 String valueString = FBLiteracy.Percent.get(pieces).trim();
317                 double percent = Double.parseDouble(valueString);
318                 factbook_literacy.put(code, percent);
319                 if (ADD_POP) {
320                     System.out.println("Factbook literacy:\t" + code + "\t" + percent);
321                 }
322                 code = null;
323                 return true;
324             }
325         });
326     }
327 
loadWorldBankInfo()328     private static void loadWorldBankInfo() throws IOException {
329         final String filename = "external/world_bank_data.csv";
330 
331         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
332 
333         CldrUtility.handleFile(filename, new LineHandler() {
334             @Override
335             public boolean handle(String line) {
336                 if (line.contains("Series Code")) {
337                     return false;
338                 }
339                 String[] pieces = splitCommaSeparated(line);
340 
341                 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
342 
343                 final String seriesCode = WBLine.Series_Code.get(pieces);
344 
345                 String last = null;
346                 for (WBLine i : WBLine.values()) {
347                     if (i.compareTo(WBLine.YR2000) >= 0) {
348                         String current = i.get(pieces);
349                         if (current.length() != 0 && !current.equals(EMPTY)) {
350                             last = current;
351                         }
352                     }
353                 }
354                 if (last == null) {
355                     return false;
356                 }
357                 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces), true, missing);
358                 if (country == null) {
359                     return false;
360                 }
361                 if (!StandardCodes.isCountry(country)) {
362                     if (ADD_POP) {
363                         System.out.println("Skipping worldbank info for: " + country);
364                     }
365                     return false;
366                 }
367                 double value;
368                 try {
369                     value = Double.parseDouble(last);
370                 } catch (NumberFormatException e) {
371                     throw new IllegalArgumentException("File changed format: need to modify code");
372                 }
373                 if (seriesCode.equals(GCP)) {
374                     worldbank_gdp.add(country, value);
375                 } else if (seriesCode.equals(POP)) {
376                     worldbank_population.add(country, value);
377                 } else {
378                     throw new IllegalArgumentException();
379                 }
380                 return true;
381             }
382         });
383     }
384 
loadUnLiteracy()385     private static void loadUnLiteracy() throws IOException {
386         CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
387             @Override
388             public boolean handle(String line) {
389                 // Afghanistan,2000, ,28,43,13,,34,51,18
390                 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,,"         Youth (15-24) literacy rate",,,,
391                 // ,,,Total,Men,Women,,Total,Men,Women
392                 // "Albania",2008,,96,,97,,95,,99,,99,,99
393                 String[] pieces = splitCommaSeparated(line);
394                 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
395                     return false;
396                 }
397                 String code = CountryCodeConverter.getCodeFromName(pieces[0], true, missing);
398                 if (code == null) {
399                     return false;
400                 }
401                 if (!StandardCodes.isCountry(code)) {
402                     if (ADD_POP) {
403                         System.out.println("Skipping UN info for: " + code);
404                     }
405                     return false;
406                 }
407                 String totalLiteracy = pieces[3];
408                 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
409                     return true;
410                 }
411                 double percent = Double.parseDouble(totalLiteracy);
412                 un_literacy.add(code, percent);
413                 return true;
414             }
415         });
416     }
417 
418     static {
419         try {
loadFactbookLiteracy()420             loadFactbookLiteracy();
loadUnLiteracy()421             loadUnLiteracy();
422 
423             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
424             loadFactbookInfo("external/factbook_population.txt", factbook_population);
425             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
426 
loadWorldBankInfo()427             loadWorldBankInfo();
428             StandardCodes sc = StandardCodes.make();
429             StringBuilder myErrors = new StringBuilder();
430             for (String territory : sc.getGoodAvailableCodes("territory")) {
431                 if (!StandardCodes.isCountry(territory)) {
432                     continue;
433                 }
434                 double gdp = getGdp(territory);
435                 double literacy = getLiteracy(territory);
436                 double population = getPopulation(territory);
437                 if (gdp == 0) {
438                     // AX;Aland Islands;population;26,200;www.aland.ax
439                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
440                 }
441                 if (literacy == 0) {
442                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
443                 }
444                 if (population == 0) {
445                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
446                         + ";population;0;reason");
447                 }
448             }
449             if (myErrors.length() != 0) {
450                 throw new IllegalArgumentException(
451                     "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
452                         + myErrors);
453             }
454         } catch (IOException e) {
455         }
456     }
457 }
458