• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.util.ArrayList;
6 import java.util.HashMap;
7 import java.util.Iterator;
8 import java.util.List;
9 import java.util.Locale;
10 import java.util.Set;
11 import java.util.TreeSet;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14 
15 import org.unicode.cldr.util.CldrUtility;
16 import org.unicode.cldr.util.CldrUtility.LineHandler;
17 import org.unicode.cldr.util.Counter2;
18 import org.unicode.cldr.util.Pair;
19 import org.unicode.cldr.util.StandardCodes;
20 
21 import com.ibm.icu.text.ListFormat;
22 import com.ibm.icu.text.NumberFormat;
23 import com.ibm.icu.text.UnicodeSet;
24 import com.ibm.icu.util.ULocale;
25 
26 public class AddPopulationData {
27     static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
28     static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
29 
30     enum WBLine {
31         // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
32 
33         // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
34         Country_Name, Country_Code, Series_Name, Series_Code,
35         Year("(\\d+)\\s*\\[YR(\\d+)\\]");
36 
37         final Pattern pattern;
WBLine()38         WBLine() {
39             this.pattern = Pattern.compile(name().replaceAll("_", " "));
40         }
WBLine(final String regex)41         WBLine(final String regex) {
42             this.pattern = Pattern.compile(regex);
43         }
44 
match(String str)45         Matcher match(String str) {
46             // Skip BOM
47             if (str.startsWith("\uFEFF")) {
48                 str = str.substring("\uFEFF".length());
49             }
50             return this.pattern.matcher(str);
51         }
52 
find(final String str)53         static Pair<WBLine, Integer> find(final String str) {
54             for (WBLine i : values()) {
55                 final Matcher m = i.match(str);
56                 if (m.matches()) {
57                     Integer val = 0;
58                     if (m.groupCount() > 0) {
59                         val = Integer.parseInt(m.group(1));
60                     }
61                     return Pair.of(i, val);
62                 }
63             }
64             return null;
65         }
66 
parseHeader(final String[] pieces)67         static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) {
68             ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
69             columnToTypeAndValue = new ArrayList<>();
70             for (int i=0; i<pieces.length; i++) {
71                 columnToTypeAndValue.add(i,WBLine.find(pieces[i]));
72             }
73             return columnToTypeAndValue;
74         }
75     }
76 
77     enum FBLine {
78         Rank, Country, Value, Year;
get(String[] pieces)79         String get(String[] pieces) {
80             return pieces[ordinal()];
81         }
82     }
83 
84     enum FBLiteracy {
85         Rank, Country, Percent;
get(String[] pieces)86         String get(String[] pieces) {
87             return pieces[ordinal()];
88         }
89     }
90 
91     private static final String GCP = "NY.GNP.MKTP.PP.CD";
92     private static final String POP = "SP.POP.TOTL";
93     private static final String EMPTY = "..";
94     private static Counter2<String> worldbank_gdp = new Counter2<>();
95     private static Counter2<String> worldbank_population = new Counter2<>();
96     private static Counter2<String> un_literacy = new Counter2<>();
97 
98     private static Counter2<String> factbook_gdp = new Counter2<>();
99     private static Counter2<String> factbook_population = new Counter2<>();
100     private static Counter2<String> factbook_literacy = new Counter2<>();
101 
102     private static CountryData other = new CountryData();
103 
104     static class CountryData {
105         private static Counter2<String> population = new Counter2<>();
106         private static Counter2<String> gdp = new Counter2<>();
107         private static Counter2<String> literacy = new Counter2<>();
108     }
109 
110     final static Set<String> missing = new TreeSet<String>();
main(String[] args)111     public static void main(String[] args) throws IOException {
112 
113         System.out.println("Code"
114             + "\t" + "Name"
115             + "\t" + "Pop"
116             + "\t" + "GDP-PPP"
117             + "\t" + "UN Literacy");
118 
119         for (String country : StandardCodes.make().getGoodCountries()) {
120             showCountryData(country);
121         }
122         Set<String> outliers = new TreeSet<>();
123         outliers.addAll(factbook_population.keySet());
124         outliers.addAll(worldbank_population.keySet());
125         outliers.addAll(factbook_gdp.keySet());
126         outliers.addAll(worldbank_gdp.keySet());
127         outliers.addAll(un_literacy.keySet());
128         for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
129             if (StandardCodes.isCountry(it.next())) {
130                 it.remove();
131             }
132         }
133         // outliers.remove("AN");
134         if (outliers.size() != 0) {
135             System.out.println("Mistakes: data for non-UN codes");
136             for (String country : outliers) {
137                 showCountryData(country);
138             }
139             throw new IllegalArgumentException("Mistakes: data for non-country codes");
140         }
141         Set<String> altNames = new TreeSet<>();
142         String oldCode = "";
143         for (String display : CountryCodeConverter.names()) {
144             String code = CountryCodeConverter.getCodeFromName(display, true, missing);
145             String icu = ULocale.getDisplayCountry("und-" + code, "en");
146             if (!display.equalsIgnoreCase(icu)) {
147                 altNames.add(code + "\t" + display + "\t" + icu);
148             }
149         }
150         oldCode = "";
151         if (SHOW_ALTERNATE_NAMES) {
152             for (String altName : altNames) {
153                 String[] pieces = altName.split("\t");
154                 String code = pieces[0];
155                 if (code.equals("ZZ")) continue;
156                 if (!code.equals(oldCode)) {
157                     oldCode = code;
158                     System.out.println();
159                 }
160                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
161                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
162                 // "</territory> <!-- " + pieces[2] + " -->");
163             }
164         }
165         if (!missing.isEmpty()) {
166             throw new RuntimeException("Could not load codes for: " +
167                 ListFormat.getInstance(Locale.getDefault()).format(missing));
168         }
169     }
170 
showCountryData(String country)171     private static void showCountryData(String country) {
172         number.setMaximumFractionDigits(0);
173         System.out.println(country
174             + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
175             + "\t" + number.format(getPopulation(country))
176             + "\t" + number.format(getGdp(country))
177             + "\t" + percent.format(getLiteracy(country) / 100));
178     }
179 
getLiteracy(String country)180     public static Double getLiteracy(String country) {
181         return firstNonZero(factbook_literacy.getCount(country),
182             un_literacy.getCount(country),
183             CountryData.literacy.getCount(country));
184     }
185 
getGdp(String country)186     public static Double getGdp(String country) {
187         return firstNonZero(factbook_gdp.getCount(country),
188             worldbank_gdp.getCount(country),
189             CountryData.gdp.getCount(country));
190     }
191 
getPopulation(String country)192     public static Double getPopulation(String country) {
193         return firstNonZero(factbook_population.getCount(country),
194             worldbank_population.getCount(country),
195             CountryData.population.getCount(country));
196     }
197 
firstNonZero(Double... items)198     private static Double firstNonZero(Double... items) {
199         for (Double item : items) {
200             if (item.doubleValue() != 0) {
201                 return item;
202             }
203         }
204         return 0.0;
205     }
206 
splitCommaSeparated(String line)207     static String[] splitCommaSeparated(String line) {
208         // items are separated by ','
209         // each item is of the form abc...
210         // or "..." (required if a comma or quote is contained)
211         // " in a field is represented by ""
212         List<String> result = new ArrayList<>();
213         StringBuilder item = new StringBuilder();
214         boolean inQuote = false;
215         for (int i = 0; i < line.length(); ++i) {
216             char ch = line.charAt(i); // don't worry about supplementaries
217             switch (ch) {
218             case '"':
219                 inQuote = !inQuote;
220                 // at start or end, that's enough
221                 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
222                 if (inQuote && item.length() != 0) {
223                     item.append('"');
224                     inQuote = true;
225                 }
226                 break;
227             case ',':
228                 if (!inQuote) {
229                     result.add(item.toString());
230                     item.setLength(0);
231                 } else {
232                     item.append(ch);
233                 }
234                 break;
235             default:
236                 item.append(ch);
237                 break;
238             }
239         }
240         result.add(item.toString());
241         return result.toArray(new String[result.size()]);
242     }
243 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)244     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
245         CldrUtility.handleFile(filename, new LineHandler() {
246             @Override
247             public boolean handle(String line) {
248                 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
249                     || line.startsWith(" This file")) {
250                     return false;
251                 }
252                 String[] pieces = line.split("\\s{2,}");
253                 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true, missing);
254                 if (code == null) {
255                     return false;
256                 }
257                 if (!StandardCodes.isCountry(code)) {
258                     if (ADD_POP) {
259                         System.out.println("Skipping factbook info for: " + code);
260                     }
261                     return false;
262                 }
263                 code = code.toUpperCase(Locale.ENGLISH);
264                 String valueString = FBLine.Value.get(pieces).trim();
265                 if (valueString.startsWith("$")) {
266                     valueString = valueString.substring(1);
267                 }
268                 valueString = valueString.replace(",", "");
269                 double value = Double.parseDouble(valueString.trim());
270                 factbookGdp.add(code, value);
271                 if (ADD_POP) {
272                     System.out.println("Factbook gdp:\t" + code + "\t" + value);
273                 }
274                 return true;
275             }
276         });
277     }
278 
279     static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
280     static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
281     static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
282 
283     static class MyLineHandler implements LineHandler {
284         CountryData countryData;
285 
MyLineHandler(CountryData countryData)286         public MyLineHandler(CountryData countryData) {
287             super();
288             this.countryData = countryData;
289         }
290 
291         @Override
handle(String line)292         public boolean handle(String line) throws ParseException {
293             if (line.startsWith("#")) return true;
294             if (line.length() == 0) {
295                 return true;
296             }
297             String[] pieces = line.split(";");
298             final String code = pieces[0].trim();
299             if (code.equals("Code")) {
300                 return false;
301             }
302             // Code;Name;Type;Data;Source
303             final String typeString = pieces[2].trim();
304             final String data = pieces[3].trim();
305             if (typeString.equals("gdp-ppp")) {
306                 if (StandardCodes.isCountry(data)) {
307                     Double otherPop = getPopulation(data);
308                     Double otherGdp = getGdp(data);
309                     Double myPop = getPopulation(code);
310                     if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
311                         otherPop = getPopulation(data);
312                         otherGdp = getPopulation(data);
313                         myPop = getPopulation(code);
314                         throw new IllegalArgumentException("Zero population");
315                     }
316                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
317                 } else {
318                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
319                 }
320             } else if (typeString.equals("population")) {
321                 if (StandardCodes.isCountry(data)) {
322                     throw new IllegalArgumentException("Population can't use other country's");
323                 }
324                 CountryData.population.add(code, number.parse(data).doubleValue());
325             } else if (typeString.equals("literacy")) {
326                 if (StandardCodes.isCountry(data)) {
327                     Double otherPop = getLiteracy(data);
328                     CountryData.literacy.add(code, otherPop);
329                 } else {
330                     CountryData.literacy.add(code, number.parse(data).doubleValue());
331                 }
332             } else {
333                 throw new IllegalArgumentException("Illegal type");
334             }
335             return true;
336         }
337     }
338 
339     static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();
340 
loadFactbookLiteracy()341     private static void loadFactbookLiteracy() throws IOException {
342         final String filename = "external/factbook_literacy.txt";
343         CldrUtility.handleFile(filename, new LineHandler() {
344             @Override
345             public boolean handle(String line) {
346                 String[] pieces = line.split("\\t");
347                 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true, missing);
348                 if (code == null) {
349                     return false;
350                 }
351                 if (!StandardCodes.isCountry(code)) {
352                     if (ADD_POP) {
353                         System.out.println("Skipping factbook literacy for: " + code);
354                     }
355                     return false;
356                 }
357                 code = code.toUpperCase(Locale.ENGLISH);
358                 String valueString = FBLiteracy.Percent.get(pieces).trim();
359                 double percent = Double.parseDouble(valueString);
360                 factbook_literacy.put(code, percent);
361                 if (ADD_POP) {
362                     System.out.println("Factbook literacy:\t" + code + "\t" + percent);
363                 }
364                 code = null;
365                 return true;
366             }
367         });
368     }
369 
loadWorldBankInfo()370     private static void loadWorldBankInfo() throws IOException {
371         final String filename = "external/world_bank_data.csv";
372 
373         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
374 
375         CldrUtility.handleFile(filename, new LineHandler() {
376             ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
377 
378             @Override
379             public boolean handle(String line) {
380                 String[] pieces = splitCommaSeparated(line);
381                 if (columnToTypeAndValue == null) {
382                     columnToTypeAndValue = WBLine.parseHeader(pieces);
383                     return false;
384                 }
385 
386                 final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>();
387                 for (int i=0; i<pieces.length; i++) {
388                     lineAsHash.put(columnToTypeAndValue.get(i), pieces[i]);
389                 }
390                 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
391                 final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0));
392 
393                 // find the last year
394                 String last = null;
395 
396                 for (int n=0; n<columnToTypeAndValue.size(); n++) {
397                     // assume the years are in ascending order
398                     Pair<WBLine, Integer> i = columnToTypeAndValue.get(n);
399                     if (i.getFirst() == WBLine.Year) {
400                         String current = pieces[n];
401                         if (current.length() != 0 && !current.equals(EMPTY)) {
402                             last = current;
403                         }
404                     }
405                 }
406                 if (last == null) {
407                     return false;
408                 }
409                 final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0));
410                 String country = CountryCodeConverter.getCodeFromName(countryName, true, missing);
411                 if (country == null) {
412                     return false;
413                 }
414                 if (!StandardCodes.isCountry(country)) {
415                     if (ADD_POP) {
416                         System.out.println("Skipping worldbank info for: " + country);
417                     }
418                     return false;
419                 }
420                 double value;
421                 try {
422                     value = Double.parseDouble(last);
423                 } catch (NumberFormatException e) {
424                     throw new IllegalArgumentException("File changed format: need to modify code");
425                 }
426                 if (seriesCode.equals(GCP)) {
427                     worldbank_gdp.add(country, value);
428                 } else if (seriesCode.equals(POP)) {
429                     worldbank_population.add(country, value);
430                 } else {
431                     throw new IllegalArgumentException();
432                 }
433                 return true;
434             }
435         });
436     }
437 
loadUnLiteracy()438     private static void loadUnLiteracy() throws IOException {
439         CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
440             @Override
441             public boolean handle(String line) {
442                 // Afghanistan,2000, ,28,43,13,,34,51,18
443                 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,,"         Youth (15-24) literacy rate",,,,
444                 // ,,,Total,Men,Women,,Total,Men,Women
445                 // "Albania",2008,,96,,97,,95,,99,,99,,99
446                 String[] pieces = splitCommaSeparated(line);
447                 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
448                     return false;
449                 }
450                 String code = CountryCodeConverter.getCodeFromName(pieces[0], true, missing);
451                 if (code == null) {
452                     return false;
453                 }
454                 if (!StandardCodes.isCountry(code)) {
455                     if (ADD_POP) {
456                         System.out.println("Skipping UN info for: " + code);
457                     }
458                     return false;
459                 }
460                 String totalLiteracy = pieces[3];
461                 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
462                     return true;
463                 }
464                 double percent = Double.parseDouble(totalLiteracy);
465                 un_literacy.add(code, percent);
466                 return true;
467             }
468         });
469     }
470 
471     static {
472         try {
loadFactbookLiteracy()473             loadFactbookLiteracy();
loadUnLiteracy()474             loadUnLiteracy();
475 
476             loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
477             loadFactbookInfo("external/factbook_population.txt", factbook_population);
478             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
479 
loadWorldBankInfo()480             loadWorldBankInfo();
481             StandardCodes sc = StandardCodes.make();
482             StringBuilder myErrors = new StringBuilder();
483             for (String territory : sc.getGoodAvailableCodes("territory")) {
484                 if (!StandardCodes.isCountry(territory)) {
485                     continue;
486                 }
487                 double gdp = getGdp(territory);
488                 double literacy = getLiteracy(territory);
489                 double population = getPopulation(territory);
490                 if (gdp == 0) {
491                     // AX;Aland Islands;population;26,200;www.aland.ax
492                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
493                 }
494                 if (literacy == 0) {
495                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
496                 }
497                 if (population == 0) {
498                     myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
499                         + ";population;0;reason");
500                 }
501             }
502             if (myErrors.length() != 0) {
503                 throw new IllegalArgumentException(
504                     "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
505                         + myErrors);
506             }
507         } catch (IOException e) {
508         }
509     }
510 }
511