• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import com.ibm.icu.text.ListFormat;
4 import com.ibm.icu.text.NumberFormat;
5 import com.ibm.icu.text.UnicodeSet;
6 import com.ibm.icu.util.Output;
7 import com.ibm.icu.util.ULocale;
8 import java.io.IOException;
9 import java.text.ParseException;
10 import java.util.ArrayList;
11 import java.util.HashMap;
12 import java.util.Iterator;
13 import java.util.LinkedList;
14 import java.util.List;
15 import java.util.Locale;
16 import java.util.Map;
17 import java.util.Set;
18 import java.util.TreeSet;
19 import java.util.regex.Matcher;
20 import java.util.regex.Pattern;
21 import org.unicode.cldr.util.CldrUtility;
22 import org.unicode.cldr.util.CldrUtility.LineHandler;
23 import org.unicode.cldr.util.Counter2;
24 import org.unicode.cldr.util.Pair;
25 import org.unicode.cldr.util.StandardCodes;
26 
27 public class AddPopulationData {
28     static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
29     static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
30 
31     enum WBLine {
32         // "Afghanistan","AFG","GNI, PPP (current international
33         // $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",
34 
35         // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002
36         // [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008
37         // [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014
38         // [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020
39         // [YR2020]
40         Country_Name,
41         Country_Code,
42         Series_Name,
43         Series_Code,
44         Year("(\\d+)\\s*\\[YR(\\d+)\\]");
45 
46         final Pattern pattern;
47 
WBLine()48         WBLine() {
49             this.pattern = Pattern.compile(name().replaceAll("_", " "));
50         }
51 
WBLine(final String regex)52         WBLine(final String regex) {
53             this.pattern = Pattern.compile(regex);
54         }
55 
match(String str)56         Matcher match(String str) {
57             // Skip BOM
58             if (str.startsWith("\uFEFF")) {
59                 str = str.substring("\uFEFF".length());
60             }
61             return this.pattern.matcher(str);
62         }
63 
find(final String str)64         static Pair<WBLine, Integer> find(final String str) {
65             for (WBLine i : values()) {
66                 final Matcher m = i.match(str);
67                 if (m.matches()) {
68                     Integer val = 0;
69                     if (m.groupCount() > 0) {
70                         val = Integer.parseInt(m.group(1));
71                     }
72                     return Pair.of(i, val);
73                 }
74             }
75             return null;
76         }
77 
parseHeader(final String[] pieces)78         static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) {
79             ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
80             columnToTypeAndValue = new ArrayList<>();
81             for (int i = 0; i < pieces.length; i++) {
82                 columnToTypeAndValue.add(i, WBLine.find(pieces[i]));
83             }
84             return columnToTypeAndValue;
85         }
86     }
87 
88     enum FactbookLine {
89         CountryName,
90         CountrySlug,
91         Value,
92         DateOfInformation,
93         Ranking,
94         Region;
95 
get(String[] pieces)96         String get(String[] pieces) {
97             return pieces[ordinal()];
98         }
99     }
100 
101     enum FBLiteracy {
102         Rank,
103         Country,
104         Percent;
105 
get(String[] pieces)106         String get(String[] pieces) {
107             return pieces[ordinal()];
108         }
109     }
110 
111     private static final String GCP = "NY.GNP.MKTP.PP.CD";
112     private static final String POP = "SP.POP.TOTL";
113     private static final String EMPTY = "..";
114     private static Counter2<String> worldbank_gdp = new Counter2<>();
115     private static Counter2<String> worldbank_population = new Counter2<>();
116     private static Counter2<String> un_literacy = new Counter2<>();
117 
118     private static Counter2<String> factbook_gdp = new Counter2<>();
119     private static Counter2<String> factbook_population = new Counter2<>();
120     private static Counter2<String> factbook_literacy = new Counter2<>();
121 
122     private static CountryData other = new CountryData();
123 
124     static class CountryData {
125         private static Counter2<String> population = new Counter2<>();
126         private static Counter2<String> gdp = new Counter2<>();
127         private static Counter2<String> literacy = new Counter2<>();
128     }
129 
130     static final Set<String> missing = new TreeSet<String>();
131 
main(String[] args)132     public static void main(String[] args) throws IOException {
133 
134         System.out.println(
135                 "Code" + "\t" + "Name" + "\t" + "Pop" + "\t" + "GDP-PPP" + "\t" + "UN Literacy");
136 
137         for (String country : StandardCodes.make().getGoodCountries()) {
138             showCountryData(country);
139         }
140         Set<String> outliers = new TreeSet<>();
141         outliers.addAll(factbook_population.keySet());
142         outliers.addAll(worldbank_population.keySet());
143         outliers.addAll(factbook_gdp.keySet());
144         outliers.addAll(worldbank_gdp.keySet());
145         outliers.addAll(un_literacy.keySet());
146         for (Iterator<String> it = outliers.iterator(); it.hasNext(); ) {
147             if (StandardCodes.isCountry(it.next())) {
148                 it.remove();
149             }
150         }
151         // outliers.remove("AN");
152         if (outliers.size() != 0) {
153             System.out.println("Mistakes: data for non-UN codes");
154             for (String country : outliers) {
155                 showCountryData(country);
156             }
157             throw new IllegalArgumentException("Mistakes: data for non-country codes");
158         }
159         Set<String> altNames = new TreeSet<>();
160         String oldCode = "";
161         for (String display : CountryCodeConverter.names()) {
162             String code = CountryCodeConverter.getCodeFromName(display, true, missing);
163             String icu = ULocale.getDisplayCountry("und-" + code, "en");
164             if (!display.equalsIgnoreCase(icu)) {
165                 altNames.add(code + "\t" + display + "\t" + icu);
166             }
167         }
168         oldCode = "";
169         if (SHOW_ALTERNATE_NAMES) {
170             for (String altName : altNames) {
171                 String[] pieces = altName.split("\t");
172                 String code = pieces[0];
173                 if (code.equals("ZZ")) continue;
174                 if (!code.equals(oldCode)) {
175                     oldCode = code;
176                     System.out.println();
177                 }
178                 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
179                 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" +
180                 // pieces[1] +
181                 // "</territory> <!-- " + pieces[2] + " -->");
182             }
183         }
184         if (!missing.isEmpty()) {
185             throw new RuntimeException(
186                     "Could not load codes for: "
187                             + ListFormat.getInstance(Locale.getDefault()).format(missing));
188         }
189     }
190 
showCountryData(String country)191     private static void showCountryData(String country) {
192         number.setMaximumFractionDigits(0);
193         System.out.println(
194                 country
195                         + "\t"
196                         + ULocale.getDisplayCountry("und-" + country, "en")
197                         + "\t"
198                         + number.format(getPopulation(country))
199                         + "\t"
200                         + number.format(getGdp(country))
201                         + "\t"
202                         + percent.format(getLiteracy(country) / 100));
203     }
204 
205     /**
206      * Gets the percent of people that can read in a particular country. Values are in the range 0
207      * to 100
208      */
getLiteracy(String country)209     public static Double getLiteracy(String country) {
210         return firstNonZero(
211                 factbook_literacy.getCount(country),
212                 un_literacy.getCount(country),
213                 CountryData.literacy.getCount(country));
214     }
215 
getGdp(String country)216     public static Double getGdp(String country) {
217         return firstNonZero(
218                 factbook_gdp.getCount(country),
219                 worldbank_gdp.getCount(country),
220                 CountryData.gdp.getCount(country));
221     }
222 
getPopulation(String country)223     public static Double getPopulation(String country) {
224         return firstNonZero(
225                 factbook_population.getCount(country),
226                 worldbank_population.getCount(country),
227                 CountryData.population.getCount(country));
228     }
229 
firstNonZero(Double... items)230     private static Double firstNonZero(Double... items) {
231         for (Double item : items) {
232             if (item.doubleValue() != 0) {
233                 return item;
234             }
235         }
236         return 0.0;
237     }
238 
splitCommaSeparated(String line)239     static String[] splitCommaSeparated(String line) {
240         // items are separated by ','
241         // each item is of the form abc...
242         // or "..." (required if a comma or quote is contained)
243         // " in a field is represented by ""
244         List<String> result = new ArrayList<>();
245         StringBuilder item = new StringBuilder();
246         boolean inQuote = false;
247         for (int i = 0; i < line.length(); ++i) {
248             char ch = line.charAt(i); // don't worry about supplementaries
249             switch (ch) {
250                 case '"':
251                     inQuote = !inQuote;
252                     // at start or end, that's enough
253                     // if get a quote when we are not in a quote, and not at start, then add it and
254                     // return to inQuote
255                     if (inQuote && item.length() != 0) {
256                         item.append('"');
257                         inQuote = true;
258                     }
259                     break;
260                 case ',':
261                     if (!inQuote) {
262                         result.add(item.toString());
263                         item.setLength(0);
264                     } else {
265                         item.append(ch);
266                     }
267                     break;
268                 default:
269                     item.append(ch);
270                     break;
271             }
272         }
273         result.add(item.toString());
274         return result.toArray(new String[result.size()]);
275     }
276 
loadFactbookInfo(String filename, final Counter2<String> factbookGdp)277     private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp)
278             throws IOException {
279         CldrUtility.handleFile(
280                 filename,
281                 new LineHandler() {
282                     @Override
283                     public boolean handle(String line) {
284                         String[] pieces = splitCommaSeparated(line);
285                         String countryName = FactbookLine.CountryName.get(pieces);
286                         if (countryName.equals("name")) {
287                             return false;
288                         }
289                         String code =
290                                 CountryCodeConverter.getCodeFromName(countryName, true, missing);
291                         if (code == null) {
292                             return false;
293                         }
294                         if (!StandardCodes.isCountry(code)) {
295                             if (ADD_POP) {
296                                 System.out.println("Skipping factbook info for: " + code);
297                             }
298                             return false;
299                         }
300                         code = code.toUpperCase(Locale.ENGLISH);
301                         String valueString = FactbookLine.Value.get(pieces).trim();
302                         if (valueString.startsWith("$")) {
303                             valueString = valueString.substring(1);
304                         }
305                         valueString = valueString.replace(",", "");
306                         double value = Double.parseDouble(valueString.trim());
307                         factbookGdp.add(code, value);
308                         if (ADD_POP) {
309                             System.out.println("Factbook gdp:\t" + code + "\t" + value);
310                         }
311                         return true;
312                     }
313                 });
314     }
315 
316     static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
317     static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
318     static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);
319 
320     static class MyLineHandler implements LineHandler {
321         CountryData countryData;
322 
MyLineHandler(CountryData countryData)323         public MyLineHandler(CountryData countryData) {
324             super();
325             this.countryData = countryData;
326         }
327 
328         @Override
handle(String line)329         public boolean handle(String line) throws ParseException {
330             if (line.startsWith("#")) return true;
331             if (line.length() == 0) {
332                 return true;
333             }
334             String[] pieces = line.split(";");
335             final String code = pieces[0].trim();
336             if (code.equals("Code")) {
337                 return false;
338             }
339             // Code;Name;Type;Data;Source
340             final String typeString = pieces[2].trim();
341             final String data = pieces[3].trim();
342             if (typeString.equals("gdp-ppp")) {
343                 if (StandardCodes.isCountry(data)) {
344                     Double otherPop = getPopulation(data);
345                     Double otherGdp = getGdp(data);
346                     Double myPop = getPopulation(code);
347                     if (myPop.doubleValue() == 0
348                             || otherPop.doubleValue() == 0
349                             || otherGdp.doubleValue() == 0) {
350                         otherPop = getPopulation(data);
351                         otherGdp = getPopulation(data);
352                         myPop = getPopulation(code);
353                         throw new IllegalArgumentException("Zero population");
354                     }
355                     CountryData.gdp.add(code, otherGdp * myPop / otherPop);
356                 } else {
357                     CountryData.gdp.add(code, dollars.parse(data).doubleValue());
358                 }
359             } else if (typeString.equals("population")) {
360                 if (StandardCodes.isCountry(data)) {
361                     throw new IllegalArgumentException("Population can't use other country's");
362                 }
363                 CountryData.population.add(code, number.parse(data).doubleValue());
364             } else if (typeString.equals("literacy")) {
365                 if (StandardCodes.isCountry(data)) {
366                     Double otherPop = getLiteracy(data);
367                     CountryData.literacy.add(code, otherPop);
368                 } else {
369                     CountryData.literacy.add(code, number.parse(data).doubleValue());
370                 }
371             } else {
372                 throw new IllegalArgumentException("Illegal type");
373             }
374             return true;
375         }
376     }
377 
378     static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();
379 
loadFactbookLiteracy()380     private static void loadFactbookLiteracy() throws IOException {
381         final String filename = "external/factbook_literacy.txt";
382         CldrUtility.handleFile(
383                 filename,
384                 new LineHandler() {
385                     @Override
386                     public boolean handle(String line) {
387                         String[] pieces = line.split("\\t");
388                         String code =
389                                 CountryCodeConverter.getCodeFromName(
390                                         FBLiteracy.Country.get(pieces), true, missing);
391                         if (code == null) {
392                             return false;
393                         }
394                         if (!StandardCodes.isCountry(code)) {
395                             if (ADD_POP) {
396                                 System.out.println("Skipping factbook literacy for: " + code);
397                             }
398                             return false;
399                         }
400                         code = code.toUpperCase(Locale.ENGLISH);
401                         String valueString =
402                                 FBLiteracy.Percent.get(pieces)
403                                         .trim(); // Values are in the range 0 to 100
404                         double percent = Double.parseDouble(valueString);
405                         factbook_literacy.put(code, percent);
406                         if (ADD_POP) {
407                             System.out.println("Factbook literacy:\t" + code + "\t" + percent);
408                         }
409                         code = null;
410                         return true;
411                     }
412                 });
413     }
414 
loadWorldBankInfo()415     private static void loadWorldBankInfo() throws IOException {
416         final String filename = "external/world_bank_data.csv";
417 
418         // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));
419 
420         CldrUtility.handleFile(
421                 filename,
422                 new LineHandler() {
423                     ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;
424 
425                     @Override
426                     public boolean handle(String line) {
427                         String[] pieces = splitCommaSeparated(line);
428                         if (columnToTypeAndValue == null) {
429                             columnToTypeAndValue = WBLine.parseHeader(pieces);
430                             return false;
431                         }
432 
433                         final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>();
434                         for (int i = 0; i < pieces.length; i++) {
435                             lineAsHash.put(columnToTypeAndValue.get(i), pieces[i]);
436                         }
437                         // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
438                         final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0));
439 
440                         // find the last year
441                         String last = null;
442 
443                         for (int n = 0; n < columnToTypeAndValue.size(); n++) {
444                             // assume the years are in ascending order
445                             Pair<WBLine, Integer> i = columnToTypeAndValue.get(n);
446                             if (i.getFirst() == WBLine.Year) {
447                                 String current = pieces[n];
448                                 if (current.length() != 0 && !current.equals(EMPTY)) {
449                                     last = current;
450                                 }
451                             }
452                         }
453                         if (last == null) {
454                             return false;
455                         }
456                         final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0));
457                         String country =
458                                 CountryCodeConverter.getCodeFromName(countryName, true, missing);
459                         if (country == null) {
460                             return false;
461                         }
462                         if (!StandardCodes.isCountry(country)) {
463                             if (ADD_POP) {
464                                 System.out.println("Skipping worldbank info for: " + country);
465                             }
466                             return false;
467                         }
468                         double value;
469                         try {
470                             value = Double.parseDouble(last);
471                         } catch (NumberFormatException e) {
472                             throw new IllegalArgumentException(
473                                     "File changed format: need to modify code");
474                         }
475                         if (seriesCode.equals(GCP)) {
476                             worldbank_gdp.add(country, value);
477                         } else if (seriesCode.equals(POP)) {
478                             worldbank_population.add(country, value);
479                         } else {
480                             throw new IllegalArgumentException();
481                         }
482                         return true;
483                     }
484                 });
485     }
486 
loadUnLiteracy()487     static void loadUnLiteracy() throws IOException {
488         for (final Pair<String, Double> p : getUnLiteracy(null)) {
489             un_literacy.add(p.getFirst(), p.getSecond());
490         }
491     }
492 
493     /**
494      * @param hadErr on return, true if there were errs
495      * @return list of code,percent values
496      * @throws IOException
497      */
getUnLiteracy(Output<Boolean> hadErr)498     static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws IOException {
499         List<Pair<String, Double>> result = new LinkedList<>();
500         UnLiteracyParser ulp;
501         try {
502             ulp = new UnLiteracyParser().read();
503         } catch (Throwable t) {
504             throw new IOException("Could not read UN data " + UnLiteracyParser.UN_LITERACY, t);
505         }
506 
507         for (final Map.Entry<String, UnLiteracyParser.PerCountry> e : ulp.perCountry.entrySet()) {
508             final String country = e.getKey();
509             final String latest = e.getValue().latest();
510             final UnLiteracyParser.PerYear py = e.getValue().perYear.get(latest);
511 
512             Long literate = py.total(UnLiteracyParser.LITERATE);
513             Long illiterate = py.total(UnLiteracyParser.ILLITERATE);
514 
515             String code = CountryCodeConverter.getCodeFromName(country, true, missing);
516             if (code == null) {
517                 if (hadErr != null) {
518                     hadErr.value = true;
519                 }
520                 continue;
521             }
522             if (!StandardCodes.isCountry(code)) {
523                 if (ADD_POP) {
524                     System.out.println("Skipping UN info for: " + code);
525                 }
526                 continue;
527             }
528             double total = literate + illiterate;
529             double percent =
530                     ((double) literate)
531                             * 100
532                             / total; // Multiply by 100 to put values in range 0 to 100
533             result.add(Pair.of(code, percent));
534         }
535         if (result.isEmpty()) {
536             hadErr.value = true;
537         }
538         return result;
539     }
540 
541     static {
542         try {
loadFactbookLiteracy()543             loadFactbookLiteracy();
loadUnLiteracy()544             loadUnLiteracy();
545 
546             loadFactbookInfo("external/factbook_gdp_ppp.csv", factbook_gdp);
547             loadFactbookInfo("external/factbook_population.csv", factbook_population);
548             CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));
549 
loadWorldBankInfo()550             loadWorldBankInfo();
551             StandardCodes sc = StandardCodes.make();
552             StringBuilder myErrors = new StringBuilder();
553             for (String territory : sc.getGoodAvailableCodes("territory")) {
554                 if (!StandardCodes.isCountry(territory)) {
555                     continue;
556                 }
557                 double gdp = getGdp(territory);
558                 double literacy = getLiteracy(territory);
559                 double population = getPopulation(territory);
560                 if (population == 0) {
561                     // AX;Aland Islands;population;26,200;www.aland.ax
562                     myErrors.append(
563                             "\n"
564                                     + territory
565                                     + ";"
566                                     + sc.getData("territory", territory)
567                                     + ";population;0;reason");
568                 }
569                 if (gdp == 0) {
570                     myErrors.append(
571                             "\n"
572                                     + territory
573                                     + ";"
574                                     + sc.getData("territory", territory)
575                                     + ";gdp-ppp;0;reason");
576                 }
577                 if (literacy == 0) {
578                     myErrors.append(
579                             "\n"
580                                     + territory
581                                     + ";"
582                                     + sc.getData("territory", territory)
583                                     + ";literacy;0;reason");
584                 }
585             }
586             if (myErrors.length() != 0) {
587                 throw new IllegalArgumentException(
588                         "Missing Country values, the following and add to external/other_country_data to fix, changing the 0 to the real value:"
589                                 + myErrors);
590             }
591         } catch (IOException e) {
592         }
593     }
594 }
595