1 package org.unicode.cldr.tool; 2 3 import com.ibm.icu.text.ListFormat; 4 import com.ibm.icu.text.NumberFormat; 5 import com.ibm.icu.text.UnicodeSet; 6 import com.ibm.icu.util.Output; 7 import com.ibm.icu.util.ULocale; 8 import java.io.IOException; 9 import java.text.ParseException; 10 import java.util.ArrayList; 11 import java.util.HashMap; 12 import java.util.Iterator; 13 import java.util.LinkedList; 14 import java.util.List; 15 import java.util.Locale; 16 import java.util.Map; 17 import java.util.Set; 18 import java.util.TreeSet; 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 import org.unicode.cldr.util.CldrUtility; 22 import org.unicode.cldr.util.CldrUtility.LineHandler; 23 import org.unicode.cldr.util.Counter2; 24 import org.unicode.cldr.util.Pair; 25 import org.unicode.cldr.util.StandardCodes; 26 27 public class AddPopulationData { 28 static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false); 29 static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false); 30 31 enum WBLine { 32 // "Afghanistan","AFG","GNI, PPP (current international 33 // $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..", 34 35 // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002 36 // [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 37 // [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 38 // [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 39 // [YR2020] 40 Country_Name, 41 Country_Code, 42 Series_Name, 43 Series_Code, 44 Year("(\\d+)\\s*\\[YR(\\d+)\\]"); 45 46 final Pattern pattern; 47 WBLine()48 WBLine() { 49 this.pattern = Pattern.compile(name().replaceAll("_", " ")); 50 } 51 WBLine(final String regex)52 WBLine(final String regex) { 53 this.pattern = Pattern.compile(regex); 54 } 55 match(String str)56 Matcher match(String str) { 57 // Skip BOM 58 if (str.startsWith("\uFEFF")) { 59 str = str.substring("\uFEFF".length()); 60 } 61 return this.pattern.matcher(str); 62 } 63 find(final String str)64 static Pair<WBLine, Integer> find(final String str) { 65 for (WBLine i : values()) { 66 final Matcher m = i.match(str); 67 if (m.matches()) { 68 Integer val = 0; 69 if (m.groupCount() > 0) { 70 val = Integer.parseInt(m.group(1)); 71 } 72 return Pair.of(i, val); 73 } 74 } 75 return null; 76 } 77 parseHeader(final String[] pieces)78 static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) { 79 ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null; 80 columnToTypeAndValue = new ArrayList<>(); 81 for (int i = 0; i < pieces.length; i++) { 82 columnToTypeAndValue.add(i, WBLine.find(pieces[i])); 83 } 84 return columnToTypeAndValue; 85 } 86 } 87 88 enum FactbookLine { 89 CountryName, 90 CountrySlug, 91 Value, 92 DateOfInformation, 93 Ranking, 94 Region; 95 get(String[] pieces)96 String get(String[] pieces) { 97 return pieces[ordinal()]; 98 } 99 } 100 101 enum FBLiteracy { 102 Rank, 103 Country, 104 Percent; 105 get(String[] pieces)106 String get(String[] pieces) { 107 return pieces[ordinal()]; 108 } 109 } 110 111 private static final String GCP = "NY.GNP.MKTP.PP.CD"; 112 private static final String POP = "SP.POP.TOTL"; 113 private static final String EMPTY = ".."; 114 private static Counter2<String> worldbank_gdp = new Counter2<>(); 115 private static Counter2<String> worldbank_population = new Counter2<>(); 116 private static Counter2<String> un_literacy = new Counter2<>(); 117 118 private static Counter2<String> factbook_gdp = new Counter2<>(); 119 private static Counter2<String> factbook_population = new Counter2<>(); 120 private static Counter2<String> factbook_literacy = new Counter2<>(); 121 122 private static CountryData other = new CountryData(); 123 124 static class CountryData { 125 private static Counter2<String> population = new Counter2<>(); 126 private static Counter2<String> gdp = new Counter2<>(); 127 private static Counter2<String> literacy = new Counter2<>(); 128 } 129 130 static final Set<String> missing = new TreeSet<String>(); 131 main(String[] args)132 public static void main(String[] args) throws IOException { 133 134 System.out.println( 135 "Code" + "\t" + "Name" + "\t" + "Pop" + "\t" + "GDP-PPP" + "\t" + "UN Literacy"); 136 137 for (String country : StandardCodes.make().getGoodCountries()) { 138 showCountryData(country); 139 } 140 Set<String> outliers = new TreeSet<>(); 141 outliers.addAll(factbook_population.keySet()); 142 outliers.addAll(worldbank_population.keySet()); 143 outliers.addAll(factbook_gdp.keySet()); 144 outliers.addAll(worldbank_gdp.keySet()); 145 outliers.addAll(un_literacy.keySet()); 146 for (Iterator<String> it = outliers.iterator(); it.hasNext(); ) { 147 if (StandardCodes.isCountry(it.next())) { 148 it.remove(); 149 } 150 } 151 // outliers.remove("AN"); 152 if (outliers.size() != 0) { 153 System.out.println("Mistakes: data for non-UN codes"); 154 for (String country : outliers) { 155 showCountryData(country); 156 } 157 throw new IllegalArgumentException("Mistakes: data for non-country codes"); 158 } 159 Set<String> altNames = new TreeSet<>(); 160 String oldCode = ""; 161 for (String display : CountryCodeConverter.names()) { 162 String code = CountryCodeConverter.getCodeFromName(display, true, missing); 163 String icu = ULocale.getDisplayCountry("und-" + code, "en"); 164 if (!display.equalsIgnoreCase(icu)) { 165 altNames.add(code + "\t" + display + "\t" + icu); 166 } 167 } 168 oldCode = ""; 169 if (SHOW_ALTERNATE_NAMES) { 170 for (String altName : altNames) { 171 String[] pieces = altName.split("\t"); 172 String code = pieces[0]; 173 if (code.equals("ZZ")) continue; 174 if (!code.equals(oldCode)) { 175 oldCode = code; 176 System.out.println(); 177 } 178 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]); 179 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + 180 // pieces[1] + 181 // "</territory> <!-- " + pieces[2] + " -->"); 182 } 183 } 184 if (!missing.isEmpty()) { 185 throw new RuntimeException( 186 "Could not load codes for: " 187 + ListFormat.getInstance(Locale.getDefault()).format(missing)); 188 } 189 } 190 showCountryData(String country)191 private static void showCountryData(String country) { 192 number.setMaximumFractionDigits(0); 193 System.out.println( 194 country 195 + "\t" 196 + ULocale.getDisplayCountry("und-" + country, "en") 197 + "\t" 198 + number.format(getPopulation(country)) 199 + "\t" 200 + number.format(getGdp(country)) 201 + "\t" 202 + percent.format(getLiteracy(country) / 100)); 203 } 204 205 /** 206 * Gets the percent of people that can read in a particular country. Values are in the range 0 207 * to 100 208 */ getLiteracy(String country)209 public static Double getLiteracy(String country) { 210 return firstNonZero( 211 factbook_literacy.getCount(country), 212 un_literacy.getCount(country), 213 CountryData.literacy.getCount(country)); 214 } 215 getGdp(String country)216 public static Double getGdp(String country) { 217 return firstNonZero( 218 factbook_gdp.getCount(country), 219 worldbank_gdp.getCount(country), 220 CountryData.gdp.getCount(country)); 221 } 222 getPopulation(String country)223 public static Double getPopulation(String country) { 224 return firstNonZero( 225 factbook_population.getCount(country), 226 worldbank_population.getCount(country), 227 CountryData.population.getCount(country)); 228 } 229 firstNonZero(Double... items)230 private static Double firstNonZero(Double... items) { 231 for (Double item : items) { 232 if (item.doubleValue() != 0) { 233 return item; 234 } 235 } 236 return 0.0; 237 } 238 splitCommaSeparated(String line)239 static String[] splitCommaSeparated(String line) { 240 // items are separated by ',' 241 // each item is of the form abc... 242 // or "..." (required if a comma or quote is contained) 243 // " in a field is represented by "" 244 List<String> result = new ArrayList<>(); 245 StringBuilder item = new StringBuilder(); 246 boolean inQuote = false; 247 for (int i = 0; i < line.length(); ++i) { 248 char ch = line.charAt(i); // don't worry about supplementaries 249 switch (ch) { 250 case '"': 251 inQuote = !inQuote; 252 // at start or end, that's enough 253 // if get a quote when we are not in a quote, and not at start, then add it and 254 // return to inQuote 255 if (inQuote && item.length() != 0) { 256 item.append('"'); 257 inQuote = true; 258 } 259 break; 260 case ',': 261 if (!inQuote) { 262 result.add(item.toString()); 263 item.setLength(0); 264 } else { 265 item.append(ch); 266 } 267 break; 268 default: 269 item.append(ch); 270 break; 271 } 272 } 273 result.add(item.toString()); 274 return result.toArray(new String[result.size()]); 275 } 276 loadFactbookInfo(String filename, final Counter2<String> factbookGdp)277 private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) 278 throws IOException { 279 CldrUtility.handleFile( 280 filename, 281 new LineHandler() { 282 @Override 283 public boolean handle(String line) { 284 String[] pieces = splitCommaSeparated(line); 285 String countryName = FactbookLine.CountryName.get(pieces); 286 if (countryName.equals("name")) { 287 return false; 288 } 289 String code = 290 CountryCodeConverter.getCodeFromName(countryName, true, missing); 291 if (code == null) { 292 return false; 293 } 294 if (!StandardCodes.isCountry(code)) { 295 if (ADD_POP) { 296 System.out.println("Skipping factbook info for: " + code); 297 } 298 return false; 299 } 300 code = code.toUpperCase(Locale.ENGLISH); 301 String valueString = FactbookLine.Value.get(pieces).trim(); 302 if (valueString.startsWith("$")) { 303 valueString = valueString.substring(1); 304 } 305 valueString = valueString.replace(",", ""); 306 double value = Double.parseDouble(valueString.trim()); 307 factbookGdp.add(code, value); 308 if (ADD_POP) { 309 System.out.println("Factbook gdp:\t" + code + "\t" + value); 310 } 311 return true; 312 } 313 }); 314 } 315 316 static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US); 317 static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US); 318 static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US); 319 320 static class MyLineHandler implements LineHandler { 321 CountryData countryData; 322 MyLineHandler(CountryData countryData)323 public MyLineHandler(CountryData countryData) { 324 super(); 325 this.countryData = countryData; 326 } 327 328 @Override handle(String line)329 public boolean handle(String line) throws ParseException { 330 if (line.startsWith("#")) return true; 331 if (line.length() == 0) { 332 return true; 333 } 334 String[] pieces = line.split(";"); 335 final String code = pieces[0].trim(); 336 if (code.equals("Code")) { 337 return false; 338 } 339 // Code;Name;Type;Data;Source 340 final String typeString = pieces[2].trim(); 341 final String data = pieces[3].trim(); 342 if (typeString.equals("gdp-ppp")) { 343 if (StandardCodes.isCountry(data)) { 344 Double otherPop = getPopulation(data); 345 Double otherGdp = getGdp(data); 346 Double myPop = getPopulation(code); 347 if (myPop.doubleValue() == 0 348 || otherPop.doubleValue() == 0 349 || otherGdp.doubleValue() == 0) { 350 otherPop = getPopulation(data); 351 otherGdp = getPopulation(data); 352 myPop = getPopulation(code); 353 throw new IllegalArgumentException("Zero population"); 354 } 355 CountryData.gdp.add(code, otherGdp * myPop / otherPop); 356 } else { 357 CountryData.gdp.add(code, dollars.parse(data).doubleValue()); 358 } 359 } else if (typeString.equals("population")) { 360 if (StandardCodes.isCountry(data)) { 361 throw new IllegalArgumentException("Population can't use other country's"); 362 } 363 CountryData.population.add(code, number.parse(data).doubleValue()); 364 } else if (typeString.equals("literacy")) { 365 if (StandardCodes.isCountry(data)) { 366 Double otherPop = getLiteracy(data); 367 CountryData.literacy.add(code, otherPop); 368 } else { 369 CountryData.literacy.add(code, number.parse(data).doubleValue()); 370 } 371 } else { 372 throw new IllegalArgumentException("Illegal type"); 373 } 374 return true; 375 } 376 } 377 378 static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze(); 379 loadFactbookLiteracy()380 private static void loadFactbookLiteracy() throws IOException { 381 final String filename = "external/factbook_literacy.txt"; 382 CldrUtility.handleFile( 383 filename, 384 new LineHandler() { 385 @Override 386 public boolean handle(String line) { 387 String[] pieces = line.split("\\t"); 388 String code = 389 CountryCodeConverter.getCodeFromName( 390 FBLiteracy.Country.get(pieces), true, missing); 391 if (code == null) { 392 return false; 393 } 394 if (!StandardCodes.isCountry(code)) { 395 if (ADD_POP) { 396 System.out.println("Skipping factbook literacy for: " + code); 397 } 398 return false; 399 } 400 code = code.toUpperCase(Locale.ENGLISH); 401 String valueString = 402 FBLiteracy.Percent.get(pieces) 403 .trim(); // Values are in the range 0 to 100 404 double percent = Double.parseDouble(valueString); 405 factbook_literacy.put(code, percent); 406 if (ADD_POP) { 407 System.out.println("Factbook literacy:\t" + code + "\t" + percent); 408 } 409 code = null; 410 return true; 411 } 412 }); 413 } 414 loadWorldBankInfo()415 private static void loadWorldBankInfo() throws IOException { 416 final String filename = "external/world_bank_data.csv"; 417 418 // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename)); 419 420 CldrUtility.handleFile( 421 filename, 422 new LineHandler() { 423 ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null; 424 425 @Override 426 public boolean handle(String line) { 427 String[] pieces = splitCommaSeparated(line); 428 if (columnToTypeAndValue == null) { 429 columnToTypeAndValue = WBLine.parseHeader(pieces); 430 return false; 431 } 432 433 final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>(); 434 for (int i = 0; i < pieces.length; i++) { 435 lineAsHash.put(columnToTypeAndValue.get(i), pieces[i]); 436 } 437 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\""); 438 final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0)); 439 440 // find the last year 441 String last = null; 442 443 for (int n = 0; n < columnToTypeAndValue.size(); n++) { 444 // assume the years are in ascending order 445 Pair<WBLine, Integer> i = columnToTypeAndValue.get(n); 446 if (i.getFirst() == WBLine.Year) { 447 String current = pieces[n]; 448 if (current.length() != 0 && !current.equals(EMPTY)) { 449 last = current; 450 } 451 } 452 } 453 if (last == null) { 454 return false; 455 } 456 final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0)); 457 String country = 458 CountryCodeConverter.getCodeFromName(countryName, true, missing); 459 if (country == null) { 460 return false; 461 } 462 if (!StandardCodes.isCountry(country)) { 463 if (ADD_POP) { 464 System.out.println("Skipping worldbank info for: " + country); 465 } 466 return false; 467 } 468 double value; 469 try { 470 value = Double.parseDouble(last); 471 } catch (NumberFormatException e) { 472 throw new IllegalArgumentException( 473 "File changed format: need to modify code"); 474 } 475 if (seriesCode.equals(GCP)) { 476 worldbank_gdp.add(country, value); 477 } else if (seriesCode.equals(POP)) { 478 worldbank_population.add(country, value); 479 } else { 480 throw new IllegalArgumentException(); 481 } 482 return true; 483 } 484 }); 485 } 486 loadUnLiteracy()487 static void loadUnLiteracy() throws IOException { 488 for (final Pair<String, Double> p : getUnLiteracy(null)) { 489 un_literacy.add(p.getFirst(), p.getSecond()); 490 } 491 } 492 493 /** 494 * @param hadErr on return, true if there were errs 495 * @return list of code,percent values 496 * @throws IOException 497 */ getUnLiteracy(Output<Boolean> hadErr)498 static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws IOException { 499 List<Pair<String, Double>> result = new LinkedList<>(); 500 UnLiteracyParser ulp; 501 try { 502 ulp = new UnLiteracyParser().read(); 503 } catch (Throwable t) { 504 throw new IOException("Could not read UN data " + UnLiteracyParser.UN_LITERACY, t); 505 } 506 507 for (final Map.Entry<String, UnLiteracyParser.PerCountry> e : ulp.perCountry.entrySet()) { 508 final String country = e.getKey(); 509 final String latest = e.getValue().latest(); 510 final UnLiteracyParser.PerYear py = e.getValue().perYear.get(latest); 511 512 Long literate = py.total(UnLiteracyParser.LITERATE); 513 Long illiterate = py.total(UnLiteracyParser.ILLITERATE); 514 515 String code = CountryCodeConverter.getCodeFromName(country, true, missing); 516 if (code == null) { 517 if (hadErr != null) { 518 hadErr.value = true; 519 } 520 continue; 521 } 522 if (!StandardCodes.isCountry(code)) { 523 if (ADD_POP) { 524 System.out.println("Skipping UN info for: " + code); 525 } 526 continue; 527 } 528 double total = literate + illiterate; 529 double percent = 530 ((double) literate) 531 * 100 532 / total; // Multiply by 100 to put values in range 0 to 100 533 result.add(Pair.of(code, percent)); 534 } 535 if (result.isEmpty()) { 536 hadErr.value = true; 537 } 538 return result; 539 } 540 541 static { 542 try { loadFactbookLiteracy()543 loadFactbookLiteracy(); loadUnLiteracy()544 loadUnLiteracy(); 545 546 loadFactbookInfo("external/factbook_gdp_ppp.csv", factbook_gdp); 547 loadFactbookInfo("external/factbook_population.csv", factbook_population); 548 CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other)); 549 loadWorldBankInfo()550 loadWorldBankInfo(); 551 StandardCodes sc = StandardCodes.make(); 552 StringBuilder myErrors = new StringBuilder(); 553 for (String territory : sc.getGoodAvailableCodes("territory")) { 554 if (!StandardCodes.isCountry(territory)) { 555 continue; 556 } 557 double gdp = getGdp(territory); 558 double literacy = getLiteracy(territory); 559 double population = getPopulation(territory); 560 if (population == 0) { 561 // AX;Aland Islands;population;26,200;www.aland.ax 562 myErrors.append( 563 "\n" 564 + territory 565 + ";" 566 + sc.getData("territory", territory) 567 + ";population;0;reason"); 568 } 569 if (gdp == 0) { 570 myErrors.append( 571 "\n" 572 + territory 573 + ";" 574 + sc.getData("territory", territory) 575 + ";gdp-ppp;0;reason"); 576 } 577 if (literacy == 0) { 578 myErrors.append( 579 "\n" 580 + territory 581 + ";" 582 + sc.getData("territory", territory) 583 + ";literacy;0;reason"); 584 } 585 } 586 if (myErrors.length() != 0) { 587 throw new IllegalArgumentException( 588 "Missing Country values, the following and add to external/other_country_data to fix, changing the 0 to the real value:" 589 + myErrors); 590 } 591 } catch (IOException e) { 592 } 593 } 594 } 595