package org.unicode.cldr.tool;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CldrUtility.LineHandler;
import org.unicode.cldr.util.Counter2;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.StandardCodes;

import com.ibm.icu.text.ListFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * Loads per-country population, GDP (PPP) and literacy figures from several external data files
 * (factbook, World Bank, UN, plus a hand-maintained "other" file) and exposes them through
 * {@link #getPopulation(String)}, {@link #getGdp(String)} and {@link #getLiteracy(String)}.
 *
 * <p>All data is loaded once, in the static initializer at the bottom of this class. The
 * initializer fails if any country is left without a value, or if a data file cannot be read.
 */
public class AddPopulationData {
    static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
    static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);

    /**
     * Column types of the World Bank CSV file. Each constant carries a pattern that is matched
     * against a header cell to decide which kind of column it is.
     */
    enum WBLine {
        // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..",

        // Country Name,Country Code,Series Name,Series Code,2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
        Country_Name, Country_Code, Series_Name, Series_Code,
        Year("(\\d+)\\s*\\[YR(\\d+)\\]");

        final Pattern pattern;

        /** Uses the constant's own name, with underscores turned into spaces, as the pattern. */
        WBLine() {
            this.pattern = Pattern.compile(name().replaceAll("_", " "));
        }

        /** Uses an explicit regular expression as the pattern. */
        WBLine(final String regex) {
            this.pattern = Pattern.compile(regex);
        }

        /**
         * Returns a matcher of this column's pattern over {@code str}, after dropping a leading
         * byte-order mark (U+FEFF) if there is one.
         */
        Matcher match(String str) {
            // Skip BOM
            if (str.startsWith("\uFEFF")) {
                str = str.substring("\uFEFF".length());
            }
            return this.pattern.matcher(str);
        }

        /**
         * Classifies one header cell.
         *
         * @param str the header cell text
         * @return the matching column type paired with the integer parsed from the pattern's
         *         first capture group (0 when the pattern has no groups), or {@code null} when
         *         no column type matches the whole cell
         */
        static Pair<WBLine, Integer> find(final String str) {
            for (WBLine type : values()) {
                final Matcher m = type.match(str);
                if (m.matches()) {
                    Integer val = 0;
                    if (m.groupCount() > 0) {
                        val = Integer.parseInt(m.group(1));
                    }
                    return Pair.of(type, val);
                }
            }
            return null;
        }

        /**
         * Classifies every cell of a header row.
         *
         * @param pieces the header cells, in column order
         * @return a list parallel to {@code pieces}; entries are {@code null} for header cells
         *         that {@link #find(String)} does not recognize
         */
        static ArrayList<Pair<WBLine, Integer>> parseHeader(final String[] pieces) {
            final ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = new ArrayList<>();
            for (int i = 0; i < pieces.length; i++) {
                columnToTypeAndValue.add(i, WBLine.find(pieces[i]));
            }
            return columnToTypeAndValue;
        }
    }

    /** Columns of the factbook GDP/population files; the ordinal is the column index. */
    enum FBLine {
        Rank, Country, Value, Year;

        String get(String[] pieces) {
            return pieces[ordinal()];
        }
    }

    /** Columns of the factbook literacy file; the ordinal is the column index. */
    enum FBLiteracy {
        Rank, Country, Percent;

        String get(String[] pieces) {
            return pieces[ordinal()];
        }
    }

    private static final String GCP = "NY.GNP.MKTP.PP.CD";
    private static final String POP = "SP.POP.TOTL";
    /** Placeholder the World Bank file uses for a cell without a value. */
    private static final String EMPTY = "..";
    private static Counter2<String> worldbank_gdp = new Counter2<>();
    private static Counter2<String> worldbank_population = new Counter2<>();
    private static Counter2<String> un_literacy = new Counter2<>();

    private static Counter2<String> factbook_gdp = new Counter2<>();
    private static Counter2<String> factbook_population = new Counter2<>();
    private static Counter2<String> factbook_literacy = new Counter2<>();

    private static CountryData other = new CountryData();

    /**
     * Holder for the values read from external/other_country_data.txt. Note that the counters
     * are static, so every instance shares the same data.
     */
    static class CountryData {
        private static Counter2<String> population = new Counter2<>();
        private static Counter2<String> gdp = new Counter2<>();
        private static Counter2<String> literacy = new Counter2<>();
    }

    /** Passed to every {@code CountryCodeConverter.getCodeFromName} call; reported by {@link #main}. */
    final static Set<String> missing = new TreeSet<String>();

    /**
     * Prints a tab-separated table of the loaded data for every good country code, then fails if
     * data was loaded for codes that are not countries or if {@link #missing} is non-empty.
     *
     * @param args unused
     * @throws IOException declared for callers; the loading itself happens in the static initializer
     */
    public static void main(String[] args) throws IOException {

        System.out.println("Code"
            + "\t" + "Name"
            + "\t" + "Pop"
            + "\t" + "GDP-PPP"
            + "\t" + "UN Literacy");

        for (String country : StandardCodes.make().getGoodCountries()) {
            showCountryData(country);
        }
        Set<String> outliers = new TreeSet<>();
        outliers.addAll(factbook_population.keySet());
        outliers.addAll(worldbank_population.keySet());
        outliers.addAll(factbook_gdp.keySet());
        outliers.addAll(worldbank_gdp.keySet());
        outliers.addAll(un_literacy.keySet());
        // Keep only the keys that are NOT countries.
        for (Iterator<String> it = outliers.iterator(); it.hasNext();) {
            if (StandardCodes.isCountry(it.next())) {
                it.remove();
            }
        }
        // outliers.remove("AN");
        if (!outliers.isEmpty()) {
            System.out.println("Mistakes: data for non-UN codes");
            for (String country : outliers) {
                showCountryData(country);
            }
            throw new IllegalArgumentException("Mistakes: data for non-country codes");
        }
        Set<String> altNames = new TreeSet<>();
        for (String display : CountryCodeConverter.names()) {
            String code = CountryCodeConverter.getCodeFromName(display, true, missing);
            String icu = ULocale.getDisplayCountry("und-" + code, "en");
            if (!display.equalsIgnoreCase(icu)) {
                altNames.add(code + "\t" + display + "\t" + icu);
            }
        }
        if (SHOW_ALTERNATE_NAMES) {
            String oldCode = "";
            for (String altName : altNames) {
                String[] pieces = altName.split("\t");
                String code = pieces[0];
                if (code.equals("ZZ")) {
                    continue;
                }
                if (!code.equals(oldCode)) {
                    // blank line between groups of names for different codes
                    oldCode = code;
                    System.out.println();
                }
                System.out.println(code + "; " + pieces[2] + "; " + pieces[1]);
                // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] +
                // "</territory> <!-- " + pieces[2] + " -->");
            }
        }
        if (!missing.isEmpty()) {
            throw new RuntimeException("Could not load codes for: " +
                ListFormat.getInstance(Locale.getDefault()).format(missing));
        }
    }

    /** Prints one tab-separated row (code, English name, population, GDP, literacy) to stdout. */
    private static void showCountryData(String country) {
        number.setMaximumFractionDigits(0);
        System.out.println(country
            + "\t" + ULocale.getDisplayCountry("und-" + country, "en")
            + "\t" + number.format(getPopulation(country))
            + "\t" + number.format(getGdp(country))
            + "\t" + percent.format(getLiteracy(country) / 100));
    }

    /**
     * Returns the literacy value for a country code: the first nonzero value among the factbook,
     * UN and "other" data, in that order, or 0.0 if all are zero.
     */
    public static Double getLiteracy(String country) {
        return firstNonZero(factbook_literacy.getCount(country),
            un_literacy.getCount(country),
            CountryData.literacy.getCount(country));
    }

    /**
     * Returns the GDP value for a country code: the first nonzero value among the factbook,
     * World Bank and "other" data, in that order, or 0.0 if all are zero.
     */
    public static Double getGdp(String country) {
        return firstNonZero(factbook_gdp.getCount(country),
            worldbank_gdp.getCount(country),
            CountryData.gdp.getCount(country));
    }

    /**
     * Returns the population for a country code: the first nonzero value among the factbook,
     * World Bank and "other" data, in that order, or 0.0 if all are zero.
     */
    public static Double getPopulation(String country) {
        return firstNonZero(factbook_population.getCount(country),
            worldbank_population.getCount(country),
            CountryData.population.getCount(country));
    }

    /** Returns the first item whose value is not 0, or 0.0 when there is none. */
    private static Double firstNonZero(Double... items) {
        for (Double item : items) {
            if (item.doubleValue() != 0) {
                return item;
            }
        }
        return 0.0;
    }

    /**
     * Splits one CSV line into its fields.
     *
     * @param line a single line of comma-separated text
     * @return the fields with surrounding quotes removed; always at least one element
     */
    static String[] splitCommaSeparated(String line) {
        // items are separated by ','
        // each item is of the form abc...
        // or "..." (required if a comma or quote is contained)
        // " in a field is represented by ""
        List<String> result = new ArrayList<>();
        StringBuilder item = new StringBuilder();
        boolean inQuote = false;
        for (int i = 0; i < line.length(); ++i) {
            char ch = line.charAt(i); // don't worry about supplementaries
            switch (ch) {
            case '"':
                inQuote = !inQuote;
                // at start or end, that's enough
                // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote
                if (inQuote && item.length() != 0) {
                    item.append('"');
                    inQuote = true;
                }
                break;
            case ',':
                if (!inQuote) {
                    result.add(item.toString());
                    item.setLength(0);
                } else {
                    item.append(ch);
                }
                break;
            default:
                item.append(ch);
                break;
            }
        }
        result.add(item.toString());
        return result.toArray(new String[result.size()]);
    }

    /**
     * Reads a factbook file whose columns are separated by runs of two or more whitespace
     * characters and adds each country's value to the given counter.
     *
     * @param filename the data file, passed to {@code CldrUtility.handleFile}
     * @param factbookGdp the counter to fill (despite the name it is also used for population)
     * @throws IOException if the file cannot be read
     */
    private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException {
        CldrUtility.handleFile(filename, new LineHandler() {
            @Override
            public boolean handle(String line) {
                if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank")
                    || line.startsWith(" This file")) {
                    return false;
                }
                String[] pieces = line.split("\\s{2,}");
                String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true, missing);
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping factbook info for: " + code);
                    }
                    return false;
                }
                code = code.toUpperCase(Locale.ENGLISH);
                // Strip a leading currency sign and the grouping commas before parsing.
                String valueString = FBLine.Value.get(pieces).trim();
                if (valueString.startsWith("$")) {
                    valueString = valueString.substring(1);
                }
                valueString = valueString.replace(",", "");
                double value = Double.parseDouble(valueString.trim());
                factbookGdp.add(code, value);
                if (ADD_POP) {
                    // The label says "gdp" whichever file is being read.
                    System.out.println("Factbook gdp:\t" + code + "\t" + value);
                }
                return true;
            }
        });
    }

    static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US);
    static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US);
    static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US);

    /**
     * Handler for external/other_country_data.txt, whose lines have the form
     * {@code Code;Name;Type;Data;Source}. The data field is either a literal value or, for
     * gdp-ppp and literacy, the code of another country whose value is used as a proxy.
     */
    static class MyLineHandler implements LineHandler {
        CountryData countryData;

        public MyLineHandler(CountryData countryData) {
            super();
            this.countryData = countryData;
        }

        /**
         * Processes one line.
         *
         * @return {@code true} for comment lines, empty lines and processed data lines;
         *         {@code false} for the header line (the one whose code field is "Code")
         * @throws ParseException if a literal value cannot be parsed
         * @throws IllegalArgumentException for an unknown type, a population given as another
         *         country's code, or a GDP proxy with a zero population or GDP
         */
        @Override
        public boolean handle(String line) throws ParseException {
            if (line.startsWith("#")) {
                return true;
            }
            if (line.length() == 0) {
                return true;
            }
            String[] pieces = line.split(";");
            final String code = pieces[0].trim();
            if (code.equals("Code")) {
                return false;
            }
            // Code;Name;Type;Data;Source
            final String typeString = pieces[2].trim();
            final String data = pieces[3].trim();
            if (typeString.equals("gdp-ppp")) {
                if (StandardCodes.isCountry(data)) {
                    // Proxy: scale the other country's GDP by the ratio of the populations.
                    Double otherPop = getPopulation(data);
                    Double otherGdp = getGdp(data);
                    Double myPop = getPopulation(code);
                    if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) {
                        throw new IllegalArgumentException("Zero population");
                    }
                    CountryData.gdp.add(code, otherGdp * myPop / otherPop);
                } else {
                    CountryData.gdp.add(code, dollars.parse(data).doubleValue());
                }
            } else if (typeString.equals("population")) {
                if (StandardCodes.isCountry(data)) {
                    throw new IllegalArgumentException("Population can't use other country's");
                }
                CountryData.population.add(code, number.parse(data).doubleValue());
            } else if (typeString.equals("literacy")) {
                if (StandardCodes.isCountry(data)) {
                    // Proxy: reuse the other country's literacy unchanged.
                    Double otherLiteracy = getLiteracy(data);
                    CountryData.literacy.add(code, otherLiteracy);
                } else {
                    CountryData.literacy.add(code, number.parse(data).doubleValue());
                }
            } else {
                throw new IllegalArgumentException("Illegal type");
            }
            return true;
        }
    }

    static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze();

    /**
     * Reads the tab-separated file external/factbook_literacy.txt into {@code factbook_literacy}.
     *
     * @throws IOException if the file cannot be read
     */
    private static void loadFactbookLiteracy() throws IOException {
        final String filename = "external/factbook_literacy.txt";
        CldrUtility.handleFile(filename, new LineHandler() {
            @Override
            public boolean handle(String line) {
                String[] pieces = line.split("\\t");
                String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true, missing);
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping factbook literacy for: " + code);
                    }
                    return false;
                }
                code = code.toUpperCase(Locale.ENGLISH);
                String valueString = FBLiteracy.Percent.get(pieces).trim();
                double literacyPercent = Double.parseDouble(valueString);
                factbook_literacy.put(code, literacyPercent);
                if (ADD_POP) {
                    System.out.println("Factbook literacy:\t" + code + "\t" + literacyPercent);
                }
                return true;
            }
        });
    }

    /**
     * Reads external/world_bank_data.csv. The first line is taken as the header; for each later
     * line the last non-empty year cell is added to {@code worldbank_gdp} or
     * {@code worldbank_population}, depending on the line's series code.
     *
     * @throws IOException if the file cannot be read
     */
    private static void loadWorldBankInfo() throws IOException {
        final String filename = "external/world_bank_data.csv";

        // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename));

        CldrUtility.handleFile(filename, new LineHandler() {
            ArrayList<Pair<WBLine, Integer>> columnToTypeAndValue = null;

            @Override
            public boolean handle(String line) {
                String[] pieces = splitCommaSeparated(line);
                if (columnToTypeAndValue == null) {
                    columnToTypeAndValue = WBLine.parseHeader(pieces);
                    return false;
                }

                // A row may be shorter or longer than the header; only look at columns
                // that exist in both.
                final int columnCount = Math.min(pieces.length, columnToTypeAndValue.size());
                final HashMap<Pair<WBLine, Integer>, String> lineAsHash = new HashMap<>();
                // find the last year
                String last = null;
                for (int n = 0; n < columnCount; n++) {
                    final Pair<WBLine, Integer> column = columnToTypeAndValue.get(n);
                    if (column == null) {
                        continue; // header cell that parseHeader did not recognize
                    }
                    lineAsHash.put(column, pieces[n]);
                    // assume the years are in ascending order
                    if (column.getFirst() == WBLine.Year) {
                        String current = pieces[n];
                        if (current.length() != 0 && !current.equals(EMPTY)) {
                            last = current;
                        }
                    }
                }
                // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\"");
                final String seriesCode = lineAsHash.get(Pair.of(WBLine.Series_Code, 0));

                if (last == null) {
                    return false;
                }
                final String countryName = lineAsHash.get(Pair.of(WBLine.Country_Name, 0));
                String country = CountryCodeConverter.getCodeFromName(countryName, true, missing);
                if (country == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(country)) {
                    if (ADD_POP) {
                        System.out.println("Skipping worldbank info for: " + country);
                    }
                    return false;
                }
                double value;
                try {
                    value = Double.parseDouble(last);
                } catch (NumberFormatException e) {
                    throw new IllegalArgumentException("File changed format: need to modify code", e);
                }
                // Constant-first comparison: seriesCode is null when the column is absent.
                if (GCP.equals(seriesCode)) {
                    worldbank_gdp.add(country, value);
                } else if (POP.equals(seriesCode)) {
                    worldbank_population.add(country, value);
                } else {
                    throw new IllegalArgumentException("Unexpected series code: " + seriesCode);
                }
                return true;
            }
        });
    }

    /**
     * Reads external/un_literacy.csv into {@code un_literacy}, using the fourth column of each
     * 14-column line whose second column consists only of digits.
     *
     * @throws IOException if the file cannot be read
     */
    private static void loadUnLiteracy() throws IOException {
        CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() {
            @Override
            public boolean handle(String line) {
                // Afghanistan,2000, ,28,43,13,,34,51,18
                // "Country or area","Year",,"Adult (15+) literacy rate",,,,,," Youth (15-24) literacy rate",,,,
                // ,,,Total,Men,Women,,Total,Men,Women
                // "Albania",2008,,96,,97,,95,,99,,99,,99
                String[] pieces = splitCommaSeparated(line);
                if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) {
                    return false;
                }
                String code = CountryCodeConverter.getCodeFromName(pieces[0], true, missing);
                if (code == null) {
                    return false;
                }
                if (!StandardCodes.isCountry(code)) {
                    if (ADD_POP) {
                        System.out.println("Skipping UN info for: " + code);
                    }
                    return false;
                }
                String totalLiteracy = pieces[3];
                // Placeholder or empty cell: nothing to record for this country.
                if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) {
                    return true;
                }
                double literacyPercent = Double.parseDouble(totalLiteracy);
                un_literacy.add(code, literacyPercent);
                return true;
            }
        });
    }

    // Loads every data file, then checks that each country has a GDP, literacy and population
    // value. Must stay below the static fields it uses so that they are initialized first.
    static {
        try {
            loadFactbookLiteracy();
            loadUnLiteracy();

            loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp);
            loadFactbookInfo("external/factbook_population.txt", factbook_population);
            // NOTE: runs before the World Bank data is loaded, so proxy lookups made by
            // MyLineHandler cannot see World Bank values.
            CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other));

            loadWorldBankInfo();
            StandardCodes sc = StandardCodes.make();
            StringBuilder myErrors = new StringBuilder();
            for (String territory : sc.getGoodAvailableCodes("territory")) {
                if (!StandardCodes.isCountry(territory)) {
                    continue;
                }
                double gdp = getGdp(territory);
                double literacy = getLiteracy(territory);
                double population = getPopulation(territory);
                if (gdp == 0) {
                    // AX;Aland Islands;population;26,200;www.aland.ax
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason");
                }
                if (literacy == 0) {
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason");
                }
                if (population == 0) {
                    myErrors.append("\n" + territory + ";" + sc.getData("territory", territory)
                        + ";population;0;reason");
                }
            }
            if (myErrors.length() != 0) {
                throw new IllegalArgumentException(
                    "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:"
                        + myErrors);
            }
        } catch (IOException e) {
            // Do not swallow: a failed load would skip the validation above and leave
            // every getter silently returning zeros.
            throw new IllegalStateException("Could not load population data", e);
        }
    }
}