1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.text.ParseException; 5 import java.util.ArrayList; 6 import java.util.Iterator; 7 import java.util.List; 8 import java.util.Locale; 9 import java.util.Set; 10 import java.util.TreeSet; 11 12 import org.unicode.cldr.util.CldrUtility; 13 import org.unicode.cldr.util.CldrUtility.LineHandler; 14 import org.unicode.cldr.util.Counter2; 15 import org.unicode.cldr.util.StandardCodes; 16 17 import com.ibm.icu.text.ListFormat; 18 import com.ibm.icu.text.NumberFormat; 19 import com.ibm.icu.text.UnicodeSet; 20 import com.ibm.icu.util.ULocale; 21 22 public class AddPopulationData { 23 static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false); 24 static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false); 25 26 enum WBLine { 27 // "Afghanistan","AFG","GNI, PPP (current international $)","NY.GNP.MKTP.PP.CD","..","..","13144920451.3325","16509662130.816","18932631964.8727","22408872945.1924","25820670505.2627","30783369469.7509","32116190092.1429","..", 28 29 Country_Name, Country_Code, Series_Name, Series_Code, YR2000, YR2001, YR2002, YR2003, YR2004, YR2005, YR2006, YR2007, YR2008, YR2009, YR2010, YR2011, YR2012, YR2013, YR2014, YR2015, YR2016, YR2017, YR2018, YR2019; get(String[] pieces)30 String get(String[] pieces) { 31 return ordinal() < pieces.length ? pieces[ordinal()] : EMPTY; 32 } 33 } 34 35 enum FBLine { 36 Rank, Country, Value, Year; get(String[] pieces)37 String get(String[] pieces) { 38 return pieces[ordinal()]; 39 } 40 } 41 42 enum FBLiteracy { 43 Rank, Country, Percent; get(String[] pieces)44 String get(String[] pieces) { 45 return pieces[ordinal()]; 46 } 47 } 48 49 private static final String GCP = "NY.GNP.MKTP.PP.CD"; 50 private static final String POP = "SP.POP.TOTL"; 51 private static final String EMPTY = ".."; 52 private static Counter2<String> worldbank_gdp = new Counter2<>(); 53 private static Counter2<String> worldbank_population = new Counter2<>(); 54 private static Counter2<String> un_literacy = new Counter2<>(); 55 56 private static Counter2<String> factbook_gdp = new Counter2<>(); 57 private static Counter2<String> factbook_population = new Counter2<>(); 58 private static Counter2<String> factbook_literacy = new Counter2<>(); 59 60 private static CountryData other = new CountryData(); 61 62 static class CountryData { 63 private static Counter2<String> population = new Counter2<>(); 64 private static Counter2<String> gdp = new Counter2<>(); 65 private static Counter2<String> literacy = new Counter2<>(); 66 } 67 68 final static Set<String> missing = new TreeSet<String>(); main(String[] args)69 public static void main(String[] args) throws IOException { 70 71 System.out.println("Code" 72 + "\t" + "Name" 73 + "\t" + "Pop" 74 + "\t" + "GDP-PPP" 75 + "\t" + "UN Literacy"); 76 77 for (String country : StandardCodes.make().getGoodCountries()) { 78 showCountryData(country); 79 } 80 Set<String> outliers = new TreeSet<>(); 81 outliers.addAll(factbook_population.keySet()); 82 outliers.addAll(worldbank_population.keySet()); 83 outliers.addAll(factbook_gdp.keySet()); 84 outliers.addAll(worldbank_gdp.keySet()); 85 outliers.addAll(un_literacy.keySet()); 86 for (Iterator<String> it = outliers.iterator(); it.hasNext();) { 87 if (StandardCodes.isCountry(it.next())) { 88 it.remove(); 89 } 90 } 91 // outliers.remove("AN"); 92 if (outliers.size() != 0) { 93 System.out.println("Mistakes: data for non-UN codes"); 94 for (String country : outliers) { 95 showCountryData(country); 96 } 97 throw new IllegalArgumentException("Mistakes: data for non-country codes"); 98 } 99 Set<String> altNames = new TreeSet<>(); 100 String oldCode = ""; 101 for (String display : CountryCodeConverter.names()) { 102 String code = CountryCodeConverter.getCodeFromName(display, true, missing); 103 String icu = ULocale.getDisplayCountry("und-" + code, "en"); 104 if (!display.equalsIgnoreCase(icu)) { 105 altNames.add(code + "\t" + display + "\t" + icu); 106 } 107 } 108 oldCode = ""; 109 if (SHOW_ALTERNATE_NAMES) { 110 for (String altName : altNames) { 111 String[] pieces = altName.split("\t"); 112 String code = pieces[0]; 113 if (code.equals("ZZ")) continue; 114 if (!code.equals(oldCode)) { 115 oldCode = code; 116 System.out.println(); 117 } 118 System.out.println(code + "; " + pieces[2] + "; " + pieces[1]); 119 // System.out.println("<territory type=\"" + code + "\" alt=\"v" + (++alt) + "\">" + pieces[1] + 120 // "</territory> <!-- " + pieces[2] + " -->"); 121 } 122 } 123 if (!missing.isEmpty()) { 124 throw new RuntimeException("Could not load codes for: " + 125 ListFormat.getInstance(Locale.getDefault()).format(missing)); 126 } 127 } 128 showCountryData(String country)129 private static void showCountryData(String country) { 130 number.setMaximumFractionDigits(0); 131 System.out.println(country 132 + "\t" + ULocale.getDisplayCountry("und-" + country, "en") 133 + "\t" + number.format(getPopulation(country)) 134 + "\t" + number.format(getGdp(country)) 135 + "\t" + percent.format(getLiteracy(country) / 100)); 136 } 137 getLiteracy(String country)138 public static Double getLiteracy(String country) { 139 return firstNonZero(factbook_literacy.getCount(country), 140 un_literacy.getCount(country), 141 CountryData.literacy.getCount(country)); 142 } 143 getGdp(String country)144 public static Double getGdp(String country) { 145 return firstNonZero(factbook_gdp.getCount(country), 146 worldbank_gdp.getCount(country), 147 CountryData.gdp.getCount(country)); 148 } 149 getPopulation(String country)150 public static Double getPopulation(String country) { 151 return firstNonZero(factbook_population.getCount(country), 152 worldbank_population.getCount(country), 153 CountryData.population.getCount(country)); 154 } 155 firstNonZero(Double... items)156 private static Double firstNonZero(Double... items) { 157 for (Double item : items) { 158 if (item.doubleValue() != 0) { 159 return item; 160 } 161 } 162 return 0.0; 163 } 164 splitCommaSeparated(String line)165 static String[] splitCommaSeparated(String line) { 166 // items are separated by ',' 167 // each item is of the form abc... 168 // or "..." (required if a comma or quote is contained) 169 // " in a field is represented by "" 170 List<String> result = new ArrayList<>(); 171 StringBuilder item = new StringBuilder(); 172 boolean inQuote = false; 173 for (int i = 0; i < line.length(); ++i) { 174 char ch = line.charAt(i); // don't worry about supplementaries 175 switch (ch) { 176 case '"': 177 inQuote = !inQuote; 178 // at start or end, that's enough 179 // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote 180 if (inQuote && item.length() != 0) { 181 item.append('"'); 182 inQuote = true; 183 } 184 break; 185 case ',': 186 if (!inQuote) { 187 result.add(item.toString()); 188 item.setLength(0); 189 } else { 190 item.append(ch); 191 } 192 break; 193 default: 194 item.append(ch); 195 break; 196 } 197 } 198 result.add(item.toString()); 199 return result.toArray(new String[result.size()]); 200 } 201 loadFactbookInfo(String filename, final Counter2<String> factbookGdp)202 private static void loadFactbookInfo(String filename, final Counter2<String> factbookGdp) throws IOException { 203 CldrUtility.handleFile(filename, new LineHandler() { 204 @Override 205 public boolean handle(String line) { 206 if (line.length() == 0 || line.startsWith("This tab") || line.startsWith("Rank") 207 || line.startsWith(" This file")) { 208 return false; 209 } 210 String[] pieces = line.split("\\s{2,}"); 211 String code = CountryCodeConverter.getCodeFromName(FBLine.Country.get(pieces), true, missing); 212 if (code == null) { 213 return false; 214 } 215 if (!StandardCodes.isCountry(code)) { 216 if (ADD_POP) { 217 System.out.println("Skipping factbook info for: " + code); 218 } 219 return false; 220 } 221 code = code.toUpperCase(Locale.ENGLISH); 222 String valueString = FBLine.Value.get(pieces).trim(); 223 if (valueString.startsWith("$")) { 224 valueString = valueString.substring(1); 225 } 226 valueString = valueString.replace(",", ""); 227 double value = Double.parseDouble(valueString.trim()); 228 factbookGdp.add(code, value); 229 if (ADD_POP) { 230 System.out.println("Factbook gdp:\t" + code + "\t" + value); 231 } 232 return true; 233 } 234 }); 235 } 236 237 static final NumberFormat dollars = NumberFormat.getCurrencyInstance(ULocale.US); 238 static final NumberFormat number = NumberFormat.getNumberInstance(ULocale.US); 239 static final NumberFormat percent = NumberFormat.getPercentInstance(ULocale.US); 240 241 static class MyLineHandler implements LineHandler { 242 CountryData countryData; 243 MyLineHandler(CountryData countryData)244 public MyLineHandler(CountryData countryData) { 245 super(); 246 this.countryData = countryData; 247 } 248 249 @Override handle(String line)250 public boolean handle(String line) throws ParseException { 251 if (line.startsWith("#")) return true; 252 if (line.length() == 0) { 253 return true; 254 } 255 String[] pieces = line.split(";"); 256 final String code = pieces[0].trim(); 257 if (code.equals("Code")) { 258 return false; 259 } 260 // Code;Name;Type;Data;Source 261 final String typeString = pieces[2].trim(); 262 final String data = pieces[3].trim(); 263 if (typeString.equals("gdp-ppp")) { 264 if (StandardCodes.isCountry(data)) { 265 Double otherPop = getPopulation(data); 266 Double otherGdp = getGdp(data); 267 Double myPop = getPopulation(code); 268 if (myPop.doubleValue() == 0 || otherPop.doubleValue() == 0 || otherGdp.doubleValue() == 0) { 269 otherPop = getPopulation(data); 270 otherGdp = getPopulation(data); 271 myPop = getPopulation(code); 272 throw new IllegalArgumentException("Zero population"); 273 } 274 CountryData.gdp.add(code, otherGdp * myPop / otherPop); 275 } else { 276 CountryData.gdp.add(code, dollars.parse(data).doubleValue()); 277 } 278 } else if (typeString.equals("population")) { 279 if (StandardCodes.isCountry(data)) { 280 throw new IllegalArgumentException("Population can't use other country's"); 281 } 282 CountryData.population.add(code, number.parse(data).doubleValue()); 283 } else if (typeString.equals("literacy")) { 284 if (StandardCodes.isCountry(data)) { 285 Double otherPop = getLiteracy(data); 286 CountryData.literacy.add(code, otherPop); 287 } else { 288 CountryData.literacy.add(code, number.parse(data).doubleValue()); 289 } 290 } else { 291 throw new IllegalArgumentException("Illegal type"); 292 } 293 return true; 294 } 295 } 296 297 static final UnicodeSet DIGITS = new UnicodeSet("[:Nd:]").freeze(); 298 loadFactbookLiteracy()299 private static void loadFactbookLiteracy() throws IOException { 300 final String filename = "external/factbook_literacy.txt"; 301 CldrUtility.handleFile(filename, new LineHandler() { 302 @Override 303 public boolean handle(String line) { 304 String[] pieces = line.split("\\t"); 305 String code = CountryCodeConverter.getCodeFromName(FBLiteracy.Country.get(pieces), true, missing); 306 if (code == null) { 307 return false; 308 } 309 if (!StandardCodes.isCountry(code)) { 310 if (ADD_POP) { 311 System.out.println("Skipping factbook literacy for: " + code); 312 } 313 return false; 314 } 315 code = code.toUpperCase(Locale.ENGLISH); 316 String valueString = FBLiteracy.Percent.get(pieces).trim(); 317 double percent = Double.parseDouble(valueString); 318 factbook_literacy.put(code, percent); 319 if (ADD_POP) { 320 System.out.println("Factbook literacy:\t" + code + "\t" + percent); 321 } 322 code = null; 323 return true; 324 } 325 }); 326 } 327 loadWorldBankInfo()328 private static void loadWorldBankInfo() throws IOException { 329 final String filename = "external/world_bank_data.csv"; 330 331 // List<List<String>> data = SpreadSheet.convert(CldrUtility.getUTF8Data(filename)); 332 333 CldrUtility.handleFile(filename, new LineHandler() { 334 @Override 335 public boolean handle(String line) { 336 if (line.contains("Series Code")) { 337 return false; 338 } 339 String[] pieces = splitCommaSeparated(line); 340 341 // String[] pieces = line.substring(1, line.length() - 2).split("\"\t\""); 342 343 final String seriesCode = WBLine.Series_Code.get(pieces); 344 345 String last = null; 346 for (WBLine i : WBLine.values()) { 347 if (i.compareTo(WBLine.YR2000) >= 0) { 348 String current = i.get(pieces); 349 if (current.length() != 0 && !current.equals(EMPTY)) { 350 last = current; 351 } 352 } 353 } 354 if (last == null) { 355 return false; 356 } 357 String country = CountryCodeConverter.getCodeFromName(WBLine.Country_Name.get(pieces), true, missing); 358 if (country == null) { 359 return false; 360 } 361 if (!StandardCodes.isCountry(country)) { 362 if (ADD_POP) { 363 System.out.println("Skipping worldbank info for: " + country); 364 } 365 return false; 366 } 367 double value; 368 try { 369 value = Double.parseDouble(last); 370 } catch (NumberFormatException e) { 371 throw new IllegalArgumentException("File changed format: need to modify code"); 372 } 373 if (seriesCode.equals(GCP)) { 374 worldbank_gdp.add(country, value); 375 } else if (seriesCode.equals(POP)) { 376 worldbank_population.add(country, value); 377 } else { 378 throw new IllegalArgumentException(); 379 } 380 return true; 381 } 382 }); 383 } 384 loadUnLiteracy()385 private static void loadUnLiteracy() throws IOException { 386 CldrUtility.handleFile("external/un_literacy.csv", new CldrUtility.LineHandler() { 387 @Override 388 public boolean handle(String line) { 389 // Afghanistan,2000, ,28,43,13,,34,51,18 390 // "Country or area","Year",,"Adult (15+) literacy rate",,,,,," Youth (15-24) literacy rate",,,, 391 // ,,,Total,Men,Women,,Total,Men,Women 392 // "Albania",2008,,96,,97,,95,,99,,99,,99 393 String[] pieces = splitCommaSeparated(line); 394 if (pieces.length != 14 || pieces[1].length() == 0 || !DIGITS.containsAll(pieces[1])) { 395 return false; 396 } 397 String code = CountryCodeConverter.getCodeFromName(pieces[0], true, missing); 398 if (code == null) { 399 return false; 400 } 401 if (!StandardCodes.isCountry(code)) { 402 if (ADD_POP) { 403 System.out.println("Skipping UN info for: " + code); 404 } 405 return false; 406 } 407 String totalLiteracy = pieces[3]; 408 if (totalLiteracy.equals("�") || totalLiteracy.equals("…") || totalLiteracy.isEmpty()) { 409 return true; 410 } 411 double percent = Double.parseDouble(totalLiteracy); 412 un_literacy.add(code, percent); 413 return true; 414 } 415 }); 416 } 417 418 static { 419 try { loadFactbookLiteracy()420 loadFactbookLiteracy(); loadUnLiteracy()421 loadUnLiteracy(); 422 423 loadFactbookInfo("external/factbook_gdp_ppp.txt", factbook_gdp); 424 loadFactbookInfo("external/factbook_population.txt", factbook_population); 425 CldrUtility.handleFile("external/other_country_data.txt", new MyLineHandler(other)); 426 loadWorldBankInfo()427 loadWorldBankInfo(); 428 StandardCodes sc = StandardCodes.make(); 429 StringBuilder myErrors = new StringBuilder(); 430 for (String territory : sc.getGoodAvailableCodes("territory")) { 431 if (!StandardCodes.isCountry(territory)) { 432 continue; 433 } 434 double gdp = getGdp(territory); 435 double literacy = getLiteracy(territory); 436 double population = getPopulation(territory); 437 if (gdp == 0) { 438 // AX;Aland Islands;population;26,200;www.aland.ax 439 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";gdp-ppp;0;reason"); 440 } 441 if (literacy == 0) { 442 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) + ";literacy;0;reason"); 443 } 444 if (population == 0) { 445 myErrors.append("\n" + territory + ";" + sc.getData("territory", territory) 446 + ";population;0;reason"); 447 } 448 } 449 if (myErrors.length() != 0) { 450 throw new IllegalArgumentException( 451 "Missing Country values, the following and add to external/other_country_data to fix, chaning the 0 to the real value:" 452 + myErrors); 453 } 454 } catch (IOException e) { 455 } 456 } 457 } 458