• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2012 The Libphonenumber Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: Patrick Mezard
16 
17 #include "cpp-build/generate_geocoding_data.h"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <locale>
22 #include <sys/stat.h>
23 #include <algorithm>
24 #include <cctype>
25 #include <cmath>
26 #include <cstdio>
27 #include <cstring>
28 #include <iomanip>
29 #include <iterator>
30 #include <map>
31 #include <set>
32 #include <sstream>
33 #include <string>
34 #include <utility>
35 #include <vector>
36 
37 #include "base/basictypes.h"
38 
39 #include "absl/container/btree_map.h"
40 #include "absl/container/btree_set.h"
41 
42 namespace i18n {
43 namespace phonenumbers {
44 
45 using std::map;
46 using std::string;
47 using std::vector;
48 using std::set;
49 using std::pair;
50 
// Scoped guard releasing a C-style resource handle when it is destroyed.
//
// The guard stores the address of the caller's handle rather than the handle
// itself: once the resource has been released the caller's handle is reset to
// null, which makes any further release attempt a no-op.
template <typename ResourceType>
class AutoCloser {
 public:
  // Signature of the function releasing the resource, e.g. fclose().
  typedef int (*ReleaseFunction)(ResourceType* resource);

  AutoCloser(ResourceType** resource, ReleaseFunction release_function)
      : resource_(resource), release_function_(release_function) {}

  ~AutoCloser() { Close(); }

  // Returns the guarded handle, which is null once it has been released.
  ResourceType* get_resource() const { return *resource_; }

  // Releases the resource if it is still held, then nulls the caller's
  // handle. The value returned by the release function is discarded.
  void Close() {
    ResourceType*& handle = *resource_;
    if (!handle) {
      return;
    }
    release_function_(handle);
    handle = nullptr;
  }

 private:
  ResourceType** resource_;
  ReleaseFunction release_function_;
};
79 
// Kinds of directory entries a DirEntry can describe.
enum DirEntryKinds {
  kFile = 0,
  kDirectory = 1,
};

// Pairs the name of a directory entry with its kind.
class DirEntry {
 public:
  // Copies the NUL-terminated name n and records the kind k.
  DirEntry(const char* n, DirEntryKinds k) : name_(n), kind_(k) {}

  // Name of the entry, as passed to the constructor.
  const std::string& name() const { return name_; }

  // Kind of the entry, as passed to the constructor.
  DirEntryKinds kind() const { return kind_; }

 private:
  std::string name_;
  DirEntryKinds kind_;
};
99 
100 // Lists directory entries in path. "." and ".." are excluded. Returns true on
101 // success.
ListDirectory(const string & path,vector<DirEntry> * entries)102 bool ListDirectory(const string& path, vector<DirEntry>* entries) {
103   entries->clear();
104   DIR* dir = opendir(path.c_str());
105   if (!dir) {
106     return false;
107   }
108   AutoCloser<DIR> dir_closer(&dir, closedir);
109   struct dirent *entry;
110   struct stat entry_stat;
111   while (true) {
112     // Set errno to 0 to be able to check if an error occurs during the
113     // readdir() call. NULL is the return value when the end of the directory
114     // stream is reached or when an error occurs, and the errno check is the
115     // only thing that helps us distinguish between the two cases. See
116     // documentation at
117     // http://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html
118     errno = 0;
119     entry = readdir(dir);
120     if (entry == NULL) {
121       return errno == 0;
122     }
123     if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
124        continue;
125     }
126     const string entry_path = path + "/" + entry->d_name;
127     if (stat(entry_path.c_str(), &entry_stat)) {
128       return false;
129     }
130     DirEntryKinds kind = kFile;
131     if (S_ISDIR(entry_stat.st_mode)) {
132       kind = kDirectory;
133     } else if (!S_ISREG(entry_stat.st_mode)) {
134       continue;
135     }
136     entries->push_back(DirEntry(entry->d_name, kind));
137   }
138 }
139 
// Returns true if the last characters of s are exactly suffix. An empty
// suffix matches every string.
bool EndsWith(const string& s, const string& suffix) {
  const string::size_type suffix_length = suffix.length();
  return suffix_length <= s.length() &&
         s.compare(s.length() - suffix_length, suffix_length, suffix) == 0;
}
147 
// Converts the decimal integer held in s to an int32 stored in *n. Returns
// true on success.
//
// The whole string must be consumed by the conversion: whitespace around the
// number is tolerated, but any other trailing character (as in "12abc") makes
// the conversion fail instead of being silently dropped. Empty input, input
// without digits and values which do not fit in an int32 fail as well.
bool StrToInt(const string& s, int32* n) {
  std::istringstream stream(s);
  if (!(stream >> *n)) {
    return false;
  }
  // Skip trailing whitespace; reaching the end of the input proves that
  // nothing but the number was present.
  stream >> std::ws;
  return stream.eof();
}
155 
// Formats n through a string stream and stores the resulting text in *s.
// Returns true unless the stream reports a failure.
bool IntToStr(int32 n, string* s) {
  std::stringstream formatter;
  formatter << n;
  return !(formatter >> *s).fail();
}
163 
164 // Parses the prefix descriptions file at path, clears and fills the output
165 // prefixes phone number prefix to description mapping.
166 // Returns true on success.
ParsePrefixes(const string & path,absl::btree_map<int32,string> * prefixes)167 bool ParsePrefixes(const string& path,
168                    absl::btree_map<int32, string>* prefixes) {
169   prefixes->clear();
170   FILE* input = fopen(path.c_str(), "r");
171   if (!input) {
172     return false;
173   }
174   AutoCloser<FILE> input_closer(&input, fclose);
175   const int kMaxLineLength = 2*1024;
176   vector<char> buffer(kMaxLineLength);
177   vector<char>::iterator begin, end, sep;
178   string prefix, description;
179   int32 prefix_code;
180   while (fgets(&buffer[0], buffer.size(), input)) {
181     begin = buffer.begin();
182     end = std::find(begin, buffer.end(), '\0');
183     if (end == begin) {
184       continue;
185     }
186     --end;
187     if (*end != '\n' && !feof(input)) {
188       // A line without LF can only happen at the end of file.
189       return false;
190     }
191 
192     // Trim and check for comments.
193     for (; begin != end && std::isspace(*begin); ++begin) {}
194     for (; end != begin && std::isspace(*(end - 1)); --end) {}
195     if (begin == end || *begin == '#') {
196       continue;
197     }
198 
199     sep = std::find(begin, end, '|');
200     if (sep == end) {
201       continue;
202     }
203     prefix = string(begin, sep);
204     if (!StrToInt(prefix, &prefix_code)) {
205       return false;
206     }
207     (*prefixes)[prefix_code] = string(sep + 1, end);
208   }
209   return ferror(input) == 0;
210 }
211 
// Builds a C string literal from s. The output is enclosed in double-quotes.
// Double quotes, backslashes and single quotes found in the input are
// backslash-escaped, while non-ASCII and control characters are written as
// two-digit hexadecimal escapes.
//
// The literal is closed and reopened ("") around hexadecimal escapes. The
// compiler concatenates adjacent literals, and the split prevents a
// hexadecimal escape from absorbing the characters written after it.
//
// An input string:
//   Op\xc3\xa9ra
// becomes:
//   "Op""\xc3""\xa9""ra"
string MakeStringLiteral(const string& s) {
  std::stringstream buffer;
  // What was written last: 0 for nothing yet, 1 for a plain character, 2 for
  // a hexadecimal escape.
  int prev_is_hex = 0;
  buffer << std::hex << std::setfill('0');
  buffer << "\"";
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char c = *it;
    if (c >= 32 && c < 127) {
      if (prev_is_hex == 2) {
        buffer << "\"\"";
      }
      // An unescaped double quote would end the literal early and an
      // unescaped backslash would start an escape sequence. Escaping single
      // quotes is not required in a string literal but is kept so that
      // existing output does not change.
      if (c == '"' || c == '\\' || c == '\'') {
        buffer << "\\";
      }
      buffer << c;
      prev_is_hex = 1;
    } else {
      if (prev_is_hex != 0) {
        buffer << "\"\"";
      }
      // Go through unsigned char so that bytes above 127 print as 80..ff
      // whether or not char is signed.
      buffer << "\\x" << std::setw(2)
             << static_cast<int>(static_cast<unsigned char>(c));
      prev_is_hex = 2;
    }
  }
  buffer << "\"";
  return buffer.str();
}
246 
WriteStringLiteral(const string & s,FILE * output)247 void WriteStringLiteral(const string& s, FILE* output) {
248   string literal = MakeStringLiteral(s);
249   fprintf(output, "%s", literal.c_str());
250 }
251 
// License header placed at the top of the generated source file. It ends with
// a notice that the file is generated, followed by an empty line.
const char kLicense[] =
    "// Copyright (C) 2012 The Libphonenumber Authors\n"
    "//\n"
    "// Licensed under the Apache License, Version 2.0 (the \"License\");\n"
    "// you may not use this file except in compliance with the License.\n"
    "// You may obtain a copy of the License at\n"
    "//\n"
    "// http://www.apache.org/licenses/LICENSE-2.0\n"
    "//\n"
    "// Unless required by applicable law or agreed to in writing, software\n"
    "// distributed under the License is distributed on an \"AS IS\" BASIS,\n"
    "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or "
    "implied.\n"
    "// See the License for the specific language governing permissions and\n"
    "// limitations under the License.\n"
    "//\n"
    "// This file is generated automatically, do not edit it manually.\n"
    "\n";

// Writes kLicense to output, unchanged.
void WriteLicense(FILE* output) {
  const char* const license_text = kLicense;
  fprintf(output, "%s", license_text);
}
274 
// Names of the nested namespaces wrapping the generated code, outermost
// first.
const char kI18NNS[] = "i18n";
const char kPhoneNumbersNS[] = "phonenumbers";

// Writes the lines opening both namespaces, outermost first.
void WriteNSHeader(FILE* output) {
  const char* const opened[] = {kI18NNS, kPhoneNumbersNS};
  for (const char* ns : opened) {
    fprintf(output, "namespace %s {\n", ns);
  }
}

// Writes the lines closing both namespaces, innermost first.
void WriteNSFooter(FILE* output) {
  const char* const closed[] = {kPhoneNumbersNS, kI18NNS};
  for (const char* ns : closed) {
    fprintf(output, "}  // namespace %s\n", ns);
  }
}
287 
// Writes the #include directives opening the generated source file: the
// header named after base_name under phonenumbers/geocoding/, then
// phonenumbers/base/basictypes.h, each followed by an empty line.
void WriteCppHeader(const string& base_name, FILE* output) {
  const char* const header_stem = base_name.c_str();
  fprintf(output, "#include \"phonenumbers/geocoding/%s.h\"\n", header_stem);
  fputs("\n", output);
  fputs("#include \"phonenumbers/base/basictypes.h\"\n", output);
  fputs("\n", output);
}
295 
// Writes two initializer lines for the array variable called name: the array
// itself, then a sizeof-based expression yielding its number of elements.
void WriteArrayAndSize(const string& name, FILE* output) {
  const char* const array_name = name.c_str();
  fprintf(output, "  %s,\n", array_name);
  fprintf(output, "  sizeof(%s)/sizeof(*%s),\n", array_name, array_name);
}
300 
301 // Writes a PrefixDescriptions variable named "name", with its prefixes field
302 // set to "prefixes_name" variable, its descriptions to "desc_name" and its
303 // possible_lengths to "possible_lengths_name":
304 //
305 // const PrefixDescriptions ${name} = {
306 //   ${prefix_name},
307 //   sizeof(${prefix_name})/sizeof(*${prefix_name}),
308 //   ${desc_name},
309 //   ${possible_lengths_name},
310 //   sizeof(${possible_lengths_name})/sizeof(*${possible_lengths_name}),
311 // };
312 //
WritePrefixDescriptionsDefinition(const string & name,const string & prefixes_name,const string & desc_name,const string & possible_lengths_name,FILE * output)313 void WritePrefixDescriptionsDefinition(
314     const string& name, const string& prefixes_name, const string& desc_name,
315     const string& possible_lengths_name, FILE* output) {
316   fprintf(output, "const PrefixDescriptions %s = {\n", name.c_str());
317   WriteArrayAndSize(prefixes_name, output);
318   fprintf(output, "  %s,\n", desc_name.c_str());
319   WriteArrayAndSize(possible_lengths_name, output);
320   fprintf(output, "};\n");
321 }
322 
323 // Writes prefixes, descriptions and possible_lengths arrays built from the
324 // phone number prefix to description mapping "prefixes". Binds these arrays
325 // in a single PrefixDescriptions variable named "var_name".
326 //
327 // const int32 ${var_name}_prefixes[] = {
328 //   1201,
329 //   1650,
330 // };
331 //
332 // const char* ${var_name}_descriptions[] = {
333 //   "New Jerse",
334 //   "Kalifornie",
335 // };
336 //
337 // const int32 ${var_name}_possible_lengths[] = {
338 //   4,
339 // };
340 //
341 // const PrefixDescriptions ${var_name} = {
342 //   ...
343 // };
344 //
WritePrefixDescriptions(const string & var_name,const absl::btree_map<int,string> & prefixes,FILE * output)345 void WritePrefixDescriptions(const string& var_name,
346                              const absl::btree_map<int, string>& prefixes,
347                              FILE* output) {
348   absl::btree_set<int> possible_lengths;
349   const string prefixes_name = var_name + "_prefixes";
350   fprintf(output, "const int32 %s[] = {\n", prefixes_name.c_str());
351   for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
352        it != prefixes.end(); ++it) {
353     fprintf(output, "  %d,\n", it->first);
354     possible_lengths.insert(static_cast<int>(log10(it->first) + 1));
355   }
356   fprintf(output,
357           "};\n"
358           "\n");
359 
360   const string desc_name = var_name + "_descriptions";
361   fprintf(output, "const char* %s[] = {\n", desc_name.c_str());
362   for (absl::btree_map<int, string>::const_iterator it = prefixes.begin();
363        it != prefixes.end(); ++it) {
364     fprintf(output, "  ");
365     WriteStringLiteral(it->second, output);
366     fprintf(output, ",\n");
367   }
368   fprintf(output,
369           "};\n"
370           "\n");
371 
372   const string possible_lengths_name = var_name + "_possible_lengths";
373   fprintf(output, "const int32 %s[] = {\n ", possible_lengths_name.c_str());
374   for (absl::btree_set<int>::const_iterator it = possible_lengths.begin();
375        it != possible_lengths.end(); ++it) {
376     fprintf(output, " %d,", *it);
377   }
378   fprintf(output,
379           "\n"
380           "};\n"
381           "\n");
382 
383   WritePrefixDescriptionsDefinition(var_name, prefixes_name, desc_name,
384                                     possible_lengths_name, output);
385   fprintf(output, "\n");
386 }
387 
388 // Writes a pair of arrays mapping prefix language code pairs to
389 // PrefixDescriptions instances. "prefix_var_names" maps language code pairs
390 // to prefix variable names.
391 //
392 // const char* prefix_language_code_pairs[] = {
393 //   "1_de",
394 //   "1_en",
395 // };
396 //
397 // const PrefixDescriptions* prefix_descriptions[] = {
398 //   &prefix_1_de,
399 //   &prefix_1_en,
400 // };
401 //
WritePrefixesDescriptions(const absl::btree_map<string,string> & prefix_var_names,FILE * output)402 void WritePrefixesDescriptions(
403     const absl::btree_map<string, string>& prefix_var_names, FILE* output) {
404   fprintf(output, "const char* prefix_language_code_pairs[] = {\n");
405   for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
406        it != prefix_var_names.end(); ++it) {
407     fprintf(output, "  \"%s\",\n", it->first.c_str());
408   }
409   fprintf(output,
410           "};\n"
411           "\n"
412           "const PrefixDescriptions* prefixes_descriptions[] = {\n");
413   for (absl::btree_map<string, string>::const_iterator it = prefix_var_names.begin();
414        it != prefix_var_names.end(); ++it) {
415     fprintf(output, "  &%s,\n", it->second.c_str());
416   }
417   fprintf(output,
418           "};\n"
419           "\n");
420 }
421 
422 // For each entry in "languages" mapping a country calling code to a set
423 // of available languages, writes a sorted array of languages, then wraps it
424 // into a CountryLanguages instance. Finally, writes a pair of arrays mapping
425 // country calling codes to CountryLanguages instances.
426 //
427 // const char* country_1[] = {
428 //   "de",
429 //   "en",
430 // };
431 //
432 // const CountryLanguages country_1_languages = {
433 //   country_1,
434 //   sizeof(country_1)/sizeof(*country_1),
435 // };
436 //
437 // [...]
438 //
439 // const CountryLanguages* country_languages[] = {
440 //   &country_1_languages,
441 //   [...]
442 // }
443 //
444 // const int country_calling_codes[] = {
445 //   1,
446 //   [...]
447 // };
448 //
WriteCountryLanguages(const map<int32,set<string>> & languages,FILE * output)449 bool WriteCountryLanguages(const map<int32, set<string> >& languages,
450                            FILE* output) {
451   vector<string> country_languages_vars;
452   vector<string> countries;
453   for (map<int32, set<string> >::const_iterator it = languages.begin();
454        it != languages.end(); ++it) {
455     string country_code;
456     if (!IntToStr(it->first, &country_code)) {
457       return false;
458     }
459     const string country_var = "country_" + country_code;
460     fprintf(output, "const char* %s[] = {\n", country_var.c_str());
461     for (set<string>::const_iterator it_lang = it->second.begin();
462          it_lang != it->second.end(); ++it_lang) {
463       fprintf(output, "  \"%s\",\n", it_lang->c_str());
464     }
465     fprintf(output,
466             "};\n"
467             "\n");
468 
469     const string country_languages_var = country_var + "_languages";
470     fprintf(output, "const CountryLanguages %s = {\n",
471             country_languages_var.c_str());
472     WriteArrayAndSize(country_var, output);
473     fprintf(output,
474             "};\n"
475             "\n");
476     country_languages_vars.push_back(country_languages_var);
477     countries.push_back(country_code);
478   }
479 
480   fprintf(output,
481           "\n"
482           "const CountryLanguages* countries_languages[] = {\n");
483   for (vector<string>::const_iterator
484        it_languages_var = country_languages_vars.begin();
485        it_languages_var != country_languages_vars.end(); ++it_languages_var) {
486     fprintf(output, "  &%s,\n", it_languages_var->c_str());
487   }
488   fprintf(output,
489           "};\n"
490           "\n"
491           "const int country_calling_codes[] = {\n");
492   for (vector<string>::const_iterator it_country = countries.begin();
493        it_country != countries.end(); ++it_country) {
494     fprintf(output, "  %s,\n", it_country->c_str());
495   }
496   fprintf(output,
497           "};\n"
498           "\n");
499   return true;
500 }
501 
// Returns a copy of input where all occurrences of pattern are replaced with
// value. Occurrences are searched from left to right and do not overlap; the
// text inserted by a replacement is not searched again. If pattern is empty,
// input is returned unchanged.
string ReplaceAll(const string& input, const string& pattern,
                  const string& value) {
  if (pattern.empty()) {
    return input;
  }
  string result;
  // Offset in input of the first character not copied to result yet.
  string::size_type cursor = 0;
  for (string::size_type match = input.find(pattern, cursor);
       match != string::npos; match = input.find(pattern, cursor)) {
    result.append(input, cursor, match - cursor);
    result.append(value);
    cursor = match + pattern.length();
  }
  result.append(input, cursor, string::npos);
  return result;
}
525 
526 // Writes data accessor definitions, prefixed with "accessor_prefix".
WriteAccessorsDefinitions(const string & accessor_prefix,FILE * output)527 void WriteAccessorsDefinitions(const string& accessor_prefix, FILE* output) {
528   string templ =
529       "const int* get$prefix$_country_calling_codes() {\n"
530       "  return country_calling_codes;\n"
531       "}\n"
532       "\n"
533       "int get$prefix$_country_calling_codes_size() {\n"
534       "  return sizeof(country_calling_codes)\n"
535       "      /sizeof(*country_calling_codes);\n"
536       "}\n"
537       "\n"
538       "const CountryLanguages* get$prefix$_country_languages(int index) {\n"
539       "  return countries_languages[index];\n"
540       "}\n"
541       "\n"
542       "const char** get$prefix$_prefix_language_code_pairs() {\n"
543       "  return prefix_language_code_pairs;\n"
544       "}\n"
545       "\n"
546       "int get$prefix$_prefix_language_code_pairs_size() {\n"
547       "  return sizeof(prefix_language_code_pairs)\n"
548       "      /sizeof(*prefix_language_code_pairs);\n"
549       "}\n"
550       "\n"
551       "const PrefixDescriptions* get$prefix$_prefix_descriptions(int index) {\n"
552       "  return prefixes_descriptions[index];\n"
553       "}\n";
554   string defs = ReplaceAll(templ, "$prefix$", accessor_prefix);
555   fprintf(output, "%s", defs.c_str());
556 }
557 
558 // Writes geocoding data .cc file. "data_path" is the path of geocoding textual
559 // data directory. "base_name" is the base name of the .h/.cc pair, like
560 // "geocoding_data".
WriteSource(const string & data_path,const string & base_name,const string & accessor_prefix,FILE * output)561 bool WriteSource(const string& data_path, const string& base_name,
562                  const string& accessor_prefix, FILE* output) {
563   WriteLicense(output);
564   WriteCppHeader(base_name, output);
565   WriteNSHeader(output);
566   fprintf(output,
567           "namespace {\n"
568           "\n");
569 
570   // Enumerate language/script directories.
571   absl::btree_map<string, string> prefix_vars;
572   map<int32, set<string> > country_languages;
573   vector<DirEntry> entries;
574   if (!ListDirectory(data_path, &entries)) {
575     fprintf(stderr, "failed to read directory entries");
576     return false;
577   }
578   for (vector<DirEntry>::const_iterator it = entries.begin();
579        it != entries.end(); ++it) {
580     if (it->kind() != kDirectory) {
581       continue;
582     }
583     // Enumerate country calling code files.
584     const string dir_path = data_path + "/" + it->name();
585     vector<DirEntry> files;
586     if (!ListDirectory(dir_path, &files)) {
587       fprintf(stderr, "failed to read file entries\n");
588       return false;
589     }
590     for (vector<DirEntry>::const_iterator it_files = files.begin();
591          it_files != files.end(); ++it_files) {
592       const string fname = it_files->name();
593       if (!EndsWith(fname, ".txt")) {
594        continue;
595       }
596       int32 country_code;
597       const string country_code_str = fname.substr(0, fname.length() - 4);
598       if (!StrToInt(country_code_str, &country_code)) {
599         return false;
600       }
601       const string path = dir_path + "/" + fname;
602 
603       absl::btree_map<int32, string> prefixes;
604       if (!ParsePrefixes(path, &prefixes)) {
605         return false;
606       }
607 
608       const string prefix_var = "prefix_" + country_code_str + "_" + it->name();
609       WritePrefixDescriptions(prefix_var, prefixes, output);
610       prefix_vars[country_code_str + "_" + it->name()] = prefix_var;
611       country_languages[country_code].insert(it->name());
612     }
613   }
614   WritePrefixesDescriptions(prefix_vars, output);
615   if (!WriteCountryLanguages(country_languages, output)) {
616     return false;
617   }
618   fprintf(output, "}  // namespace\n");
619   fprintf(output, "\n");
620   WriteAccessorsDefinitions(accessor_prefix, output);
621   WriteNSFooter(output);
622   return ferror(output) == 0;
623 }
624 
// Reports the command line error "message" on stderr, followed by a usage
// line. Always returns 1, which Main() returns as its own result.
int PrintHelp(const string& message) {
  fprintf(stderr, "error: %s\n", message.c_str());
  // ACCESSOR_PREFIX stands for the optional third argument read by Main().
  fprintf(stderr, "generate_geocoding_data DATADIR CCPATH [ACCESSOR_PREFIX]\n");
  return 1;
}
630 
Main(int argc,const char * argv[])631 int Main(int argc, const char* argv[]) {
632   if (argc < 2) {
633     return PrintHelp("geocoding data root directory expected");
634   }
635   if (argc < 3) {
636     return PrintHelp("output source path expected");
637   }
638   string accessor_prefix = "";
639   if (argc > 3) {
640     accessor_prefix = argv[3];
641   }
642   const string root_path(argv[1]);
643   string source_path(argv[2]);
644   std::replace(source_path.begin(), source_path.end(), '\\', '/');
645   string base_name = source_path;
646   if (base_name.rfind('/') != string::npos) {
647     base_name = base_name.substr(base_name.rfind('/') + 1);
648   }
649   base_name = base_name.substr(0, base_name.rfind('.'));
650 
651   FILE* source_fp = fopen(source_path.c_str(), "w");
652   if (!source_fp) {
653     fprintf(stderr, "failed to open %s\n", source_path.c_str());
654     return 1;
655   }
656   AutoCloser<FILE> source_closer(&source_fp, fclose);
657   if (!WriteSource(root_path, base_name, accessor_prefix,
658                    source_fp)) {
659     return 1;
660   }
661   return 0;
662 }
663 
664 }  // namespace phonenumbers
665 }  // namespace i18n
666